Python Web Scraping Yahoo - Results in empty list

import requests
import csv
from bs4 import BeautifulSoup

ticker = input('Enter the ticker symbol: ')
url = f'https://finance.yahoo.com/quote/{ticker}/history?p={ticker}'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

table = soup.find('table', {'class': 'W(100%) M(0)'})
rows = table.tbody.find_all('tr')

stock_prices = []
for row in rows:
    cells = row.find_all('td')
    if cells:
        try:
            stock_prices.append(float(cells[4].text.replace(',', '')))
        except ValueError:
            print('Error parsing stock price')

print(stock_prices)
I'm trying to scrape Yahoo Finance for the "market close" prices of a given stock. I went through the HTML and am not sure which table row or cell I have wrong. The output list is empty.

Try setting the User-Agent header when requesting the page from Yahoo:
import requests
from bs4 import BeautifulSoup

ticker = input("Enter the ticker symbol: ")
url = f"https://finance.yahoo.com/quote/{ticker}/history?p={ticker}"

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0"
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table", {"class": "W(100%) M(0)"})
rows = table.tbody.find_all("tr")

stock_prices = []
for row in rows:
    cells = row.find_all("td")
    if cells and len(cells) > 4:  # the close price sits in the fifth column
        try:
            stock_prices.append(float(cells[4].text.replace(",", "")))
        except ValueError:
            print("Error parsing stock price")

print(stock_prices)
Prints (for example AAPL):
Enter the ticker symbol: AAPL
https://finance.yahoo.com/quote/AAPL/history?p=AAPL
[124.9, 129.93, 129.61, 126.04, 130.03, 131.86, 132.23, 135.45, 132.3, 132.37, 134.51, 136.5, 143.21, 145.47, 144.49, 142.16, 142.65, 140.94, 142.91, 146.63, 147.81, 148.31, 148.03, 141.17, 144.22, 148.11, 151.07, 150.18, 148.01, 151.29, 150.72, 148.79, 150.04, 148.28, 149.7, 146.87, 134.87, 139.5, 138.92, 138.38, 138.88, 145.03, 150.65, 153.34, 155.74, 144.8, 149.35, 152.34, 149.45, 147.27, 143.39, 143.86, 143.75, 142.41, 138.38, 142.99, 138.34, 138.98, 140.42, 140.09, 145.43, 146.4, 146.1, 142.45, 138.2, 142.48, 149.84, 151.76, 150.77, 150.43, 152.74, 153.72, 156.9, 154.48, 150.7, 152.37, 155.31, 153.84, 163.43, 157.37, 154.46, 155.96, 154.53, 155.81, 157.96, 157.22, 158.91, 161.38, 163.62, 170.03, 167.53, 167.23, 167.57, 171.52, 174.15, 174.55, 173.03, 173.19, 172.1]
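The question also imports csv but never writes anything; as a hedged follow-up (not part of the accepted answer), a minimal sketch of saving the scraped close prices could look like this. The stock_prices.csv filename, the header row, and the example values are assumptions for illustration:

import csv

# Hedged sketch only: `ticker` and `stock_prices` would come from the answer above;
# example values are given here so the snippet runs on its own.
ticker = "AAPL"
stock_prices = [124.9, 129.93]

with open("stock_prices.csv", "w", newline="") as f:   # assumed output filename
    writer = csv.writer(f)
    writer.writerow(["ticker", "close"])               # assumed header row
    for price in stock_prices:
        writer.writerow([ticker, price])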

Related

Web scraping: can't get data from all links on a page at the same time

For some days I have been trying to crawl all vessel data from vesselfinder, including each vessel's description page, from which I want information such as vessel type, IMO number, etc. in table form. I have tried different ways to do this but still get a lot of errors. First I worked out how to follow the links to the description pages, how to collect those links from all the listing pages, and how to get specific table data from a description page (still not complete, but I get some of it).
But today, when I combined the code and tried to get the data from all the links and their description pages at the same time, I got a lot of errors that left me confused.
I attached my code, which is not great; up to the point #print(len(vessellist)) it works, after that… errors.
import requests
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    'user-agent': 'Mozilla/5.0',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'

vessellist = []
for x in range(1, 6):
    response = requests.get(
        f'https://www.vesselfinder.com/vessels?page={x}',
        headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    contents = soup.find_all('td', class_='v2')
    for property in contents:
        for item in property.find_all('a', href=True):
            vessellist.append(baseurl + item['href'])

for link in vessellist:
    response = requests.get(link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='tparams')
    head = []
    for i in table.find_all('td', class_='n3'):
        title = i.text
        head.append(title)
    values = []
    for row in table.find_all('td', class_='v3'):
        data = row.text
        values.append(data)
    df = pd.DataFrame(values)
    print(df)
Two steps: first get the summary data (which includes the href), then get the detailed data. These two steps are implemented in two functions. Here I fetch the first 10 pages; 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs

headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}

def getSummaryData():
    data = []
    url = "https://www.vesselfinder.com/vessels"
    for page in range(1, 10 + 1, 1):  # only the first 200 pages are authorized?
        print("Page : %d/10" % page)
        resp = rq.get(url + "?page=%s" % page, headers=headers)
        soup = bs(resp.content, "lxml")
        section = soup.find_all('section', {'class', 'listing'})[0]
        tbody = section.find_all('tbody')[0]
        trs = tbody.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            # column 1 data
            sub = tds[1].find('a')
            href = sub['href']
            divs = sub.find_all('div')
            country = divs[0]['title']
            sub_divs = divs[1].find_all('div')
            vessel_name = sub_divs[0].text
            vessel_type = sub_divs[1].text
            # column 2 data
            build_year = tds[2].text
            # column 3 data
            gt = tds[3].text
            # column 4 data
            dwt = tds[4].text
            # column 5 data
            size = tds[5].text
            # save data
            tr_data = {'country': country,
                       'vessel_name': vessel_name,
                       'vessel_type': vessel_type,
                       'build_year': build_year,
                       'gt': gt,
                       'dwt': dwt,
                       'size': size,
                       'href': href}
            data.append(tr_data)
    return data

def getDetailledData(data):
    for (iel, el) in enumerate(data):
        print("%d/%d" % (iel + 1, len(data)))
        url = "https://www.vesselfinder.com" + el['href']
        # make get call
        resp = rq.get(url, headers=headers)
        soup = bs(resp.content, "lxml")
        # position and voyage data
        table = soup.find_all('table', {'class', 'aparams'})[0]
        trs = table.find_all('tr')
        labels = ["course_speed", "current_draught", "navigation_status",
                  "position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # vessel particulars
        table = soup.find_all('table', {'class', 'tparams'})[0]
        trs = table.find_all('tr')
        labels = ["IMO_number", "vessel_name", "ship_type", "flag",
                  "homeport", "gross_tonnage", "summer_deadweight_t",
                  "length_overall_m", "beam_m", "draught_m", "year_of_built",
                  "builder", "place_of_built", "yard", "TEU", "crude", "grain",
                  "bale", "classification_society", "registered_owner", "manager"]
        for (i, tr) in enumerate(trs):
            td = tr.find_all('td')[1]
            el.update({'%s' % labels[i]: td.text})
        # break
    return data
Call these functions:
data = getSummaryData() # href include
data = getDetailledData(data)
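Since the question already uses pandas, a hedged sketch (an assumption about the next step, not part of the answer above) of turning the returned list of dicts into a CSV could look like this; the vessels.csv filename and the sample row are only there so the snippet runs on its own:

import pandas as pd

# Hedged sketch: `data` is the list of flat dicts returned by getDetailledData() above.
# Example row included only so the snippet is self-contained.
data = [{'vessel_name': 'Example', 'build_year': '2010', 'href': '/vessels/example'}]

df = pd.DataFrame(data)                  # one column per dict key
df.to_csv("vessels.csv", index=False)    # assumed output filename
print(df.head())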
Don't rely on the 'class' attribute to target the data. Generally, you should go through table -> tbody and then get the tds or trs to be sure you have the correct ones.
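To illustrate that advice, here is a minimal hedged sketch that walks table -> tbody -> tr on a listing page instead of matching class names; picking the first table, and the column positions, are assumptions carried over from getSummaryData() above:

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get("https://www.vesselfinder.com/vessels?page=1", headers=headers)
soup = BeautifulSoup(resp.content, "html.parser")

# Walk the document structure instead of matching class names.
table = soup.find("table")                       # assumed: the first table is the listing table
rows = table.tbody.find_all("tr") if table and table.tbody else []
for tr in rows:
    tds = tr.find_all("td")
    if len(tds) > 5:                             # guard against header/odd rows
        print(tds[2].text, tds[3].text, tds[4].text, tds[5].text)  # build year, GT, DWT, size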

BeautifulSoup - Scraping an HTML table on multiple pages

I'm a newbie with Python and BeautifulSoup. I would like to scrape multiple pages into a CSV, but when I try to store those 3 links, only the last one ends up in the CSV.
How can I fix my issue?
## importing bs4, requests, fake_useragent and csv modules
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import csv

## create an array with URLs
urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the response from the page using the get method of the requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})
    ## storing the content of the page in a variable
    html = page.content
    ## creating BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    rows = table.findAll("tr")
    with open("test.csv", "wt+", newline="") as f:
        writer = csv.writer(f)
        for row in rows:
            csv_row = []
            for cell in row.findAll(["td", "th"]):
                csv_row.append(cell.get_text())
            writer.writerow(csv_row)
Thanks a lot !
To simplify reading the rows, you could also give pandas a shot:
import csv
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    df_table = pd.read_html(str(table))[0]
    # add a column with additional info
    df_table['hit'] = soup.find("span", {"class": "c"}).text.strip()
    # store the table in a list of tables
    all_data.append(df_table)

# concat the tables and export them to csv
pd.concat(all_data).to_csv('test.csv', index=False)
In your code, you don't store the rows variable anywhere, so you only write values from your last URL to the CSV file. This example writes values from all three URLs:
import csv
import requests
from bs4 import BeautifulSoup

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]

    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)

# write list `all_data` to CSV
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in all_data:
        writer.writerow(row)
Writes test.csv containing rows from all three URLs.

Issue while extracting table data using BeautifulSoup in Python

The code below gives me the required product data as a table. It works fine for most links, but for some it stops midway with the error 'NoneType' object has no attribute 'find_all' at table.find_all('tr').
I believe this is because the table does not exist for some products, so I tried creating an if condition on the existence of the table, but that doesn't seem to help either. What changes should I make in the code below?
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        for p in prod_soup.find("div", class_="basic-information").find_all("p"):
            if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
            elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
            elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
        table = prod_soup.find("table", {"class": "specifications"})
        for tr in table.find_all("tr"):
            temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        data.append(temp)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
You can use Try and Except (documentation):
try:
    for tr in table.find_all("tr"):
        temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
except:
    pass
You can use this:
table = prod_soup.select_one('table.specifications')   # None if the product page has no spec table
rows = table.findChildren('tr') if table else []
import requests, json, time
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.1800wheelchair.com/category/toilet-accessories/?p="
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

data = []
for i in range(1, 3):
    print(i)
    res = requests.get(url + str(i), headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    p_links = [i["data-link"] for i in soup.find("ul", {"id": "products-list"}).find_all("li", class_=["openlink", "item"])]
    for prod_url in p_links:
        print(prod_url)
        temp = {"Product URL": prod_url}
        prod_res = requests.get(prod_url, headers=headers)
        prod_soup = BeautifulSoup(prod_res.text, "html.parser")
        try:
            for p in prod_soup.find("div", class_="basic-information").find_all("p"):
                if "item" in p.text.lower(): temp["item number"] = p.find("span").text.strip()
                elif "brand" in p.text.lower(): temp["manufacturer"] = p.find("span").text.strip()
                elif "sku" in p.text.lower(): temp["sku"] = p.find("span").text.strip()
            table = prod_soup.find("table", {"class": "specifications"})
            for tr in table.find_all("tr"):
                temp[tr.find("td", {"class": "tdLabel"}).text.strip()] = tr.find("td", {"class": "tdValue"}).text.strip()
        except:
            print("Failed for URL {}".format(prod_url))
        data.append(temp)
        time.sleep(2)

pd.DataFrame(data).to_csv("toilet-acc.csv", index=False)
Put a try/except not only around extracting the product specification but also around extracting the item/brand/sku. In the except block, print the URL so you know which URLs failed and can try them again.
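Since the question mentions trying an if condition on the table's existence, here is a minimal hedged sketch of that alternative to the bare except; it is standalone for illustration, and in the real script prod_soup, temp and prod_url come from the product loop above:

from bs4 import BeautifulSoup

# Hedged, standalone illustration of the `if` guard; stand-ins below replace the loop variables.
prod_soup = BeautifulSoup("<html><body></body></html>", "html.parser")  # stand-in page with no table
temp = {}
prod_url = "https://www.1800wheelchair.com/product/example"             # hypothetical URL

table = prod_soup.find("table", {"class": "specifications"})
if table is not None:
    # only parse the spec rows when the product page actually has the table
    for tr in table.find_all("tr"):
        label = tr.find("td", {"class": "tdLabel"})
        value = tr.find("td", {"class": "tdValue"})
        if label and value:                      # skip rows missing either cell
            temp[label.text.strip()] = value.text.strip()
else:
    print("No specifications table for", prod_url)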

BeautifulSoup - Pagination error on the last page

The battle to finish my first scraping script continues. I think I'm almost done, but I've hit a new roadblock.
So, the problem is that when I reach the last pagination page I'm getting this error:
Traceback (most recent call last):
File "C:/Users/Andre/Desktop/scripts python/scrape_learn/ttc_quase.py", line 50, in <module>
url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
AttributeError: 'NoneType' object has no attribute 'find'
I think the error is related to the way I'm finding url_tag, but I'm not seeing any other way to grab the "next page". I tried the try/except approach, but when I apply it I only get the listings from the first page.
So I'm not sure what my next step should be. If someone could help, I would appreciate it.
My full code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = "https://timetochoose.co.ao/?ct_keyword&ct_ct_status&ct_property_type&ct_beds&search-listings=true&ct_country=portugal&ct_state&ct_city&ct_price_to&ct_mls&lat&lng"
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

anuncios_ttc = {}
anuncios_nr = 0

while True:
    response = requests.get(url, headers=headers)
    print(response)
    data = response.text
    print(data)
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    anuncios = soup.find_all("div", {"class": "grid-listing-info"})
    for anuncio in anuncios:
        titles = anuncio.find("a", {"class": "listing-link"}).text
        location = anuncio.find("p", {"class": "location muted marB0"}).text
        link = anuncio.find("a", {"class": "listing-link"}).get("href")
        anuncios_response = requests.get(link, headers=headers)
        anuncios_data = anuncios_response.text
        anuncios_soup = BeautifulSoup(anuncios_data, 'html.parser')
        conteudo = anuncios_soup.find("div", {"id": "listing-content"}).text
        preco = anuncios_soup.find("span", {"class": "listing-price"})
        preco_imo = preco.text if preco else "N/A"
        quartos = anuncios_soup.find("li", {"class": "row beds"})
        nr_quartos = quartos.text if quartos else "N/A"
        wcs = anuncios_soup.find("li", {"class": "row baths"})
        nr_wcs = wcs.text if wcs else "N/A"
        tipo = anuncios_soup.find("li", {"class": "row property-type"})
        tipo_imo = tipo.text if tipo else "N/A"
        bairro = anuncios_soup.find("li", {"class": "row community"})
        bairro1 = bairro.text if bairro else "N/A"
        ref = anuncios_soup.find("li", {"class": "row propid"}).text
        anuncios_nr += 1
        anuncios_ttc[anuncios_nr] = [titles, location, bairro1, preco_imo, tipo_imo, nr_quartos, nr_wcs, conteudo, ref, link]
        print("Título", titles, "\nLocalização", location, "\nPreço", preco_imo, "\nLink", link, "\nReferencia", ref, "\nTipo", tipo_imo, "\nQuartos", nr_quartos, "\nWC", nr_wcs, "\nBairro", bairro1, "\nConteudo", conteudo)
    url_tag = soup.find('li', {"id": "next-page-link"}).find('a')
    if url_tag.get('href'):
        url = url_tag.get('href')
        print(url)
    else:
        break

print("Nr Total de Anuncios: ", anuncios_nr)

anuncios_ttc_df = pd.DataFrame.from_dict(anuncios_ttc, orient='index', columns=['Titulo', 'Localização', 'Bairro', 'Preço', 'Tipo', 'Quartos', 'WCs', 'Descrição', 'Referência', 'Ligação'])
anuncios_ttc_df.head()
anuncios_ttc_df.to_csv('ttc_python.csv')
The answer to this question ended up being provided in another thread where I was trying to better identify the url_tag element.
With the help of @Andrej Kesely I was able to solve the problem with:
url_tag = soup.find('li', {"id": "next-page-link"})
if not url_tag:
    break
url = url_tag.find('a')['href']
Now the script is able to run until the end and to generate the csv file as intended.

I scraped the title, price, links and info table, and when I write the CSV file I get duplicated title, price and links

I want to replace the duplicated title, price and link values with empty columns.
import requests
import csv
from bs4 import BeautifulSoup
requests.packages.urllib3.disable_warnings()
import pandas as pd

url = 'http://shop.kvgems-preciousstones.com/'

while True:
    session = requests.Session()
    session.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}
    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, "html.parser")
    posts = soup.find_all('li', {'class': 'item'})
    data = []
    for url in posts:
        title = url.find('h2', {'product-name'}).text
        price = url.find('span', {'price'}).text
        link = url.find('a').get('href')
        url_response = requests.get(link)
        url_data = url_response.text
        url_soup = BeautifulSoup(url_data, 'html.parser')
        desciption = url_soup.find('tr')
        for tr in url_soup.find_all('tr'):
            planet_data = dict()
            values = [td.text for td in tr.find_all('td')]
            planet_data['name'] = tr.find('td').text.strip()
            planet_data['info'] = tr.find_all('td')[1].text.strip()
            data.append((title, price, planet_data, link))
    #data_new = data +","+ data_desciption
    #urls = soup.find('a',{'class': 'next i-next'}).get('href')
    #url = urls
    #print(url)
    with open('ineryrge5szdqzrt.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'price', 'name', 'info', 'link'])
        #The for loop
        for title, price, planet_data, link in data:
            writer.writerow([title, price, planet_data['name'], planet_data['info'], link])
When I write the CSV I get duplicated title, price and link values, but I want only one title, price and link per item, with the rest left empty.
The first for loop extracts the common values (title, price and link). The second for loop then extracts all the data attributes for each item.
However, you are then writing title, price and link fields to the CSV file for every row of data. You only need to do it for the first row of data.
To detect whether your second for loop is on the first row or not, you can change it to use the enumerate function, which gives you an extra index variable. You can then use this value to write the title, price and link only when the index is 0:
for index, tr in enumerate(url_soup.find_all('tr')):
    planet_data = dict()
    values = [td.text for td in tr.find_all('td')]
    planet_data['name'] = tr.find('td').text.strip()
    planet_data['info'] = tr.find_all('td')[1].text.strip()
    if index == 0:
        data.append((title, price, planet_data, link))
    else:
        data.append((None, None, planet_data, None))
(Also I don't think you need the initial while True: part.)
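Following up on that last remark, a hedged sketch of the same scrape without the while True: wrapper might look like the following; the class names are taken from the question's code, and this is only an assumption about the intended structure, not the accepted answer:

import requests
from bs4 import BeautifulSoup

url = 'http://shop.kvgems-preciousstones.com/'

session = requests.Session()
session.headers = {"User-Agent": "Mozilla/5.0"}
content = session.get(url).content
soup = BeautifulSoup(content, "html.parser")

data = []
for post in soup.find_all('li', class_='item'):          # class names as in the question's code
    title = post.find('h2', class_='product-name').text
    price = post.find('span', class_='price').text
    link = post.find('a').get('href')
    # ...fetch the product page and build planet_data exactly as in the answer above...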
