I'm trying to grab all prices from a website using XPath. All the prices have the same XPath, and only [0] (the 1st item, I assume) works... let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!
I tried changing the 0 in "[0].text" to 1, to print the 2nd item, but it returned "out of range".
Then I was trying to think of some for loop that would print all the items, so I could compute an average.
Any help would be greatly appreciated!
Apologies, I've edited in the full code:
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
# HEADERS = you'll need to add your own headers here; the site won't let me post them.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
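For what it's worth, the absolute path only ever matches one node: every step in it is indexed (div[1], li[1], ...), so the returned list has a single element and index 1 is out of range. A relative XPath keyed on a class returns every match at once. A minimal sketch of the loop-and-average idea, assuming Newegg still marks prices with a "price-current" class (verify the selector against the live markup):
import requests
from bs4 import BeautifulSoup
from lxml import etree
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
HEADERS = {"User-Agent": "Mozilla/5.0"}  # stand-in; use your own headers
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
# A relative, class-based XPath matches every price node, not just the first.
# "price-current" is an assumed class name; adjust to the real markup.
nodes = dom.xpath('//*[contains(@class, "price-current")]/strong')
prices = [float(node.text.replace(",", "")) for node in nodes]
print(prices)
if prices:
    print("average:", sum(prices) / len(prices))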
You could just use CSS selectors, which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
    # remove the offers info so only the actual price remains
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
And as a list of floats:
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices.append(float(re.sub(r'\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)
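Since the original goal was an average, a short follow-up on that float list:
average = sum(prices) / len(prices) if prices else 0.0
print(f"average price: ${average:.2f}")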
Just started learning Python (3.8), building a scraper to get some football stats. Here's the code so far.
I originally wanted to pull a div with id='div_alphabet', which is clearly in the HTML tree on the website, but for some reason bs4 wasn't pulling it in. I investigated further and noticed that when I pull in the parent div 'all_alphabet' and then look for all child divs, 'div_alphabet' is missing. The only weird thing about the HTML structure is the long block comment that sits right above 'div_alphabet'. Is this a potential issue?
https://www.pro-football-reference.com/players
import requests
from bs4 import BeautifulSoup
URL = 'https://www.pro-football-reference.com/'
homepage = requests.get(URL)
home_soup = BeautifulSoup(homepage.content, 'html.parser')
players_nav_URL = home_soup.find(id='header_players').a['href']
players_directory_page = requests.get(URL + players_nav_URL)
players_directory_soup = BeautifulSoup(players_directory_page.content, 'html.parser')
alphabet_nav = players_directory_soup.find(id='all_alphabet')
all_letters = alphabet_nav.find_all('div')
print(all_letters)
links = [a['href'] for a in players_directory_soup.select('ul.page_index li div a')]
names = [a.get_text() for a in players_directory_soup.select('ul.page_index li div a')]
This gives you lists of the relative links and names of all the alphabetised players.
I wouldn't concern yourself with div_alphabet; it doesn't hold any useful information. (That long block comment is very likely why it's missing: the site ships the div inside an HTML comment, and BeautifulSoup parses comments as text, not as elements.)
Here we are selecting the a tags inside the ul with class "page_index". select() returns a list, so we loop over it and grab the href attribute; get_text() gives you the names.
If you haven't come across list comprehensions, then this would also be acceptable:
links = []
for a in players_directory_soup.select('ul.page_index li div a'):
    links.append(a['href'])

names = []
for a in players_directory_soup.select('ul.page_index li div a'):
    names.append(a.get_text())
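As an aside: if you ever do need markup that a site ships inside an HTML comment (which is almost certainly why div_alphabet never shows up in the parsed tree), you can re-parse the comment text yourself. A rough sketch using bs4's Comment type, reusing players_directory_soup from the question:
from bs4 import BeautifulSoup, Comment
wrapper = players_directory_soup.find(id='all_alphabet')
# comments are parsed as text nodes, so pull them out and re-parse them
for c in wrapper.find_all(string=lambda s: isinstance(s, Comment)):
    hidden = BeautifulSoup(c, 'html.parser')
    div_alphabet = hidden.find(id='div_alphabet')
    if div_alphabet is not None:
        print(div_alphabet)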
Something like this code will do it:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 '}
r = requests.get('https://www.pro-football-reference.com/players/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('ul.page_index li div')
for link in data:
    print(*[f'{a.get("href")}\n' for a in link.select('a')])
A more useful way is to build a pandas DataFrame from it and save it as a CSV or something:
import requests
from bs4 import BeautifulSoup
import pandas as pd
players = []
headers = {'User-Agent': 'Mozilla/5.0 '}
r = requests.get('https://www.pro-football-reference.com/players/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('ul.page_index li div a')
for link in data:
    players.append([link.get_text(strip=True), 'https://www.pro-football-reference.com' + link.get('href')])
print(players[0])
df = pd.DataFrame(players, columns=['Player name', 'Url'])
print(df.head())
df.to_csv('players.csv', index=False)
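A quick, optional sanity check that the CSV round-trips:
import pandas as pd
df = pd.read_csv('players.csv')
print(df.shape)   # rows, columns
print(df.head())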
I'm trying to scrape the data into a dictionary from this site:
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    items = soup.findAll("div", {"class": f"info{x}"})
    info.append(items)
However, the HTML tags are not being removed.
You need to use .text. Then, to get it in the form you want, you'd need to do a bit of string manipulation.
from bs4 import BeautifulSoup
import requests
from pprint import pprint
url = 'https://webscraper.io/'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    item = soup.find("div", {"class": "info%s" % x}).text.strip().replace('\n', ': ')
    info.append(item)
info = '\n'.join(info)
print (info)
Something like this might work? (Replace the webscraper.io URL with your actual request URL; also, you'd still need to clean up the \n characters from the output.)
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    items = soup.findAll("div", {"class": f"info{x}"})
    info += [item.text for item in items]
I.e., take item.text and concatenate the resulting list onto info.
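Since the question asked for a dictionary, here's one more hedged variation that keys each block's text by its infoN class (info1..info6 come from the question; the guard skips any block that's missing):
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = {}
for x in range(1, 7):
    div = soup.find("div", {"class": f"info{x}"})
    if div:  # skip classes that don't exist on the page
        info[f"info{x}"] = div.get_text(strip=True)
pprint(info)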
How can I scrape the yahoo earnings calendar to pull out the dates?
This is for python 3.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
The output is None.
Beautiful Soup has several find functions you can use to inspect the DOM; please refer to the documentation.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:  # cell without an aria-label attribute
        pass
print(Dates)
This might be off-topic, but since you want to get a table from a webpage, you might consider pandas, which does it in two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
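To pull just the dates out of that DataFrame (the column name mirrors the aria-label used in the answer above, but verify it against the actual table):
dates = earnings['Earnings Date'].tolist()
print(dates)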
Here are two succinct ways:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
#using nth-of-type to get column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
I would like to get the value 100 from the tag below using Python and Beautiful Soup.
<span style="font-size:90%"><b>100</b> <cite style="color:#cc0000"><b>-0.10</b> (0.52%)</cite></span>
The code below gives me the following output:
100 -0.10 (0.52%)
How can I extract only the value 100?
Code:
from urllib.request import Request, urlopen
import bs4
import re
url = 'url.com'
req = Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
page = urlopen(req).read()
soup = bs4.BeautifulSoup(page, 'html.parser')
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.text
You can get the first element of the tag's .contents:
from bs4 import BeautifulSoup as soup
d = soup(page, 'html.parser').find('span', {'style':'font-size:90%'}).contents[0].text
Output:
'100'
Just find the <b> tag; it will give you 100.
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.find('b').text
Little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class':'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me each record twice :V That's probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to select only one of them, i.e. find(class_='list_list') in:
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'})
Full code:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809