I'm trying to grab all prices from a website using XPath. All the prices have the same XPath, and only [0] (the 1st item, I assume) works... let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!
I tried changing the 0 in "[0].text" to 1, to print the 2nd item, but it returned "out of range".
Then I was trying to think of some for loop that would print all the items, so I could compute an average.
Any help would be greatly appreciated!
Apologies, I've edited in the full code:
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
# HEADERS = you'll need to add your own headers here; the site won't let me post them.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
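For what it's worth, the absolute path only ever matches one node: every step in it is indexed (div[1], li[1], ...), so the returned list has a single element and index 1 is out of range. A relative XPath keyed on a class returns every match at once. A minimal sketch of the loop-and-average idea, assuming Newegg still marks prices with a "price-current" class (verify the selector against the live markup):
import requests
from bs4 import BeautifulSoup
from lxml import etree
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
HEADERS = {"User-Agent": "Mozilla/5.0"}  # stand-in; use your own headers
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
# A relative, class-based XPath matches every price node, not just the first.
# "price-current" is an assumed class name; adjust to the real markup.
nodes = dom.xpath('//*[contains(@class, "price-current")]/strong')
prices = [float(node.text.replace(",", "")) for node in nodes]
print(prices)
if prices:
    print("average:", sum(prices) / len(prices))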
You could just use CSS selectors, which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
    # remove the offers info so only the actual price remains
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
And as a list of floats:
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
    if a := i.select_one('.price-current-num'):
        a.decompose()
    prices.append(float(re.sub(r'\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)
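Since the original goal was an average, a short follow-up on that float list:
average = sum(prices) / len(prices) if prices else 0.0
print(f"average price: ${average:.2f}")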
Just started learning Python (3.8), building a scraper to get some football stats. Here's the code so far.
I originally wanted to pull a div with id='div_alphabet', which is clearly in the HTML tree on the website, but for some reason bs4 wasn't pulling it in. I investigated further and noticed that when I pull in the parent div 'all_alphabet' and then look for all child divs, 'div_alphabet' is missing. The only weird thing about the HTML structure is the long block comment that sits right above 'div_alphabet'. Is this a potential issue?
https://www.pro-football-reference.com/players
import requests
from bs4 import BeautifulSoup
URL = 'https://www.pro-football-reference.com/'
homepage = requests.get(URL)
home_soup = BeautifulSoup(homepage.content, 'html.parser')
players_nav_URL = home_soup.find(id='header_players').a['href']
players_directory_page = requests.get(URL + players_nav_URL)
players_directory_soup = BeautifulSoup(players_directory_page.content, 'html.parser')
alphabet_nav = players_directory_soup.find(id='all_alphabet')
all_letters = alphabet_nav.find_all('div')
print(all_letters)
links = [a['href'] for a in players_directory_soup.select('ul.page_index li div a')]
names = [a.get_text() for a in players_directory_soup.select('ul.page_index li div a')]
This gives you lists of the relative links and names of all the alphabetised players.
I wouldn't concern yourself with div_alphabet; it doesn't hold any useful information. (That long block comment is very likely why it's missing: the site ships the div inside an HTML comment, and BeautifulSoup parses comments as text, not as elements.)
Here we are selecting the a tags inside the ul with class "page_index". select() returns a list, so we loop over it and grab the href attribute; get_text() gives you the names.
If you haven't come across list comprehensions, then this would also be acceptable:
links = []
for a in players_directory_soup.select('ul.page_index li div a'):
    links.append(a['href'])

names = []
for a in players_directory_soup.select('ul.page_index li div a'):
    names.append(a.get_text())
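As an aside: if you ever do need markup that a site ships inside an HTML comment (which is almost certainly why div_alphabet never shows up in the parsed tree), you can re-parse the comment text yourself. A rough sketch using bs4's Comment type, reusing players_directory_soup from the question:
from bs4 import BeautifulSoup, Comment
wrapper = players_directory_soup.find(id='all_alphabet')
# comments are parsed as text nodes, so pull them out and re-parse them
for c in wrapper.find_all(string=lambda s: isinstance(s, Comment)):
    hidden = BeautifulSoup(c, 'html.parser')
    div_alphabet = hidden.find(id='div_alphabet')
    if div_alphabet is not None:
        print(div_alphabet)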
Something like this code will do it:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 '}
r = requests.get('https://www.pro-football-reference.com/players/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('ul.page_index li div')
for link in data:
    print(*[f'{a.get("href")}\n' for a in link.select('a')])
A more useful way is to build a pandas DataFrame from it and save it as a CSV or something:
import requests
from bs4 import BeautifulSoup
import pandas as pd
players = []
headers = {'User-Agent': 'Mozilla/5.0 '}
r = requests.get('https://www.pro-football-reference.com/players/', headers=headers)
soup = BeautifulSoup(r.text, 'lxml')
data = soup.select('ul.page_index li div a')
for link in data:
    players.append([link.get_text(strip=True), 'https://www.pro-football-reference.com' + link.get('href')])
print(players[0])
df = pd.DataFrame(players, columns=['Player name', 'Url'])
print(df.head())
df.to_csv('players.csv', index=False)
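A quick, optional sanity check that the CSV round-trips:
import pandas as pd
df = pd.read_csv('players.csv')
print(df.shape)   # rows, columns
print(df.head())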
I'm trying to scrape the data into a dictionary from this site:
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    items = soup.findAll("div", {"class": f"info{x}"})
    info.append(items)
However, the HTML tags are not being removed.
You need to use .text. Then, to get it in the form you want, you'd need to do a bit of string manipulation.
from bs4 import BeautifulSoup
import requests
from pprint import pprint
url = 'https://webscraper.io/'
page = requests.get(url)
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    item = soup.find("div", {"class": "info%s" % x}).text.strip().replace('\n', ': ')
    info.append(item)
info = '\n'.join(info)
print (info)
Something like this might work? (Replace the webscraper.io URL with your actual request URL; also, you'd still need to clean up the \n characters from the output.)
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = []
for x in range(1, 7):
    items = soup.findAll("div", {"class": f"info{x}"})
    info += [item.text for item in items]
I.e., take item.text and concatenate the resulting list onto info.
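Since the question asked for a dictionary, here's one more hedged variation that keys each block's text by its infoN class (info1..info6 come from the question; the guard skips any block that's missing):
from bs4 import BeautifulSoup
import requests
from pprint import pprint
page = requests.get('https://webscraper.io/')
soup = BeautifulSoup(page.text, "lxml")
info = {}
for x in range(1, 7):
    div = soup.find("div", {"class": f"info{x}"})
    if div:  # skip classes that don't exist on the page
        info[f"info{x}"] = div.get_text(strip=True)
pprint(info)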
How can I scrape the yahoo earnings calendar to pull out the dates?
This is for python 3.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
The output is None.
Beautiful Soup has several find functions you can use to inspect the DOM; please refer to the documentation.
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
    try:
        if something['aria-label'] == "Earnings Date":
            Dates.append(something.text)
    except KeyError:  # cell without an aria-label attribute
        pass
print(Dates)
This might be off-topic, but since you want to get a table from a webpage, you might consider pandas, which does it in two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]
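To pull just the dates out of that DataFrame (the column name mirrors the aria-label used in the answer above, but verify it against the actual table):
dates = earnings['Earnings Date'].tolist()
print(dates)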
Here are two succinct ways:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
#using nth-of-type to get column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]
I would like to get the value 100 from the tag below using Python and Beautiful Soup.
<span style="font-size:90%"><b>100</b> <cite style="color:#cc0000"><b>-0.10</b> (0.52%)</cite></span>
The code below gives me the following output:
100 -0.10 (0.52%)
How can I extract only the value 100?
Code:
from urllib.request import Request, urlopen
import bs4
import re
url = 'url.com'
req = Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
page = urlopen(req).read()
soup = bs4.BeautifulSoup(page, 'html.parser')
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.text
You can get the first element of the tag's .contents:
from bs4 import BeautifulSoup as soup
d = soup(page, 'html.parser').find('span', {'style':'font-size:90%'}).contents[0].text
Output:
'100'
Just find the <b> tag; it will give you 100.
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.find('b').text
Little problem with BeautifulSoup:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find_all('div', attrs={'class':'fl'}):
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
And it gives me each record twice :V That's probably easy to solve :V
The same elements appear in two places on the page, so you have to use find()/find_all() to select only one of them, i.e. find(class_='list_list') in:
soup.find(class_='list_list').find_all('div', attrs={'class':'fl'})
Full code:
from bs4 import BeautifulSoup
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
req = requests.get(link)
web = req.text
soup = BeautifulSoup(web, "lxml")
cve_name = []
cve_link = []
for par_ in soup.find(class_='list_list').find_all('div', attrs={'class':'fl'}):
    print(len(par_))
    for link_ in par_.find_all('p'):
        for text_ in link_.find_all('a'):
            print(text_.string)
            print(text_['href'])
            print("==========")
            #cve_name.append(text_.string)
            #cve_link.append(text_['href'])
How about this? I used CSS selectors to do the same.
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
link = "http://www.cnnvd.org.cn/web/vulnerability/querylist.tag"
res = requests.get(link)
soup = BeautifulSoup(res.text, "lxml")
for item in soup.select('.fl p a'):
    print("Item: {}\nItem_link: {}".format(item.text, urljoin(link, item['href'])))
Partial Output:
Item: CNNVD-201712-811
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-811
Item: CNNVD-201712-810
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-810
Item: CNNVD-201712-809
Item_link: http://www.cnnvd.org.cn/web/xxk/ldxqById.tag?CNNVD=CNNVD-201712-809