Beautiful Soup get value from page which updates daily

Beautiful Soup get value from page which updates daily - python

I am trying to get a single value from website which will update daily.
I am trying to get latest price in Vijayawada in page
below is my code but I am getting empty space as output but expecting 440 as output.
import requests
from bs4 import BeautifulSoup
import csv
res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')
price = soup.select("Vijayawada")
Looking to get value: 440 [Which is today value] any suggestions?

One approach could be to select the tr by its text and iterate over its strings to pick the last one that isdigit():
[x for x in soup.select_one('tr:-soup-contains("Vijayawada")').stripped_strings if x.isdigit()][-1]
Example
import requests
from bs4 import BeautifulSoup
res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')
price = [x for x in soup.select_one('tr:-soup-contains("Vijayawada")').stripped_strings if x.isdigit()][-1]
print(price)
Another one is to pick the element by day:
import requests
from bs4 import BeautifulSoup
import datetime
d = datetime.datetime.now().strftime("%d")
res = requests.get('https://www.e2necc.com/home/eggprice')
soup = BeautifulSoup(res.content, 'html.parser')
soup.select('tr:-soup-contains("Vijayawada") td')[int(d)].text
Output
440

Related

lxml to grab All items that share a certain xpath

I'm trying to grab all prices from a website, using the xpath. all prices have the same xpath, and only [0], or I assume the 1st item works... let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!!!
I tried changing "[0].text" to 1, to print the 2nd item but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize edited in is the code
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)

You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
prices as list of floats
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices.append(float(re.sub('\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)

BS4 + html, b Tag issue

This question is about web scraping with bs4
this is the code I have written:
import requests
from bs4 import BeautifulSoup
import json
import csv
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')
#extract product score **(This is what I want to extract)**
stars = soup.select_one('a[class="score-lite"]', namespaces=None, flags=0)
#score = json.loads(stars)
print('Stars', stars)
My outcome:
<a class="score-lite" data-spm-click="gostr=/details.index.reviewLevel;locaid=dreviewLevel" href="https://onuliss.en.alibaba.com/company_profile/feedback.html" target="_blank"><b>4.8 </b><img src="//img.alicdn.com/tfs/TB1MJPmiQL0gK0jSZFtXXXQCXXa-8-9.svg"/></a>
The outcome I want is just the 4.8 number between the 'b' tags
What do I have to do with the = soup.select_one() function?
Thank you very much :)

Try with a more specific selector, the string property of the match and strip() to get rid of eventual extra spaces.
import requests
from bs4 import BeautifulSoup
import json
import csv
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a BeautifulSoup object
soup = BeautifulSoup(page.text, 'html.parser')
#extract product score **(This is what I want to extract)**
stars = soup.select_one('a[class="score-lite"] > b', namespaces=None, flags=0).get_text(strip=True)
#score = json.loads(stars)
print('Stars', stars)
Stars 4.8

how about SimplifiedDoc
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc
page = requests.get('https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
# Create a SimplifiedDoc object
doc = SimplifiedDoc(page.text)
# get element use tag and class
stars = doc.getElement('a','class',"score-lite")
print('Stars', stars.text, stars.b.text) # Stars 4.8 4.8

import requests
from bs4 import BeautifulSoup
r = requests.get(
'https://www.alibaba.com/product-detail/Portable-Small-USB-Travel-LED-Makeup_60830030133.html?spm=a2700.details.maylikever.2.1fb53cc2uSVPvx')
soup = BeautifulSoup(r.text, 'html.parser')
if r.status_code == 200:
item = soup.find('a', {'class': 'score-lite'}).find('b')
print(item.get_text(strip=True))
output:
4.8

How to scrape the yahoo earnings calendar with beautifulsoup

How can I scrape the yahoo earnings calendar to pull out the dates?
This is for python 3.
from bs4 import BeautifulSoup as soup
import urllib
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
the output is "None"

Beautiful Soup has some find functions that you can use to inspect the DOM , please refer to the documentation
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
try:
if something['aria-label'] == "Earnings Date":
Dates.append(something.text)
except:
print('')
print(Dates)

Might be off-topic but since you want to get a table from a webpage, you might consider using pandas which works with two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]

Here are two succinct ways
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
#using nth-of-type to get column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]

Get a <span> value using python web scrape

I am trying to get a product price using BeautifulSoup in python.
But i keep getting erroes, no matter what I try.
The picture of the site i am trying to web scrape
I want to get the 19,90 value.
I have already done a code to get all the product names, and now need their prices.
import requests
from bs4 import BeautifulSoup
url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
price = soup.find('span', itemprop_='price')
print(price)

Less ideal is parsing out the JSON containing the prices
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.zattini.com.br/busca?nsCat=Natural&q=amaro&searchTermCapitalized=Amaro&page=1'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'lxml')
scripts = [script.text for script in soup.select('script') if 'var freedom = freedom ||' in script.text]
pricesJson = scripts[0].split('"items":')[1].split(']')[0] + ']'
prices = [item['price'] for item in json.loads(pricesJson)]
names = [name.text for name in soup.select('#item-list [itemprop=name]')]
results = list(zip(names,prices))
df = pd.DataFrame(results)
print(df)
Sample output:

span[itemprop='price'] is generated by javascript. Original value stored in div[data-final-price] with value like 1990 and you can format it to 19,90 with Regex.
import re
...
soup = BeautifulSoup(page.text, 'html.parser')
prices = soup.select('div[data-final-price]')
for price in prices:
price = re.sub(r'(\d\d$)', r',\1', price['data-final-price'])
print(price)
Results:
19,90
134,89
29,90
119,90
104,90
59,90
....

Removing UTF 8 encoding in python

I tried scraping the webpage for Passengers & Cargo data. I couldn't convert them into normal data, and web encoding seems to be the challenge.
The Code I used is:
from __future__ import print_function
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib
url = "https://www.faa.gov/data_research/passengers_cargo/unruly_passengers/"
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find_all("tbody")
for link in links:
print(link.text)
Output1
This prints in the format Year and Total. But when I append it to a list, the encoding ruins the data. You can see that in Output1
names = []
for link in links:
names.append(link.text)
names = map(lambda x: x.strip().encode('ascii'), names)
print(names)
Output2
The desired output should be Years and Total for me to perform analyses

You can use find_all tr and td like this:
import requests
from bs4 import BeautifulSoup
import urllib
url = "https://www.faa.gov/data_research/passengers_cargo/unruly_passengers/"
r = requests.get(url)
soup = BeautifulSoup(r.content)
links = soup.find_all("tr")
data = []
for link in links:
tds = link.find_all('td')
if tds:
data.append({'year':tds[0].text,'total':tds[1].text})
print(data)
It's worked.
Hope it helps you

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Beautiful Soup get value from page which updates daily - python

Related

lxml to grab All items that share a certain xpath

BS4 + html, b Tag issue

How to scrape the yahoo earnings calendar with beautifulsoup

Get a <span> value using python web scrape

Removing UTF 8 encoding in python

Categories

Resources