data scraping - field value - issue

data scraping - field value - issue - python

I like to obtain actual informaction about the numbers of infected from this website: https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2
my code looks like:
import requests
from bs4 import BeautifulSoup
adresURL = 'https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2'
res = requests.get(adresURL)
soup = BeautifulSoup(res.text, 'html.parser')
data = soup.select('.details-property-value')
print(data)
as a result I'm receiving:
[<div class="details-property-value" tabindex="0">{{selectedRecord[commonColumns[index]] || '-'}}</div>]
Any ideas how to get value of fields ? Am i missing sth ?

I'm guessing you're trying to scrape the table on that page. It looks like there is some JSON baked into the HTML:
import requests
from bs4 import BeautifulSoup
import json
url = "https://www.gov.pl/web/koronawirus/wykaz-zarazen-koronawirusem-sars-cov-2"
response = requests.get(url)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
data = json.loads(soup.find("pre", {"id": "registerData"}).text)
print(data)

Related

lxml to grab All items that share a certain xpath

I'm trying to grab all prices from a website, using the xpath. all prices have the same xpath, and only [0], or I assume the 1st item works... let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!!!
I tried changing "[0].text" to 1, to print the 2nd item but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize edited in is the code
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)

You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
prices as list of floats
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices.append(float(re.sub('\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)

How do I scrape data from URLs in a python-scraped list of URLs?

I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scraped the data from a single page when I set the URL manually.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1&section=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
print(url,child)
and I have been able to scrape the list of URLs I need
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div',class_='contentSection')
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
print (url_list.get('href'))
But so far I haven't been able to combine both to scrape the data from that URL list. Can I do that only by nesting for loops, and if so, how? Or how can I do it?
I am sorry if this is a stupid question, but I only started trying with Python and Web-Scraping yesterday and I have not been able to figure this by consulting similar-ish topics.

Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
# get all links
url_list = []
for a in soup.find("div", class_="contentSection").find_all("a"):
url_list.append(a["href"].replace("§", "&sect"))
# get all data from URLs
all_data = []
for url in url_list:
print(url)
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
h2 = soup.h2
sub = h2.find_next("p")
for tr in soup.select("tr:has(td)"):
all_data.append(
[
h2.get_text(strip=True),
sub.get_text(strip=True),
*[td.get_text(strip=True) for td in tr.select("td")],
]
)
# save data to CSV
df = pd.DataFrame(
all_data,
columns=[
"title",
"sub_title",
"Rank",
"Horse / Owner",
"Points",
"Total Comps",
],
)
print(df)
df.to_csv("data.csv", index=None)
This traverses all URLs and saves all data to data.csv (screenshot from LibreOffice):

BeautifulSoup: Reading Span Class Elements

I am having some issues web scraping information from a particular pages span class element, using the beautifulsoup and requests addon in python. It keeps returning me blank information: " ". Heres my code:
headers = {'User-Agent':'Mozilla/5.0'}
res = requests.get('https://www.theweathernetwork.com/ca/weather/ontario/toronto')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
weather_elem = soup.find('span', {'class':'wxcondition'})
weather = weather_elem
print(weather)
return weather`

The data is loaded through JavaScript so BeautifulSoup doesn't see anything. But you can simulate the Ajax with the requests module:
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.theweathernetwork.com/ca/weather/ontario/toronto'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
place_code = soup.select_one('link[rel="alternate"]')['href'].split('=')[-1].lower()
ajax_url = 'https://weatherapi.pelmorex.com/api/v1/observation/placecode/' + place_code
data = requests.get(ajax_url).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
print(data['observation']['weatherCode']['text'])
Prints:
Partly cloudy

How to scrape the yahoo earnings calendar with beautifulsoup

How can I scrape the yahoo earnings calendar to pull out the dates?
This is for python 3.
from bs4 import BeautifulSoup as soup
import urllib
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find('p')
print(table)
the output is "None"

Beautiful Soup has some find functions that you can use to inspect the DOM , please refer to the documentation
from bs4 import BeautifulSoup as soup
import urllib.request
url = 'https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm'
response = urllib.request.urlopen(url)
html = response.read()
page_soup = soup(html,'lxml')
table = page_soup.find_all('td')
Dates = []
for something in table:
try:
if something['aria-label'] == "Earnings Date":
Dates.append(something.text)
except:
print('')
print(Dates)

Might be off-topic but since you want to get a table from a webpage, you might consider using pandas which works with two lines:
import pandas as pd
earnings = pd.read_html('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm')[0]

Here are two succinct ways
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://finance.yahoo.com/calendar/earnings?day=2019-06-13&symbol=ibm&guccounter=1')
soup = bs(r.content, 'lxml')
# using attribute = value selector
dates = [td.text for td in soup.select('[aria-label="Earnings Date"]')]
#using nth-of-type to get column
dates = [td.text for td in soup.select('#cal-res-table td:nth-of-type(3)')]

Get data from HTML page using python

I would like to get the value 100 from the tag below using python and beautiful soup
<span style="font-size:90%"><b>100</b> <cite style="color:#cc0000"><b>-0.10</b> (0.52%)</cite></span>
The code below gives me the following output
100 -0.10 (0.52%)
How can I extract only the value 100?
Code:
from urllib.request import Request, urlopen
import bs4
import re
url = 'url.com'
req = Request(url, headers = {'User-Agent': 'Mozilla/5.0'})
page = urlopen(req).read()
soup = bs4.BeautifulSoup(page, 'html.parser')
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.text

You can get the first element of soup.contents:
from bs4 import BeautifulSoup as soup
d = soup(page, 'html.parser').find('span', {'style':'font-size:90%'}).contents[0].text
Output:
'100'

Just Find the <b> tag it will give you 100.
data = soup.find('span',style=re.compile('font-size:90%'))
value = data.find('b').text

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

data scraping - field value - issue - python

Related

lxml to grab All items that share a certain xpath

How do I scrape data from URLs in a python-scraped list of URLs?

BeautifulSoup: Reading Span Class Elements

How to scrape the yahoo earnings calendar with beautifulsoup

Get data from HTML page using python

Categories

Resources