I am having some issues web scraping information from a particular pages span class element, using the beautifulsoup and requests addon in python. It keeps returning me blank information: " ". Heres my code:
headers = {'User-Agent':'Mozilla/5.0'}
res = requests.get('https://www.theweathernetwork.com/ca/weather/ontario/toronto')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
weather_elem = soup.find('span', {'class':'wxcondition'})
weather = weather_elem
print(weather)
return weather`
The data is loaded through JavaScript so BeautifulSoup doesn't see anything. But you can simulate the Ajax with the requests module:
import json
import requests
from bs4 import BeautifulSoup
url = 'https://www.theweathernetwork.com/ca/weather/ontario/toronto'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
place_code = soup.select_one('link[rel="alternate"]')['href'].split('=')[-1].lower()
ajax_url = 'https://weatherapi.pelmorex.com/api/v1/observation/placecode/' + place_code
data = requests.get(ajax_url).json()
# uncomment to print all data:
# print(json.dumps(data, indent=4))
print(data['observation']['weatherCode']['text'])
Prints:
Partly cloudy
Related
I'm trying to grab all prices from a website, using the xpath. all prices have the same xpath, and only [0], or I assume the 1st item works... let me show you:
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[1]/div[5]/div/div/div/div[1]/ul/li[1]/article/div[1]/div[2]/div')[0].text)
This successfully prints the 1st price!!!
I tried changing "[0].text" to 1, to print the 2nd item but it returned "out of range".
Then I was trying to think of some For loop that would print All Items, so I could create an average.
Any help would be Greatly appreciated!!!
I apologize edited in is the code
from bs4 import BeautifulSoup
from lxml import etree
import requests
URL = "https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709"
#HEADERS = you'll need to add your own headers here, won't let post.
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "html.parser")
dom = etree.HTML(str(soup))
print(dom.xpath('/html/body/div[10]/div[4]/section/div/div/div[2]/div/div/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/ul/li[3]/strong')[0].text)
You could just use css selectors which, in this instance, are a lot more readable. I would also remove some of the offers info to leave just the actual price.
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = {}
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices[i.select_one('.item-title').text] = i.select_one('.price-current').get_text(strip=True)[:-1]
pprint(prices)
prices as list of floats
import requests, re
from bs4 import BeautifulSoup as bs
from pprint import pprint
r = requests.get("https://www.newegg.com/p/pl?d=GPU&N=601357247%20100007709", headers = {'User-Agent':'Mozilla/5.0'})
soup = bs(r.text, features="lxml")
prices = []
for i in soup.select('.item-container'):
if a:=i.select_one('.price-current-num'): a.decompose()
prices.append(float(re.sub('\$|,', '', i.select_one('.price-current').get_text(strip=True)[:-1])))
pprint(prices)
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt)
with the above code, i get the output:
[<*h1 class="celeb-name">Ayden Sng</h1*>] #asterisks added to show h1 tags
What do i need to change in my code or how can i make it such that i only get 'Ayden Sng' as my output?
Iterate over each entry of the txt list and extract its txt property:
txt = [element.text for element in txt] # ['Ayden Sng']
Repl.it
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
print(txt[0].text)
if there are more than one reuslt you can use this code:
from bs4 import BeautifulSoup
import requests
url = 'https://www.mediacorp.sg/en/your-mediacorp/our-artistes/tca/male-artistes/ayden-sng-12357686'
artiste_name = 'celeb-name'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
txt = soup.find_all('h1', attrs={'class':artiste_name})
for i in txt:
print(i.text)
I'm trying to use BeautifulSoup4 in Orange to scrape data from a list of URLs scraped from that same website.
I have managed to scraped the data from a single page when I set the URL manually.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zone-points.aspx?year=2021&zone=1§ion=1901"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
for child in rank.children:
print(url,child)
and I have been able to scrape the list of URLs I need
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
import csv
import re
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
rank = soup.find("table", class_="table-standings-body")
link = soup.find('div',class_='contentSection')
url_list = link.find('a').get('href')
for url_list in link.find_all('a'):
print (url_list.get('href'))
But so far I haven't been able to combine both to scrape the data from that URL list. Can I do that only by nesting for loops, and if so, how? Or how can I do it?
I am sorry if this is a stupid question, but I only started trying with Python and Web-Scraping yesterday and I have not been able to figure this by consulting similar-ish topics.
Try:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://data.ushja.org/awards-standings/zones.aspx?year=2021&zone=1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
# get all links
url_list = []
for a in soup.find("div", class_="contentSection").find_all("a"):
url_list.append(a["href"].replace("ยง", "§"))
# get all data from URLs
all_data = []
for url in url_list:
print(url)
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")
h2 = soup.h2
sub = h2.find_next("p")
for tr in soup.select("tr:has(td)"):
all_data.append(
[
h2.get_text(strip=True),
sub.get_text(strip=True),
*[td.get_text(strip=True) for td in tr.select("td")],
]
)
# save data to CSV
df = pd.DataFrame(
all_data,
columns=[
"title",
"sub_title",
"Rank",
"Horse / Owner",
"Points",
"Total Comps",
],
)
print(df)
df.to_csv("data.csv", index=None)
This traverses all URLs and saves all data to data.csv (screenshot from LibreOffice):
I am trying to get all the article links in a given website below.
However, my code does not print anything at all although I specified the class id and the path to it.
below is my code.
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.request import urlopen
html = urlopen("https://uynaa.wordpress.com/category/%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb/").read()
soup = BeautifulSoup(html, "lxml")
productDivs = soup.findAll('div', attrs={'class' : 'post type-post status-publish format-standard hentry category-56456384'})
for div in productDivs:
print(div.find('h2')[a]['href'])
How do I fetch all the links?
The links are loaded dynamically via JavaScript from external URL. You can use this example to print all links:
import json
import requests
from bs4 import BeautifulSoup
data = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'
page = 1
while True:
data['page'] = page
data = requests.post(api_url, data=data).json()
# uncomment next line to print all data:
# print(json.dumps(data, indent=4))
for p in data['postflair']:
print(p)
if data['lastbatch']:
break
page += 1
Prints:
https://uynaa.wordpress.com/2014/01/02/2013-in-review/
https://uynaa.wordpress.com/2013/10/07/%d0%b0%d1%84%d0%b3%d0%b0%d0%bd%d0%b8%d1%81%d1%82%d0%b0%d0%bd-%d0%b0%d0%bd%d1%85%d0%b4%d0%b0%d0%b3%d1%87-%d1%88%d0%b0%d0%bb%d1%82%d0%b3%d0%b0%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d0%b5-%d0%ba%d0%b0%d1%81%d0%bf%d0%b5%d1%80%d1%81%d0%ba%d0%b8%d0%b9-%d0%b1%d0%b8-%d0%b4%d0%b0%d1%80%d0%b0%d0%bd%d0%b3%d1%83%d0%b9%d0%bb%d0%b0%d0%bb-%d1%82%d0%be%d0%b3%d1%82%d0%be%d0%be-%d0%b3%d1%8d/
https://uynaa.wordpress.com/2013/10/07/%d1%88%d0%b0%d0%bd%d1%85%d0%b0%d0%b9-%d0%bd%d0%be%d0%b3%d0%be%d0%be%d0%bd/
https://uynaa.wordpress.com/2013/10/07/%d1%8d%d0%bd%d1%8d-%d0%b3%d0%b0%d0%b7%d0%b0%d1%80-%d0%bc%d0%b0%d0%bd%d0%b0%d0%b9%d1%85-%d0%b1%d0%b0%d0%b9%d1%81%d0%b0%d0%bd-%d1%8e%d0%bc/
https://uynaa.wordpress.com/2013/10/07/500-%d0%b6%d0%b8%d0%bb-%d0%b0%d1%80%d1%87%d0%bb%d1%83%d1%83%d0%bb%d0%b0%d0%b0%d0%b3%d2%af%d0%b9-%d0%b4%d1%8d%d0%bb%d1%85%d0%b8%d0%b9%d0%bd-%d1%86%d0%be%d1%80%d1%8b%d0%bd-%d0%b3%d0%b0%d0%bd%d1%86/
https://uynaa.wordpress.com/2013/02/01/%d1%83%d0%bb%d0%b7-%d0%bd%d1%83%d1%82%d0%b3%d0%b8%d0%b9%d0%bd-%d0%bf%d0%b8%d1%84%d0%b0%d0%b3%d0%be%d1%80/
https://uynaa.wordpress.com/2013/01/21/%d1%82%d0%b5%d0%bb%d0%b5%d0%b2%d0%b8%d0%b7%d0%b8%d0%b9%d0%bd-%d1%82%d2%af%d2%af%d1%85%d1%8d%d0%bd-%d0%b4%d1%8d%d1%85-%d1%85%d0%b0%d0%bc%d0%b3%d0%b8%d0%b9%d0%bd-%d0%b3%d0%b0%d0%b6%d0%b8%d0%b3-%d1%88/
https://uynaa.wordpress.com/2013/01/18/%d0%b0%d0%bf%d0%be%d1%84%d0%b8%d1%81-%d0%be%d0%be%d1%81-%d2%af%d2%af%d0%b4%d1%8d%d0%bd-%d3%a9%d1%80%d0%bd%d3%a9%d1%85-%d0%b6%d2%af%d0%b6%d0%b8%d0%b3/
https://uynaa.wordpress.com/2013/01/17/%d0%b0%d1%80%d0%b8%d1%83%d0%bd%d1%82%d0%bd%d1%8b-%d0%bd%d1%83%d1%82%d0%b0%d0%b3-%d0%b8%d0%b9%d0%b3-%d1%8d%d0%b7%d1%8d%d0%b3%d0%bd%d1%8d%d1%85-%d1%85%d0%b0%d0%bd/
https://uynaa.wordpress.com/2013/01/15/%d1%81%d0%b0%d1%83%d0%b4%d1%8b%d0%bd-%d1%82%d0%b0%d0%b3%d0%bd%d1%83%d1%83%d0%bb%d1%87%d0%b8%d0%b4-%d0%b0%d1%81%d0%b0%d0%b4%d1%8b%d0%b3-%d0%be%d0%bb%d0%b6%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/15/%d0%bc%d0%b0%d0%bb%d0%b8%d0%b3%d1%8d%d1%8d%d1%81-%d1%81%d0%be%d0%bc%d0%b0%d0%bb%d0%b8-%d1%85%d2%af%d1%80%d1%82%d1%8d%d0%bb/
https://uynaa.wordpress.com/2013/01/10/%d1%85%d0%be%d1%80%d0%b2%d0%be%d0%be-%d0%b5%d1%80%d1%82%d3%a9%d0%bd%d1%86-%d1%85%d0%b0%d0%bb%d0%b0%d0%b0%d1%81%d0%b0%d0%bd%d0%b4-%d0%b1%d0%b0%d0%b3%d1%82%d0%b0%d0%bd%d0%b0/
https://uynaa.wordpress.com/2013/01/10/%d1%82%d0%b0%d0%bd%d0%b3%d0%b0%d1%80%d0%b0%d0%b3-%d3%a9%d1%80%d0%b3%d3%a9%d1%85-%d1%91%d1%81%d0%bb%d0%be%d0%bb-%d1%85%d2%af%d0%bb%d1%8d%d1%8d%d0%b6-%d0%b1%d0%b0%d0%b9%d0%b3-%d1%8d%d1%8d/
https://uynaa.wordpress.com/2013/01/09/%d0%b1%d0%be%d0%bb%d0%bb%d0%b8%d0%b2%d1%83%d0%b4%d1%8b%d0%bd-%d0%ba%d0%b8%d0%bd%d0%be%d0%bd%d0%be%d0%be%d1%81-%d1%87-%d0%b0%d0%b9%d0%bc%d0%b0%d0%b0%d1%80/
https://uynaa.wordpress.com/2013/01/08/%d0%bf%d0%b5%d0%bd%d1%82%d0%b0%d0%b3%d0%be%d0%bd-%d0%b1%d0%be%d0%bb%d0%be%d0%bd-%d1%82%d1%82%d0%b3-%d1%8b%d0%b3-%d1%83%d0%b4%d0%b8%d1%80%d0%b4%d0%b0%d1%85-%d0%bc%d0%b0%d0%b3%d0%b0%d0%b4%d0%bb%d0%b0/
https://uynaa.wordpress.com/2013/01/07/%d0%b7%d0%b8%d0%b0%d0%b4-%d1%82%d0%b0%d0%ba%d0%b8%d0%b5%d0%b4%d0%b4%d0%b8%d0%bd/
...and so on.
EDIT: To filter the links only to specified category, you can use this script:
import json
import requests
from bs4 import BeautifulSoup
data = {'action': 'infinite_scroll', 'page': 1}
api_url = 'https://uynaa.wordpress.com/?infinity=scrolling'
all_links = []
page = 1
while True:
data['page'] = page
data = requests.post(api_url, data=data).json()
# uncomment next line to print all data:
# print(json.dumps(data, indent=4))
soup = BeautifulSoup(data['html'], 'html.parser')
for p in soup.select('.post'):
if any('%d0%be%d1%80%d1%87%d1%83%d1%83%d0%bb%d0%b3%d1%8b%d0%bd-%d0%bd%d0%b8%d0%b9%d1%82%d0%bb%d1%8d%d0%bb' in cat['href'] for cat in p.select('[rel="category tag"]')):
if p.h2.a['href'] not in all_links:
print(p.h2.a['href'])
all_links.append(p.h2.a['href'])
if data['lastbatch']:
break
page += 1
print(len(all_links))
Prints 135 links:
...
https://uynaa.wordpress.com/2011/05/13/%e2%80%9c%d1%83%d1%85%d0%b0%d0%b0%d0%bd-%d0%bc%d1%83%d1%83%d1%82%d0%bd%d1%83%d1%83%d0%b4%d1%8b%d0%bd-%d2%af%d0%b5%e2%80%9d/
https://uynaa.wordpress.com/2011/05/04/%d2%af%d1%85%d0%bb%d0%b8%d0%b9%d0%bd-%d1%82%d0%be%d0%b3%d0%bb%d0%be%d0%be%d0%bc/
https://uynaa.wordpress.com/2011/05/04/%d0%be%d1%81%d0%b0%d0%bc%d0%b0-%d0%b1%d0%b8%d0%bd-%d0%bb%d0%b0%d0%b4%d0%b5%d0%bd%d0%b8%d0%b9%d0%b3-%d1%8f%d0%b0%d0%b6-%d0%b8%d0%bb%d1%80%d2%af%d2%af%d0%bb%d1%81%d1%8d%d0%bd-%d0%b1%d1%8d/
135
Not sure why your codes don't work. For me, I used the below codes to get all the links first.
list_href = []
a_tags = soup.find_all('a')
for tag in a_tags:
list_href.append(tag.get('href'))
The links of the articles are in list_href[5:26].
I am trying to scrape the links from the "box score" button on this page. The button is supposed to look like this
http://www.espn.com/nfl/boxscore?gameId=400874795
I tried to use this code to see if I could access the buttons but I cannot.
from bs4 import BeautifulSoup
import requests
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
advanced = url
r = requests.get(advanced)
data = r.text
soup = BeautifulSoup(data,"html.parser")
for link in soup.find_all('a'):
print link
As wpercy mentions in his comment, you can't do this using requests, as a suggestion you should use selenium together with Chromedriver/PhantomJS for handling the JavaScript:
from selenium import webdriver
from bs4 import BeautifulSoup
url = "http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2"
browser = webdriver.Chrome()
browser.get(url)
html = browser.page_source
soup = BeautifulSoup(html,'html.parser')
boxList = soup.findAll('a',{'name':'&lpos=nfl:scoreboard:boxscore'})
All score buttons's a tag have the attribute name = &lpos=nfl:scoreboard:boxscore, so we first use .findAll and now a simple list comprehension can extract each href attribute:
>>> links = [box['href'] for box in boxList]
>>> links
['/nfl/boxscore?gameId=400874795', '/nfl/boxscore?gameId=400874854', '/nfl/boxscore?gameId=400874753', '/nfl/boxscore?gameId=400874757', '/nfl/boxscore?gameId=400874772', '/nfl/boxscore?gameId=400874777', '/nfl/boxscore?gameId=400874767', '/nfl/boxscore?gameId=400874812', '/nfl/boxscore?gameId=400874761', '/nfl/boxscore?gameId=400874764', '/nfl/boxscore?gameId=400874781', '/nfl/boxscore?gameId=400874796', '/nfl/boxscore?gameId=400874750', '/nfl/boxscore?gameId=400873867', '/nfl/boxscore?gameId=400874775', '/nfl/boxscore?gameId=400874798']
here is the solution i did , and it scrapes all the link which are there on the url you have provided in your answer . you can check it out
# from BeautifulSoup import *
from bs4 import BeautifulSoup
# import requests
import urllib
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
# advanced = url
html = urllib.urlopen(url).read()
# r = requests.get(html)
# data = r.text
soup = BeautifulSoup(html)
tags = soup('a')
# for link in soup.find_all('a'):
for i,tag in enumerate(tags):
# print tag;
print i;
ans = tag.get('href',None)
print ans;
print "\n";
The answer from Gopal Chitalia didn't work for me, so I decided to post the working one (for python 3.6.5)
# from BeautifulSoup import *
from bs4 import BeautifulSoup
# import requests
import urllib
url = 'http://www.espn.com/nfl/scoreboard/_/year/2016/seasontype/1/week/2'
# advanced = url
html = urllib.request.urlopen(url)
# urlopen(url).read()
# r = requests.get(html)
# data = r.text
soup = BeautifulSoup(html)
tags = soup('a')
# for link in soup.find_all('a'):
for i,tag in enumerate(tags):
# print tag;
print (i);
ans = tag.get('href',None)
print (ans);
print ("\n");