I'm trying to scrape the search result on this link: https://www.inecnigeria.org/elections/polling-units/ which requires that I select a dropdown value and then another shows up which I have to select from before searching. I am able to get the values from the first dropdown selection but not from the others. Here's what I have currently:
from bs4 import BeautifulSoup
import requests
# Download the polling-unit search page and read both dropdowns from the
# HTML exactly as the server returned it (no JavaScript is executed here).
base = 'https://www.inecnigeria.org/elections/polling-units/'
base_req = requests.get(base, verify=False)
soup = BeautifulSoup(base_req.text, "html.parser")

# State dropdown: keep the label of every <option> carrying a value attribute.
states = soup.find('select', id="statePoll")
stateItems = states.select('option[value]')
stateValues = [option.text for option in stateItems]

# LGA dropdown: same extraction; the <select> element itself is printed so
# whatever options it holds in the static markup can be inspected.
lgas = soup.find('select', id="lgaPoll")
lgaItems = lgas.select('option[value]')
lgaValues = [option.text for option in lgaItems]
print(lgas)
Indeed you can't get those values by scraping the HTML on that page. The page uses JavaScript to request the options from another page and dynamically insert them into the page. You will have to use the information you can scrape to make such requests yourself. Here is an example of how to get the next step that should show you the general idea:
from bs4 import BeautifulSoup
import requests
# Page holding the state dropdown, and the endpoint the page's JavaScript
# POSTs a state_id to in order to obtain that state's LGA options.
base = 'https://www.inecnigeria.org/elections/polling-units/'
lga_view = 'https://www.inecnigeria.org/wp-content/themes/independent-national-electoral-commission/custom/views/lgaView.php'

# One Session re-uses the connection for the page request and every
# per-state POST instead of opening a new connection each time.
# NOTE: verify=False (kept from the original) disables TLS certificate
# checking; remove it if the site's certificate chain validates.
with requests.Session() as session:
    base_req = session.get(base, verify=False, timeout=30)
    base_req.raise_for_status()
    soup = BeautifulSoup(base_req.text, "html.parser")

    state_select = soup.find('select', id="statePoll")
    state_options = state_select.find_all('option')

    # Map state name -> numeric id.  Options without a numeric value (for
    # example a "select a state" placeholder) are skipped rather than
    # crashing int().
    states = {
        opt.text: int(opt['value'])
        for opt in state_options
        if (opt.get('value') or '').strip().isdigit()
    }

    # Map state name -> decoded JSON returned by the LGA endpoint.
    lga = {}
    for state_name, state_id in states.items():
        lga_resp = session.post(
            lga_view, data={'state_id': state_id}, verify=False, timeout=30
        )
        lga_resp.raise_for_status()
        lga[state_name] = lga_resp.json()

print(lga)
Related
I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- code that (somewhat) successfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# Fetch one player summary page and print every <p> tag found inside the
# page's "main-container" div.
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')

# The original passed attrs={'class', 'main-container'}, which is a *set*,
# not a dict; it only matched because BeautifulSoup treats a non-dict attrs
# value as a list of class names.  class_= states the intent directly.
container = soup.find('div', class_='main-container')

# Fall back to an empty list when the container is absent instead of
# raising AttributeError on None.
playerinfo = container.find_all('p') if container is not None else []
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
    """Print and return the href of every anchor on the page at *url*.

    Each href is printed on its own line, followed by the number of hrefs
    found.  The list is also returned so callers can process it further
    (the original returned nothing).

    Anchors without an ``href`` attribute are skipped; the original
    collected ``None`` for them.
    """
    website = requests.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = BeautifulSoup(website.text, 'html.parser')

    links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    for link in links:
        print(link)
    print(len(links))
    return links


get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
    """Print and return the text of every <p> inside the main container.

    The original looped over ``<a>`` tags and called ``.get('p')``, which
    looks up an HTML *attribute* named ``p`` and therefore always produced
    ``None``.  The paragraphs themselves are what is wanted, so this version
    iterates the ``<p>`` tags and extracts their text.

    Returns a list of paragraph strings; the list is empty when the page
    has no ``div.main-container``.
    """
    req = requests.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')

    # class_= replaces attrs={'class', 'main-container'}, which was a set
    # rather than the dict BeautifulSoup expects.
    container = soup.find('div', class_='main-container')
    if container is None:
        return []

    profiles = [p.get_text(' ', strip=True) for p in container.find_all('p')]
    for profile in profiles:
        print(profile)
    return profiles


get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
You need to combine your two scripts together and make a request for each player. Try the following approach. This searches for <td> tags that have the data-th=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
    """Collect the profile-box fields of every player listed at *url*.

    Each ``<td data-th="Player">`` cell on the listing page supplies a
    player name and a relative URL; the player page is then fetched and
    every ``<p>`` of its ``div.profile-box`` that looks like ``key: value``
    is stored in that player's row.

    Returns a list of dicts, each holding at least ``"Name"`` and ``"URL"``.
    """
    data = []

    # A single Session re-uses the connection across the many player requests.
    with requests.Session() as session:
        req_url = session.get(url)
        soup = BeautifulSoup(req_url.content, "html.parser")

        for td in soup.find_all('td', {'data-th': 'Player'}):
            a_tag = td.a
            # Skip cells that carry no link instead of failing on None.
            if a_tag is None or not a_tag.get('href'):
                continue

            name = a_tag.text
            player_url = a_tag['href']
            print(f"Getting {name}")

            req_player_url = session.get(f"https://basketball.realgm.com{player_url}")
            soup_player = BeautifulSoup(req_player_url.content, "html.parser")
            div_profile_box = soup_player.find("div", class_="profile-box")

            row = {"Name": name, "URL": player_url}

            # A page without a profile box still yields a Name/URL row.
            paragraphs = div_profile_box.find_all("p") if div_profile_box is not None else []
            for p in paragraphs:
                # partition() replaces the bare "except: pass": paragraphs
                # without a colon are skipped explicitly, and unrelated
                # errors are no longer swallowed.
                key, sep, value = p.get_text(strip=True).partition(':')
                if sep:
                    row[key.strip()] = value.strip()

            data.append(row)

    return data


urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)

    for entry in data:
        print(entry)
I am trying to scrape news from Reuters, but there is a "click to view more" button at the bottom of the page. I don't know how to load the hidden results using Beautiful Soup.
from bs4 import BeautifulSoup
import urllib.request
def scrape_reuters_news(ticker):
    """Return ``(titles, articles)`` for the past week's Reuters search hits.

    The search page for *ticker* is fetched, every ``<h3>`` containing a
    link is followed, and for each article page the ``<h1>`` text and the
    concatenated ``<p>`` texts are collected.

    Changes from the original:
    * the article URL comes from the anchor's ``href`` instead of slicing
      fixed character offsets out of the tag's HTML string;
    * *ticker* is URL-encoded, so values with spaces or ``&`` are safe;
    * responses are closed via ``with``;
    * headings without a link and pages without an ``<h1>`` are skipped
      instead of raising.  The original's ``select("h1", {...})`` passed the
      class dict as ``namespaces``, so the class filter never applied;
      looking up the first ``<h1>`` is the same effective behaviour.

    Returns two parallel lists: article titles and article bodies.
    """
    from urllib.parse import quote_plus, urljoin

    base_url = "https://www.reuters.com/"
    search_url = (
        base_url
        + "search/news?sortBy=relevance&dateRange=pastWeek&blob="
        + quote_plus(ticker)
    )
    with urllib.request.urlopen(search_url) as response:
        parsed_articles = BeautifulSoup(response.read(), 'lxml')

    articles = []
    titles = []
    for heading in parsed_articles.find_all("h3"):
        anchor = heading.find("a", href=True)
        if anchor is None:
            continue

        # urljoin copes with both absolute and site-relative hrefs.
        article_url = urljoin(base_url, anchor["href"])
        with urllib.request.urlopen(article_url) as response:
            parsed_article = BeautifulSoup(response.read(), 'lxml')

        title_tag = parsed_article.find("h1")
        if title_tag is None:
            continue

        titles.append(title_tag.text.strip())
        # Same output format as before: every paragraph followed by a space.
        articles.append("".join(p.text + " " for p in parsed_article.find_all("p")))

    return titles, articles


# edit
ticker = "apple"
news = scrape_reuters_news(ticker)
When you click the load more a callback is issued that you can find in the network tab. If you grab the number of results from the search page, you can add this into the callback to get all results in one go. I then use regex to extract the id to reconstruct each detail page url and the title (headline)
You would then visit each link to get the paragraph info.
Please note:
There is some de-duplication work to do. There exist different ids which lead to same content. So perhaps exclude based on title?
You may need to consider whether any pre-processing of ticker needs to happen e.g. convert to lowercase, replace spaces with "-". I don't know all your use cases.
from bs4 import BeautifulSoup as bs
import requests, re
ticker = 'apple'

# Patterns that pull the article id and the headline out of the text
# returned by the "load more" callback.
p = re.compile(r'id: "(.*?)"')
p2 = re.compile(r'headline: "(.*?)"')

with requests.Session() as s:
    # Step 1: the ordinary search page supplies the total number of results.
    r = s.get(f'https://www.reuters.com/search/news?sortBy=relevance&dateRange=pastWeek&blob={ticker}')
    soup = bs(r.content, 'lxml')
    num_results = soup.select_one('.search-result-count-num').text

    # Step 2: ask the callback endpoint for that many results in one request.
    r = s.get(f'https://www.reuters.com/assets/searchArticleLoadMoreJson?blob={ticker}&bigOrSmall=big&articleWithBlog=true&sortBy=relevance&dateRange=pastWeek&numResultsToShow={num_results}&pn=&callback=addMoreNewsResults')

payload = r.text

# Rebuild each detail-page URL from its id.
links = [f'https://www.reuters.com/article/id{i}' for i in p.findall(payload)]

# Headlines may contain markup, so each is run through the parser to get
# plain text.
headlines = []
for raw_headline in p2.findall(payload):
    headlines.append(bs(raw_headline, 'lxml').get_text())

print(len(links), len(headlines))
From the detail pages you can get the paragraphs with
# Join the text of every element whose data-testid contains "paragraph-".
paras = ' '.join(node.get_text() for node in soup.select('[data-testid*=paragraph-]'))
I've created a script in Python to fetch different product links from a webpage. Although I know the content of that site is dynamic, I tried the conventional way first to show what I have attempted. I looked for APIs in the dev tools but could not find one. Isn't there any way to get those links using requests?
Site Link
I've written so far:
import requests
from bs4 import BeautifulSoup
link = "https://www.amazon.com/stores/node/10699640011"


def fetch_product_links(url):
    """Print the href of each anchor matched by the ProductGrid selector.

    The page is requested with a browser-like User-Agent header and parsed
    with lxml; nothing is returned.
    """
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "lxml")

    anchors = soup.select("[id^='ProductGrid-'] li[class^='style__itemOuter__'] > a")
    for anchor in anchors:
        href = anchor.get("href")
        print(href)


if __name__ == '__main__':
    fetch_product_links(link)
How can I fetch different product links from that site using requests?
I think you only need the asins which you can collect from another url construct you can see in network tab i.e. you can significantly shorten the final urls. You do however need to make a request to your original url to pick up an identifier to use in second url. Returns 146 links.
import requests, re, json
node = '10699640011'

with requests.Session() as s:
    # The store page embeds a "slotsStr" list; the captured entry (minus its
    # trailing comma) is the identifier used in the slot URL.
    r = s.get(f'https://www.amazon.com/stores/node/{node}')
    slot_pattern = re.compile(r'var slotsStr = "\[(.*?,){3} share\]";')
    first_match = slot_pattern.findall(r.text)[0]
    identifier = first_match.strip()[:-1]

    # The slot page carries a "var config = ...;" assignment whose
    # right-hand side is loaded as JSON.
    r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
    config_pattern = re.compile(r'var config = (.*?);')
    raw_config = config_pattern.findall(r.text)[0]
    data = json.loads(raw_config)

asins = data['content']['ASINList']
links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
print(links)
EDIT:
With two given nodes:
import requests, re, json
from bs4 import BeautifulSoup as bs
nodes = ['3039806011', '10699640011']

# The same pattern extracts the embedded config on every slot page.
p = re.compile(r'var config = (.*?);')

with requests.Session() as s:
    for node in nodes:
        r = s.get(f'https://www.amazon.com/stores/node/{node}')
        soup = bs(r.content, 'lxml')

        # The last matching widget (excluding the share and RECOMMENDATION
        # widgets) provides the slot identifier.
        widgets = soup.select('.stores-widget-btf:not([id=share],[id*=RECOMMENDATION])')
        identifier = widgets[-1]['id']

        r = s.get(f'https://www.amazon.com/stores/slot/{identifier}?node={node}')
        raw_config = p.findall(r.text)[0]
        data = json.loads(raw_config)

        asins = data['content']['ASINList']
        links = [f'https://www.amazon.com/dp/{asin}' for asin in asins]
        print(links)
I've created a script to harvest the links of different products from a webpage. My intention is to scrape the links only when the products have Ajouter au panier sign, meaning Add to Basket. The html structures are very straightforward and easy to play with but the logic to get the desired links appears to be tricky. I've used three different links to show the variation.
A few URLs lead directly to the desired products, but there are also catalogues which (if I follow their links) produce more products. Check out the image links to see for yourself. In the first image I've drawn circles around the catalogues, which can still produce the desired products, while a few desired products are already present on that page.
check out the variation
another one: only catalogues
This is the script I've written:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
urls = (
    "https://www.directmedical.fr/categorie/aspirateurs-de-mucosite.html",
    "https://www.directmedical.fr/categorie/literie.html",
    "https://www.directmedical.fr/categorie/vetement.html"
)


def get_links(link):
    """Print the absolute URL of every anchor under ``.browseCategoryName``.

    Each href is resolved against *link*; nothing is returned.
    """
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "lxml")

    hrefs = [anchor.get("href") for anchor in soup.select(".browseCategoryName a")]
    for href in hrefs:
        print(urljoin(link, href))


if __name__ == '__main__':
    for url in urls:
        get_links(url)
How can I get all the products links having Ajouter au panier signs using those urls?
If you need to select Product links from both initial page and (if there are no products on initial page) from Category page, try
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
domain = "https://www.directmedical.fr/"
urls = (
    "https://www.directmedical.fr/categorie/aspirateurs-de-mucosite.html",
    "https://www.directmedical.fr/categorie/literie.html",
    "https://www.directmedical.fr/categorie/vetement.html"
)


def get_links(link):
    """Print product links from *link*, or from its child categories.

    When the page has anchors under ``.browseElements`` they are printed
    (resolved against *link*) and the function stops.  Otherwise each anchor
    under ``.browseChildsCategorys`` is followed and that page's
    ``.browseElements`` anchors are printed (resolved against ``domain``).
    """
    r = requests.get(link)
    soup = BeautifulSoup(r.text, "lxml")

    products = soup.select(".browseElements td > a")
    if products:
        # Products are listed directly on this page.
        for product in products:
            print(urljoin(link, product.get("href")))
        return

    # No products here: descend one level into the child categories.
    category_anchors = soup.select(".browseChildsCategorys td > a")
    categories = [urljoin(domain, anchor.get("href")) for anchor in category_anchors]
    for category in categories:
        c = requests.get(category)
        c_soup = BeautifulSoup(c.text, "lxml")
        for product in c_soup.select(".browseElements td > a"):
            print(urljoin(domain, product.get("href")))


if __name__ == '__main__':
    for url in urls:
        get_links(url)
I'm trying to scrape information on greyhound races. For example, I want to scrape http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena. This page shows all results for the dog Hardwick Serena, but it is split over several pages.
Inspecting the page, it shows under the 'next page' button:
<input type="submit" name="ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl12" value=" " title="Next Page" class="rgPageNext">.
I was hoping for a HTML link, that I could use for the next iteration of the scrape, but no luck.
Further inspection, by looking at network traffic, shows that the browser send a horribly long (hashed?) string for __VIEWSTATE, among others. Likely to protect the database?
I'm looking for a way to scrape all pages of one dog, either by iterating over all pages, or by increasing the page length to show 100+ lines on page 1. The underlying database is .aspx.
I'm using Python 3.5 and BeautifulSoup.
current code:
import requests
from bs4 import BeautifulSoup
# Attempt to page through a dog's race card by re-posting the ASP.NET form
# fields once per page and reading one table out of each response.
# NOTE(review): indentation was lost in this listing, so the extent of the
# `with`, `for` and `if` bodies below cannot be determined from here.
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'
with requests.session() as s:
s.headers['user-agent'] = 'Mozilla/5.0'
r = s.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
# Control name that is later sent as __EVENTTARGET.
target = 'ctl00$ctl00$mainContent$cmscontent$DogRaceCard$btnFilter_input'
# Form fields whose name starts with "ctl00" and whose value is non-empty.
data = { tag['name']: tag['value']
for tag in soup.select('input[name^=ctl00]') if tag.get('value')
}
# Inputs whose name starts with "__"; these are merged into `data` below.
state = { tag['name']: tag['value']
for tag in soup.select('input[name^=__]')
}
data.update(state)
# Page count parsed out of the stringified 'rgWrap rgInfoPart' div by
# splitting on spaces and angle brackets; this depends on the exact markup.
numberpages = int(str(soup.find('div', 'rgWrap rgInfoPart')).split(' ')[-2].split('>')[1].split('<')[0])
# for page in range(last_page + 1):
for page in range(numberpages):
# NOTE(review): `target` contains no "{}" placeholder, so .format(page)
# returns it unchanged and every iteration posts the same __EVENTTARGET.
data['__EVENTTARGET'] = target.format(page)
#data['__VIEWSTATE'] = target.format(page)
# Prints the constant 10 (looks like a leftover debug marker).
print(10)
r = s.post(url, data=data)
soup = BeautifulSoup(r.content, 'html5lib')
tables = soup.findChildren('table')
# Hard-coded position of the wanted table among all <table> elements.
my_table = tables[9]
rows = my_table.findChildren(['th', 'tr'])
# `tabel` starts with one empty row and gains one more per entry in `rows`;
# row i is filled with the .string of each <td> found in rows[i].
tabel = [[]]
for i in range(len(rows)):
cells = rows[i].findChildren('td')
tabel.append([])
for j in range(len(cells)):
value = cells[j].string
tabel[i].append(value)
# Rows with exactly 16 cells have their last two cells removed.
# NOTE(review): whether table.append sits inside the `if` is not
# recoverable from this listing.
table = []
for i in range(len(tabel)):
if len(tabel[i]) == 16:
del tabel[i][-2:]
table.append(tabel[i])
In this case, for each page requested a POST request is issued with form url encoded parameter __EVENTTARGET & __VIEWSTATE :
__VIEWSTATE can be easily extracted from an input tag
__EVENTTARGET is different for each page, and its value is passed to a JavaScript function in each page link, so you can extract it with a regex:
<a href="javascript:__doPostBack('ctl00$ctl00$mainContent$cmscontent$DogRaceCard$lvDogRaceCard$ctl00$ctl03$ctl01$ctl07','')">
<span>2</span>
</a>
The python script :
from bs4 import BeautifulSoup
import requests
import re
# extract data from page
def extract_data(soup):
    """Print the race-card rows of *soup* as a list of 14-field tuples.

    Rows are the ``<tr>`` elements of the second ``<tbody>`` inside the
    first ``div.race-card``; rows without any ``<td>`` are skipped.  The
    tuple fields, in order, are: date, dist, TP, StmHCP, Fin, By,
    WinnerOr2nd, Venue, Remarks, WinTime, Going, SP, Class, CalcTm.
    """
    column_count = 14

    race_card = soup.find_all("div", {"class": "race-card"})[0]
    tables = race_card.find_all("tbody")

    item_list = []
    for row in tables[1].find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue
        # Index each column explicitly so a row with fewer than 14 cells
        # still raises IndexError.
        item_list.append(
            tuple(cells[index].text.strip() for index in range(column_count))
        )

    print(item_list)
url = 'http://www.gbgb.org.uk/RaceCard.aspx?dogName=Hardwick%20Serena'

# Raw string: the original non-raw literal contained "\(", an invalid string
# escape that triggers a warning on current Python versions.  Compiled once
# instead of being re-parsed for every pager link.
post_back_re = re.compile(r"__doPostBack\('(.*)',")

# The context manager closes the session's connections when done.
with requests.Session() as session:
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # get view state value
    view_state = soup.find_all("input", {"id": "__VIEWSTATE"})[0]["value"]

    # get all event target values: one __doPostBack target per pager link
    event_target = soup.find_all("div", {"class": "rgNumPart"})[0]
    event_target_list = []
    for anchor in event_target.find_all('a'):
        match = post_back_re.search(anchor.get("href", ""))
        # Skip anchors whose href is not a __doPostBack call instead of
        # failing with AttributeError on None.
        if match:
            event_target_list.append(match.group(1))

    # extract data for the 1st page
    extract_data(soup)

    # extract data for each page except the first
    for link in event_target_list[1:]:
        print("get page {0}".format(link))
        post_data = {
            '__EVENTTARGET': link,
            '__VIEWSTATE': view_state
        }
        response = session.post(url, data=post_data)
        soup = BeautifulSoup(response.content, "html.parser")
        extract_data(soup)