BeautifulSoup generating inconsistent results - python

I'm using BeautifulSoup to pull data out of Reddit sidebars on a selection of subreddits, but my results are changing pretty much every time I run my script.
Specifically, the contents of sidebar_urls change from run to run: sometimes the result is [XYZ.com/abc, XYZ.com/def], other times it is just [XYZ.com/def], and sometimes it is an empty list [].
Any ideas why this might be happening using the code below?
# Collect, from every page listed in reddit_urls, each href that contains "XYZ.com".
sidebar_urls = []
for i in range(0, len(reddit_urls)):
req = urllib.request.Request(reddit_urls[i], headers=headers)
resp = urllib.request.urlopen(req)
soup = BeautifulSoup(resp, 'html.parser')
# href=True selects every tag that carries an href attribute, anywhere on the page.
links = soup.find_all(href=True)
for link in links:
if "XYZ.com" in str(link['href']):
sidebar_urls.append(link['href'])

It seems you sometimes get a page that does not have a side bar. It could be because Reddit is recognizing you as a robot and returning a default page instead of the one you expect. Consider identifying yourself when requesting the pages, using the User-Agent field:
# Collect every link found in the sidebar of each subreddit page.
reddit_urls = [
    "https://www.reddit.com/r/leagueoflegends/",
    "https://www.reddit.com/r/pokemon/",
]

# Update this to identify yourself
user_agent = "me#example.com"

sidebar_urls = []
for reddit_url in reddit_urls:
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(
        reddit_url, headers={"User-Agent": user_agent}, timeout=10
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the sidebar tag
    side_tag = soup.find("div", {"class": "side"})
    if side_tag is None:
        # The page served has no sidebar at all; report it and move on.
        print("Could not find a sidebar in page: {}".format(reddit_url))
        continue

    # Find all links in the sidebar tag. href=True skips <a> tags that have
    # no href attribute, which would otherwise raise KeyError below.
    for link in side_tag.find_all("a", href=True):
        sidebar_urls.append(str(link["href"]))

print(sidebar_urls)

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- code that (somewhat) successfully pulls the info for ONE link:
# Fetch one player page and print all <p> tags found under the located container div.
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
# NOTE(review): {'class', 'main-container'} is a set literal (comma), not a dict (colon) - confirm intent.
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
# Download `url`, collect the href of every <a> tag, and print the hrefs and their count.
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
# No parser name is passed to BeautifulSoup here, unlike the 'html.parser' calls elsewhere.
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
# .get('href') is appended as-is, so anchors without an href contribute None.
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
# One call per season listing page.
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
# Download a player page and print one value per <a> tag inside the located container div.
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
# NOTE(review): {'class', 'main-container'} is a set literal (comma), not a dict (colon) - confirm intent.
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
# NOTE(review): .get('p') reads an attribute named 'p' from each <a> tag; it does not select <p> elements - confirm.
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!
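You need to combine your two scripts and make a request for each player. Try the following approach. It searches the listing page for <td> tags that have the data-th="Player" attribute, follows the link inside each one, and reads the "key: value" paragraphs from that player's profile box: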
import requests
from bs4 import BeautifulSoup


def get_links(url):
    """Scrape the profile box of every player listed on a season page.

    Args:
        url: Address of a listing page whose player cells are
            ``<td data-th="Player">`` elements containing a link.

    Returns:
        A list with one dict per player. Each dict has "Name" and "URL"
        plus one entry for every "key: value" paragraph found in the
        player's ``div.profile-box``.
    """
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")

    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        # Skip cells that carry no usable link instead of crashing on them.
        if a_tag is None or not a_tag.get('href'):
            continue

        name = a_tag.text
        player_url = a_tag['href']
        print(f"Getting {name}")

        req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")

        row = {"Name": name, "URL": player_url}
        # A page without a profile box still yields the name/URL row.
        if div_profile_box is not None:
            for p in div_profile_box.find_all("p"):
                # Not all entries have values: paragraphs without a colon
                # are ignored rather than hidden behind a bare except.
                key, sep, value = p.get_text(strip=True).partition(':')
                if sep:
                    row[key.strip()] = value.strip()

        data.append(row)

    return data


urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
]

for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    for entry in data:
        print(entry)

Cannot find the table data within the soup, but I know its there

I am trying to create a function that scrapes college baseball team roster pages for a project. I have written a function that crawls the roster page and builds a list of the links I want to scrape. But when I scrape the individual link for each player, the request works, yet my code cannot find the data that is on the page.
This is the link to the page I am crawling from at the start:
https://gvsulakers.com/sports/baseball/roster
These are just functions that I call within the function that I am having a problem with:
# Return the string form of .string for every <td> inside the given element.
def parse_row(rows):
return [str(x.string)for x in rows.find_all('td')]
# Download `url` (using a `headers` name defined outside this snippet) and return it parsed with lxml.
def scrape(url):
page = requests.get(url, headers = headers)
html = page.text
soop = BeautifulSoup(html, 'lxml')
return(soop)
# Download `url` and return one parse_row() list per <tr> element on the page.
def find_data(url):
page = requests.get(url, headers = headers)
html = page.text
soop = BeautifulSoup(html, 'lxml')
# Only <tr>/<td> content is collected; a page with no <tr> elements gives an empty list.
row = soop.find_all('tr')
lopr = [parse_row(rows) for rows in row]
return(lopr)
Here is what I am having an issue with. When I assign the result of type1_roster to a variable and print it, I only get an empty list. Ideally it should contain data about a player or players from the player roster pages.
# Roster page crawler
# Build the roster URL for `team_id`, gather the relative player hrefs on it, and pass a player URL to find_data().
def type1_roster(team_id):
url = "https://" + team_id + ".com/sports/baseball/roster"
soop = scrape(url)
href_tags = soop.find_all(href = True)
hrefs = [tag.get('href') for tag in href_tags]
# get all player links
player_hrefs = []
for href in hrefs:
# Keep roster hrefs that are not coach pages and do not contain 'https:'.
if 'sports/baseball/roster' in href:
if 'sports/baseball/roster/coaches' not in href:
if 'https:' not in href:
player_hrefs.append(href)
# get rid of duplicates
player_links = list(set(player_hrefs))
# scrape the roster links
for link in player_links:
# The first 24 characters of the href are dropped and the remainder is appended to the roster URL.
player_ = url + link[24:]
return(find_data(player_))
A number of things:
I would pass the headers as a global
I think you are slicing the link one character too late when building player_
You need to re-work the logic of find_data(), as data is present in a mixture of element types and not in table/tr/td elements e.g. found in spans. The html attributes are nice and descriptive and will support targeting content easily
You can target the player links from the landing page more tightly with the css selector list shown below. This removes the need for multiple loops as well as the use of list(set())
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}


def scrape(url):
    """Download *url* with the shared HEADERS and return it parsed with lxml."""
    page = requests.get(url, headers=HEADERS)
    return BeautifulSoup(page.text, 'lxml')


def _text_or_none(node):
    """Return the text of *node*, or None when the selector matched nothing."""
    return node.get_text('') if node is not None else None


def find_data(url):
    """Return ``(first_name, bio)`` scraped from a single player page.

    Either element is None when its selector matches nothing on the page,
    instead of raising AttributeError.
    """
    soop = scrape(url)
    # Re-think the logic here to return the data you need. Other fields are
    # exposed the same way, e.g. '.sidearm-roster-player-jersey-number' and
    # '.sidearm-roster-player-last-name'.
    first_name = _text_or_none(
        soop.select_one('.sidearm-roster-player-first-name'))
    # The bio text may need targeted string cleaning.
    bio = _text_or_none(soop.select_one('#sidearm-roster-player-bio'))
    return (first_name, bio)


def type1_roster(team_id):
    """Scrape every player linked from a team's baseball roster page.

    Args:
        team_id: Host name without the ".com" suffix, e.g. 'gvsulakers'.

    Returns:
        A list with one ``find_data`` result per player link. The list is
        empty when no player links are found.
    """
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    # One selector targets the player links directly; [href] guarantees the
    # attribute exists before it is read.
    player_links = [i['href'] for i in soop.select(
        '.sidearm-roster-players-container .sidearm-roster-player h3 > a[href]')]
    # urljoin resolves each href against the roster URL, so no fixed-width
    # slicing of the link is needed. Collecting the results, rather than
    # returning inside the loop, covers the whole roster.
    return [find_data(urljoin(url, link)) for link in player_links]


print(type1_roster('gvsulakers'))

Web scraping nested comments on Reddit using beautifulsoup

This code gets the page. My problem is I need to scrape the content of users comments not the number of comments. It is nested inside the number of comments section but I am not sure how I can access the link and parse through and scrape the user comments.
# Request the subreddit listing for counts 0, 25, ..., 175 and keep each parsed page in request_list.
request_list = []
id_list = [0]
for i in range(0,200,25):
# The most recently stored id is sent as the `after` query parameter.
response = requests.get("https://www.reddit.com/r/CryptoCurrency/?count="+str(i)+"&after="+str(id_list[-1]), headers = {'User-agent':'No Bot'})
soup = BeautifulSoup(response.content, 'lxml')
request_list.append(soup)
# The data-fullname of the last div with data-type="link" is stored for the next request.
id_list.append(soup.find_all('div', attrs={'data-type': 'link'})[-1]['data-fullname'])
print(i, id_list)
if i%100 == 0:
time.sleep(1)
Below, I tried writing a function that is supposed to access the nested comments, but I have no clue how to proceed.
# Collect the text of matching <a> tags from every parsed page in request_list and return the list.
def extract_comment_contents(request_list):
comment_contents_list = []
for i in request_list:
# NOTE(review): `response` is neither a parameter nor assigned in this function, so this reads a name from outside it.
if response.status_code == 200:
# Matches <a> tags whose data-inbound-url attribute equals '/r/CryptoCurrency/comments/'.
for each in i.find_all('a', attrs={'data-inbound-url': '/r/CryptoCurrency/comments/'}):
comment_contents_list.append(each.text)
else:
print("Call failed at request ", i)
return comment_contents_list
fetch_comment_contents_list = extract_comment_contents(request_list)
print(fetch_comment_contents_list)
For each thread, you need to send another request to get its comments page. The URLs of the comments pages can be found using soup.find_all('a', class_='bylink comments may-blank'), which returns all the <a> tags that hold the URL of a comments page. Here is an example of how to get to the comments pages:
# Fetch the listing page, then request the comments page of every thread on it.
# Send the same User-agent header on every request.
headers = {'User-agent': 'No Bot'}

r = requests.get('https://www.reddit.com/r/CryptoCurrency/?count=0&after=0',
                 headers=headers)
soup = BeautifulSoup(r.text, 'lxml')

for comments_tag in soup.find_all('a', class_='bylink comments may-blank', href=True):
    url = comments_tag['href']
    r2 = requests.get(url, headers=headers)
    # A separate name keeps the listing page's `soup` from being overwritten
    # while its links are still being processed.
    comments_soup = BeautifulSoup(r2.text, 'lxml')
    # Your job is to parse comments_soup and get all the comments.

Python Pagination Loop

I'm doing some simple web scraping, and need to find a better way to loop through pagination on the target site. The only way I could do this was to write 10 + "for loops" to get it to work.
Basically, I'm looking for a "Next" icon on the page. If it exists, I need to grab the parent link of the icon image and append it to the URL, go to the updated URL, search for the same icon, and repeat until I get to the last page (where the icon will be gone).
How could I perform this without hard coding a bunch of for loops?
# Fetch a page, look for an <img> whose src contains "/Next_Icon", and build the next URL from its parent link.
url = "http://www.somewebsite.com/"
# NOTE(review): `wurl` is not assigned anywhere in this snippet before it is used here; `url` is the name defined above.
r = requests.get(wurl)
soup = BeautifulSoup(r.text, "lxml")
for img in soup.findAll("img"):
if "/Next_Icon" in img["src"]:
# The nearest enclosing <a> that has an href supplies the relative link.
link = img.find_parent("a", href=True)
extLink = (link["href"])
url = "http://www.somewebsite.com/" + extLink
from urllib.parse import urljoin

# Follow the "Next" icon from page to page until a page no longer has one.
base_url = "http://www.somewebsite.com/"
url_stack = [base_url]
visited = set()

while url_stack:
    wurl = url_stack.pop()
    # Never fetch the same page twice; otherwise a "Next" link that points
    # back to an earlier page would loop forever.
    if wurl in visited:
        continue
    visited.add(wurl)

    r = requests.get(wurl)
    soup = BeautifulSoup(r.text, "lxml")

    for img in soup.find_all("img"):
        # .get() tolerates <img> tags without a src attribute.
        if "/Next_Icon" not in img.get("src", ""):
            continue
        link = img.find_parent("a", href=True)
        if link is None:
            # The icon is not wrapped in a link, so there is nothing to follow.
            continue
        # urljoin handles hrefs with or without a leading slash.
        url_stack.append(urljoin(base_url, link["href"]))
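You should use a list as a stack of the URLs still to be visited: pop one URL, fetch it, and push the "Next" link if the page has one. The loop ends by itself when the last page contributes no new URL.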

How to scrape whole website using beautifulsoup

I am trying to get all the unique urls of the website by calling the all_pages function recursively but this function is not giving all the urls of the website.
All I want to do is get all the unique urls of the website using BeautifulSoup. My code looks like this:
# Recursively visit pages starting at base_url, printing and recording each new URL that contains base_url.
base_url = "http://www.readings.com.pk/"
unique_urls=[]
# NOTE(review): the default for unique_urls is a list literal in the signature (a mutable default).
def all_pages(base_url,unique_urls=[]):
response = requests.get(base_url)
soup = BeautifulSoup(response.content, "html.parser")
for link in soup.find_all("a"):
# link["href"] is read for every <a> tag returned by find_all("a").
url = link["href"]
absolute_url = urljoin(base_url, url)
if absolute_url not in unique_urls:
if base_url in absolute_url:
unique_urls.append(absolute_url)
print (absolute_url)
# NOTE(review): three arguments are passed but the function declares two parameters, and `book_urls` is not defined in this snippet.
all_pages(absolute_url,unique_urls,book_urls)
all_pages(base_url,unique_urls)
Use response.text instead of response.content
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and they will always be unique.
Additionally, your method is recursive and python has a max recursion depth, so maybe you should instead do this:
from urllib.parse import urldefrag, urljoin

base_url = "http://www.readings.com.pk/"


def all_pages(base_url):
    """Crawl a site iteratively and return the set of unique URLs found.

    Args:
        base_url: Start page. Only URLs that begin with this prefix are
            recorded and followed, so the crawl stays on the site.

    Returns:
        A set containing base_url and every same-site URL reachable from it.
    """
    unique_urls = {base_url}
    # URLs discovered but not fetched yet. An explicit stack replaces
    # recursion, so Python's recursion limit is never an issue.
    pending = [base_url]

    while pending:
        page_url = pending.pop()
        try:
            response = requests.get(page_url)
        except requests.RequestException:
            # One unreachable page should not abort the whole crawl.
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # href=True skips anchors that have no href attribute.
        for link in soup.find_all("a", href=True):
            # Resolve relative links against the page they appear on and
            # drop "#fragment" parts, which do not name a different page.
            absolute_url, _fragment = urldefrag(urljoin(page_url, link["href"]))
            if not absolute_url.startswith(base_url):
                continue
            if absolute_url not in unique_urls:
                unique_urls.add(absolute_url)
                pending.append(absolute_url)

    return unique_urls


all_pages(base_url)

Categories