Webscraping multiple pages from a site gives just the home page contents?

Webscraping multiple pages from a site gives just the home page contents? - python

I am trying to scrape the pages from the site https://www.rithmschool.com/blog. Though the contents of the 1st page are getting scraped, the problem with my code is - the same content is being scraped even for all the other pages of the site. Below is the code.
Can anyone please help me fix the code.
My code
import requests
from bs4 import BeautifulSoup
from csv import writer
html = requests.get('https://www.rithmschool.com/blog')
soup = BeautifulSoup(html.text, 'html.parser')
articles = soup.find_all('article')
with open('scraped_rithm.csv', 'w') as f:
data = writer(f)
data.writerow(['Title','Link','Date'])
for article in articles:
title = article.find('a').get_text()
link = article.find('a')['href']
date = article.find('time')['datetime']
data.writerow([title,link,date])
spans = soup.find_all('span', {'class' : 'page'})
for span in spans:
if span.find('a'):
urls = ((span.find('a')['href'])).split(',')
for url in urls:
nw_urls = (f"https://www.rithmschool.com{url}")
print(nw_urls)
nw_response = requests.get(nw_urls)
nw_soup = BeautifulSoup(nw_response.text,'html.parser')
articles = soup.find_all('article')
for article in articles:
title = article.find('a').get_text()
link = article.find('a')['href']
date = article.find('time')['datetime']
data.writerow([title,link,date])

You are still using the old soup object after creating a new nw_soup object
Try changing this from
nw_soup = BeautifulSoup(nw_response.text,'html.parser')
articles = soup.find_all('article')
to
nw_soup = BeautifulSoup(nw_response.text,'html.parser')
articles = nw_soup.find_all('article')

Related

Pulling p tags from multiple URLs

I've struggled on this for days and not sure what the issue could be - basically, I'm trying to extract the profile box data (picture below) of each link -- going through inspector, I thought I could pull the p tags and do so.
I'm new to this and trying to understand, but here's what I have thus far:
-- a code that (somewhat) succesfully pulls the info for ONE link:
import requests
from bs4 import BeautifulSoup
# getting html
url = 'https://basketball.realgm.com/player/Darius-Adams/Summary/28720'
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
playerinfo = container.find_all('p')
print(playerinfo)
I then also have a code that pulls all of the HREF tags from multiple links:
from bs4 import BeautifulSoup
import requests
def get_links(url):
links = []
website = requests.get(url)
website_text = website.text
soup = BeautifulSoup(website_text)
for link in soup.find_all('a'):
links.append(link.get('href'))
for link in links:
print(link)
print(len(links))
get_links('https://basketball.realgm.com/dleague/players/2022')
get_links('https://basketball.realgm.com/dleague/players/2021')
get_links('https://basketball.realgm.com/dleague/players/2020')
So basically, my goal is to combine these two, and get one code that will pull all of the P tags from multiple URLs. I've been trying to do it, and I'm really not sure at all why this isn't working here:
from bs4 import BeautifulSoup
import requests
def get_profile(url):
profiles = []
req = requests.get(url)
soup = BeautifulSoup(req.text, 'html.parser')
container = soup.find('div', attrs={'class', 'main-container'})
for profile in container.find_all('a'):
profiles.append(profile.get('p'))
for profile in profiles:
print(profile)
get_profile('https://basketball.realgm.com/player/Darius-Adams/Summary/28720')
get_profile('https://basketball.realgm.com/player/Marial-Shayok/Summary/26697')
Again, I'm really new to web scraping with Python but any advice would be greatly appreciated. Ultimately, my end goal is to have a tool that can scrape this data in a clean way all at once.
(Player name, Current Team, Born, Birthplace, etc).. maybe I'm doing it entirely wrong but any guidance is welcome!

You need to combine your two scripts together and make requests for each player. Try the following approach. This searches for <td> tags that have the data-td=Player attribute:
import requests
from bs4 import BeautifulSoup
def get_links(url):
data = []
req_url = requests.get(url)
soup = BeautifulSoup(req_url.content, "html.parser")
for td in soup.find_all('td', {'data-th' : 'Player'}):
a_tag = td.a
name = a_tag.text
player_url = a_tag['href']
print(f"Getting {name}")
req_player_url = requests.get(f"https://basketball.realgm.com{player_url}")
soup_player = BeautifulSoup(req_player_url.content, "html.parser")
div_profile_box = soup_player.find("div", class_="profile-box")
row = {"Name" : name, "URL" : player_url}
for p in div_profile_box.find_all("p"):
try:
key, value = p.get_text(strip=True).split(':', 1)
row[key.strip()] = value.strip()
except: # not all entries have values
pass
data.append(row)
return data
urls = [
'https://basketball.realgm.com/dleague/players/2022',
'https://basketball.realgm.com/dleague/players/2021',
'https://basketball.realgm.com/dleague/players/2020',
]
for url in urls:
print(f"Getting: {url}")
data = get_links(url)
for entry in data:
print(entry)

Pulling all yelp reviews via beautifulsoup

I need some help in pulling all reviews for a hotel using beautiful soup; this is what i have thus far, but i need some inspiration pulling all the reviews via API or regular.
import time
import random
from bs4 import BeautifulSoup as bs
import urllib.request as url
html = urllib.request.urlopen('https://www.yelp.com/biz/shore-cliff-hotel-pismo-beach-2').read().decode('utf-8')
soup = bs(html, 'html.parser')
relevant= soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
reviews = []
for div in relevant:
for html_class in div.find_all('span',class_="raw__09f24__T4Ezm"):
text = html_class.find('span')
review = html_class.getText(
reviews.append(review)
enter code here

This does the job,
base_url = "https://www.yelp.com/biz/capri-laguna-laguna-beach"
new_page = "?start={}"
content = requests.get(url).content
soup = BeautifulSoup(content, "html.parser")
reviews = []
for i in range(0, 501, 10):
new_page_url = url + new_page.format(i)
new_content = requests.get(url).content
new_soup = BeautifulSoup(content, "html.parser")
relevant= new_soup.find_all('p', class_='comment__09f24__gu0rG css-qgunke')
for div in relevant:
for html_class in div.find_all('span',class_="raw__09f24__T4Ezm"):
text = html_class.find('span')
review = html_class.getText()
reviews.append(review)
Code explaination -
If you click to go to the 2nd page you'll see that ?start=10 get's add to the base URL https://www.yelp.com/biz/capri-laguna-laguna-beach. If you go to the 3rd page then you'll see ?start=20 and so on. The number here is the index of the review, and each page has 10 of them. There are 51 total pages meaning the first review on the 51st page would have the index 501. So the added part to the URL would be ?start=500.
So for each page on the website, the code creates a new URL, gets the HTML content of that URL, creates a soup for it and fetches the review from this newly created soup.

Web Scraping multiple pages of a web site?

My problem is i am trying to scrape data form a web page with multiple web pages of witch each is a separate blog post. But the current code only scrapes the data form the url that i have set as variable source.
source = requests.get('https://www.trenerklemen.si/objave/').text.
I would like to scrape text from other url but one part is dynamic and i have no idea how to access it
source = requests.get('https://www.trenerklemen.si/?????/').text
How would i get the ???? part if it changes. Thanks for the answers.
from bs4 import BeautifulSoup
import requests
source = requests.get('https://www.trenerklemen.si/objave/').text
soup = BeautifulSoup(source,'lxml')
article = soup.find('article')
headline = article.h2.text
print(headline)
summary = article.find('div', class_='post-content').p.text
print(summary)
video = article.find('iframe', class_ ='youtube-player')['src']
video_id = video.split('/')[4]
video_id = video_id.split('?')[0]
yt_link = f'https://youtube.com/watch?v={video_id}'
print(yt_link)

from bs4 import BeautifulSoup
import requests
mainLink = "https://www.trenerklemen.si"
ALL_links = ["https://www.trenerklemen.si/objave/"]
counter = 0
while(counter < len(ALL_links)):
source = requests.get(ALL_links[counter]).text
soup = BeautifulSoup(source,'lxml')
for link in soup.findAll('a'):
LinkNow = str(link.get('href'))
if(len(LinkNow) > 0):
if(LinkNow not in ALL_links):
if(LinkNow[0] == '/'):
ALL_links.append("https://www."+mainLink+LinkNow)
if(mainLink in LinkNow):
ALL_links.append(LinkNow)
try:
article = soup.find('article')
headline = article.h2.text
print(headline)
summary = article.find('div', class_='post-content').p.text
print(summary)
video = article.find('iframe', class_ ='youtube-player')['src']
video_id = video.split('/')[4]
video_id = video_id.split('?')[0]
yt_link = f'https://youtube.com/watch?v={video_id}'
print(yt_link)
except Exception as e:
print("Error: "+str(e))
counter += 1

Websites usually use sitemaps for search engines to be able to crawl the content. You can use the sitemap as a source of your links to scrape.
Sitemap for your website: https://www.trenerklemen.si/post-sitemap.xml

Getting web links to all items in a table and then doing pagination

I am able to get all the links on a particular web page but am having trouble with the pagination.
I am doing the following:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
r = requests.get(start_url)
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href'])for a in a_tags]
print(links)
As a toy example, I am using the following website:
start_url = 'https://www.opencodez.com/page/1'
I am able to get all the links this way. However, I am trying to automate it more by going to the next page and doing the same thing, and outputting all the links to a csv file.
I tried the following but get no outputs:
start_url = 'https://www.opencodez.com/'
with open('names.csv', mode='w') as csv_file:
fieldnames = ['Name']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
article_link = []
def scraping(webpage, page_number):
next_page = webpage + str(page_number)
r = requests.get(str(next_page))
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
print(a_tags)
links = [urljoin(start_url, a['href'])for a in a_tags]
print(links)
for x in range(len(soup)):
article_link.append(links)
if page_number < 16:
page_number = page_number + 1
scraping(webpage, page_number)
scraping('https://www.opencodez.com/page/', 1)
#creating the data frame and populating its data into the csv file
data = { 'Name': article_link}
df = DataFrame(data, columns = ['Article_Link'])
df.to_csv(r'C:\Users\xxxxx\names.csv')
Could you please help me determine where I am going wrong?
I do not mind getting the links in either the output console or printed in a csv file

There were issues here and there with your code but this worked for me:
import requests, bs4, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
start_url = 'https://www.opencodez.com/'
r = requests.get(start_url) # first page scraping
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
article_link = []
links = [urljoin(start_url, a['href'])for a in a_tags]
article_link.append(links)
for page in range(2,19): # for every page after 1
links = [] # resetting lists on every page just in case
a_tags = []
url = 'https://www.opencodez.com/page/'+str(page)
r = requests.get(start_url)
soup = BeautifulSoup(r.text,'html.parser')
a_tags = soup.find_all('a')
links = [urljoin(start_url, a['href'])for a in a_tags]
article_link.append(links)
print(article_link)
I basically just changed how you append to the list article_link. This variable at the moment is a list of length 18. Each list within article_link is a list of 136 links.

Get links from a site's homepage using python

I want to write a script to get a home page's links to social media (twitter / facebook mostly), and I'm completely stuck since I am fairly new to Python.
The task I want to accomplish is to parse the website, find the social media links, and save it in a new data frame where each column would contain the original URL, the twitter link, and the facebook link. Here's what I have so far of this code for the new york times website:
from bs4 import BeautifulSoup
import requests
url = "http://www.nytimes.com"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for site in sm_sites:
if all(site in sm_sites for link in all_links):
print(site)
else:
print('no link')
I'm having some problems understanding what the loop is doing, or how to make it work for what I need it to. I also had tried to store the site instead of doing print(site) but that was not working... So I figured I'd ask for help. Before asking, I went through a bunch of responses here but none could get me to do what I needed to do.

the way this code works, you already have your links. Your homepage link is the starting url, so http://www.nytimes.com.
And you have the social media urls sm_sites = ['twitter.com','facebook.com'], all you're doing is confirming they exist on the main page. If you want to save the list of confirmed social media urls, then append them to a list
Here is one way to get the social media links off a page
import requests
from bs4 import BeautifulSoup
url = "https://stackoverflow.com/questions/tagged/python"
r = requests.get(url)
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for sm_site in sm_sites:
for link in all_links:
if sm_site in link.attrs['href']:
sm_sites_present.append(link.attrs['href'])
print(sm_sites_present)
output:
['https://twitter.com/stackoverflow', 'https://www.facebook.com/officialstackoverflow/']
Update
for a df of urls
import requests
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
urls = [
"https://stackoverflow.com/questions/tagged/python",
"https://www.nytimes.com/",
"https://en.wikipedia.org/"
]
sm_sites = ['twitter.com','facebook.com']
sm_sites_present = []
columns = ['url'] + sm_sites
df = pd.DataFrame(data={'url' : urls}, columns=columns)
def get_sm(row):
r = requests.get(row['url'])
output = pd.Series()
soup = BeautifulSoup(r.content, 'html5lib')
all_links = soup.find_all('a', href = True)
for sm_site in sm_sites:
for link in all_links:
if sm_site in link.attrs['href']:
output[sm_site] = link.attrs['href']
return output
sm_columns = df.apply(get_sm, axis=1)
df.update(sm_columns)
df.fillna(value='no link')
output

This will do what you want with regards to adding it to a DataFrame. You can iterate through a list of websites (urlsToSearch), adding a row to the dataframe for each one containing the base website, all facebook links, and all twitter links.
from bs4 import BeautifulSoup
import requests
import pandas as pd
df = pd.DataFrame(columns=["Website", "Facebook", "Twitter"])
urlsToSearch = ["http://www.nytimes.com","http://www.businessinsider.com/"]
for url in urlsToSearch:
r = requests.get(url)
tw_links = []
fb_links = []
soup = BeautifulSoup(r.text, 'html.parser')
all_links = [link['href'] for link in soup.find_all('a', href = True)] #only get href
for link in all_links:
if "twitter.com" in link:
tw_links.append(link)
elif "facebook.com" in link:
fb_links.append(link)
df.loc[df.shape[0]] = [url,fb_links,tw_links] #Add row to end of df

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Webscraping multiple pages from a site gives just the home page contents? - python

You are still using the old soup object after creating a new nw_soup object Try changing this from nw_soup = BeautifulSoup(nw_response.text,'html.parser') articles = soup.find_all('article') to nw_soup = BeautifulSoup(nw_response.text,'html.parser') articles = nw_soup.find_all('article')

Related

Pulling p tags from multiple URLs

Pulling all yelp reviews via beautifulsoup

Web Scraping multiple pages of a web site?

Getting web links to all items in a table and then doing pagination

Get links from a site's homepage using python

Categories

Resources