Web scraper in Python for multiple articles of the same webpage

I want to download multiple articles at once and save them into a CSV file.
I have set a limit of 5 pages of the website for the specific topic, but I am not sure it is supposed to take this long. Each page has 10 articles, so it should be 50 articles in total.
The code has been running for more than an hour now; is this normal?
Thank you.
I am running the code in Google Colab:
import requests
from bs4 import BeautifulSoup

# Generalized function to get all news-related article URLs from a Nasdaq webpage
def get_news_urls(links_site):
    resp = requests.get(links_site)
    if not resp.ok:
        return None
    html = resp.content
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find_all('a')
    urls = [link.get('href') for link in links]
    urls = [url for url in urls if url is not None]
    news_urls = [url for url in urls if '/article/' in url]
    return news_urls

def scrape_all_articles(ticker, upper_page_limit=5):
    landing_site = 'http://www.nasdaq.com/symbol/' + ticker + '/news-headlines'
    all_news_urls = get_news_urls(landing_site)
    current_urls_list = all_news_urls.copy()
    index = 2
    while (current_urls_list is not None) and (current_urls_list != []) and \
            (index <= upper_page_limit):
        current_site = landing_site + '?page=' + str(index)
        current_urls_list = get_news_urls(current_site)
        all_news_urls = all_news_urls + current_urls_list
        index = index + 1
    all_news_urls = list(set(all_news_urls))
    # Now that we have a list of URLs, scrape the article text from each one
    # (scrape_news_text is defined elsewhere in my notebook)
    all_articles = [scrape_news_text(news_url) for news_url in all_news_urls]
    return all_articles

nflx_articles = scrape_all_articles('nflx', 5)
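For the CSV part of the goal, once nflx_articles has been built it can be written out with the standard csv module; a minimal sketch, assuming each entry is a plain text string returned by scrape_news_text:
import csv

# Sketch: one article per row, assuming nflx_articles is a list of text strings
with open('nflx_articles.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['article_text'])   # header row
    for article in nflx_articles:
        writer.writerow([article])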

Related

Python / BeautifulSoup webscraper returning "None"

I'm trying to build a web scraper that collects freelance gig postings from different websites into one place. My code is below, and it keeps returning "None". I'm a bit stuck at this point; if you can help identify why it keeps doing this, that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    joblist = []
    for indx, item in enumerate(links):
        title = item.getText()
        href = item.get('href')
        joblist.append({'title': title, 'link': href})
    return joblist

pprint.pprint(extract(mega_links, mega_subtext))
I'm not sure exactly what you are trying to extract from the scraped pages, but here's what I found on my end:
Your links variables are empty lists because no element matching those selectors exists on the pages you're trying to scrape; if you inspect the first page in the browser console, the element the selector targets isn't there.
I would recommend confirming which element you're trying to scrape and what its class actually is.
Another point of consideration:
When you print your soup variables, you will notice that the output comes from Cloudflare, i.e. the request is being intercepted before it reaches the real page.
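A short sketch of both points, reusing res1 and soup1 from the question (the class names come from the question and may not match the live site):
# In CSS, a space means "descendant element"; to match a single element that
# carries BOTH classes, chain them with no space in between:
links1 = soup1.select('.new-task-list-item.new-task-list-item--open')

# If the selector looks right but the list is still empty, check whether the
# response is really the listing page or a Cloudflare challenge/block page:
if res1.status_code in (403, 503) or 'cloudflare' in res1.text.lower():
    print('Request was blocked or challenged; the HTML is not the real page')

print(len(links1))  # 0 here means the element/class genuinely is not in the HTML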

Web Crawler Looping the URL to crawl many pages

I am lost trying to make a loop that goes through all of the pages on this book site. The URL ends in 'all?page=' followed by the page number, so I thought it should be easy, but I'm stuck. All the info gathering works fine; I just don't know how to move on to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup

URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' + str(page)
page = 1
page += 1

for page in max_pages:
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    # ^This part I need help with^
    # results = all books present on page
    # books = each individual book on the page
    results = soup.find(class_='tab search')
    books = results.find_all('div', class_='book-item')
    for book in books:
        title = book.h3.a
        author = book.p.span
        # in case there is no rating on a book
        if len(book.find('div', 'rating-wrap').findAll('span', 'full-star')) == None:
            pass
        else:
            rating = len(book.find('div', 'rating-wrap').findAll('span', 'full-star'))
        publish_date = book.find(class_='published')
        format = book.find(class_='format')
        price = book.find('span', class_='sale-price').text.strip()
        # if there is no discount
        if book.find(class_='rrp') == None:
            pass
        else:
            original_price = book.find(class_='rrp').text.strip()
        if book.find(class_='price-save') == None:
            pass
        else:
            discount = book.find(class_='price-save').text.strip()
        # unneeded text removed, such as 'US' before the price shown
        price = price.replace('US', '')
        original_price = original_price.replace('US', '')
        discount = discount.replace('Save US', '')
        # .text.strip() gets the text and removes surrounding whitespace
        print(title.text.strip())
        print(author.text.strip())
        print(rating, 'stars')
        print(publish_date.text.strip())
        print(format.text.strip())
        print(price)
        print(original_price)
        print(discount, 'in savings!')
The code below loops 5 times (in this case), with page going up by one each iteration, and rebuilds the URL inside the loop so every request fetches a different page.
max_pages = 5
for page in range(max_pages):
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
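Note that range(max_pages) yields 0 through 4; if the site numbers its pages starting at 1, you may want range(1, max_pages + 1) instead. A sketch of the full structure, with the per-book parsing from the question moved into a hypothetical parse_books(soup) helper:
max_pages = 5
for page in range(1, max_pages + 1):  # assuming page numbers start at 1 on the site
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    parse_books(soup)  # hypothetical helper wrapping the book-parsing code from the question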

Script crawls only the first page instead of multiple pages

I am trying to crawl multiple pages of a website, but the program only crawls the first page.
import requests
from bs4 import BeautifulSoup
import re
import json
import time

def make_soup(url):
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')
    pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
    script = soup.find("script", text=pattern)
    jsonData = pattern.search(script.text).group(1)
    pattern_number = re.compile(r'\"[0-9]{9,12}\":(\{\"data\":\{\"cachedFilters\":(.*?)\}\}),\"[0-9]{9,11}\"')
    jsonData2 = pattern_number.search(jsonData).group(1)
    dictData = json.loads(jsonData2)
    return dictData

def get_reviews(dictData):
    """Return a list of five dicts with reviews."""
    all_dictionaries = []
    for data in dictData['data']['locations']:
        for reviews in data['reviewListPage']['reviews']:
            review_dict = {}
            review_dict["reviewid"] = reviews['id']
            review_dict["reviewurl"] = reviews['absoluteUrl']
            review_dict["reviewlang"] = reviews['language']
            review_dict["reviewdate"] = reviews['createdDate']
            userProfile = reviews['userProfile']
            review_dict["author"] = userProfile['displayName']
            all_dictionaries.append(review_dict)
    return all_dictionaries

def main():
    url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS'
    dictData = make_soup(url)
    review_list = get_reviews(dictData)  # list with five dicts
    #print(review_list)
    page_number = 5
    while page_number <= 260:  # number in the URL
        next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
        dictData = make_soup(url)
        review_list2 = get_reviews(dictData)
        print(review_list2)
        page_number += 5
        time.sleep(0.5)

if __name__ == "__main__":
    main()
I'm also not sure whether I can crawl multiple pages with this URL scheme. The website has 54 pages, but in the URL the offset always increases by 5, like this:
Page 1
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS
Page 2
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or5-Coronado_Hotel-Zurich.html#REVIEWS
Page 3
https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or10-Coronado_Hotel-Zurich.html#REVIEWS
I don't know if this is a good approach. Do you have any suggestions? Thank you in advance!
You assign the new URL to next_url, but then you use url to read the page:
next_url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
You have to rename the variable:
url = 'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or' + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS'
dictData = make_soup(url)
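With that fix applied, the loop inside main() then looks roughly like this (the offset steps by 5, matching the or5, or10, ... pattern from the question):
page_number = 5
while page_number <= 260:
    url = ('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-or'
           + str(page_number) + '-Coronado_Hotel-Zurich.html#REVIEWS')
    dictData = make_soup(url)          # now fetches the page for the current offset
    review_list2 = get_reviews(dictData)
    print(review_list2)
    page_number += 5
    time.sleep(0.5)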

Scrape web until the "next" page is disabled

url = 'https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-Global_Village-Dubai_Emirate_of_Dubai.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

def get_links():
    review_links = []
    for review_link in soup.find_all('a', {'class': 'title'}, href=True):
        review_link = review_link['href']
        review_links.append(review_link)
    return review_links

link = 'https://www.tripadvisor.ie'
review_urls = []
for i in get_links():
    review_url = link + i
    print(review_url)
    review_urls.append(review_url)
This code saves all the review hyperlinks present on this webpage, but I want to scrape the hyperlinks from all pages, up to page 319. I'm not able to implement the part that keeps going until the "next" pagination link is disabled.
There is a parameter you can change in the URL to loop over and get all the reviews, so I just added a loop that requests all the URLs:
def get_page(index):
    url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
    html = requests.get(url)
    page = soup(html.text, 'html.parser')
    return page

nb_review = 3187
for i in range(0, nb_review, 10):
    page = get_page(i)
The full code using your snippet is:
from bs4 import BeautifulSoup as soup
import requests

def get_page(index):
    url = "https://www.tripadvisor.ie/Attraction_Review-g295424-d2038312-Reviews-or{}-Global_Village-Dubai_Emirate_of_Dubai.html".format(str(index))
    html = requests.get(url)
    page = soup(html.text, 'html.parser')
    return page

def get_links(page):
    review_links = []
    for review_link in page.find_all('a', {'class': 'title'}, href=True):
        review_link = review_link['href']
        review_links.append(review_link)
    return review_links

link = 'https://www.tripadvisor.ie'
review_urls = []
nb_review = 3187

for i in range(0, nb_review, 10):
    page = get_page(i)
    for i in get_links(page):
        review_url = link + i
        review_urls.append(review_url)

print(len(review_urls))
OUTPUT:
3187
EDIT:
You can of course scrape the total review count from the first page and use it instead of the hard-coded nb_review, to make the code more flexible.
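Alternatively, matching the question title more directly, you can keep requesting pages until a page yields no review links instead of relying on a review count; a sketch reusing get_page and get_links from above (not part of the original answer, and it assumes an out-of-range offset returns a page with no matching links):
review_urls = []
index = 0
while True:
    page = get_page(index)
    links = get_links(page)
    if not links:       # no review links on this page: "next" is effectively disabled
        break
    review_urls.extend('https://www.tripadvisor.ie' + path for path in links)
    index += 10         # the or{index} offset steps by 10 reviews per page
    # add a hard upper bound here if the site repeats the last page for large offsets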

Python shows same results for every page. Beautiful Soup

I'm super new to Python, and I am trying to scrape some data from Google Scholar as a project. The code with the problem looks like this:
yearList = []

def getYear():
    for div in soup.find_all("div", class_='gs_a'):
        yearRegex = re.compile(r".*(\d\d\d\d).*")
        yo = yearRegex.findall(div.text)
        yearList.append(yo)
    print(yearList)

page = 0
i = 0
while i < numPages:
    link = 'https://scholar.google.de/scholar?start=' + str(page) + '&q=' + search + '&hl=de&as_sdt=0,5'
    res = requests.get(link)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    getYear()  # this is the function that extracts the data
    page += 20  # to get to the next page of the results
    i += 1
The page variable, and therefore the link, really does change by 20 each time. However, for some reason the program just scrapes the first page of the search results, as if the link had never changed. What am I missing?
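One diagnostic worth trying (not something from the original post): print the URL and response details on each iteration, to see whether Google Scholar is redirecting or blocking the automated requests rather than the loop itself being wrong. A minimal sketch:
while i < numPages:
    link = 'https://scholar.google.de/scholar?start=' + str(page) + '&q=' + search + '&hl=de&as_sdt=0,5'
    res = requests.get(link)
    # Diagnostic: confirm a different URL is requested each time and that the
    # response is not a redirect to a block/captcha page.
    print(page, res.status_code, res.url)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    getYear()
    page += 20
    i += 1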
