I tried the following code to download all the PDF files from the links, but it downloads every file again each time I run it. What I want: the first run should download all the PDFs, and every later run should download only the new ones (it should first check which files are new).
My Code:
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'

def getLinks(url):
    all_links = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find_all(class_='omrlist'):
        all_links.append(root_url + href.find('a').get('href'))
    return all_links

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
# get report URL
reportLinks = []
for url in yearLinks:
    links = getLinks(url)
    #reportLinks.extend(links)
    #print(reportLinks)
    i = 0
    for url_ in links:
        if "AnnualStatisticalSupplement" not in url_:
            url__ = url_.replace("org..", "org").replace("../", "")
            response = requests.get(url__, stream=True)
            lastindex = url__.rfind('/')
            strlen = len(url__)
            filename = url__[lastindex:strlen]
            with open('/home/pdfs/' + str(filename), 'wb') as pdffile:
                pdffile.write(response.content)
            i += 1
            print(url__)
print("Download Completed")
Then I need to store each file's details in MongoDB. How should I do that with three fields (pdf name, reported date, flag of process)?
Sorry for the significant changes to your code, but the original was too messy to read.
If you want to download only the PDFs you don't already have, you must add a check to control when to stop. By the way, if you store the page URL in your database, you won't need to access the page again just to get the PDF name.
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
downloaded = ["2018-02-13.pdf"]  # the latest I have

def getLinks(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")
    links = [root_url + href.a.get('href') for href in li]
    return links

def get_pdf(url, flag=1):
    # find page links in the month directory
    pdf_page = requests.get(url, headers=headers)
    soup = BeautifulSoup(pdf_page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")[::-1]  # latest -> old
    latest_pdf_set = [root_url + href.a.get('href') for href in li]
    # find pdf link
    pdf_links = []
    for pdf_url in latest_pdf_set:
        text = requests.get(pdf_url, headers=headers).text
        soup = BeautifulSoup(text, "lxml")
        link = soup.find("div", class_="omrreport pL10").find("a").get("href")
        if link.split("/")[-1] in downloaded:
            flag = 0  # flag = 0 means you found a pdf that you already have
            break
        pdf_links.append(root_url + link)
    return pdf_links, flag

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')
all_ = []
for each in yearLinks:
    pdf_links = get_pdf(each)
    all_ += pdf_links[0]
    if not pdf_links[1]:
        # flag = 0 -> we reached a pdf we already have, stop
        break
print(all_)
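To cover the MongoDB part of the question: below is a minimal sketch, assuming pymongo and a local MongoDB instance (the database and collection names here are made up), of keeping one document per PDF with the three fields you asked about and using it to skip files you already have.

import datetime
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017/')  # assumed local instance
collection = client['oil_reports']['pdfs']          # hypothetical db/collection names

def already_downloaded(pdf_name):
    # one document per PDF, keyed by its file name (e.g. "2018-02-13.pdf")
    return collection.find_one({'pdf_name': pdf_name}) is not None

def record_download(pdf_name):
    collection.insert_one({
        'pdf_name': pdf_name,
        'reported_date': datetime.datetime.utcnow(),  # or parse the date from the file name
        'processed': False,                           # flag of process
    })

A later processing job can then flip the flag with collection.update_one({'pdf_name': name}, {'$set': {'processed': True}}), and the already_downloaded() check can replace the hard-coded downloaded list above.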
Here is the website I am trying to scrape: https://books.toscrape.com/
Below are my functions. scrape_all_pages() is not working. Is there a better way to get the page count from the website directly so I can use the range function instead?
I did check out Finding number of pages using Python BeautifulSoup.
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    """Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup

def scrape_page(num):
    """Takes a page and returns a list of links to the books that are on the page."""
    BASE_URL = 'http://books.toscrape.com/catalogue/'
    PAGE_URL = BASE_URL + str('page-')
    book_url = []
    soup = get_soup(PAGE_URL + str(num) + '.html')
    for x in soup.findAll("article", class_="product_pod"):
        url = x.div.a.get('href')
        link = BASE_URL + url
        if x not in book_url:
            book_url.append(link)
    return book_url

def scrape_all_pages():
    """Scrapes all pages, returning a list of book links."""
    page_num = 0
    all_urls = []
    while True:
        url = scrape_page(page_num)
        if not url:
            break
        all_urls += url
        page_num += 1
    return all_urls
You do not need range() in most cases. I would recommend changing strategy and checking whether a link to the next page is available or not:
if soup.select_one('li.next a[href]'):
    nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
else:
    nextPage = None
or, from Python 3.8 and later:
nextPage = BASE_URL + a['href'] if(a := soup.select_one('li.next a[href]')) else None
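If you do want a page count so you can keep using range(), here is a minimal sketch; it assumes the pager on books.toscrape.com still shows text like "Page 1 of 50" in an li with class current (verify that selector before relying on it):

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get('https://books.toscrape.com/').text, 'html.parser')
pager = soup.select_one('li.current')    # assumed to contain "Page 1 of 50"
num_pages = int(pager.text.split()[-1]) if pager else 1
print(num_pages)                         # e.g. for num in range(1, num_pages + 1): scrape_page(num)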
Example
Note: this starts from https://books.toscrape.com/catalogue/page-45.html to limit the demo. You could simply change the start URL to https://books.toscrape.com/ to get all pages scraped.
import requests
from bs4 import BeautifulSoup

def get_soup(url):
    """Takes a URL and returns a BeautifulSoup() instance representing the HTML of the page."""
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup

def scrape_page(url):
    """Takes a page, appends the links of the books on the page to a global list, and returns the next page URL."""
    BASE_URL = 'http://books.toscrape.com/catalogue/'
    soup = get_soup(url)
    for x in soup.find_all("article", class_="product_pod"):
        url = x.div.a.get('href')
        link = BASE_URL + url
        if link not in book_url:
            book_url.append(link)
    if soup.select_one('li.next a[href]'):
        nextPage = BASE_URL + soup.select_one('li.next a[href]')['href']
    else:
        nextPage = None
    return nextPage

def scrape_all_pages(url):
    """Scrapes all pages, returning a list of book links."""
    while True:
        if url:
            print(url)
            url = scrape_page(url)
        else:
            break
    return book_url

book_url = []
scrape_all_pages('https://books.toscrape.com/catalogue/page-45.html')
Output
['http://books.toscrape.com/catalogue/annie-on-my-mind_120/index.html',
'http://books.toscrape.com/catalogue/and-then-there-were-none_119/index.html',
'http://books.toscrape.com/catalogue/a-walk-in-the-woods-rediscovering-america-on-the-appalachian-trail_118/index.html',
'http://books.toscrape.com/catalogue/a-visit-from-the-goon-squad_117/index.html',
'http://books.toscrape.com/catalogue/a-storm-of-swords-a-song-of-ice-and-fire-3_116/index.html',
'http://books.toscrape.com/catalogue/a-heartbreaking-work-of-staggering-genius_115/index.html',
'http://books.toscrape.com/catalogue/8-keys-to-mental-health-through-exercise_114/index.html',
'http://books.toscrape.com/catalogue/girlboss_113/index.html',
'http://books.toscrape.com/catalogue/the-suffragettes-little-black-classics-96_112/index.html',
'http://books.toscrape.com/catalogue/the-sense-of-an-ending_111/index.html',
'http://books.toscrape.com/catalogue/the-sandman-vol-2-the-dolls-house-the-sandman-volumes-2_110/index.html',
'http://books.toscrape.com/catalogue/the-course-of-love_109/index.html',
'http://books.toscrape.com/catalogue/sugar-rush-offensive-line-2_108/index.html',
'http://books.toscrape.com/catalogue/saga-volume-2-saga-collected-editions-2_107/index.html',
'http://books.toscrape.com/catalogue/run-spot-run-the-ethics-of-keeping-pets_106/index.html',...]
I am trying to create a function that scrapes college baseball team roster pages for a project. I have created a function that crawls the roster page and gets a list of the links I want to scrape. But when I try to scrape the individual links for each player, it works but cannot find the data that is on their page.
This is the link to the page I am crawling from at the start:
https://gvsulakers.com/sports/baseball/roster
These are just functions that I call within the function that I am having a problem with:
def parse_row(rows):
    return [str(x.string) for x in rows.find_all('td')]

def scrape(url):
    page = requests.get(url, headers=headers)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    return soop

def find_data(url):
    page = requests.get(url, headers=headers)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    row = soop.find_all('tr')
    lopr = [parse_row(rows) for rows in row]
    return lopr
Here is what I am having an issue with: when I assign the result of type1_roster to a variable and print it, I only get an empty list. Ideally it should contain data about a player or players from a player's roster page.
# Roster page crawler
def type1_roster(team_id):
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    href_tags = soop.find_all(href=True)
    hrefs = [tag.get('href') for tag in href_tags]
    # get all player links
    player_hrefs = []
    for href in hrefs:
        if 'sports/baseball/roster' in href:
            if 'sports/baseball/roster/coaches' not in href:
                if 'https:' not in href:
                    player_hrefs.append(href)
    # get rid of duplicates
    player_links = list(set(player_hrefs))
    # scrape the roster links
    for link in player_links:
        player_ = url + link[24:]
        return find_data(player_)
A number of things:
I would pass the headers as a global
I think you are slicing the link one character too late for player_
You need to re-work the logic of find_data(), as the data is present in a mixture of element types and not in table/tr/td elements; some of it, for example, is found in spans. The html attributes are nice and descriptive and will support targeting content easily
You can target the player links from the landing page more tightly with the css selector list shown below. This removes the need for multiple loops as well as the use of list(set())
import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def scrape(url):
    page = requests.get(url, headers=HEADERS)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    return soop

def find_data(url):
    page = requests.get(url, headers=HEADERS)
    # print(page)
    html = page.text
    soop = BeautifulSoup(html, 'lxml')
    # re-think logic here to return desired data e.g.
    # soop.select_one('.sidearm-roster-player-jersey-number').text
    first_name = soop.select_one('.sidearm-roster-player-first-name').text
    # soop.select_one('.sidearm-roster-player-last-name').text
    # need targeted string cleaning possibly
    bio = soop.select_one('#sidearm-roster-player-bio').get_text('')
    return (first_name, bio)

def type1_roster(team_id):
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    player_links = [i['href'] for i in soop.select(
        '.sidearm-roster-players-container .sidearm-roster-player h3 > a')]
    # scrape the roster links
    for link in player_links:
        player_ = url + link[23:]
        # print(player_)
        return find_data(player_)

print(type1_roster('gvsulakers'))
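If you want data for every player rather than just the first link (the return above exits the loop on the first iteration), a small variation, still only a sketch, reusing scrape() and find_data() from above:

def type1_roster_all(team_id):
    url = "https://" + team_id + ".com/sports/baseball/roster"
    soop = scrape(url)
    player_links = [i['href'] for i in soop.select(
        '.sidearm-roster-players-container .sidearm-roster-player h3 > a')]
    # collect one (first_name, bio) tuple per player instead of returning on the first one
    return [find_data(url + link[23:]) for link in player_links]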
I need to download all the files from this page:
https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000&parentFilter=1433&childFilter=1433%7C1450&startMonth=1&startYear=2008&endMonth=6&endYear=2021
that have "Auction of" in their titles. This is the source for one of the files, for example:
Auction of £2,500 million of 0 5/8% Treasury Gilt 2035
I am trying to adapt some code I found in another question, but the pages are coming back empty:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def download_pgn(task):
    session, url, destination_path = task
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    game_url = host + soup.find("a", text="download").get("href")
    filename = re.search(r"\w+\.pgn", game_url).group()
    path = os.path.join(destination_path, filename)
    response = session.get(game_url, stream=True)
    response.raise_for_status()
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

if __name__ == "__main__":
    destination_path = "pgns"
    max_workers = 8
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    with requests.Session() as session:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        pages = soup.find_all("a", href=re.compile(r".*Auction of\?.*"))
        tasks = [
            (session, host + page.get("href"), destination_path)
            for page in pages
        ]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pool.map(download_pgn, tasks)
Check your regular expression syntax. The regex r".*Auction of\?.*" will only match titles that literally contain "of?".
But the href= parameter searches against the URL in the link, so that won't help you much either. This will find the links with the matching titles:
links = soup.find_all("a", string=re.compile(r"Auction of\b"))
And this will extract their URLs so you can retrieve them:
[ file["href"] for file in links ]
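For completeness, a minimal sketch of how those hrefs could then be fetched and saved; it assumes host = 'https://www.dmo.gov.uk' (adjust as needed) and reuses links from the snippet above. The version the asker ended up using below matches on the aria-label attribute instead, so treat this only as an illustration of the download step:

host = "https://www.dmo.gov.uk"

for file in links:
    file_url = host + file["href"]
    # name the local file after the last path segment, dropping any query string
    filename = file_url.split("/")[-1].split("?")[0] or "auction-file"
    with requests.get(file_url, stream=True) as r:
        r.raise_for_status()
        with open(filename, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)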
This is what ended up working for me:
from bs4 import BeautifulSoup
import requests
import re

links = []
url = 'https://www.dmo.gov.uk/publications/?offset=0&itemsPerPage=1000000000&parentFilter=1433&childFilter=1433|1450&startMonth=1&startYear=2000&endMonth=6&endYear=2021'
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")

for a in soup.find_all("a", {"aria-label": re.compile(r"^Auction of\b")}, href=True):
    links.append(a['href'])

def download_file(url):
    path = url.split('/')[-1].split('?')[0]
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in r:
                f.write(chunk)

host = 'https://www.dmo.gov.uk/'
for link in links:
    url = host + link
    download_file(url)
The find_all() method accepts a function. You can create a lambda function to filter for all a tags whose text contains "Auction of":
for tag in soup.find_all(lambda t: t.name == "a" and "Auction of" in t.text):
    print(tag.text)
Or, you can use an [attribute*=value]:
# Find all `aria-label` attributes under an `a` that contain `Auction of`
for tag in soup.select("a[aria-label*='Auction of']"):
    print(tag.text)
I can scrape all the reviews from the web page, but I am not getting the full content. I can only scrape half of each review's content, and I need to scrape the full content.
from bs4 import BeautifulSoup
import requests
import re

s = requests.Session()

def get_soup(url):
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = s.get(url, headers=headers)
    #with open('temp.html', 'wb') as f:
    #    f.write(r.content)
    #    webbrowser.open('temp.html')
    if r.status_code != 200:
        print('status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def parse(url, response):
    if not response:
        print('no response:', url)
        return
    # get number of reviews
    # num_reviews = response.find('span', class_='reviews_header_count').text
    # num_reviews = num_reviews[1:-1]  # remove `( )`
    # num_reviews = num_reviews.replace(',', '')  # remove `,`
    # num_reviews = int(num_reviews)
    # print('num_reviews:', num_reviews, type(num_reviews))
    num_reviews = 20
    # num_reviews = num_reviews[1:-1]  # remove `( )`
    # num_reviews = num_reviews.replace(',', '')  # remove `,`
    # num_reviews = int(num_reviews)
    print('num_reviews:', num_reviews, type(num_reviews))
    # create template for urls to pages with reviews
    url = url.replace('Hilton_New_York_Grand_Central-New_York_City_New_York.html', 'or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html')
    print('template:', url)
    # add requests to list
    for offset in range(0, num_reviews, 5):
        print('url:', url.format(offset))
        url_ = url.format(offset)
        parse_reviews(url_, get_soup(url_))
        #return  # for test only - to stop after first page

def parse_reviews(url, response):
    print('review:', url)
    if not response:
        print('no response:', url)
        return
    for idx, review in enumerate(response.find_all('div', class_='review-container')):
        item = {
            'hotel_name': response.find('h1', class_='heading_title').text,
            'review_title': review.find('span', class_='noQuotes').text,
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='relativeDate')['title'],  # .text  # [idx]
            # 'num_reviews_reviewer': review.find('span', class_='badgetext').text,
            'reviewer_name': review.find('span', class_='scrname').text,
            'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
        }
        #~ yield item
        results.append(item)
        for key, val in item.items():
            print(key, ':', val)
        print('----')
        #return  # for test only - to stop after first review

start_urls = [
    'https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-Hilton_New_York_Grand_Central-New_York_City_New_York.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

results = []
for url in start_urls:
    parse(url, get_soup(url))

import pandas as pd
df = pd.DataFrame(results)  # convert list to DataFrame
df.to_csv('output.csv')
I am getting an output sample like this in the CSV file for a review:
I went on a family trip and it was amazing, I hope to come back soon. The room was small but what can you expect from New York. It was close to many things and the staff was perfect.I will come back again soon.More...
I just want to expand that "More...". I really have no clue how to do it; please help.
I have written one more piece of code, but I am unable to pull the id from the next page. The code is given below:
import re
import urllib
import requests
from bs4 import BeautifulSoup
#import webbrowser

s = requests.Session()
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

for i in range(0, 10, 5):
    url = "https://www.tripadvisor.in/Hotel_Review-g60763-d93339-Reviews-or{}-Hilton_New_York_Grand_Central-New_York_City_New_York.html".format(i)
    print(url)
    r = s.get(url, headers=headers)
    html = BeautifulSoup(r.text, 'html.parser')
    pattern = re.compile(r"UID_(\w+)\-SRC_(\w+)")
    id = html.find("div", id=pattern)["id"]  # was soup.find(...); the parsed page is named html here
    uid = pattern.match(id).group(2)
    print(uid)
    url1 = "https://www.tripadvisor.in/ShowUserReviews-g60763-d93339-r" + str(uid) + "-Hilton_New_York_Grand_Central-New_York_City_New_York.html#CHECK_RATES_CONT"
    print(url1)
    url2 = '"' + url1 + '"'
    print(url2)
The site uses ajax to expand the review content. The full content is not downloaded until the More link is clicked.
One way to access the content would be to figure out the ajax request format and then issue an HTTP request of the same form yourself. That might be difficult, perhaps not.
Another, easier, way is by noticing that the review title is a clickable link which loads the full review in a new page. You can therefore scrape the URL for each review and send a similar GET request. Then scrape the data from the response.
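A minimal sketch of that second approach, building on the parse_reviews() code above. The selectors are assumptions: it assumes the title span (class noQuotes) sits inside an <a> pointing at the single-review (ShowUserReviews) page, and that the full text appears there in a p tag with class partial_entry; TripAdvisor's markup changes often, so verify both before relying on them.

def parse_full_review(review):
    # the review title is assumed to be wrapped in a link to the single-review page
    title_link = review.find('span', class_='noQuotes').find_parent('a')
    full_url = 'https://www.tripadvisor.in' + title_link['href']
    full_soup = get_soup(full_url)   # reuses get_soup() from the code above
    if not full_soup:
        return None
    # assumed: on the single-review page the first 'partial_entry' holds the whole text
    body = full_soup.find('p', class_='partial_entry')
    return body.get_text(strip=True) if body else None

You could then set 'review_body': parse_full_review(review) in the item dictionary instead of reading the truncated partial_entry from the listing page.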
I am trying to get all the unique URLs of the website by calling the all_pages function recursively, but this function is not giving me all the URLs of the website.
All I want to do is get all the unique URLs of the website using BeautifulSoup. My code looks like this:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base_url = "http://www.readings.com.pk/"
unique_urls = []

def all_pages(base_url, unique_urls=[]):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all("a"):
        url = link["href"]
        absolute_url = urljoin(base_url, url)
        if absolute_url not in unique_urls:
            if base_url in absolute_url:
                unique_urls.append(absolute_url)
                print(absolute_url)
                all_pages(absolute_url, unique_urls)

all_pages(base_url, unique_urls)
Use response.text instead of response.content.
Also, you need to return at some point. Additionally, instead of making unique_urls a list, make it a set and the entries will always be unique.
Additionally, your method is recursive and Python has a maximum recursion depth, so maybe you should do this instead:
import requests
from bs4 import BeautifulSoup

base_url = "http://www.readings.com.pk/"

def all_pages(base_url):
    response = requests.get(base_url)
    unique_urls = {base_url}
    visited_urls = set()
    while len(unique_urls) > len(visited_urls):
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a"):
            try:
                url = link["href"]
            except KeyError:
                continue
            absolute_url = base_url + url
            unique_urls.add(absolute_url)
        unvisited_url = (unique_urls - visited_urls).pop()
        visited_urls.add(unvisited_url)
        response = requests.get(unvisited_url)
    return unique_urls

all_pages(base_url)
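One caveat with this sketch: absolute_url = base_url + url produces malformed links for relative hrefs such as ../category/x.html. urllib.parse.urljoin, which the original code already used, handles those cases:

from urllib.parse import urljoin

absolute_url = urljoin(base_url, url)  # resolves relative hrefs against the base URL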