Web Crawler Looping the URL to crawl many pages

Web Crawler Looping the URL to crawl many pages - python

I am lost with making a loop to go through all of the pages on this book site. The url ends in 'all?page=' followed by the page number, so it should be easy I thought, but I'm stuck. All the info gathering works fine, I just don't know how to move to the next pages. Any help would be appreciated.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page=' +str(page)
page = 1
page += 1
for page in max_pages:
html = requests.get(URL)
soup = BeautifulSoup(html.content, "html.parser")
# ^This part I need help with^
# results = all books present on page
# books = each individual book on the page
results = soup.find(class_='tab search')
books = results.find_all('div', class_='book-item')
for book in books:
title = book.h3.a
author = book.p.span
# in case there is no rating on a book
if len(book.find('div','rating-wrap').findAll('span', 'full-star')) == None:
pass
else: rating = len(book.find('div','rating-wrap').findAll('span', 'full-star'))
publish_date = book.find(class_='published')
format = book.find(class_='format')
price = book.find('span', class_='sale-price').text.strip()
# if there is no discount
if book.find(class_='rrp') == None:
pass
else:
original_price = book.find(class_='rrp').text.strip()
if book.find(class_='price-save') == None:
pass
else:
discount = book.find(class_='price-save').text.strip()
# unneeded text removed such as 'US' before the price shown
price = price.replace('US', '')
original_price = original_price.replace('US', '')
discount = discount.replace('Save US', '')
# .text.strip() gets text and rids of empty spaces
print(title.text.strip())
print(author.text.strip())
print(rating, 'stars')
print(publish_date.text.strip())
print(format.text.strip())
print(price)
print(original_price)
print(discount, 'in savings!')

This code loops 5 times (in this case), with page going up by one every single time.
# Number of result pages to crawl.
max_pages = 5
# The question starts counting at page 1, so iterate 1..max_pages inclusive
# rather than 0..max_pages-1.
for page in range(1, max_pages + 1):
    # Build the URL for this page, then download and parse it.
    URL = f"https://www.bookdepository.com/category/352/Science-Fiction/browse/viewmode/all?page={page}"
    html = requests.get(URL)
    soup = BeautifulSoup(html.content, "html.parser")
    # The per-page scraping code (results / books loop) belongs here,
    # inside the loop body, so it runs once for every page.

Related

Python / BeautifulSoup webscraper returning "None"

trying to build a webscraper to return lists of freelance gig postings on different websites into one place. My code is below and it keeps returning "None". I'm a bit stuck at this point, if you can help identify why it keeps doing this that would be great.
import requests
from bs4 import BeautifulSoup
import pprint
res1 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=python&badges=&sort_by=posted_desc') # this is where we will scrape the info from
soup1 = BeautifulSoup(res1.text, 'html.parser') # this tells BS to give us HTML code for the page
links1 = soup1.select('.new-task-list-item new-task-list-item--open') # link of each gig
subtext1 = soup1.select('.new-task-list-item__date at-icon-calendar') # date of each gig
res2 = requests.get('https://www.airtasker.com/tasks/?task_states=posted&lat=-33.7918&lon=151.0806&location_name=Eastwood%2C%20NSW&radius=20000000&carl_ids=&task_types=both&max_price=9999&min_price=5&search_term=web%20developer&badges=&sort_by=posted_desc')
soup2 = BeautifulSoup(res2.text, 'html.parser')
links2 = soup2.select('.new-task-list-item new-task-list-item--open')
subtext2 = soup2.select('.new-task-list-item__date at-icon-calendar')
res3 = requests.get('https://www.upwork.com/freelance-jobs/website/')
soup3 = BeautifulSoup(res3.text, 'html.parser')
links3 = soup3.select('.job-title')
subtext3 = soup3.select('.text-muted')
res4 = requests.get('https://www.upwork.com/freelance-jobs/data-science/')
soup4 = BeautifulSoup(res4.text, 'html.parser')
links4 = soup4.select('.job-title')
subtext4 = soup4.select('.text-muted')
res5 = requests.get('https://www.upwork.com/freelance-jobs/bot-development/')
soup5 = BeautifulSoup(res5.text, 'html.parser')
links5 = soup5.select('.job-title')
subtext5 = soup5.select('.text-muted')
res6 = requests.get('https://www.upwork.com/freelance-jobs/python-script/')
soup6 = BeautifulSoup(res6.text, 'html.parser')
links6 = soup6.select('.job-title')
subtext6 = soup6.select('.text-muted')
mega_links = links1 + links2 + links3 + links4 + links5 + links6
mega_subtext = subtext1 + subtext2 + subtext3 + subtext4 + subtext5 + subtext6
def extract(links, subtexts):
    """Return a list of job dicts built from scraped link elements.

    Args:
        links: Iterable of elements that provide ``getText()`` and
            ``get('href')`` (e.g. the results of ``soup.select(...)``).
        subtexts: Accepted so existing callers keep working; not used.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per element
        in ``links``, in the same order. Empty if ``links`` is empty.
    """
    return [
        {'title': item.getText(), 'link': item.get('href')}
        for item in links
    ]
pprint.pprint(extract(mega_links , mega_subtext))

I have no idea what exactly you are trying to extract from the scraped web page requests. Here's what I tried from my end:
Your links variables are empty lists because no element on the web page matches the selectors you are using. For example, the console of the first web page that you are scraping (the element you're trying to scrape doesn't exist):
I would recommend that you confirm the element you're trying to scrape and confirm its class.
Another Point of Consideration:
When you print your soup variables, you will notice that you get a Cloudflare page as the output.

python requests and bs4 how to navigate through the children of an element

so this is my code
from bs4 import BeautifulSoup
import requests
import time

URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'

# loads page
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
book = soup.select_one('td[class^="subject windowbg2"]').text

# Poll the board forever, comparing the newest entry against the last seen one.
while True:
    # reloads the page
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, "html.parser")
    # gets the newest book
    new_book = soup.select_one('td[class^="subject windowbg2"]').text
    # checks if a new book has been uploaded
    if book == new_book:
        print("no new book found")
    elif book != new_book:
        print(new_book)
        # remember the new entry so it is only reported once
        book = soup.select_one('td[class^="subject windowbg2"]').text
    # repeats after 30 seconds
    time.sleep(30)
If you go to the website and have a look, I get the text of the newest book uploaded, but I want to be able to separate the title and the author. The title and author are in different elements, but they don't have a way to identify them (like a class or an ID). If you can help, please do. Thanks.

Assuming the HTML remains consistent across entries (I only checked a few), when new text is found under the pinned listings at the top (I assume this to be a new book), you need to extract the book URL and visit that URL; then you can use `:-soup-contains` to target the author and book title by specific text, and `next_sibling` to get the required return values.
N.B. I have removed the while loop for the purposes of this answer. The additions to the elif are the important ones.
from bs4 import BeautifulSoup
import requests

URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'

# loads page
r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
book = ''  # for testing altered this line

r = requests.get(URL)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
new_book = soup.select_one('td[class^="subject windowbg2"]').text

# checks if a new book has been uploaded
if book == new_book:
    print("no new book found")
elif book != new_book:
    print(new_book)
    # URL of the first listing whose row/cell is not matched as sticky (pinned);
    # NOTE(review): selector depends on the board's current markup - verify.
    new_book_url = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')['href']
    # visit the book's own page to read its details
    r = requests.get(new_book_url)
    soup = BeautifulSoup(r.content, "html.parser")
    # each label sits in a <strong>; the value is taken two siblings after it
    for member in ['TITLE ', 'AUTHOR']:
        print(soup.select_one(f'strong:-soup-contains("{member}")').next_sibling.next_sibling)

BeautifulSoup (Python): how grab text-string next to a tag (that may or may not exist)?

I think my title explains the problem I am facing pretty well. Let's look at a picture of the problem. (You can find the web page at this address; however, it has probably changed.)
I have highlighted the text that I want to grab in blue; this is the model year, 2008. Now, it is not necessary for the seller to submit the model year, so this may or may not exist. But when it does exist, it always follows the <i> tag with class="fa fa-calendar". My solution so far has been to grab all the text within <p class="result-details"> ... </p> (this then becomes a list) and then choose the second element, conditioned on <i class="fa fa-calendar"> ... </i> existing. Otherwise I do not grab anything.
Now, it seems that this does not work in general, since the text that comes before the second element can be arranged into more than one element if it has whitespace in it. So, is there any way (any function) to grab a text string that neighbours another tag, as seen in my picture?
PS: if I have made myself unclear, I just want to fetch the year 2008 from the post on the web page if it exists.
Edit
In this situation my code erroneously gives me the word "Hjulvältar" (bulldozer in English) instead of the year 2008.
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests
url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True
for produktgrupp in kategorier:
index += 1
mapp = mappar[index]
save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
underkategorier = kategorier[produktgrupp]
for underkategori in underkategorier:
# OBS
if underkategori != 'borrutrustning-ytborrning' and status:
continue
else:
status = False
# OBS
if underkategori in url_avvikande:
url = f'{url_main}/{produktgrupp}/{underkategori}'
elif underkategori == 'gravmaskiner-med-frontskopa':
url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
elif underkategori == 'borrutrustning-ytborrning':
url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
else:
url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
sida = 1
print(url)
with open(file_name, 'w') as f:
while True:
print(sida)
html_text = None
soup = None
links = None
while links == None:
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'lxml')
links = soup.find('ul', class_ = 'page-numbers')
annonser = soup.find_all('li', class_ = 'col-row single-result')
for annons in annonser:
modell = annons.find('a', class_ = 'title-font').text
if annons.p.find('i', class_ = 'fa fa-calendar') != None:
tillverkningsar = annons.find('p', class_ = 'result-details').text.strip().split(" ")[1]
else:
tillverkningsar = 'Ej angiven'
try:
pris = annons.find('span', class_ = 'title-font no-ws-wrap').text
except AttributeError:
pris = annons.find('span', class_ = 'title-font no-price').text
f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
url_part = None
sida += 1
try:
url_part = links.find('a', text = f'{sida}')['href']
except TypeError:
print(f'Avläsning av underkategori klar.')
break
url = f'{url_main}{url_part}'

As you loop the listings you can test if that calendar icon class is present, if it is then grab the next_sibling
import requests
from bs4 import BeautifulSoup as bs

r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
# every element with the "single-result" class is treated as one listing
listings = soup.select('.single-result')
for listing in listings:
    # the calendar icon is only present when a listing has one; select_one
    # returns None otherwise
    calendar = listing.select_one('.fa-calendar')
    if calendar is not None:
        # the node right after the icon (expected to be the year text)
        print(calendar.next_sibling)
    else:
        print('Not present')

How to extract a span tag inside div another tag

I have written code in Python using Beautiful Soup to extract user names and their ratings from IMDb. But there are many users who did not give a rating with their review, so it becomes difficult to map ratings exactly to their reviews. How can I do this part?
http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt
In this url reviews are not assign rating.
url1 ="http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
response = requests.get(url1, headers=headers)
page=response.content
soup=BeautifulSoup(page)
for k in soup.findAll('div',{"class":"load-more-data"}):
if k.name == 'span' and m['class'] == "rating-other-user-rating":
print blah()
else:
print blah 1()
This is the code to check whether the rating part exists in the review part or not, but it does not return anything.

The information you're looking for (username, rating) is located in 'div.review-container' tags.
About the tags that have no rating, you can just ignore them.
for k in soup.find_all('div',{"class":"review-container"}):
rating = k.find('span', class_='rating-other-user-rating')
if rating:
rating = ''.join(i.text for i in rating.find_all('span')[-2:])
name = k.find('span', class_='display-name-link').text
print name, rating
The information that shows when you press the Load More button is loaded via XHR requests.
You'll find all the data you need in order to perform the request in a 'div.load-more-data' tag.
load_more = soup.find('div', class_='load-more-data')
url = 'http://www.imdb.com{}?paginationKey={}'.format(
load_more['data-ajaxurl'], load_more['data-key']
)
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
Just repeat the above process until you have all the info.
import requests
from bs4 import BeautifulSoup
url = "http://www.imdb.com/title/tt2866360/reviews?ref_=tt_ov_rt"
ajax_url = url.split('?')[0] + "/_ajax?paginationKey={}"
reviews = []
while True:
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
for k in soup.find_all('div',{"class":"review-container"}):
rating = k.find('span', class_='rating-other-user-rating')
if rating:
rating = ''.join(i.text for i in rating.find_all('span')[-2:])
name = k.find('span', class_='display-name-link').text
reviews.append([name, rating])
print name, rating
load_more = soup.find('div', class_='load-more-data')
if not load_more:
break
url = ajax_url.format(load_more['data-key'])

I suggest you try printing the content of <div class="review-container" ... for every review to the console. Then select the specific data you want to retrieve.

Scrape through website and iterate over search results to get specific data

I'm trying to work on a project to scrape www.boattrader.com to push 800 listings with the Make, Price, and Phone Number of each boat to a CSV file.
I'm looking for guidance on the best way to scrape the links to each boat listing from the search results and then parse through each individual page to grab the Make, Price and Phone number.
Any guidance would be much appreciated!
Thanks again!
from bs4 import BeautifulSoup, SoupStrainer
import requests
def extract_from_search(search_results):
# make this into a function
r = requests.get(search_results)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
for link in possible_links:
if link.has_attr('href'):
boat_links = link.attrs['href']
return boat_links
search_results = 'http://www.boattrader.com/search-results/NewOrUsed-any/Type-all/Zip-90007/Radius-2000/Sort-Length:DESC/Page-1,50'
boat_links = extract_from_search(search_results)
print boat_links #why does this only print one link? What would be the best way to iterate over the search results, so I can put those links into the boat_listing variable to grab the information I'm looking for?
def extract_from_listing(boat_listing):
r = requests.get(boat_listing)
ad_page_html = r.text
soup = BeautifulSoup(ad_page_html, 'html.parser')
table_heads = soup.find_all('th')
for th in table_heads:
if th.text =="Make":
make = th.find_next_sibling("td").text
price = soup.find('span', {'class': 'bd-price'})
formatted_price = price.string.strip()
contact_info = soup.find('div', {'class': 'phone'})
reversed_phone = contact_info.string[::-1]
temp_phone = reversed_phone.replace(')', '}')
temp_phone2 = temp_phone.replace('(', ')')
correct_phone = temp_phone2.replace("}", "(")
return make, formatted_price, correct_phone
boat_listing = 'http://www.boattrader.com/listing/2009-Briggs-BR9134-Sportfish-102290211'
make, price, phone = extract_from_listing(boat_listing)
print make
print price
print phone

You are only returning the last link, you need to append:
def extract_from_search(search_results):
    """Return the href of every 'btn btn-orange' link on a search page.

    Args:
        search_results: URL of the search-results page to download.

    Returns:
        A list with the ``href`` value of each matching ``<a>`` element
        that has one, in page order (empty if none are found).
    """
    r = requests.get(search_results)
    ad_page_html = r.text
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    boat_links = []  # create list to append all links to
    for link in possible_links:
        if link.has_attr('href'):
            boat_links.append(link.attrs['href'])  # append each link
    # return after the loop so every link is collected, not just one
    return boat_links
Or use a list comp:
def extract_from_search(search_results):
    """Return the href of every 'btn btn-orange' link on a search page.

    Args:
        search_results: URL of the search-results page to download.

    Returns:
        A list with the ``href`` value of each matching ``<a>`` element
        that has one, in page order (empty if none are found).
    """
    r = requests.get(search_results)
    ad_page_html = r.content  # use content to let requests handle the decoding
    soup = BeautifulSoup(ad_page_html, 'html.parser')
    possible_links = soup.find_all('a', {'class': 'btn btn-orange'})
    return [link.attrs['href'] for link in possible_links if link.has_attr('href')]

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Web Crawler Looping the URL to crawl many pages - python

Related

Python / BeautifulSoup webscraper returning "None"

python requests and bs4 how to navigate through the children of an element

BeautifulSoup (Python): how grab text-string next to a tag (that may or may not exist)?

How to extract a span tag inside div another tag

Scrape through website and iterate over search results to get specific data

Categories

Resources