Issue With Web Scraping In Python

For some reason, when I try to get the results from this script, it just crashes and shows no error at all before I get anything. Can someone please help me get this to work? I think it may have to do with how the items variable is populated, but I just can't figure it out. Any help would be appreciated.
Here is the script:
from bs4 import BeautifulSoup
import requests
import re
import time
print("Computer Deal Finder")
print("\nBy: ViridianTelamon.")
print("\nThis Program Will Help You Find The Best Computers, Adapters, Electronics, And Computer Components Using The Website New Egg.")
item_thing = input("\nEnter The Item You Want To Find The Best Deals On: ")
time.sleep(2)
#url = f"https://www.amazon.com/s?k={item}&page=1&crid=1BE844NMMQSV7&sprefix={item}%2Caps%2C1923&ref=nb_sb_noss_1"
url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
#page_text = doc.find(class_="s-pagination-item s-pagination-selected")
page_text = doc.find(class_="list-tool-pagination-text").strong
pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])
items_found = []
for page in range(1, pages + 1):
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131page={page}"
    page = requests.get(url).text
    doc = BeautifulSoup(page, "html.parser")

    items = doc.find_all(text=re.compile(item_thing))
    #items = div.find_all(text=re.compile(item_thing))

    for item in items:
        parent = item.parent
        link = None
        if parent.name != "a":
            continue
        link = parent['href']
        next_parent = item.find_parent(class_="item-container")
        try:
            price = next_parent.find(class_="price-current").find("strong").string
            items_found[item] = {"Price: ": int(price.replace(",", "")), "URL: ": link}
        except:
            pass

#sorted_items = sorted(items_found.items(), key=lambda x: x[1]['price'])
sorted_items = sorted(items_found, key=lambda x: x[1]['price'])

print("\n--------------------")
for item in sorted_items:
    print("\n"f"Name: {item[0]}")
    print("\n"f"Price: ${items[1]['price']}")
    print("\n"f"URL: items[1]['link']")
    print("\n--------------------")
    time.sleep(0.2)

I suggest you test the result of your .find() calls as not all items contain the information you need. For example:
from bs4 import BeautifulSoup
import requests

item_thing = "adapter"
url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131"
page = requests.get(url).text
doc = BeautifulSoup(page, "html.parser")
page_text = doc.find(class_="list-tool-pagination-text").strong
pages = int(str(page_text).split("/")[-2].split(">")[-1][:-1])
items_found = []

for page in range(1, pages + 1):
    print(f"Getting page {page}")
    url = f"https://www.newegg.ca/p/pl?d={item_thing}&N=4131&page={page}"
    req = requests.get(url)
    doc = BeautifulSoup(req.content, "html.parser")

    for div in doc.find_all('div', class_="item-container"):
        li_price = div.find(class_='price-current')
        price = 0   # assume unknown price

        if li_price:
            strong = li_price.find('strong')
            if strong:
                price = float(strong.text.replace(',', ''))

        a_tag = div.find('a', class_='item-title', href=True)
        items_found.append([price, a_tag['href'], a_tag.text])

for price, link, name in sorted(items_found):
    print(f"Name: {name}")
    print(f"Price: ${price}")
    print(f"URL: {link}")
    print("--------------------")
This would give you results starting:
Name: axGear Universal Brass 3.5mm Male to 6.5mm Female Stereo Audio Adapter Jack Connector
Price: $3.0
URL: https://www.newegg.ca/p/231-0099-00023?Description=adapter&cm_re=adapter-_-9SIAD1NC9E3870-_-Product
--------------------
Name: axGear USB-C Female to USB 3.0 Male Adapter Converter Type C to USB 3 F/M
Price: $7.0
URL: https://www.newegg.ca/p/231-0099-00018?Description=adapter&cm_re=adapter-_-9SIAD1NB4E4533-_-Product
--------------------
Name: ORICO USB to Bluetooth 4.0 Portable Adapter Wireless Receiver Adapter Dongle -White
Price: $8.0
URL: https://www.newegg.ca/orico-bta-403/p/0XM-000H-00009?Description=adapter&cm_re=adapter-_-0XM-000H-00009-_-Product
--------------------
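As an aside, the page-count line (pages = int(str(page_text)...)) parses the string repr of the tag and will break if the markup shifts even slightly. A slightly more defensive sketch, assuming the element's text still looks like "1/8" (current page / total pages):

page_el = doc.find(class_="list-tool-pagination-text")
if page_el and page_el.strong:
    # e.g. "1/8" -> the total page count is the part after the slash
    pages = int(page_el.strong.get_text().split("/")[-1])
else:
    pages = 1  # fall back to a single page if the pagination widget is missing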


BeautifulSoup (Python): how to grab a text string next to a tag (that may or may not exist)?

I think my title explains the problem I am facing pretty well. Let's look at a picture of the problem. (You can find the web page at this address; however, it has probably changed.)
I have highlighted the text that I want to grab in blue; this is the model year, 2008. Now, it is not necessary for the seller to submit the model year, so it may or may not exist. But when it does exist, it always follows the <i> tag with class="fa fa-calendar". My solution so far has been to grab all the text within <p class="result-details"> ... </p> (this then becomes a list) and then choose the second element, conditioned on <i class="fa fa-calendar"> ... </i> existing. Otherwise I do not grab anything.
Now, it seems this does not work in general, since the text that comes before the second element can be arranged into more than one element if it has whitespace in it. So, is there any way (any function) to grab a text string that neighbours another tag, as seen in my picture?
PS: If I have made myself unclear, I just want to fetch the year 2008 from the post on the web page if it exists.
Edit
In this situation my code erroneously gives me the word "Hjulvältar" (bulldozer in English) instead of the year 2008.
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests
url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
    'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
    'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
    'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
    'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
    'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
    'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True
for produktgrupp in kategorier:
    index += 1
    mapp = mappar[index]
    save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
    underkategorier = kategorier[produktgrupp]
    for underkategori in underkategorier:
        # OBS
        if underkategori != 'borrutrustning-ytborrning' and status:
            continue
        else:
            status = False
        # OBS
        if underkategori in url_avvikande:
            url = f'{url_main}/{produktgrupp}/{underkategori}'
        elif underkategori == 'gravmaskiner-med-frontskopa':
            url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
        elif underkategori == 'borrutrustning-ytborrning':
            url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
        else:
            url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
        file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
        sida = 1
        print(url)
        with open(file_name, 'w') as f:
            while True:
                print(sida)
                html_text = None
                soup = None
                links = None
                while links == None:
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, 'lxml')
                    links = soup.find('ul', class_ = 'page-numbers')
                annonser = soup.find_all('li', class_ = 'col-row single-result')
                for annons in annonser:
                    modell = annons.find('a', class_ = 'title-font').text
                    if annons.p.find('i', class_ = 'fa fa-calendar') != None:
                        tillverkningsar = annons.find('p', class_ = 'result-details').text.strip().split(" ")[1]
                    else:
                        tillverkningsar = 'Ej angiven'
                    try:
                        pris = annons.find('span', class_ = 'title-font no-ws-wrap').text
                    except AttributeError:
                        pris = annons.find('span', class_ = 'title-font no-price').text
                    f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
                url_part = None
                sida += 1
                try:
                    url_part = links.find('a', text = f'{sida}')['href']
                except TypeError:
                    print(f'Avläsning av underkategori klar.')
                    break
                url = f'{url_main}{url_part}'
As you loop over the listings you can test whether that calendar icon class is present; if it is, grab the next_sibling:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
listings = soup.select('.single-result')
for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    if calendar is not None:
        print(calendar.next_sibling)
    else:
        print('Not present')
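One caveat worth knowing: next_sibling here is a bare text node, so it can carry stray whitespace around the year. A small guard, reusing the calendar variable from the loop above:

if calendar is not None:
    year = str(calendar.next_sibling).strip()  # e.g. " 2008" -> "2008"
    print(year if year else 'Not present')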

How to scrape author name and author URL from a webpage using Python

I am trying to scrape the author name and author URL from the following webpage:
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive
and I am using the following code:
author_flag = 0
divs = soup.find_all('h2')
for div in divs:
    author = div.find('a')
    if(author is not None):
        author_art.append(author.text)
        author_url.append('https://medium.com'+ author.get('href'))
        aurhor_flag = 1
        break
if(author_flag==0):
    author_art.append('Author information missing')
    author_url.append('Author Url information missing')
Can anyone take a look at what I am doing wrong here? This code is not picking anything up; it just returns a blank list.
Full code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
data = pd.read_csv('url_technology.csv')
author_art = []
author_url = []
for i in range(1):
    try:
        author_flag = 0
        divs = soup.find_all('meta')
        for div in divs:
            author = div.find('span')
            if(author is not None):
                author_art.append(author.text)
                author_url.append('https://medium.com'+author.get('href'))
                aurhor_flag = 1
                break
        if(author_flag==0):
            author_art.append('Author information missing')
            author_url.append('Author Url information missing')
    except:
        print('no data found')

author_art = pd.DataFrame(title)
author_url = pd.DataFrame(url)
res = pd.concat([author_art, author_art] , axis=1)
res.columns = ['Author_Art', 'Author_url']
res.to_csv('combined1.csv')
print('File created successfully')
https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20?source=tag_archive---------0-----------------------
https://medium.com/job-advice-for-software-engineers/what-i-want-and-dont-want-to-see-on-your-software-engineering-resume-cbc07913f7f6?source=tag_archive---------1-----------------------
https://itnext.io/load-testing-using-apache-jmeter-af189dd6f805?source=tag_archive---------2-----------------------
https://medium.com/s/story/black-mirror-bandersnatch-a-study-guide-c46dfe9156d?source=tag_archive---------3-----------------------
https://medium.com/fast-company/the-worst-design-crimes-of-2018-56f32b027bb7?source=tag_archive---------4-----------------------
https://towardsdatascience.com/make-your-pictures-beautiful-with-a-touch-of-machine-learning-magic-31672daa3032?source=tag_archive---------5-----------------------
https://medium.com/hackernoon/the-state-of-ruby-2019-is-it-dying-509160a4fb92?source=tag_archive---------6-----------------------
One possibility for getting the author name and author URL is to parse the LD+JSON data embedded within the page:
import json
import requests
from bs4 import BeautifulSoup
url = "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
data = json.loads(soup.select_one('[type="application/ld+json"]').contents[0])
# uncomment this to print all LD+JSON data:
# print(json.dumps(data, indent=4))
print("Author:", data["author"]["name"])
print("URL:", data["author"]["url"])
Prints:
Author: Eric Elliott
URL: https://medium.com/@_ericelliott
EDIT: A function that returns Author Name/URL:
import json
import requests
from bs4 import BeautifulSoup
def get_author_name_url(medium_url):
    soup = BeautifulSoup(requests.get(medium_url).content, "html.parser")
    data = json.loads(
        soup.select_one('[type="application/ld+json"]').contents[0]
    )
    return data["author"]["name"], data["author"]["url"]

url_list = [
    "https://medium.com/javascript-scene/top-javascript-frameworks-and-topics-to-learn-in-2019-b4142f38df20",
]

for url in url_list:
    name, url = get_author_name_url(url)
    print("Author:", name)
    print("URL:", url)
I've launched a Python package called medium-apis to do such tasks.
Install medium-apis:
pip install medium-apis
Get your RapidAPI key.
Run the code:
from medium_apis import Medium
medium = Medium('YOUR_RAPIDAPI_KEY')
def get_author(url):
    url_without_parameters = url.split('?')[0]
    article_id = url_without_parameters.split('-')[-1]
    article = medium.article(article_id=article_id)
    author = article.author
    author.save_info()
    return author

urls = [
    "https://nishu-jain.medium.com/medium-apis-documentation-3384e2d08667",
]

for url in urls:
    author = get_author(url)
    print('Author: ', author.fullname)
    print('Profile URL: ', f'https://medium.com/@{author.username}')
Github repo: https://github.com/weeping-angel/medium-apis

How to extract text within h4 strong?

I am trying to extract each "Overall Rating" (the number value in the strong tags) from each product page:
https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue
The structure goes as follows:
<div class="col-sm-12">
    <h2 class="line-bottom"> Customer Reviews</h2>
    <h4>
        Overall Rating
        <strong>5</strong>
        <span></span>
    </h4>
</div>
I am trying to extract only the strong values.
productsRating = soup.find("div", {"class": "col-sm-12"}).h4
This sometimes works, but the page uses the same class for different elements, so it extracts unwanted HTML elements.
Is there any solution to get only the product's overall rating?
EDITED!!
This is the whole loop for my program:
for page in range(1, 2):
    guitarPage = requests.get('https://www.guitarguitar.co.uk/guitars/electric/page-{}'.format(page)).text
    soup = BeautifulSoup(guitarPage, 'lxml')
    guitars = soup.find_all(class_='col-xs-6 col-sm-4 col-md-4 col-lg-3')

    for guitar in guitars:
        title_text = guitar.h3.text.strip()
        print('Guitar Name: ', title_text)
        price = guitar.find(class_='price bold small').text.strip()
        trim = re.compile(r'[^\d.,]+')
        int_price = trim.sub('', price)
        print('Guitar Price: ', int_price)
        priceSave = guitar.find('span', {'class': 'price save'})
        if priceSave is not None:
            priceOf = priceSave.text
            trim = re.compile(r'[^\d.,]+')
            int_priceOff = trim.sub('', priceOf)
            print('Save: ', int_priceOff)
        else:
            print("No discount!")
        image = guitar.img.get('src')
        print('Guitar Image: ', image)
        productLink = guitar.find('a').get('href')
        linkProd = url + productLink
        print('Link of product', linkProd)
        productsPage.append(linkProd)

    for products in productsPage:
        response = requests.get(products)
        soup = BeautifulSoup(response.content, "lxml")
        productsDetails = soup.find("div", {"class": "description-preview"})
        if productsDetails is not None:
            description = productsDetails.text
            print('product detail: ', description)
        else:
            print('none')
        time.sleep(0.2)
        productsRating = soup.find_all('strong')[0].text
        print(productsRating)
Review info is all in a script tag which you can extract and load with json. It is simple enough to see how to fit that into a loop.
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/12082017334688--epiphone-les-paul-standard-plus-top-pro-translucent-blue'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
reviews = [review for review in data['@graph'][2]['review']]  # extract what you want
Output: the overall rating plus the list of review dicts parsed from the page's JSON-LD.
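If you want to see what else is available before picking fields, pretty-printing the relevant node of the graph (reusing the data variable from the snippet above) is a quick way to explore:

import json

# index 2 of '@graph' is where this page keeps the product data, per the code above
print(json.dumps(data['@graph'][2], indent=2))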
To handle no reviews you could use a simple try/except:
import requests
from bs4 import BeautifulSoup as bs
import json
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
r = requests.get(url)
soup = bs(r.content, 'lxml')
script = soup.select_one('[type="application/ld+json"]').text
data = json.loads(script.strip())
try:
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']]  # extract what you want
except:  # you might want to use except KeyError
    overall_rating = "None"
    reviews = ['None']
or, use an if statement:
if 'aggregateRating' in script:
    overall_rating = data['@graph'][2]['aggregateRating']['ratingValue']
    reviews = [review for review in data['@graph'][2]['review']]  # extract what you want
else:
    overall_rating = "None"
    reviews = ['None']
Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.guitarguitar.co.uk/product/190319340849008--gibson-les-paul-standard-60s-iced-tea'
html = requests.get(url).text
soup = BeautifulSoup(html, "lxml")
try:
    productsRating = soup.find('h2', string=lambda s: s and "Customer reviews" in s).find_next_siblings()[0].find('strong').text
except:
    productsRating = None
print(productsRating)
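As an alternative to the lambda, recent BeautifulSoup installs (via the soupsieve package) support the :-soup-contains() CSS pseudo-class, which can target "the strong inside the h4 mentioning Overall Rating" directly. A sketch, with the caveat that this selector is a soupsieve extension and therefore version-dependent:

# requires a soupsieve version that supports :-soup-contains()
rating_el = soup.select_one('h4:-soup-contains("Overall Rating") strong')
productsRating = rating_el.text if rating_el else None
print(productsRating)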

Scrape site with multiple links without "next" button using beautiful soup

I am very new to Python (three days in) and I have stumbled into a problem I can't solve with Google/YouTube. I want to scrape the National Governors Association for background data on all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilise a "next" button or the URL structure to loop over several sites. This website, however, does not have a next button, and the URL links do not follow a loopable structure. So I am stuck.
I would very much appreciate any help I can get. I want to extract the info above the main text (Office Dates, School(s), etc. in the "address" tag) on each governor's page, for example this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")
#dl list of all govs
dfs = pd.read_html(url, header=0)
for df in dfs:
    df.to_csv('governors.csv')

#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
with open('governors_links.csv', 'w') as r:
    for link in links:
        r.write(link['href'])
        r.write('\n')
#enter each gov page and extract data in the "address" tag(s)
#save this in a csv file
I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want for all the governors, one by one:
for link in links:
    r = urllib.request.urlopen(link).read()
    soup = bs.BeautifulSoup(r, 'html.parser')
    print(soup.find('h2').text)  # Name of Governor
    for p in soup.find('div', {'class': 'col-md-3'}).findAll('p'):
        print(p.text.strip())  # Office dates, address, phone, ...
    for p in soup.find('div', {'class': 'col-md-7'}).findAll('p'):
        print(p.text.strip())  # Family, school, birth state, ...
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]
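Since the end goal is a csv file rather than printed output, the same loop can feed Python's csv module. A minimal sketch building on the code above (the two-column layout and the header names here are illustrative assumptions):

import csv

with open('governors_details.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'details'])  # illustrative header
    for link in links:
        r = urllib.request.urlopen(link).read()
        soup = bs.BeautifulSoup(r, 'html.parser')
        name = soup.find('h2').text
        details = []
        # gather every <p> from both detail columns into one cell
        for col in ('col-md-3', 'col-md-7'):
            div = soup.find('div', {'class': col})
            if div:
                details.extend(p.text.strip() for p in div.findAll('p'))
        writer.writerow([name, ' | '.join(details)])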
This may work. I haven't tested it to full completion since I'm at work, but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

def main():
    url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
    sauce = requests.get(url).text
    soup = bs.BeautifulSoup(sauce, "html.parser")
    finished = False
    csv_data = open('Govs.csv', 'a')
    csv_data.write('Name,Address,OfficeDates,Success,Address,Phone,Fax,Born,BirthState,Party,Schooling,Email\n')
    try:
        while not finished:
            #dl links to each gov
            table = soup.find('table', 'table table-striped table-striped')
            links = table.findAll('a')
            for link in links:
                info_array = []
                gov = {}
                name = link.string
                gov_sauce = requests.get(r'https://nga.org'+link.get('href')).text
                gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
                #print(gov_soup)
                office_and_stuff_info = gov_soup.findAll('address')
                for address in office_and_stuff_info:
                    infos = address.findAll('p')
                    for info in infos:
                        tex = re.sub(r'\s+', ' ', info.text).strip()  # collapse whitespace
                        if tex:
                            info_array.append(tex)
                info_array = list(set(info_array))
                gov['Name'] = name
                secondarry_address = ''
                gov['Address'] = ''
                for line in info_array:
                    if 'OfficeDates:' in line:
                        gov['OfficeDates'] = line.replace('OfficeDates:','').replace('-','')
                    elif 'Succ' in line or 'Fail' in line:
                        gov['Success'] = line
                    elif 'Address' in line:
                        gov['Address'] = line.replace('Address:','')
                    elif 'Phone:' in line or 'Phone ' in line:
                        gov['Phone'] = line.replace('Phone ','').replace('Phone: ','')
                    elif 'Fax:' in line:
                        gov['Fax'] = line.replace('Fax:','')
                    elif 'Born:' in line:
                        gov['Born'] = line.replace('Born:','')
                    elif 'Birth State:' in line:
                        gov['BirthState'] = line.replace('Birth State:','')
                    elif 'Party:' in line:
                        gov['Party'] = line.replace('Party:','')
                    elif 'School(s)' in line:
                        gov['Schooling'] = line.replace('School(s):','').replace('School(s) ','')
                    elif 'Email:' in line:
                        gov['Email'] = line.replace('Email:','')
                    else:
                        secondarry_address = line
                gov['Address'] = gov['Address'] + secondarry_address
                # use .get() so a missing field becomes an empty cell instead of a KeyError
                data_line = ','.join(str(gov.get(key, '')) for key in
                                     ['Name','Address','OfficeDates','Success','Address',
                                      'Phone','Fax','Born','BirthState','Party','Schooling','Email'])
                csv_data.write(data_line + '\n')
            next_page_link = soup.find('ul','pagination center-blockdefault').find('a',{'aria-label':'Next'})
            if 'disabled' in (next_page_link.parent.get('class') or []):  # get('class') returns a list
                finished = True
            else:
                url = r'https://nga.org'+next_page_link.get('href')
                sauce = requests.get(url).text
                soup = bs.BeautifulSoup(sauce,'html.parser')
    except:
        print('Code failed.')
    finally:
        csv_data.close()

if __name__ == '__main__':
    main()

How do I only print the links that contain certain text in the description on that page?

I'm trying to open the links on that page and, if certain words (such as "engineering") are present in the page text, return the link; if not, pass.
Here is what I have so far (the inputs I gave are "engineering" for the job type and "north york" for the location):
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['chemical engineering', 'instrumentation', 'QA']
all_job_url = []
filtered_job_links = []
http_flinks = []
flinks = []
def get_all_joblinks():  # obtains all the links on the search page
    for tag in prettify.find_all('a', {'data-tn-element': "jobTitle"}):
        link = tag['href']
        all_job_url.append(link)

def filter_links():
    for eachurl in all_job_url:  # iterates through each link
        rurl = requests.get(base_url + eachurl)
        content = rurl.content
        soup = BeautifulSoup(content, "html.parser")
        summary = soup.get_text()
        # supposed to filter links based on certain words within text on link page
        if any(word in summary for word in filter_words):
            for filtered_link in soup.find_all('link', {'rel': 'canonical'}):
                flink = filtered_link['href']  # obtains only filtered links
                if "http:" in flink:
                    http_flinks.append(flink)
                    print(http_flinks)
                else:
                    flinks.append(flink)
                    #website = webbrowser.open_new(base_url + flink)
                    time.sleep(3)
                    print(flinks)
        else:
            print("nothing")

def search_job():
    while True:
        if prettify.select('div.no_results'):
            print("no job matches found")
            break
        else:
            # opens the web page of job search if entries are found
            website = webbrowser.open_new(url)
            break

get_all_joblinks()
filter_links()
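One caveat about the code above: building the URL by string concatenation sends a location like "north york" with a raw space in it. Letting requests encode the query parameters sidesteps that; a small sketch of the same search request:

# requests URL-encodes the parameters, so "north york" is sent safely
r = requests.get("https://ca.indeed.com/jobs", params={"q": jobsearch, "l": location})
prettify = BeautifulSoup(r.content, "html.parser")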
