I am trying to extract names from a custom <h2>, but the names I want are extracted many times.
How can I fix this problem so that each name is extracted only once?
The page I am pulling data from is here:
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
    try:
        result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if(page_num > page_limit // 25):
            print("page ended, terminate")
            break
        lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
        for i in range(len(lawy_names)):
            lawy_name.append(lawy_names[i].text.strip())
            links.append(lawy_names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class":"photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)

with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["lawyer name","phone","website","logo"])
    wr.writerows(exported)
Problem:
The website itself produces a lot of duplicate entries. You can probably assume that all entries have unique names, so a dictionary can be used to hold all of your data, keyed by name. Simply skip any entry whose name you have already seen. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1
while True:
    print(f"Page {page_num}")
    req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
    soup = BeautifulSoup(req.content, "lxml")
    found = False

    for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
        div_results = soup.find('div', id=id)

        if div_results:
            for result in div_results.find_all('div', class_='lawyer'):
                name = result.h2.get_text(strip=True)

                if name not in lawyers:
                    print(' ', name)
                    link = result.h2.a['href']
                    req_details = requests.get(link)
                    soup_details = BeautifulSoup(req_details.content, "lxml")

                    a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)

                    if a_phone:
                        phone = a_phone['href']
                    else:
                        phone = None

                    div_logo = soup_details.find("div", {"class":"photo-container"})

                    if div_logo.img:
                        logo = div_logo.img['src']
                    else:
                        logo = None

                    a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})

                    if a_website:
                        website = a_website.get_text(strip=True)
                    else:
                        website = None

                    lawyers[name] = [phone, logo, website]
                    found = True

    # Keep going until no new names found
    if found:
        page_num += 1
    else:
        break

with open('Moaaz.csv', 'w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])

    for name, details in lawyers.items():
        csv_output.writerow([name, *details])
I think my title explains the problem I am facing pretty well. Let's look at a picture of the problem. (You can find the web page at this address, although it has probably changed since.)
I have highlighted the text I want to grab in blue; this is the model year, 2008. Now, the seller is not required to submit the model year, so it may or may not exist. But when it does exist, it always follows the <i> tag with class="fa fa-calendar". My solution so far has been to grab all the text within <p class="result-details"> ... </p> (which then becomes a list after splitting) and choose the second element, conditioned on <i class="fa fa-calendar"> ... </i> existing. Otherwise I do not grab anything.
Now, it seems this does not work in general, since the text that comes before the year can be split into more than one element if it contains whitespace. So, is there any way (any function) to grab a text string that neighbours another tag, as seen in my picture?
PS: in case I have been unclear, I just want to fetch the year 2008 from the post on the web page, if it exists.
Edit
In this situation my code erroneously gives me the word "Hjulvältar" (wheel rollers in English) instead of the year 2008.
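To see why the positional split can pick the wrong token, here is a quick illustration with a made-up details string (the real text on the page may of course differ): once the text in front of the year contains whitespace, index 1 no longer points at the year.

details = "Pneumatiska hjulvältar 2008 4 500 h"   # hypothetical contents of <p class="result-details">
print(details.split(" ")[1])                      # prints 'hjulvältar', not '2008'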
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests
url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True
for produktgrupp in kategorier:
    index += 1
    mapp = mappar[index]
    save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
    underkategorier = kategorier[produktgrupp]
    for underkategori in underkategorier:
        # OBS
        if underkategori != 'borrutrustning-ytborrning' and status:
            continue
        else:
            status = False
        # OBS
        if underkategori in url_avvikande:
            url = f'{url_main}/{produktgrupp}/{underkategori}'
        elif underkategori == 'gravmaskiner-med-frontskopa':
            url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
        elif underkategori == 'borrutrustning-ytborrning':
            url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
        else:
            url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
        file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
        sida = 1
        print(url)
        with open(file_name, 'w') as f:
            while True:
                print(sida)
                html_text = None
                soup = None
                links = None
                while links == None:
                    html_text = requests.get(url).text
                    soup = BeautifulSoup(html_text, 'lxml')
                    links = soup.find('ul', class_ = 'page-numbers')
                annonser = soup.find_all('li', class_ = 'col-row single-result')
                for annons in annonser:
                    modell = annons.find('a', class_ = 'title-font').text
                    if annons.p.find('i', class_ = 'fa fa-calendar') != None:
                        tillverkningsar = annons.find('p', class_ = 'result-details').text.strip().split(" ")[1]
                    else:
                        tillverkningsar = 'Ej angiven'
                    try:
                        pris = annons.find('span', class_ = 'title-font no-ws-wrap').text
                    except AttributeError:
                        pris = annons.find('span', class_ = 'title-font no-price').text
                    f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
                url_part = None
                sida += 1
                try:
                    url_part = links.find('a', text = f'{sida}')['href']
                except TypeError:
                    print(f'Avläsning av underkategori klar.')
                    break
                url = f'{url_main}{url_part}'
As you loop over the listings, you can test whether that calendar icon class is present; if it is, grab the next_sibling:
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
listings = soup.select('.single-result')
for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    if calendar is not None:
        print(calendar.next_sibling)
    else:
        print('Not present')
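One caveat, depending on how the markup is laid out: next_sibling usually comes back as a text node with leading whitespace, and it can be None if no text follows the icon, so it is worth normalising it before use. A small variant of the loop above (same assumptions, not re-tested against every listing):

for listing in listings:
    calendar = listing.select_one('.fa-calendar')
    if calendar is not None and calendar.next_sibling is not None:
        year = str(calendar.next_sibling).strip()   # e.g. '2008'
        print(year)
    else:
        print('Not present')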
Hi, I am trying to extract the name from an <h2>, but names are also extracted from other <h2> tags. I want to extract names only from the <h2> inside <div class="poap serp-container lawyer"><div class="gray_border"><div class="col-lg-8 col-md-8 col-sm-9 col-xs-8 text_container"><h2 class="indigo_text">Hi My name is Mark</h2></div></div></div>
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
    try:
        result = requests.get(f"https://attorneys.superlawyers.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
        src = result.content
        soup = BeautifulSoup(src, "lxml")
        page_limit = int("126")
        if(page_num > page_limit // 20):
            print("page ended, terminate")
            break
        names = soup.find_all("h2", {"class":"indigo_text"})
        for i in range(len(names)):
            name.append(names[i].text.strip())
            links.append(names[i].find("a").attrs["href"])
        for link in links:
            result = requests.get(link)
            src = result.content
            soup = BeautifulSoup(src, "lxml")
            phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
            phone.append(phones["href"])
            logos = soup.find("div", {"class":"photo-container"})
            logo.append(logos.find('img')['src'])
            websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
            website.append(websites.text.strip())
        page_num += 1
        print("page switched")
    except:
        print("error")
        break

file_list = [name, phone, website, logo]
exported = zip_longest(*file_list)

with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["name","phone","website","logo"])
    wr.writerows(exported)
I hope you guys can help me solve this problem
Select your tag more specifically, for example with the following CSS selector:
names = soup.select('div.poap h2')
or with all the classes:
names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
Note: this answer just focuses on the main point of the question; the code could be improved to avoid some side effects.
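As a rough sketch of how the narrowed selector would slot into the loop from the question (same soup, name and links lists as above; not re-tested against the live page), each matched <h2> supplies both the name and the profile link:

names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
for tag in names:
    name.append(tag.get_text(strip=True))
    if tag.a is not None:            # the profile link sits inside the matched <h2>
        links.append(tag.a["href"])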
I am trying to scrape some statements made by U.S. politicians on votesmart.org.
The code runs, but I am getting errors when extracting the texts.
The code that I am using is as follows:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os
def main():
    df=pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
    df = df[df.type=='sen']
    df = df[~df.votesmart_id.isna()]
    done_list = os.listdir('corpus')
    print("{} senators".format(len(df)))
    df = df[~df.full_name.isin(done_list)]
    print("{} after some already done".format(len(df)))
    df = df.sample(frac=1)
    df.apply(scrape_politician_speeches,axis=1)

def scrape_politician_speeches(row):
    print('Scraping {}...'.format(row.full_name))
    vs_url='https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
    vs_page = requests.get(vs_url) # fill in the last part of the url
    soup = BeautifulSoup(vs_page.content, features="lxml")
    n_pages = 1
    page_num = 1
    while page_num <= n_pages:
        print("\tPage {} of {}".format(page_num,n_pages))
        #speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
        speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
        speeches_page = requests.get(speeches_url)
        soup = BeautifulSoup(speeches_page.content, features="lxml")
        speech_table = soup.find('table', {'id':'statementsObjectsTables'})
        speech_table = soup.find('tbody')
        speech_links = speech_table.find_all('a',href=True)
        speech_hrefs = [a.get('href') for a in speech_links]
        for href in speech_hrefs:
            scrape_speech(person=row.full_name, speech_url=href)
        try:
            n_pages = int(soup.find('h7').text.split()[-1])
        except:
            print("\tNo page numbers")
            pass
        page_num += 1
        sleep(1)

def scrape_speech(person, speech_url):
    try:
        if not os.path.isdir('corpus/{}'.format(person)):
            os.mkdir('corpus/{}'.format(person))
        speech_page = requests.get(speech_url)
        soup = BeautifulSoup(speech_page.content,features="lxml")
        title = soup.find('h3').text
        date = soup.find('span',{'itemprop':'datePublished'}).text
        location = soup.find('span',{'itemprop':'contentLocation'}).text
        body = soup.find('div', {'class':"main clear"})
        p_list = body.find_all('p')
        text_list = [p.text for p in p_list]
        speech_text = '\n\n'.join(text_list)
        full_text = '{}\n\n\n{}'.format(title,speech_text)
        file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
        file_name = file_name.replace('/',' ')
        with open('corpus/{}/{}'.format(person,file_name), 'w') as f:
            f.write(full_text)
    except:
        print("\tError with {}".format(speech_url))

if __name__=='__main__':
    main()
The errors look like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated; the pages' design has probably changed since the script was written. There is no <div class="main clear"> in the HTML, no <span itemprop="datePublished">, and so on. You need to rewrite it using the current CSS selectors.
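Until the current selectors are confirmed in the browser's dev tools, a more defensive shape for scrape_speech can help, because it reports which lookup failed instead of hiding everything behind a bare except. This is only a sketch: the selectors below are placeholders carried over from the old script, not the current page structure.

import os
import requests
from bs4 import BeautifulSoup

def scrape_speech(person, speech_url):
    os.makedirs('corpus/{}'.format(person), exist_ok=True)
    speech_page = requests.get(speech_url)
    soup = BeautifulSoup(speech_page.content, features="lxml")

    title = soup.find('h3')                                        # placeholder selector, verify against the live page
    date = soup.find('span', {'itemprop': 'datePublished'})        # placeholder selector, verify against the live page
    location = soup.find('span', {'itemprop': 'contentLocation'})  # placeholder selector, verify against the live page
    body = soup.find('div', {'class': 'main clear'})               # placeholder selector, verify against the live page

    # Report exactly which pieces are missing so the selectors can be fixed one by one.
    missing = [label for label, tag in
               [('title', title), ('date', date), ('location', location), ('body', body)]
               if tag is None]
    if missing:
        print("\tCould not find {} on {}".format(', '.join(missing), speech_url))
        return

    speech_text = '\n\n'.join(p.text for p in body.find_all('p'))
    full_text = '{}\n\n\n{}'.format(title.text, speech_text)
    file_name = '{}, {}, {}.txt'.format(title.text.split(',')[0], date.text, location.text).replace('/', ' ')
    with open('corpus/{}/{}'.format(person, file_name), 'w') as f:
        f.write(full_text)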
I'm trying to open the links on that page that contain certain words. If a word such as "engineering" is present on the linked page, return the link; if not, pass.
Here is what I have so far. The inputs I used are "engineering" for the job type and "north york" for the location:
import requests
from bs4 import BeautifulSoup
import webbrowser
import time
jobsearch = input("What type of job?: ")
location = input("What is your location: ")
url = ("https://ca.indeed.com/jobs?q=" + jobsearch + "&l=" + location)
base_url = 'https://ca.indeed.com/'
r = requests.get(url)
rcontent = r.content
prettify = BeautifulSoup(rcontent, "html.parser")
filter_words = ['chemical engineering', 'instrumentation', 'QA']
all_job_url = []
filtered_job_links = []
http_flinks = []
flinks = []
def get_all_joblinks(): # obtains all the links on the search page
    for tag in prettify.find_all('a', {'data-tn-element':"jobTitle"}):
        link = tag['href']
        all_job_url.append(link)

def filter_links():
    for eachurl in all_job_url: # iterates through each link
        rurl = requests.get(base_url + eachurl)
        content = rurl.content
        soup = BeautifulSoup(content, "html.parser")
        summary = soup.get_text()
        # supposed to filter links based on certain words within text on link page
        if any(word in summary for word in filter_words):
            for filtered_link in soup.find_all('link', {'rel':'canonical'}):
                flink = filtered_link['href'] # obtains only filtered links
                if "http:" in flink:
                    http_flinks.append(flink)
                    print(http_flinks)
                else:
                    flinks.append(flink)
                    #website = webbrowser.open_new(base_url + flink)
                    time.sleep(3)
            print(flinks)
        else:
            print("nothing")
            pass

def search_job():
    while True:
        if prettify.select('div.no_results'):
            print("no job matches found")
            break
        else:
            # opens the web page of job search if entries are found
            website = webbrowser.open_new(url)
            break
get_all_joblinks()
filter_links()