I am trying to scrape some statements made by U.S politicians on votesmart.org
I am experiencing errors in extracting the texts though the code could be run.
The code that I am using is as follow:
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import os
def main():
df=pd.read_csv('https://theunitedstates.io/congress-legislators/legislators-current.csv')
df = df[df.type=='sen']
df = df[~df.votesmart_id.isna()]
done_list = os.listdir('corpus')
print("{} senators".format(len(df)))
df = df[~df.full_name.isin(done_list)]
print("{} after some already done".format(len(df)))
df = df.sample(frac=1)
df.apply(scrape_politician_speeches,axis=1)
def scrape_politician_speeches(row):
print('Scraping {}...'.format(row.full_name))
vs_url='https://justfacts.votesmart.org/candidate/public-statements/{}'.format(int(row.votesmart_id))
vs_page = requests.get(vs_url) # fill in the last part of the url
soup = BeautifulSoup(vs_page.content, features="lxml")
n_pages = 1
page_num = 1
while page_num <= n_pages:
print("\tPage {} of {}".format(page_num,n_pages))
#speeches_url = vs_page.url + '?start=2019-01-01&speechType=14&p={}'.format(page_num)
speeches_url = vs_page.url + '/?s=date&start=2020/01/01&end=&p={}'.format(page_num)
speeches_page = requests.get(speeches_url)
soup = BeautifulSoup(speeches_page.content, features="lxml")
speech_table = soup.find('table', {'id':'statementsObjectsTables'})
speech_table = soup.find('tbody')
speech_links = speech_table.find_all('a',href=True)
speech_hrefs = [a.get('href') for a in speech_links]
for href in speech_hrefs:
scrape_speech(person=row.full_name, speech_url=href)
try:
n_pages = int(soup.find('h7').text.split()[-1])
except:
print("\tNo page numbers")
pass
page_num += 1
sleep(1)
def scrape_speech(person, speech_url):
try:
if not os.path.isdir('corpus/{}'.format(person)):
os.mkdir('corpus/{}'.format(person))
speech_page = requests.get(speech_url)
soup = BeautifulSoup(speech_page.content,features="lxml")
title = soup.find('h3').text
date = soup.find('span',{'itemprop':'datePublished'}).text
location = soup.find('span',{'itemprop':'contentLocation'}).text
body = soup.find('div', {'class':"main clear"})
p_list = body.find_all('p')
text_list = [p.text for p in p_list]
speech_text = '\n\n'.join(text_list)
full_text = '{}\n\n\n{}'.format(title,speech_text)
file_name = '{}, {}, {}.txt'.format(title.split(',')[0], date, location)
file_name = file_name.replace('/',' ')
with open('corpus/{}/{}'.format(person,file_name), 'w') as f:
f.write(full_text)
except:
print("\tError with {}".format(speech_url))
if __name__=='__main__':
main()
The errors are looking like this:
95 senators
95 after some already done
Scraping Tammy Duckworth...
Page 1 of 1
Error with https://votesmart.org/public-statement/1570841/durbin-duckworth-announce-135-million-for-springfield-rail-improvement-project
Error with https://votesmart.org/public-statement/1570825/durbin-duckworth-statement-on-nomination-of-ladon-reynolds-to-serve-as-us-marshal-for-the-northern-district-of-illinois
Error with https://votesmart.org/public-statement/1570826/durbin-duckworth-announce-16-million-in-telehealth-funding-for-illinois-health-care-providers
Thank you so much for your time and attention. I hope to learn more from this wonderful community.
scrape_speech is outdated, probably pages' design changed since script was writen, there's no <div class="main clear"> in html, there's no <span itemprop="datePublished"> and so on. You need to rewrite it using current css selectors.
Related
I am trying to extract names in custom <h2>, but the names I want are extracted many times.
how to fix this problem and extract it one time
The page I am pulling data from
here
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
lawy_name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 25):
print("page ended, terminate")
break
lawy_names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
for i in range(len(lawy_names)) :
lawy_name.append(lawy_names[i].text.strip())
links.append(lawy_names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [lawy_name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["lawyer name","phone","website","logo"])
wr.writerows(exported)
Problem:
The website does produce a lot of duplicate entries. You could probably assume that all entries have unique names, as such a dictionary could be used to hold all of your data. Simply skip any entries for which you have already seen the same name. For example:
from bs4 import BeautifulSoup
import requests
import csv
lawyers = {}
page_num = 1
while True:
print(f"Page {page_num}")
req = requests.get(f"https://example.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
soup = BeautifulSoup(req.content, "lxml")
found = False
for id in ['sponsored_serps', 'ts_results', 'poap_results', 'basic_results']:
div_results = soup.find('div', id=id)
if div_results:
for result in div_results.find_all('div', class_='lawyer'):
name = result.h2.get_text(strip=True)
if name not in lawyers:
print(' ', name)
link = result.h2.a['href']
req_details = requests.get(link)
soup_details = BeautifulSoup(req_details.content, "lxml")
a_phone = soup_details.find("a", {"class":"profile-phone-header profile-contact-btn"}, href=True)
if a_phone:
phone = a_phone['href']
else:
phone = None
div_logo = soup_details.find("div", {"class":"photo-container"})
if div_logo.img:
logo = div_logo.img['src']
else:
logo = None
a_website = soup_details.find("a", {"class":"profile-website-header","id":"firm_website"})
if a_website:
website = a_website.get_text(strip=True)
else:
website = None
lawyers[name] = [phone, logo, website]
found = True
# Keep going until no new names found
if found:
page_num += 1
else:
break
with open('Moaaz.csv', 'w', newline='') as f_output:
csv_output = csv.writer(f_output)
csv_output.writerow(['Name', 'Phone', 'Logo', 'Website'])
for name, details in lawyers.items():
csv_output.writerow([name, *details])
I think my title explains it pretty well the problem I am facing. Let's look at a picture of the problem. (You can find the web-page at this adress, however it has probably changed).
I have highlighted the text that I want to grab in blue, this is the model-year 2008. Now, it is not necessary for the seller to submit the model-year, so this may or may not exist. But when it does exist it always follows the <i> tag with class ="fa fa-calender". My solution so far has been to grab all the text whitin <p class="result-details> ... </p>" (this then becomes a list) and then choose the second element, conditioned on that <i class="fa fa-calender> ... </i> exists. Otherwise I do not grab anything.
Now, it seems as this does not work in general since that text that comes before the second element can be aranged into more than one element if has a whitespace in it. So, is there any way (any function) that can grab a text string that neighbours another tag as seen in my picture?
PS: if I have made myself unclear, I just want to fetch the year 2008 from the post on the web page if it exists.
Edit
In this situation my code erroneously gives my the word "Hjulvältar" (bulldozer in english) instead of the year 2008.
CODE
from bs4 import BeautifulSoup
from datetime import date
import requests
url_avvikande = ['bomliftar','teleskop-bomliftar','kompakta-sjalvgaende-bomlyftar','bandschaktare','reachstackers','staplare']
today = date.today().isoformat()
url_main = 'https://www.mascus.se'
produktgrupper = ['lantbruksmaskiner','transportfordon','skogsmaskiner','entreprenadmaskiner','materialhantering','gronytemaskiner']
kategorier = {
'lantbruksmaskiner': ['traktorer','sjalvgaende-falthackar','skordetroskor','atv','utv:er','snoskotrar'],
'transportfordon': ['fordonstruckar','elektriska-fordon','terrangfordon'],
'skogsmaskiner': ['skog-skordare','skog-gravmaskiner','skotare','drivare','fallare-laggare','skogstraktorer','lunnare','terminal-lastare'],
'entreprenadmaskiner': ['gravlastare','bandgravare','minigravare-7t','hjulgravare','midigravmaskiner-7t-12t','atervinningshanterare','amfibiska-gravmaskiner','gravmaskiner-med-frontskopa','gravmaskiner-med-lang-rackvidd','gravmaskiner-med-slapskopa','rivningsgravare','specialgravmaskiner','hjullastare','kompaktlastare','minilastmaskiner','bandlastare','teleskopiska-hjullastare','redaskapshallare','gruvlastare','truckar-och-lastare-for-gruvor','bergborriggar','teleskoplastare','dumprar','minidumprar','gruvtruckar','banddumprar','specialiserade-dragare','vaghyvlar','vattentankbilar','allterrangkranar','terrangkranar-grov-terrang','-bandgaende-kranar','saxliftar','bomliftar','teleskop-bomliftar','personhissar-och-andra-hissar','kompakta-sjalvgaende-bomlyftar','krossar','mobila-krossar','sorteringsverk','mobila-sorteringsverk','bandschaktare','asfaltslaggningsmaskiner','--asfaltskallfrasmaskiner','tvavalsvaltar','envalsvaltar','jordkompaktorer','pneumatiska-hjulvaltar','andra-valtar','kombirullar','borrutrustning-ytborrning','horisontella-borrutrustning','trenchers-skar-gravmaskin'],
'materialhantering': ['dieseltruckar','eldrivna-gaffeltruckar','lpg-truckar','gaffeltruckar---ovriga','skjutstativtruck','sidlastare','teleskopbomtruckar','terminaltraktorer','reachstackers','ovriga-materialhantering-maskiner','staplare-led','staplare','plocktruck-laglyftande','plocktruck-hoglyftande','plocktruck-mediumlyftande','dragtruck','terrangtruck','4-vagstruck','smalgangstruck','skurborsttorkar','inomhus-sopmaskiner','kombinationsskurborstar'],
'gronytemaskiner': ['kompakttraktorer','akgrasklippare','robotgrasklippare','nollsvangare','plattformsklippare','sopmaskiner','verktygsfraktare','redskapsbarare','golfbilar','fairway-grasklippare','green-grasklippare','grasmattevaltar','ovriga-gronytemaskiner']
}
url = 'https://www.mascus.se'
mappar = ['Lantbruk', 'Transportfordon', 'Skogsmaskiner', 'Entreprenad', 'Materialhantering', 'Grönytemaskiner']
index = -1
status = True
for produktgrupp in kategorier:
index += 1
mapp = mappar[index]
save_path = f'/home/protector.local/vika99/webscrape_mascus/Annonser/{mapp}'
underkategorier = kategorier[produktgrupp]
for underkategori in underkategorier:
# OBS
if underkategori != 'borrutrustning-ytborrning' and status:
continue
else:
status = False
# OBS
if underkategori in url_avvikande:
url = f'{url_main}/{produktgrupp}/{underkategori}'
elif underkategori == 'gravmaskiner-med-frontskopa':
url = f'{url_main}/{produktgrupp}/begagnat-{underkategori}'
elif underkategori == 'borrutrustning-ytborrning':
url = f'{url_main}/{produktgrupp}/begagnad-{underkategori}'
else:
url = f'{url_main}/{produktgrupp}/begagnade-{underkategori}'
file_name = f'{save_path}/{produktgrupp}_{underkategori}_{today}.txt'
sida = 1
print(url)
with open(file_name, 'w') as f:
while True:
print(sida)
html_text = None
soup = None
links = None
while links == None:
html_text = requests.get(url).text
soup = BeautifulSoup(html_text, 'lxml')
links = soup.find('ul', class_ = 'page-numbers')
annonser = soup.find_all('li', class_ = 'col-row single-result')
for annons in annonser:
modell = annons.find('a', class_ = 'title-font').text
if annons.p.find('i', class_ = 'fa fa-calendar') != None:
tillverkningsar = annons.find('p', class_ = 'result-details').text.strip().split(" ")[1]
else:
tillverkningsar = 'Ej angiven'
try:
pris = annons.find('span', class_ = 'title-font no-ws-wrap').text
except AttributeError:
pris = annons.find('span', class_ = 'title-font no-price').text
f.write(f'{produktgrupp:<21}{underkategori:25}{modell:<70}{tillverkningsar:<13}{pris:>14}\n')
url_part = None
sida += 1
try:
url_part = links.find('a', text = f'{sida}')['href']
except TypeError:
print(f'Avläsning av underkategori klar.')
break
url = f'{url_main}{url_part}'
As you loop the listings you can test if that calendar icon class is present, if it is then grab the next_sibling
import requests
from bs4 import BeautifulSoup as bs
r = requests.get('https://www.mascus.se/entreprenadmaskiner/begagnade-pneumatiska-hjulvaltar')
soup = bs(r.content, 'lxml')
listings = soup.select('.single-result')
for listing in listings:
calendar = listing.select_one('.fa-calendar')
if calendar is not None:
print(calendar.next_sibling)
else:
print('Not present')
Hi i try to extract the name from h2 but an error occurs and names are extracted from other <h2> I want to extract names from <h2> specified from only <div class="poap serp-container lawyer"><div class="gray_border"><div class="col-lg-8 col-md-8 col-sm-9 col-xs-8 text_container"><h2 class=""indigo_text>Hi My name is Mark</h2></div></div></div>
import requests
import csv
from bs4 import BeautifulSoup
from itertools import zip_longest
name = []
page_num = 1
phone = []
logo = []
website = []
links = []
while True:
try:
result = requests.get(f"https://attorneys.superlawyers.com/motor-vehicle-accidents/texas/houston/page{page_num}/")
src = result.content
soup = BeautifulSoup(src, "lxml")
page_limit = int("126")
if(page_num > page_limit // 20):
print("page ended, terminate")
break
names = soup.find_all("h2", {"class":"indigo_text"})
for i in range(len(names)) :
name.append(names[i].text.strip())
links.append(names[i].find("a").attrs["href"])
for link in links:
result = requests.get(link)
src = result.content
soup = BeautifulSoup(src, "lxml")
phones = soup.find("a", {"class":"profile-phone-header profile-contact-btn"})
phone.append(phones["href"])
logos = soup.find("div", {"class":"photo-container"})
logo.append(logos.find('img')['src'])
websites = soup.find("a", {"class":"profile-website-header","id":"firm_website"})
website.append(websites.text.strip())
page_num +=1
print("page switched")
except:
print("error")
break
file_list = [name, phone, website, logo]
exported = zip_longest(*file_list)
with open("/Users/dsoky/Desktop/fonts/Moaaz.csv", "w") as myfile:
wr = csv.writer(myfile)
wr.writerow(["name","phone","website","logo"])
wr.writerows(exported)
I hope you guys can help me solve this problem
Select your tag more specific for example with following css selector:
names = soup.select('div.poap h2')
or with all the classes:
names = soup.select('div.poap.serp-container.lawyer h2.indigo_text')
Note This answer just focus to main point in question, code could be imporved to avoid some side effects.
My project aims that all web-article information was crawled by pages using the Beautifulsoup function.
The article information is the article title, time, body.
But, the article time text is behind tag as you can see.
I tried to do my best all day. However, I can not solve the problem.
How to solve this problem?
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
i = input('Start page? : ')
k = input('End page? : ')
pagenum = int(i)
lastpage = int(k)
count = int(i)
news_info = pd.DataFrame(columns=('Title', 'Datetime', 'Article'))
idx = 0
while pagenum<lastpage + 1:
url = f'http://www.koscaj.com/news/articleList.html?page={pagenum}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all(class_='list-titles')
print(f'-----{count}page result-----')
for link in links:
news_url = "http://www.koscaj.com"+link.find('a')['href']
news_link = urllib.request.urlopen(news_url).read()
soup2 = BeautifulSoup(news_link, 'html.parser')
title = soup2.find('div', {'class':'article-head-title'})
date = soup2.find('div',{'class':'info-text'})
datetime = date[1]
article = soup2.find('div', {'id':'article-view-content-div'})
news_info.loc[idx] = [title, datetime, article]
idx += 1
pagenum += 1
count += 1
print('Complete')
ya it's not clear what you're issue is. I am assuming you are after this. Also note you need to grab the text of the title and articles as well (as you are not doing that in your code):
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import pandas as pd
import requests
i = input('Start page? : ')
k = input('End page? : ')
pagenum = int(i)
lastpage = int(k)
count = int(i)
news_info = pd.DataFrame(columns=('Title', 'Datetime', 'Article'))
idx = 0
while pagenum<lastpage + 1:
url = f'http://www.koscaj.com/news/articleList.html?page={pagenum}&total=72698&box_idxno=&sc_section_code=S1N2&view_type=sm'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all(class_='list-titles')
print(f'-----{count}page result-----')
for link in links:
news_url = "http://www.koscaj.com"+link.find('a')['href']
news_link = urllib.request.urlopen(news_url).read()
soup2 = BeautifulSoup(news_link, 'html.parser')
title = soup2.find('div', {'class':'article-head-title'})
if title:
title = soup2.find('div', {'class':'article-head-title'}).text
else:
title = ''
date = soup2.find('div',{'class':'info-text'})
try:
datetime = date.find('i', {'class':'fa fa-clock-o fa-fw'}).parent.text.strip()
except:
datetime = ''
article = soup2.find('div', {'id':'article-view-content-div'})
if article:
article = soup2.find('div', {'id':'article-view-content-div'}).text
else:
article = ''
news_info.loc[idx] = [title, datetime, article]
idx += 1
pagenum += 1
count += 1
print('Complete')
You have to access inner children in this tag.
Assuming that variable date contains:
<div class="info-text">
<ul class="...">
<li><i class="fa fa-user-o fa-fw"></i> 전문건설신문</li>
<li><i class="fa fa-clock-o fa-fw"></i> 승인 2020.11.25 18:24</li>
...
You can access date with:
date.find_all('li')[1].text
Which will be:
승인 2020.11.25 18:24
You can read more about accessing children in docs.
I am very new to python (three days in) and I have stumbled into a problem I can't solve with google/youtube. I want to scrape the National Governors Association for background data of all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilises a "next" button or the url structure to loop over several sites. This website, however, does not have a next button and the url-links does not follow a loopable structure. So I am stuck.
I would appreciate any help I can get very much. I want to extract the info above the main text (Office Dates, School(s) etc in the "address" tag) in each governors page, for example in this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")
#dl list of all govs
dfs = pd.read_html(url, header=0)
for df in dfs:
df.to_csv('governors.csv')
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
with open ('governors_links.csv', 'w') as r:
for link in links:
r.write(link['href'])
r.write('\n')
r.close()
#enter each gov page and extract data in the "address" tag(s)
#save this in a csv file
I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want of all the Governors one by one:
for link in links:
r = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(r, 'html.parser')
print(soup.find('h2').text) # Name of Governor
for p in soup.find('div', {'class': 'col-md-3'}).findAll('p'):
print(p.text.strip()) # Office dates, address, phone, ...
for p in soup.find('div', {'class': 'col-md-7'}).findAll('p'):
print(p.text.strip()) # Family, school, birth state, ...
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]
This may work. I haven't tested it out to full completion since I'm at work but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def main():
url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce, "html.parser")
finished = False
csv_data = open('Govs.csv', 'a')
csv_data.write('Name,Address,OfficeDates,Success,Address,Phone,Fax,Born,BirthState,Party,Schooling,Email')
try:
while not finished:
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
for link in links:
info_array = []
gov = {}
name = link.string
gov_sauce = requests.get(r'https://nga.org'+link.get('href')).text
gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
#print(gov_soup)
office_and_stuff_info = gov_soup.findAll('address')
for address in office_and_stuff_info:
infos = address.findAll('p')
for info in infos:
tex = re.sub('[^a-zA-Z\d:]','',info.text)
tex = re.sub('\\s+',' ',info.text)
tex = tex.strip()
if tex:
info_array.append(tex)
info_array = list(set(info_array))
gov['Name'] = name
secondarry_address = ''
gov['Address'] = ''
for line in info_array:
if 'OfficeDates:' in line:
gov['OfficeDates'] = line.replace('OfficeDates:','').replace('-','')
elif 'Succ' or 'Fail' in line:
gov['Success'] = line
elif 'Address' in line:
gov['Address'] = line.replace('Address:','')
elif 'Phone:' or 'Phone ' in line:
gov['Phone'] = line.replace('Phone ','').replace('Phone: ','')
elif 'Fax:' in line:
gov['Fax'] = line.replace('Fax:','')
elif 'Born:' in line:
gov['Born'] = line.replace('Born:','')
elif 'Birth State:' in line:
gov['BirthState'] = line.replace('BirthState:','')
elif 'Party:' in line:
gov['Party'] = line.replace('Party:','')
elif 'School(s)' in line:
gov['Schooling'] = line.replace('School(s):','').replace('School(s) ')
elif 'Email:' in line:
gov['Email'] = line.replace('Email:','')
else:
secondarry_address = line
gov['Address'] = gov['Address'] + secondarry_address
data_line = gov['Name'] +','+gov['Address'] +','+gov['OfficeDates'] +','+gov['Success'] +','+gov['Address'] +','+ gov['Phone'] +','+ gov['Fax'] +','+gov['Born'] +','+gov['BirthState'] +','+gov['Party'] +','+gov['Schooling'] +','+gov['Email']
csv_data.write(data_line)
next_page_link = soup.find('ul','pagination center-blockdefault').find('a',{'aria-label':'Next'})
if next_page_link.parent.get('class') == 'disabled':
finished = True
else:
url = r'https://nga.org'+next_page_link.get('href')
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce,'html.parser')
except:
print('Code failed.')
finally:
csv_data.close()
if __name__ == '__main__':
main()