Hello I've created two functions that work well well called alone. But when I try to use a for loop with these functions I got a problem with my parameter.
First function to search and get link to pass to the second one.
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
def searchsport(terme):
url = 'https://www.verif.com/recherche/{}/1/ca/d/?ville=null'.format(terme)
response = requests.get(url, headers= USER_AGENT)
response.raise_for_status()
return terme, response.text
def crawl(keyword):
if __name__ == '__main__':
try:
keyword, html = searchsport(keyword)
soup = bs(html,'html.parser')
table = soup.find_all('td', attrs={'class': 'verif_col1'})
premier = []
for result in table:
link = result.find('a', href=True)
premier.append(link)
truelink = 'https://www.verif.com/'+str(premier[0]).split('"')[1]
#print("le lien", truelink)
except Exception as e:
print(e)
finally:
time.sleep(10)
return truelink
Second function to scrape a link.
def single_text(item_url):
source_code = requests.get(item_url)
print('nivo1 ok')
plain_text = source_code.text # La page en html avec toutes ces balises
soup = bs(plain_text,features="lxml" )
print('nivo2 ok')
table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table
print('nivo1 ok', '\n', table)
table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr
#print(table_rows)
l = []
for tr in table_rows:
td = tr.find_all('td')
row = row = [tr.text.strip() for tr in td]
l.append(row)
# On enleve certains caractères unitiles
df = pd.DataFrame(l)
return df
All these function worked when I tested them on a link.
Now I have a csv file with name of companies using searchsport() to search in website and the returned link is passed to single_text() to scrape.
for keyword in list(pd.read_csv('sport.csv').name):
l = crawl(keyword)
print(l) # THIS PRINT THE LINK
single_item(l) # HERE I GOT THE PROBLEME
Error:
nivo1 ok
nivo2 ok
nivo1 ok
None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-55-263d95d6748c> in <module>
3 l = crawl(keyword)
4
----> 5 single_item(item_url=l)
<ipython-input-53-6d3b5c1b1ee8> in single_item(item_url)
7 table = soup.find('table',{'class':"table infoGen hidden-smallDevice"}) # on cherche que la balise table
8 print('nivo1 ok', '\n', table)
----> 9 table_rows = table.find_all('tr') # les données de tables sont dans les celulles tr
10 #print(table_rows)
11
AttributeError: 'NoneType' object has no attribute 'find_all'
When I run this I got a df.
single_item(item_url="https://www.verif.com/societe/COMPANYNAME-XXXXXXXXX/").head(1)
My expected results should be two DataFrame for every keyword.
Why it doesn't work?
So I have noted throughout the code some of the problems I saw with your code as posted.
Some things I noticed:
Not handling cases of where something is not found e.g. 'PARIS-SAINT-GERMAIN-FOOTBALL' will fail whereas 'PARIS SAINT GERMAIN FOOTBALL' as a search term will not
Opportunities for simplification missed e.g. creating a dataframe by looping tr then td when could just use read_html on table; Using find_all when a single table or a tag is needed
Overwriting variables in loops as well as typos e.g.
for tr in table_rows:
td = tr.find_all('td')
row = row = [tr.text.strip() for tr in td] # presumable a typo with row = row
Not testing if a dataframe is empty
Risking generating incorrect urls by using 'https://www.verif.com/' as the next part you concatenate on starts with "/" as well
Inconsistent variable naming e.g. what is single_item? The function I see is called single_text.
These are just some observations and there is certainly still room for improvement.
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme):
url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'})
response.raise_for_status()
return terme, response.text
def crawl(keyword):
try:
keyword, html = searchsport(keyword)
soup = bs(html,'lxml')
a_tag = soup.select_one('td.verif_col1 a[href]')
# your code before when looping tds would just overwrite truelink if more than one found. Instead
if a_tag is None:
#handle case of no result e.g. with using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
#crawl('PARIS SAINT GERMAIN FOOTBALL')
truelink = ''
else:
# print(a_tag['href'])
# adding to the list premier served no purpose. Using split on href would result in list index out of range
truelink = f'https://www.verif.com{a_tag["href"]}' #relative link already so no extra / after .com
except Exception as e:
print(e)
truelink = '' #handle case of 'other' fail. Make sure there is an assigment
finally:
time.sleep(5)
return truelink #unless try succeeded this would have failed with local variable referenced before assignment
def single_text(item_url):
source_code = requests.get(item_url, headers = {'User-Agent':'Mozilla/5.0'})
print('nivo1 ok')
plain_text = source_code.text # La page en html avec toutes ces balises
soup = bs(plain_text,features="lxml")
print('nivo2 ok')
table = soup.select_one('.table') # on cherche que la balise table
#print('nivo1 ok', '\n', table)
if table is None:
df = pd.DataFrame()
else:
df = pd.read_html(str(table))[0] #simplify to work direct with table and pandas;avoid your loops
return df
def main():
terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']
for term in terms:
item_url = crawl(term)
if item_url:
print(item_url)
df = single_text(item_url) # what is single_item in your question? There is single_text
if not df.empty: #test if dataframe is empty
print(df.head(1))
if __name__ == '__main__':
main()
Returning df from main()
import requests, time
from bs4 import BeautifulSoup as bs
import pandas as pd
def searchsport(terme):
url = f'https://www.verif.com/recherche/{terme}/1/ca/d/?ville=null'
response = requests.get(url, headers = {'User-Agent':'Mozilla/5.0'})
response.raise_for_status()
return terme, response.text
def crawl(keyword):
try:
keyword, html = searchsport(keyword)
soup = bs(html,'lxml')
a_tag = soup.select_one('td.verif_col1 a[href]')
# your code before when looping tds would just overwrite truelink if more than one found. Instead
if a_tag is None:
#handle case of no result e.g. with using crawl('PARIS-SAINT-GERMAIN-FOOTBALL') instead of
#crawl('PARIS SAINT GERMAIN FOOTBALL')
truelink = ''
else:
# print(a_tag['href'])
# adding to the list premier served no purpose. Using split on href would result in list index out of range
truelink = f'https://www.verif.com{a_tag["href"]}' #relative link already so no extra / after .com
except Exception as e:
print(e)
truelink = '' #handle case of 'other' fail. Make sure there is an assigment
finally:
time.sleep(5)
return truelink #unless try succeeded this would have failed with local variable referenced before assignment
def single_text(item_url):
source_code = requests.get(item_url, headers = {'User-Agent':'Mozilla/5.0'})
print('nivo1 ok')
plain_text = source_code.text # La page en html avec toutes ces balises
soup = bs(plain_text,features="lxml")
print('nivo2 ok')
table = soup.select_one('.table') # on cherche que la balise table
#print('nivo1 ok', '\n', table)
if table is None:
df = pd.DataFrame()
else:
df = pd.read_html(str(table))[0] #simplify to work direct with table and pandas;avoid your loops
return df
def main():
terms = ['PARIS-SAINT-GERMAIN-FOOTBALL', 'PARIS SAINT GERMAIN FOOTBALL']
for term in terms:
item_url = crawl(term)
if item_url:
#print(item_url)
df = single_text(item_url) # what is single_item in your question? There is single_text
return df
if __name__ == '__main__':
df = main()
print(df)
Your error suggests that you trying to run find_all() against a variable which hasn't been populated, i.e. a tag wasn't found to which you could run find_all() against. I have dealt with this by including a statement testing for NoneType
if VALUE is not None:
## code when the tag is found
else:
## code when tag is not found
I think this is the bit you need to do an update like this,
for tr in table_rows:
if tr is not None:
td = tr.find_all('td')
row = row = [tr.text.strip() for tr in td]
l.append(row)
# On enleve certains caractères unitiles
df = pd.DataFrame(l)
else:
## code to run when tr isn't populated
There's a more colourful example where some XML is being parsed where this in action here
Related
From someday I am trying to crawl all vessel data from vesselfinder with its description page, like from description page I want its information like vessel type, Imo number etc. in table form. I try different way to do this but still a lot of errors. First, I found that how I go through these links to its description page, how to get all these links from all pages, also how to get specific table data from its description page (which is still not complete but get some).
But today I try get the data from all links with its description pages at same time, it gives me a lot of error which make me so confused (by combining the code).
I attached my code, which is not good but to this point #print(len(vessellist)) it work after that… errors..
import requests
from bs4 import BeautifulSoup
import pandas as pd
headers = {
'user-agent': 'Mozilla/5.0',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
baseurl = 'https://www.vesselfinder.com/vessels'
vessellist = []
for x in range(1,6):
response = requests.get(
f'https://www.vesselfinder.com/vessels?page={x}',
headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
contents = soup.find_all('td', class_='v2')
for property in contents:
for item in property.find_all('a', href=True):
vessellist.append(baseurl + item['href'])
for link in vessellist:
response = requests.get(link, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', class_ = 'tparams')
head = []
for i in table.find_all('td', class_ = 'n3'):
title = i.text
head.append(title)
values =[]
for row in table.find_all('td', class_ = 'v3'):
data = row.text
values.append(data)
df = pd.DataFrame(values)
print(df)
two steps: get summary data (includes href).Next get detailled ones. Theses two steps are implemented in two functions. Here I get first 10 pages, 200 are available.
import requests as rq
from bs4 import BeautifulSoup as bs
from requests.api import head
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
def getSummaryData():
data = []
url = "https://www.vesselfinder.com/vessels"
for page in range(1, 10+1, 1): # only 200 first pages autorized ?
print("Page : %d/10" % page)
resp = rq.get(url + "?page=%s" % page, headers=headers)
soup = bs(resp.content, "lxml")
section = soup.find_all('section', {'class', 'listing'})[0]
tbody = section.find_all('tbody')[0]
trs = tbody.find_all('tr')
for tr in trs:
tds = tr.find_all('td')
# column 1 data
sub = tds[1].find('a')
href = sub['href']
divs = sub.find_all('div')
country = divs[0]['title']
sub_divs = divs[1].find_all('div')
vessel_name = sub_divs[0].text
vessel_type = sub_divs[1].text
# column 2 data
build_year = tds[2].text
# column 3 data
gt = tds[3].text
# column 4 data
dwt = tds[4].text
# column 5 data
size = tds[5].text
# save data
tr_data = {'country': country,
'vessel_name': vessel_name,
'vessel_type': vessel_type,
'build_year': build_year,
'gt': gt,
'dwt': dwt,
'size': size,
'href': href}
data.append(tr_data)
return data
def getDetailledData(data):
for (iel, el) in enumerate(data):
print("%d/%d" % (iel+1, len(data)))
url = "https://www.vesselfinder.com" + el['href']
# make get call
resp = rq.get(url, headers=headers)
soup = bs(resp.content, "lxml")
# position and voyage data
table = soup.find_all('table', {'class', 'aparams'})[0]
trs = table.find_all('tr')
labels = ["course_speed", "current_draught","navigation_status",
"position_received", "IMO_MMSI", "callsign", "flag", "length_beam"]
for (i, tr) in enumerate(trs):
td = tr.find_all('td')[1]
el.update({'%s' % labels[i]: td.text})
# vessel particulars
table = soup.find_all('table', {'class', 'tparams'})[0]
trs = table.find_all('tr')
labels = ["IMO_number", "vessel_name", "ship_type", "flag",
"homeport", "gross_tonnage", "summer_deadweight_t",
"length_overall_m", "beam_m", "draught_m", "year_of_built",
"builder", "place_of_built", "yard", "TEU", "crude", "grain",
"bale", "classification_society", "registered_owner", "manager"]
for (i, tr) in enumerate(trs):
td = tr.find_all('td')[1]
el.update({'%s' % labels[i]: td.text})
#break
return data
Call theses functions :
data = getSummaryData() # href include
data = getDetailledData(data)
Don't rely on 'class' tag to target the data. Generally, you need to go throught table -> tbody and then get tds or trs to be sure that's the correct ones.
I am still a beginner so Im sorry if this is a stupid question. I am trying to scrape some new articles for my master analysis through Jupyter notebook, but I am struggling with pagination. How can I fix that?
Here is the code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
danas = []
base_url = 'https://www.danas.rs/tag/izbori-2020/page/'
r = requests.get(base_url)
c = r.content
soup = BeautifulSoup(c,"html.parser")
paging = soup.find("div",{"column is-8"}).find("div",{"nav-links"}).find_all("a")
start_page = paging[1].int
last_page = paging[len(paging)-1].int
web_content_list = []
for page_number in range(int(float(start_page)),int(float(last_page)) + 1):
url = base_url+str(page_number)+"/.html"
r = requests.get(base_url+str(page_number))
c = r.content
soup = BeautifulSoup(c,"html.parser")
if r.status_code == 200:
soup = BeautifulSoup(r.content, 'html.parser')
try:
headline = soup.find('h1', {'class': 'post-title'}).text.strip()
except:
headline = None
try:
time = soup.find('time', {'class': 'entry-date published'}).text.strip()[:17]
except:
time = None
try:
descr = soup.find('div', {'class': 'post-intro-content content'}).text.strip()
except:
descr = None
try:
txt = soup.find('div', {'class': 'post-content content'}).text.strip()
except:
txt = None
# create a list with all scraped info
danas = [headline,
date,
time,
descr,
txt]
web_content_list.append(danas)
else:
print('Oh No! ' + l)
dh = pd.DataFrame(danas)
dh.head()
And here is the error that pops out:
*AttributeError Traceback (most recent call last)
<ipython-input-10-1c9e3a7e6f48> in <module>
11 soup = BeautifulSoup(c,"html.parser")
12
---> 13 paging = soup.find("div",{"column is-8"}).find("div",{"nav-links"}).find_all("a")
14 start_page = paging[1].int
15 last_page = paging[len(paging)-1].int
AttributeError: 'NoneType' object has no attribute 'find'*
Well one issue is that 'https://www.danas.rs/tag/izbori-2020/page/' returns Greška 404: Tražena stranica nije pronađena. on the initial request. So wil lneed to address that.
Second issue is pulling in the start page and end page. Just curious, why would you search for a start page? All pages start at 1.
Another question, why convert to float, then int. Just get the page as int.
3rd, you never declare your variable date.
4th you are only grabbing the 1st article on the page. Is that what you want? Or do you want all the articles on the page? I left your code as is, since you're question is referring to iterating through the pages.
5th If you want the full text of the articles, you'll need to get to each of the article links.
There are few more issues too with the code. I tried to comment so you could see it. So compare this code to yours, and if you have questions, let me know:
Code:
from bs4 import BeautifulSoup
import requests
import pandas as pd
base_url = 'https://www.danas.rs/tag/izbori-2020/'
r = requests.get(base_url)
c = r.text
soup = BeautifulSoup(c,"html.parser")
paging = soup.find("div",{"column is-8"}).find("div",{"nav-links"}).find_all("a")
start_page = 1
last_page = int(paging[1].text)
web_content_list = []
for page_number in range(int(start_page),int(last_page) + 1):
url = base_url+ 'page/' + str(page_number) #<-- fixed this
r = requests.get(url)
c = r.text
soup = BeautifulSoup(c,"html.parser")
if r.status_code == 200:
soup = BeautifulSoup(r.content, 'html.parser')
articles = soup.find_all('article')
for article in articles:
w=1
try:
headline = soup.find('h2', {'class': 'article-post-title'}).text.strip()
except:
headline = None
try:
time = soup.find('time')['datetime']
except:
time = None
try:
descr = soup.find('div', {'class': 'article-post-excerpt'}).text.strip()
except:
descr = None
# create a list with all scraped info <--- changed to dictionary so that you have column:value when you create the dataframe
danas = {'headline':headline,
'time':time,
'descr':descr}
web_content_list.append(danas)
print('Collected: %s of %s' %(page_number, last_page))
else:
#print('Oh No! ' + l) #<--- what is l?
print('Oh No!')
dh = pd.DataFrame(web_content_list) #<-- need to get the full appended list, not the danas, as thats overwritten after each iteration
dh.head()
Output:
print(dh.head().to_string())
headline time descr
0 Vučić saopštava ime mandatara vlade u 20 časova 2020-10-05T09:00:05+02:00 Predsednik Aleksandar Vučić će u danas, nakon sastanka Predsedništva Srpske napredne stranke (SNS) saopštiti ime mandatara za sastav nove Vlade Srbije. Vučić će odluku saopštiti u 20 sati u Palati Srbija, rečeno je FoNetu u kabinetu predsednika Srbije.
1 Nova skupština i nova vlada 2020-08-01T14:00:13+02:00 Saša Radulović biće poslanik još nekoliko dana i prvi je objavio da se vrši dezinfekcija skupštinskih prostorija, što govori u prilog tome da će se novi saziv, izabran 21. juna, ipak zakleti u Domu Narodne skupštine.
2 Brnabić o novom mandatu: To ne zavisi od mene, SNS ima dobre kandidate 2020-07-15T18:59:43+02:00 Premijerka Ana Brnabić izjavila je danas da ne zavisi od nje da li će i u novom mandatu biti na čelu Vlade Srbije.
3 Državna izborna komisija objavila prve rezultate, HDZ ubedljivo vodi 2020-07-05T21:46:56+02:00 Državna izborna komisija (DIP) objavila je večeras prve nepotpune rezultate po kojima vladajuća Hrvatska demokratska zajednica (HDZ) osvaja čak 69 mandata, ali je reč o rezultatima na malom broju prebrojanih glasova.
4 Analiza Pravnog tima liste „Šabac je naš“: Ozbiljni dokazi za krađu izbora 2020-07-02T10:53:57+02:00 Na osnovu izjave 123 birača, od kojih je 121 potpisana i sa matičnim brojem, prikupljenim u roku od 96 sati nakon zatvaranja biračkih mesta u nedelju 21. 6. 2020. godine u 20 časova, uočena su 263 kršenja propisa na 55 biračkih mesta, navodi se na početku Analize koju je o kršenju izbornih pravila 21. juna i uoči izbora sačinio pravni tim liste „Nebojša Zelenović – Šabac je naš“.
everybody.
So, I'm trying to write this function as a part of my python course. What it should do is go to a wiki page, parse the table with Greek philosophers there, and return the list of tuples, each containing the name of the philosopher and a link to his wiki page. Below is what I've got:
def get_philosophers():
url="https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
philosophers = []
import requests
from bs4 import BeautifulSoup
try:
response = requests.get(url)
if not response.status_code == 200:
return 'Main page error'
page = BeautifulSoup(response.content, "lxml")
table = page.find('table',class_='wikitable')
trs = table.find_all('tr')
bigname = ()
for tr in trs:
tds = tr.find_all('td')
name = tds[0].find('a').get('title')
link = "https://wikipedia.org" + tds[0].find('a').get('href')
bigname = (name, link)
philosophers.append(bigname)
return len(philosophers)
except:
print('Scraping error')
I've tried commands via console, they mainly worked; except for the 'for' loop, which returned 'index out of range' error on the name = tds[0].find('a').get('title') line, but when earlier I tried same commands not as a loop, but just for one of the elements, they worked alright.
UPD: modified the function:
url="https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
philosophers = []
import requests
from bs4 import BeautifulSoup
try:
response = requests.get(url)
if not response.status_code == 200:
return 'Main page error'
page = BeautifulSoup(response.content, "lxml")
table = page.find('table',class_='wikitable')
trs = table.find_all('tr')
bigname = ()
for tr in trs[1:]: #skip the thead tr element
try:
tds = tr.find_all('td')
name = tds[0].find('a').get('title')
link = "https://wikipedia.org" + tds[0].find('a').get('href')
bigname = (name, link)
philosophers.append(bigname)
# return philosophers
except:
print('Loop error')
return philosophers
except:
print('Scraping error')
works as intended.
It was the position of try - except that created the issue. Try :
def get_philosophers():
url="https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
philosophers = []
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
if not response.status_code == 200:
return 'Main page error'
page = BeautifulSoup(response.content, "lxml")
table = page.find('table',class_='wikitable')
trs = table.find_all('tr')
bigname = ()
for tr in trs:
try:
tds = tr.find_all('td')
name = tds[0].find('a').get('title')
link = "https://wikipedia.org" + tds[0].find('a').get('href')
bigname = (name, link)
philosophers.append(bigname)
except:
pass
return len(philosophers)
Now call it:
x = get_philosophers()
print(x)
What this does is that, it skips the error causing tr while iterating.
Or just delete the first error causer:
def get_philosophers():
url="https://en.wikipedia.org/wiki/List_of_ancient_Greek_philosophers"
philosophers = []
import requests
from bs4 import BeautifulSoup
try:
response = requests.get(url)
if not response.status_code == 200:
return 'Main page error'
page = BeautifulSoup(response.content, "lxml")
table = page.find('table',class_='wikitable')
trs = table.find_all('tr')
bigname = ()
del trs[0] # deletion
for tr in trs:
tds = tr.find_all('td')
name = tds[0].find('a').get('title')
link = "https://wikipedia.org" + tds[0].find('a').get('href')
bigname = (name, link)
print(bigname)
philosophers.append(bigname)
return len(philosophers)
except:
print('Scraping error')
I am very new to python (three days in) and I have stumbled into a problem I can't solve with google/youtube. I want to scrape the National Governors Association for background data of all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilises a "next" button or the url structure to loop over several sites. This website, however, does not have a next button and the url-links does not follow a loopable structure. So I am stuck.
I would appreciate any help I can get very much. I want to extract the info above the main text (Office Dates, School(s) etc in the "address" tag) in each governors page, for example in this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")
#dl list of all govs
dfs = pd.read_html(url, header=0)
for df in dfs:
df.to_csv('governors.csv')
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
with open ('governors_links.csv', 'w') as r:
for link in links:
r.write(link['href'])
r.write('\n')
r.close()
#enter each gov page and extract data in the "address" tag(s)
#save this in a csv file
I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want of all the Governors one by one:
for link in links:
r = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(r, 'html.parser')
print(soup.find('h2').text) # Name of Governor
for p in soup.find('div', {'class': 'col-md-3'}).findAll('p'):
print(p.text.strip()) # Office dates, address, phone, ...
for p in soup.find('div', {'class': 'col-md-7'}).findAll('p'):
print(p.text.strip()) # Family, school, birth state, ...
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]
This may work. I haven't tested it out to full completion since I'm at work but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def main():
url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce, "html.parser")
finished = False
csv_data = open('Govs.csv', 'a')
csv_data.write('Name,Address,OfficeDates,Success,Address,Phone,Fax,Born,BirthState,Party,Schooling,Email')
try:
while not finished:
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
for link in links:
info_array = []
gov = {}
name = link.string
gov_sauce = requests.get(r'https://nga.org'+link.get('href')).text
gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
#print(gov_soup)
office_and_stuff_info = gov_soup.findAll('address')
for address in office_and_stuff_info:
infos = address.findAll('p')
for info in infos:
tex = re.sub('[^a-zA-Z\d:]','',info.text)
tex = re.sub('\\s+',' ',info.text)
tex = tex.strip()
if tex:
info_array.append(tex)
info_array = list(set(info_array))
gov['Name'] = name
secondarry_address = ''
gov['Address'] = ''
for line in info_array:
if 'OfficeDates:' in line:
gov['OfficeDates'] = line.replace('OfficeDates:','').replace('-','')
elif 'Succ' or 'Fail' in line:
gov['Success'] = line
elif 'Address' in line:
gov['Address'] = line.replace('Address:','')
elif 'Phone:' or 'Phone ' in line:
gov['Phone'] = line.replace('Phone ','').replace('Phone: ','')
elif 'Fax:' in line:
gov['Fax'] = line.replace('Fax:','')
elif 'Born:' in line:
gov['Born'] = line.replace('Born:','')
elif 'Birth State:' in line:
gov['BirthState'] = line.replace('BirthState:','')
elif 'Party:' in line:
gov['Party'] = line.replace('Party:','')
elif 'School(s)' in line:
gov['Schooling'] = line.replace('School(s):','').replace('School(s) ')
elif 'Email:' in line:
gov['Email'] = line.replace('Email:','')
else:
secondarry_address = line
gov['Address'] = gov['Address'] + secondarry_address
data_line = gov['Name'] +','+gov['Address'] +','+gov['OfficeDates'] +','+gov['Success'] +','+gov['Address'] +','+ gov['Phone'] +','+ gov['Fax'] +','+gov['Born'] +','+gov['BirthState'] +','+gov['Party'] +','+gov['Schooling'] +','+gov['Email']
csv_data.write(data_line)
next_page_link = soup.find('ul','pagination center-blockdefault').find('a',{'aria-label':'Next'})
if next_page_link.parent.get('class') == 'disabled':
finished = True
else:
url = r'https://nga.org'+next_page_link.get('href')
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce,'html.parser')
except:
print('Code failed.')
finally:
csv_data.close()
if __name__ == '__main__':
main()
I am trying to scrape multiple pages of a url.
But am able to scrape only the first page is there is a way to get all the pages.
Here is my code.
from bs4 import BeautifulSoup as Soup
import urllib, requests, re, pandas as pd
pd.set_option('max_colwidth',500) # to remove column limit (Otherwise, we'll lose some info)
df = pd.DataFrame()
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
comp_name = elem.find('span', attrs={'class':'company'}).getText().strip()
job_title = elem.find('a', attrs={'class':'turnstileLink'}).attrs['title']
home_url = "http://www.indeed.com"
job_link = "%s%s" % (home_url,elem.find('a').get('href'))
job_addr = elem.find('span', attrs={'class':'location'}).getText()
date_posted = elem.find('span', attrs={'class': 'date'}).getText()
description = elem.find('span', attrs={'class': 'summary'}).getText().strip()
comp_link_overall = elem.find('span', attrs={'class':'company'}).find('a')
if comp_link_overall != None:
comp_link_overall = "%s%s" % (home_url, comp_link_overall.attrs['href'])
else: comp_link_overall = None
df = df.append({'comp_name': comp_name, 'job_title': job_title,
'job_link': job_link, 'date_posted': date_posted,
'overall_link': comp_link_overall, 'job_location': job_addr, 'description': description
}, ignore_index=True)
df
df.to_csv('path\\web_scrape_Indeed.csv', sep=',', encoding='utf-8')
Please suggest if there is anyway.
Case 1: The code presented here is exactly what you have
Comp_urls = ['https://www.indeed.com/jobs?q=Dell&rbc=DELL&jcid=0918a251e6902f97', 'https://www.indeed.com/jobs?q=Harman&rbc=Harman&jcid=4faf342d2307e9ed','https://www.indeed.com/jobs?q=johnson+%26+johnson&rbc=Johnson+%26+Johnson+Family+of+Companies&jcid=08849387e791ebc6','https://www.indeed.com/jobs?q=nova&rbc=Nova+Biomedical&jcid=051380d3bdd5b915']
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
The problem here is targetElements changes with every iteration in the first for loop.
To avoid this, indent the second for loop inside the first like so:
for url in Comp_urls:
target = Soup(urllib.request.urlopen(url), "lxml")
targetElements = target.findAll('div', class_ =' row result')
for elem in targetElements:
Case 2: Your the bug is not a result of improper indentation (i.e. not like what is in your original post)
If it is the case that your code is properly idented , then it may be the case that targetElements is an empty list. This means target.findAll('div', class_ =' row result') does not return anything. In that case, visit the sites, check out the dom, then modify your scraping program.