Scraping with Python

Scraping with Python - python

I have a code in python in order to scrape some data from trip advisor(ratings from the reviews). The problem is that whenever I run the code it gives me different rows and never scraps all the webpages.
The index error that appears is:
Traceback (most recent call last):
File "C:/Users/thimios/PycharmProjects/TripadvisorScrapping/proxiro.py", line 26, in <module>
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
IndexError: list index out of range
The code is the following:
from bs4 import BeautifulSoup
import os
import urllib.request
file2 = open(os.path.expanduser(r"~/Desktop/TripAdviser Reviews2.csv"), "wb")
file2.write(b"Organization,Rating" + b"\n")
WebSites = [
"https://www.tripadvisor.com/Hotel_Review-g189400-d198932-Reviews-Hilton_Athens-Athens_Attica.html#REVIEWS"]
Checker ="REVIEWS"
# looping through each site until it hits a break
for theurl in WebSites:
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
#print(soup)
while True:
# Extract ratings from the text reviews
altarray = ""
for i in range(0,10):
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
rating1 = rating.find_all("span")[0]
rating2 = rating1['class'][1][-2:]
print(rating2)
if len(altarray) == 0:
altarray = [rating2]
else:
altarray.append(rating2)
#print(altarray)
#print(len(altarray))
#print(type(altarray))
# Extract Organization,
Organization1 = soup.find(attrs={'class': 'heading_name'})
Organization = Organization1.text.replace('"', ' ').replace('Review of',' ').strip()
#print(Organization)
# Loop through each review on the page
for x in range(0, 10):
Rating = altarray[x]
Rating = str(Rating)
#print(Rating)
#print(type(Rating))
Record2 = Organization + "," + Rating
if Checker == "REVIEWS":
file2.write(bytes(Record2, encoding="ascii", errors='ignore') + b"\n")
link = soup.find_all(attrs={"class": "nav next rndBtn ui_button primary taLnk"})
#print(link)
#print(link[0])
if len(link) == 0:
break
else:
soup = BeautifulSoup(urllib.request.urlopen("http://www.tripadvisor.com" + link[0].get('href')),"html.parser")
#print(soup)
#print(Organization)
print(link[0].get('href'))
Checker = link[0].get('href')[-7:]
#print(Checker)
file2.close()
I am supposing that trip advisor doesn't give full access to the data.Any idea?

The error is encountered when you are trying to access an element in the list by index and that index does not exists.
I have ran your code and it prints :
50
50
50
50
50
50
40
40
40
50
Although, the way you are looping is not the most pythonic way of doing it and also susceptible to lot of index errors.
What you can do is replace this :
for i in range(0,10):
rating = soup.findAll("div", {'class': 'rating reviewItemInline'})[i]
with :
for rating in soup.findAll("div", {'class': 'rating reviewItemInline'}) :
This shall resolve the error as well.

Related

Looping until max results

I'm pretty new to web scraping but enjoying it so far so thought I'd test myself!
I've written this query to scrape this website but just wondering is there a way of making it more efficient? At the moment, I've had to set the max page to 87 as this is the last page that guitars appear on. However, amps only have 15 pages of results but I'm still looping through 87. Any ideas appreciated!
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
n = 88
#ELECTRIC GUITAR DATA
for category in ['guitars/electric/','guitars/bass/','amps/','guitars/acoustic/','pedals/']:
for x in range(1,n):
url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(x)
print(url)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
for index in range(0, len(products)):
guitar_products.append({
'product': products[index],
'price' : prices[index],
'avail' : avails[index]
})
guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace('[^\d.]', '', regex=True))
Thanks

Try the following approach:
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
#ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
page_number = 1
while True:
url = f"https://www.guitarguitar.co.uk/{category}page-{page_number}"
print(url)
page_number += 1
req = requests.get(url)
soup = BeautifulSoup(req.content, 'html.parser')
for div_product in soup.find_all('div', class_="product-inner"):
product = div_product.find('h3', {'class': 'qa-product-list-item-title'}).get_text(strip=True)
price = div_product.find('span', {'class': 'js-pounds'}).get_text(strip=True)
avail = div_product.find('div', {'class': 'availability'}).get_text(strip=True)
guitar_products.append({'product' : product, 'price' : price, 'avail' : avail})
# Is there a next button?
if not soup.find('a', class_="next-page-button"):
print("No more")
break
guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace('[^\d.]', '', regex=True))
Improvements:
This looks for the Next button on each page to then skip to the next category.
It locates the <div> holding each product and then uses a single find to get each product detail. This avoids the need to build multiple lists and then join them.
Build the URL using a Python f string.

You can check H1:
*soup = BeautifulSoup(page.content, 'html.parser')*
if soup.find('h1').contents[0] == 'Page Not Found':
break
or change circle from for to while:
is_page = True
x = 0
while is_page:
x = x + 1
. . .
if soup.find('h1').contents[0] == 'Page Not Found':
is_page = False
break

This is probably not the most elegant solution, but it is functional and straightforward. An infinite loop which ends if no product is found.
import pandas as pd
import requests
from bs4 import BeautifulSoup
guitar_products = []
n = 1
# ELECTRIC GUITAR DATA
for category in ['guitars/electric/', 'guitars/bass/', 'amps/', 'guitars/acoustic/', 'pedals/']:
while True:
url = "https://www.guitarguitar.co.uk/" + category + "page-" + str(n)
print(url)
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
products = [product.text.strip() for product in soup.findAll('h3', {'class': 'qa-product-list-item-title'})]
prices = [price.text.strip()[:-1] for price in soup.findAll('span', {'class': 'js-pounds'})]
avails = [avail.text.strip() for avail in soup.findAll('div', {'class': 'availability'})]
for index in range(0, len(products)):
guitar_products.append({
'product': products[index],
'price': prices[index],
'avail': avails[index]
})
if len(products) == 0:
n = 1
break
else:
n += 1
guitar_data = pd.DataFrame(guitar_products)
guitar_data['price'] = pd.to_numeric(guitar_data['price'].str.replace('[^\d.]', '', regex=True))

I want to extract IMDb movie IDs using python

Here is my code:
So I wanted to extract all the bollywood movies, and the project requires, movie titles, cast, crew, IMDB id etc.... I am not able to get all the IMDb IDs with the error nonetype. When I used it on one page only it was working quite well, however, when I use it on multiple pages it shows an error. Please help
#importing the libraries needed
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
from random import randint
#declaring the list of empty variables, So that we can append the data overall
movie_name = []
year = []
time=[]
rating=[]
votes = []
description = []
director_s = []
starList= []
imdb_id = []
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=num_votes,desc&start=1&ref_=adv_nxt"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.findAll('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
name = store.h3.a.text
movie_name.append(name)
year_of_release = store.h3.find('span', class_ = "lister-item-year text-muted unbold").text
year.append(year_of_release)
runtime = store.p.find("span", class_ = 'runtime').text if store.p.find("span", class_ = 'runtime') else " "
time.append(runtime)
rate = store.find('div', class_ = "inline-block ratings-imdb-rating").text.replace('\n', '') if store.find('div', class_ = "inline-block ratings-imdb-rating") else " "
rating.append(rate)
value = store.find_all('span', attrs = {'name': "nv"})
vote = value[0].text if store.find_all('span', attrs = {'name': "nv"}) else " "
votes.append(vote)
# Description of the Movies
describe = store.find_all('p', class_ = 'text-muted')
description_ = describe[1].text.replace('\n', '') if len(describe) > 1 else ' '
description.append(description_)
## Director
ps = store.find_all('p')
for p in ps:
if 'Director'in p.text:
director =p.find('a').text
director_s.append(director)
## ID
imdbID = store.find('span','rating-cancel').a['href'].split('/')[2]
imdb_id.append(imdbID)
## actors
star = store.find("p", attrs={"class":""}).text.replace("Stars:", "").replace("\n", "").replace("Director:", "").strip()
starList.append(star)
Error:
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17576/2711511120.py in <module>
63
64 ## IDs
---> 65 imdbID = store.find('span','rating-cancel').a['href'].split('/')[2] if store.find('span','rating-cancel').a['href'].split('/')[2] else ' '
66 imdb_id.append(imdbID)
67
AttributeError: 'NoneType' object has no attribute 'a'

Change your condition to the following, cause first you have to check if <span> exists:
imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
Example
Check the url, here are some of the <span> missing:
import requests
from bs4 import BeautifulSoup
#the whole core of the script
url = "https://www.imdb.com/search/title/?title_type=feature&primary_language=hi&sort=my_ratings,desc"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
movie_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})
for store in movie_data:
imdbID = store.find('span','rating-cancel').a.get('href').split('/')[2] if store.find('span','rating-cancel') else ' '
print(imdbID)
Output
tt9900050
tt9896506
tt9861220
tt9810436
tt9766310
tt9766294
tt9725058
tt9700334
tt9680166
tt9602804
Even better scrape the id via image tag cause these is always there even if there is only the placholder:
imdbID = store.img.get('data-tconst')

Error: Index out of range for my Flipkart product scraper

So I'm fairly new to Python and I tried using this Flipkart scraper.
I tried to add a 'price' module but it keeps giving me the error 'IndexError: list index out of range'
My goal for this scraper is to scrape product info, rating, price, specs, image URL, etc from Flipkart. It is a challenging goal for me so far.... but I think I can do it if I get the right help and understand python more.
import requests
from urllib.request import urlopen as req
from bs4 import BeautifulSoup as soup
filename = "mobiles.csv"
f = open(filename, "w")
headers = "product_name, specs, rating, price\n"
f.write(headers)
for i in range(0, 200):
url = 'https://www.flipkart.com/search?q=phones&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off'+'&page='+str(i)
print(url)
client = req(url)
html = client.read()
client.close()
page_soup = soup(html, "html.parser")
containers = page_soup.findAll("div",{"class":"col col-7-12"})
for container in containers:
price_container = container.findAll('div', {"class":"_1vC4OE _2rQ-NK"})
price = price_container[0].text
name_container = container.findAll("div", {"class":"_3wU53n"})
product_name = name_container[0].text
rate_container = container.findAll("div", {"class":"hGSR34"})
if(not(rate_container)):
rating = "none"
else:
rating = rate_container[0].text
specs_container = container.findAll("ul", {"class":"vFw0gD"})
specs = specs_container[0].text
f.write(product_name.replace(",", "|") + "," +specs + "," +rating + "," +price + "\n")
f.close()
Which prints the following:
https://www.flipkart.com/search?q=phones&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page=0
Traceback (most recent call last):
File "C:\Users\HOLES\Desktop\flipkart_web_scraper-master\flipkart_web_scraper-master\flipkart.py", line 24, in <module>
price = price_container[0].text
IndexError: list index out of range

The problem with your code is the container lies in the following code:
containers = page_soup.findAll("div",{"class":"col col-7-12"})
If you print containers[0] and searched for _1vC4OE _2rQ-NK inside it, you won't find any. So, you can fix this issue by looking to a broader <div> like this one:
containers = page_soup.findAll("div",{"class":"_1UoZlX"})

Python BeautifulSoup selenium scraper

I'm using the following python script for scraping info from Amazon pages.
At some point, it stopped returning page results. The script is starting, browsing through the keywords/pages but I only get the headers as output:
Keyword Rank Title ASIN Score Reviews Prime Date
I suspect that the problem is in the following line as this tag doesn't exist anymore and the results var doesn't get any value:
results = soup.findAll('div', attrs={'class': 's-item-container'})
This is the full code:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv
class AmazonScaper(object):
def __init__(self,keywords, output_file='example.csv',sleep=2):
self.browser = webdriver.Chrome(executable_path='/Users/willcecil/Dropbox/Python/chromedriver') #Add path to your Chromedriver
self.keyword_queue = deque(keywords) #Add the start URL to our list of URLs to crawl
self.output_file = output_file
self.sleep = sleep
self.results = []
def get_page(self, keyword):
try:
self.browser.get('https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={a}'.format(a=keyword))
return self.browser.page_source
except Exception as e:
logging.exception(e)
return
def get_soup(self, html):
if html is not None:
soup = BeautifulSoup(html, 'lxml')
return soup
else:
return
def get_data(self,soup,keyword):
try:
results = soup.findAll('div', attrs={'class': 's-item-container'})
for a, b in enumerate(results):
soup = b
header = soup.find('h2')
result = a + 1
title = header.text
try:
link = soup.find('a', attrs={'class': 'a-link-normal a-text-normal'})
url = link['href']
url = re.sub(r'/ref=.*', '', str(url))
except:
url = "None"
# Extract the ASIN from the URL - ASIN is the breaking point to filter out if the position is sponsored
ASIN = re.sub(r'.*amazon.co.uk.*/dp/', '', str(url))
# Extract Score Data using ASIN number to find the span class
score = soup.find('span', attrs={'name': ASIN})
try:
score = score.text
score = score.strip('\n')
score = re.sub(r' .*', '', str(score))
except:
score = "None"
# Extract Number of Reviews in the same way
reviews = soup.find('a', href=re.compile(r'.*#customerReviews'))
try:
reviews = reviews.text
except:
reviews = "None"
# And again for Prime
PRIME = soup.find('i', attrs={'aria-label': 'Prime'})
try:
PRIME = PRIME.text
except:
PRIME = "None"
data = {keyword:[keyword,str(result),title,ASIN,score,reviews,PRIME,datetime.datetime.today().strftime("%B %d, %Y")]}
self.results.append(data)
except Exception as e:
print(e)
return 1
def csv_output(self):
keys = ['Keyword','Rank','Title','ASIN','Score','Reviews','Prime','Date']
print(self.results)
with open(self.output_file, 'a', encoding='utf-8') as outputfile:
dict_writer = csv.DictWriter(outputfile, keys)
dict_writer.writeheader()
for item in self.results:
for key,value in item.items():
print(".".join(value))
outputfile.write(",".join('"' + item + '"' for item in value)+"\n") # Add "" quote character so the CSV accepts commas
def run_crawler(self):
while len(self.keyword_queue): #If we have keywords to check
keyword = self.keyword_queue.popleft() #We grab a keyword from the left of the list
html = self.get_page(keyword)
soup = self.get_soup(html)
time.sleep(self.sleep) # Wait for the specified time
if soup is not None: #If we have soup - parse and save data
self.get_data(soup,keyword)
self.browser.quit()
self.csv_output() # Save the object data to csv
if __name__ == "__main__":
keywords = [str.replace(line.rstrip('\n'),' ','+') for line in
open('keywords.txt')] # Use our file of keywords & replaces spaces with +
ranker = AmazonScaper(keywords) # Create the object
ranker.run_crawler() # Run the rank checker
The output should look like this (I have trimmed the Titles for clarity).
Keyword Rank Title ASIN Score Reviews Prime Date
Blue+Skateboard 3 Osprey Complete
Beginn B00IL1JMF4 3.7 40 Prime February 21, 2019
Blue+Skateboard 4 ENKEEO Complete Mini
C B078J9Y1DG 4.5 42 Prime February 21, 2019 Blue+Skateboard 5 skatro -
Mini Cruiser B00K93PIXM 4.8 223 Prime February 21, 2019
Blue+Skateboard 7 Vinsani Retro Cruiser
B00CSV72AK 4.4 8 Prime February 21, 2019 Blue+Skateboard 8 Ridge
Retro Cruiser Bo B00CA33ISQ 4.1 207 Prime February 21, 2019
Blue+Skateboard 9 Xootz Kids Complete
Be B01B2YNSJM 3.6 32 Prime February 21, 2019 Blue+Skateboard 10 Enuff
Pyro II Skateboa B00MGRGX2Y 4.3 68 Prime February 21, 2019

The following shows some changes you could make. I have changed to using css selectors at some points.
The main result set to loop over are retrieved by soup.select('.s-result-list [data-asin]'). This specifies elements with class name .s-result-list having children with attribute data-asin. This matches the 60 (current) items on page.
I swapped the PRIME selection to using an attribute = value selector
Headers are now h5 i.e. header = soup.select_one('h5').
soup.select_one('[aria-label="Amazon Prime"]
Example code:
import datetime
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
keyword = 'blue+skateboard'
driver = webdriver.Chrome()
url = 'https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={}'
driver.get(url.format(keyword))
soup = BeautifulSoup(driver.page_source, 'lxml')
results = soup.select('.s-result-list [data-asin]')
for a, b in enumerate(results):
soup = b
header = soup.select_one('h5')
result = a + 1
title = header.text.strip()
try:
link = soup.select_one('h5 > a')
url = link['href']
url = re.sub(r'/ref=.*', '', str(url))
except:
url = "None"
if url !='/gp/slredirect/picassoRedirect.html':
ASIN = re.sub(r'.*/dp/', '', str(url))
#print(ASIN)
try:
score = soup.select_one('.a-icon-alt')
score = score.text
score = score.strip('\n')
score = re.sub(r' .*', '', str(score))
except:
score = "None"
try:
reviews = soup.select_one("href*='#customerReviews']")
reviews = reviews.text.strip()
except:
reviews = "None"
try:
PRIME = soup.select_one('[aria-label="Amazon Prime"]')
PRIME = PRIME['aria-label']
except:
PRIME = "None"
data = {keyword:[keyword,str(result),title,ASIN,score,reviews,PRIME,datetime.datetime.today().strftime("%B %d, %Y")]}
print(data)
Example output:

Scrape site with multiple links without "next" button using beautiful soup

I am very new to python (three days in) and I have stumbled into a problem I can't solve with google/youtube. I want to scrape the National Governors Association for background data of all US governors and save this into a csv file.
I have managed to scrape a list of all governors, but to get more details I need to enter the page of each governor individually and save the data. I have found code suggestions online which utilises a "next" button or the url structure to loop over several sites. This website, however, does not have a next button and the url-links does not follow a loopable structure. So I am stuck.
I would appreciate any help I can get very much. I want to extract the info above the main text (Office Dates, School(s) etc in the "address" tag) in each governors page, for example in this one.
This is what I have got so far:
import bs4 as bs
import urllib.request
import pandas as pd
url = 'https://www.nga.org/cms/FormerGovBios?begincac77e09-db17-41cb-9de0-687b843338d0=10&endcac77e09-db17-41cb-9de0-687b843338d0=9999&pagesizecac77e09-db17-41cb-9de0-687b843338d0=10&militaryService=&higherOfficesServed=&religion=&lastName=&sex=Any&honors=&submit=Search&college=&firstName=&party=&inOffice=Any&biography=&warsServed=&'
sauce = urllib.request.urlopen(url).read()
soup = bs.BeautifulSoup(sauce, "html.parser")
#dl list of all govs
dfs = pd.read_html(url, header=0)
for df in dfs:
df.to_csv('governors.csv')
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
with open ('governors_links.csv', 'w') as r:
for link in links:
r.write(link['href'])
r.write('\n')
r.close()
#enter each gov page and extract data in the "address" tag(s)
#save this in a csv file

I'm assuming that you've got all the links in a list named links.
You can do this to get the data you want of all the Governors one by one:
for link in links:
r = urllib.request.urlopen(link).read()
soup = bs.BeautifulSoup(r, 'html.parser')
print(soup.find('h2').text) # Name of Governor
for p in soup.find('div', {'class': 'col-md-3'}).findAll('p'):
print(p.text.strip()) # Office dates, address, phone, ...
for p in soup.find('div', {'class': 'col-md-7'}).findAll('p'):
print(p.text.strip()) # Family, school, birth state, ...
Edit:
Change your links list to
links = ['https://www.nga.org' + x.get('href') for x in table.findAll('a')]

This may work. I haven't tested it out to full completion since I'm at work but it should be a starting point for you.
import bs4 as bs
import requests
import re
def is_number(s):
try:
int(s)
return True
except ValueError:
return False
def main():
url = 'https://www.nga.org/cms/FormerGovBios?inOffice=Any&state=Any&party=&lastName=&firstName=&nbrterms=Any&biography=&sex=Any&religion=&race=Any&college=&higherOfficesServed=&militaryService=&warsServed=&honors=&birthState=Any&submit=Search'
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce, "html.parser")
finished = False
csv_data = open('Govs.csv', 'a')
csv_data.write('Name,Address,OfficeDates,Success,Address,Phone,Fax,Born,BirthState,Party,Schooling,Email')
try:
while not finished:
#dl links to each gov
table = soup.find('table', 'table table-striped table-striped')
links = table.findAll('a')
for link in links:
info_array = []
gov = {}
name = link.string
gov_sauce = requests.get(r'https://nga.org'+link.get('href')).text
gov_soup = bs.BeautifulSoup(gov_sauce, "html.parser")
#print(gov_soup)
office_and_stuff_info = gov_soup.findAll('address')
for address in office_and_stuff_info:
infos = address.findAll('p')
for info in infos:
tex = re.sub('[^a-zA-Z\d:]','',info.text)
tex = re.sub('\\s+',' ',info.text)
tex = tex.strip()
if tex:
info_array.append(tex)
info_array = list(set(info_array))
gov['Name'] = name
secondarry_address = ''
gov['Address'] = ''
for line in info_array:
if 'OfficeDates:' in line:
gov['OfficeDates'] = line.replace('OfficeDates:','').replace('-','')
elif 'Succ' or 'Fail' in line:
gov['Success'] = line
elif 'Address' in line:
gov['Address'] = line.replace('Address:','')
elif 'Phone:' or 'Phone ' in line:
gov['Phone'] = line.replace('Phone ','').replace('Phone: ','')
elif 'Fax:' in line:
gov['Fax'] = line.replace('Fax:','')
elif 'Born:' in line:
gov['Born'] = line.replace('Born:','')
elif 'Birth State:' in line:
gov['BirthState'] = line.replace('BirthState:','')
elif 'Party:' in line:
gov['Party'] = line.replace('Party:','')
elif 'School(s)' in line:
gov['Schooling'] = line.replace('School(s):','').replace('School(s) ')
elif 'Email:' in line:
gov['Email'] = line.replace('Email:','')
else:
secondarry_address = line
gov['Address'] = gov['Address'] + secondarry_address
data_line = gov['Name'] +','+gov['Address'] +','+gov['OfficeDates'] +','+gov['Success'] +','+gov['Address'] +','+ gov['Phone'] +','+ gov['Fax'] +','+gov['Born'] +','+gov['BirthState'] +','+gov['Party'] +','+gov['Schooling'] +','+gov['Email']
csv_data.write(data_line)
next_page_link = soup.find('ul','pagination center-blockdefault').find('a',{'aria-label':'Next'})
if next_page_link.parent.get('class') == 'disabled':
finished = True
else:
url = r'https://nga.org'+next_page_link.get('href')
sauce = requests.get(url).text
soup = bs.BeautifulSoup(sauce,'html.parser')
except:
print('Code failed.')
finally:
csv_data.close()
if __name__ == '__main__':
main()

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scraping with Python - python

Related

Looping until max results

I want to extract IMDb movie IDs using python

Error: Index out of range for my Flipkart product scraper

Python BeautifulSoup selenium scraper

Scrape site with multiple links without "next" button using beautiful soup

Categories

Resources