How can I run parts of my parser in parallel? - Python

I'm parsing all the news from a site. The first task is to collect every link, title, time and date from the archive. This is my code:
lst_of_URL = []
years = list(range(2005, 2023))
months = list(range(1, 13))
months = list(map(str, months))
for i in range(len(months)):
    if len(months[i]) == 1:
        months[i] = '0' + months[i]
days = list(range(1, 32))
days = list(map(str, days))
for i in range(len(days)):
    if len(days[i]) == 1:
        days[i] = '0' + days[i]
for year in years:
    for month in months:
        for day in days:
            lst_of_URL.append(f'https://*******/archive/{year}/{month}/{day}')
This gives me a URL for every day of the archive. I then loop over the list and parse each day separately: from the HTML I extract the links, titles, publication dates, etc. and write them into MySQL. But it takes a long time, about 6 hours for 6600 entries in the list. How can I speed this up? I was thinking about running a couple of parsers in parallel over different parts of the URL list, but I don't know whether that's possible. What would you recommend? Here is the code:
for URL in tqdm(lst_of_URL):
    data_salary = {}
    try:
        html = urlopen(URL)
        bsObj = BeautifulSoup(html, 'lxml')
        for link in bsObj.find_all('div', {'class': 'c-card__body'}):
            data_salary['link_href'] = link.a.get('href')  # get links
            data_salary['link_name'] = link.a.get_text()   # get text from links
            for sibling in link.find('div', 'u-fx u-fx--wrap'):
                data_salary['date'] = sibling.get_text()   # get post dates and view counts
            cnx = mysql.connector.connect(user='root', password='aaaa1111',
                                          host='127.0.0.1',
                                          database='data')
            cursor = cnx.cursor()
            add_salary = ("INSERT INTO data_tsn "
                          "(link_href, link_name, date) "
                          "VALUES (%(link_href)s, %(link_name)s, %(date)s)")
            cursor.execute(add_salary, data_salary)
            cnx.commit()
    except HTTPError as e:
        print('cant connect or server not found')
    cursor.close()
    cnx.close()
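Since each request spends most of its time waiting on the network, one common way to speed this up is to fan the fetching and parsing out over a thread pool and keep the MySQL writes in the main thread. Below is a minimal sketch of that idea, not a drop-in replacement: the parse_day helper, the worker count of 16, and the reuse of a single database connection are assumptions added on top of the code above.

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen
from urllib.error import HTTPError

from bs4 import BeautifulSoup
import mysql.connector
from tqdm import tqdm


def parse_day(url):
    """Fetch one archive page and return a list of row dicts (hypothetical helper)."""
    rows = []
    try:
        html = urlopen(url)
    except HTTPError:
        return rows  # skip days that fail to load
    bsObj = BeautifulSoup(html, 'lxml')
    for link in bsObj.find_all('div', {'class': 'c-card__body'}):
        row = {'link_href': link.a.get('href'),
               'link_name': link.a.get_text(),
               'date': None}
        date_div = link.find('div', 'u-fx u-fx--wrap')
        if date_div is not None:
            for sibling in date_div:
                row['date'] = sibling.get_text()
        rows.append(row)
    return rows


# One connection for the whole run instead of one per page.
cnx = mysql.connector.connect(user='root', password='aaaa1111',
                              host='127.0.0.1', database='data')
cursor = cnx.cursor()
add_salary = ("INSERT INTO data_tsn (link_href, link_name, date) "
              "VALUES (%(link_href)s, %(link_name)s, %(date)s)")

# 16 workers is a guess; tune it to whatever the site tolerates.
with ThreadPoolExecutor(max_workers=16) as pool:
    futures = [pool.submit(parse_day, url) for url in lst_of_URL]
    for future in tqdm(as_completed(futures), total=len(futures)):
        for row in future.result():
            cursor.execute(add_salary, row)
        cnx.commit()

cursor.close()
cnx.close()

Even without threads, opening a new MySQL connection for every link is a large overhead on its own; reusing one connection and committing once per page, as sketched above, should already cut the runtime noticeably.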

Related

How to retrieve only tweets about a hashtag within an hour?

I have been trying to write Python code that uses snscrape to retrieve tweets about a hashtag within the last hour, but my code returns an empty dataframe every time.
This is what I have tried so far:
now = datetime.utcnow()
since = now - timedelta(hours=1)
since_str = since.strftime('%Y-%m-%d %H:%M:%S.%f%z')
until_str = now.strftime('%Y-%m-%d %H:%M:%S.%f%z')

# Query tweets with hashtag #SOSREX in the last one hour
query = '#SOSREX Since:' + since_str + ' until:' + until_str

SOSREX_data = []

for tweet in sntwitter.TwitterSearchScraper(query).get_items():
    if len(SOSREX_data) > 100:
        break
    else:
        SOSREX_data.append([tweet.date, tweet.user.username, tweet.user.displayname,
                            tweet.content, tweet.likeCount, tweet.retweetCount,
                            tweet.sourceLabel, tweet.user.followersCount, tweet.user.location])

# Creating a dataframe from the tweets list above
Tweets_data = pd.DataFrame(SOSREX_data,
                           columns=["Date_tweeted", "username", "display_name",
                                    "Tweets", "Number_of_Likes", "Number_retweets",
                                    "Source_of_Tweet",
                                    "number_of_followers", "location"])

print("Tweets_data")

How to get timestamp and user ID from a list of strings in Python 3?

I am trying to extract some parts of text from a list of strings.
This is what the list looks like:
'<rev revid="78273004" parentid="78127030" minor="" user="BF" timestamp="2016-01-19T17:33:57Z" comment="added [[Category:Politics]] usando [[Wikipedia:Monobook.js/Hot Cat|HotCat]]" />', '<rev revid="78127030" parentid="78054777" user="Atar" timestamp="2016-01-15T05:33:33Z" comment="template citazione; rinomina/fix nomi parametri; converto template cite xxx -> cita xxx; elimino parametri vuoti; fix formato data" />', '<rev revid="78054777" parentid="78054533" user="yk" timestamp="2016-01-11T20:50:39Z" comment="/* Voci correlate */ coll. esterni" />', ...
I would like to extract users and timestamps into two different arrays in order to plot them separately.
What I already tried to do is create two different arrays and try to get users and timestamps.
url = "https://it.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&rvlimit=500&titles=" + pageTitle
revisions = [] #list of all accumulated revisions
timestamps = [] #list of all accumulated timestamps
users = [] #list of all accumulated users
next = '' #information for the next request
while True:
response = requests.get(url + next).text #web request
revisions += re.findall('<rev [^>]*>', response) #adds all revisions from the current request to the list
timestamps += re.findall('timestamp="\d{4}-\d{2}-\d{2}\w\d{2}:\d{2}:\d{2}\w"', response)
users += re.findall('user="\w"', response)
cont = re.search('<continue rvcontinue="([^"]+)"', response)
if not cont: #break the loop if 'continue' element missing
break
next = "&rvcontinue=" + cont.group(1) #gets the revision Id from which to start the next request
return timestamps, users;
GetRevisions("Italia")
What I would like to get is two arrays, one with timestamps and another one with users.
timestamps= [2016-01-19T17:33:57Z, 2016-01-15T05:33:33Z, ...]
users= [BF, Atar, ...]
(I would like to make an association between users and timestamps).
However, I am getting only empty lists:
[], []
I hope you can help me.
Have you tried parsing your text with BeautifulSoup?
You can parse the text as HTML tags and extract the ones that matter to you in a simple loop:
from bs4 import BeautifulSoup
## The text you refer to as list:
yourText = '''<rev revid="78273004" parentid="78127030" minor="" user="BF" timestamp="2016-01-19T17:33:57Z" comment="added [[Category:Politics]] usando [[Wikipedia:Monobook.js/Hot Cat|HotCat]]" />', '<rev revid="78127030" parentid="78054777" user="Atar" timestamp="2016-01-15T05:33:33Z" comment="template citazione; rinomina/fix nomi parametri; converto template cite xxx -> cita xxx; elimino parametri vuoti; fix formato data" />', '<rev revid="78054777" parentid="78054533" user="yk" timestamp="2016-01-11T20:50:39Z" comment="/* Voci correlate */ coll. esterni" />'''
### parse it with BeautifulSoup
soup = BeautifulSoup(yourText, 'html.parser')
users = []
timestamps = []
for rev in soup.findAll('rev'):
    users.append(rev.get('user'))
    timestamps.append(rev.get('timestamp'))
print (users)
print (timestamps)
['BF', 'Atar', 'yk']
['2016-01-19T17:33:57Z', '2016-01-15T05:33:33Z', '2016-01-11T20:50:39Z']
Using your original code
Using your original code, we just need to change how you are capturing the text with the regexes. The logic I am applying is:
Starts with timestamp= or user=;
Followed by a ";
Followed by any characters that are not ";
Closed with a " character.
timestamps += re.findall('(?:timestamp=)"([^"]*)"', response)
users += re.findall('(?:user=)"([^"]*)"', response)
url = "https://it.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&rvlimit=500&titles=Italia"
revisions = [] #list of all accumulated revisions
timestamps = [] #list of all accumulated timestamps
users = [] #list of all accumulated users
next = '' #information for the next request
while True:
response = requests.get(url + next).text #web request
revisions += re.findall('(?=<rev)', response) #adds all revisions from the current request to the list
timestamps += re.findall('(?:timestamp=)"([^"]*)"', response)
users += re.findall('(?:user=)"([^"]*)"', response)
cont = re.search('<continue rvcontinue="([^"]+)"', response)
if not cont: #break the loop if 'continue' element missing
break
next = "&rvcontinue=" + cont.group(1) #gets the revision Id from which to start the next request
This will yield two lists with 9968 elements:
users[0:3]
Out[1]:
['U9POI57', 'SuperPierlu', 'Superchilum']
timestamps[0:3]
Out[2]:
['2019-07-24T22:15:23Z', '2019-07-24T16:09:59Z', '2019-07-24T12:40:24Z']
EDIT
Keeping only the date, without the time: to do so, you just need to change the closing character of the match from " to T:
timestamps += re.findall('(?:timestamp=)"([^"]*)T', response)
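Since the question also asks for an association between users and timestamps, here is a small follow-up sketch. It assumes every <rev> element carries both attributes so the two lists stay aligned; if that cannot be guaranteed, capture both attributes per element (as in the BeautifulSoup loop above) rather than zipping two independently built lists.

# Pair each user with the matching timestamp (assumes the lists were captured in the same order).
pairs = list(zip(users, timestamps))
print(pairs[:3])
# e.g. [('U9POI57', '2019-07-24T22:15:23Z'), ('SuperPierlu', '2019-07-24T16:09:59Z'), ...]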

Stop a loop if information is not found

I created a web scraping program that opens several URLs, checks which of the URLs has information related to tomorrow's date, and then prints some specific information from that URL. My problem is that sometimes none of the URLs in the list has information concerning tomorrow. In that case I would like the program to print something else, like "no data found". How could I accomplish that? Another doubt I have: do I need the while loop at the beginning? Thanks.
My code is:
from datetime import datetime, timedelta

tomorrow = datetime.now() + timedelta(days=1)
tomorrow = tomorrow.strftime('%d-%m-%Y')
day = ""

while day != tomorrow:
    for url in list_urls:
        browser.get(url)
        time.sleep(1)
        dia_page = browser.find_element_by_xpath("//*[@id='item2']/b").text
        dia_page = dia_page[-10:]
        day_uns = datetime.strptime(dia_page, "%d-%m-%Y")
        day = day_uns.strftime('%d-%m-%Y')
        if day == tomorrow:
            meals = browser.find_elements_by_xpath("//*[@id='item2']/span")
            meal_reg = browser.find_element_by_xpath("//*[@id='item_frm']/span[1]").text
            sopa2 = meals[0].text
            refeicao2 = meals[1].text
            sobremesa2 = meals[2].text
            print(meal_reg)
            print(sopa2)
            print(refeicao2)
            print(sobremesa2)
            break
No need for a while loop; you can use Python's for-else construct for this:
for url in list_urls:
    # do stuff
    if day == tomorrow:
        # do and print stuff
        break
else:  # break never encountered
    print("no data found")

Why can't I loop through a `payload` in `requests` to iterate my web scrape?

Summary: I want to iterate through a requests payload, so that I can change the log-in ID number for each scrape.
I'm using requests & beautiful soup to do a web scrape.
To log-in to the page, I need to enter a unique ID number; I have a list of such numbers, called hit_list.
For any given ID number, this script works absolutely fine. But what I want to do is automate it so that it runs through my entire hit_list.
In other words, I want num in payload_1 to change on each iteration. At present num remains constant and the scrape just repeats according to the length of hit_list (i.e. in this case the same scrape would run five times).
Please note, I'm very new to coding and this is my first project. I'm aware there are likely to be problems with it and am happy to receive constructive criticism.
Importing Libraries
import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl
#Recording time # Start
startTime = datetime.now()
print(datetime.now())
#use pymysql to create database- omitted here for parsimony
#This is a sample list, in reality the list will have 100,000 + numbers.
hit_list = [100100403,100100965,100101047,100100874,100100783]
"""
This is my code for importing the real list, included here incase the way the list is imported is relevant to the problem
wb = openpyxl.load_workbook('/Users/Seansmac/Desktop/stage2_trial.xlsx')
sheet= wb.get_sheet_by_name('Sheet1')
type(wb)
#LOUIS: Only importing first twenty (for trial purposes)
for id in range(1,20):
    hit_list.append(sheet.cell(row=id, column =1).value)
"""
def web_scrape():
    # I'm only creating a function because I'm told it's always good practice to put any 'bit' of logic into a function - I'm aware this probably looks amateurish.
    # Open page
    url = 'https://ndber.seai.ie/pass/ber/search.aspx'
    with requests.session() as r:
        r.headers.update({
            'user-agent': 'For more information on this data collection please contact **************************************'
        })
        for num in hit_list:
            # ***LOCATION OF THE PROBLEM***
            payload_1 = {
                'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
                'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search',
'__VIEWSTATE' :'/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFgxmD2QWAgIBD2QWAgIBD2QWAmYPZBYCZg9kFgQCAQ8WAh4JaW5uZXJodG1sZWQCAw9kFgICAg9kFgJmD2QWBAIBD2QWAgIDDw8WCB4EXyFTQgKAAh4MRGVmYXVsdFdpZHRoHB4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBgU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfBxxkZAIEDxQrAAJkEBYAFgAWABYCZg9kFgICAg9kFgJmDzwrABECARAWABYAFgAMFCsAAGQCBg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCCg8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQCDA8WAh8CaBYEAgEPFgIfAmhkAgMPZBYCZg9kFgJmD2QWAgIDD2QWAmYPZBYCZg9kFgICAQ8WAh8CaGQYAQUzY3RsMDAkRGVmYXVsdENvbnRlbnQkQkVSU2VhcmNoJGdyaWRSYXRpbmdzJGdyaWR2aWV3D2dkrGhAYkdLuZZh8E98usAnWAaRMxurQ1Gquc+9krb7Boc=',
            }
            r.post(url, data=payload_1)
            # click intermediate page
            payload_2 = {
                '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
'__VIEWSTATE': "/wEPDwULLTE2MDEwODU4NjAPFgIeE1ZhbGlkYXRlUmVxdWVzdE1vZGUCARYCZg9kFgICAw9kFgICAw8WAh4FY2xhc3MFC21haW53cmFwcGVyFgQCBQ8PFgIeB1Zpc2libGVnZGQCCQ9kFgICAQ9kFg5mD2QWAgIBDxYCHwJoFgICAQ8PFgIfAmhkFgJmD2QWAmYPZBYEAgEPFgIeCWlubmVyaHRtbGVkAgMPZBYCAgIPZBYCZg9kFgQCAQ9kFgICAw8PFgoeBF8hU0ICgAIeDERlZmF1bHRXaWR0aBweBFRleHQFCTEwMDEwMDMxMh4HVG9vbFRpcAU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQeBVdpZHRoHGRkAgMPZBYCAgMPDxYIHwQCgAIfBRwfBwU+UGxlYXNlIGVudGVyIGEgdmFsdWUsIHdpdGggbm8gc3BlY2lhbCBjaGFyYWN0ZXJzLCB3aXRoIG5vIHRleHQfCBxkZAICDw8WAh8CZ2QWAmYPZBYCZg9kFgICAw9kFgJmD2QWAmYPZBYCAgEPZBYCZg9kFgJmD2QWAgIBDxYCHwMFDlNlYXJjaCBSZXN1bHRzZAIEDxQrAAIPFgYfAmceElNlbGVjdGVkUm93SW5kZXhlczLNAQABAAAA/////wEAAAAAAAAABAEAAAB+U3lzdGVtLkNvbGxlY3Rpb25zLkdlbmVyaWMuTGlzdGAxW1tTeXN0ZW0uSW50MzIsIG1zY29ybGliLCBWZXJzaW9uPTQuMC4wLjAsIEN1bHR1cmU9bmV1dHJhbCwgUHVibGljS2V5VG9rZW49Yjc3YTVjNTYxOTM0ZTA4OV1dAwAAAAZfaXRlbXMFX3NpemUIX3ZlcnNpb24HAAAICAgJAgAAAAAAAAABAAAADwIAAAAAAAAACAseCmVkaXRfc3R5bGULKXNWMS5ORVQuV2ViQ29udHJvbHMuRWRpdFN0eWxlLCBWMS5ORVQuV2ViQ29udHJvbHMsIFZlcnNpb249MS40LjAuMCwgQ3VsdHVyZT1uZXV0cmFsLCBQdWJsaWNLZXlUb2tlbj01YmYzNDU3ZDMwODk1MjEzAmQQFgAWABYAFgJmD2QWAgICD2QWAmYPPCsAEQMADxYEHgtfIURhdGFCb3VuZGceC18hSXRlbUNvdW50AgFkARAWABYAFgAMFCsAABYCZg9kFgICAQ9kFgpmD2QWAgIBDw8WBB4PQ29tbWFuZEFyZ3VtZW50BQkxMDAxMDAzMTIfBgUJMTAwMTAwMzEyZGQCAQ9kFgJmDw8WAh8GBQNCRVJkZAICD2QWAmYPDxYCHwYFCzEwMDE1MTAwMDkwZGQCAw9kFgJmDw8WAh8GBQowNy0wMS0yMDA5ZGQCBA9kFgJmDw8WAh8GBSQzMCBNQVJJTkUgVklFVw1BVEhMT05FDUNPLiBXRVNUTUVBVEhkZAIGDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIIDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIKDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZAIMDxYCHwJoFgQCAQ8WAh8CaGQCAw9kFgJmD2QWAmYPZBYCAgMPZBYCZg9kFgJmD2QWAgIBDxYCHwJoZBgBBTNjdGwwMCREZWZhdWx0Q29udGVudCRCRVJTZWFyY2gkZ3JpZFJhdGluZ3MkZ3JpZHZpZXcPPCsADAEIAgFkjLH/5QxuANxuCh3kAmhUU/4/OZj+wy8nJDYIFx4Lowo=",
                '__VIEWSTATEGENERATOR': "1F9CCB97",
                '__EVENTVALIDATION': "/wEdAAbaTEcivWuxiWecwu4mVYO9eUnQmzIzqu4hlt+kSDcrOBWCa0ezllZh+jGXjO1EB1dmMORt6G1O0Qbn0WLg3p+rPmLeN6mjN7eq7JtUZMjpL2DXqeB/GqPe7AFtNDKiJkEPdN6Y/vq7o/49hX+o366Ioav3zEBl37yPlq3sYQBXpQ==",
            }
            s = r.post(url, data=payload_2)
            # scrape the page
            soup = bs4.BeautifulSoup(s.content, 'html.parser')
"""
FOR THE PURPOSES OF MY ISSUE EVERYTHING BELOW WORKS FINE & CAN BE SKIPPED
"""
print('\nBEGINNING SCRAPE....')
            # First Section
            ber_dec = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsBER'})
            # Address - clean scrape
            address = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress'})
            address = address.get_text(',').strip()
            print('address:', address)
            # Date of Issue - clean scrape
            date_issue1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue'})
            date_issue = date_issue1.find('div', {'class':'formControlReadonly'})
            date_issue = date_issue.get_text().strip()
            print('date_of_issue:', date_issue)
            # MPRN - clean scrape
            MPRN1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_MPRN'})
            MPRN = MPRN1.find('div', {'class':'formControlReadonly'})
            MPRN = MPRN.get_text().strip()
            print('MPRN:', MPRN)
            # Emissions Indicator - clean scrape
            emissions_indicator1 = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue'})
            emissions_indicator_bunched = emissions_indicator1.get_text().strip()
            print('\n\nem_bunched:', emissions_indicator_bunched)
            emissions_indicator, emissions_indicator_unit = emissions_indicator_bunched.split()
            print('emissions_indicator:', emissions_indicator)
            emissions_indicator_unit = emissions_indicator_unit.replace("(", "")
            emissions_indicator_unit = emissions_indicator_unit.replace(")", "")
            print('emissions_indicator_unit:', emissions_indicator_unit)
            # BER Score - clean scrape
            BER_bunched = ber_dec.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating'})
            BER_bunched = BER_bunched.get_text().strip()
            print('\n \nBER_bunched:', BER_bunched)
            BER_score, BER_actual_rating, BER_unit = BER_bunched.split()
            print('\nBER_score:', BER_score)
            print('\nBER_actual_rating:', BER_actual_rating)
            BER_unit = BER_unit.replace("(", " ")
            BER_unit = BER_unit.replace(")", "")
            print('\nClean_BER_unit:', BER_unit)
            # Type of Rating - clean scrape
            type_of_rating1 = ber_dec.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfBER_container_TypeOfRating'})
            type_of_rating = type_of_rating1.find('div', {'class':'formControlReadonly'})
            type_of_rating = type_of_rating.get_text().strip()
            print('type_of_rating:', type_of_rating)
            # Second Section
            dwelling_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsStructure'})
            # Dwelling Type - clean scrape
            dwelling_type1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DwellingType'})
            dwelling_type = dwelling_type1.find('div', {'class':'formControlReadonly'})
            dwelling_type = dwelling_type.get_text().strip()
            print('Dwelling Type:', dwelling_type)
            # Number of Stories - clean scrape
            num_stories1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_NoStoresy'})
            num_stories = num_stories1.find('div', {'class':'formControlReadonly'})
            num_stories = num_stories.get_text().strip()
            print('Number of Stories:', num_stories)
            # Year of Construction - clean scrape
            yr_construction1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_DateOfConstruction'})
            yr_construction = yr_construction1.find('div', {'class':'formControlReadonly'})
            yr_construction = yr_construction.get_text().strip()
            print('Year of Construction:', yr_construction)
            # Floor Area - clean scrape
            floor_area = dwelling_details.find('div', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_div_FloorArea'})
            floor_area = floor_area.get_text().strip()
            floor_area, floor_area_unit = floor_area.split()
            floor_area_unit = floor_area_unit.replace("(", "")
            floor_area_unit = floor_area_unit.replace(")", "")
            print('\nFloor Area:', floor_area)
            print('floor_area_unit:', floor_area_unit)
            # Wall Type - clean scrape
            wall_type1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_WallType'})
            wall_type = wall_type1.find('div', {'class':'formControlReadonly'})
            wall_type = wall_type.get_text().strip()
            print('Wall Type:', wall_type)
            # Glazing Type - clean scrape
            glazing_type1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_GlazingType'})
            glazing_type = glazing_type1.find('div', {'class':'formControlReadonly'})
            glazing_type = glazing_type.get_text().strip()
            print('Glazing Type:', glazing_type)
            # Percent Low Energy Lighting - clean scrape
            percent_low_energy_lighting1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_PercentLowEnergyLight'})
            percent_low_energy_lighting = percent_low_energy_lighting1.find('div', {'class':'formControlReadonly'})
            percent_low_energy_lighting = percent_low_energy_lighting.get_text().strip()
            print('% Low Energy Lighting:', percent_low_energy_lighting)
            # Space Heating Fuel - clean scrape
            space_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingFuel'})
            space_heating_fuel = space_heating_fuel1.find('div', {'class':'formControlReadonly'})
            space_heating_fuel = space_heating_fuel.get_text().strip()
            print('Space Heating Fuel:', space_heating_fuel)
            # Space Heating Efficiency - clean scrape
            space_heating_efficiency1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainSpaceHeatingEfficiency'})
            space_heating_efficiency = space_heating_efficiency1.find('div', {'class':'formControlReadonly'})
            space_heating_efficiency = space_heating_efficiency.get_text().strip()
            print('Space Heating Efficiency:', space_heating_efficiency)
            # Water Heating Fuel - clean scrape
            water_heating_fuel1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingFuel'})
            water_heating_fuel = water_heating_fuel1.find('div', {'class':'formControlReadonly'})
            water_heating_fuel = water_heating_fuel.get_text().strip()
            print('Water Heating Fuel:', water_heating_fuel)
            # Water Heating Efficiency - clean scrape
            water_heating_efficiency1 = dwelling_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfNasStructuralDetails_container_MainWaterHeatingEfficiency'})
            water_heating_efficiency = water_heating_efficiency1.find('div', {'class':'formControlReadonly'})
            water_heating_efficiency = water_heating_efficiency.get_text().strip()
            print('Water Heating Efficiency:', water_heating_efficiency)
            # Third Section
            assessor_details = soup.find('fieldset', {'id':'ctl00_DefaultContent_BERSearch_fsAssessor'})
            # Assessor Number - clean scrape
            assessor_num1 = assessor_details.find('span', {'id':'ctl00_DefaultContent_BERSearch_dfAssessor_container_AssessorNumber'})
            assessor_num = assessor_num1.find('div', {'class':'formControlReadonly'})
            assessor_num = assessor_num.get_text().strip()
            print('Assessor Number:', assessor_num)
            print('BER:', num)
            print('\n***************SCRAPE FINISHED***************\n')
            # Populate database
            print('\nRECONNECTING WITH DATABASE')
            with connection.cursor() as cursor:
                print('SUCCESSFUL CONNECTION')
                sql = ("INSERT INTO table1(BER_number, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
                cursor.execute(sql, (num, MPRN, address, BER_score, BER_actual_rating, BER_unit, emissions_indicator, emissions_indicator_unit, date_issue, floor_area, floor_area_unit, dwelling_type, num_stories, yr_construction, wall_type, assessor_num, water_heating_efficiency, glazing_type, percent_low_energy_lighting, space_heating_fuel, space_heating_efficiency, water_heating_fuel, type_of_rating))
                print('ROW POPULATED')
#Calling the function
web_scrape()
#Metadata
print('Gathering Details...')
Run_time = datetime.now() - startTime
print('Run Time:', Run_time)
#Loop Finished
print('\n***************PROGRAMME FINISHED***************')
You need to get new __EVENTVALIDATION tokens etc. for each post; you cannot just copy the values from your browser and hard-code them into your post data:
import requests
from bs4 import BeautifulSoup

url = 'https://ndber.seai.ie/pass/ber/search.aspx'
hit_list = [100100403, 100100965, 100101047, 100100874, 100100783]

def renew(s):
    soup = BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

with requests.session() as s:
    for num in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': num,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(renew(s))
        r = s.post(url, data=payload_1)
        # scrape the page
        soup = BeautifulSoup(r.content, 'html.parser')
If we run the code and parse a bit of what is returned, you can see we get each page correctly:
In [8]: with requests.session() as s:
   ...:     for num in hit_list:
   ...:         payload_1 = {
   ...:             'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': str(num),
   ...:             'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
   ...:         payload_1.update(renew(s))
   ...:         r = s.post(url, data=payload_1)
   ...:         soup = BeautifulSoup(r.content, 'html.parser')
   ...:         spans = soup.select("#ctl00_DefaultContent_BERSearch_gridRatings_gridview tr.GridRowStyle td span")
   ...:         print(spans)
   ...:
[<span>BER</span>, <span>10003467711</span>, <span>07-01-2009</span>, <span>24 CLONEE COURT\rMAIN STREET\rCLONEE\rCO. MEATH</span>]
[<span>BER</span>, <span>10301654014</span>, <span>26-11-2014</span>, <span>19 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10002082335</span>, <span>08-01-2009</span>, <span>8 CANNON PLACE\r1 HERBERT ROAD\rDUBLIN 4</span>]
[<span>BER</span>, <span>10301653940</span>, <span>18-01-2015</span>, <span>12 GORTANORA\rDINGLE\rCO. KERRY</span>]
[<span>BER</span>, <span>10010500405</span>, <span>07-01-2009</span>, <span>13 RENMORE ROAD\rGALWAY CITY</span>]
That gives you all the info from the table bar the BER cert number; you already have that, so you don't need to worry about it.
As you figured out, you just need to pass the data returned from the first post into your second payload. If you encapsulate the logic in functions it will also make your code a bit easier to manage:
def renew(soup):
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

def parse_data(soup):
    address = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_div_PublishingAddress").text.strip()
    MPRN = soup.select_one("#ctl00_DefaultContent_BERSearch_dfBER_container_MPRN div.formControlReadonly").text.strip()
    emissions_indicator, emissions_indicator_unit = soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_CDERValue").text.split()
    emissions_indicator_unit = emissions_indicator_unit.strip("()")
    BER_score, BER_actual_rating, BER_unit = soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_div_EnergyRating").text.split()
    BER_unit = BER_unit.strip("()")
    return {"MPRN": MPRN, "emissions_indicator": emissions_indicator,
            "emissions_indicator_unit": emissions_indicator_unit,
            "BER_score": BER_score, "BER_actual_rating": BER_actual_rating,
            "BER_unit": BER_unit, "address": address}

def submint_to_db(dct):
    with connection.cursor() as cursor:
        print('SUCCESSFUL CONNECTION')
        sql = "INSERT INTO table1 ( %s ) VALUES ( %s )" % (",".join(dct), ', '.join(['%s'] * len(dct)))
        cursor.execute(sql, dct.values())

payload_1 = {
    'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
payload_2 = {
    '__EVENTTARGET': 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
}

with requests.session() as s:
    tokens = renew(BeautifulSoup(requests.get(url).content, "html.parser"))
    for num in hit_list:
        # update the post data with new token values
        payload_1['ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber'] = num
        payload_1.update(tokens)
        r = s.post(url, data=payload_1)
        tokens2 = renew(BeautifulSoup(r.content, 'html.parser'))
        payload_2.update(tokens2)
        soup = BeautifulSoup(requests.post(url, data=payload_2).content, "html.parser")
        submint_to_db(parse_data(soup))
I have not parsed all the data, but the logic is the same for the rest; printing the dicts returned for what is parsed gives you:
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '57.83', 'address': '24 CLONEE COURTMAIN STREETCLONEECO. MEATH', 'BER_score': 'D1', 'BER_actual_rating': '235.54', 'MPRN': '10003467711'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '42.4', 'address': '19 GORTANORADINGLECO. KERRY', 'BER_score': 'C1', 'BER_actual_rating': '165.79', 'MPRN': '10301654014'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '34.03', 'address': '8 CANNON PLACE1 HERBERT ROADDUBLIN 4', 'BER_score': 'C2', 'BER_actual_rating': '175.32', 'MPRN': '10002082335'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '53.51', 'address': '12 GORTANORADINGLECO. KERRY', 'BER_score': 'C3', 'BER_actual_rating': '208.45', 'MPRN': '10301653940'}
{'BER_unit': 'kWh/m2/yr', 'emissions_indicator_unit': 'kgCO2/m2/yr', 'emissions_indicator': '121.54', 'address': '13 RENMORE ROADGALWAY CITY', 'BER_score': 'G', 'BER_actual_rating': '472.19', 'MPRN': '10010500405'}
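As an illustration of that last point, the Date of Issue field could be folded in the same way; the selector below is assembled from the container ID and the formControlReadonly class used in the question's code, so treat it as an untested assumption:

def parse_date_issue(soup):
    # Same pattern as the other fields: find the container, then the read-only div inside it.
    node = soup.select_one(
        "#ctl00_DefaultContent_BERSearch_dfBER_container_DateOfIssue div.formControlReadonly")
    return node.text.strip()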
@PadraicCunningham provided most of the logic for this answer, but as my comment below his answer describes, his solution only got me halfway.
I have been able to build on his work to solve the problem.
There was just one more step to complete, which was to 'click through' an intermediary page, which leads to where the data I wanted to scrape lies.
Apologies in advance for my non-standard labelling and formatting; I'm a beginner.
import requests
import pymysql.cursors
from pymysql import connect, err, sys, cursors
import sys
import time
import bs4
import time
from datetime import datetime
import openpyxl
hit_list = [100100403,100100965,100101047,100100874,100100783] #this is a sample list
#Open page
url = 'https://ndber.seai.ie/pass/ber/search.aspx'
def field_update(s):
    soup = bs4.BeautifulSoup(s.get(url).content, "html.parser")
    return {"__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}

print('field updated')

with requests.session() as s:
    for ber in hit_list:
        payload_1 = {
            'ctl00$DefaultContent$BERSearch$dfSearch$txtBERNumber': ber,
            'ctl00$DefaultContent$BERSearch$dfSearch$Bottomsearch': 'Search'}
        # update the post data with new token values
        payload_1.update(field_update(s))
        r = s.post(url, data=payload_1)
        # 'click through' intermediate page
        # THIS IS THE ADDITIONAL CODE THAT BUILDS ON PADRAIC'S ANSWER
        soup = bs4.BeautifulSoup(r.content, "html.parser")
        stage_two = {
            "__EVENTTARGET": 'ctl00$DefaultContent$BERSearch$gridRatings$gridview$ctl02$ViewDetails',
            "__VIEWSTATE": soup.select_one("#__VIEWSTATE")["value"],
            "__VIEWSTATEGENERATOR": soup.select_one("#__VIEWSTATEGENERATOR")["value"],
            "__EVENTVALIDATION": soup.select_one("#__EVENTVALIDATION")["value"]}
        q = s.post(url, data=stage_two)
        print('payload_2 posted')
        soup = bs4.BeautifulSoup(q.content, 'html.parser')
        print('\nBEGINNING SCRAPE....')
        # FOR DATA TO BE SCRAPED, SEE ORIGINAL QUESTION

Random "IndexError: list index out of range "

I am trying to scrape a site that returns its data via Javascript. The code I wrote using BeautifulSoup works pretty well, but at random points during scraping I get the following error:
Traceback (most recent call last):
File "scraper.py", line 48, in <module>
accessible = accessible[0].contents[0]
IndexError: list index out of range
Sometimes I can scrape 4 urls, sometimes 15, but at some point the script eventually fails and gives me the above error. I can find no pattern behind the failing, so I'm really at a loss here - what am I doing wrong?
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
countries = csv.reader(open("countries.csv", 'rb'), delimiter=",")
database = csv.writer(open("herdict_database.csv", 'w'), delimiter=',')
basepage = "https://www.herdict.org/explore/"
session_id = "indepth;jsessionid=C1D2073B637EBAE4DE36185564156382"
ccode = "#fc=IN"
end_date = "&fed=12/31/"
start_date = "&fsd=01/01/"
year_range = range(2009, 2011)
years = [str(year) for year in year_range]
def get_number(var):
    number = re.findall("(\d+)", var)
    if len(number) > 1:
        thing = number[0] + number[1]
    else:
        thing = number[0]
    return thing

def create_link(basepage, session_id, ccode, end_date, start_date, year):
    link = basepage + session_id + ccode + end_date + year + start_date + year
    return link

for ccode, name in countries:
    for year in years:
        link = create_link(basepage, session_id, ccode, end_date, start_date, year)
        print link
        html = jw.get_page(link)
        soup = BeautifulSoup(html, "lxml")
        accessible = soup.find_all("em", class_="accessible")
        inaccessible = soup.find_all("em", class_="inaccessible")
        accessible = accessible[0].contents[0]
        inaccessible = inaccessible[0].contents[0]
        acc_num = get_number(accessible)
        inacc_num = get_number(inaccessible)
        print acc_num
        print inacc_num
        database.writerow([name]+[year]+[acc_num]+[inacc_num])
        time.sleep(2)
You need to add error handling to your code. When scraping a lot of websites, some will be malformed or otherwise broken, and when that happens you'll end up trying to manipulate empty objects.
Look through the code, find all the places where you're assuming an operation succeeded, and check them for errors.
For that specific case, I would do this:
if not inaccessible or not accessible:
    # malformed page
    continue
soup.find_all("em", class_="accessible") is probably returning an empty list. You can try:
if accessible:
    accessible = accessible[0].contents[0]
or more generally:
if accessible and inaccessible:
    accessible = accessible[0].contents[0]
    inaccessible = inaccessible[0].contents[0]
else:
    print 'Something went wrong!'
    continue
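Putting the two suggestions together, a fuller sketch of the loop with the guard in place might look like the following; it keeps the question's Python 2 style and names, and the guard condition is the only new logic:

for ccode, name in countries:
    for year in years:
        link = create_link(basepage, session_id, ccode, end_date, start_date, year)
        html = jw.get_page(link)
        soup = BeautifulSoup(html, "lxml")
        accessible = soup.find_all("em", class_="accessible")
        inaccessible = soup.find_all("em", class_="inaccessible")
        if not accessible or not inaccessible:
            # page came back without the expected elements: log it and move on
            print "skipping", link
            time.sleep(2)
            continue
        acc_num = get_number(accessible[0].contents[0])
        inacc_num = get_number(inaccessible[0].contents[0])
        database.writerow([name] + [year] + [acc_num] + [inacc_num])
        time.sleep(2)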
