Python - save requests or BeautifulSoup object locally

I have some code that is quite long, so it takes a long time to run. I want to simply save either the requests object (in this case "name") or the BeautifulSoup object (in this case "soup") locally so that next time I can save time. Here is the code:
from bs4 import BeautifulSoup
import requests
url = 'SOMEURL'
name = requests.get(url)
soup = BeautifulSoup(name.content)

Since name.content is just HTML, you can just dump this to a file and read it back later.
Usually the bottleneck is not the parsing, but instead the network latency of making requests.
from bs4 import BeautifulSoup
import requests
url = 'https://google.com'
name = requests.get(url)
# name.content is bytes, so write in binary mode
with open("/tmp/A.html", "wb") as f:
    f.write(name.content)
# read it back in
with open("/tmp/A.html", "rb") as f:
    soup = BeautifulSoup(f, "html.parser")
# do something with soup
Here is some anecdotal evidence that the bottleneck is in the network.
from bs4 import BeautifulSoup
import requests
import time
url = 'https://google.com'
t1 = time.perf_counter()
name = requests.get(url)
t2 = time.perf_counter()
soup = BeautifulSoup(name.content, "html.parser")
t3 = time.perf_counter()
print(t2 - t1, t3 - t2)
Output from running on a Thinkpad X1 Carbon with a fast campus network:
0.11 0.02
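
If the goal is to skip the network request entirely on later runs, the two snippets above can be combined into a small cache-or-fetch helper. This is just a sketch: the cache path and URL are placeholders, and it assumes a stale copy of the page is acceptable.
import os
import requests
from bs4 import BeautifulSoup

def get_soup(url, cache_path):
    # Fetch the page only if there is no cached copy on disk yet.
    if not os.path.exists(cache_path):
        response = requests.get(url)
        with open(cache_path, "wb") as f:
            f.write(response.content)  # raw bytes, so the original encoding is preserved
    with open(cache_path, "rb") as f:
        return BeautifulSoup(f, "html.parser")

soup = get_soup("https://google.com", "/tmp/A.html")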

Storing requests locally and restoring them as BeautifulSoup objects later on
If you are iterating through the pages of a website, you can store each page with requests as shown here.
Create a folder soupCategory in the same folder as your script.
Use any recent user agent string for the headers:
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15'}

def getCategorySoup():
    session = requests.Session()
    retry = Retry(connect=7, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    basic_url = "https://www.somescrappingdomain.com/apartments?adsWithImages=1&page="
    t0 = time.time()
    totalPages = 1525  # put your number of pages here
    for i in range(1, totalPages):
        url = basic_url + str(i)
        r = session.get(url, headers=headers)  # use the session so the retry adapter applies
        pageName = "./soupCategory/" + str(i) + ".html"
        with open(pageName, mode='w', encoding='UTF-8', errors='strict', buffering=1) as f:
            f.write(r.text)
        print(pageName, end=" ")
    t1 = time.time()
    total = t1 - t0
    print("Total time for getting", totalPages, "category pages is", round(total), "seconds")
    return
Later on you can create the BeautifulSoup object, as merlin2011 mentioned, with:
with open("./soupCategory/1.html", encoding='UTF-8') as f:
    soup = BeautifulSoup(f, "html.parser")
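Once the pages are on disk you can rebuild the soups in a loop without touching the network. A minimal sketch, assuming the ./soupCategory/ layout created above:
import glob
from bs4 import BeautifulSoup

def iter_category_soups(folder="./soupCategory"):
    # Yield (path, soup) pairs for every saved category page.
    for path in sorted(glob.glob(folder + "/*.html")):
        with open(path, encoding="UTF-8") as f:
            yield path, BeautifulSoup(f, "html.parser")

for path, soup in iter_category_soups():
    print(path, soup.title.string if soup.title else "(no title)")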

Related

Scraping with Beautiful Soup does not update values properly

I am trying to web-scrape a weather website but the data does not update properly. The code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
while True:
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    data = soup.find("div", {"class": "weather__text"})
    print(data.text)
I am looking at 'WIND & WIND GUST' in the 'CURRENT CONDITIONS' section. It prints the first values correctly (for example 1.0 / 2.2 mph), but after that the values update very slowly (at times 5+ minutes pass) even though they change every 10-30 seconds on the website.
And when the values do update in Python, they are still different from the current values on the website.
You could try this alternate method: since the site actually retrieves the data from another URL, you can make that request directly and only scrape the site every hour or so to refresh the request URL.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
from datetime import datetime, timedelta
#def getReqUrl...
reqUrl = getReqUrl()
prevTime, prevAt = '', datetime.now()
while True:
    ures = json.loads(urlopen(reqUrl).read())
    if 'observations' not in ures:
        reqUrl = getReqUrl()
        ures = json.loads(urlopen(reqUrl).read())
    # to see time since last update
    obvTime = ures['observations'][0]['obsTimeUtc']
    td = (datetime.now() - prevAt).seconds
    wSpeed = ures['observations'][0]['imperial']['windSpeed']
    wGust = ures['observations'][0]['imperial']['windGust']
    print('', end=f'\r[+{td}s -> {obvTime}]: {wGust} / {wSpeed} mph')
    if prevTime < obvTime:
        prevTime = obvTime
        prevAt = datetime.now()
        print('')
Even when making the request directly, the "observation time" in the retrieved data sometimes jumps around, which is why I'm only printing on a fresh line when obvTime increases. (If you prefer, you can just print normally without the '', end='\r...' format, and then the second if block is no longer necessary either.)
The first if block is for refreshing reqUrl (because it expires after a while), which is when I actually scrape the wunderground site, because the url is inside one of their script tags:
def getReqUrl():
    url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    appText = soup.select_one('#app-root-state').text
    nxtSt = json.loads(appText.replace('&q;','"'))['wu-next-state-key']
    return [
        ns for ns in nxtSt.values()
        if 'observations' in ns['value'] and
        len(ns['value']['observations']) == 1
    ][0]['url'].replace('&a;','&')
or, since I know how the url starts, more simply like:
def getReqUrl():
    url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
    soup = BeautifulSoup(urlopen(url), 'html.parser')
    appText = soup.select_one('#app-root-state').text
    rUrl = 'https://api.weather.com/v2/pws/observations/current'
    rUrl = rUrl + appText.split(rUrl)[1].split('&q;')[0]
    return rUrl.replace('&a;','&')
Try:
import requests
from bs4 import BeautifulSoup
url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
session = requests.Session()
r = session.get(url, timeout=30, headers=headers)  # print(r.status_code)
soup = BeautifulSoup(r.content, 'html.parser')
# 'WIND & WIND GUST' in 'CURRENT CONDITIONS' section
wind_gust = [float(i.text) for i in soup.select_one('.weather__header:-soup-contains("WIND & GUST")').find_next('div', class_='weather__text').select('span.wu-value-to')]
print(wind_gust)
# [1.8, 2.2]
wind = wind_gust[0]
gust = wind_gust[1]
print(wind)
# 1.8
print(gust)
# 2.2
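If you want to repeat that check on a schedule, the extraction can be wrapped in a loop with a pause between requests. This is only a sketch built on the selector above; keep in mind, as the question notes, that the page itself may refresh only every few minutes, so short intervals will mostly return the same values.
import time
import requests
from bs4 import BeautifulSoup

url = 'https://www.wunderground.com/dashboard/pws/KORPISTO1'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}

with requests.Session() as session:
    while True:
        r = session.get(url, timeout=30, headers=headers)
        soup = BeautifulSoup(r.content, 'html.parser')
        header = soup.select_one('.weather__header:-soup-contains("WIND & GUST")')
        if header:
            values = [float(i.text) for i in header.find_next('div', class_='weather__text').select('span.wu-value-to')]
            print(values)  # e.g. [1.8, 2.2]
        time.sleep(60)  # be polite; adjust the interval to taste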

Limited number of scraped data?

I am scraping a website and everything seems to work fine from today's news back to news published in 2015/2016. For older years I am not able to scrape news.
Could you please tell me if anything has changed?
I should get 672 pages of titles and snippets from this page:
https://catania.liveuniversity.it/attualita/
but I only get approximately 158.
The code that I am using is:
import bs4, requests
import pandas as pd
import re
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
page_num=1
website="https://catania.liveuniversity.it/attualita/"
while True:
    r = requests.get(website, headers=headers)
    soup = bs4.BeautifulSoup(r.text, 'html')
    title=soup.find_all('h2')
    date=soup.find_all('span', attrs={'class':'updated'})
    if soup.find_all('a', attrs={'class':'page-numbers'}):
        website = f"https://catania.liveuniversity.it/attualita/page/{page_num}"
        page_num +=1
        print(page_num)
    else:
        break
df = pd.DataFrame(list(zip(dates, titles)),
                  columns =['Date', 'Titles'])
I think there have been some changes in the tags (for example in the next-page button, or just in the date/title tags).
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def main(req, num):
    r = req.get(
        "https://catania.liveuniversity.it/attualita/page/{}/".format(num))
    soup = BeautifulSoup(r.content, 'html.parser')
    try:
        data = [(x.select_one("span.updated").text, x.findAll("a")[1].text, x.select_one("div.entry-content").get_text(strip=True)) for x in soup.select(
            "div.col-lg-8.col-md-8.col-sm-8")]
        return data
    except AttributeError:
        print(r.url)
        return False

with ThreadPoolExecutor(max_workers=30) as executor:
    with requests.Session() as req:
        fs = [executor.submit(main, req, num) for num in range(1, 673)]
        allin = []
        for f in fs:
            f = f.result()
            if f:
                allin.extend(f)
        df = pd.DataFrame.from_records(
            allin, columns=["Date", "Title", "Content"])
        print(df)
        df.to_csv("result.csv", index=False)

Python : How to stay logged in while scraping?

Just to clarify from the beginning: I'm a total beginner (I wrote something in Python for the first time today). This was more a matter of following a guide and trying to remember what I did 7 years ago when I tried learning Java than anything else.
I wanted to scrape the image tags from a website (to plot them later) but I have to stay logged in to view all images. After I got the scraping down I noticed that some tags were blocked, so the issue of logging in came up. I now manage to log in, but it doesn't work outside of the session itself, which makes the rest of my code useless. Can I get this to work, or do I have to give up?
This is the working login:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
login_data = {
    'user' : 'theusername',
    'pass' : 'thepassword',
    'op' : 'Log in'
}
with requests.Session() as s:
    url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
    r = s.get(url)
    r = s.post(url, data=login_data)
And what I had working before to scrape the website but with the login missing:
filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)
pid = 0
actual_page = 1
while pid < 150:
url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
print(url)
client = urlopen(url)
page_html = client.read()
client.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div",{"class":"thumbnail-preview"})
print("Current pid: " + str(pid))
for container in containers:
tags = container.span.a.img["title"]
f.write(tags.replace(" ", "\n") + "\n")
pid = pid + 42
print("Current page: " + str(actual_page))
actual_page += 1
print("Done.")
f.close()
Out comes a list of every tag used by high res images.
I hope I don't offend anyone with this.
Edit: The code is working now, had a cookie typo:
import requests
from bs4 import BeautifulSoup as soup
login_data = {
    'user' : 'myusername',
    'pass' : 'mypassword',
    'op' : 'Log in'
}
s = requests.Session()
print("\n\n\n\n\n")
filename = "taglist.txt"
f = open(filename, "w", encoding="utf-8")
headers = "tags\n"
f.write(headers)
pid = 0
actual_page = 1
while pid < 42:
    url2 = "https://thiswebsite.com/index.php?page=post&s=list&tags=rating:questionable&pid=" + str(pid)
    r = s.get(url2, cookies={'duid' : 'somehash', 'user_id' : 'my userid', 'pass_hash' : 'somehash'})
    page_html = str(r.content)
    page_soup = soup(page_html, "html.parser")
    containers = page_soup.findAll("div",{"class":"thumbnail-preview"})
    for container in containers:
        tags = container.span.a.img["title"]
        f.write(tags.replace(" ", "\n") + "\n")
    print("\nCurrent page: " + str(actual_page) + " Current pid: " + str(pid) + "\nDone.")
    actual_page += 1
    pid = pid + 42
f.close()
You use two different libraries for making web requests right now: requests and urllib. I would opt for using only requests.
Also, don't use Session() as a context manager. Context managers are used to do some cleanup after leaving the indented block; that is the with ... as x syntax you use on the requests.Session() object. In the context of requests this will clear the cookies as you leave the session. (I assume the login is managed by cookies on this site.)
Instead, keep the session in a variable that you can use for subsequent requests, as it stores your cookies from the login. You need them for the later requests.
s = requests.Session()
url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
r = s.get(url) # do you need this request?
r = s.post(url, data=login_data)
Also make the subsequent call in the loop with the session:
client = s.get(url)
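Putting the two pieces together, the whole flow might look like the sketch below. The URLs, form fields, and tag selectors are the placeholders from the question, not a real site:
import requests
from bs4 import BeautifulSoup

login_data = {'user': 'theusername', 'pass': 'thepassword', 'op': 'Log in'}

s = requests.Session()  # keep the session object around; it stores the login cookies
login_url = "https://thatwebsite.com/index.php?page=account&s=login&code=00"
s.post(login_url, data=login_data)

pid = 0
while pid < 150:
    url = "https://thatwebsite.com/index.php?page=post&s=list&tags=absurdres&pid=" + str(pid)
    r = s.get(url)  # same session, so the login cookies are sent automatically
    page_soup = BeautifulSoup(r.content, "html.parser")
    for container in page_soup.find_all("div", {"class": "thumbnail-preview"}):
        print(container.span.a.img["title"])
    pid += 42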

Trying to get leads from yelp

I'm trying to get leads from Yelp using Python and BeautifulSoup, but I'm not able to capture the fields for phone, name, address and website (optional).
I'm getting the error shown below. I searched and found different solutions, but they didn't work for me.
Here is my code:
from bs4 import BeautifulSoup
import requests
import sys
import csv
import requests, re, json
## Get the min and max page numbers
pagenum=0
maxpage =0
## loop through the pages
while pagenum <= maxpage:
    newsu =pagenum
    newsu = str(newsu)
    csvname = 'cardealers'+newsu+'.csv';
    csvfile = open(csvname , 'w',encoding="utf-8")
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Business name', 'phone' , 'address'] )
    headers = {'User-Agent':'Mozilla/5.0'}
    r = requests.get('https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum), headers = headers)
    p = re.compile(r'PRELOADED_STATE__ = (.*?);')
    data = json.loads(p)
    print(data)
    pagenum =pagenum+1
    for item in data['searchResult']['results']:
        name = item['businessName']
        phone=item['phone']
        address= ([item['address'],item['city'], item['state'], item['postalcode']])
        csv_writer.writerow([name, phone , address ])
        print(name)
    csvfile.close()
Here is the error message:
Traceback (most recent call last):
  File "\Python\Python36\scraper\scrape.py", line 22, in <module>
    data = json.loads(p)
  File "\Python\Python36\lib\json\__init__.py", line 348, in loads
    'not {!r}'.format(s.__class__.__name__))
TypeError: the JSON object must be str, bytes or bytearray, not 'SRE_Pattern'
You are trying to read in a string that is not in JSON format.
Essentially, this is what you are doing:
data = json.loads('THIS IS JUST A STRING. NOT IN A JSON FORMAT')
So you want to do something like data = json.loads(p.findall(r.text)).
You actually need to pull that out of the HTML. The other MAJOR issue, though, is that it is not even within the HTML you are pulling, so the pattern will always return an empty list.
Also, you are not really iterating through anything. You start at pagenum=0 with maxpage=0 and run while pagenum <= maxpage, which means the loop only runs once.
The JSON structure with the data is in the HTML, but it looks like it's within the comments, so you'll need to parse that instead.
Also, why do:
newsu =pagenum
newsu = str(newsu)
Simply do newsu = str(pagenum). Do you really want a separate file for each iteration? I just put it into one file:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import math
## Get the min and max page numbers
pagenum=0
results = pd.DataFrame()
with requests.Session() as s:
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36'}
    url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
    r = s.get(url, headers = headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    scripts = soup.find_all('script')
    for script in scripts:
        if '<!--{' in script.text:
            jsonStr = script.text.split('<!--')[-1].split('-->')[0]
            jsonData = json.loads(jsonStr)
            totalPages = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['totalResults']
            resultsPerPage = jsonData['searchPageProps']['searchResultsProps']['paginationInfo']['resultsPerPage']
            totalPages = math.ceil(totalPages/resultsPerPage)
    ## loop through the pages
    for pagenum in range(0,totalPages+1):
        url = 'https://www.yelp.com/search?find_desc=Used%20Car%20Dealers&find_loc=New%20York%2C%20NY&ns=1&sortby=review_count&start={}'.format(pagenum)
        r = s.get(url, headers = headers)
        soup = BeautifulSoup(r.text, 'html.parser')
        scripts = soup.find_all('script')
        for script in scripts:
            if '<!--{' in script.text:
                jsonStr = script.text.split('<!--')[-1].split('-->')[0]
                jsonData = json.loads(jsonStr)
                for each in jsonData['searchPageProps']['searchResultsProps']['searchResults']:
                    if 'searchResultBusiness' in each.keys():
                        busiName = each['searchResultBusiness']['name']
                        phone = each['searchResultBusiness']['phone']
                        address = each['searchResultBusiness']['formattedAddress']
                        temp_df = pd.DataFrame([[busiName, phone, address]], columns=['Business name', 'phone' , 'address'])
                        results = results.append(temp_df, sort=False).reset_index(drop=True)
        print('Acquired page: %s' %pagenum)
results.to_csv('cardealers.csv', index=False)
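One caveat if you run this on a recent pandas: DataFrame.append was deprecated and removed in pandas 2.0. Collecting the rows in a plain list and building the frame once at the end does the same job; the row below is a made-up placeholder just to show the shape.
import pandas as pd

rows = []
# inside the scraping loop, instead of results.append(temp_df, ...):
rows.append({'Business name': 'Example Motors', 'phone': '(555) 555-0100',
             'address': '123 Example St, New York, NY'})  # placeholder row for illustration
results = pd.DataFrame(rows, columns=['Business name', 'phone', 'address'])
results.to_csv('cardealers.csv', index=False)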

Python mechanize javascript

I'm trying to use mechanize to grab prices for New York's metro-north railroad from this site:
http://as0.mta.info/mnr/fares/choosestation.cfm
The problem is that when you select the first option, the site uses javascript to populate your list of possible destinations. I have written equivalent code in python, but I can't seem to get it all working. Here's what I have so far:
import mechanize
import cookielib
from bs4 import BeautifulSoup
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
br.open("http://as0.mta.info/mnr/fares/choosestation.cfm")
br.select_form(name="form1")
br.form.set_all_readonly(False)
origin_control = br.form.find_control("orig_stat", type="select")
origin_control_list = origin_control.items
origin_control.value = [origin_control.items[0].name]
destination_control_list = reFillList(0, origin_control_list)
destination_control = br.form.find_control("dest_stat", type="select")
destination_control.items = destination_control_list
destination_control.value = [destination_control.items[0].name]
response = br.submit()
response_text = response.read()
print response_text
I know I didn't give you the code for the reFillList() method, because it's long, but assume it correctly creates a list of mechanize.option objects. Python doesn't complain about anything, but on submit I get the HTML for this alert:
"Fare information for travel between two lines is not available on-line. Please contact our Customer Information Center at 511 and ask to speak to a representative for further information."
Am I missing something here? Thanks for all the help!
If you know the station IDs, it is easier to POST the request yourself:
import mechanize
import urllib
post_url = 'http://as0.mta.info/mnr/fares/get_fares.cfm'
orig = 295 #BEACON FALLS
dest = 292 #ANSONIA
params = urllib.urlencode({'dest_stat':dest, 'orig_stat':orig })
rq = mechanize.Request(post_url, params)
fares_page = mechanize.urlopen(rq)
print fares_page.read()
If you have the code to find the list of destination IDs for a given starting ID (i.e. a variant of reFillList()), you can then run this request for each combination:
import mechanize
import urllib, urllib2
from bs4 import BeautifulSoup
url = 'http://as0.mta.info/mnr/fares/choosestation.cfm'
post_url = 'http://as0.mta.info/mnr/fares/get_fares.cfm'
def get_fares(orig, dest):
    params = urllib.urlencode({'dest_stat':dest, 'orig_stat':orig })
    rq = mechanize.Request(post_url, params)
    fares_page = mechanize.urlopen(rq)
    print(fares_page.read())
pool = BeautifulSoup(urllib2.urlopen(url).read())
#let's keep our stations organised
stations = {}
# dict by station id
for option in pool.find('select', {'name':'orig_stat'}).findChildren():
    stations[option['value']] = {'name':option.string}
#iterate over all routes
for origin in stations:
    destinations = get_list_of_dests(origin) #use your code for this
    stations[origin]['dests'] = destinations
    for destination in destinations:
        print('Processing from %s to %s' % (origin, destination))
        get_fares(origin, destination)
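
If you prefer plain requests over mechanize and urllib2 (for example on Python 3), the same POST can be made like this. A sketch using the station IDs from the first answer:
import requests

post_url = 'http://as0.mta.info/mnr/fares/get_fares.cfm'
params = {'orig_stat': 295,  # BEACON FALLS
          'dest_stat': 292}  # ANSONIA
r = requests.post(post_url, data=params)
print(r.text)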
