I'm working on a web scraping project in Python and trying to add automated testing w/ Pytest. I'm not new to web scraping, but I'm very new to testing, and I believe the idea here is that I should mock the HTTP request and replace it with some dummy html fixture code, to test whether the rest of the function works without having to rely on requesting anything from the actual url.
Below is my web scraping function.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
def get_player_stats_data():
    """
    Web Scrape function w/ BS4 that grabs aggregate season stats

    Args:
        None

    Returns:
        Pandas DataFrame of Player Aggregate Season stats
    """
    try:
        year_stats = 2022
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]
        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]
        stats = pd.DataFrame(player_stats, columns=headers)
        print(
            f"General Stats Extraction Function Successful, retrieving {len(stats)} updated rows"
        )
        return stats
    except BaseException as error:
        print(f"General Stats Extraction Function Failed, {error}")
        df = []
        return df
And here is what I'm using to grab the raw html of the page and save it to a file, so I can load it back in for testing.
import pickle
from bs4 import BeautifulSoup
from urllib.request import urlopen
year_stats = 2022
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
html = urlopen(url)
# how you save it
with open('new_test/tests/fixture_csvs/stats_html.html', 'wb') as fp:
    while True:
        chunk = html.read(1024)
        if not chunk:
            break
        fp.write(chunk)
# how you open it
with open('new_test/tests/fixture_csvs/stats_html.html', "rb") as fp:
    stats_html = fp.read()
My question is: how do I mock/patch/monkeypatch the urlopen(url) call and use the saved html in its place to create a fixture? The Pytest docs example creates a class and monkeypatches requests.get(), where get is an attribute of requests, which seems a little different from what I'm doing. I haven't been able to get mine working; I think I'm supposed to use something other than monkeypatch.setattr? Below is what I tried.
@pytest.fixture(scope="session")
def player_stats_data_raw(monkeypatch):
    """
    Fixture to load web scrape html from an html file for testing.
    """
    fname = os.path.join(
        os.path.dirname(__file__), "fixture_csvs/stats_html.html"
    )
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen():
        return html

    monkeypatch.setattr(urlopen, "url", mock_urlopen)
    df = get_player_stats_data()
    return df
### The actual tests in a separate file
def test_raw_stats_rows(player_stats_data_raw):
    assert len(player_stats_data_raw) == 30

def test_raw_stats_schema(player_stats_data_raw):
    assert list(player_stats_data_raw.columns) == raw_stats_cols
The goal is to replace html = urlopen(url) in the web scraping function with this pickled html I've previously saved.
The other option is to turn that url into an input parameter for the function, where in production I just call the actual url as you see here (www.basketballreference.com/etc), and in testing I just read in that saved value (roughly sketched below). That's an option, but I'm curious to learn & apply this patching technique to a real example. If anyone has any thoughts I'd appreciate it!
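For context, one way that dependency-injection alternative could look (just a sketch; here the html source is injected and the real url stays as the default behaviour):

def get_player_stats_data(html=None):
    """Same parsing logic, but the html source can be injected for testing."""
    if html is None:
        year_stats = 2022
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
        html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    ...  # the rest of the parsing stays exactly the same

In production you call get_player_stats_data() with no argument, and in a test you pass in the saved html bytes.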
In your test file, you could try like this:
import os
import pytest
from module.script import get_player_stats_data

# requires the pytest-mock plugin (it provides `mocker`); `mocker` is
# function-scoped, so this fixture can't be session-scoped
@pytest.fixture()
def urlopen(mocker):
    fname = os.path.join(os.path.dirname(__file__), "fixture_csvs/stats_html.html")
    with open(fname, "rb") as fp:
        html = fp.read()
    urlopen = mocker.patch("module.script.urlopen")
    urlopen.return_value = html
    return urlopen

def test_raw_stats_rows(urlopen):
    df = get_player_stats_data()
    assert len(df) == 30

def test_raw_stats_schema(urlopen):
    df = get_player_stats_data()
    assert list(df.columns) == raw_stats_cols  # raw_stats_cols defined elsewhere in your tests
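If you'd rather stick with pytest's built-in monkeypatch (no pytest-mock), the trick is the same: patch the name urlopen where your scraping module looks it up, i.e. module.script.urlopen, rather than setting an attribute on the function itself. A minimal sketch, assuming the same module path as above:

import os
import pytest
import module.script  # the module that does `from urllib.request import urlopen`
from module.script import get_player_stats_data

@pytest.fixture()  # monkeypatch is function-scoped, so no scope="session" here
def player_stats_data_raw(monkeypatch):
    """Load the saved html and patch urlopen so no network call is made."""
    fname = os.path.join(os.path.dirname(__file__), "fixture_csvs/stats_html.html")
    with open(fname, "rb") as fp:
        html = fp.read()

    def mock_urlopen(url):  # same call signature as the real call site
        return html          # BeautifulSoup happily accepts the raw bytes

    monkeypatch.setattr(module.script, "urlopen", mock_urlopen)
    return get_player_stats_data()

The two tests from your question can then take player_stats_data_raw exactly as you wrote them.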
I want to scrape multiple pages of a website using Python, but I'm getting a Remote Connection closed error.
Here is my code
import pandas as pd
url_link = 'https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p={}&selectedItem=viewAllAwardedContracts.do'
LIST = []
for number in range(1,5379):
    url = url_link.format(number)
    dframe = pd.read_html(url, header=None)[0]
    LIST.append(dframe)
Result_df = pd.concat(LIST)
Result_df.to_csv('Taneps_contracts.csv')
Any idea how to solve it?
For me, just using requests to fetch the html before passing it to read_html gets the data. I just edited your code to:
import pandas as pd
import requests

url_link = 'https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p={}&selectedItem=viewAllAwardedContracts.do'
LIST = []
for number in range(1,5379):
    url = url_link.format(number)
    r = requests.get(url)  # getting page -> html in r.text
    dframe = pd.read_html(r.text, header=None)[0]
    LIST.append(dframe)
Result_df = pd.concat(LIST)
Result_df.to_csv('Taneps_contracts.csv')
I didn't even have to add headers, but if this isn't enough for you (i.e., if the program breaks or if you don't end up with 53770+ rows), try adding convincing headers or using something like HTMLSession instead of directly calling requests.get...
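For reference, a minimal sketch of what "convincing headers" plus a reused session could look like (the User-Agent string below is just an example, not something the site requires):

import pandas as pd
import requests

url_link = 'https://www.taneps.go.tz/epps/viewAllAwardedContracts.do?d-3998960-p={}&selectedItem=viewAllAwardedContracts.do'
headers = {
    # an ordinary desktop browser User-Agent; any realistic value should do
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

LIST = []
with requests.Session() as s:      # one connection reused across all pages
    s.headers.update(headers)
    for number in range(1, 5379):
        r = s.get(url_link.format(number))
        LIST.append(pd.read_html(r.text)[0])

pd.concat(LIST).to_csv('Taneps_contracts.csv')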
NEW TO PYTHON*** Below is the code I am using to pull a zip file from a website, but I am getting the error "list index out of range". I was given this code by someone else who wrote it, but I had to change the URL and now I am getting the error. When I print(list_of_documents) it is blank.
Can someone help me with this? The url requires access, so you won't be able to run this code directly. I am trying to understand how BeautifulSoup is used here and how I can get the list to populate correctly.
import datetime
import requests
import csv
from zipfile import ZipFile as zf
import os
import pandas as pd
import time
from bs4 import BeautifulSoup
import pyodbc
import re
#set download location
downloads_folder = r"C:\Scripts\"
##### Creating outage dataframe
#Get list of download links
res = requests.get('https://www.ercot.com/mp/data-products/data-product-details?id=NP3-233-CD')
ercot_soup = BeautifulSoup(res.text, "lxml")
list_of_documents = ercot_soup.findAll('td', attrs={'class': 'labelOptional_ind'})
list_of_links = ercot_soup.select('a')
##create the url for the download
loc = str(list_of_links[0])[9:len(str(list_of_links[0]))-9]
link = 'http://www.ercot.com' + loc
link = link.replace('amp;','')
# Define file name and set download path
file_name = str(list_of_documents[0])[30:len(str(list_of_documents[0]))-5]
file_path = downloads_folder + '/' + file_name
You can't expect code tailored to scrape one website to work for a different link! You should always inspect and explore your target site, especially the parts you need to scrape, so you know the tag names [like td and a here] and identifying attributes [like name, id, class, etc.] of the elements you need to extract data from.
With this site, if you want the info from the reportTable, it gets generated after the page gets loaded with javascript, so it wouldn't show up in the request response. You could either try something like Selenium, or you could try retrieving the data from the source itself.
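If you go the Selenium route, a minimal sketch might look like the following (assuming Chrome and a matching chromedriver are available; the td class comes from your own code, everything else here is an assumption about the page):

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Chrome()
driver.get('https://www.ercot.com/mp/data-products/data-product-details?id=NP3-233-CD')
# wait until the javascript has rendered at least one document cell
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'td.labelOptional_ind'))
)
ercot_soup = BeautifulSoup(driver.page_source, 'html.parser')
list_of_documents = ercot_soup.find_all('td', attrs={'class': 'labelOptional_ind'})
driver.quit()

But since the data ultimately comes from a plain request anyway, the approach below avoids a browser entirely.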
If you open the network tab in your browser's dev tools, you'll find the request that actually retrieves the data for the table, and if you inspect the table's html, you'll find above it the scripts that generate that data.
In the suggested solution below, getReqUrl scrapes your link to get the url for requesting the reports (and also the template of the url for downloading the documents).
def getReqUrl(scrapeUrl):
    res = requests.get(scrapeUrl)
    ercot_soup = BeautifulSoup(res.text, "html.parser")

    script = [l.split('"') for l in [
        s for s in ercot_soup.select('script')
        if 'reportListUrl' in s.text
        and 'reportTypeID' in s.text
    ][0].text.split('\n') if l.count('"') == 2]

    rtID = [l[1] for l in script if 'reportTypeID' in l[0]][0]
    rlUrl = [l[1] for l in script if 'reportListUrl' in l[0]][0]
    rdUrl = [l[1] for l in script if 'reportDownloadUrl' in l[0]][0]

    return f'{rlUrl}{rtID}&_={int(time.time())}', rdUrl
(I couldn't figure out how to scrape the last query parameter [the &_=... part] from the site exactly, but {int(time.time())} seems to get close enough; the results are the same even when that last bit is omitted entirely, so it's totally optional.)
The url returned can be used to request the documents:
import json

url = 'https://www.ercot.com/mp/data-products/data-product-details?id=NP3-233-CD'
reqUrl, ddUrl = getReqUrl(url)
reqRes = requests.get(reqUrl).text  # getReqUrl returns the full request url as a string
rsJson = json.loads(reqRes)

for doc in rsJson['ListDocsByRptTypeRes']['DocumentList']:
    d = doc['Document']
    downloadLink = ddUrl + d['DocID']
    # print(f"{d['FriendlyName']} {d['PublishDate']} {downloadLink}")
    print(f"Download '{d['ConstructedName']}' at\n\t {downloadLink}")

print(len(rsJson['ListDocsByRptTypeRes']['DocumentList']))
The printed output will list each document's constructed name with its download link, followed by the total number of documents in the list.
I am very new to web scraping. I have started using BeautifulSoup in Python. I wrote code that loops through a list of urls and gets the data I need. The code works fine for 10-12 links, but I am not sure if the same code will be effective if the list has over 100 links. Is there an alternative way, or another library, to get the data from a large list of urls without harming the website in any way? Here is my code so far.
from requests import get
from bs4 import BeautifulSoup

url_list = [url1, url2, url3, url4, url5]
mylist = []
for l in url_list:
    url = l
    res = get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    data = soup.find('pre').text
    mylist.append(data)
Here's an example that may work for you.
from simplified_scrapy import Spider, SimplifiedDoc, SimplifiedMain, utils

class MySpider(Spider):
    name = 'my_spider'
    start_urls = ['url1']
    # refresh_urls = True  # If you want to download the downloaded link again, please remove the "#" in the front

    def __init__(self):
        # If your link is stored elsewhere, read it out here.
        self.start_urls = utils.getFileLines('you url file name.txt')
        Spider.__init__(self, self.name)  # Necessary

    def extract(self, url, html, models, modelNames):
        doc = SimplifiedDoc(html)
        data = doc.select('pre>text()')  # Extract the data you want.
        return {'Urls': None, 'Data': {'data': data}}  # Return the data to the framework, which will save it for you.

SimplifiedMain.startThread(MySpider())  # Start download
You can see more examples here, as well as the source code of Library simplified_scrapy: https://github.com/yiyedata/simplified-scrapy-demo
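If you'd rather stay with requests + BeautifulSoup instead of adopting a framework, a minimal sketch that reuses one connection and spaces out the requests might look like this (the one-second delay is just an example value, not something the site prescribes):

import time
import requests
from bs4 import BeautifulSoup

url_list = [url1, url2, url3, url4, url5]  # your (possibly much longer) list of urls
mylist = []

with requests.Session() as session:    # one connection reused for every url
    for url in url_list:
        res = session.get(url)
        res.raise_for_status()         # fail loudly instead of parsing an error page
        soup = BeautifulSoup(res.text, 'html.parser')
        mylist.append(soup.find('pre').text)
        time.sleep(1)                  # be polite: pause between requests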
I am using 3 modules in this program, and I don't know if what I'm trying to do is even possible! I want to scrape some data off of Twitter and write it to a text file using Python. Can somebody please guide me and tell me why my code isn't writing the scraped data?
import urllib
import urllib.request
from os import path
from bs4 import BeautifulSoup
# here I define the url, I request the page, create my soup
theurl = "https://twitter.com/realDonaldTrump"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
def create_file(dest):
    """
    Creates a file for the user to write data in!
    :param dest:
    :return:
    """
    ## FileName == Month_Day_Year
    name = 'Data Scraped.txt'
    if not (path.isfile(dest + name)):
        f = open(dest + name, "w")
        f.write(soup.title.text)
        f.close()

if __name__ == '__main__':
    destination = 'C:\\Users\\edwin\\' \
                  'Desktop\\WebScrappin\\'
    create_file(destination)
    print("Your file has been created!!")
You're only writing the title of the document that you received:
f.write(soup.title.text)
Instead of scraping (which is against their ToS), you should gather your data from their RESTful API, or use a library like Twython.
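For example, a minimal Twython sketch might look like the following (you'd need to register an app with Twitter to get real credentials; the key values below are placeholders):

from twython import Twython

# placeholders: replace with the credentials of the app you register with Twitter
APP_KEY = 'your-app-key'
APP_SECRET = 'your-app-secret'
OAUTH_TOKEN = 'your-access-token'
OAUTH_TOKEN_SECRET = 'your-access-token-secret'

twitter = Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)

# fetch recent tweets from the timeline and write their text to the file
tweets = twitter.get_user_timeline(screen_name='realDonaldTrump', count=20)
with open('Data Scraped.txt', 'w', encoding='utf-8') as f:
    for tweet in tweets:
        f.write(tweet['text'] + '\n')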
I'm writing a web scraper. I could've just used scrapy but decided to write it from scratch so I can practice.
I've created a scraper that works successfully using requests and BeautifulSoup. It navigates through about 135 pages with 12 items on each, grabs each item's link, and then grabs the information from the link destination. At the end it writes everything to a CSV file. It only grabs strings and doesn't download any images or anything like that… for now.
Problem? It's quite slow. It takes about 5 seconds to grab everything from the contents of one page, so that times 135 is about 11 minutes.
So my question is how do I implement threading in my code so it gets data way faster.
Here's the code:
import requests
from bs4 import BeautifulSoup
import re
import csv
def get_actor_dict_from_html(url, html):
    soup = BeautifulSoup(html, "html.parser")

    # There must be a better way to handle this, but let's assign a NULL value to all upcoming variables.
    profileName = profileImage = profileHeight = profileWeight = 'NULL'

    # Let's get the name and image..
    profileName = str.strip(soup.find('h1').get_text())
    profileImage = "http://images.host.com/actors/" + re.findall(r'\d+', url)[0] + "/actor-large.jpg"

    # Now the rest of the stuff..
    try:
        profileHeight = soup.find('a', {"title": "Height"}).get_text()
    except:
        pass
    try:
        profileWeight = soup.find('a', {"title": "Weight"}).get_text()
    except:
        pass

    return {
        'Name': profileName,
        'ImageUrl': profileImage,
        'Height': profileHeight,
        'Weight': profileWeight,
    }

def lotta_downloads():
    output = open("/tmp/export.csv", 'w', newline='')
    wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
    wr.writeheader()

    for i in range(135):
        url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
        response = requests.get(url)
        html = response.content
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all("div", {"class": "card-image"})
        for a in links:
            for url in a.find_all('a'):
                url = "http://www.host.com" + url['href']
                print(url)
                response = requests.get(url)
                html = response.content
                actor_dict = get_actor_dict_from_html(url, html)
                wr.writerow(actor_dict)
    print('All Done!')

if __name__ == "__main__":
    lotta_downloads()
Thanks!
Why don't you try the gevent library?
The gevent library has a monkey patch that turns blocking functions into non-blocking ones.
Most of the time is probably spent waiting on the requests, so making the requests non-blocking should make your program much faster.
An example, on Python 2.7.10:
import csv

import gevent
from gevent import monkey; monkey.patch_all()  # patch the standard library as early as possible
import requests
from bs4 import BeautifulSoup

# get_actor_dict_from_html is the function from your question
actor_dict_list = []

def worker(url):
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all('div', {'class': 'card-image'})
    for a in links:
        for link in a.find_all('a'):
            actor_url = "http://www.host.com" + link['href']
            response = requests.get(actor_url)  # You can also use gevent spawn function on this line
            ...
            actor_dict_list.append(get_actor_dict_from_html(actor_url, response.content))  # collect results in a list to avoid concurrent writes to the csv

output = open("/tmp/export.csv", "w", newline='')
wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
wr.writeheader()

urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]
jobs = [gevent.spawn(worker, url) for url in urls]
gevent.joinall(jobs)

for actor_dict in actor_dict_list:
    wr.writerow(actor_dict)
The public gevent documentation has more details.
P.S.
If you're on Ubuntu, you may need to install python-gevent first:
sudo apt-get install python-gevent
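If you'd rather not add a dependency at all, the standard library's concurrent.futures gives you a thread pool with very little code. A minimal sketch of fetching the listing pages in parallel, reusing get_actor_dict_from_html from the question (the worker count of 10 is just an example):

import csv
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

def scrape_listing_page(i):
    """Fetch one listing page and return the actor dicts found on it."""
    url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    actors = []
    for card in soup.find_all("div", {"class": "card-image"}):
        for a in card.find_all("a"):
            actor_url = "http://www.host.com" + a["href"]
            actor_html = requests.get(actor_url).content
            actors.append(get_actor_dict_from_html(actor_url, actor_html))
    return actors

with ThreadPoolExecutor(max_workers=10) as pool:               # 10 threads is an arbitrary choice
    results = list(pool.map(scrape_listing_page, range(135)))  # keeps page order

with open("/tmp/export.csv", "w", newline="") as output:
    wr = csv.DictWriter(output, ["Name", "ImageUrl", "Height", "Weight"], delimiter=",")
    wr.writeheader()
    for page_actors in results:
        for actor in page_actors:
            wr.writerow(actor)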