I'm writing a web scraper. I could've just used scrapy but decided to write it from scratch so I can practice.
I've created a scraper that works successfully using requests and BeautifulSoup. It navigates through about 135 pages with 12 items on each, grabs the link and then grabs the information from the link destination. At the end it writes everything in a CSV file. It only grabs strings and it doesn't download any images or anything like that… for now.
Problem? It's quite slow. It takes about 5 secs to grab the everything just from contents of one page so that times 135 is about 11 minutes.
So my question is how do I implement threading in my code so it gets data way faster.
Here's the code:
import requests
from bs4 import BeautifulSoup
import re
import csv
def get_actor_dict_from_html(url, html):
soup = BeautifulSoup(html, "html.parser")
#There must be a better way to handle this, but let's assign a NULL value to all upcoming variables.
profileName = profileImage = profileHeight = profileWeight = 'NULL'
#Let's get the name and image..
profileName = str.strip(soup.find('h1').get_text())
profileImage = "http://images.host.com/actors/" + re.findall(r'\d+', url)[0] + "/actor-large.jpg"
#Now the rest of the stuff..
try:
profileHeight = soup.find('a', {"title": "Height"}).get_text()
except:
pass
try:
profileWeight = soup.find('a', {"title": "Weight"}).get_text()
except:
pass
return {
'Name': profileName,
'ImageUrl': profileImage,
'Height': profileHeight,
'Weight': profileWeight,
}
def lotta_downloads():
output = open("/tmp/export.csv", 'w', newline='')
wr = csv.DictWriter(output, ['Name','ImageUrl','Height','Weight'], delimiter=',')
wr.writeheader()
for i in range(135):
url = "http://www.host.com/actors/all-actors/name/{}/".format(i)
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html, "html.parser")
links = soup.find_all("div", { "class" : "card-image" })
for a in links:
for url in a.find_all('a'):
url = "http://www.host.com" + url['href']
print(url)
response = requests.get(url)
html = response.content
actor_dict = get_actor_dict_from_html(url, html)
wr.writerow(actor_dict)
print('All Done!')
if __name__ == "__main__":
lotta_downloads()
Thanks!
Why don't you try to use gevent library?
gevent library has monkey patch making blocking function to non-blocking function.
Maybe wait time of requests is too much and so slow.
So I think that Making request as non-blocking function make your program fast.
On python 2.7.10
example:
import gevent
from gevent import monkey; monkey.patch_all() # Fix import code
import reqeusts
actor_dict_list = []
def worker(url):
content = requests.get(url).content
bs4.BeautifulSoup(content)
links = soup.find_all('div', {'class': 'card-image'})
for a in links:
for url in a.find_all('a'):
response = requests.get(url) # You can also use gevent spawn function on this line
...
actor_dict_list.append(get_actor_dict_from_html(url, html)) # Because of preventing race condition
output = open("/tmp/export.csv", "w", newline='')
wr = csv.DictWriter(output, ['Name', 'ImageUrl', 'Height', 'Weight'], delimiter=',')
wr.writeheader()
urls = ["http://www.host.com/actors/all-actors/name/{}/".format(i) for i in range(135)]
jobs = [gevent.spawn(worker, url) for url in urls]
gevent.joinall(jobs)
for i in actor_dict_list:
wr.writerow(actor_dict)
public gevent document: doc
P.S.
You must install python-gevent If you have ubuntu OS
sudo apt-get install python-gevent
Related
I am using PyCharm to capture some data from web and push it into in-memory database-table on SQLite. I have debugged the code, it works fine, in the debugger I can see data being fetched, it being pushed into db[table] location.
Python code is as below -
import requests
import dataset
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def begin():
db = dataset.connect('sqlite:///quotes.db')
authors_seen = set()
base_url = 'http://quotes.toscrape.com/'
def clean_url(url):
# Clean '/author/Steve-Martin' to 'Steve-Martin'
# Use urljoin to make an absolute URL
url = urljoin(base_url, url)
# Use urlparse to get out the path part
path = urlparse(url).path
# Now split the path by '/' and get the second part
# E.g. '/author/Steve-Martinvisual studio' -> ['','author', 'Steve-Martin']
return path.split('/')[2]
def scrape_quotes(html_soup):
for quote in html_soup.select('div.quote'):
quote_text = quote.find(class_='text').get_text(strip=True)
quote_author_url = clean_url(quote.find(class_='author').find_next_sibling('a').get('href'))
quote_tag_urls = [clean_url(a.get('href')) for a in quote.find_all('a', class_='tag')]
authors_seen.add(quote_author_url)
# Store this quote and its tags
quote_id = db['quotes'].insert({'text' : quote_text, 'author' : quote_author_url})
db['quotes_tags'].insert_many([{'quote_id' : quote_id, 'tag_id' : tag} for tag in quote_tag_urls])
def scrape_author(html_soup, author_id):
author_name = html_soup.find(class_='author-title').get_text(strip=True)
author_born_date = html_soup.find(class_='author-born-date').get_text(strip=True)
author_born_loc = html_soup.find(class_='author-born-location').get_text(strip=True)
author_desc = html_soup.find(class_='author-description').get_text(strip=True)
db['authors'].insert({'author_id': author_id, 'name': author_name,
'born_date': author_born_date, 'born_location': author_born_loc,
'description': author_desc})
# Start by scraping all the quote pages
print('*****Beginning scraping process - quotes first.*****')
url = base_url
while True:
print('Now scraping page:', url)
r = requests.get(url)
html_soup = BeautifulSoup(r.text, 'html.parser')
# Scrape the quotes
scrape_quotes(html_soup)
# Is there a next page?
next_a = html_soup.select('li.next > a')
if not next_a or not next_a[0].get('href'):
break
url = urljoin(url, next_a[0].get('href'))
# Now fetch out the author information
print('*****Scraping authors data.*****')
for author_id in authors_seen:
url = urljoin(base_url, '/author/' + author_id)
print('Now scraping author:', url)
r = requests.get(url)
html_soup = BeautifulSoup(r.text, 'html.parser')
# Scrape the author information
scrape_author(html_soup, author_id)
db.commit()
db.close()
What I am struggling with is the pycharm IDE connection. As shown in the figure below, I can see quotes.sqlite database. It has only one table listed - sqlite_master. Under server objects there are collations, modules and routines, which is part of infrastructure provided by SQLite.
Also, when I view the db object (python's driver to SQLite) in debugger, I can see the relevant table as shown in the picture below -
Any ideas why PyCharm refuses to show relevant table/collection in the IDE?
I'm working on a web scraping project in Python and trying to add automated testing w/ Pytest. I'm not new to web scraping but I'm very new to testing, and I believe the idea here is I should mock the HTTP request and replacing it with some dummy html fixture code to test if the rest of the function works without having to rely on requesting anything from the actual url.
Below is my web scraping function.
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
def get_player_stats_data():
"""
Web Scrape function w/ BS4 that grabs aggregate season stats
Args:
None
Returns:
Pandas DataFrame of Player Aggregate Season stats
"""
try:
year_stats = 2022
url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")
headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
headers = headers[1:]
rows = soup.findAll("tr")[1:]
player_stats = [
[td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
]
stats = pd.DataFrame(player_stats, columns=headers)
print(
f"General Stats Extraction Function Successful, retrieving {len(stats)} updated rows"
)
return stats
except BaseException as error:
print(f"General Stats Extraction Function Failed, {error}")
df = []
return df
And here is what I'm using to grab the raw html of the page, and pickling it so I can save it and import it for testing.
import pickle
from bs4 import BeautifulSoup
from urllib.request import urlopen
year_stats = 2022
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
html = urlopen(url)
# how you save it
with open('new_test/tests/fixture_csvs/stats_html.html', 'wb') as fp:
while True:
chunk = html.read(1024)
if not chunk:
break
fp.write(chunk)
# how you open it
with open('new_test/tests/fixture_csvs/stats_html.html', "rb") as fp:
stats_html = fp.read()
My question is how do I mock/patch/monkeypatch the urlopen(url) call and use the pickled html in its place to create a fixture with it? The Pytest docs example is creating a class & monkeypatching requests.get() where get is an attribute of requests which seems a little different from what i'm doing, and I haven't been able to get mine working, I think i'm supposed to use something other than monkeypatch.setattr? Below is what I tried.
#pytest.fixture(scope="session")
def player_stats_data_raw(monkeypatch):
"""
Fixture to load web scrape html from an html file for testing.
"""
fname = os.path.join(
os.path.dirname(__file__), "fixture_csvs/stats_html.html"
)
with open(fname, "rb") as fp:
html = fp.read()
def mock_urlopen():
return html
monkeypatch.setattr(urlopen, "url", mock_urlopen)
df = get_player_stats_data()
return df
### The actual tests in a separate file
def test_raw_stats_rows(player_stats_data_raw):
assert len(player_stats_data_raw) == 30
def test_raw_stats_schema(player_stats_data_raw):
assert list(player_stats_data_raw.columns) == raw_stats_cols
The goal is to replace html = urlopen(url) in the web scraping function with this pickled html I've previously saved.
The other option is to turn that url into an input parameter for the function, where in production I just call the actual url as you see here (www.basketballreference.com/etc), and in testing I just read in that pickled value. That's an option but I'm curious to learn & apply this patching technique to a real example. If anyone has any thoughts I'd appreciate it!
In your test file, you could try like this:
from module.script import get_player_stats_data
#pytest.fixture(scope="session")
def urlopen(mocker):
with open(fname, "rb") as fp:
html = fp.read()
urlopen = mocker.patch("module.script.urlopen")
urlopen.return_value = html
return urlopen
def test_raw_stats_rows(urlopen):
df = get_player_stats_data()
assert len(df) == 30
def test_raw_stats_schema(urlopen):
df = get_player_stats_data()
assert list(df.columns) == raw_stats_cols
I want to scrape a website and its sub-pages, but it is taking too long. How can I optimize the request or use an alternative solution?
Below is the code I am using. It takes 10s for just loading the Google home page. So it's clearly not scalable if I were to give it 280 links
from selenium import webdriver
import time
# prepare the option for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')
# start chrome browser
browser = webdriver.Chrome("/usr/lib/chromium-browser/chromedriver" ,chrome_options=options)
start=time.time()
browser.get('http://www.google.com/xhtml')
print(time.time()-start)
browser.quit()
Use python requests and Beautiful soup module.
import requests
from bs4 import BeautifulSoup
url="https://tajinequiparle.com/dictionnaire-francais-arabe-marocain/"
url1="https://tajinequiparle.com/dictionnaire-francais-arabe-marocain/{}/"
req = requests.get(url,verify=False)
soup = BeautifulSoup(req.text, 'html.parser')
print("Letters : A")
print([item['href'] for item in soup.select('.columns-list a[href]')])
letters=['B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
for letter in letters:
req = requests.get(url1.format(letter), verify=False)
soup = BeautifulSoup(req.text, 'html.parser')
print('Letters : ' + letter)
print([item['href'] for item in soup.select('.columns-list a[href]')])
you can use that script for the speed. multithread crawler better than all:
https://edmundmartin.com/multi-threaded-crawler-in-python/
After that you must change that code:
def run_scraper(self):
with open("francais-arabe-marocain.csv", 'a') as file:
file.write("url")
file.writelines("\n")
for i in range(50000):
try:
target_url = self.to_crawl.get(timeout=600)
if target_url not in self.scraped_pages and "francais-arabe-marocain" in target_url:
self.scraped_pages.add(target_url)
job = self.pool.submit(self.scrape_page, target_url)
job.add_done_callback(self.post_scrape_callback)
df = pd.DataFrame([{'url': target_url}])
df.to_csv(file, index=False, header=False)
print(target_url)
except Empty:
return
except Exception as e:
print(e)
continue
If url include "francais-arabe-marocain" save urls in a csv file.
After that you can scrape that urls in one for loop reading csv line by line with same way
try to use urllib just like this
import urllib.request
start=time.time()
page = urllib.request.urlopen("https://google.com/xhtml")
print(time.time()-start)
it took only 2s. However, it depends also on the quality of connection you have
My Threading works fine only when fetch() is not called from validate(). But in this scenario below it returns
RuntimeError: There is no current event loop in thread
in all the threads 0 - 99 what am I doing wrong here ?
from threading import Thread
import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
def fetch():
#fetch data from another site
session = HTMLSession()
url='http://url'
data = session.get(url)
data.html.render()
content = data.html.html
soup = BeautifulSoup(content, "html.parser")
iban = soup.find('p',{"id":"demo"})
return result.text
def validate():
url = "https://url"
payload = {
"data" : fetch(),
"veto" : "fi"
}
response = requests.post(url, data=payload)
soup = BeautifulSoup(response.text, "html.parser")
data = soup.body.find(text='contact')
if(data):
print (data)
else:
print ("no data")
if __name__ == "__main__":
threads = []
for i in range(100):
# We start one thread per url present.
process = Thread(target=validate)
process.start()
threads.append(process)
From a quick search of the error, I found this Github issue that seems to show your problem and its solution.
It looks like you need to use asyncio and, at the beginning of each running thread, call asyncio.set_event_loop(asyncio.new_event_loop()).
I've written a script in python using proxies to scrape the links of different posts traversing different pages of a webpage. I've tried to make use of proxies from a list. The script is supposed to take random proxies from the list and send request to that website and finally parse the items. However, if any proxy is not working then it should be kicked out from the list.
I thought the way I've used number of proxies and list of urls within ThreadPool(10).starmap(make_requests, zip(proxyVault,lead_url)) is accurate but it doesn't produce any results; rather, the script gets stuck.
How can I pass the proxies and the links to the ThreadPool in order for the script to produce results?
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from multiprocessing.pool import ThreadPool
from itertools import cycle
import random
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = ["https://stackoverflow.com/questions/tagged/web-scraping?sort=newest&page={}&pagesize=15".format(page) for page in range(1,6)]
proxyVault = ['104.248.159.145:8888', '113.53.83.252:54356', '206.189.236.200:80', '218.48.229.173:808', '119.15.90.38:60622', '186.250.176.156:42575']
def make_requests(proxyVault,lead_url):
while True:
random.shuffle(proxyVault)
global pitem
pitem = cycle(proxyVault)
proxy = {'https':'http://{}'.format(next(pitem))}
try:
res = requests.get(lead_url,proxies=proxy)
soup = BeautifulSoup(res.text,"lxml")
[get_title(proxy,urljoin(base_url,item.get("href"))) for item in soup.select(".summary .question-hyperlink")]
except Exception:
try:
proxyVault.pop(0)
make_requests(proxyVault,lead_url)
except Exception:pass
def get_title(proxy,itemlink):
res = requests.get(itemlink,proxies=proxy)
soup = BeautifulSoup(res.text,"lxml")
print(soup.select_one("h1[itemprop='name'] a").text)
if __name__ == '__main__':
ThreadPool(10).starmap(make_requests, zip(proxyVault,lead_url))
Btw, the proxies used above are just placeholders.
The problems with your code was that it was creating a lot of endless loops in the thread. Also they way you handled the proxies was a bit strange to me, so I changed it.
I also think you had misunderstood how data was sent to the threads, they get one one element of the iterable, not the whole thing. So I changed some names to reflect that.
The way it works now is that each thread gets their own url from lead_url, then they choose a random proxy from the proxyVault.
They fetch the webpage and parse it and calls get_title on each of the parsed links.
If the request fails because of the proxy, that proxy is removed from the list so its not used again and make_requests is called again, which will randomly choose a new proxy from the ones that are still available.
I did not change the actual parsing, because I can't judge if it's what you want or not.
Runnable code:
https://repl.it/#zlim00/unable-to-pass-proxies-and-links-to-the-threadpool-to-get-re
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool
from random import choice
import requests
from urllib.parse import urljoin
base_url = 'https://stackoverflow.com/questions/tagged/web-scraping'
lead_url = [f'https://stackoverflow.com/questions/tagged/web-scraping?sort='
f'newest&page={page}&pagesize=15' for page in range(1, 6)]
proxyVault = ['36.67.57.45:53367', '5.202.150.233:42895',
'85.187.184.129:8080', '109.195.23.223:45947']
def make_requests(url):
proxy_url = choice(proxyVault)
proxy = {'https': f'http://{proxy_url}'}
try:
res = requests.get(url, proxies=proxy)
soup = BeautifulSoup(res.text, "lxml")
[get_title(proxy, urljoin(base_url, item.get("href")))
for item in soup.select(".summary .question-hyperlink")]
except requests.exceptions.ProxyError:
# Check so that the bad proxy was not removed by another thread
if proxy_url in proxyVault:
proxyVault.remove(proxy_url)
print(f'Removed bad proxy: {proxy_url}')
return make_requests(url)
def get_title(proxy, itemlink):
res = requests.get(itemlink, proxies=proxy)
soup = BeautifulSoup(res.text, "lxml")
print(soup.select_one("h1[itemprop='name'] a").text)
if __name__ == '__main__':
ThreadPool(10).map(make_requests, lead_url)
Maybe you can use another approach to get proxies like this
def get_proxy():
url = 'https://free-proxy-list.net/anonymous-proxy.html'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', attrs={'id': 'proxylisttable'})
table_body = table.find('tbody')
proxies = table_body.find_all('tr')
proxy_row = random.choice(proxies).find_all('td')
return proxy_row[0].text + ':' + proxy_row[1].text