Scraping the metadata of 10,000 websites is too slow (Python) - python

Hi all,
I'm trying to parse the metadata of 10,000 websites into a Pandas dataframe for an SEO / analytics application, but the code is taking ages. I've been trying it on 1,000 websites and the code has been running for the last 3 hours (it works without a problem on 10-50 websites).
Here's the sample data:
index    site
0        http://www.google.com
1        http://www.youtube.com
2        http://www.facebook.com
3        http://www.cnn.com
...      ...
10000    http://www.sony.com
Here's my Python (2.7) code:
# Importing dependencies
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import metadata_parser

# Loading the Pandas dataframe
df = pd.read_csv('final_urls')

# Utility functions
def meta(website, metadata):
    full_url = website
    parser = metadata_parser.MetadataParser(url=full_url)
    if metadata == 'all':
        return parser.metadata
    else:
        return parser.metadata[metadata]

def meta_all(website):
    try:
        result = meta(website, 'all')
    except BaseException:
        result = 'Exception'
    return result

# Main
df['site'].apply(meta_all)
I'd like the code to be much faster. I've been using the metadata_parser library (https://github.com/jvanasco/metadata_parser) which relies heavily on requests and BeautifulSoup.
I understand I might be able to switch the parser to lxml to make the code faster. lxml is already installed on my machine, so BeautifulSoup should pick it up as the primary choice.
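If I end up parsing pages myself, my understanding is that I can request the lxml parser explicitly instead of relying on auto-detection, along these lines (the URL here is just one from my list):
import requests
from bs4 import BeautifulSoup

html = requests.get('http://www.google.com', timeout=10).text
soup = BeautifulSoup(html, 'lxml')  # ask for the lxml parser explicitly
print(soup.title)
That said, my understanding is that the parser choice only saves CPU time, and the real cost is making 10,000 network requests one after another.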
Do you have any suggestion to get this code to run faster?
Thanks!

You can use Python Twisted (an event-driven networking engine written in Python). You will need to install a few packages with pip: twisted, pyopenssl and service_identity, and possibly others. This code works on Python 2.7, which you say you are using.
from twisted.internet import defer, reactor
from twisted.web.client import getPage
import metadata_parser
import pandas as pd
import numpy as np
from multiprocessing import Process

def pageCallback(result, url):
    data = {
        'content': result,
        'url': url,
    }
    return data

def getPageData(url):
    d = getPage(url)
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=data['content'], search_head_only=False)
            print(parser.metadata)  # do something with it here

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk):
    start(chunk)
    reactor.run()

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()

chunkCounter = 0
chunkLength = 1000

for chunk in np.array_split(urls, len(urls)/chunkLength):
    p = Process(target=processStart, args=(chunk,))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))
I have run it on 10,000 URLs and it took less than 16 minutes.
Updated
Normally you would process the data where I added the comment "# do something with it here". If you want the generated data returned for further processing, you can do something like this (I have also updated the code to use treq):
from twisted.internet import defer, reactor
import treq
import metadata_parser
import pandas as pd
import numpy as np
import multiprocessing
from twisted.python import log
import sys

# log.startLogging(sys.stdout)

results = []

def pageCallback(result, url):
    # result is the treq response; content() returns a Deferred that fires
    # with the body bytes, so chain it rather than storing the Deferred itself
    d = result.content()
    d.addCallback(lambda content: {'content': content, 'url': url})
    return d

def getPageData(url):
    d = treq.get(url, timeout=60, headers={'User-Agent': ["Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"]})
    d.addCallback(pageCallback, url)
    return d

def listCallback(result):
    global results
    for isSuccess, data in result:
        if isSuccess:
            print("Call to %s succeeded " % (data['url']))
            parser = metadata_parser.MetadataParser(html=str(data['content']), search_head_only=False)
            # print(parser.metadata)  # do something with it here
            results.append((data['url'], parser.metadata))

def finish(ign):
    reactor.stop()

def start(urls):
    data = []
    for url in urls:
        data.append(getPageData(url))
    dl = defer.DeferredList(data)
    dl.addCallback(listCallback)
    dl.addCallback(finish)

def processStart(chunk, returnList):
    start(chunk)
    reactor.run()
    returnList.extend(results)

df = pd.read_csv('final_urls')
urls = df['site'].values.tolist()

chunkCounter = 0
chunkLength = 1000

manager = multiprocessing.Manager()
returnList = manager.list()

for chunk in np.array_split(urls, len(urls)/chunkLength):
    p = multiprocessing.Process(target=processStart, args=(chunk, returnList))
    p.start()
    p.join()
    chunkCounter += 1
    print("Finished chunk %s of %s URLs" % (str(chunkCounter), str(chunkLength)))

for res in returnList:
    print(res)
print(len(returnList))
You may also want to add some error handling; to help with that you can uncomment the line reading "log.startLogging(sys.stdout)", but that is too much detail for one answer. If some URLs fail, I would generally retry them by running the code again with just the failed URLs, a few times if necessary.
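If you want to collect those failed URLs automatically rather than eyeballing the log, one option (a sketch on top of the code above, not something I have benchmarked) is to pair each DeferredList result with its URL, since DeferredList preserves the input order:
failed_urls = []

def listCallback(result, urls):
    # result[i] corresponds to urls[i] because DeferredList preserves input order
    for (isSuccess, data), url in zip(result, urls):
        if isSuccess:
            parser = metadata_parser.MetadataParser(html=str(data['content']), search_head_only=False)
            results.append((url, parser.metadata))
        else:
            failed_urls.append(url)  # data is a twisted.python.failure.Failure here

def start(urls):
    deferreds = [getPageData(url) for url in urls]
    # consumeErrors=True keeps Twisted from logging every failure as unhandled
    dl = defer.DeferredList(deferreds, consumeErrors=True)
    dl.addCallback(listCallback, urls)  # pass the URL list through to the callback
    dl.addCallback(finish)
Because each chunk runs in its own process, failed_urls would need to be shared back the same way as returnList (for example a second manager.list()) before the parent process can re-run the loop with just those URLs.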

Related

ThreadPoolExecutor: threads (futures) do not release memory when completed and the results are yielded

I am facing memory issues when downloading large data sets from paginated API responses in Python.
When I tried to parallelize the download of multiple pages using ThreadPoolExecutor, I noticed that the finished and resolved futures do not release their memory footprint.
I tried to simplify it in the following two examples. The first one downloads all pages using ThreadPoolExecutor with max_workers set to 1 (as far as I understand, this should have the same memory footprint as a simple loop):
from random import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import gc

TOTAL_PAGES = 60

def download_data(page: int = 1) -> list[float]:
    # Send a request to some resource to get data
    print(f"Downloading page {page}.")
    return [random() for _ in range(1000000)]  # mock some large data sets

def threadpool_memory_test():
    processed_pages = 0
    with ThreadPoolExecutor(max_workers=1) as executor:
        future_to_page = {
            executor.submit(download_data, page): page for page in range(1, TOTAL_PAGES + 1)
        }
        for future in as_completed(future_to_page):
            records = future.result()
            # Do something with the downloaded data..
            processed_pages += 1
            print(f"Downloaded page: {processed_pages} / {TOTAL_PAGES} (number: {future_to_page[future]}) with {len(records)} records.")
            gc.collect()  # just to be sure gc is called

if __name__ == "__main__":
    threadpool_memory_test()
However, when running this script and plotting the memory footprint, the memory usage keeps growing: the futures do not release their memory even when they are looped through with as_completed and their results are obtained.
When I download and process the pages in a simple loop, the memory footprint is as expected:
from random import random

TOTAL_PAGES = 60

def download_data(page: int = 1) -> list[float]:
    # Send a request to some resource to get data
    print(f"Downloading page {page}.")
    return [random() for _ in range(1000000)]  # mock some large data sets

def loop_memory_test():
    for page in range(1, TOTAL_PAGES + 1):
        records = download_data(page)
        # Do something with the downloaded data..
        print(f"Downloaded page: {page} / {TOTAL_PAGES} with {len(records)} records.")

if __name__ == "__main__":
    loop_memory_test()
The memory footprint of that script stays flat, as expected.
Is there a way to release the memory of a future once its result has already been obtained?
I am testing this on macOS Monterey version 12.5 (21G72).
Based on Stuart's comment I updated the script and it now works as expected (it is also 10x faster and uses a fraction of the memory):
from random import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import gc

TOTAL_PAGES = 60

def download_data(page: int = 1) -> list[float]:
    # Send a request to some resource to get data
    print(f"Downloading page {page}.")
    return [random() for _ in range(1000000)]  # mock some large data sets

def threadpool_memory_test():
    processed_pages = 0
    with ThreadPoolExecutor(max_workers=1) as executor:
        future_to_page = {
            executor.submit(download_data, page): page for page in range(1, TOTAL_PAGES + 1)
        }
        for future in as_completed(future_to_page):
            records = future.result()
            page = future_to_page.pop(future)
            # Do something with the downloaded data..
            processed_pages += 1
            print(f"Downloaded page: {processed_pages} / {TOTAL_PAGES} (number: {page}) with {len(records)} records.")
            gc.collect()  # just to be sure gc is called

if __name__ == "__main__":
    threadpool_memory_test()
It boils down to this one line:
page = future_to_page.pop(future)
which makes sure the reference to the completed future is removed from the dictionary, so it can be garbage collected.
The memory footprint now stays low, as expected.
Thank you!

Python multi-threading method

I've heard that Python multi-threading is a bit tricky, and I am not sure of the best way to go about implementing what I need. Let's say I have a function called IO_intensive_function that makes an API call which may take a while to return a response.
Say the process of queuing jobs can look something like this:
import thread
for job_args in jobs:
    thread.start_new_thread(IO_intensive_function, (job_args,))  # args must be a tuple
Would IO_intensive_function now just execute its task in the background and allow me to queue up more jobs?
I also looked at this question, where the approach seems to be to just do the following:
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(2)
results = pool.map(IO_intensive_function, jobs)
I don't need those tasks to communicate with each other; the only goal is to send my API requests as fast as possible. Is this the most efficient way? Thanks.
Edit:
The way I am making the API request is through a Thrift service.
I had to create code to do something similar recently. I've tried to make it generic below. Note I'm a novice coder, so please forgive the inelegance. What you may find valuable, however, is some of the error processing I found it necessary to embed to capture disconnects, etc.
I also found it valuable to perform the JSON processing in a threaded manner. You already have the threads working for you, so why go "serial" again for a processing step when you can extract the info in parallel?
It is possible I have mis-coded something in making it generic. Please don't hesitate to ask follow-ups and I will clarify.
import time
import requests
from multiprocessing.dummy import Pool as ThreadPool
from src_code.config import Config

with open(Config.API_PATH + '/api_security_key.pem') as f:
    my_key = f.read().rstrip("\n")

base_url = "https://api.my_api_destination.com/v1"
headers = {"Authorization": "Bearer %s" % my_key}

itm = list()
itm.append(base_url)
itm.append(headers)

def call_API(call_var):
    base_url = call_var[0]
    headers = call_var[1]
    call_specific_tag = call_var[2]
    endpoint = f'/api_path/{call_specific_tag}'

    connection_tries = 0
    for i in range(3):
        try:
            dat = requests.get((base_url + endpoint), headers=headers).json()
        except:
            connection_tries += 1
            print(f'Call for {call_specific_tag} failed after {i} attempt(s). Pausing for 240 seconds.')
            time.sleep(240)
        else:
            break

    tag = list()
    vars_to_capture_01 = list()
    vars_to_capture_02 = list()
    connection_tries = 0

    try:
        if 'record_id' in dat:
            vars_to_capture_01.append(dat['record_id'])
            vars_to_capture_02.append(dat['second_item_of_interest'])
        else:
            vars_to_capture_01.append(call_specific_tag)
            print(f'Call specific tag {call_specific_tag} is unavailable. Successful pull.')
            vars_to_capture_02.append(-1)
    except:
        print(f'{call_specific_tag} is unavailable. Unsuccessful pull.')
        vars_to_capture_01.append(call_specific_tag)
        vars_to_capture_02.append(-1)
        time.sleep(240)

    pack = list()
    pack.append(vars_to_capture_01)
    pack.append(vars_to_capture_02)
    return pack

vars_to_capture_01 = list()
vars_to_capture_02 = list()

i = 0
max_i = len(all_tags)  # all_tags: the list of call-specific tags, defined elsewhere
while i < max_i:
    ind_rng = range(i, min((i + 10), (max_i)), 1)
    itm_lst = (itm.copy())
    call_var = [itm_lst + [all_tags[q]] for q in ind_rng]
    # packed = call_API(call_var[0])  # for testing the function without pooling
    pool = ThreadPool(len(call_var))
    packed = pool.map(call_API, call_var)
    pool.close()
    pool.join()
    for pack in packed:
        try:
            vars_to_capture_01.append(pack[0][0])
        except:
            print(f'Unpacking error for {all_tags[i]}.')
        vars_to_capture_02.append(pack[1][0])
    i += 10  # advance to the next batch of 10 tags
For network API requests you can use asyncio. Have a look at this article https://realpython.com/python-concurrency/#asyncio-version for an example of how to implement it.
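For a concrete idea of the shape of that approach, here is a minimal sketch using asyncio with aiohttp, assuming plain HTTP endpoints (the question mentions Thrift, so the URLs and the concurrency limit of 10 are only placeholders):
import asyncio
import aiohttp

async def fetch(session, url, sem):
    # limit concurrency so we don't open too many connections at once
    async with sem:
        async with session.get(url) as resp:
            return url, await resp.text()

async def main(urls):
    sem = asyncio.Semaphore(10)  # at most 10 requests in flight
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, url, sem) for url in urls))

if __name__ == "__main__":
    urls = ["https://example.com", "https://httpbin.org/get"]
    for url, body in asyncio.run(main(urls)):
        print(url, len(body))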

Python requests module multi threading

Is there a way to speed up my code using the multiprocessing interface?
I have a data array that includes passwords, and I would like to run several requests together.
import requests

data = ['test', 'test1', 'test2']
counter = 0
for x in data:
    counter += 1
    burp0_data = "<methodCall>\r\n<methodName>wp.getUsersBlogs</methodName>\r\n<params>\r\n<param><value>zohar</value></param>\r\n<param><value>" + x + "</value></param>\r\n</params>\r\n</methodCall>\r\n"
    s = requests.post(burp0_url, headers=burp0_headers, data=burp0_data)  # burp0_url and burp0_headers are defined elsewhere
    if not (s.text.__contains__("403")):
        print(s.text)
        print(x)
        exit()
The Python multiprocessing module is what you are looking for. For instance, it has a parallel map function which will run all the requests in parallel. Here is roughly what your code would look like:
import requests
from multiprocessing import Pool

def post(x):
    burp0_data = "<methodCall>\r\n<methodName>wp.getUsersBlogs</methodName>\r\n<params>\r\n<param><value>zohar</value></param>\r\n<param><value>" + x + "</value></param>\r\n</params>\r\n</methodCall>\r\n"
    s = requests.post(burp0_url, headers=burp0_headers, data=burp0_data)
    if not (s.text.__contains__("403")):
        return s.text, x
    return None, None

if __name__ == '__main__':
    data = ['test', 'test1', 'test2']
    counter = 0
    with Pool(processes=len(data)) as pool:
        results = pool.map(post, data, 1)
        for res in results:
            if res[0] is not None:
                print(res[0])
                print(res[1])
                exit()
For more information please refer to the Python docs on multiprocessing.
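A side note beyond the answer above: since these requests are I/O-bound, the thread-backed pool in multiprocessing.dummy exposes the same map interface and avoids starting one OS process per password. A sketch that reuses the post() function defined above:
from multiprocessing.dummy import Pool as ThreadPool  # threads, but the same Pool API

if __name__ == '__main__':
    data = ['test', 'test1', 'test2']
    with ThreadPool(processes=len(data)) as pool:
        # post() is the same function as in the snippet above
        for text, x in pool.map(post, data):
            if text is not None:
                print(text)
                print(x)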

Python Jupyter Notebook won't run my code, keeps reconnecting

How come this piece of code does not run properly in Jupyter Notebook?
It keeps reconnecting without any result. I am trying to build a database and scrape data as fast as possible from a webserver. I use a pool of workers to speed up the process and iterate over multiple URLs (every URL represents a different day).
import pandas as pd
import datetime
import urllib
import requests
from pprint import pprint
import time
from io import StringIO
from multiprocessing import Process, Pool

symbols = ['AAP']
start = time.time()
dflist = []

def load(date):
    if date is None:
        return
    url = "http://regsho.finra.org/FNYXshvol{}.txt".format(date)
    try:
        df = pd.read_csv(url, delimiter='|')
        if any(df['Symbol'].isin(symbols)):
            stocks = df[df['Symbol'].isin(symbols)]
            print(stocks.to_string(index=False, header=False))
            # Save stocks to mysql
        else:
            print(f'No stock found for {date}')
    except urllib.error.HTTPError:
        pass

pool = []
numdays = 365
start_date = datetime.datetime(2019, 1, 15)  # year - month - day
datelist = [
    (start_date - datetime.timedelta(days=x)).strftime('%Y%m%d') for x in range(0, numdays)
]

pool = Pool(processes=16)
pool.map(load, datelist)
pool.close()
pool.join()

print(time.time() - start)
I would like to know how I can solve this and make it work.

Fetching Jenkins build status from a Python script

I am trying to fetch the status of all the builds for all my jobs. I have written a script, but it takes way too much time to execute. Is there any way I can optimize the script? Any help will be appreciated.
import jenkins

def jenkinsconn():
    # username and password are defined elsewhere
    server = jenkins.Jenkins('server', username=username, password=password)
    jobs = server.get_jobs()
    job_name_list = []
    build_number_list = []
    build_info_list = []
    status_list_dict = {}
    success = 0
    failure = 0
    unstable = 0
    aborted = 0
    #print dir(server)
    for i in range(len(jobs)):
        job_name = jobs[i]['name']
        job_name_list.append(job_name)
    for i in range(len(job_name_list)):
        job_info = server.get_job_info(job_name_list[i])
        lastbuilt = job_info['lastSuccessfulBuild']
        if lastbuilt:
            b_number = job_info['lastSuccessfulBuild']['number']
            build_number_list.append(b_number)
    build_zipped = zip(job_name_list, build_number_list)
    for i, j in build_zipped:
        success = 0
        failure = 0
        unstable = 0
        aborted = 0
        for k in range(j):
            build_info = server.get_build_info(i, k + 1)
            build_info_list.append(build_info)
            status = build_info['result']
            if status == "SUCCESS":
                success += 1
            elif status == "FAILURE":
                failure += 1
            elif status == "UNSTABLE":
                unstable += 1
            else:
                aborted += 1
        statuscount = [success, failure, unstable, aborted]
        status_list_dict[i] = statuscount
If you only need the number of builds succeeding, failing, etc., then you can make do with one request per job, rather than a request per build as your code appears to be doing. I can't find a method in the python-jenkins module to do this, but you can do it yourself with the Jenkins API.
Eg:
try:  # Python 3
    from urllib.request import urlopen
    from urllib.parse import quote
except ImportError:  # Python 2
    from urllib2 import urlopen, quote
import json
import contextlib

status_list_dict = {}

with contextlib.closing(
    urlopen("http://HOST_NAME:8080/api/json")
) as job_list_response:
    job_list = json.load(job_list_response)["jobs"]

for job in job_list:
    status_counts = [0, 0, 0, 0]
    with contextlib.closing(
        urlopen(
            "http://HOST_NAME:8080/job/{job_name}/api/json?tree=allBuilds[result]".format(
                job_name=quote(job["name"])
            )
        )
    ) as build_list_response:
        build_list = json.load(build_list_response)["allBuilds"]
    for build_data in build_list:
        if build_data["result"] == "SUCCESS":
            status_counts[0] += 1
        elif build_data["result"] == "FAILURE":
            status_counts[1] += 1
        elif build_data["result"] == "UNSTABLE":
            status_counts[2] += 1
        elif build_data["result"] == "ABORTED":
            status_counts[3] += 1
    status_list_dict[job["name"]] = status_counts
