How to read file of URLs and web scrape them with multithreading

How to read file of URLs and web scrape them with multithreading - python

I am implementing a web scraping script in Python that reads a JSON file, gets a list of URLs from it, and scrapes each one.
This file contains over 60K rows, of which around 50K are unique (so I remove the duplicates first).
To do this I have the following:
import contextlib
from bs4 import BeautifulSoup
import feedparser
import pandas
import requests
import time
# Site root; parse_iso_details() prepends it to the relative feed link found on each page.
BASE_URL = 'https://www.iso.org'
def create_iso_details_json(p_merged_iso_df):
    """Scrape every unique link of the input frame and write ``iso_details.json``.

    Rows are de-duplicated on ``Link`` and the ``TC``/``ICS`` columns are
    dropped.  Each remaining row with a non-empty link is handed to
    ``parse_iso_details``; the resulting frames are concatenated, the row
    count is printed and the result is written as JSON records.
    """
    unique_rows = p_merged_iso_df.drop_duplicates(subset=['Link'])
    unique_rows = unique_rows.drop(columns=['TC', 'ICS'], axis=1)

    per_row_frames = []
    rows = zip(unique_rows['Standard and/or project'], unique_rows['Stage'], unique_rows['Link'])
    for iso, stage, link in rows:
        if link != '':
            per_row_frames.append(parse_iso_details(iso, stage, link))

    combined = pandas.concat(per_row_frames)
    print('Total rows retrieved: ', len(combined.index))
    combined.to_json('iso_details.json', orient="records")
def parse_iso_details(p_iso, p_stage, p_url):
    """Download one standard's page and return the details read from its feed.

    Args:
        p_iso: Value stored in the ``Standard and/or project`` column.
        p_stage: Value stored in the ``Stage`` column.
        p_url: Page to download and search for the feed link.

    Returns:
        The DataFrame produced by ``read_iso_details``, or ``None`` when the
        page has no feed link (or concatenation raised ``ValueError``).
    """
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')

    # Bind the name up front: when the lookup below fails it would otherwise
    # stay undefined and the following print() would raise UnboundLocalError.
    feed_details_url = None
    try:
        feed_link = soup.find('section', {'id': 'product-details'}).find(
            'a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})
        feed_details_url = BASE_URL + feed_link['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: section missing; TypeError: anchor missing
        # (None['href']); KeyError: anchor without an href attribute.
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)

    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    # concat() raises ValueError on an empty list, i.e. when no feed was found.
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)
    return None
def read_iso_details(p_feed_details_url, p_iso, p_stage):
    """Parse the feed and return one row per feed entry.

    The original assigned the list of entry dates to a frame that always had
    exactly one row, which raises ``ValueError`` whenever the feed has zero
    or several entries.  Building the frame from equally long columns
    handles any number of entries; a single-entry feed gives the same one-row
    result as before.

    Args:
        p_feed_details_url: URL of the feed to parse.
        p_iso: Value repeated in the ``Standard and/or project`` column.
        p_stage: Value repeated in the ``Stage`` column.

    Returns:
        DataFrame with the columns ``Standard and/or project``, ``Stage`` and
        ``Publication date`` (empty when the feed has no entries).
    """
    feed = feedparser.parse(p_feed_details_url)
    publication_dates = [entry.published for entry in feed.entries]
    row_count = len(publication_dates)
    return pandas.DataFrame({
        'Standard and/or project': [p_iso] * row_count,
        'Stage': [p_stage] * row_count,
        'Publication date': publication_dates,
    })
def main():
    """Load ``input_file.json``, scrape every link and print the elapsed time."""
    started_at = time.time()
    source_df = pandas.read_json('input_file.json', dtype={"Stage": str})
    create_iso_details_json(source_df)
    elapsed = time.time() - started_at
    print(f"--- {elapsed} seconds ---")


if __name__ == "__main__":
    main()
I am merging the results in a pandas DataFrame to write it to another JSON file later.
Now, this takes a long time, since the process makes one request per input URL and each request takes between 0.5 and 1 second.
I would like to implement this process with multithreading (not multiprocessing) so that the processing time decreases significantly.
What is the best approach to achieve this? Should I split the input JSON file into as many parts as there are threads? And how do I merge the results of each thread into one to write the output JSON file?
Thank you in advance.

This website explains multithreading pretty well. What you could do is split the URLs into equal parts and process them simultaneously. The limitation is that you basically only divide the total time by the number of threads you use. But to my knowledge, this is the best you can do without overcomplicating it.

I finally managed to implement the process with multithreading, as @Yui suggested in their answer.
The real problem was merging the results of each thread into one, so I decided to have each thread append its results to a CSV file. Then, when all threads have finished, I read the CSV and write the results into the required JSON file.
# NOTE(review): Queue, Thread, numpy and os are used below but their imports
# are not shown in this snippet.
# Site root; parse_iso_details() prepends it to the relative feed link.
BASE_URL = 'https://www.iso.org'
# Number of worker threads started by init() and of chunks the input is split into.
NUM_THREADS = 4
# Work queue shared by all threads; each item is one chunk of the input DataFrame.
q = Queue()
# JSON file the rows to scrape are read from.
INPUT_FILE = 'iso_tc_ics.json'
# Intermediate CSV every worker appends to; end() reads and then removes it.
ISO_DETAILS_CSV_FILE = 'iso_details.csv'
# Final JSON file written by end().
OUTPUT_FILE = 'iso_details.json'
def create_iso_details_json(p_queue):
    """Take one chunk off ``p_queue``, scrape each of its links and return the combined frame."""
    chunk = p_queue.get()
    frames = []
    for iso, stage, link in zip(chunk['Standard and/or project'], chunk['Stage'], chunk['Link']):
        if link == '':
            continue
        frames.append(parse_iso_details(iso, stage, link))
    scraped = pandas.concat(frames)
    print('Rows retrieved: ', len(scraped.index))
    return scraped
def parse_iso_details(p_iso, p_stage, p_url):
    """Download one standard's page and return the details read from its feed.

    Args:
        p_iso: Value stored in the ``Standard and/or project`` column.
        p_stage: Value stored in the ``Stage`` column.
        p_url: Page to download and search for the feed link.

    Returns:
        The DataFrame produced by ``read_iso_details``, or ``None`` when the
        page has no feed link (or concatenation raised ``ValueError``).
    """
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')

    # Bind the name up front: when the lookup below fails it would otherwise
    # stay undefined and the following print() would raise UnboundLocalError,
    # killing the worker thread.
    feed_details_url = None
    try:
        feed_link = soup.find('section', {'id': 'product-details'}).find(
            'a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})
        feed_details_url = BASE_URL + feed_link['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: section missing; TypeError: anchor missing
        # (None['href']); KeyError: anchor without an href attribute.
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)

    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    # concat() raises ValueError on an empty list, i.e. when no feed was found.
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)
    return None
def read_iso_details(p_feed_details_url, p_iso, p_stage):
    """Parse the feed and return one row per feed entry.

    The original assigned the list of entry dates to a frame that always had
    exactly one row, which raises ``ValueError`` whenever the feed has zero
    or several entries.  Building the frame from equally long columns
    handles any number of entries; a single-entry feed gives the same one-row
    result as before.

    Args:
        p_feed_details_url: URL of the feed to parse.
        p_iso: Value repeated in the ``Standard and/or project`` column.
        p_stage: Value repeated in the ``Stage`` column.

    Returns:
        DataFrame with the columns ``Standard and/or project``, ``Stage`` and
        ``Publication date`` (empty when the feed has no entries).
    """
    feed = feedparser.parse(p_feed_details_url)
    publication_dates = [entry.published for entry in feed.entries]
    row_count = len(publication_dates)
    return pandas.DataFrame({
        'Standard and/or project': [p_iso] * row_count,
        'Stage': [p_stage] * row_count,
        'Publication date': publication_dates,
    })
import threading

# Serialises appends to the shared CSV so that rows written by different
# threads never interleave and the header check + write happen atomically.
_CSV_WRITE_LOCK = threading.Lock()


def main():
    """Worker body: scrape one queued chunk and append its rows to the CSV.

    The chunk is always acknowledged with ``task_done()``, even when scraping
    or writing fails; otherwise ``q.join()`` in ``end()`` would block forever.
    """
    try:
        result_df = create_iso_details_json(q)
        with _CSV_WRITE_LOCK:
            with open(ISO_DETAILS_CSV_FILE, 'a') as f:
                # The header is written only while the file is still empty.
                # NOTE(review): the file is opened with the default encoding;
                # confirm whether `encoding` has any effect on an open handle.
                result_df.to_csv(f, mode='a', index=False, header=not f.tell(), encoding='ISO-8859-1')
    finally:
        q.task_done()
def init():
    """Load the input, queue it in ``NUM_THREADS`` chunks and start the workers.

    Rows are de-duplicated on ``Link`` and the ``TC``/``ICS`` columns are
    dropped before splitting.  One daemon thread running ``main`` is started
    per chunk.
    """
    merged_iso_df = pandas.read_json(INPUT_FILE, dtype={"Stage": str})
    merged_iso_details_df = merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'])

    # Split row *positions* rather than the frame itself: numpy.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes.
    row_positions = numpy.arange(len(merged_iso_details_df))
    for positions in numpy.array_split(row_positions, NUM_THREADS):
        q.put(merged_iso_details_df.iloc[positions])

    for _ in range(NUM_THREADS):
        worker = Thread(target=main)
        worker.daemon = True
        worker.start()
def end():
    """Wait for the workers, convert the intermediate CSV to JSON and delete the CSV."""
    q.join()
    collected = pandas.read_csv(ISO_DETAILS_CSV_FILE, dtype={"Stage": str}, encoding='ISO-8859-1')
    print('Total rows retrieved: ', len(collected.index))
    collected.to_json(OUTPUT_FILE, orient="records")
    try:
        os.remove(ISO_DETAILS_CSV_FILE)
    except OSError:
        # Best effort: a CSV that cannot be removed is not an error.
        pass


if __name__ == "__main__":
    started_at = time.time()
    init()
    end()
    print(f"--- {time.time() - started_at} seconds ---")

I would go with asyncio and aiohttp. Here is a complete example of how to make multiple requests concurrently and collect the results at the end:
import aiohttp
import asyncio
async def geturl(url, session):
    """GET ``url`` and return the ``name`` field of its JSON body, or ``"ERROR"`` on a non-200 status."""
    async with session.get(url) as resp:
        if resp.status != 200:
            return "ERROR"
        payload = await resp.json()
        return payload['name']
async def main():
    """Request the Pokémon with IDs 1 to 9 concurrently and print the collected results."""
    urls = []
    for i in range(1, 10):
        urls.append(f'https://pokeapi.co/api/v2/pokemon/{i}')
    async with aiohttp.ClientSession() as session:
        pending = [geturl(url, session) for url in urls]
        # gather() runs all the coroutines concurrently and returns their
        # results once every one of them has finished.
        all_results = await asyncio.gather(*pending)
        print(all_results)


asyncio.run(main())
This will print the names of the first 9 Pokémon (`range(1, 10)` covers IDs 1 to 9); you can tweak it for your needs.

Related

Why my code still slow after threading for 15k records only, how to fix this

I have a script that takes links from a file, visits each one, gets the redirected link, and stores it back. But it is too slow on a file with 15k records. How can I make it faster? I am already using threading.
Please help me fix this! I've tried multiple approaches, including threading, but I cannot make it fast. Is there any solution to my problem? Could an expert help me out?
import concurrent.futures
import sys
import pandas as pd
import requests
from threading import Thread
from queue import Queue
out_put_file=""
linkes = None
# Final (redirected) URLs and the original URLs they came from; get_status()
# appends to both lists, and the script's last step matches them by position.
out = []
# Normalised input links, filled by process_data().
urls = []
old = []
file_name =None
# Number of worker threads started by process_data(); the queue is sized at twice this.
# NOTE(review): this rebinds the `concurrent` package imported above, so
# `concurrent.futures` is no longer reachable under that name.
concurrent = 10000
q = None
# Progress counter printed by get_status().
count=0
df =None
def do_work():
    """Worker loop: take URLs off the shared queue forever and resolve each one.

    Every item is acknowledged with ``task_done()`` even when resolving it
    fails, and a failure does not end the loop; otherwise ``q.join()`` in
    ``process_data`` could never return.
    """
    while True:
        url = q.get()
        try:
            get_status(url)
        except Exception as exc:
            # Thread boundary: report the failure and keep the worker alive.
            print(f"Failed to process {url}: {exc}")
        finally:
            q.task_done()
def get_status(o_url):
    """Fetch ``o_url`` and record where it finally resolved to.

    On a successful (truthy) response the final URL is appended to ``out`` and
    the original URL to ``old``.

    Fixes over the original: ``count`` is declared ``global`` (assigning it
    made it local, so ``print(count)`` raised UnboundLocalError), the bare
    ``except`` is narrowed to request failures, and the trailing ``return``
    that referenced the undefined names ``ans`` and ``url`` is removed.

    Args:
        o_url: URL to request.

    Returns:
        ``[status_code, final_url, o_url]`` on success, otherwise ``None``.
    """
    global count
    try:
        # NOTE(review): no timeout is set, so a stalled server blocks this
        # worker indefinitely.
        res = requests.get(o_url)
    except requests.RequestException:
        return None
    if not res:
        return None
    # NOTE(review): the two appends are not atomic as a pair; with many
    # threads `out` and `old` may get out of step.
    out.append(res.url)
    old.append(o_url)
    print(count)
    count = count + 1
    return [res.status_code, res.url, o_url]
def process_data():
    """Prompt for a CSV name, normalise its ``shopify`` links and resolve them with worker threads.

    Links that are not strings are kept as they are; string links without an
    ``http`` prefix get ``http://`` prepended.  The normalised links replace
    the ``shopify`` column, ``concurrent`` daemon threads running ``do_work``
    are started, and every string link is queued.  The function returns once
    the queue has been fully processed; Ctrl-C exits with status 1.
    """
    global q
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ").strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    for link in df["shopify"]:
        if type(link) is not str or link.startswith("http"):
            urls.append(link)
        else:
            linkes = "http://" + link
            urls.append(linkes)
    df["shopify"] = urls
    q = Queue(concurrent * 2)
    for _ in range(concurrent):
        worker = Thread(target=do_work)
        worker.daemon = True
        worker.start()
    try:
        for url in urls:
            if type(url) is str:
                q.put(url.strip())
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)
process_data()
# Replace every link that was resolved with its final URL.  A single dict
# lookup per row replaces the original nested loop (rows x resolved URLs) and
# avoids chained assignment (df[...][i] = ...), which may write to a copy.
redirects = dict(zip(old, out))
df['shopify'] = df['shopify'].map(lambda link: redirects.get(link, link))
# Drop rows whose link still starts with 'http:' and rows with missing values.
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)
Email,shopify,Proofy_Status_Name
hello#knobblystudio.com,http://puravidabracelets.myshopify.com,Deliverable
service#cafe-select.co.uk,cafe-select.co.uk,Deliverable
mtafich#gmail.com,,Deliverable
whoopies#stevessnacks.com,stevessnacks.com,Deliverable
customerservice#runwayriches.com,runwayriches.com,Deliverable
shop#blackdogride.com.au,blackdogride.com.au,Deliverable
anavasconcelos.nica#gmail.com,grass4you.com,Deliverable
info#prideandprestigehair.com,prideandprestigehair.com,Deliverable
info#dancinwoofs.com,dancinwoofs.com,Deliverable

Threads in Python do not execute Python bytecode simultaneously due to the Global Interpreter Lock (the lock is released while a thread waits on network I/O, so threads can still overlap requests). You might want to use the multiprocessing module instead, or ProcessPoolExecutor() from concurrent.futures. If you decide to use a ProcessPoolExecutor, pass the URLs to the callback and have the callback return the old and the redirected URL; you then obtain them from the result method of the future returned by executor.submit. When using processes, global variables are not shared, unlike with threads.
There has been an attempt to remove the global interpreter lock but without the GIL, Python doesn't run quite as fast or something like that if I remember correctly.
Something like the following might work. I renamed the concurrent variable because it would shadow the concurrent module and probably cause an error. This code is untested because I don't have the csv file to test with.
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import sys
import pandas as pd
import requests
import numpy as np
from threading import Thread
from queue import Queue
out_put_file=""
linkes = None
# get_status() appends to `out` and `old`.
# NOTE(review): get_status() runs in worker processes here, so these lists
# presumably stay empty in the parent process; confirm.
out = []
# Normalised input links, filled by process_data().
urls = []
old = []
# Futures submitted by process_data(); the script's last step reads their results.
futures = []
file_name =None
# NOTE(review): concurrent_, q and count appear unused in the code shown.
concurrent_ = 10000
q = None
count=0
df =None
def do_work(urls):
    """Resolve every URL in ``urls`` and return ``(original, final)`` pairs in input order.

    A URL that ``get_status`` could not resolve is paired with itself.
    """
    def resolve(url):
        res = get_status(url)
        return (res[2], res[1]) if res else (url, url)

    return [resolve(url) for url in urls]
def get_status(o_url):
    """Fetch ``o_url`` and report where it finally resolved to.

    The original's bare ``except: pass`` is narrowed to request failures and
    malformed-URL errors, so programming errors are no longer hidden.

    Args:
        o_url: URL to request.

    Returns:
        ``[status_code, final_url, o_url]`` for a successful (truthy)
        response, otherwise ``None``.
    """
    try:
        res = requests.get(o_url)
    except (requests.RequestException, ValueError):
        # Best effort: the caller treats None as "keep the original URL".
        return None
    if not res:
        return None
    out.append(res.url)
    old.append(o_url)
    return [res.status_code, res.url, o_url]
def load_url(url, timeout):
    """GET ``url`` with the given ``timeout`` and return ``[status_code, final_url, url]``."""
    response = requests.get(url, timeout=timeout)
    status, final_url = response.status_code, response.url
    return [status, final_url, url]
def process_data():
    """Prompt for a CSV name, normalise its ``shopify`` links and resolve them in worker processes.

    Links that are not strings are kept as they are; string links without an
    ``http`` prefix get ``http://`` prepended.  The normalised links replace
    the ``shopify`` column and are split into 50 chunks, each submitted to
    ``do_work`` in a process pool.  The futures are appended to the
    module-level ``futures`` list; all of them have completed when this
    function returns, because leaving the ``with`` block waits for the pool.
    """
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ").strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    for link in df["shopify"]:
        if isinstance(link, str) and not link.startswith("http"):
            linkes = "http://" + link
            urls.append(linkes)
        else:
            urls.append(link)
    df["shopify"] = urls
    workers = 50
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # Use a distinct loop name: the original iterated with `urls`, which
        # was declared global, so the loop overwrote the module-level list
        # with the last chunk.
        for url_chunk in np.array_split(urls, workers):
            futures.append(executor.submit(do_work, url_chunk))
process_data()
# Read the results in *submission* order so each URL lands on the row it came
# from.  as_completed() yields futures in completion order, which would
# shuffle the chunks against the DataFrame rows.  All futures are already done
# here because process_data() waits for the pool before returning.
df['shopify'] = [res[1] for f in futures for res in f.result()]
# Drop rows whose link still starts with 'http:' and rows with missing values.
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)

How to find XML element fast using Python?

I am quite new to XML and to what makes code effective, and the code I am using takes quite a long time to run.
So I want to extract the elevation from given lat, long-values as fast as possible (I have a lot of lat,long-points). This is how I tried it:
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
def elevation(lat, long):
    """Query the Statkart WPS elevation service for one point and return the value as a float.

    ``lat`` and ``long`` are inserted into the request as ``lat``/``lon``
    together with ``epsg=4326``.  The result is the text of the first child
    of the response's ``Data`` element.
    """
    namespace = "{http://www.opengis.net/wps/1.0.0}"
    base = 'http://openwps.statkart.no/skwms1/wps.elevation2?request=Execute&service=WPS&version=1.0.0'
    query = base + f'&identifier=elevation&datainputs=lat={lat};lon={long};epsg=4326'
    with urlopen(query) as response:
        root = ET.parse(response).getroot()
    matches = root.findall(f".//{namespace}Data/*")
    return float(matches[0].text)
I use this function on the data set I have extracted from a CSV file, which contains several datasets separated by a "new_sheep" line:
df = pd.read_csv("/Users/ninsalv/Documents/Sheepdata/Data.csv", delimiter=';',
                 dtype={"Initial start": "str", "Start": "str", "Stop": "str"})
print(df.head())

# Which dataset of the file to extract; a "new_sheep" line starts each dataset.
dataset = 1
Lat = []
Long = []
temp = 0  # number of "new_sheep" separators seen so far
# itertuples() yields plain positional tuples; the original df.iloc[i][0]
# built a Series per row and relied on deprecated positional Series indexing.
for row in df.itertuples(index=False):
    if "new_sheep" in row[0]:
        temp += 1
        continue
    if temp == dataset:
        Lat.append(row[3])
        Long.append(row[4])
    if temp > dataset:
        # Past the wanted dataset: nothing more to collect.
        break

step = np.linspace(0, len(Lat), len(Lat))
altitude = []
for i, (lat, lon) in enumerate(zip(Lat, Long)):
    # One HTTP request per point.
    altitude.append(elevation(lat, lon))
    if (i % 100) == 0:
        print("round number ", i)
plt.plot(step, altitude)
This works, but it takes almost a minute to find every 100 altitudes, and I have about 7000-15000 points to check in my dataset. Does anybody know either XML, pandas or something else that may make my code faster?

What you need to do is fetch the data you are looking for (the HTTP requests) in parallel. You can use multithreading for that.
See the example below.
import requests
from requests.sessions import Session
import time
from threading import Thread,local
from queue import Queue
url_list = []  # TODO long list of urls to be populated by your code
# Unbounded queue holding every URL that still has to be downloaded.
q = Queue()
for url in url_list:
    q.put(url)
# Per-thread storage; get_session() keeps one Session object per thread in it.
thread_local = local()
def get_session() -> Session:
    """Return the calling thread's ``requests.Session``, creating it on first use."""
    try:
        return thread_local.session
    except AttributeError:
        # First call in this thread: no session stored yet.
        thread_local.session = requests.Session()
        return thread_local.session
def download_link() -> None:
    '''Worker: download queued URLs until the queue is empty.

    Every URL taken from the queue is acknowledged with ``task_done()`` even
    when its download fails, so that ``q.join()`` can return.
    '''
    from queue import Empty  # local import: the file only imports Queue

    session = get_session()
    while True:
        try:
            # Non-blocking get: checking q.empty() and then calling q.get()
            # lets another thread take the last item in between, which would
            # leave this thread blocked forever.
            url = q.get_nowait()
        except Empty:
            return
        try:
            with session.get(url) as response:
                print(f'Read {len(response.content)} from {url}')
        except requests.RequestException as exc:
            # One failed download must not stop this worker.
            print(f'Failed to read {url}: {exc}')
        finally:
            q.task_done()  # tell the queue this URL has been handled
def download_all(urls) -> None:
    '''Start 10 ``download_link`` worker threads and block until the queue has been processed.

    The ``urls`` argument is not read here; the workers take their URLs from
    the module-level queue.
    '''
    thread_num = 10
    workers = [Thread(target=download_link) for _ in range(thread_num)]
    for t_worker in workers:
        t_worker.start()
    q.join()  # wait until every queued URL has been acknowledged
# Time the whole download run (wall-clock seconds) and report it.
print("start work")
start = time.time()
download_all(url_list)
end = time.time()
print(f'download {len(url_list)} links in {end - start} seconds')

How can I test concurrency using ThreadPoolExecutor?

I have written a program that I would like to use concurrency on and I have implemented it but I am having a hard time testing to see if it will actually spin up more threads than just one. Does anyone have any suggestions? I am just trying to see if this code will ever use 2,3,4,5 workers.
def read_files():
    """Read every ``data/*.csv`` that has the expected header into one DataFrame.

    Files whose columns differ from ``['fname', ' lname', ' age']`` or that
    cannot be read are collected as bad files, logged and skipped.

    Returns:
        The concatenation of all accepted files.

    Raises:
        ValueError: From ``pd.concat`` when no file was accepted.
    """
    t0 = time.process_time()
    cols = ['fname', ' lname', ' age']
    path = 'data'
    files = glob.glob(os.path.join(path, "*.csv"))
    # with open('data/url') as f:
    # for line in f:
    # files.append(line.rstrip('\n'))
    bad_files = []
    df_list = []
    for file in files:
        try:
            temp = pd.read_csv(file)
        except ParserError as pe:
            bad_files.append(file)
            logging.error(f'Parsing Error on {file}. Error: {pe}')
            continue
        except ValueError as ve:
            # Log the file name: `temp` is unbound here (or still holds the
            # previous file's frame), because read_csv itself raised.
            logging.error(f'Value error on reading the csv: {file}, error: {ve}')
            bad_files.append(file)
            continue
        except urllib.error.HTTPError as he:
            bad_files.append(file)
            logging.error(f'Http Error {he}, Code {he.code}')
            continue
        except Exception as e:
            bad_files.append(file)
            logging.error(f'Error grabbing data from given {file} possible HTTP error. Error: {e}')
            continue
        if temp.columns.to_list() == cols:
            df_list.append(temp)
        else:
            bad_files.append(file)
    print(f'Files that were not read {bad_files}')
    df = pd.concat(df_list)
    t1 = time.process_time()
    print(f'It took {t1 - t0} seconds, to read and fill the dataframe.')
    return df
def run_calculations(df):
    """Print the average and median of the `` age`` column and a person of median age.

    With an even number of rows the median is the mean of the two middle ages
    and may match no row at all; the original indexed ``.values[0]``
    unconditionally and raised IndexError in that case.

    Args:
        df: Frame whose first two columns hold the first and last name and
            which has an `` age`` column.
    """
    if len(df.index) % 2 == 0:
        print(f'Even number of entries, pandas median() method will add both middle numbers and find the average.')
    average = round(df[' age'].mean())
    median = df[' age'].median()
    at_median = df[df[' age'] == median]
    if at_median.empty:
        print(f'The Average Age is {int(average)}, The Median Age is {median}. Nobody is exactly that age.')
        return
    names_arr = at_median.values[0]
    fname = names_arr[0]
    lname = names_arr[1]
    print(f'The Average Age is {int(average)}, The Median Age is {int(median)}. {fname} {lname} is {int(median)}')
if __name__ == '__main__':
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
    with executor:
        # NOTE(review): the whole computation runs right here in the main
        # thread; what reaches submit() is its return value, not a callable.
        outcome = run_calculations(read_files())
        executor.submit(outcome)
        print(f'I have used {len(executor._threads)} thread(s) for processing')

The short answer is: no, your code will not use more than one worker.
The reason is that you passed an argument of the wrong type to executor.submit: it expects a callable, while you are effectively passing None.
A quick fix would be to replace executor.submit(run_calculations(read_files())) with executor.submit(lambda : run_calculations(read_files()))
The following snippet will help to explain how to submit a callable to executor:
import time
import threading
from concurrent.futures import ThreadPoolExecutor
def task(time_to_sleep):
    """Sleep for ``time_to_sleep`` seconds, then print the id of the current thread object."""
    time.sleep(time_to_sleep)
    current = threading.current_thread()
    print(id(current))
def use_single_worker():
    """Demonstrate the mistake: the task is called eagerly instead of being submitted."""
    print("in use single worker")
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = []
        for i in range(10):
            # task(i) runs here, in the calling thread, so the same thread id
            # is printed every time; its return value is what gets submitted.
            outcome = task(i)
            futures.append(executor.submit(outcome))
        for future in futures:
            try:
                future.result()
            except Exception:
                pass
def use_multiple_workers():
    """Submit ``task`` as a callable so that the pool's worker threads run it."""
    print("in use multiple workers")
    with ThreadPoolExecutor(max_workers=5) as executor:
        # different thread ids will get dumped
        # Pass the function and its argument separately.  A `lambda: task(i)`
        # looks up `i` only when it runs, so a task started after the
        # comprehension has moved on sleeps for the wrong duration.
        futures = [executor.submit(task, i) for i in range(10)]
        for future in futures:
            try:
                future.result()
            except Exception:
                pass


if __name__ == '__main__':
    use_single_worker()
    use_multiple_workers()

Effective Python Multiprocessing in a list for loop

I'm trying to multiprocess an action inside a for x in y loop. Basically, the script makes a request to a site and loads a JSON file containing a list of URLs. Once it is fetched, another function is called to parse each URL individually. What I've been trying to do is run this task with multiprocessing.Process() in order to speed it up, since there are lots of URLs to parse. However, my approach doesn't speed up the process at all; it runs at the same speed as without multiprocessing. It seems to get blocked when using proc.join().
This is a code i've been working on:
import json
import requests
import multiprocessing
def ExtractData(id):
    """Download the index for ``id`` and print the ``url`` field of every line.

    The response body is split into lines; the last piece is discarded and
    every other line is parsed as JSON.

    Args:
        id: Identifier appended to the index URL (must be a string).
    """
    print("Processing ", id)
    # Fixed: the original line ended in a stray quote (`+ id')`), which is a
    # syntax error.
    response = requests.get('http://example-index.com/' + id)
    for entry in response.text.split('\n')[:-1]:
        data = json.loads(entry)['url']
        print("data is:", data)
def ParseJsonAndCall():
    """Fetch the JSON list and run ``ExtractData`` for every entry in its own process.

    All processes are started first and joined afterwards, so they run in
    parallel.
    """
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    processes = []
    for results in data:
        print("Processing ", results['url'])
        p = multiprocessing.Process(target=ExtractData, args=(results['id'],))
        processes.append(p)
        p.start()
    # Fixed: the original iterated over the undefined name `threads`.
    for proc in processes:
        proc.join()


# The guard keeps child processes from re-running the script when the spawn
# start method imports this module.
if __name__ == "__main__":
    ParseJsonAndCall()
Any help would be greatly appreciated!

A Pool may help.
import multiprocessing as mp
def ParseJsonAndCall():
    """Fetch the JSON list and run ``ExtractData`` for every entry on a process pool."""
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    with mp.Pool(processes=mp.cpu_count()) as pool:
        pending = []
        for results in data:
            pending.append(pool.apply_async(ExtractData, [results['id'],]))
        # Wait inside the with-block: leaving it terminates the pool.
        for res in pending:
            res.get()
Note that the print statements in ExtractData() run in separate processes, so their output might be interleaved (a race condition on stdout).

Using WGET or Python to download and rename attachments from CSV requiring basic authentication

I scraped a ticketing website that we were using and I now have a CSV file which looks like this: ID, Attachment_URL, Ticket_URL. What I now need to do is download every attachment and rename the file with the Ticket_URL. The main issue I have is that when navigating to the Attachment_URL you must use basic authentication and then you are redirected to an aws s3 link. I have been able to download individual files using wget, but I have not been able to iterate through the entire list (35k rows or so), and I am not sure how I would be able to name the file as the ticket_id. Any advice would be appreciated.

Got it.
To open the authenticated session:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import time
# NOTE(review): this is Python 2 code (print statements) and the indentation
# of the loop bodies below was lost; restore it before running.
# Open a session and post the login form through it; the page requests below
# go through the same session object.
s = requests.session()
payload = {
'user': '',
'pw': ''
}
s.post('login.url.here', data=payload)
# Request result pages 1 to 5999, one page per iteration.
for i in range(1, 6000):
testURL = s.get(
'https://urlhere.com/efw/stuff&page={}'.format(i))
soup = BeautifulSoup(testURL.content)
table = soup.find("table", {"class": "table-striped"})
table_body = table.find('tbody')
# Skip the first row of the table body.
rows = table_body.find_all('tr')[1:]
print "The current page is: " + str(i)
for row in rows:
# Keep only the anchors whose href starts with /helpdesk/.
cols = row.find_all('a', attrs={'href': re.compile("^/helpdesk/")})
# time.sleep(1)
# Append the anchors found to fd.csv as one CSV row.
with open('fd.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(cols)
print cols
print cols
Then I cleaned the links a bit in R and used the following script to download the files.
#! /usr/bin/env python
import threading
import os
from time import gmtime, strftime
from Queue import Queue
import requests
# Session shared by all fetcher threads; the login form is posted through it
# once, before any download starts.
s = requests.session()
# Login form fields; the values are left blank in this snippet.
payload = {
'user': '',
'pw': ''
}
s.post('login', data=payload)
class log:
    """Minimal logger that prints ``<timestamp> [<level>] <message>`` lines."""

    def __message(self, log_level, message):
        # The timestamp is built from gmtime().
        date = strftime("%Y-%m-%d %H:%M:%S", gmtime())
        print("%s [%s] %s" % (date, log_level, message))

    def info(self, message):
        self.__message("info", message)

    def error(self, message):
        self.__message("error", message)

    def debug(self, message):
        self.__message("debug", message)
class fetch:
    """Downloads queued attachments through the module-level session ``s``."""

    def __init__(self):
        self.temp_dir = "/tmp"

    def run_fetcher(self, queue):
        """Take ``(url, ticketid)`` pairs off ``queue`` until it is empty and save each download.

        The file is named after the last path segment of the ticket id, or,
        when the ticket id ends in ``NA``, after the last segment of the URL
        plus ``NoTicket``.  Every item is acknowledged with ``task_done()``
        even when its download fails, so that ``queue.join()`` can return.
        """
        try:
            from queue import Empty  # Python 3
        except ImportError:
            from Queue import Empty  # Python 2

        while True:
            try:
                # Non-blocking get: checking empty() and then calling get()
                # lets another thread take the last item in between, which
                # would leave this thread blocked forever.
                url, ticketid = queue.get_nowait()
            except Empty:
                return
            try:
                if ticketid.endswith("NA"):
                    fileName = url.split("/")[-1] + 'NoTicket'
                else:
                    fileName = ticketid.split("/")[-1]
                response = s.get(url)
                with open(os.path.join('/Users/Desktop/FolderHere', fileName + '.mp3'), 'wb') as f:
                    f.write(response.content)
                print(fileName)
            finally:
                queue.task_done()
if __name__ == '__main__':
    # Set up the work queue, the logger and the fetcher.
    q = Queue()
    log = log()
    fe = fetch()
    # Read the input file: every line holds "id,url,ticket".
    with open('/Users/name/csvfilehere.csv', 'r') as csvfile:
        for line in csvfile:
            _, url, ticket = line.split(",")
            q.put([url.strip(), ticket.strip()])
    # Start 8 daemon threads that all take work from the same queue.
    threads = []
    for _ in range(8):
        t = threading.Thread(target=fe.run_fetcher, args=(q,))
        t.daemon = True
        threads.append(t)
        t.start()
    # Wait for every worker to return.
    for worker in threads:
        worker.join()
    # Wait until every queued item has been acknowledged.
    q.join()
    log.info("End")

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

How to read file of URLs and web scrape them with multithreading - python

Related

Why my code still slow after threading for 15k records only, how to fix this

How to find XML element fast using Python?

How can I test concurrency using ThreadPoolExecutor?

Effective Python Multiprocessing in a list for loop

Using WGET or Python to download and rename attachments from CSV requiring basic authentication

Categories

Resources