How to find XML element fast using Python?

I am quite new to XML and to writing efficient code, and the code I am using takes quite a long time to run.
I want to extract the elevation for given lat/long values as fast as possible (I have a lot of lat/long points). This is how I tried it:
import xml.etree.ElementTree as ET
from urllib.request import urlopen
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

def elevation(lat, long):
    query = ('http://openwps.statkart.no/skwms1/wps.elevation2?request=Execute&service=WPS&version=1.0.0'
             f'&identifier=elevation&datainputs=lat={lat};lon={long};epsg=4326')
    parsing = "{http://www.opengis.net/wps/1.0.0}"
    with urlopen(query) as f:
        tree = ET.parse(f)
        root = tree.getroot()
    return float(root.findall(f".//{parsing}Data/*")[0].text)
Using this function on the data set I extracted from a CSV file, where several datasets within the same file are separated by a "new_sheep" line:
df = pd.read_csv("/Users/ninsalv/Documents/Sheepdata/Data.csv", delimiter=';',
                 dtype={"Initial start": "str", "Start": "str", "Stop": "str"})
print(df.head())

dataset = 1
Lat = []
Long = []
temp = 0
for i in range(len(df)):
    if "new_sheep" in df.iloc[i][0]:
        temp += 1
        continue
    if temp == dataset:
        Lat.append(df.iloc[i][3])
        Long.append(df.iloc[i][4])
    if temp > dataset:
        break

step = np.linspace(0, len(Lat), len(Lat))
altitude = []
for i in range(len(Lat)):
    altitude.append(elevation(Lat[i], Long[i]))
    if (i % 100) == 0:
        print("round number ", i)

plt.plot(step, altitude)
This works, but it takes almost a minute to fetch every 100 altitudes, and I have about 7000-15000 points to check in my dataset. Does anybody know something, about XML, pandas, or anything else, that could make my code faster?

What you need to do is fetch the data (the HTTP requests) in parallel. You can use multithreading for that.
See the example below.
import requests
from requests.sessions import Session
import time
from threading import Thread, local
from queue import Queue

url_list = []  # TODO long list of urls to be populated by your code
q = Queue(maxsize=0)  # Use a queue to store all URLs
for url in url_list:
    q.put(url)

thread_local = local()  # The thread_local will hold a Session object

def get_session() -> Session:
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()  # Create a new Session if not exists
    return thread_local.session

def download_link() -> None:
    '''download link worker, get URL from queue until no url left in the queue'''
    session = get_session()
    while not q.empty():
        url = q.get()
        with session.get(url) as response:
            print(f'Read {len(response.content)} from {url}')
        q.task_done()  # tell the queue this url's download work is done

def download_all(urls) -> None:
    '''Start 10 threads, each thread as a wrapper of downloader'''
    thread_num = 10
    for i in range(thread_num):
        t_worker = Thread(target=download_link)
        t_worker.start()
    q.join()  # main thread waits until all urls have finished downloading

print("start work")
start = time.time()
download_all(url_list)
end = time.time()
print(f'download {len(url_list)} links in {end - start} seconds')
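To adapt this to the elevation lookups in the question, each worker would run one elevation() call, and the results have to stay aligned with the input coordinates. A minimal sketch of that idea (not the original answer's code), using concurrent.futures.ThreadPoolExecutor instead of a hand-rolled queue and assuming the elevation() function and the Lat/Long lists from the question are already defined:

from concurrent.futures import ThreadPoolExecutor

def fetch_all_elevations(lats, longs, workers=10):
    # map() preserves input order, so result[i] belongs to (lats[i], longs[i]).
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(elevation, lats, longs))

altitude = fetch_all_elevations(Lat, Long)

With 10 workers, roughly 10 requests are in flight at any time, so the per-request latency overlaps instead of adding up.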

Related

Why is my code still slow after threading for only 15k records, and how do I fix this?

I have a script that takes links from a file, visits them, gets the redirected links, and stores them back. But it works too slowly on a file with 15k records. How can I make it quick? I have already used threading.
Please help me fix this. I've tried multiple approaches, including threading, but I cannot make it fast. Is there any solution to my problem by any chance?
import concurrent.futures
import sys
import pandas as pd
import requests
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
file_name = None
concurrent = 10000
q = None
count = 0
df = None

def do_work():
    while True:
        global q
        url = q.get()
        res = get_status(url)
        q.task_done()

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            print(count)
            count = count + 1
            return [res.status_code, res.url, o_url]
    except:
        pass
    return [ans.status_code, ans.url, url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    q = Queue(concurrent * 2)
    for i in range(concurrent):
        t = Thread(target=do_work)
        t.daemon = True
        t.start()
    try:
        for url in urls:
            if type(url) != str:
                continue
            q.put(url.strip())
        q.join()
    except KeyboardInterrupt:
        sys.exit(1)

process_data()

for i in range(len(df['shopify'])):
    for j in range(len(old)):
        if df['shopify'][i] == old[j]:
            df['shopify'][i] = out[j]

df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)
The CSV file looks like this:
Email,shopify,Proofy_Status_Name
hello@knobblystudio.com,http://puravidabracelets.myshopify.com,Deliverable
service@cafe-select.co.uk,cafe-select.co.uk,Deliverable
mtafich@gmail.com,,Deliverable
whoopies@stevessnacks.com,stevessnacks.com,Deliverable
customerservice@runwayriches.com,runwayriches.com,Deliverable
shop@blackdogride.com.au,blackdogride.com.au,Deliverable
anavasconcelos.nica@gmail.com,grass4you.com,Deliverable
info@prideandprestigehair.com,prideandprestigehair.com,Deliverable
info@dancinwoofs.com,dancinwoofs.com,Deliverable
Python threads do not run bytecode simultaneously because of the Global Interpreter Lock. You might want to use the multiprocessing module instead, or ProcessPoolExecutor() from concurrent.futures. If you decide to use ProcessPoolExecutor, pass the URLs to the callback and have the callback return the old and redirected URL; that pair is then returned by the result() method of the future you get from executor.submit(). When using processes, global variables are not shared, unlike threads.
There has been an attempt to remove the Global Interpreter Lock, but without the GIL Python code doesn't run quite as fast, if I remember correctly.
Something like the following might work. I renamed the concurrent variable because it would shadow the concurrent module and probably cause an error. This code is untested because I don't have the CSV file to test with.
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import sys
import pandas as pd
import requests
import numpy as np
from threading import Thread
from queue import Queue

out_put_file = ""
linkes = None
out = []
urls = []
old = []
futures = []
file_name = None
concurrent_ = 10000
q = None
count = 0
df = None

def do_work(urls):
    results = []
    for url in urls:
        res = get_status(url)
        if res:
            results.append((res[2], res[1]))
        else:
            results.append((url, url))
    return results

def get_status(o_url):
    try:
        res = requests.get(o_url)
        if res:
            out.append(res.url)
            old.append(o_url)
            # print(count)
            # count = count + 1
            return [res.status_code, res.url, o_url]
    except:
        pass

def load_url(url, timeout):
    ans = requests.get(url, timeout=timeout)
    return [ans.status_code, ans.url, url]

def process_data():
    global q
    global file_name
    global linkes
    global df
    global urls
    file_name = input("Enter file name : ")
    file_name = file_name.strip()
    print("Generating .......")
    df = pd.read_csv(file_name + ".csv")
    old_links = df["shopify"]
    for i in old_links:
        if type(i) != str:
            urls.append(i)
            continue
        if not i.startswith("http"):
            linkes = "http://" + i
            urls.append(linkes)
        else:
            urls.append(i)
    df["shopify"] = urls
    workers = 50
    with ProcessPoolExecutor(max_workers=workers) as executor:
        url_arrays = np.array_split(urls, workers)
        for urls in url_arrays:
            f = executor.submit(do_work, urls)
            futures.append(f)

process_data()

df['shopify'] = [res[1] for f in concurrent.futures.as_completed(futures) for res in f.result()]
df = df[~df['shopify'].astype(str).str.startswith('http:')]
df = df.dropna()
df.to_csv(file_name + "-new.csv", index=False)
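One fragile spot in this sketch is the final list comprehension: it assumes as_completed hands results back in the same order as the DataFrame rows, which is not guaranteed. A more defensive variant (my suggestion, not part of the original answer) maps each original URL to its redirect and applies the mapping with Series.map:

redirects = {}
for f in concurrent.futures.as_completed(futures):
    for original, redirected in f.result():
        redirects[original] = redirected

# Fall back to the original value when a URL produced no result.
df['shopify'] = df['shopify'].map(lambda u: redirects.get(u, u))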

How to read file of URLs and web scrape them with multithreading

I am implementing a web-scraping script in Python that reads a JSON file and gets a list of URLs to scrape.
This file contains over 60K rows, of which around 50K are unique (so I first remove duplicates).
To do this I have the following:
import contextlib
from bs4 import BeautifulSoup
import feedparser
import pandas
import requests
import time

BASE_URL = 'https://www.iso.org'

def create_iso_details_json(p_merged_iso_df):
    merged_iso_details_df = p_merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'], axis=1)
    iso_details_dfs = [parse_iso_details(iso, stage, link)
                       for iso, stage, link in zip(merged_iso_details_df['Standard and/or project'], merged_iso_details_df['Stage'], merged_iso_details_df['Link'])
                       if link != '']
    merged_iso_details_df = pandas.concat(iso_details_dfs)
    print('Total rows retrieved: ', len(merged_iso_details_df.index))
    merged_iso_details_df.to_json('iso_details.json', orient="records")

def parse_iso_details(p_iso, p_stage, p_url):
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')
    try:
        feed_details_url = BASE_URL + soup.find('section', {'id': 'product-details'}).find('a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})['href']
    except AttributeError:
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)
    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)

def read_iso_details(p_feed_details_url, p_iso, p_stage):
    data = {'Standard and/or project': p_iso, 'Stage': p_stage}
    df = pandas.DataFrame(data, index=[0])
    feed = feedparser.parse(p_feed_details_url)
    df['Publication date'] = [entry.published for entry in feed.entries]
    return df

def main():
    start_time = time.time()
    merged_iso_df = pandas.read_json('input_file.json', dtype={"Stage": str})
    create_iso_details_json(merged_iso_df)
    print(f"--- {time.time() - start_time} seconds ---")

if __name__ == "__main__":
    main()
I am merging the results into a pandas DataFrame so I can later write it to another JSON file.
Right now this takes a long time, since the process makes one request per input URL and each request takes between 0.5 and 1 second.
I would like to implement this with multithreading (not multiprocessing) so the processing time decreases significantly.
What is the best approach? Should I split the input JSON file into as many parts as there are threads? And how do I merge the results of each thread into one before writing the output JSON file?
Thank you in advance.
This website explains multithreading pretty well. What you could do is split the URLs into equal parts and run them simultaneously. The drawback is that you essentially just divide the time it would take by the number of threads you use, but to my knowledge this is the best you can do without overcomplicating it.
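As an illustration of that idea (untested, and not part of the original answer), concurrent.futures.ThreadPoolExecutor can do the splitting and merging for you: submit one parse_iso_details call per row and concatenate the returned DataFrames at the end, assuming parse_iso_details stays as defined in the question:

from concurrent.futures import ThreadPoolExecutor

def create_iso_details_json_threaded(p_merged_iso_df, workers=16):
    merged = p_merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'], axis=1)
    rows = [(iso, stage, link)
            for iso, stage, link in zip(merged['Standard and/or project'],
                                        merged['Stage'], merged['Link'])
            if link != '']
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # One task per row; each returns a DataFrame (or None on failure).
        dfs = list(pool.map(lambda args: parse_iso_details(*args), rows))
    result = pandas.concat([d for d in dfs if d is not None])
    result.to_json('iso_details.json', orient="records")

Since the work is network-bound, threads are enough here; the GIL is released while the code waits on the HTTP responses.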
I finally managed to implement the process with multithreading, as @Yui posted in their answer.
The real problem was merging the results of each thread into one, so I decided to have each thread write its result to a CSV file in append mode. Then, when all threads are finished, I read the CSV and write the results into the required JSON file.
# Imports assumed from the original script (the posted snippet starts at the constants).
import contextlib
import os
import time
from queue import Queue
from threading import Thread

from bs4 import BeautifulSoup
import feedparser
import numpy
import pandas
import requests

BASE_URL = 'https://www.iso.org'
NUM_THREADS = 4
q = Queue()
INPUT_FILE = 'iso_tc_ics.json'
ISO_DETAILS_CSV_FILE = 'iso_details.csv'
OUTPUT_FILE = 'iso_details.json'

def create_iso_details_json(p_queue):
    iso_details_df = p_queue.get()
    iso_details_dfs = [parse_iso_details(iso, stage, link)
                       for iso, stage, link in zip(iso_details_df['Standard and/or project'], iso_details_df['Stage'], iso_details_df['Link'])
                       if link != '']
    iso_details_df = pandas.concat(iso_details_dfs)
    print('Rows retrieved: ', len(iso_details_df.index))
    return iso_details_df

def parse_iso_details(p_iso, p_stage, p_url):
    print('URL: ', p_url)
    soup = BeautifulSoup(requests.get(p_url).text, 'html.parser')
    try:
        feed_details_url = BASE_URL + soup.find('section', {'id': 'product-details'}).find('a', {'class': 'ss-icon ss-social-circle text-warning text-sm'})['href']
    except AttributeError:
        print('Could not find feed data for URL: ', p_url)
    print(feed_details_url)
    iso_details_dfs = []
    if feed_details_url is not None:
        iso_details_dfs.append(read_iso_details(feed_details_url, p_iso, p_stage))
    with contextlib.suppress(ValueError):
        return pandas.concat(iso_details_dfs)

def read_iso_details(p_feed_details_url, p_iso, p_stage):
    data = {'Standard and/or project': p_iso, 'Stage': p_stage}
    df = pandas.DataFrame(data, index=[0])
    feed = feedparser.parse(p_feed_details_url)
    df['Publication date'] = [entry.published for entry in feed.entries]
    return df

def main():
    global q
    result_df = create_iso_details_json(q)
    with open(ISO_DETAILS_CSV_FILE, 'a') as f:
        result_df.to_csv(f, mode='a', index=False, header=not f.tell(), encoding='ISO-8859-1')
    q.task_done()

def init():
    merged_iso_df = pandas.read_json(INPUT_FILE, dtype={"Stage": str})
    merged_iso_details_df = merged_iso_df.drop_duplicates(subset=['Link']).drop(columns=['TC', 'ICS'], axis=1)
    iso_details_df_chunks = numpy.array_split(merged_iso_details_df, NUM_THREADS)
    for iso_details_df in iso_details_df_chunks:
        q.put(iso_details_df)
    for _ in range(NUM_THREADS):
        worker = Thread(target=main)
        worker.daemon = True
        worker.start()

def end():
    q.join()
    result_iso_details_df = pandas.read_csv(ISO_DETAILS_CSV_FILE, dtype={"Stage": str}, encoding='ISO-8859-1')
    print('Total rows retrieved: ', len(result_iso_details_df.index))
    result_iso_details_df.to_json(OUTPUT_FILE, orient="records")
    with contextlib.suppress(OSError):
        os.remove(ISO_DETAILS_CSV_FILE)

if __name__ == "__main__":
    start_time = time.time()
    init()
    end()
    print(f"--- {time.time() - start_time} seconds ---")
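One caveat with this approach: DataFrame.to_csv calls from several threads can interleave their output in the shared CSV. A lock around the append (my addition, not something the code above includes) keeps each chunk intact:

import threading

csv_lock = threading.Lock()

def write_result(result_df):
    # Serialize the appends so rows from different threads don't interleave.
    with csv_lock:
        with open(ISO_DETAILS_CSV_FILE, 'a') as f:
            result_df.to_csv(f, mode='a', index=False,
                             header=not f.tell(), encoding='ISO-8859-1')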
I would go with asyncio and aiohttp. Here is a complete example of how to make multiple requests concurrently and collect the results at the end:
import aiohttp
import asyncio

async def geturl(url, session):
    async with session.get(url) as resp:
        if resp.status == 200:
            return (await resp.json())['name']
        else:
            return "ERROR"

async def main():
    urls = [f'https://pokeapi.co/api/v2/pokemon/{i}' for i in range(1, 10)]
    async with aiohttp.ClientSession() as session:
        tasks = [geturl(url, session) for url in urls]
        # asyncio.gather will run all the tasks concurrently
        # and return their results once all tasks have returned
        all_results = await asyncio.gather(*tasks)
        print(all_results)

asyncio.run(main())
This prints the names of the first nine Pokémon, by the way; you can tweak it for your needs.
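Adapting this pattern to the ISO scraping case means doing only the HTML fetch with aiohttp and keeping the BeautifulSoup parsing as it is; a rough sketch (the function names here are mine, not from the question):

import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch_html(url, session):
    async with session.get(url) as resp:
        return await resp.text()

async def scrape_all(urls):
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch_html(u, session) for u in urls))
    # Parsing stays synchronous; only the network I/O runs concurrently.
    return [BeautifulSoup(html, 'html.parser') for html in pages]

# soups = asyncio.run(scrape_all(list_of_links))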

Effective Python Multiprocessing in a list for loop

I'm trying to multiprocess an action inside a for x in y loop. Basically, the concept of the script is to make a request to a site and load a JSON file containing a list of URLs. Once that is fetched, another function is called to parse each URL individually. What I've been trying to do is multiprocess this task with multiprocessing.Process() in order to speed it up, since there are lots of URLs to parse. However, my approach doesn't speed up the process at all; it actually runs at the same speed as with no multiprocessing. It seems to block when using proc.join().
This is the code I've been working on:
import json
import requests
import multiprocessing

def ExtractData(id):
    print("Processing ", id)
    result = requests.get('http://example-index.com/' + id)
    result = result.text.split('\n')[:-1]
    for entry in result:
        data = json.loads(entry)['url']
        print("data is:", data)

def ParseJsonAndCall():
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    t = []
    for results in data:
        print("Processing ", results['url'])
        p = multiprocessing.Process(target=ExtractData, args=(results['id'],))
        t.append(p)
        p.start()
    for proc in t:
        proc.join()

ParseJsonAndCall()
Any help would be greatly appreciated!
A Pool may help.
import multiprocessing as mp

def ParseJsonAndCall():
    url = "https://example-site.com/info.json"
    data = json.loads(requests.get(url).text)
    collect_results = []
    with mp.Pool(processes=mp.cpu_count()) as pool:
        for results in data:
            res = pool.apply_async(ExtractData, [results['id'],])
            collect_results.append(res)
        for res in collect_results:
            res.get()
Although the print statement in ExtractData() might cause a race condition.
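If the goal is to collect the parsed URLs rather than print them, the worker can return its list and the parent can gather everything from the AsyncResult objects. A small sketch along those lines (a hypothetical variant of ExtractData, not the question's exact code):

import json
import multiprocessing as mp
import requests

def extract_data(id_):
    # Returns the URLs instead of printing them.
    result = requests.get('http://example-index.com/' + id_)
    lines = result.text.split('\n')[:-1]
    return [json.loads(entry)['url'] for entry in lines]

def parse_json_and_collect():
    data = json.loads(requests.get("https://example-site.com/info.json").text)
    with mp.Pool(processes=mp.cpu_count()) as pool:
        async_results = [pool.apply_async(extract_data, (item['id'],)) for item in data]
        # get() blocks until each worker finishes and hands back its list.
        return [url for res in async_results for url in res.get()]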

Parallel GET requests for different domains with threading module

I expect to have something like 100k URLs from different domains. I wrote this code, which keeps a list of URLs in all_urls and forms N threads to run in one batch. Currently I'm using the threading module to make these requests in parallel.
import requests
import os
import threading
import time

all_urls = []  # a list of URLs to request, can have up to 100k

global success, fail
success = 0
fail = 0

def func(url_to_request):
    global success, fail
    try:
        r = requests.get(url_to_request, timeout=5)
        c = r.content
        success = success + 1
    except:
        fail = fail + 1
    return

batch_count = 1
N = 200  # number of threads
all_threads_urls = []
time_start = time.time()

for item in all_urls:
    all_threads_urls.append(item)
    if all_urls.index(item) == len(all_urls) - 1 or len(all_threads_urls) == N:
        # call it
        all_threads = []
        for link in all_threads_urls:
            current_thread = threading.Thread(target=func, args=(link,))
            all_threads.append(current_thread)
            current_thread.start()
        for thr in all_threads:
            thr.join()
        all_threads_urls = []  # for the next batch
        time_end = time.time()
        print "Request number", all_urls.index(item) + 1, "Good:", success, "Bad:", fail, "Duration:", round(time_end - time_start, 2), "seconds."
        time_start = time_end
The results are a bit weird: the script starts very fast but then slows down a lot (see image). The printed durations are per batch.
Can someone explain what the bottleneck is here? Is there a better module for this, or is there no way around it?
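One thing worth noting (a guess, not a verified diagnosis): every batch waits for its slowest URL before the next batch starts, and all_urls.index(item) rescans the whole list on every iteration, which gets slower as you move through 100k items. A fixed-size pool keeps N requests in flight the whole time and avoids both; a rough Python 3 sketch with concurrent.futures:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch(url):
    try:
        requests.get(url, timeout=5)
        return True
    except requests.RequestException:
        return False

def run_all(urls, workers=200):
    # The pool keeps `workers` requests in flight at all times,
    # instead of waiting for the slowest URL of each batch.
    success = fail = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for ok in pool.map(fetch, urls):
            if ok:
                success += 1
            else:
                fail += 1
    return success, fail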

get many pages with pycurl?

I want to get many pages from a website, like
curl "http://farmsubsidy.org/DE/browse?page=[0000-3603]" -o "de.#1"
but get the pages' data in Python, not as files on disk.
Can someone please post pycurl code to do this,
or fast urllib2 (not one-at-a-time) if that's possible,
or else say "forget it, curl is faster and more robust"? Thanks.
You have two problems here; let me show you in one example. Notice that pycurl already does the multithreaded, not-one-at-a-time part for you, without any extra work on your side.
#! /usr/bin/env python

import sys, select, time
import pycurl, StringIO

c1 = pycurl.Curl()
c2 = pycurl.Curl()
c3 = pycurl.Curl()
c1.setopt(c1.URL, "http://www.python.org")
c2.setopt(c2.URL, "http://curl.haxx.se")
c3.setopt(c3.URL, "http://slashdot.org")
s1 = StringIO.StringIO()
s2 = StringIO.StringIO()
s3 = StringIO.StringIO()
c1.setopt(c1.WRITEFUNCTION, s1.write)
c2.setopt(c2.WRITEFUNCTION, s2.write)
c3.setopt(c3.WRITEFUNCTION, s3.write)
m = pycurl.CurlMulti()
m.add_handle(c1)
m.add_handle(c2)
m.add_handle(c3)

# Number of seconds to wait for a timeout to happen
SELECT_TIMEOUT = 1.0

# Stir the state machine into action
while 1:
    ret, num_handles = m.perform()
    if ret != pycurl.E_CALL_MULTI_PERFORM:
        break

# Keep going until all the connections have terminated
while num_handles:
    # The select method uses fdset internally to determine which file descriptors
    # to check.
    m.select(SELECT_TIMEOUT)
    while 1:
        ret, num_handles = m.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
            break

# Cleanup
m.remove_handle(c3)
m.remove_handle(c2)
m.remove_handle(c1)
m.close()
c1.close()
c2.close()
c3.close()
print "http://www.python.org is ", s1.getvalue()
print "http://curl.haxx.se is ", s2.getvalue()
print "http://slashdot.org is ", s3.getvalue()
Finally, this code is mainly based on an example from the pycurl site. Maybe you should really read the docs; people spend a huge amount of time on them.
Here is a solution based on urllib2 and threads.
import urllib2
from threading import Thread

BASE_URL = 'http://farmsubsidy.org/DE/browse?page='
NUM_RANGE = range(0000, 3603)
THREADS = 2

def main():
    for nums in split_seq(NUM_RANGE, THREADS):
        t = Spider(BASE_URL, nums)
        t.start()

def split_seq(seq, num_pieces):
    start = 0
    for i in xrange(num_pieces):
        stop = start + len(seq[i::num_pieces])
        yield seq[start:stop]
        start = stop

class Spider(Thread):
    def __init__(self, base_url, nums):
        Thread.__init__(self)
        self.base_url = base_url
        self.nums = nums

    def run(self):
        for num in self.nums:
            url = '%s%s' % (self.base_url, num)
            data = urllib2.urlopen(url).read()
            print data

if __name__ == '__main__':
    main()
You can just put that into a bash script inside a for loop.
However you may have better success at parsing each page using python.
http://www.securitytube.net/Crawling-the-Web-for-Fun-and-Profit-video.aspx
You will be able to get at the exact data and save it at the same time into a db.
http://www.securitytube.net/Storing-Mined-Data-from-the-Web-for-Fun-and-Profit-video.aspx
If you want to crawl a website using Python, you should have a look at Scrapy: http://scrapy.org
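For completeness, a minimal Scrapy spider for the page range in the question might look roughly like this (an untested sketch; the extracted fields are placeholders):

import scrapy

class FarmSubsidySpider(scrapy.Spider):
    name = "farmsubsidy"
    # One start URL per page in the 0000-3603 range from the question.
    start_urls = [
        "http://farmsubsidy.org/DE/browse?page=%04d" % page
        for page in range(0, 3604)
    ]

    def parse(self, response):
        # Placeholder extraction: record the URL and page size per response.
        yield {"url": response.url, "size": len(response.body)}

Scrapy schedules the requests concurrently on its own, so there is no thread management to write.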
Using BeautifulSoup4 and requests -
Grab head page:
from bs4 import BeautifulSoup as Soup
import requests

page = Soup(requests.get(url='http://rootpage.htm').text)
Create an array of requests:
from requests import async
requests = [async.get(url.get('href')) for url in page('a')]
responses = async.map(requests)
[dosomething(response.text) for response in responses]
Requests requires gevent to do this btw.
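As a side note, the requests.async module used above was removed from requests long ago; the same gevent-based pattern now lives in the separate grequests package. A roughly equivalent sketch (assuming page is the BeautifulSoup object from above):

import grequests  # pip install grequests (pulls in gevent)

def fetch_links(page):
    # One pending request per link on the head page, run concurrently on greenlets.
    pending = (grequests.get(a.get('href')) for a in page('a'))
    responses = grequests.map(pending)
    return [r.text for r in responses if r is not None]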
I can recommend the async module of human_curl.
Look at this example:
from urlparse import urljoin
from datetime import datetime

from human_curl.async import AsyncClient
from human_curl.utils import stdout_debug

def success_callback(response, **kwargs):
    """This function is called when a response succeeded"""
    print("success callback")
    print(response, response.request)
    print(response.headers)
    print(response.content)
    print(kwargs)

def fail_callback(request, opener, **kwargs):
    """Collect errors"""
    print("fail callback")
    print(request, opener)
    print(kwargs)

with AsyncClient(success_callback=success_callback,
                 fail_callback=fail_callback) as async_client:
    for x in xrange(10000):
        async_client.get('http://google.com/', params=(("x", str(x)),))
        async_client.get('http://google.com/', params=(("x", str(x)),),
                         success_callback=success_callback, fail_callback=fail_callback)
Usage is very simple. When a page is loaded successfully or fails, async_client calls your callback. You can also specify the number of parallel connections.
