I'm trying to write a huge amount of data to a CSV file. With the normal approach it writes about 50 rows per second, but with multiprocessing it drops to about 5 rows per second.
I also had to add sys.setrecursionlimit(25000), because without it I get an error.
I can tell I'm not doing this right. What is the right way?
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import cchardet
from multiprocessing import Pool
import sys
import time
sys.setrecursionlimit(25000)
csvfileWrite = open("comments.csv", 'a+', newline='', encoding='utf-8')  # declared as a global variable
writer = csv.writer(csvfileWrite, delimiter=';', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)  # declared as a global variable
def kacYildiz(div):  # This function returns a number 0 to 5. Not important.
    yildizSayisi = 0
    yildizYeri = div.find("div", attrs={"class": "RatingPointer-module-1OKF3"})
    yildizlar = yildizYeri.find_all("svg")
    for yildiz in yildizlar:
        sonuc = yildiz.find("path").get("fill")
        if sonuc == "#f28b00":
            yildizSayisi += 1
    return yildizSayisi

def takeText(div):
    comment = div.find("span", attrs={"itemprop": "description"}).text
    return comment
def yorumSayfaSayisi(row):  # This function returns how many pages the site's comment section has. Not important.
    yorumKismi = "-yorumlari?"
    adres = row[0] + yorumKismi
    r = requests_session.get(adres)
    soup = BeautifulSoup(r.text, "lxml")
    sayfaS = soup.find("ul", attrs={"class": "PaginationBar-module-3qhrm"})
    sayi = sayfaS.find_all("li")[-1].text
    return sayi
def writeToCsv(comments):  # writes comments to the csv file.
    global csvfileWrite
    global writer
    textToWrite = takeText(comments)
    writer.writerow([kacYildiz(comments), textToWrite])
if __name__ == '__main__':
    pageNumber = 1
    requests_session = requests.Session()
    comments = list()
    csvfile = open('adresler.csv', newline='')
    reader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for row in reader:
        rowNumber = yorumSayfaSayisi(row)
        for i in range(1, int(rowNumber)):
            comments.clear()
            commetAdress = "-yorumlari?sayfa={}".format(i)
            adress = row[0] + commetAdress
            r = requests_session.get(adress)
            soup = BeautifulSoup(r.text, "lxml")
            page = soup.find_all("div", attrs={"class": "ReviewCard-module-3Y36S"})
            for comment in page:
                comments.append(comment)
            p = Pool(10)
            start = time.process_time()
            p.map(writeToCsv, comments)
            p.terminate()
            p.join()
Try this approach using ThreadPool instead:
from multiprocessing.pool import ThreadPool

def csvYaz(yorumlar):
    global csvfileYaz
    global yazici
    yazi = yorumAl(yorumlar)
    yazici.writerow([kacYildiz(yorumlar), yazi])

# ------main-----
for yorum in yorumSayfasi:
    yorumlar.append(yorum)
threads = ThreadPool(10).map(csvYaz, yorumlar)
for zz in threads:
    print(zz)
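Since the slow part is fetching and parsing rather than the file write itself, another pattern worth sketching (it reuses the question's kacYildiz, takeText, writer and comments objects, so treat it as a fragment rather than a full script) is to let the pool do only the parsing and keep every writerow call in the main thread:

from multiprocessing.pool import ThreadPool

def processComment(div):
    # parse one comment div into a (stars, text) pair; no file access here
    return kacYildiz(div), takeText(div)

# inside the page loop, instead of p.map(writeToCsv, comments):
with ThreadPool(10) as pool:
    for row in pool.imap(processComment, comments):
        writer.writerow(row)   # only the main thread touches the csv writer

This way the shared csv writer is never used from two workers at once, and the pool still overlaps the parsing work.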
I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os

class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        print("Testing the " + sub + " subnet.")
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        print("Tested " + s_ip_addr)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)
        with open('tested.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerow(this_list)
            file.close()

if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        f.close()
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    print(results)
    t1 = time.time()
    print(t1 - t0)
    with open("tested.csv", 'r', newline='') as f_input:
        csv_input = csv.DictReader(f_input)
        data = sorted(csv_input, key=lambda row: (row['IP_Addr']))
        f_input.close()
    with open("sorted.csv", 'w', newline='') as f_output:
        csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
        csv_output.writeheader()
        csv_output.writerows(data)
        f_output.close()
    if os.path.exists("tested.csv"):
        os.remove("tested.csv")
    else:
        print("The file does not exist")
I'm using the class to try to create some kind of shared location that each method can see. I have a feeling the class-level tested_list isn't actually shared between the workers; rather, each worker sees its own copy of tested_list rather than one shared list.
The test_ports module is just a wrapper for some socket operations.
I figured out that there is an important difference between concurrent.futures.ProcessPoolExecutor and concurrent.futures.ThreadPoolExecutor: processes each get their own copy of the data, while threads share the parent's memory. ThreadPoolExecutor does exactly what I wanted, preserving data between threads. The new code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time

class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)

if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    t1 = time.time()
    print(t1 - t0)
    sorted_list = sorted(results, key=lambda x: x[0])
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(sorted_list)
        f.close()
The end result is a sorted list of open and closed SSH and RDP ports.
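To illustrate why the first version lost the appends, here is a minimal, hypothetical demo (not part of the utility) showing that workers in a ProcessPoolExecutor append to their own copy of a module-level list, while ThreadPoolExecutor workers mutate the shared one:

import concurrent.futures

results = []

def collect(n):
    results.append(n)

if __name__ == '__main__':
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as ex:
        list(ex.map(collect, range(5)))
    print(len(results))   # 0: each worker process appended to its own copy

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as ex:
        list(ex.map(collect, range(5)))
    print(len(results))   # 5: threads share the parent process's memory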
I've been trying to download all the files on this page (https://apps.fs.usda.gov/fia/datamart/datamart_excel.html) in bulk, but am having some issues.
All the filenames follow the pattern '{state abbreviation}.xlsm', so I can download a single file with requests using code like this:
import requests

url = 'https://apps.fs.usda.gov/fia/datamart/Workbooks/WA.xlsm'
r = requests.get(url)
with open('WA.xlsm', 'wb') as f:
    f.write(r.content)
I believe there should be a way to incorporate this into a for loop to get all of the files, but I'm at a loss. Any advice?
Thanks!
Try the below
import requests

states = ['WA', 'CA']  # TODO add more states
for state in states:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    with open(f'{state}.xlsm', 'wb') as f:
        f.write(r.content)
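One small addition worth considering (my suggestion, not part of the answer above): raise on HTTP errors before writing, so a typo in a state abbreviation doesn't silently save an error page to disk. The loop body would then look like this:

    r = requests.get(url)
    r.raise_for_status()  # stop on 404/500 instead of writing the error body
    with open(f'{state}.xlsm', 'wb') as f:
        f.write(r.content)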
Just to add on to @balderman's answer: if you have multiple states to get, it might be slightly more efficient to use a threading approach. A straightforward example using concurrent.futures:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from time import time

import requests

states = ['WA', 'CA', 'VA', 'NC']  # TODO add more states

out_dir = Path('temp_files')
out_dir.mkdir(exist_ok=True)

def get_content(state: str) -> bytes:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    return r.content

start = time()
with ThreadPoolExecutor(max_workers=max(10, len(states))) as pool:
    for state, content in zip(states, pool.map(get_content, states)):
        with open(out_dir / f'{state}.xlsm', 'wb') as f:
            f.write(content)
print('Download ThreadExecutor took', time() - start)

# Compare times with below
# start = time()
# for state in states:
#     b = get_content(state)
#     with open(out_dir / f'{state}.xlsm', 'wb') as f:
#         f.write(b)
# print('Download took', time() - start)
I am totally new to Python and just started with it.
I wrote the following script to get some data from the SenseHat, but unfortunately it doesn't do what it should.
It should get the temperature, humidity and pressure from the SenseHat and write them to a file "data.csv" every 10 minutes.
Without the sleep(600) the script works fine, but logs too much data. With the sleep function it seems to do nothing...
This is my script:
from sense_hat import SenseHat
from datetime import datetime
from csv import writer
from time import sleep

sense = SenseHat()

def get_sense_data():
    sense_data = []
    sense_data.append(datetime.now())
    sense_data.append(sense.get_temperature())
    sense_data.append(sense.get_pressure())
    sense_data.append(sense.get_humidity())
    return sense_data

#while True:
#    print(get_sense_data())

with open('data.csv', 'wb') as f:
    data_writer = writer(f)
    data_writer.writerow(['time','temp','pres','hum'])
    while True:
        data = get_sense_data()
        data_writer.writerow(data)
        sleep(600)
Could anyone please help me and explain what is wrong?
Output to files is buffered. If you want to see the intermediate results of the script while it's sleeping, you need to flush the buffer.
with open('data.csv', 'wb') as f:
    data_writer = writer(f)
    data_writer.writerow(['time','temp','pres','hum'])
    while True:
        data = get_sense_data()
        data_writer.writerow(data)
        f.flush()
        sleep(600)
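An alternative sketch (my own suggestion, assuming Python 3 conventions for the csv module rather than the 'wb' mode above): reopen the file for each sample, so every row is flushed and closed as soon as it is written.

from csv import writer
from time import sleep

with open('data.csv', 'w', newline='') as f:
    writer(f).writerow(['time', 'temp', 'pres', 'hum'])

while True:
    data = get_sense_data()                     # defined in the question's script
    with open('data.csv', 'a', newline='') as f:
        writer(f).writerow(data)                # file is closed, so the row is on disk
    sleep(600)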
I'm trying to pass URLs from a .csv file to a function that sends requests to the SMMRY API. The .csv has a column labeled 'url', and the API summarizes websites using SMMRY (https://smmry.com/) and asyncio. The smmrpy module creates an "article" object, and while I can print its properties, I want to pass a list of URLs to the function and have it loop and print summarizations until complete.
The problem is, the URLs aren't being passed to the function. Below is my code:
import time
import csv
import asyncio
import smmrpy

s = smmrpy.SMMRPY("ABCDEFGHI")

with open('Dec1.csv') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        URL = (row['url'])

async def main():
    article = await s.get_smmry(URL)
    global contents
    contents = article.content
    #print(contents)
    print(article.keywords)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print(contents)
I can't test this, but try:
import time
import csv
import asyncio
import smmrpy

async def main():
    s = smmrpy.SMMRPY("ABCDEFGHI")
    with open('Dec1.csv') as csvFile:
        reader = csv.DictReader(csvFile)
        for row in reader:
            URL = (row['url'])
            article = await s.get_smmry(URL)
            contents = article.content
            print(contents)
            print(article.keywords)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
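If the summaries should also run concurrently rather than one at a time, a further sketch (untested, assuming smmrpy's get_smmry coroutine and the article attributes exactly as used in the question) could collect the URLs first and then await them together with asyncio.gather:

import csv
import asyncio
import smmrpy

async def main():
    s = smmrpy.SMMRPY("ABCDEFGHI")
    with open('Dec1.csv') as csvFile:
        urls = [row['url'] for row in csv.DictReader(csvFile)]
    # fire off all requests at once and wait for every summary
    articles = await asyncio.gather(*(s.get_smmry(url) for url in urls))
    for article in articles:
        print(article.content)
        print(article.keywords)

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())

If the API rate-limits concurrent requests, you may need to throttle how many coroutines run at once.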
I am a gevent newbie, but I think I got it working, in a limited sense. Basically, for pools of 1 the code proceeds, while for larger pools the code gets stuck, usually within the first pool (e.g. with a pool of 5, I see 3 greenlets finishing, but not more). What is going wrong? Spawn? Join?
I cannot verify whether the remote server gets confused by multiple queries, but it has no problem with a rapid sequence of serial requests, so probably not…
(I share the code in its entirety as I am not sure where the bug is. Thanks for bearing with me.)
from urllib2 import urlopen
from lxml.etree import parse
import os, csv, cStringIO, codecs, pickle
from selenium import webdriver
from time import sleep
import gevent
from gevent import socket
from gevent import monkey, pool
# patches stdlib (including socket and ssl modules) to cooperate with other greenlets
monkey.patch_all()

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
os.chdir('/Users/laszlosandor/Downloads/kozbeszerzes')

HOSTNAME = 'http://kozbeszerzes.ceu.hu'

driver = webdriver.Chrome()
results = set()
for y in xrange(1998, 2015):
    for p in xrange(0, 9999):
        driver.get('http://kozbeszerzes.ceu.hu/searchresults.xhtml?q={}&page={}'.format(y, p))
        sleep(1)
        if len(driver.find_elements_by_class_name('result')) == 0:
            break
        for e in driver.find_elements_by_class_name('result'):
            link = e.find_element_by_tag_name('a')
            r = link.get_attribute('href').encode('ascii', 'ignore')
            if r[:34] == 'http://kozbeszerzes.ceu.hu/tender/':
                results.add(r)
driver.quit()

with open('list_of_urls', 'wb') as f:
    pickle.dump(results, f)
#with open('list_of_urls', 'r') as f:
#    results = pickle.load(f)

entities = set()
header = ('TenderID','RequestorName','URL','Year','RequestorID','Subject','SourceURL','EstValue','Currency','DecisionDate','Value','VAT')

# """Spawn multiple workers and wait for them to complete"""
# # limit ourselves to max 10 simultaneous outstanding requests
p = pool.Pool(10)

f = open('tenders.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)
def workres(res):
    try:
        tender = parse(urlopen(res)).getroot()
        print ('%s succeeded' % res)
        for requestor in tender.findall('requestor'):
            entities.add(HOSTNAME + requestor.get('url'))
        id = tender.get('id')
        reqname = tender.get('requestor')
        url = tender.get('url')
        year = tender.get('year')
        reqid = tender.get('requestor_id')
        subject = tender.get('subject')
        source = tender.get('source_url')
        estval = tender.get('estimated_value')
        for part in tender.findall('./parts/part'):
            winner = part.find('winner')
            entities.add(HOSTNAME + winner.get('url'))
            curr = part.find('currency').text
            date = part.find('decisionDate').text
            value = part.find('value').text
            vat = part.find('vat').text
            row = id, reqname, url, year, reqid, subject, source, estval, curr, date, value, vat
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (res, ex))

jobs = [p.spawn(workres, res) for res in results]
p.join()
f.close()

with open('entities', 'wb') as f:
    pickle.dump(entities, f)
header = ['ID','URL','Name','NominalCity','City', 'ZIP', 'Address']
f = open('entities.csv', 'w')
f.write(codecs.BOM_UTF8)
writer = UnicodeWriter(f)
writer.writerow(header)

def workent(ent):
    try:
        ent = parse(urlopen(ent)).getroot()
        print ('%s succeeded' % ent)
        id = ent.get('id')
        url = ent.get('url')
        name = ent.get('name')
        nominalcity = ent.get('city')
        cities = ent.findall('./resolved_addresses/whitelistAddress/city')
        zips = ent.findall('./resolved_addresses/whitelistAddress/postalCode')
        streets = ent.findall('./resolved_addresses/whitelistAddress/street')
        for a in xrange(0, len(cities)):
            city = cities[a].text
            zip = zips[a].text
            street = streets[a].text
            row = id, url, name, nominalcity, city, zip, street
            writer.writerow(row)
    except socket.gaierror:
        ex = sys.exc_info()[1]
        print ('%s failed with %s' % (ent, ex))

jobs = [p.spawn(workent, ent) for ent in entities]
p.join()
f.close()
I see several mistakes here:
1. gevent.sleep() is not used; time.sleep() is used instead, which is blocking.
2. Your variable names are too short. You could add descriptions of what each part of the code is supposed to do; for example, the variable 'p' is used twice.
3. There are URL fetches using both urlopen and the webdriver module, which is confusing.
4. I would use queues between the different workers and have just one worker do the writerow calls and handle the file access; right now you have multiple greenlets accessing the same file (see the sketch below).
5. Use fewer list comprehensions; just write out the loops.
6. I would suggest putting the try/except in workres only around the parse(urlopen()) call; maybe more exceptions are happening that you currently don't see.
More tips for gevent
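Here is a rough sketch of the single-writer/queue pattern from point 4 (untested; the results list and output filename are placeholders, and the real parsing from the question is reduced to a stub):

import csv
import gevent
from gevent import monkey, pool
from gevent.queue import Queue
monkey.patch_all()

row_queue = Queue()
STOP = object()                 # sentinel telling the writer greenlet to stop

def writer_loop(writer):
    # the only greenlet that ever touches the csv writer / file handle
    while True:
        row = row_queue.get()
        if row is STOP:
            break
        writer.writerow(row)

def workres(res):
    # stand-in for the parse(urlopen(res)) work in the question; instead of
    # writing directly, hand the finished row to the queue
    row_queue.put((res, 'parsed fields would go here'))

results = ['http://example.com/tender/1', 'http://example.com/tender/2']  # placeholder

with open('tenders.csv', 'w') as f:
    writer = csv.writer(f)
    writer_greenlet = gevent.spawn(writer_loop, writer)
    p = pool.Pool(10)
    for res in results:
        p.spawn(workres, res)
    p.join()                    # wait for all worker greenlets
    row_queue.put(STOP)         # then tell the writer to finish
    writer_greenlet.join()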