I'm trying to pass URLs from a .csv file to a function that sends requests to the SMMRY API (https://smmry.com/), which summarizes websites, using asyncio. The .csv has a column labeled 'url'. The smmrpy module creates an "article" object, and while I can print its properties, I want to pass a list of URLs to the function and have it loop, printing summarizations until complete.
The problem is that the URLs aren't being passed to the function. Below is my code:
import time
import csv
import asyncio
import smmrpy

s = smmrpy.SMMRPY("ABCDEFGHI")

with open('Dec1.csv') as csvFile:
    reader = csv.DictReader(csvFile)
    for row in reader:
        URL = (row['url'])

async def main():
    article = await s.get_smmry(URL)
    global contents
    contents = article.content
    #print(contents)
    print(article.keywords)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    print(contents)
I can't test this, but try moving the loop inside main() so each URL is summarized as it is read:
import time
import csv
import asyncio
import smmrpy

async def main():
    s = smmrpy.SMMRPY("ABCDEFGHI")
    with open('Dec1.csv') as csvFile:
        reader = csv.DictReader(csvFile)
        for row in reader:
            URL = (row['url'])
            article = await s.get_smmry(URL)
            contents = article.content
            print(contents)
            print(article.keywords)

if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
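For what it's worth, the loop above awaits each request one at a time; since the URLs are independent, they could also be fired concurrently with asyncio.gather. A minimal sketch, assuming s.get_smmry is a coroutine as in the code above:

import asyncio
import csv
import smmrpy

async def summarize(s, url):
    # Fetch one summary and print it; assumes get_smmry is awaitable.
    article = await s.get_smmry(url)
    print(article.content)
    print(article.keywords)

async def main():
    s = smmrpy.SMMRPY("ABCDEFGHI")
    with open('Dec1.csv') as csvFile:
        urls = [row['url'] for row in csv.DictReader(csvFile)]
    # Launch all requests at once and wait for them all to finish.
    await asyncio.gather(*(summarize(s, url) for url in urls))

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(main())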
I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os

class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        print("Testing the " + sub + " subnet.")
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        print("Tested " + s_ip_addr)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)
        with open('tested.csv', 'a') as file:
            writer = csv.writer(file)
            writer.writerow(this_list)

if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    print(results)
    t1 = time.time()
    print(t1 - t0)
    with open("tested.csv", 'r', newline='') as f_input:
        csv_input = csv.DictReader(f_input)
        data = sorted(csv_input, key=lambda row: row['IP_Addr'])
    with open("sorted.csv", 'w', newline='') as f_output:
        csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
        csv_output.writeheader()
        csv_output.writerows(data)
    if os.path.exists("tested.csv"):
        os.remove("tested.csv")
    else:
        print("The file does not exist")
I'm using the class to try to create some kind of shared location that each method can see. I have a feeling the class-level tested_list is not actually shared: each worker seems to be seeing its own instance of tested_list rather than one shared list.
The test_ports module is just a wrapper for some socket operations.
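(For context, a hypothetical sketch of such a wrapper, not the actual module, could be as simple as a connect_ex check with a timeout:)

import socket

class TestPort:
    # Hypothetical stand-in for test_ports.TestPort: returns True if a
    # TCP connect to the given port succeeds within the timeout.
    def _check(self, ip, port, timeout=1.0):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.settimeout(timeout)
            return s.connect_ex((ip, port)) == 0

    def test_ssh(self, ip):
        return self._check(ip, 22)

    def test_rdp(self, ip):
        return self._check(ip, 3389)  # the question's header labels RDP as 443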
I figured out that there is a significant difference between concurrent.futures.ProcessPoolExecutor and concurrent.futures.ThreadPoolExecutor: each worker process gets its own copy of the check_subnets object, so appends made in child processes never reach the parent, whereas threads all share the same object. ThreadPoolExecutor does exactly what I wanted, preserving data between threads. The new code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time

class check_subnets(object):
    def __init__(self):
        self.tested_list = []

    def setup(self, l_subnets):
        with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
            executor.map(self.subnet_search, l_subnets)
        return self.tested_list

    def subnet_search(self, sub):
        with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
            executor2.map(self.ip_search, ipaddress.IPv4Network(sub))

    def ip_search(self, ip):
        test = test_ports.TestPort()
        s_ip_addr = str(ip)
        test_ssh = test.test_ssh(s_ip_addr)
        test_rdp = test.test_rdp(s_ip_addr)
        this_list = [s_ip_addr, test_ssh, test_rdp]
        self.tested_list.append(this_list)

if __name__ == '__main__':
    subnets = pandas.read_csv('hosts.csv')
    list_subnets = subnets['Subnet'].values.tolist()
    t0 = time.time()
    checker = check_subnets()
    results = checker.setup(list_subnets)
    t1 = time.time()
    print(t1 - t0)
    sorted_list = sorted(results, key=lambda x: x[0])
    fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
    with open('tested.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(fields)
        write.writerows(sorted_list)
The end result is a sorted list of open and closed SSH and RDP ports.
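For what it's worth, ProcessPoolExecutor could also be made to work by collecting the values that executor.map yields instead of mutating shared state, since each process only mutates its own copy of the object. A minimal sketch, with probe standing in for a process-safe version of ip_search:

import concurrent.futures
import ipaddress

def probe(ip):
    # Hypothetical stand-in for ip_search that returns its row
    # instead of appending to shared state.
    s_ip_addr = str(ip)
    return [s_ip_addr, False, False]

def gather(subnet):
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # map sends each result back to the parent process, so no shared list is needed.
        return list(executor.map(probe, ipaddress.IPv4Network(subnet)))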
I am trying to create a GUI with a file explorer to select a CSV file; the program should go through the file, fetch the URL from each row, and perform a requests.get on it. However, I am getting only one row of output, as shown in the screenshot.
code:
import csv
import requests
from gooey import Gooey, GooeyParser
from checkurl import url_status

@Gooey(program_name="My program")
def parse_args():
    UI = GooeyParser()
    UI.add_argument('data_file',
                    action='store',
                    widget='FileChooser',
                    help="Source Excel file")
    mainUI = UI.parse_args()
    return mainUI

if __name__ == '__main__':
    args = parse_args()
    input_file = args.data_file
    output = url_status(input_file)
    mainUI = parse_args()
    print(output)
function:
import requests
import csv

def url_status(file_path):
    with open(file_path, "r") as file:
        reader = csv.reader(file, delimiter=",")
        my_list = list(reader)
        for row in my_list:
            name, url = row
            response = requests.get(url)
            result = "{}: {}".format(name, response)
        return result
This happens because result is reassigned on every pass through the loop by the line result = "{}: {}".format(name, response), so only the last item is returned.
Instead, you can build a list of results, return it, and print the results in a loop.
def url_status(file_path):
    with open(file_path, "r") as file:
        reader = csv.reader(file, delimiter=",")
        my_list = list(reader)
        results = []
        for row in my_list:
            name, url = row
            response = requests.get(url)
            results.append("{}: {}".format(name, response))
        return results
Then print the results one by one by iterating over the results list:
if __name__ == '__main__':
    args = parse_args()
    input_file = args.data_file
    output = url_status(input_file)
    for out in output:
        print(out)
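An alternative sketch, if results should appear as each request finishes rather than after the whole file is processed, is to make url_status a generator with yield; the calling loop stays the same:

import csv
import requests

def url_status(file_path):
    # Generator variant: yields each result as soon as its request completes,
    # so the caller can print progressively.
    with open(file_path, "r") as file:
        for name, url in csv.reader(file, delimiter=","):
            response = requests.get(url)
            yield "{}: {}".format(name, response)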
I'm trying to write a huge amount of data to a CSV file. With the normal method it writes about 50 rows per second, but with multiprocessing it drops to about 5 rows per second.
I also had to add sys.setrecursionlimit(25000), because without it I get an error.
I can tell I'm not doing this right. What is the right way?
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import cchardet
from multiprocessing import Pool
import sys
import time

sys.setrecursionlimit(25000)

csvfileWrite = open("comments.csv", 'a+', newline='', encoding='utf-8')  # declared as a global variable
writer = csv.writer(csvfileWrite, delimiter=';', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)  # declared as a global variable

def kacYildiz(div):  # This function returns a number from 0 to 5. Not important.
    yildizSayisi = 0
    yildizYeri = div.find("div", attrs={"class": "RatingPointer-module-1OKF3"})
    yildizlar = yildizYeri.find_all("svg")
    for yildiz in yildizlar:
        sonuc = yildiz.find("path").get("fill")
        if sonuc == "#f28b00":
            yildizSayisi += 1
    return yildizSayisi

def takeText(div):
    comment = div.find("span", attrs={"itemprop": "description"}).text
    return comment

def yorumSayfaSayisi(row):  # Returns how many pages are in the site's comment section. Not important.
    yorumKismi = "-yorumlari?"
    adres = row[0] + yorumKismi
    r = requests_session.get(adres)
    soup = BeautifulSoup(r.text, "lxml")
    sayfaS = soup.find("ul", attrs={"class": "PaginationBar-module-3qhrm"})
    sayi = sayfaS.find_all("li")[-1].text
    return sayi

def writeToCsv(comments):  # Writes comments to the CSV file.
    global csvfileWrite
    global writer
    textToWrite = takeText(comments)
    writer.writerow([kacYildiz(comments), textToWrite])

if __name__ == '__main__':
    pageNumber = 1
    requests_session = requests.Session()
    comments = list()
    csvfile = open('adresler.csv', newline='')
    reader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for row in reader:
        rowNumber = yorumSayfaSayisi(row)
        for i in range(1, int(rowNumber)):
            comments.clear()
            commetAdress = "-yorumlari?sayfa={}".format(i)
            adress = row[0] + commetAdress
            r = requests_session.get(adress)
            soup = BeautifulSoup(r.text, "lxml")
            page = soup.find_all("div", attrs={"class": "ReviewCard-module-3Y36S"})
            for comment in page:
                comments.append(comment)
            p = Pool(10)
            start = time.process_time()
            p.map(writeToCsv, comments)
            p.terminate()
            p.join()
Try this approach using ThreadPool instead. The work is I/O-bound, and multiprocessing has to pickle every BeautifulSoup comment to ship it to a worker process (which is also why the recursion limit had to be raised), so threads are a better fit here:
from multiprocessing.pool import ThreadPool

def csvYaz(yorumlar):
    global csvfileYaz
    global yazici
    yazi = yorumAl(yorumlar)
    yazici.writerow([kacYildiz(yorumlar), yazi])

# ------main-----
for yorum in yorumSayfasi:
    yorumlar.append(yorum)
threads = ThreadPool(10).map(csvYaz, yorumlar)
for zz in threads:
    print(zz)
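Mapped back onto the names from the question, a minimal sketch of the same idea, with one thread pool created once and reused for every page instead of a new Pool per page (all_pages is a hypothetical iterable of parsed comment lists):

from multiprocessing.pool import ThreadPool

pool = ThreadPool(10)
for page_comments in all_pages:          # hypothetical: one list of parsed comments per page
    pool.map(writeToCsv, page_comments)  # writeToCsv is the function from the question
pool.close()
pool.join()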
I am writing a CSV file with the following function:
import csv
import os
import aiofiles

async def write_extract_file(output_filename: str, csv_list: list):
    """
    Write the extracted content into the file
    """
    try:
        async with aiofiles.open(output_filename, "w+") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=columns.keys())
            writer.writeheader()
            writer.writerows(csv_list)
    except FileNotFoundError:
        print("Output file not present", output_filename)
        print("Current dir: ", os.getcwd())
        raise FileNotFoundError
However, csv's writerows cannot be awaited, so no rows end up being written to the CSV file.
How can I resolve this issue? Is there a workaround available?
Thank you.
Entire code can be found here.
You can use aiocsv. Here is a quick example of writing a row to a CSV file asynchronously:
import asyncio
import aiofiles
from aiocsv import AsyncWriter

async def main():
    async with aiofiles.open('your-path.csv', 'w') as f:
        writer = AsyncWriter(f)
        await writer.writerow(['name', 'age'])
        await writer.writerow(['John', 25])

asyncio.run(main())
For more examples follow: https://pypi.org/project/aiocsv/
In my opinion, it's better not to mix aiofiles with the csv module at all; instead, run the synchronous code through loop.run_in_executor and await it asynchronously, like below:
import asyncio
import csv
import os

def write_extract_file(output_filename: str, csv_list: list):
    """
    Write the extracted content into the file
    """
    try:
        with open(output_filename, "w+") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=columns.keys())
            writer.writeheader()
            writer.writerows(csv_list)
    except FileNotFoundError:
        print("Output file not present", output_filename)
        print("Current dir: ", os.getcwd())
        raise FileNotFoundError

async def main():
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, write_extract_file, 'test.csv', csv_list)
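On Python 3.9+, asyncio.to_thread is a shorter spelling of the same idea:

import asyncio

async def main():
    # Runs the synchronous writer in a worker thread, like run_in_executor(None, ...).
    await asyncio.to_thread(write_extract_file, 'test.csv', csv_list)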
You can use aiofiles; you just gotta convert the dict to a row :)
import aiofiles

async def write_extract_file(
    output_filename: str, csv_list: list
):
    cols = columns.keys()
    async with aiofiles.open(output_filename, mode='w+') as f_out:
        await f_out.write(','.join(cols) + '\n')
        for data in csv_list:
            line = []
            for c in cols:
                line.append(str(data[c]) if c in data else '')
            line = ','.join(line) + '\n'
            await f_out.write(line)
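One caveat with hand-joining: fields containing commas, quotes, or newlines won't be escaped. A sketch that keeps aiofiles but lets the csv module handle the quoting, by serializing the rows into an in-memory buffer first (columns comes from the question's surrounding code):

import csv
import io
import aiofiles

async def write_extract_file(output_filename: str, csv_list: list):
    # Let csv.DictWriter handle quoting by writing into a StringIO buffer,
    # then flush the whole buffer asynchronously in one awaited write.
    buffer = io.StringIO()
    writer = csv.DictWriter(buffer, fieldnames=columns.keys(), restval='')
    writer.writeheader()
    writer.writerows(csv_list)
    async with aiofiles.open(output_filename, mode='w+') as f_out:
        await f_out.write(buffer.getvalue())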
I am trying to write a Python script that creates a CSV file whose first row holds the distinct AWS EC2 instance tag names from all accounts, and then populates the file with the corresponding tag values from all instances. I am able to create the header of the CSV file and to generate the rows of values. I believe where I am struggling is in properly appending each row. My complete code is pasted below. Please advise. Thanks.
I believe, in the code, in this particular place, I am making some mistake:
with open(output_file, 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
    #print(row)
    #sys.exit()
    #pass
#!/usr/bin/env python
import boto3
import botocore
import argparse
import csv
import logging
import datetime
import click
import yaml
import os
import time
import sys

logging.basicConfig(filename='tag-export.log', filemode='a', format='%(levelname)s - %(message)s')
logger = logging.getLogger()
logger.setLevel(logging.INFO)

targetaccount = 'allaccounts'

# GLOBAL SESSION
sts = boto3.client('sts')

def aws_session(account_id, session_name, role_name):
    """
    Function that creates the boto3 session by assuming a cross-account role.
    Uses boto3 to make calls to the STS API.
    """
    # (body omitted in the question)

def account_name(session):
    """
    Function that resolves the account name (alias) to make output human friendly.
    Uses boto3 to make calls to the IAM API.
    """
    # (body omitted in the question)

def get_instances(filters=[]):
    reservations = {}
    try:
        reservations = ec2.describe_instances(
            Filters=filters
        )
    except botocore.exceptions.ClientError as e:
        print(e.response['Error']['Message'])
    instances = []
    for reservation in reservations.get('Reservations', []):
        for instance in reservation.get('Instances', []):
            instances.append(instance)
    return instances

@click.command()
@click.option('--config_file', required=True, prompt=True, default=lambda: str(os.getcwd()) + '/config.yml', show_default='config.yml')
def main(config_file):
    try:
        with open(config_file, 'r') as ymlfile:
            config = yaml.load(ymlfile)
    except Exception as e:
        logger.error('Unable to open config file: ' + str(e))
        sys.exit()

    # globals
    if 'accounts' in config:
        accounts = config['accounts']
    if 'role_name' in config:
        role_name = config['role_name']

    # First pass: gather the distinct tag keys across all accounts.
    tag_set = []
    for account in accounts:
        logger.info('dispatching session call for account: ' + str(account))
        if str(account) == str(config['sourceaccount']):
            session = boto3.Session(region_name=config['region'])
        else:
            session = aws_session(account_id=str(account), session_name='cross-account-assume-role', role_name=role_name)
        if session:
            AccountName = account_name(session)
            logger.info('Working on account: ' + AccountName)
            print('Working on gathering tags and sorting them.....Wait...: ')
            global ec2
            ec2 = session.client('ec2', region_name='ap-southeast-2')
            output_file = "{}-tags.csv".format(targetaccount)
            instances = get_instances()
            for instance in instances:
                for tag in instance.get('Tags', []):
                    if tag.get('Key'):
                        tag_set.append(tag.get('Key'))
            tag_set = sorted(list(set(tag_set)))
        else:
            print("did not get session")
            sys.exit()

    # Second pass: write the header, then one row of tag values per instance.
    if tag_set:
        print('Tag Gathering Completed! Moving to each account to get Tag Values')
        with open(output_file, 'a', newline='') as csvfile:
            fieldnames = ['Account'] + ['InstanceId'] + tag_set
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
            writer.writeheader()
        for account in accounts:
            if str(account) == str(config['sourceaccount']):
                session = boto3.Session(region_name=config['region'])
            else:
                session = aws_session(account_id=str(account), session_name='cross-account-assume-role', role_name=role_name)
            if session:
                AccountName = account_name(session)
                logger.info('Working on account: ' + AccountName)
                print('Working on account: ' + AccountName + '....')
                instances = get_instances()
                for instance in instances:
                    row = {}
                    row['Account'] = AccountName
                    row['InstanceId'] = instance.get('InstanceId')
                    for tag in instance.get('Tags', []):
                        for vtag in tag_set:
                            if vtag == tag.get('Key'):
                                row[tag.get('Key')] = tag.get('Value')
                    with open(output_file, 'a', newline='') as f:
                        writer = csv.writer(f)
                        writer.writerow(row)
            else:
                print("did not get session")

if __name__ == "__main__":
    main()
I resolved this issue by changing the logic. Instead of doing it all in one pass, I first built a dictionary of tags, then built another dictionary from the CSV file, and then wrote it all out in one chunk, like the following:
with open(output_file, 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(row)
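For what it's worth, when row is a dict, csv.writer.writerow(row) iterates over the dict and writes its keys, not its values; csv.DictWriter keyed on the header fields handles dict rows and missing tags directly. A minimal sketch, with hypothetical values standing in for the script's variables:

import csv

# Hypothetical header and row, standing in for fieldnames and row above.
fieldnames = ['Account', 'InstanceId', 'Environment', 'Name']
row = {'Account': 'prod-account', 'InstanceId': 'i-0abc123', 'Name': 'web-1'}

with open('allaccounts-tags.csv', 'a', newline='') as f:
    # extrasaction='ignore' skips unknown keys; restval fills missing columns.
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore', restval='')
    writer.writerow(row)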