url function showing only 1 row? - python

i am trying to create a gui which with file explorer to select csv file and the program will go through the file fetching the url from each row and performing a requests.get however i am getting only one row output as shown in the screenshot
code:
import csv
import requests
from gooey import Gooey, GooeyParser
from checkurl import url_status
#Gooey(program_name="My program")
def parse_args():
UI = GooeyParser()
UI.add_argument('data_file',
action='store',
widget='FileChooser',
help="Source Excel file")
mainUI = UI.parse_args()
return mainUI
if __name__ == '__main__':
args = parse_args()
input_file=args.data_file
output= url_status(input_file)
mainUI = parse_args()
print(output)
function:
import requests
import csv
def url_status(file_path):
with open(file_path, "r") as file:
reader = csv.reader(file, delimiter=",")
my_list = list(reader)
for row in my_list:
name, url = row
response = requests.get(url)
result = "{}: {}".format(name, response)
return result

It is because you create the result object every time you enter the loop and assign it with the new result with this line result = "{}: {}".format(name, response) and you return only the last item.
You can create a list of results and return that and print the results in a loop.
def url_status(file_path):
with open(file_path, "r") as file:
reader = csv.reader(file, delimiter=",")
my_list = list(reader)
results = []
for row in my_list:
name, url = row
response = requests.get(url)
results.append("{}: {}".format(name, response))
return results
And print the result one by one by iterating over the results list as:
if __name__ == '__main__':
args = parse_args()
input_file=args.data_file
output = url_status(input_file)
mainUI = parse_args()
for out in output:
print(out)

Related

Storing result of a thread or process with concurrent.futures

I'm writing a utility I can use to check ports on many subnets. Currently I'm adding my results to a csv file and then sorting the file. I would like to instead add my results to a single list and then output the list so I'm doing fewer file open/close operations. I cannot seem to figure out how to make my results persist between threads. Below is my code:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
import os
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
print("Testing the " + sub + " subnet.")
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
print("Tested " + s_ip_addr)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
with open('tested.csv', 'a') as file:
writer = csv.writer(file)
writer.writerow(this_list)
file.close()
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
f.close()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
print(results)
t1 = time.time()
print(t1-t0)
with open("tested.csv", 'r',newline='') as f_input:
csv_input = csv.DictReader(f_input)
data = sorted(csv_input, key=lambda row: (row['IP_Addr']))
f_input.close()
with open("sorted.csv", 'w', newline='') as f_output:
csv_output = csv.DictWriter(f_output, fieldnames=csv_input.fieldnames)
csv_output.writeheader()
csv_output.writerows(data)
f_output.close()
if os.path.exists("tested.csv"):
os.remove("tested.csv")
else:
print("The file does not exist")
I'm using the class to try and create some kind of location each method would see. I have a feeling the class-specific tested_list is not available to each thread, rather each thread is seeing one instance of tested_list and not a shared list.
The test_ports module is just a wrapper for some socket operations.
I figured out that there is a small difference in concurrent.futures.ProcessPoolExecutor
and
concurrent.futures.ThreadPoolExecutor
ThreadPoolExecutor is doing exactly what I wanted, preserving data between threads. New code looks like this:
import csv
import test_ports
import pandas
import ipaddress
import concurrent.futures
import time
class check_subnets(object):
def __init__(self):
self.tested_list = []
def setup(self, l_subnets):
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
executor.map(self.subnet_search, l_subnets)
return self.tested_list
def subnet_search(self, sub):
with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor2:
executor2.map(self.ip_search, ipaddress.IPv4Network(sub))
def ip_search(self, ip):
test = test_ports.TestPort()
s_ip_addr = str(ip)
test_ssh = test.test_ssh(s_ip_addr)
test_rdp = test.test_rdp(s_ip_addr)
this_list = [s_ip_addr, test_ssh, test_rdp]
self.tested_list.append(this_list)
if __name__ == '__main__':
subnets = pandas.read_csv('hosts.csv')
list_subnets = subnets['Subnet'].values.tolist()
t0 = time.time()
checker = check_subnets()
results = checker.setup(list_subnets)
t1 = time.time()
print(t1-t0)
sorted_list = (sorted(results, key=lambda x: x[0]))
fields = ['IP_Addr', "SSH(22)", "RDP(443)"]
with open('tested.csv', 'w') as f:
write = csv.writer(f)
write.writerow(fields)
write.writerows(sorted_list)
f.close()
The end result is a sorted list of opened and closed ssh and rdp ports.

Export all table data from PDF to Excel using Amazon textract

Looking out to extract PDF data to Excel/CSV using Amazon Textract. How we can Insert the Input PDF data from the local folder.
Having PDF with multiple Tables, we need to extract all the tables from their respective pages and export the data to CSV/Excel files. which can be used for further analysis.
Piece of code received from AWS but could not understand how input pdf file can be taken up into the script.
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
rows = {}
for relationship in table_result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
cell = blocks_map[child_id]
if cell['BlockType'] == 'CELL':
row_index = cell['RowIndex']
col_index = cell['ColumnIndex']
if row_index not in rows:
# create new row
rows[row_index] = {}
# get the text value
rows[row_index][col_index] = get_text(cell, blocks_map)
return rows
def get_text(result, blocks_map):
text = ''
if 'Relationships' in result:
for relationship in result['Relationships']:
if relationship['Type'] == 'CHILD':
for child_id in relationship['Ids']:
word = blocks_map[child_id]
if word['BlockType'] == 'WORD':
text += word['Text'] + ' '
if word['BlockType'] == 'SELECTION_ELEMENT':
if word['SelectionStatus'] =='SELECTED':
text += 'X '
return text
def get_table_csv_results(file_name):
with open(file_name, 'rb') as file:
img_test = file.read()
bytes_test = bytearray(img_test)
print('Image loaded', file_name)
# process using image bytes
# get the results
client = boto3.client('textract')
response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])
# Get the text blocks
blocks=response['Blocks']
pprint(blocks)
blocks_map = {}
table_blocks = []
for block in blocks:
blocks_map[block['Id']] = block
if block['BlockType'] == "TABLE":
table_blocks.append(block)
if len(table_blocks) <= 0:
return "<b> NO Table FOUND </b>"
csv = ''
for index, table in enumerate(table_blocks):
csv += generate_table_csv(table, blocks_map, index +1)
csv += '\n\n'
return csv
def generate_table_csv(table_result, blocks_map, table_index):
rows = get_rows_columns_map(table_result, blocks_map)
table_id = 'Table_' + str(table_index)
# get cells.
csv = 'Table: {0}\n\n'.format(table_id)
for row_index, cols in rows.items():
for col_index, text in cols.items():
csv += '{}'.format(text) + ","
csv += '\n'
csv += '\n\n\n'
return csv
def main(file_name):
table_csv = get_table_csv_results(file_name)
output_file = 'output.csv'
# replace content
with open(output_file, "wt") as fout:
fout.write(table_csv)
# show the results
print('CSV OUTPUT FILE: ', output_file)
if __name__ == "__main__":
file_name = sys.argv[1]
main(file_name)
Sample PDF file Click Here
first you must generate the necessary environments in aws, install awscli and configure it with your aws credentials, having that, you only need to install the corresponding libraries and change the last line of the code:
if __name__ == "__main__": file_name = "name_image.png" main(file_name)
I recommend you to read this publication, to set up your aws environment:
https://medium.com/#victorjatoba10/extract-tables-and-forms-from-pdf-using-amazon-aws-textract-827c6e866453
You can read the file yourself and pass the Bytes to Textract
import os
for filename in os.listdir('input'):
if filename.endswith("jpg"):
with open('input/'+filename, 'rb') as img_file:
img_bytes = img_file.read()
response = client_Textract.analyze_document(Document={'Bytes': img_bytes}, FeatureTypes=["TABLES"])

How to skip a column when writing to a csv file in python

Sorry if this has been asked, but is it possible to skip a column when writing to a csv file?
Here is the code I have:
with open("list.csv","r") as f:
reader2 = csv.reader(f)
for row in reader2:
url = 'http://peopleus.intelius.com/results.php?ReportType=33&qi=0&qk=10&qp='+row
req = urllib.request.Request(url)
response = urllib.request.urlopen(req)
html = response.read()
retrieved_name = b'class="singleName">(.*?)<\/h1'
retrieved_number = b'<div\sclass="phone">(.*?)<\/div'
retrieved_nothing = b"(Sorry\swe\scouldn\\'t\sfind\sany\sresults)"
if re.search(retrieved_nothing,html):
noth = re.search(retrieved_nothing.decode('utf-8'),html.decode('utf-8')).group(1)
add_list(phone_data, noth)
else:
if re.search(retrieved_name,html):
name_found = re.search(retrieved_name.decode('utf-8'),html.decode('utf-8')).group(1)
else:
name_found = "No name found on peopleus.intelius.com"
if re.search(retrieved_number,html):
number_found = re.search(retrieved_number.decode('utf-8'),html.decode('utf-8')).group(1)
else:
number_found = "No number found on peopleus.intelius.com"
add_list(phone_data, name_found, number_found)
with open('column_skip.csv','a+', newline='') as mess:
writ = csv.writer(mess, dialect='excel')
writ.writerow(phone_data[-1])
time.sleep(10)
Assuming that there is data in the first three rows of column_skip.csv, can I have my program start writing its info in column 4?
Yeah, don't use csv.writer method and write it as an simple file write operation:
`file_path ='your_csv_file.csv'
with open(file_path, 'w') as fp:
#following are the data you want to write to csv
fp.write("%s, %s, %s" % ('Name of col1', 'col2', 'col4'))
fp.write("\n")`
I hope this helps...

Python 3 csv.reader empty response

Hello I got the following code but the loop won't work because the csv.reader is empty. The file with the csv data is opened correctly.
For Understanding:
var pokemon can be any pokemon name as string.
bot, logger and event are vars comming from the Hangoutsbot.
All needed libaries are loaded.
Code:
def pkmn_translate(bot, event, pokemon):
logger.info("translating pokemon name")
url = "https://raw.githubusercontent.com/PokeAPI/pokeapi/master/data/v2/csv/pokemon_species_names.csv"
request = urllib.request.Request(url, headers = {"User-agent":"Mozilla/5.0", "Accept-Charset":"utf-8"})
try:
data = urllib.request.urlopen(request)
csv_data = data.read()
csvstr = str(csv_data).strip("b'")
lines = csvstr.split("\\n")
f = open('{}/pokemon_species_names.csv'.format(os.path.dirname(os.path.realpath(__file__))), "w",encoding='utf8')
for line in lines:
f.write(line + "\n")
f.close()
logger.info("translating db saved")
except urllib.error.URLError as e:
logger.info("{}: Error: {}".format(event.user.full_name, json.loads(e.read().decode("utf8","ignore"))['detail']))
yield from bot.coro_send_message(event.conv, "{}: Error: {}".format(event.user.full_name, json.loads(e.read().decode("utf8","ignore"))['detail']))
return
pokemon_id = "default"
f = open('{}/pokemon_species_names.csv'.format(os.path.dirname(os.path.realpath(__file__))), 'r', encoding='utf8') # opens the csv file
try:
logger.info("DEBUG: openFile")
#Quick and dirty fix because CSV File is very big
maxInt = sys.maxsize
decrement = True
while decrement:
# decrease the maxInt value by factor 10
# as long as the OverflowError occurs.
decrement = False
try:
csv.field_size_limit(maxInt)
except OverflowError:
maxInt = int(maxInt/10)
decrement = True
logger.info("DEBUG: maxInt = {}".format(maxInt))
reader = csv.reader(f)
rows = list(reader)
for row in reader:
logger.info("DEBUG: row = {}".format(row))
for column in row:
if pokemon == column:
#DEBUG
logger.info("Info: row = {}".format(row))
#SET VAR
pokemon_id = rows[row][0]
#DEBUG
logger.info("Info: {}".format(pokemon_id))
bot.coro_send_message(event.conv, "Info: {}".format(pokemon_id))
else:
logger.info("Error: Name not in File!")
bot.coro_send_message(event.conv, "Error: Name not in File!")
else:
logger.info("DEBUG: Loop exited")
else:
logger.info("DEBUG: Loop exited")
except:
logger.info("Debug: Some error")
finally:
f.close() # closing
logger.info("Debug func: PokemonID = {}".format(pokemon_id))
yield from pokemon_id
return pokemon_id
at the for loop it has no data in the reader variable and it fails. I don't know how to get the csv.reader to work.
PS: I am an total noob at python.
your list(reader) call consumes the reader, which is empty on the for loop.
just replace
reader = csv.reader(f)
rows = list(reader)
for row in reader:
by
reader = csv.reader(f)
rows = list(reader)
for row in rows:

Python Error Using Argparse

I made a program that converts a csv file to a xml file using argparse. First it will read the csv file as an inputfile then converts it to a xml file. Here is my code:
import sys, argparse
import csv
import indent
from xml.etree.ElementTree import ElementTree, Element, SubElement, Comment, tostring
parser=argparse.ArgumentParser(description='Convert wordlist text files to various formats.', prog='Text Converter')
parser.add_argument('-v','--verbose',action='store_true',dest='verbose',help='Increases messages being printed to stdout')
parser.add_argument('-c','--csv',action='store_true',dest='readcsv',help='Reads CSV file and converts to XML file with same name')
parser.add_argument('-x','--xml',action='store_true',dest='toxml',help='Convert CSV to XML with different name')
parser.add_argument('-i','--inputfile',type=argparse.FileType('r'),dest='inputfile',help='Name of file to be imported',required=True)
parser.add_argument('-o','--outputfile',type=argparse.FileType('w'),dest='outputfile',help='Output file name')
args = parser.parse_args()
def main(argv):
reader = read_csv(args.inputfile)
if args.verbose:
print ('Verbose Selected')
if args.toxml:
if args.verbose:
print ('Convert to XML Selected')
generate_xml(reader, args.outputfile)
if args.readcsv:
if args.verbose:
print ('Reading CSV file')
if not (args.toxml or args.readcsv):
parser.error('No action requested')
return 1
def read_csv(inputfile):
return list(csv.reader(inputfile))
def generate_xml(reader,outfile):
root = Element('Solution')
root.set('version','1.0')
tree = ElementTree(root)
head = SubElement(root, 'DrillHoles')
head.set('total_holes', '238')
description = SubElement(head,'description')
current_group = None
i = 0
for row in reader:
if i > 0:
x1,y1,z1,x2,y2,z2,cost = row
if current_group is None or i != current_group.text:
current_group = SubElement(description, 'hole',{'hole_id':"%s"%i})
collar = SubElement (current_group, 'collar',{'':', '.join((x1,y1,z1))}),
toe = SubElement (current_group, 'toe',{'':', '.join((x2,y2,z2))})
cost = SubElement(current_group, 'cost',{'':cost})
i+=1
indent.indent(root)
tree.write(outfile)
if (__name__ == "__main__"):
sys.exit(main(sys.argv))
then on the command prompt i write, Argparse.py -i 1250_12.csv -o output.xml -x
where argparse is the program name and 1250_12.csv is csv file name and output.xml is what i want the output name to be and -x is an action converting csv to xml.
this program was working 10 min ago and now it gets an error saying:
x1,y1,z1,x2,y2,z2,cost = row
Value error: need more than 1 value to unpack
It appears that there are lines in your CSV that do not have exactly 7 columns. Perhaps there are some empty lines?
You could use a try..except to catch and handle the error:
def generate_xml(reader,outfile):
root = Element('Solution')
root.set('version','1.0')
tree = ElementTree(root)
head = SubElement(root, 'DrillHoles')
head.set('total_holes', '238')
description = SubElement(head,'description')
current_group = None
next(reader) # skip the first line
for row in reader:
try:
x1,y1,z1,x2,y2,z2,cost = row
except ValueError as err:
sys.stderr.write('{e}: {r!r}'.format(e=err, r=row))
if current_group is None or i != current_group.text:
current_group = SubElement(description, 'hole',{'hole_id':"%s"%i})
collar = SubElement (current_group, 'collar',{'':', '.join((x1,y1,z1))}),
toe = SubElement (current_group, 'toe',{'':', '.join((x2,y2,z2))})
cost = SubElement(current_group, 'cost',{'':cost})
indent.indent(root)
tree.write(outfile)
Also, if you set reader = csv.reader(inputfile) instead of making it a list, then your program will require less memory since reader will be an iterator instead of a list of rows.
Morever, to skip the first line with next(reader), reader must be an iterator, not a list. So for the above to work, also change:
reader = read_csv(args.inputfile)
to
reader = csv.reader(inputfile)

Categories