I don't have much programming skill, but I need to send the output of a command to a .csv table. I managed to create the script below, but it prints only the first row of the table instead of the whole table, and I don't know how to proceed further with turning it into CSV.
Any help would be much appreciated.
from __future__ import print_function
from datetime import date
import sys
import os
import time
today1 = date.today().strftime('%Y_%m_%d')
strTime = time.strftime('%Y_%m_%d')
command = 'My command here'
cmd = session.command()
response = cmd.execute(command)
element_group = response.get_output()
table = element_group.groups()[0]
for cell in table[0]:
    print(cell.labels()[0] + ' , ' + '\t', end='')
print('\n')
for cell in table[5]:
    print(cell.value() + ' , ', end='')
print('\n')
I have tried the script above. I was expecting it to print the whole table and turn it into a .csv file.
I have figured it out. Here is the script I wanted.
from __future__ import print_function
import csv
from datetime import date
import sys
import os
import time
today1 = date.today().strftime('%Y_%m_%d')
strTime = time.strftime('%Y_%m_%d')
command = 'My command here'
cmd = session.command()
response = cmd.execute(command)
element_group = response.get_output()
table = element_group.groups()[0]
header = [cell.labels()[0] for cell in table[0]]
rows = [[cell.value() for cell in row] for row in table]
directory = 'location'
filename = directory + 'filename' + today1 + '.csv'
with open(filename, mode='w', newline='') as file:  # newline='' keeps csv.writer from inserting blank lines on Windows
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(rows)
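The script above writes only the first group in the response. If the command returns several tables, a small variant can write each one to its own file. This is only a sketch reusing the same session objects and cell methods (groups(), labels(), value()) from the script above:
# Sketch: one CSV per table in the response, reusing the objects defined above.
for i, tbl in enumerate(element_group.groups()):
    out_name = directory + 'filename' + today1 + '_table' + str(i) + '.csv'
    with open(out_name, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([cell.labels()[0] for cell in tbl[0]])
        writer.writerows([[cell.value() for cell in row] for row in tbl])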
Looking to extract PDF data to Excel/CSV using Amazon Textract. How can we feed the input PDF from a local folder into the script?
We have a PDF with multiple tables and need to extract all the tables from their respective pages and export the data to CSV/Excel files, which can then be used for further analysis.
The piece of code below was received from AWS, but I could not understand how the input PDF file is supposed to be fed into the script.
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows
def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text
def get_table_csv_results(file_name):
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)
    # process using image bytes
    # get the results
    client = boto3.client('textract')
    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])
    # Get the text blocks
    blocks = response['Blocks']
    pprint(blocks)
    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)
    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"
    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index + 1)
        csv += '\n\n'
    return csv
def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)
    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)
    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
    csv += '\n\n\n'
    return csv
def main(file_name):
    table_csv = get_table_csv_results(file_name)
    output_file = 'output.csv'
    # replace content
    with open(output_file, "wt") as fout:
        fout.write(table_csv)
    # show the results
    print('CSV OUTPUT FILE: ', output_file)

if __name__ == "__main__":
    file_name = sys.argv[1]
    main(file_name)
First you must set up the necessary environment in AWS: install the awscli and configure it with your AWS credentials. Having done that, you only need to install the corresponding libraries and change the last lines of the code:
if __name__ == "__main__": file_name = "name_image.png" main(file_name)
I recommend you read this publication to set up your AWS environment:
https://medium.com/@victorjatoba10/extract-tables-and-forms-from-pdf-using-amazon-aws-textract-827c6e866453
You can read the file yourself and pass the Bytes to Textract
import os
import boto3

# create the Textract client the loop below relies on
client_Textract = boto3.client('textract')

for filename in os.listdir('input'):
    if filename.endswith("jpg"):
        with open('input/' + filename, 'rb') as img_file:
            img_bytes = img_file.read()
            response = client_Textract.analyze_document(Document={'Bytes': img_bytes}, FeatureTypes=["TABLES"])
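Each response can then be run through the question's helpers to produce CSV, one file per image. A sketch that continues inside the same loop; it assumes generate_table_csv() and get_rows_columns_map() from the question's script are in scope:
# Sketch: inside the loop above, convert each response's tables to CSV.
# Assumes generate_table_csv() from the question's script is defined in scope.
blocks = response['Blocks']
blocks_map = {block['Id']: block for block in blocks}
table_blocks = [block for block in blocks if block['BlockType'] == 'TABLE']
csv_text = ''
for index, table in enumerate(table_blocks):
    csv_text += generate_table_csv(table, blocks_map, index + 1) + '\n\n'
with open('input/' + filename + '.csv', 'wt') as fout:
    fout.write(csv_text)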
The problem is that a file I created with Microsoft Word 2010 can't be opened with Python. Some files open and some don't, even though they are all created the same way.
At first I tried to open them via their path; when that didn't work I tried doing it the simple way. Still no success. This is the error that I get: docx.opc.exceptions.PackageNotFoundError: Package not found at 'COMANDA_TRANSPORT_-_Grecia_SRL.docx'
Here's my spaghetti code:
import os
import re
import Database
import mysql.connector as mysql
from docx import Document
from docx.shared import Inches
from Database import tables
#=============================================#
# == Search for file == #
director = os.path.dirname(os.path.abspath(__file__))
lista_directoare = os.listdir(director)
print(lista_directoare)
print(lista_directoare.sort())
# == Last file== #
ultimul_fisier = lista_directoare[-1]
print('Last file: ' + ultimul_fisier)
def sort(fisier):
    fisier = re.search(r'\d+', ultimul_fisier).group()
    print(fisier)
string_ultimFisier = str(ultimul_fisier)
print(string_ultimFisier)
print(director + "\\" + string_ultimFisier)
#fisier = open('{}'.format(ultimul_fisier),"rb")
#fisier = open(director + "\\" + string_ultimFisier,"rb")
#document = Document(fisier)
document = Document('COMANDA_TRANSPORT_-_Grecia_SRL.docx')
for paragraph in document.paragraphs:
    if "pip" in paragraph.text:
        print("Am gasit")  # Romanian: "found it"
        break
    else:
        print('Nu am gasit')  # Romanian: "didn't find it"
        break
for table in tables:
    print(table)
document.save('test.docx')
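For what it's worth, PackageNotFoundError is python-docx's way of saying it could not find (or could not read) a valid .docx at the given path, which commonly happens when the script's working directory is not the folder holding the file. A sketch that builds an absolute path next to the script and checks it before opening; the filename is just the one from the traceback:
import os
from docx import Document

director = os.path.dirname(os.path.abspath(__file__))
cale = os.path.join(director, 'COMANDA_TRANSPORT_-_Grecia_SRL.docx')  # absolute path next to the script
if os.path.exists(cale):
    document = Document(cale)
else:
    print('File not found:', cale)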
Am I missing anything here? Why is this code not outputting data to the file I opened? Any ideas?
The following is the essential part of the entire code; it compiles without errors but does not output data to the file.
#! /usr/bin/python
#Basic imports
import sys
from time import sleep
from datetime import datetime,date,time
import numpy as np
#Create Bridge objects (Bridge comes from the Phidgets library; its import is part of the omitted code)
try:
    bridge_1 = Bridge()
    outfile = open("prototype.csv", "a")
    # Initialize all sensors and discard the readings
    lc1_ini = bridge_1.getBridgeValue(0) * 2674.0 - 210.7
    sleep(1)
    lc1 = bridge_1.getBridgeValue(0) * 2674.0 - 210.7
    # create empty array to store the converted digital data
    readings_lc1 = np.empty([])
    avg_lc1 = np.empty([])
    max_samples = 3
    readings_lc1 = np.append(readings_lc1, lc1)
    if len(readings_lc1) == max_samples:
        avg_lc1 = np.mean(readings_lc1[1:])
        #Write the data to the text file
        outfile.write(str(datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')) + "," + str(round(lc1, 2)) + "\n")
    outfile.close()
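One likely culprit: readings_lc1 starts as np.empty([]), and in this excerpt only one sample is ever appended, so len(readings_lc1) is 2 and never reaches max_samples; with the write sitting inside that if, nothing is written. A sketch of a loop that actually reaches the threshold and uses with so the file is flushed and closed (the same Phidgets Bridge API and calibration constants as above are assumed):
# Sketch: take max_samples readings in a loop, average them, then write one line.
# Assumes the same Phidgets Bridge API and calibration constants as in the question.
readings_lc1 = []
max_samples = 3
for _ in range(max_samples):
    readings_lc1.append(bridge_1.getBridgeValue(0) * 2674.0 - 210.7)
    sleep(1)
avg_lc1 = np.mean(readings_lc1)
with open("prototype.csv", "a") as outfile:
    outfile.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "," + str(round(avg_lc1, 2)) + "\n")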
I am trying to convert xls to json, but when I execute the code it does not give me the data inside the xls sheet, only the json structure.
Below is the code I am running; I am not able to understand what modification I should make so that I get a complete json file.
Please note: the input is a binary stream, and the output is also a stream, not a file.
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
for sheet_index in range(wb.nsheets):
    # print sheet_index
    sh = wb.sheet_by_index(sheet_index)
    # print "Processing sheet no ", sheet_index
    attributes = sh.row_values(0)
    #print attributes
    rows_list = []
    attr_list = []
    # print attr_list[0]
    for rownum in range(1, sh.nrows):
        row_val_list = sh.row_values(rownum)
        row_dict = OrderedDict()
        for index in range(len(attr_list)):
            row_dict[attr_list[index]] = row_val_list[index]
        #row_dict['ID'] = row_val_list[0]
        #row_dict['Name'] = row_val_list[1]
        #rows_list.append(row_dict)
        #json_data = simplejson.dumps(rows_list)
        #sys.stdout.write(json_data)
        rows_list.append(row_dict)
    json_data = simplejson.dumps(rows_list)
    sys.stdout.write(json_data)
    # json_data = simplejson.dumps(rows_list)
    #sys.stdout.write(json_data)
Any help is much appreciated
Here is the correct working Python code:
#!/usr/bin/python -u
import sys
import xlrd
import simplejson
from collections import OrderedDict
wb = xlrd.open_workbook(file_contents=sys.stdin.read())
#print "Sheets are .... ", wb.nsheets
for sheet_index in range(wb.nsheets):
    sh = wb.sheet_by_index(sheet_index)
    if sh.nrows == 0:
        continue
    attr_list = sh.row_values(0)
    rows_list = []
    for rownum in range(1, sh.nrows):
        row_values = sh.row_values(rownum)
        row_dict = OrderedDict()
        for index in range(len(attr_list)):
            row_dict[attr_list[index]] = row_values[index]
        rows_list.append(row_dict)
    json_data = simplejson.dumps(rows_list)
    sys.stdout.write(json_data)
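Because the script reads the workbook from stdin and writes to stdout, it runs as a filter, e.g. python xls2json.py < input.xls > output.json (the script name is a placeholder). One caveat: it dumps one JSON array per sheet back-to-back, which is valid JSON only for single-sheet workbooks. A sketch that collects everything into one object keyed by sheet name instead, using the same xlrd/simplejson calls:
# Sketch: gather all sheets into one dict and dump once, so the output is always valid JSON.
all_sheets = OrderedDict()
for sheet_index in range(wb.nsheets):
    sh = wb.sheet_by_index(sheet_index)
    if sh.nrows == 0:
        continue
    attr_list = sh.row_values(0)
    all_sheets[sh.name] = [
        OrderedDict(zip(attr_list, sh.row_values(rownum)))
        for rownum in range(1, sh.nrows)
    ]
sys.stdout.write(simplejson.dumps(all_sheets))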
So this is rather worrying; I hope that someone can give me a hand with this one.
I am using a python script to download google doc spreadsheets and then back them up to our servers. MOST of the time, it works well, but every so often I get an error that looks like this:
gdata.service.RequestError: {'status': 409, 'body': '', 'reason': 'Conflict'}
Here is all of the code that I am using. Does somebody know if the Export function has some strange behavior that could be causing this?
"""
QC_GoogleDoc_Spreadsheet_AutoLog
Author: Christopher James Johnson
Date: May 22, 2012
"""
try:
    from xml.etree import ElementTree
except ImportError:
    from elementtree import ElementTree
import gdata.spreadsheet.service
import gdata.service
import atom.service
import gdata.spreadsheet
import gdata.docs.service
import atom
import getopt
import sys
import string
import time
import shutil
import os
import getpass
import tempfile
import csv
import time
import datetime
import glob
def main():
    archiver = backUpper()

class backUpper():
    def __init__(self):
        gd = gdata.docs.service.DocsService()
        self.gd_client = gdata.docs.service.DocsService()
        self.gd_client.email = 'xxxx.xxxx'
        self.gd_client.password = 'xxxxxxxx'
        self.gd_client.source = 'Spreadsheets GData Sample'
        self.gd_client.ProgrammaticLogin()
        self.curr_key = ''
        self.curr_wksht_id = ''
        self.list_feed = None
        self.autoLogPath = ""
        spreadsheets_client = gdata.spreadsheet.service.SpreadsheetsService()
        spreadsheets_client.email = self.gd_client.email
        spreadsheets_client.password = self.gd_client.password
        spreadsheets_client.source = "My Fancy Spreadsheet Downloader"
        spreadsheets_client.ProgrammaticLogin()
        feed = spreadsheets_client.GetSpreadsheetsFeed()
        for i, entry in enumerate(feed.entry):
            if isinstance(feed, gdata.spreadsheet.SpreadsheetsSpreadsheetsFeed):
                if isinstance(entry, gdata.spreadsheet.SpreadsheetsSpreadsheet):
                    print entry.title.text
                    x = entry.id.text
                    print x
                    self.Download(entry)
        self.DeleteTemporaryFiles()
    def Download(self, entry):
        line = entry.id.text
        title = entry.title.text
        splitLine = line.split('/')
        key = splitLine[-1]
        backUpDir = R'\\cob-hds-1\compression\QC\QCing\otherFiles\GoogleDocBackUp' + '\\'
        now = datetime.datetime.now()
        hour = now.hour
        today = datetime.date.today()
        if not os.path.exists(backUpDir + str(today)):
            os.mkdir(backUpDir + str(today))
        if not os.path.exists(backUpDir + str(today) + '\\' + str(hour)):
            os.mkdir(backUpDir + str(today) + '\\' + str(hour))
        backupDir = backUpDir + str(today) + '\\' + str(hour)
        tempfile.tempdir = backupDir
        file_path = tempfile.mkstemp(suffix='.xls')
        uri = 'http://docs.google.com/feeds/documents/private/full/%s' % key
        spreadsheets_client = gdata.spreadsheet.service.SpreadsheetsService()
        spreadsheets_client.email = self.gd_client.email
        spreadsheets_client.password = self.gd_client.password
        spreadsheets_client.source = "My Fancy Spreadsheet Downloader"
        spreadsheets_client.ProgrammaticLogin()
        # ...
        docEntry = self.gd_client.GetDocumentListEntry(uri)
        docs_auth_token = self.gd_client.GetClientLoginToken()
        self.gd_client.SetClientLoginToken(spreadsheets_client.GetClientLoginToken())
        self.gd_client.Export(docEntry, file_path[1])
        shutil.copy(file_path[1], backupDir + '//' + title + '.xls')
        os.close(file_path[0])
        self.gd_client.SetClientLoginToken(docs_auth_token)
if __name__ == '__main__':
    main()
So the scary part: this just started happening this morning! Everything was fine before, but now something has started going wrong with this and other gdata-using Python scripts. Please help!
Thanks!
EDIT: A co-worker of mine was working on one of these spreadsheets at the time, and both of our programs crashed. (Mine backs up the Google docs and his writes to them.) If we are both working on the same spreadsheet at the same time, could this create the problem?
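That fits the 409: the server rejects the export because the document is being modified at the same moment. One common mitigation is to catch the error and retry after a short pause. A sketch against the same gdata API as the code above; the retry count and delay are arbitrary choices:
import time
import gdata.service

def export_with_retry(gd_client, doc_entry, path, retries=3, delay=5):
    # Retry Export() a few times when the server reports 409 Conflict.
    for attempt in range(retries):
        try:
            gd_client.Export(doc_entry, path)
            return
        except gdata.service.RequestError, e:
            # e.args[0] is the {'status': ..., 'body': ..., 'reason': ...} dict from the traceback
            if e.args[0].get('status') == 409 and attempt < retries - 1:
                time.sleep(delay)  # give the concurrent edit time to settle
            else:
                raise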