I'm looking to extract PDF data to Excel/CSV using Amazon Textract. How can we feed an input PDF from a local folder into the script?
The PDF contains multiple tables; we need to extract every table from its respective page and export the data to CSV/Excel files, which can then be used for further analysis.
Below is a piece of code received from AWS, but I could not understand how the input PDF file is taken up into the script.
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint

def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows

def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] == 'SELECTED':
                            text += 'X '
    return text

def get_table_csv_results(file_name):
    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')
    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks = response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index + 1)
        csv += '\n\n'
    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)
    table_id = 'Table_' + str(table_index)

    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)
    for row_index, cols in rows.items():
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
    csv += '\n\n\n'
    return csv

def main(file_name):
    table_csv = get_table_csv_results(file_name)
    output_file = 'output.csv'

    # replace content
    with open(output_file, "wt") as fout:
        fout.write(table_csv)

    # show the results
    print('CSV OUTPUT FILE: ', output_file)

if __name__ == "__main__":
    file_name = sys.argv[1]
    main(file_name)
First, you must set up the necessary environment in AWS: install the AWS CLI and configure it with your AWS credentials. With that done, you only need to install the corresponding libraries and change the last lines of the code:

if __name__ == "__main__":
    file_name = "name_image.png"
    main(file_name)
I recommend reading this publication to set up your AWS environment:
https://medium.com/@victorjatoba10/extract-tables-and-forms-from-pdf-using-amazon-aws-textract-827c6e866453
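If you prefer not to rely on the shared credentials file produced by aws configure, boto3 also accepts credentials directly when building the client. A minimal sketch; the region and key values here are placeholders, not real settings:

import boto3

# Placeholder credentials -- in practice prefer `aws configure` or
# environment variables rather than hard-coding keys.
client = boto3.client(
    'textract',
    region_name='us-east-1',
    aws_access_key_id='YOUR_ACCESS_KEY_ID',
    aws_secret_access_key='YOUR_SECRET_ACCESS_KEY',
)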
You can read the file yourself and pass the Bytes to Textract
import os
import boto3

# assumes AWS credentials/region are already configured
client_Textract = boto3.client('textract')

for filename in os.listdir('input'):
    if filename.endswith("jpg"):
        with open('input/' + filename, 'rb') as img_file:
            img_bytes = img_file.read()
        response = client_Textract.analyze_document(Document={'Bytes': img_bytes}, FeatureTypes=["TABLES"])
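One caveat worth noting: as far as I know, the synchronous analyze_document call only accepts single-page documents passed as bytes. For a multi-page PDF, Textract requires the file to be uploaded to S3 and analyzed through the asynchronous API. A minimal sketch, where the bucket and key names are placeholders:

import time
import boto3

textract = boto3.client('textract')

# Start an asynchronous analysis job on a PDF stored in S3 (placeholder names).
job = textract.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': 'my-bucket', 'Name': 'my_tables.pdf'}},
    FeatureTypes=['TABLES'],
)

# Poll until the job finishes, then fetch the first page of results.
# (Full results are paginated via NextToken; pagination is omitted here.)
while True:
    result = textract.get_document_analysis(JobId=job['JobId'])
    if result['JobStatus'] in ('SUCCEEDED', 'FAILED'):
        break
    time.sleep(5)

blocks = result['Blocks'] if result['JobStatus'] == 'SUCCEEDED' else []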
I want to make a key-value store in JSON. Everything should work through arguments entered at the console. That is, the data is first written to a file and then must be read back from there.
Input: python storage.py --key key_name --value value_name
Output: python storage.py --key key_name
The argument handling and the data-entry function work, but I have a problem with the file-reading function. I need to print the value for a given key, or all the values if there are several.
The recorded JSON looks something like this:
{"key": "Pepe", "value": "Pepeyaya"}{"key": "PepeHug", "value": "KekeHug"}{"key": "Pepega", "value": "Kekega"}{"key": "Pepe", "value": "Keke"}
I tried reading the file like this:
data = json.loads(f.read())
But the error is exactly the same
In other similar topics I saw that the JSON "dictionaries" should be written into a list. I tried something like this:
data = json.loads([f.read()])
Result:
TypeError: the JSON object must be str, bytes or bytearray, not list
Also:
data = json.load([f])
Result:
AttributeError: 'list' object has no attribute 'read'
I tried to change the writing function, but I can't get everything written into a pre-created list; everything is written to the right of it. Something like this:
[]{"key": "Pepe", "value": "Pepeyaya"}{"key": "PepeHug", "value": "KekeHug"}{"key": "Pepega", "value": "Kekega"}{"key": "Pepe", "value": "Keke"}
Code:
import os
import tempfile
import json
import sys

def create_json(path):
    with open(path, mode='a', encoding='utf-8') as f:
        json.dump([], f)

def add(key, value, path):
    with open(path, mode='a', encoding='utf-8') as f:
        entry = {'key': key, 'value': value}
        json.dump(entry, f)

def read(a_key, path):
    read_result = ""
    with open(path) as f:
        data = json.load(f)
        print(data)
    my_list = data
    for i in my_list:
        for key, value in i.items():
            if key == a_key:
                read_result += value + ", "
                print(value)

def main():
    storage_path = os.path.join(tempfile.gettempdir(), 'storage.json')
    if sys.argv[1] == "--key":
        arg_key = sys.argv[2]
        if len(sys.argv) <= 3:
            read(arg_key, storage_path)
        elif sys.argv[3] == "--value":
            arg_value = sys.argv[4]
            add(arg_key, arg_value, storage_path)
        else:
            print("Enter valid arguments")
    else:
        print("Enter valid arguments")

if __name__ == '__main__':
    main()
In general, the attached code now gives this error:
json.decoder.JSONDecodeError: Extra data: line 1 column 39 (char 38)
I need, on the request:
python storage.py --key Pepe
to get the Pepeyaya and Keke values.
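As an aside, a file of back-to-back JSON objects like the one above can be parsed one object at a time with json.JSONDecoder.raw_decode. A minimal sketch; iter_json_objects is a hypothetical helper, not part of the question's code:

import json

def iter_json_objects(text):
    # Yield each object from concatenated JSON like {"a": 1}{"b": 2}.
    decoder = json.JSONDecoder()
    pos = 0
    while pos < len(text):
        obj, pos = decoder.raw_decode(text, pos)
        yield obj
        # skip any whitespace between objects before the next decode
        while pos < len(text) and text[pos].isspace():
            pos += 1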
This is a basic storage method. It is very bad for large JSON files, but it's an example that shows how you can do the job.
import os
import sys
import json

# storage.py key_name value
key = sys.argv[1]
value = sys.argv[2]

data_path = "data.json"
if os.path.isfile(data_path):
    with open("data.json") as target:
        json_data = json.load(target)
else:
    json_data = {}

json_data[key] = value
with open("data.json", "w") as target:
    json.dump(json_data, target)
In your case, the problem comes from the append flag when you open the file: each json.dump is written after the existing content, so the file is no longer a single valid JSON document. To append a new object to what is already there, you would need to delete the final closing bracket of the JSON, add ",<object>", and then add the closing bracket back.
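Alternatively, since the goal is several values per key (Pepe → Pepeyaya and Keke), the whole store can be kept as one dict of lists that is rewritten on every call. A minimal sketch using argparse; the storage path follows the question's code, and the structure is my assumption, not the question's format:

import argparse
import json
import os
import tempfile

STORAGE_PATH = os.path.join(tempfile.gettempdir(), 'storage.json')

def load_data(path):
    # Return the stored dict, or an empty one if the file doesn't exist yet.
    if os.path.isfile(path):
        with open(path, encoding='utf-8') as f:
            return json.load(f)
    return {}

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--key', required=True)
    parser.add_argument('--value')
    args = parser.parse_args()

    data = load_data(STORAGE_PATH)
    if args.value is not None:
        # Append, so repeated keys accumulate values instead of overwriting.
        data.setdefault(args.key, []).append(args.value)
        with open(STORAGE_PATH, 'w', encoding='utf-8') as f:
            json.dump(data, f)
    else:
        print(', '.join(data.get(args.key, [])))

if __name__ == '__main__':
    main()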
My code hits a variable endpoint and then creates a log file (UUID.log); these log files are unique for every hit. Inside every log file there is a JSON object (process_name, process_id), where the endpoint name gets logged as the process_name.
The if condition checks for a duplicate process_name inside the existing log files before creating a new file, to ensure that a log file with a duplicate process_name does not get created.
from flask import Flask, jsonify
import json
import uuid
import os
import test1

app = Flask(__name__)

@app.route('/<string:name>')
def get_stats(name):
    proceuudi = uuid.uuid4()
    stat = [
        {
            'process_id': str(proceuudi),
            'process_name': name
        }
    ]
    os.chdir("file_path")
    files = os.listdir('file_path')
    l = []
    for i in files:
        with open(i) as f:
            data = json.load(f)
            for j in data:
                l.append(j)
    for j in l:
        print(j)
        if j['process_name'] != name:
            with open(str(proceuudi) + '.log', 'w+') as f:  # writing JSON object
                json.dump(stat, f)
            return jsonify({'stats': stat})
        else:
            return 'Process already running'

app.run(port=6011)
Whenever I try to check the list (l) containing the process_name and process_id entries, I am not able to scan the entire list; it only checks the first index. If j['process_name'] != name at the first index, the function returns immediately. Is there a way to scan the entire list, and only create the log file with that process name if the process_name does not exist in any log file?
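The core of the fix is to finish scanning the list before deciding anything. A minimal sketch of that check; is_running is a hypothetical helper applied to the question's list l:

def is_running(entries, name):
    # True only if some logged entry already uses this process name;
    # any() consumes the whole list instead of returning on the first element.
    return any(entry['process_name'] == name for entry in entries)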
Use a set to hold the process_name values, as this avoids scanning the whole list.
Don't scan all the files on every call; use a global variable to hold the names in memory.
from flask import Flask, jsonify
import json
import uuid
import os

app = Flask(__name__)

# use a set, as membership (the `in` operator) check is O(1)
l = set()
running = False

@app.route('/<string:name>')
def get_stats(name):
    global l, running
    proceuudi = uuid.uuid4()
    # why a list? from the code it is clear that one file will have only one entry
    stat = [
        {
            'process_id': str(proceuudi),
            'process_name': name
        }
    ]
    # read all names once, at the start of the server
    if not running:
        # better to write a new function for this stuff
        files = os.listdir('./file_path')
        print(files)
        for i in files:
            with open("./file_path/" + i) as f:
                data = json.load(f)
                for j in data:
                    l.add(j["process_name"])
        running = True
    if name in l:
        # use jsonify here too
        return jsonify("process running")
    else:
        # add the new process_name to the in-memory variable
        l.add(stat[0]["process_name"])
        with open("./file_path/" + str(proceuudi) + '.log', 'w+') as f:  # writing JSON object
            json.dump(stat, f)
        return jsonify({'stats': stat})

app.run(port=6011)
NOTE: Code Review is a better fit for this type of question.
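One caveat about the in-memory set above: Flask may serve requests concurrently, so the membership check and the add should happen atomically. A minimal sketch; register_process is a hypothetical helper, and it assumes a single server process:

import threading

_seen = set()
_seen_lock = threading.Lock()

def register_process(name):
    # Atomically check-and-add, so two simultaneous requests for the same
    # name cannot both pass the duplicate check and create log files.
    with _seen_lock:
        if name in _seen:
            return False
        _seen.add(name)
        return True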
The following code fails to write the name and address variables to a CSV file. When I test it using numbers or words, or the Write variable, those are recorded in the CSV, but WriteAddress and WriteName are not*. (Using the original sources for these variables also leaves blanks.)
import requests, sys, pyperclip, bs4, csv

StationList = open('CTA Station Addresses.csv', 'w', newline='')
StationWrite = csv.writer(StationList)

for i in range(149):
    id = str(i)
    res = requests.get('http://www.transitchicago.com/travel_information/station.aspx?StopId=' + id)
    res.raise_for_status()
    Station = bs4.BeautifulSoup(res.text)
    Name = Station.select('.rtehdng')
    Address = Station.select('#ctl07_divAddress')
    Write = 0
    if Name == []:
        print('missing name')
        Write = 1
    else:
        #print(Name[0].getText())
        WriteName = Name[0].getText()
        pass
    if Address == []:
        print('missing address')
        Write = 1
    else:
        #print(Address[0].getText())
        WriteAddress = Address[0].getText()
        pass
    if Write == 0:
        StationWrite.writerow([Write, WriteName, WriteAddress])
    Write = 0

StationList.close()
*(If I do writerow([3, Write, WriteName]), the CSV row comes out as "3, 0, ".)
I couldn't reproduce your error, but the data you get has embedded newlines and spaces which can make the CSV look odd. I've cleaned up the script and scrubbed the data before writing the CSV, and ended up with station,address entries. I didn't see a need to write Write, because it was always 0 in your script; it doesn't even exist in mine, since I leverage exception handling instead.
import requests, sys, pyperclip, bs4, csv

with open('CTA Station Addresses.csv', 'w', newline='') as StationList:
    StationWrite = csv.writer(StationList)
    for i in range(149):
        _id = str(i)
        res = requests.get('http://www.transitchicago.com/travel_information/station.aspx?StopId=' + _id)
        res.raise_for_status()
        Station = bs4.BeautifulSoup(res.text, 'lxml')
        try:
            name = Station.select('.rtehdng')[0].getText().strip()
            address = Station.select('#ctl07_divAddress')[0].getText().splitlines()[-1].strip()
        except IndexError as e:
            print("No data for station", _id)
            continue
        if not name or not address:
            print('Empty elements for station', _id)
            continue
        print(repr(name), repr(address))
        StationWrite.writerow([name, address])
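To see what the address-scrubbing chain is doing, here is a worked example; the raw string is invented for illustration, not real page data:

raw = '\n    Address:\n    123 W Example Ave, Chicago    '
# splitlines() splits on the embedded newlines, [-1] keeps the last line,
# and strip() drops the surrounding whitespace.
print(raw.splitlines()[-1].strip())  # -> '123 W Example Ave, Chicago'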
I somewhat understand how to do looping in Python; it seems easy enough to say "for each file in this directory...do something". I'm now having a hard time figuring out how to loop through a series of .ini files in a directory, read lines from them, and use the text in the ini files as variables in the same Python script. For example, in this script, a single .ini file provides the values for 12 variables in the script. Currently, to run the script multiple times, one has to replace the single ini file with another one that contains a different 12 variables.
The script performs routine maintenance of an online mapping service provider; thing is, I have dozens of services I'd like to manage with the script. From the script, it appears that the name of the .ini file is fixed; I'm not sure it's even possible to loop through multiple ini files? The good news is that the script is using ConfigParser. I hope this makes sense!
[FS_INFO]
SERVICENAME = MyMapService
FOLDERNAME = None
MXD = D:\nightly_updates\maps\MyMap.mxd
TAGS = points, dots, places
DESCRIPTION = This is the description text
MAXRECORDS = 1000
[FS_SHARE]
SHARE = True
EVERYONE = true
ORG = true
GROUPS = None
[AGOL]
USER = user_name
PASS = pass_word1
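For context, a minimal sketch of how ConfigParser pulls values out of a file like the one above (Python 2 spelling, to match the script below):

import ConfigParser

config = ConfigParser.ConfigParser()
config.read('settings.ini')
print config.get('FS_INFO', 'SERVICENAME')   # MyMapService
print config.get('AGOL', 'USER')             # user_name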
The script below is reading from the ini file above.
# Import system modules
import urllib, urllib2, json
import sys, os
import requests
import arcpy
import ConfigParser
from xml.etree import ElementTree as ET
class AGOLHandler(object):

    def __init__(self, username, password, serviceName, folderName):
        self.username = username
        self.password = password
        self.serviceName = serviceName
        self.token, self.http = self.getToken(username, password)
        self.itemID = self.findItem("Feature Service")
        self.SDitemID = self.findItem("Service Definition")
        self.folderName = folderName
        self.folderID = self.findFolder()

    def getToken(self, username, password, exp=60):
        referer = "http://www.arcgis.com/"
        query_dict = {'username': username,
                      'password': password,
                      'expiration': str(exp),
                      'client': 'referer',
                      'referer': referer,
                      'f': 'json'}
        query_string = urllib.urlencode(query_dict)
        url = "https://www.arcgis.com/sharing/rest/generateToken"
        token = json.loads(urllib.urlopen(url + "?f=json", query_string).read())
        if "token" not in token:
            print token['error']
            sys.exit()
        else:
            httpPrefix = "http://www.arcgis.com/sharing/rest"
            if token['ssl'] == True:
                httpPrefix = "https://www.arcgis.com/sharing/rest"
            return token['token'], httpPrefix

    def findItem(self, findType):
        #
        # Find the itemID of what's being updated
        #
        searchURL = self.http + "/search"
        query_dict = {'f': 'json',
                      'token': self.token,
                      'q': "title:\"" + self.serviceName + "\"AND owner:\"" + self.username + "\" AND type:\"" + findType + "\""}
        jsonResponse = sendAGOLReq(searchURL, query_dict)
        if jsonResponse['total'] == 0:
            print "\nCould not find a service to update. Check the service name in the settings.ini"
            sys.exit()
        else:
            print("found {} : {}").format(findType, jsonResponse['results'][0]["id"])
            return jsonResponse['results'][0]["id"]

    def findFolder(self):
        #
        # Find the ID of the folder containing the service
        #
        if self.folderName == "None":
            return ""
        findURL = self.http + "/content/users/{}".format(self.username)
        query_dict = {'f': 'json',
                      'num': 1,
                      'token': self.token}
        jsonResponse = sendAGOLReq(findURL, query_dict)
        for folder in jsonResponse['folders']:
            if folder['title'] == self.folderName:
                return folder['id']
        print "\nCould not find the specified folder name provided in the settings.ini"
        print "-- If your content is in the root folder, change the folder name to 'None'"
        sys.exit()
def urlopen(url, data=None):
    # monkey-patch URLOPEN
    referer = "http://www.arcgis.com/"
    req = urllib2.Request(url)
    req.add_header('Referer', referer)
    if data:
        response = urllib2.urlopen(req, data)
    else:
        response = urllib2.urlopen(req)
    return response
def makeSD(MXD, serviceName, tempDir, outputSD, maxRecords):
    #
    # create a draft SD and modify the properties to overwrite an existing FS
    #
    arcpy.env.overwriteOutput = True
    # All paths are built by joining names to the tempPath
    SDdraft = os.path.join(tempDir, "tempdraft.sddraft")
    newSDdraft = os.path.join(tempDir, "updatedDraft.sddraft")
    arcpy.mapping.CreateMapSDDraft(MXD, SDdraft, serviceName, "MY_HOSTED_SERVICES")

    # Read the contents of the original SDDraft into an xml parser
    doc = ET.parse(SDdraft)
    root_elem = doc.getroot()
    if root_elem.tag != "SVCManifest":
        raise ValueError("Root tag is incorrect. Is {} a .sddraft file?".format(SDdraft))

    # The following 6 code pieces modify the SDDraft from a new MapService
    # with caching capabilities to a FeatureService with Query,Create,
    # Update,Delete,Uploads,Editing capabilities as well as the ability to set the max
    # records on the service.
    # The first two lines (commented out) are no longer necessary as the FS
    # is now being deleted and re-published, not truly overwritten as is the
    # case when publishing from Desktop.
    # The last three pieces change Map to Feature Service, disable caching
    # and set appropriate capabilities. You can customize the capabilities by
    # removing items.
    # Note you cannot disable Query from a Feature Service.
    #doc.find("./Type").text = "esriServiceDefinitionType_Replacement"
    #doc.find("./State").text = "esriSDState_Published"

    # Change service type from map service to feature service
    for config in doc.findall("./Configurations/SVCConfiguration/TypeName"):
        if config.text == "MapServer":
            config.text = "FeatureServer"

    # Turn off caching
    for prop in doc.findall("./Configurations/SVCConfiguration/Definition/" +
                            "ConfigurationProperties/PropertyArray/" +
                            "PropertySetProperty"):
        if prop.find("Key").text == 'isCached':
            prop.find("Value").text = "false"
        if prop.find("Key").text == 'maxRecordCount':
            prop.find("Value").text = maxRecords

    # Turn on feature access capabilities
    for prop in doc.findall("./Configurations/SVCConfiguration/Definition/Info/PropertyArray/PropertySetProperty"):
        if prop.find("Key").text == 'WebCapabilities':
            prop.find("Value").text = "Query,Create,Update,Delete,Uploads,Editing"

    # Add the namespaces which get stripped, back into the .SD
    root_elem.attrib["xmlns:typens"] = 'http://www.esri.com/schemas/ArcGIS/10.1'
    root_elem.attrib["xmlns:xs"] = 'http://www.w3.org/2001/XMLSchema'

    # Write the new draft to disk
    with open(newSDdraft, 'w') as f:
        doc.write(f, 'utf-8')

    # Analyze the service
    analysis = arcpy.mapping.AnalyzeForSD(newSDdraft)

    if analysis['errors'] == {}:
        # Stage the service
        arcpy.StageService_server(newSDdraft, outputSD)
        print "Created {}".format(outputSD)
    else:
        # If the sddraft analysis contained errors, display them and quit.
        print analysis['errors']
        sys.exit()
def upload(fileName, tags, description):
    #
    # Overwrite the SD on AGOL with the new SD.
    # This method uses 3rd party module: requests
    #
    updateURL = agol.http + '/content/users/{}/{}/items/{}/update'.format(agol.username, agol.folderID, agol.SDitemID)
    filesUp = {"file": open(fileName, 'rb')}
    url = updateURL + "?f=json&token=" + agol.token + \
        "&filename=" + fileName + \
        "&type=Service Definition" \
        "&title=" + agol.serviceName + \
        "&tags=" + tags + \
        "&description=" + description
    response = requests.post(url, files=filesUp)
    itemPartJSON = json.loads(response.text)
    if "success" in itemPartJSON:
        itemPartID = itemPartJSON['id']
        print("updated SD: {}").format(itemPartID)
        return True
    else:
        print "\n.sd file not uploaded. Check the errors and try again.\n"
        print itemPartJSON
        sys.exit()

def publish():
    #
    # Publish the existing SD on AGOL (it will be turned into a Feature Service)
    #
    publishURL = agol.http + '/content/users/{}/publish'.format(agol.username)
    query_dict = {'itemID': agol.SDitemID,
                  'filetype': 'serviceDefinition',
                  'overwrite': 'true',
                  'f': 'json',
                  'token': agol.token}
    jsonResponse = sendAGOLReq(publishURL, query_dict)
    print("successfully updated...{}...").format(jsonResponse['services'])
    return jsonResponse['services'][0]['serviceItemId']

def enableSharing(newItemID, everyone, orgs, groups):
    #
    # Share an item with everyone, the organization and/or groups
    #
    shareURL = agol.http + '/content/users/{}/{}/items/{}/share'.format(agol.username, agol.folderID, newItemID)
    if groups == None:
        groups = ''
    query_dict = {'f': 'json',
                  'everyone': everyone,
                  'org': orgs,
                  'groups': groups,
                  'token': agol.token}
    jsonResponse = sendAGOLReq(shareURL, query_dict)
    print("successfully shared...{}...").format(jsonResponse['itemId'])

def sendAGOLReq(URL, query_dict):
    #
    # Helper function which takes a URL and a dictionary and sends the request
    #
    query_string = urllib.urlencode(query_dict)
    jsonResponse = urllib.urlopen(URL, urllib.urlencode(query_dict))
    jsonOuput = json.loads(jsonResponse.read())
    wordTest = ["success", "results", "services", "notSharedWith", "folders"]
    if any(word in jsonOuput for word in wordTest):
        return jsonOuput
    else:
        print "\nfailed:"
        print jsonOuput
        sys.exit()
if __name__ == "__main__":
    #
    # start
    #
    print "Starting Feature Service publish process"

    # Find and gather settings from the ini file
    localPath = sys.path[0]
    settingsFile = os.path.join(localPath, "settings.ini")

    if os.path.isfile(settingsFile):
        config = ConfigParser.ConfigParser()
        config.read(settingsFile)
    else:
        print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
        sys.exit()

    # AGOL Credentials
    inputUsername = config.get('AGOL', 'USER')
    inputPswd = config.get('AGOL', 'PASS')

    # FS values
    MXD = config.get('FS_INFO', 'MXD')
    serviceName = config.get('FS_INFO', 'SERVICENAME')
    folderName = config.get('FS_INFO', 'FOLDERNAME')
    tags = config.get('FS_INFO', 'TAGS')
    description = config.get('FS_INFO', 'DESCRIPTION')
    maxRecords = config.get('FS_INFO', 'MAXRECORDS')

    # Share FS to: everyone, org, groups
    shared = config.get('FS_SHARE', 'SHARE')
    everyone = config.get('FS_SHARE', 'EVERYONE')
    orgs = config.get('FS_SHARE', 'ORG')
    groups = config.get('FS_SHARE', 'GROUPS')  # Groups are by ID. Multiple groups comma separated

    # create a temp directory under the script
    tempDir = os.path.join(localPath, "tempDir")
    if not os.path.isdir(tempDir):
        os.mkdir(tempDir)
    finalSD = os.path.join(tempDir, serviceName + ".sd")

    # initialize AGOLHandler class
    agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)

    # Turn map document into .SD file for uploading
    makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)

    # overwrite the existing .SD on arcgis.com
    if upload(finalSD, tags, description):
        # publish the sd which was just uploaded
        newItemID = publish()

        # share the item
        if shared:
            enableSharing(newItemID, everyone, orgs, groups)

    print "\nfinished."
If I understand your question correctly, you would just want to add another loop in your main, and then move most of what you have in your main into a new function (in my example, the new function is called process_ini).
So, try replacing everything from your if __name__ == "__main__" line through the end with:
def process_ini(fileName):
    # upload(), publish() and enableSharing() reference the module-level
    # `agol`, so declare it global here instead of creating a local one.
    global agol
    settingsFile = os.path.join(localPath, fileName)
    if os.path.isfile(settingsFile):
        config = ConfigParser.ConfigParser()
        config.read(settingsFile)
    else:
        print "INI file not found. \nMake sure a valid 'settings.ini' file exists in the same directory as this script."
        sys.exit()

    # AGOL Credentials
    inputUsername = config.get('AGOL', 'USER')
    inputPswd = config.get('AGOL', 'PASS')

    # FS values
    MXD = config.get('FS_INFO', 'MXD')
    serviceName = config.get('FS_INFO', 'SERVICENAME')
    folderName = config.get('FS_INFO', 'FOLDERNAME')
    tags = config.get('FS_INFO', 'TAGS')
    description = config.get('FS_INFO', 'DESCRIPTION')
    maxRecords = config.get('FS_INFO', 'MAXRECORDS')

    # Share FS to: everyone, org, groups
    shared = config.get('FS_SHARE', 'SHARE')
    everyone = config.get('FS_SHARE', 'EVERYONE')
    orgs = config.get('FS_SHARE', 'ORG')
    groups = config.get('FS_SHARE', 'GROUPS')  # Groups are by ID. Multiple groups comma separated

    # create a temp directory under the script
    tempDir = os.path.join(localPath, "tempDir")
    if not os.path.isdir(tempDir):
        os.mkdir(tempDir)
    finalSD = os.path.join(tempDir, serviceName + ".sd")

    # initialize AGOLHandler class
    agol = AGOLHandler(inputUsername, inputPswd, serviceName, folderName)

    # Turn map document into .SD file for uploading
    makeSD(MXD, serviceName, tempDir, finalSD, maxRecords)

    # overwrite the existing .SD on arcgis.com
    if upload(finalSD, tags, description):
        # publish the sd which was just uploaded
        newItemID = publish()

        # share the item
        if shared:
            enableSharing(newItemID, everyone, orgs, groups)

    print "\nfinished."

if __name__ == "__main__":
    print "Starting Feature Service publish process"

    # Find and gather settings from the ini files
    localPath = sys.path[0]
    for fileName in ['settings.ini', 'flurb.ini', 'durf.ini']:
        process_ini(fileName)
You'd have to write all the ini filenames in the list found in the penultimate line of my example.
Alternatively, you could identify all the .ini files in the directory via code:
if __name__ == "__main__":
    print "Starting Feature Service publish process"

    # Find and gather settings from the ini files
    localPath = sys.path[0]
    fileNames = [os.path.join(localPath, i) for i in os.listdir(localPath) if i.endswith('.ini')]
    for fileName in fileNames:
        process_ini(fileName)
It also might help to set the working directory (e.g., os.chdir(localPath)), but I'm going off of what you already had.
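One more hedged suggestion: the helper functions call sys.exit() on any failure, which would stop the whole batch at the first bad ini file. Since sys.exit() works by raising SystemExit, catching it around each call lets the loop move on to the next file. A minimal sketch building on process_ini above:

if __name__ == "__main__":
    print "Starting Feature Service publish process"
    localPath = sys.path[0]
    fileNames = [os.path.join(localPath, i) for i in os.listdir(localPath) if i.endswith('.ini')]
    for fileName in fileNames:
        try:
            process_ini(fileName)
        except SystemExit:
            # swallow the exit so one bad ini doesn't abort the remaining services
            print "skipped {} after a fatal error".format(fileName)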