I have a Python script that iterates through a PDF file (loops over each page), and inside each page does some text manipulation. So basically two loops:
files = {}
#npages is the number of PDF pages in the specific file.
for n in range(npages):
path = pdf_name + str(n + 1) + '_1.txt'
files[int(n)] = path
for i, col in enumerate(COLUMNS):
path = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
files[int(n)][int(i)] = path
Basically, I looking on each PDF page, and on each page I then further do some text manipulation.
I am trying to output it like:
- file_page_1.pdf
- file_page_1_col_1.pdf
- file_page_1_col_2.pdf
file_page_2.pdf
- file_page_2_col_1.pdf
- file_page_2_col_2.pdf
However using above coes gives me below error:
files[int(n)][int(i)] = path
TypeError: 'str' object does not support item assignment
I think the structure you're looking for is a dict that has string keys to list values.
files = {}
for page in range(npages):
path = pdf_name + str(n+1) + '_1.txt'
files[path] = []
for i, col in enumerate(COLUMNS):
subpath = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
files[path].append(subpath)
# For accessing items
for path, subpaths in files.items():
# path is a string, the key in files dict
print(path)
# subpaths is a list of strings, the value in files dict
for subpath in subpaths:
print(subpath)
If you want the path/subpath pairs to be returned in the order it was inserted, you can use OrderedDict instead of dict.
from collections import OrderedDict
files = OrderedDict()
# code as above
it is because files[int(n)] returns you str and not a dictionary.
as you can see from your line.
files[int(n)] = path
you are trying to achieve a dictionary behavior from a str object.
to carry out what you are trying to do we can do something like.
from collections import defaultdict
files = {}
for n in range(npages):
path = pdf_name + str(n + 1) + '_1.txt'
files[int(n)] = defaultdict()
files[int(n)]['path_root'] = path
for i, col in enumerate(COLUMNS):
path = pdf_name + str(n + 1) + '_' + str(i + 2) + '.txt'
files[int(n)][int(i)] = path
this should give you result like:
|-- nth file
| |
| |- path_root
| |- child1 (0)
| |- child2 (1)
..
A quick side note about defaultdict:
somedict = {}
print(somedict[3]) # KeyError
someddict = defaultdict(int) # or str
print(someddict[3]) # print int(), thus 0 (str will return you '')
Related
I'm editing a PDF template with using pdftk
command = ("pdftk " + '"' +
template + '"' +
" fill_form " + '"' +
pathUser + user['mail'] + ".xfdf" + '"' +
" output " + '"' +
pathUser + user['mail'] + ".pdf" + '"' +
" need_appearances")
command = command.replace('/', '\\')
os.system(command)
First I'm writing my data in a .xfdf file
for key, value in user.items():
print(key, value)
fields.append(u"""<field name="%s"><value>%s</value></field>""" % (key, value))
tpl = u"""<?xml version="1.0" encoding="UTF-8"?>
<xfdf xmlns="http://ns.adobe.com/xfdf/" xml:space="preserve">
<fields>
%s
</fields>
</xfdf>""" % "\n".join(fields)
f = open(pathUser + user['mail'] + '.xfdf', 'wb')
f.write(tpl.encode("utf-8"))
f.close()
I fetch the template and as shown above, write the data from the xfdf to pdf but for some reason, only the ime gets written.
Templates get fetched using some basic conditional logic as shown below:
for item in user['predavanja']:
user[acthead + str(actn)] = item
actn += 1
for item in user['radionice']:
user[acthead + str(actn)] = item
actn += 1
for item in user['izlet']:
user[acthead + str(actn)] = item
actn += 1
print(actn)
templates = {}
templates['0'] = "Template/2019/certificate_2019.pdf"
templates['5'] = "Template/2019/certificate_2019_5.pdf"
templates['10'] = "Template/2019/certificate_2019_10.pdf"
templates['15'] = "Template/2019/certificate_2019_15.pdf"
templates['20'] = "Template/2019/certificate_2019_20.pdf"
templates['25'] = "Template/2019/certificate_2019_25.pdf"
templates['30'] = "Template/2019/certificate_2019_30.pdf"
templates['35'] = "Template/2019/certificate_2019_35.pdf"
templates['40'] = "Template/2019/certificate_2019_40.pdf"
templates['45'] = "Template/2019/certificate_2019_45.pdf"
templates['50'] = "Template/2019/certificate_2019_50.pdf"
I'm writing this data
user['id'] = data['recommendations'][0]['role_in_team']['user']['id']
user['ime'] = data['recommendations'][0]['role_in_team']['user']['first_name']
user['prezime'] = data['recommendations'][0]['role_in_team']['user']['last_name']
user['tim'] = data['recommendations'][0]['role_in_team']['team']['short_name']
user['mail'] = data['recommendations'][0]['role_in_team']['user']['estudent_email']
user['puno_ime'] = (data['recommendations'][0]['role_in_team']['user']['first_name'] + ' ' +
data['recommendations'][0]['role_in_team']['user']['last_name'])
user['predavanja'] = predavanja
user['radionice'] = radionice
user['izlet'] = izlet
One note. predavanja, radionice and izlet are lists.
I've tried printing tpl which shows all the data being properly added to the scheme.
Turns out the issue was the naming of the variables since they didn't match the field names in the acroform PDF. So the solution was to rename the variables in the code to match the field names.
I’m writing a program that makes music albums into files that you can search for, and for that i need a str in the file that have a specific value that is made after the list is complete. Can you go back in that list and change a blank str with a new value?
I have searched online and found something called words.replace, but it doesn’t work, i get a Attribute error.
def create_album():
global idnumber, current_information
file_information = []
if current_information[0] != 'N/A':
save()
file_information.append(idnumber)
idnumber += 1
print('Type c at any point to abort creation')
for i in creation_list:
value = input('\t' + i)
if value.upper == 'C':
menu()
else:
-1file_information.append('')
file_information.append(value)
file_information.append('Album created - ' + file_information[2] +'\nSongs:')
-2file_information = [w.replace(file_information[1], str(file_information[0]) + '-' + file_information[2]) for w in file_information]
current_information = file_information
save_name = open(save_path + str(file_information[0]) + '-' + str(file_information[2]) + '.txt', 'w')
for i in file_information:
save_name.write(str(i) + '\n')
current_files_ = open(information_file + 'files.txt', 'w')
filenames.append(file_information[0])
for i in filenames:
current_files_.write(str(i) + '\n')
id_file = open(information_file + 'albumid.txt', 'w')
id_file.write(str(idnumber))
-1 is where i have put aside a blank row
-2 is the where i try to replace row 1 in the list with the value of row 0 and row 2.
The error message I receive is ‘int’ object has no attribute ‘replace’
Did you try this?
-2file_information = [w.replace(str(file_information[1]), str(file_information[0]) + '-' + file_information[2]) for w in file_information]
I have a list of strings:
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
and I'd like to confirm that there is both a Run.1-Final and Run.2-Initial for each date.
I've tried something like:
for i in range(len(directoryList)):
if directoryList[i][5:15] != directoryList[i + 1][5:15]:
print(directoryList[i] + ' is missing.')
i += 2
and I'd like the output to be
'YMML.2019.09.14-Run.2-Initial.pdf is missing,
Perhaps something like
dates = [directoryList[i][5:15] for i in range(len(directoryList))]
counter = collections.Counter(dates)
But then having trouble extracting from the dictionary.
To make it more readable, you could create a list of dates first, then loop over those.
file_list = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
dates = set([item[5:15] for item in file_list])
for date in dates:
if 'YMML.' + date + '-Run.1-Final.pdf' not in file_list:
print('YMML.' + date + '-Run.1-Final.pdf is missing')
if 'YMML.' + date + '-Run.2-Initial.pdf' not in file_list:
print('YMML.' + date + '-Run.2-Initial.pdf is missing')
set() takes the unique values in the list to avoid looping through them all twice.
I'm kind of late but here's what i found to be the simplest way, maybe not the most efficent :
for file in fileList:
if file[20:27] == "1-Final":
if (file[0:20] + "2-Initial.pdf") not in fileList:
print(file)
elif file[19:29] is "2-Initial.pdf":
if (file[0:20] + "1-Final.pdf") not in fileList:
print(file)
Here's an O(n) solution which collects items into a defaultdict by date, then filters on quantity seen, restoring original names from the remaining value:
from collections import defaultdict
files = [
'YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',
]
seen = defaultdict(list)
for x in files:
seen[x[5:15]].append(x)
missing = [v[0] for k, v in seen.items() if len(v) < 2]
print(missing) # => ['YMML.2019.09.14-Run.1-Final.pdf']
Getting names of partners can be done with a conditional:
names = [
x[:20] + "2-Initial.pdf" if x[20] == "1" else
x[:20] + "1-Final.pdf" for x in missing
]
print(names) # => ['YMML.2019.09.14-Run.2-Initial.pdf']
This works:
fileList = ['YMML.2019.09.10-Run.1-Final.pdf',
'YMML.2019.09.10-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.2-Initial.pdf',
'YMML.2019.09.11-Run.1-Final.pdf',
'YMML.2019.09.12-Run.2-Initial.pdf',
'YMML.2019.09.13-Run.2-Initial.pdf',
'YMML.2019.09.12-Run.1-Final.pdf',
'YMML.2019.09.13-Run.1-Final.pdf',
'YMML.2019.09.14-Run.1-Final.pdf',]
initial_set = {filename[:15] for filename in fileList if 'Initial' in filename}
final_set = {filename[:15] for filename in fileList if 'Final' in filename}
for filename in final_set - initial_set:
print(filename + '-Run.2-Initial.pdf is missing.')
for filename in initial_set - final_set:
print(filename + '-Run.1-Final.pdf is missing.')
Writing a script to help with data migration in renaming images. It seems as though when I try to access the variable filename from within the inner-for-loop, it's just printing .DS_Store
See commented lines for example:
#!/usr/bin/env python
import os
import csv
FILE_PATH = '/Users/admin/Desktop/data-migration/images/product'
COUNT = 0
with open('paths_formatted.csv') as csvfile:
reader = csv.reader(csvfile)
# Walk the tree.
for root, directories, files in os.walk(FILE_PATH):
for filename in files:
# Join the two strings in order to form the full filepath.
filePath = os.path.join(root, filename)
#print(filePath) - this results in the actual file path
for row in reader:
#print(filePath) - this results in .DS_Store
oldFilePath = row[1].strip()
displayName = row[0].strip()
colour = row[2].strip()
if " " in colour:
colour = colour.replace(" ", "-")
slashIndex = oldFilePath.rfind("/")
oldFileName = oldFilePath[slashIndex+1:]
if oldFileName == filename:
number = 1;
newFileName = displayName + "_" + colour + "-" + str(number) + ".jpg"
while os.path.exists(FILE_PATH + leadingPath + newFileName):
number = number + 1
newFileName = filePath, displayName + "_" + colour + "-" + str(number)
os.rename(newFileName)
COUNT = COUNT+1
print(COUNT)
Why would this be?
After changing my code as per the comments, to store the results in a list, now the for root, directories, files in os.walk(FILE_PATH): is not being executed.
I verified that the FILE_PATH exists and printed it to console, also that it has contents.
My new code is as follows:
#!/usr/bin/env python
import os
import csv
FILE_PATH = '/Users/admin/Desktop/data-migration/images/product'
COUNT = 0
productInfo = []
with open('paths_formatted.csv') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
productInfo.append(row)
for root, directories, files in os.walk(FILE_PATH):
for filename in files:
for info in productInfo:
displayName = info[0]
oldFilePath = info[1]
colour = info[2]
slashIndex = oldFilePath.rfind("/")
oldFileName = oldFilePath[slashIndex+1:]
if " " in colour:
colour = colour.replace(" ", "-")
if oldFileName == filename:
number = 1;
newFileName = displayName + "_" + colour + "-" + str(number) + ".jpg"
while os.path.exists(FILE_PATH + leadingPath + newFileName):
number = number + 1
newFileName = filePath, displayName + "_" + colour + "-" + str(number) + ".jpg"
os.rename(newFileName)
COUNT = COUNT + 1
print(COUNT)
I've been successful using xml.etree.ElementTree to parse an xml, search for content, then write this to a different xml. However, I just worked with text, inside of a singe tag.
import os, sys, glob, xml.etree.ElementTree as ET
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
for fn in os.listdir(path):
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
(pa, filename) = os.path.split(filepath)
####use this section to grab element text from old, archived metadata files; this text then gets put into current, working .xml###
root = ET.parse(pa + os.sep + "archive" + os.sep + "base_metadata_overall.xml").getroot()
iterator = root.getiterator()
for item in iterator:
if item.tag == "abstract":
correct_abstract = item.text
root2 = ET.parse(pa + os.sep + "base_metadata_overall.xml").getroot()
iterator2 = root2.getiterator("descript")
for item in iterator2:
if item.tag == "abstract":
old_abstract = item.find("abstract")
old_abstract_text = old_abstract.text
item.remove(old_abstract)
new_symbol_abstract = ET.SubElement(item, "title")
new_symbol_abstract.text = correct_abstract
tree = ET.ElementTree(root2)
tree.write(pa + os.sep + "base_metadata_overall.xml")
print "created --- " + filename + " metadata"
But now, I need to:
1) search an xml and grab everything between "attr" tags, below is example:
<attr><attrlabl Sync="TRUE">OBJECTID</attrlabl><attalias Sync="TRUE">ObjectIdentifier</attalias><attrtype Sync="TRUE">OID</attrtype><attwidth Sync="TRUE">4</attwidth><atprecis Sync="TRUE">0</atprecis><attscale Sync="TRUE">0</attscale><attrdef Sync="TRUE">Internal feature number.</attrdef></attr>
2) Now, I need to open a different xml and search for all content between the same "attr" tag and replace with the above.
Basically, what I was doing before, but ignoring subelements, attributes, ect... between "attr" tags and treat it like text.
thanks!!
Please bear with me, this forum is a little different (posting) then Im used to!
Here's what I have so far:
import os, sys, glob, re, xml.etree.ElementTree as ET
from lxml import etree
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
(pa, filename) = os.path.split(filepath)
xml = open(pa + os.sep + "attributes.xml")
xmltext = xml.read()
correct_attrs = re.findall("<attr> (.*?)</attr>",xmltext,re.DOTALL)
for item in correct_attrs:
correct_attribute = "<attr>" + item + "</attr>"
xml2 = open(pa + os.sep + "base_metadata_overall.xml")
xmltext2 = xml2.read()
old_attrs = re.findall("<attr>(.*?)</attr>",xmltext,re.DOTALL)
for item2 in old_attrs:
old_attribute = "<attr>" + item + "</attr>"
old = etree.fromstring(old_attribute)
replacement = new.xpath('//attr')
for attr in old.xpath('//attr'):
attr.getparent().replace(attr, copy.deepcopy(replacement))
print lxml.etree.tostring(old)
got this working, see below, even figured out how to export to new .xml
However, If the # of attr's is dif. from source to dest, I get the following error, any suggestions?
node = replacements.pop()
IndexError: pop from empty list
import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
path = r"C:\\temp\\python\\xml"
for fn in os.listdir(path):
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
xmlatributes = open(pa + os.sep + "attributes.xml")
xmlatributes_txt = xmlatributes.read()
xmltarget = open(pa + os.sep + "base_metadata_overall.xml")
xmltarget_txt = xmltarget.read()
source = lxml.etree.fromstring(xmlatributes_txt)
dest = lxml.etree.fromstring(xmltarget_txt)
replacements = source.xpath('//attr')
replacements.reverse()
for attr in dest.xpath('//attr'):
node = replacements.pop()
attr.getparent().replace(attr, copy.deepcopy(node))
#print lxml.etree.tostring(dest)
tree = ET.ElementTree(dest)
tree.write (pa + os.sep + "edited_metadata.xml")
print fn + "--- sucessfully edited"
update 5/16/2011
restructured a few things to fix the "IndexError: pop from empty list" error mentioned above. Realized that the replacement of the "attr" tags will not always be a 1-to-1 replacement. For ex. sometimes the source .xml has 20 attr's and the destination .xml has 25 attr's. In this case, the 1-to-1 replacement would choke.
Anyway, the below will remove all attr's, then replace with the source attr's. It also checks for another tag, "subtype" if it exists, it adds them after the attr's, but inside the "detailed" tags.
thanks again to everyone who helped.
import os, sys, glob, re, copy, lxml, xml.etree.ElementTree as ET
from lxml import etree
path = r"G:\\63D RRC GIS Data\\metadata\\general\\2010_contract"
#path = r"C:\\temp\python\\xml"
for fn in os.listdir(path):
correct_title = fn.replace ('_', ' ') + " various facilities"
correct_fc_name = fn.replace ('_', ' ')
filepaths = glob.glob(path + os.sep + fn + os.sep + "*overall.xml")
for filepath in filepaths:
print "-----" + fn + "-----"
(pa, filename) = os.path.split(filepath)
xmlatributes = open(pa + os.sep + "attributes.xml")
xmlatributes_txt = xmlatributes.read()
xmltarget = open(pa + os.sep + "base_metadata_overall.xml")
xmltarget_txt = xmltarget.read()
source = lxml.etree.fromstring(xmlatributes_txt)
dest = lxml.etree.fromstring(xmltarget_txt)
replacements = source.xpath('//attr')
replacesubtypes = source.xpath('//subtype')
subtype_true_f = len(replacesubtypes)
attrtag = dest.xpath('//attr')
#print len(attrtag)
num_realatrs = len(replacements)
for n in attrtag:
n.getparent().remove(n)
print n.tag + " removed"
detailedtag = dest.xpath('//detailed')
for n2 in detailedtag:
pos = 0
for realatrs in replacements:
n2.insert(pos + 1, realatrs)
print "attr's replaced"
if subtype_true_f >= 1:
#print subtype_true_f
for realsubtypes in replacesubtypes:
n2.insert(num_realatrs + 1, realsubtypes)
print "subtype's replaced"
tree = ET.ElementTree(dest)
tree.write (pa + os.sep + "base_metadata_overall_v2.xml")
print fn + "--- sucessfully edited"
Here is an example of using lxml to do this. I'm not exactly sure how you want the <attr/> nodes replaced, but this example should provide a pattern you can reuse.
Update - I changed it to replace each <attr> in tree2 with the corresponding node from tree1, in document order:
import copy
import lxml.etree
xml1 = '''<root><attr><chaos foo="0"/></attr><attr><arena foo="1"/></attr></root>'''
xml2 = '''<tree><attr><one/></attr><attr><two/></attr></tree>'''
tree1 = lxml.etree.fromstring(xml1)
tree2 = lxml.etree.fromstring(xml2)
# select <attr/> nodes from tree1, will be used to replace corresponding
# nodes in tree2
replacements = tree1.xpath('//attr')
replacements.reverse()
for attr in tree2.xpath('//attr'):
# replace the attr node in tree2 with 'replacement' from tree1
node = replacements.pop()
attr.getparent().replace(attr, copy.deepcopy(node))
print lxml.etree.tostring(tree2)
Result:
<tree>
<attr><chaos foo="0"/></attr>
<attr><arena foo="1"/></attr>
</tree>
This sounds like something that XSL-T transformations were made for. Have you tried that?
I'd also recommend a library like Beautiful Soup for parsing and manipulating XML.