Parsing values from a CSV as strings into a text file - Python

I am attempting to create a txt file that lists the XML files in a directory, together with the text of a given tag inside each XML file.
I am having trouble reading a CSV row into a variable with the code below. I have tried to pull the required values several ways but keep running into a brick wall.
Here is the code:
import csv
import glob
import os
import zipfile
from xml.dom.minidom import parse

container = raw_input("Choose a filename for your container:")
epub = zipfile.ZipFile(container + ".zip", 'w')
xmlinput = glob.glob('./*.xml')

def xmldrop(dir):
    for r, d, f in os.walk(dir):
        for files in f:
            if files.endswith(".xml"):
                dom = parse(os.path.join(r, files))
                name = dom.getElementsByTagName('title')
                with open('catalog.csv', 'a') as f:
                    f.write(files + "," + name[0].firstChild.nodeValue + "\n")

xmldrop("./")

line_number = 0
with open('catalog.csv', 'rb') as f:
    mycsv = csv.reader(f)
    mycsv = list(mycsv)
    text = mycsv[line_number+1][1]

list_tpl = '''
<Container>
<FileName>
%(FileName)s
</FileName>
</Container>'''

FileName = ""
for i, xml in enumerate(xmlinput):
    basename = os.path.basename(xml)
    FileName += ('<Fileid="%i" filename="%s"> <title>%s</title> </Fileid>' %
                 (i+1, basename, text))

epub.writestr('list.txt', list_tpl % {
    'FileName': FileName
})
I am able to successfully pull the information into the csv file, as seen in this output:
file_1.xml,Intro
file_2.xml,Assessment
file_3.xml,Review
file_4.xml,Catalog
but the list.txt file that gets generated looks like:
<Container>
<FileName>
<Fileid="1" filename="file_1.xml"> <title>Assessment</title> </Fileid>
<Fileid="2" filename="file_2.xml"> <title>Assessment</title> </Fileid>
<Fileid="3" filename="file_3.xml"> <title>Assessment</title> </Fileid>
<Fileid="4" filename="file_4.xml"> <title>Assessment</title> </Fileid>
</FileName>
</Container>
Desired output would be:
<Container>
<FileName>
<Fileid="1" filename="file_1.xml"> <title>Intro</title> </Fileid>
<Fileid="2" filename="file_2.xml"> <title>Assessment</title> </Fileid>
<Fileid="3" filename="file_3.xml"> <title>Review</title> </Fileid>
<Fileid="4" filename="file_4.xml"> <title>Catalog</title> </Fileid>
</FileName>
</Container>
Any assistance is greatly appreciated. I have been trying to pair the two up for over a week now with no success.

You aren't updating the text variable when you print out your XML.
You set it once:
text = mycsv[line_number+1][1]
but you never update it again, so every iteration outputs Assessment.
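A minimal sketch of the fix, assuming the rows in catalog.csv are written in the same order that glob.glob returns the files: look up the title inside the loop instead of once before it.

with open('catalog.csv', 'rb') as f:
    mycsv = list(csv.reader(f))

FileName = ""
for i, xml in enumerate(xmlinput):
    basename = os.path.basename(xml)
    text = mycsv[i][1]  # re-read the title for this row on every iteration
    FileName += ('<Fileid="%i" filename="%s"> <title>%s</title> </Fileid>' %
                 (i+1, basename, text))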

Related

Searching Excel files for a string across multiple directories not working

I have this code where I am trying to search a directory and its sub-directories for a specified string within .xls and .xlsx files and return the file names for now. When I run this, I get back the directory path of every file ending in .xls or .xlsx, with my search string printed under each result. The code is not isolating the files that contain the string; it just returns the path of every file and prints my search string beneath it. What could be happening here? Also, is it possible to pass a list here and copy the discovered files to a folder? That is where I am trying to get with this in the end. Thank you.
import os
import openpyxl

def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
    filesInDir = []
    foundFiles = []
    filesFound = 0
    if not subDirs:
        for filename in os.listdir(dir):
            if os.path.isfile(os.path.join(dir, filename).replace("\\", "/")):
                filesInDir.append(os.path.join(dir, filename).replace("\\", "/"))
    else:
        for root, subdirs, files in os.walk(dir):
            for f in files:
                if not os.path.isdir(os.path.join(root, f).replace("\\", "/")):
                    filesInDir.append(os.path.join(root, f).replace("\\", "/"))
    print(filesInDir)
    if filesInDir:
        for file in filesInDir:
            print("Current file: " + file)
            filename, extension = os.path.splitext(file)
            if fileExtensions:
                fileText = extension
            else:
                fileText = os.path.basename(filename).lower()
            if fileContent:
                fileText += getFileContent(file).lower()
            for string in strings:
                print(string)
                if string in fileText:
                    foundFiles.append(file)
                    filesFound += 1
                    break
    return foundFiles

def getFileContent(filename):
    if filename.partition(".")[2] in supportedTypes:
        if filename.endswith(".xls"):
            content = ""
            with openpyxl.load_workbook(filename) as pdf:
                for x in range(0, len(pdf.pages)):
                    page = pdf.pages[x]
                    content = content + page.extract_text()
            return content
        elif filename.endswith(".xlsx"):
            with openpyxl.load_workbook(filename, 'r') as f:
                content = ""
                lines = f.readlines()
                for x in lines:
                    content = content + x
                f.close()
                return content
    else:
        return ""

supportedTypes = [".xls", ".xlsx"]
print(findFiles(strings=["55413354"], dir="C:/Users/User/", subDirs=True, fileContent=True, fileExtensions=False))
Expected output sample - reflects a find for string '55413354' - as in, that string was located in the file below only, out of 3 files.
Excel File Name 123
Actual output - returns everything; no filtering is happening, and my search string appears under each file name.
path/Excel File Name 123
55413354
path/Excel File Name 321
55413354
path/Excel File Name 111
55413354
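For reference, a hedged sketch of what getFileContent could look like: the code above treats the workbook like a PDF/file object, but openpyxl exposes worksheets and cells instead, and it can only read .xlsx (legacy .xls needs another library such as xlrd).

import openpyxl

def getFileContent(filename):
    # Sketch: concatenate every cell value in every sheet of an .xlsx file.
    # openpyxl cannot read legacy .xls files; those need e.g. xlrd.
    if not filename.endswith(".xlsx"):
        return ""
    content = ""
    wb = openpyxl.load_workbook(filename, read_only=True, data_only=True)
    for ws in wb.worksheets:
        for row in ws.iter_rows(values_only=True):
            for cell in row:
                if cell is not None:
                    content += str(cell)
    wb.close()
    return content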

Extract certain data from multiple .txt files using Python and RegEx

I have several .txt files and I need to extract certain data from them. The files look similar, but each of them stores different data. Here is an example of such a file:
Start Date: 21/05/2016
Format: TIFF
Resolution: 300dpi
Source: X Company
...
There is more information in the text files, but I need to extract the start date, the format, and the resolution. The files are under the same parent directory ("E:\Images"), but each file has its own folder. Therefore I need a script that reads these files recursively. Here is my script so far:
#importing a library
import os

#defining location of parent folder
BASE_DIRECTORY = 'E:\Images'

#scanning through subfolders
for dirpath, dirnames, filenames in os.walk(BASE_DIRECTORY):
    for filename in filenames:
        #defining file type
        txtfile = open(filename, "r")
        txtfile_full_path = os.path.join(dirpath, filename)
        try:
            for line in txtfile:
                if line.startswidth('Start Date:'):
                    start_date = line.split()[-1]
                elif line.startswidth('Format:'):
                    data_format = line.split()[-1]
                elif line.startswidth('Resolution:'):
                    resolution = line.split()[-1]
            print(
                txtfile_full_path,
                start_date,
                data_format,
                resolution)
Ideally it might be better if Python extracted the data together with the name of each file and saved it in a text file. Because I don't have much experience in Python, I don't know how to progress any further.
Here is the code I've used:
# importing libraries
import os

# defining location of parent folder
BASE_DIRECTORY = 'E:\Images'
output_file = open('output.txt', 'w')
output = {}
file_list = []

# scanning through sub folders
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
    for f in filenames:
        if 'txt' in str(f):
            e = os.path.join(str(dirpath), str(f))
            file_list.append(e)

for f in file_list:
    print f
    txtfile = open(f, 'r')
    output[f] = []
    for line in txtfile:
        if 'Start Date:' in line:
            output[f].append(line)
        elif 'Format' in line:
            output[f].append(line)
        elif 'Resolution' in line:
            output[f].append(line)

tabs = []
for tab in output:
    tabs.append(tab)
tabs.sort()

for tab in tabs:
    output_file.write(tab + '\n')
    output_file.write('\n')
    for row in output[tab]:
        output_file.write(row + '')
    output_file.write('\n')
    output_file.write('----------------------------------------------------------\n')

raw_input()
You do not need regular expressions. You can use basic string functions:

txtfile = open(filename, "r")
for line in txtfile:
    if line.startswith("Start Date:"):
        start_date = line.split()[-1]
    ...

and break once you have collected all the information.
To grab the Start Date, you can use the following regex:

^(?:Start Date:)\D*(\d+/\d+/\d+)$
# ^ anchors the regex to the start of the line
# (?:Start Date:) matches the literal string "Start Date:" without capturing it
# \D* then matches non-digits zero or more times
# (\d+/\d+/\d+) captures the start date itself in a group

In Python this would be:

import re
regex = r"^(?:Start Date:)\D*(\d+/\d+/\d+)$"
# the variable line points to your line in the file
if re.search(regex, line):
    # do sth. useful here

See a demo on regex101.
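Putting both answers together, a minimal sketch that walks the tree, pulls the three fields with startswith(), and writes one line per file; the output format and field handling are assumptions:

import os

BASE_DIRECTORY = r'E:\Images'

with open('output.txt', 'w') as out:
    for dirpath, dirnames, filenames in os.walk(BASE_DIRECTORY):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            full_path = os.path.join(dirpath, filename)
            # collect the three fields; empty string if a field is missing
            fields = {'Start Date:': '', 'Format:': '', 'Resolution:': ''}
            with open(full_path) as txtfile:
                for line in txtfile:
                    for prefix in fields:
                        if line.startswith(prefix):
                            fields[prefix] = line.split()[-1]
            out.write('%s %s %s %s\n' % (full_path, fields['Start Date:'],
                                         fields['Format:'], fields['Resolution:']))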

Merge files into xlsx and then reconstruct the dir

I have many files ('*.pl-pl'). My script has to find each of these files and merge them into one xlsx file using openpyxl.
Now I want to rebuild those files; I want to reconstruct the same files as the originals.
But there is a problem after writing
(the content variable contains the content of one file, read from one Excel cell):

with open(path, 'w') as f:
    f.write(content.encode('utf-8'))

So now I check whether the original files are the same as the new files. The text in those files seems to be the same, but there are small differences in size. When I use the WinDiff application to compare them, it finds some lines which are different, but it says that they differ in blanks only.
Could you give me advice on how to rebuild those files so they are the same as before?
Or is this way correct?
Note: I am rebuilding them to be sure that the encoding etc. will stay the same, because the merged Excel file will be used for translation, and the translated files then have to be rebuilt in place of the originals.
Here is the code. It checks the directory and prints all file names and contents into one temporary file; then it creates an Excel file whose first column is the path (so the directory can be reconstructed) and whose second column contains the content of the file, with newlines replaced by '*=*'.
def print_to_file():
    import os
    for root, dirs, files in os.walk("OriginalDir"):
        for file in files:
            text = []
            if file.endswith(".pl-pl"):
                abs_path = os.path.join(root, file)
                with open(abs_path) as f:
                    for line in f:
                        text.append(line.strip('\n'))
                mLib.printToFile('files.mdoc', abs_path + '::' + '*=*'.join(text))  # '*=*' represents '\n'

def write_it():
    from openpyxl import Workbook
    file = 'files.mdoc'
    workbook = Workbook()
    worksheet = workbook.worksheets[0]
    worksheet.title = "Translate"
    i = 0
    with open(file) as f:
        for line in f:
            i += 1
            splitted = line.strip('\n').split('::')
            name = splitted[0]
            text = splitted[1].split('*=*')
            text = [x.encode('string-escape') for x in text]
            worksheet.cell('B{}'.format(i)).style.alignment.wrap_text = True
            worksheet.cell('B{}'.format(i)).value = splitted[1]
            worksheet.cell('A{}'.format(i)).value = splitted[0]
    workbook.save('wrap_text1.xlsx')

import openpyxl

def rebuild():
    wb = openpyxl.load_workbook('wrap_text1.xlsx')
    ws = wb.worksheets[0]
    row_count = ws.get_highest_row()
    for i in xrange(1, row_count + 1):
        dir_file = ws.cell('A{}'.format(i)).value
        content = ws.cell('B{}'.format(i)).value
        remake(dir_file, content)

import os
import re

def remake(path, content):
    content = re.sub('\*=\*', '\n', content)
    result = ''
    splt = path.split('\\')
    file = splt[-1]
    for dir in splt[:-1]:
        result += dir + '/'
        if not os.path.isdir(result):
            os.mkdir(result)
    with open(path, 'w') as f:
        f.write(content.encode('utf-8'))

# print_to_file()  # print to temp file - paths and contents separated by '::'
# write_it()       # write it into the excel file
# rebuild()        # reconstruct the directory
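As for the size differences "in blanks only": the most likely cause is newline translation. On Windows, writing with mode 'w' converts '\n' to '\r\n', so rebuilt files can differ from LF-only originals purely in line endings (plus a missing final newline, since line.strip('\n') drops it). A minimal sketch, assuming Python 2, that writes the bytes untranslated:

# Write in binary mode so Windows does not turn '\n' into '\r\n';
# the encoded bytes reach the file exactly as produced.
with open(path, 'wb') as f:
    f.write(content.encode('utf-8'))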

Python - I'm trying to unzip a file that has multiple zip files within

My goal is to get to a txt file that is within the second layer of zip files. The issue is that the txt file has the same name in every .zip, so each extraction overwrites the previous .txt and only one .txt is left.
from ftplib import *
import os, shutil, glob, zipfile, xlsxwriter

ftps = FTP_TLS()
ftps.connect(host='8.8.8.8', port=23)
ftps.login(user='xxxxxxx', passwd='xxxxxxx')
print ftps.getwelcome()
print 'Access was granted'
ftps.prot_p()
ftps.cwd('DirectoryINeed')
data = ftps.nlst()  # Returns a list of .zip files
data.sort()  # Sorts the thing out
theFile = data[-2]  # It's a .zip file; stores the .zip I need to retrieve
fileSize = ftps.size(theFile)  # gets the size of the file
print fileSize, 'bytes'  # prints the size

def grabFile():
    filename = 'the.zip'
    localfile = open(filename, 'wb')
    ftps.retrbinary('RETR ' + theFile, localfile.write)
    ftps.quit()
    localfile.close()

def unzipping():
    zip_files = glob.glob('*.zip')
    for zip_file in zip_files:
        with zipfile.ZipFile(zip_file, 'r') as Z:
            Z.extractall('anotherdirectory')

grabFile()
unzipping()
lastUnzip()
After this runs, it grabs the .zip that I need and extracts the contents to a folder named anotherdirectory, which holds the second tier of .zips. This is where I get into trouble: when I try to extract the files from each zip, they all share the same name, and I end up with a single .txt when I need one for each zip.
I think you're specifying the same output directory and filename each time. In the unzipping function, change

Z.extractall('anotherdirectory')

to

Z.extractall(zip_file)

or

Z.extractall('anotherdirectory/' + zip_file)

If the extracted file names are all the same, give each output folder a unique numbered name. Before the unzipping function:

count = 1

then replace the extract line with:

Z.extractall('anotherdirectory/' + str(count))
count += 1
Thanks to jeremydeanlakey's response, I was able to get this part of my script. Here is how I did it:

folderUnzip = 'DirectoryYouNeed'
zip_files = glob.glob('*.zip')
count = 1

for zip_file in zip_files:
    with zipfile.ZipFile(zip_file, 'r') as Z:
        Z.extractall(folderUnzip + '/' + str(count))
    count += 1
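An equivalent sketch that names each output folder after the archive itself (the 'anotherdirectory' name is an assumption), so each extracted .txt stays traceable to its source zip:

import glob
import os
import zipfile

for zip_file in glob.glob('*.zip'):
    stem = os.path.splitext(os.path.basename(zip_file))[0]  # 'foo.zip' -> 'foo'
    with zipfile.ZipFile(zip_file, 'r') as Z:
        Z.extractall(os.path.join('anotherdirectory', stem))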

How to replace strings in multiple files using Python

I have two files (say file1 and file2) containing equal numbers of strings.
I want to search for the strings from file1 in a directory (which has multiple sub-directories containing XML files) and replace each one with the corresponding string from file2.
import subprocess
import sys
import os

f = open("file1.txt")
g = open("file2.txt")
f_line = f.readlines()
g_line = g.readlines()

i = 0
for line in f_line:
    if line.replace("\r\n", "") != g_line[i].replace("\r\n", ""):
        print(line)
        print(g_line[i])
        cmd = "sed -i 's/" + line.replace("\r\n", "") + "/" + g_line[i].replace("\r\n", "") + "/g' " + "`grep -l -R " + line.replace("\r\n", "") + " *.xml`"
        print(cmd)
        os.system(cmd)
    i = i + 1
But the problem I'm facing is this: the script finds the files and strings and prints the commands (print(cmd)), but when I run the script from inside the directory, I see the error "no input files for sed" in the Cygwin window.
Read the two files into a dictionary, then walk the directory reading the XML files, replacing their contents, backing them up, and overwriting the originals:
import os
import re
import shutil

f1 = open('pathtofile1').readlines()
f2 = open('pathtofile2').readlines()

replaceWith = dict()
for i in range(len(f1)):
    replaceWith[f1[i].strip()] = f2[i].strip()

for root, dirnames, filenames in os.walk('pathtodir'):
    for name in filenames:
        if not name.endswith('.xml'):
            continue  # only touch the XML files
        path = os.path.join(root, name)
        f = open(path, 'r')
        contents = f.read()
        f.close()
        for k, v in replaceWith.items():  # iterate over pairs, not just keys
            contents = re.sub(k, v, contents)
        shutil.copyfile(path, path + '.bak')  # back up the original first
        f = open(path, 'w')
        f.write(contents)
        f.close()
A limitation is that if some search strings appear in the replacement strings, a string may be replaced several times over.
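Also worth noting: re.sub treats each search string as a regular expression. If the strings are plain text, a hedged alternative is str.replace, which sidesteps surprises with regex metacharacters:

# Plain-text replacement; no regex metacharacters are interpreted.
for k, v in replaceWith.items():
    contents = contents.replace(k, v)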
