Cutting character values according to value from file - python

This is what I am doing:
import csv

output = open('output.txt', 'wb')

# this function returns the min for num.txt
def get_min(num):
    return int(open('%s.txt' % num, 'r+').readlines()[0])

# temporary variables
last_line = ''
input_list = []

# iterate over input.txt and collect the input into a list of tuples
for i, line in enumerate(open('input.txt', 'r+').readlines()):
    if i % 2 == 0:
        last_line = line
    else:
        input_list.append((last_line, line))

filtered = [(header, data[:get_min(header[-2])] + '\n') for (header, data) in input_list]
[output.write(''.join(data)) for data in filtered]
output.close()
In this code, input.txt is something like this:
>012|013|0|3|M
AFDSFASDFASDFA
>005|5|67|0|6
ACCTCTGACC
>029|032|4|5|S
GGCAGGGAGCAGGCCTGTA
and num.txt is something like this:
M 4
P 10
I want the script to look at the last column of each header in input.txt, look up that letter in num.txt, and cut characters from the corresponding sequence according to that value.
I think the error in my code is that it only accepts an integer text file, where it should also accept a file that contains alphabetic keys.
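For reference, a minimal sketch of the lookup-and-trim described above, assuming num.txt maps a single letter to a count and that the count is the number of characters to drop (as in the revised version below):
# Build the letter -> count table from num.txt (e.g. "M 4").
counts = {}
with open('num.txt') as fh:
    for row in fh:
        parts = row.split()
        if len(parts) == 2:
            counts[parts[0]] = int(parts[1])

# Walk input.txt as header/sequence pairs and trim matching sequences.
with open('input.txt') as fh, open('output.txt', 'w') as out:
    lines = [l.rstrip('\n') for l in fh]
    for header, seq in zip(lines[::2], lines[1::2]):
        key = header.split('|')[-1]      # last column of the header
        if key in counts:
            seq = seq[counts[key]:]      # drop the first N characters
        out.write(header + '\n' + seq + '\n')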

The totally revised version, after a long chat with the OP:
import os
import re

# Fetch all hashes and counts
file_c = open('num.txt')
file_c = file_c.read()
lines = re.findall(r'\w+\.txt \d+', file_c)
numbers = {}
for line in lines:
    line_split = line.split('.txt ')
    hash_name = line_split[0]
    count = line_split[1]
    numbers[hash_name] = count
#print(numbers)

# The input file
file_i = open('input.txt')
file_i = file_i.read()
for hash_name, count in numbers.iteritems():
    regex = '(' + hash_name.strip() + ')'
    result = re.findall(r'>.*\|(' + regex + ')(.*?)>', file_i, re.S)
    if len(result) > 0:
        data_original = result[0][2]
        stripped_data = result[0][2][int(count):]
        file_i = file_i.replace(data_original, '\n' + stripped_data)
        #print(data_original)
        #print(stripped_data)
#print(file_i)

# Write the result to a new file, input_new.txt
f = open('input_new.txt', 'wt')
f.write(file_i)

You can do it like so:
import re

min_count = 4 # this variable will contain that count integer from where to start removing
str_to_match = 'EOG6CC67M' # this variable will contain the filename you read
input = '' # The file input (input.txt) will go in here
counter = 0

def callback_f(e):
    global min_count
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    # Only replace the value with nothing (remove it) after a certain count
    if counter > min_count:
        return '' # replace with nothing
    return e.group() # otherwise keep the match unchanged (re.sub needs a string back)

result = re.sub(r'' + str_to_match, callback_f, input)
With this tactic you can keep count with a global counter, and there's no need for hard-to-read line loops with complex structures.
Update
More detailed version with file access:
import os
import re

def callback_f(e):
    global counter
    counter += 1
    # Check your input
    print(str(counter) + ' >>> ' + e.group())
    return e.group() # keep the match; change this to '' to remove it (re.sub needs a string back)

# Fetch all hash-file names and their content (count)
num_files = os.listdir('./num_files')
numbers = {}
for file in num_files:
    if file[0] != '.':
        file_c = open('./num_files/' + file)
        file_c = file_c.read()
        numbers[file.split('.')[0]] = file_c

# Now the CSV files
csv_files = os.listdir('./csv_files')
for file in csv_files:
    if file[0] != '.':
        for hash_name, min_count in numbers.iteritems():
            file_c = open('./csv_files/' + file)
            file_c = file_c.read()
            counter = 0
            result = re.sub(r'' + hash_name, callback_f, file_c)
            # Write the replaced content back to the file here
The directory/file structure considered:
+ Projects
+ Project_folder
+ csv_files
- input1.csv
- input2.csv
~ etc.
+ num_files
- EOG6CC67M.txt
- EOG62JQZP.txt
~ etc.
- python_file.py
The CSV files contain the big chunks of text you state in your original question.
The num files are the hash files, each containing an integer.
What happens in this script:
Collect all hash files (in a dictionary) and their inner count numbers
Loop through all CSV files
Subloop through the collected numbers for each CSV file
Replace/remove (based on what you do in callback_f()) hashes after a certain count
Write the output back (it's the last comment in the script; a sketch follows below)
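A sketch of that write-back step, assuming you simply overwrite each CSV file with the replaced content (the helper name is hypothetical):
def write_back(path, content):
    # Overwrite the original file with the replaced content.
    with open(path, 'w') as fh:
        fh.write(content)

# In the subloop above, after re.sub():
# write_back('./csv_files/' + file, result)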

Related

Python: text log file processing and transposing rows to columns

I am new to Python and stuck with a log file in text format that has the following repetitive structure; I am required to extract the data from the rows and turn it into columns depending on the data. For example:
The first 50 lines are trash, like below (the first six lines):
-------------------------------------------------------------
Logging to file xyz.
Char
1,
3
r
=
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3
----------------------------------------------
a 1
b 2
c 3
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3, P_NO=546467
----------------------------------------------
Test_data_1 00001
Test_data_2 FOXABC
Test_data_3 SHEEP123
Country US
----------------------------------------------
Pid 0
Name SAB=1
----------------------------------------------
Sno 893489423
Log FileFormat
------------Continues for another million lines.
Now the required output is like below:
Required output format
PID, Name, a,b,c
0, "SAB=1, XYZ=3", 1,2,3
PID, Name , Test_data_1, Test_data_2, Test_data_3, Country
0, "SAB=1, XYZ=3, P_NO=546467", 00001, FOXABC, SHEEP123, US
Pid, Name, Sno
0, SAB=1, 893489423
I tried to write some code but failed to get the desired results. My attempt was as below:
fn = open(file_name, 'r')
for i, line in enumerate(fn):
    if i >= 50 and "Name " in line: # skip the first 50 lines / starting point
        last_tag = line.split(",")[-1]
        last_element = last_tag.split("=")[0]
        print(last_element)
Any help would be really appreciated.
Newly discovered structure: "RBY Structure" (screenshot not reproduced here).
The solution I came up with is a bit messy but it works, check it out below:
import sys
import re
import StringIO

ifile = open(sys.argv[1], 'r') # Input log file as command-line argument
ofile = open(sys.argv[1][:-4] + "_formatted.csv", 'w') # output formatted log txt
stringOut = ""
i = 0
flagReturn = True
j = 0
reVal = re.compile("Pid[\s]+(.*)\nName[\s]+(.*)\n[-]+\<br\>(.*)\<br\>") # Regex pattern for separating the Pid & Name from the variables
reVar = re.compile("(.*)[ ]+(.*)") # Regex pattern for getting vars and their values
reVarStr = re.compile(">>> [0-9]+.(.*)=(.*)") # Regex pattern for Struct
reVarStrMatch = re.compile("Struct(.*)+has(.*)+members:") # Regex pattern for Struct check
for lines in ifile.readlines():
    if(i > 8): # Omitting the first 9 lines of garbage values
        if(lines.strip() == "----------------------------------------------"): # Checking for separation between PID & Name group and the Var group
            j += 1 # keeps track of whether we are inside the vars section (between two rows of hyphens)
            flagReturn = not flagReturn # To print the variables in a single line to easily separate them with regex pattern reVal
        if(not flagReturn):
            stringTmp = lines.strip() + "<br>" # adding a break to the end of each vars line for easier separation
        else:
            stringTmp = lines # if not vars then save each line as is
        stringOut += stringTmp # concatenating each line to form the searchable string
    i += 1 # incrementing for omitting lines (useless after i=8)
    if(j == 2): # Once a complete set of PIDs, Names and Vars has been collected
        j = 0 # Reset j
        matchObj = reVal.match(stringOut) # Match for PID, Name & Vars
        line1 = "Pid,Name,"
        line2 = matchObj.group(1).strip() + ",\"" + matchObj.group(2) + "\","
        buf = StringIO.StringIO(matchObj.group(3).replace("<br>", "\n"))
        structFlag = False
        for line in buf.readlines(): # Separate each var and add to the respective strings for writing to file
            if(not (reVarStrMatch.match(line) is None)):
                structFlag = True
            elif(structFlag and (not (reVarStr.match(line) is None))):
                matchObjVars = reVarStr.match(line)
                line1 += matchObjVars.group(1).strip() + ","
                line2 += matchObjVars.group(2).strip() + ","
            else:
                structFlag = False
                matchObjVars = reVar.match(line)
                try:
                    line1 += matchObjVars.group(1).strip() + ","
                    line2 += matchObjVars.group(2).strip() + ","
                except:
                    line1 += line.strip() + ","
                    line2 += " ,"
        ofile.writelines(line1[:-1] + "\n")
        ofile.writelines(line2[:-1] + "\n")
        ofile.writelines("\n")
        stringOut = "" # Resetting the string
ofile.close()
ifile.close()
EDIT
This is what I came up with to include the new pattern as well.
I suggest you do the following:
1. Run the parser script on a copy of the log file and see where it fails next.
2. Identify and write down the new pattern that broke the parser.
3. Delete all data in the newly identified pattern.
4. Repeat from Step 1 till all patterns have been identified.
5. Create an individual regular expression for each type of pattern and call them in separate functions to write to the string (a sketch follows below).
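A rough sketch of that per-pattern dispatch idea; the pattern names and handler functions here are hypothetical, not taken from the log format above:
import re

# One handler per block type; these two are hypothetical examples.
def handle_struct(match):
    # Struct members arrive as ">>> 1.NAME=VALUE"
    return match.group(1).strip(), match.group(2).strip()

def handle_plain(match):
    # Plain variables arrive as "name value"
    return match.group(1).strip(), match.group(2).strip()

# One (compiled pattern, handler) entry per pattern you identify.
DISPATCH = [
    (re.compile(r">>> [0-9]+\.(.*)=(.*)"), handle_struct),
    (re.compile(r"(\S+)[ ]+(\S+)"), handle_plain),
]

def parse_line(line):
    # Try each known pattern in order; None means a new, unknown
    # pattern has been found and Step 2 above applies.
    for pattern, handler in DISPATCH:
        m = pattern.match(line)
        if m:
            return handler(m)
    return None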
EDIT 2
structFlag = False
RBYFlag = False
for line in buf.readlines(): # Separate each var and add to the respective strings for writing to file
    if(not (reVarStrMatch.match(line) is None)):
        structFlag = True
    elif(structFlag and (not (reVarStr.match(line) is None))):
        matchObjVars = reVarStr.match(line)
        if(matchObjVars.group(1).strip() == "RBY" and not RBYFlag):
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + "**"
            RBYFlag = True
        elif(matchObjVars.group(1).strip() == "RBY"):
            line2 += matchObjVars.group(2).strip() + "**"
        else:
            if(RBYFlag):
                line2 = line2[:-2]
                RBYFlag = False
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + ","
    else:
        structFlag = False
        if(RBYFlag):
            line2 = line2[:-2]
            RBYFlag = False
        matchObjVars = reVar.match(line)
        try:
            line1 += matchObjVars.group(1).strip() + ","
            line2 += matchObjVars.group(2).strip() + ","
        except:
            line1 += line.strip() + ","
            line2 += " ,"
NOTE
This loop has become very bloated and it is better to create a separate function to identify the type of data and return some value accordingly.

split() issues with pdf extractText()

I'm working on a minor content analysis program that I was hoping to have run through several PDF files and return the sum of frequencies with which some specific words are mentioned in the text. The words that are searched for are specified in a separate text file (list.txt) and can be altered. The program runs just fine through files in .txt format, but the result is completely different when running it on a .pdf file. To illustrate, the test text that I have the program running through is the following:
"Hello
This is a product development notice
We’re working with innovative measures
A nice Innovation
The world that we live in is innovative
We are currently working on a new process
And in the fall, you will experience our new product development introduction"
The list of words grouped into categories is the following (categories are marked in the .txt file with ">>"):
innovation: innovat
product: Product, development, introduction
organization: Process
The output from running the code with a .txt file is the following (screenshot not reproduced here):
Whereas the output from running it with a .pdf is the following (screenshot not reproduced here):
As you can see, my issue pertains to the splitting of the words: in the .pdf output I can have a string like "world" be split into 'w', 'o', 'rld'. I have searched tirelessly for why this happens, without success. As I am rather new to Python programming, I would appreciate any answer, or a pointer to where I can find an answer to why this happens, should you know any source.
Thanks
The code for the .txt is as follows:
import string, re, os
import PyPDF2
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.txt'
textfile = open(f)
text = textfile.read().split() # lowercase the text
print (text)
textfile.close()
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in text:
for pattern in dic.keys():
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])
While the code for the .pdf is as follows:
import string, re, os
import PyPDF2
dictfile = open('list.txt')
lines = dictfile.readlines()
dictfile.close()
dic = {}
scores = {}
i = 2011
while i < 2012:
f = 'annual_report_' + str(i) +'.pdf'
textfile = open(f, 'rb')
text = PyPDF2.PdfFileReader(textfile)# lowercase the text
for pageNum in range(0, text.numPages):
texts = text.getPage(pageNum)
textfile = texts.extractText().split()
print (textfile)
i = i + 1
# a default category for simple word lists
current_category = "Default"
scores[current_category] = 0
# import the dictionary
for line in lines:
if line[0:2] == '>>':
current_category = line[2:].strip()
scores[current_category] = 0
else:
line = line.strip()
if len(line) > 0:
pattern = re.compile(line, re.IGNORECASE)
dic[pattern] = current_category
# examine the text
for token in textfile:
for pattern in dic.keys():
if pattern.match( token ):
categ = dic[pattern]
scores[categ] = scores[categ] + 1
print (os.path.basename(f))
for key in scores.keys():
print (key, ":", scores[key])
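No accepted answer is reproduced here, but two things are worth noting. First, PyPDF2's extractText() rebuilds text from the PDF's positioned glyph runs, so spacing inside words is often lost or misplaced; that is the likely source of fragments like 'w', 'o', 'rld', and a dedicated extraction library often tokenizes more reliably. Second, as written, the scoring loop above only sees the last page, because textfile is overwritten on every page. A hedged sketch that accumulates tokens across all pages before scoring (file name taken from the code above, assuming the PyPDF2 1.x API):
import PyPDF2

# Collect tokens from every page before scoring, instead of keeping
# only the last page's tokens.
tokens = []
with open('annual_report_2011.pdf', 'rb') as fh:
    reader = PyPDF2.PdfFileReader(fh)
    for page_num in range(reader.numPages):
        tokens.extend(reader.getPage(page_num).extractText().split())
# 'tokens' can now replace 'textfile' in the scoring loop above.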

Extracting data from a text file to an output file

I have a lot of files whose names are just numbers (starting from 1 up to whatever the maximum is), and each of these files is similar to the others in its "tags" (ObjectID =, X =, Y =, etc.), but the values after those tags are not the same at all.
I wanted to make my job easier than manually copying/pasting the data from one file to another, so I made a small script using Python (since I am slightly experienced in it).
This is the full script:
import os

BASE_DIRECTORY = r'C:\Users\Tom\Desktop\TheServer\scriptfiles\Objects'
output_file = open('output.txt', 'w')
output = {}
file_list = []
for (dirpath, dirnames, filenames) in os.walk(BASE_DIRECTORY):
    for f in filenames:
        if 'txt' in str(f):
            e = os.path.join(str(dirpath), str(f))
            file_list.append(e)
for f in file_list:
    print f
    txtfile = open(f, 'r')
    output[f] = []
    for line in txtfile:
        if 'ObjectID =' in line:
            output[f].append(line)
        elif 'X =' in line:
            output[f].append(line)
        elif 'Y =' in line:
            output[f].append(line)
tabs = []
for tab in output:
    tabs.append(tab)
tabs.sort()
for tab in tabs:
    for row in output[tab]:
        output_file.write(row + '')
Now, everything is working fine, the output file looks like this:
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
etc.
But I don't want it to be like that (each value on a new line). I need it to do this for every file:
Read the file.
Remove the tags in front of the values.
Format a single line which will have those values. (Let's say I want to make it look like this: "(1216,-1480.500610,2522.842285)")
Write that line to the output file.
Repeat for every file.
Any help please?
Hope this helps.
>>> data = open('sam.txt', 'r').read()
>>> print data
ObjectID = 1216
X = -1480.500610
Y = 2610.885742
ObjectID = 970
X = -1517.210693
Y = 2522.842285
ObjectID = 3802
X = -1512.156616
Y = 2521.116210
>>>
Now let's do some string replacements :)
>>> data = data.replace('ObjectID = ', '').replace('\nX = ', ',').replace('\nY = ', ',')
>>> print data
1216,-1480.500610,2610.885742
970,-1517.210693,2522.842285
3802,-1512.156616,2521.116210
In your loop, keep track of whether you are 'in' a record:
records = []
in_record = False
id, x, y = 0, 0, 0
for line in txtfile:
    if not in_record:
        if 'ObjectID =' in line:
            in_record = True
            id = line[10:]
    elif 'X =' in line:
        x = line[3:]
    elif 'Y =' in line:
        y = line[3:]
        records.append((id, x, y))
        in_record = False
Then you'll have a list of tuples which you can easily write with the csv module.
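A sketch of that csv step (the output name is hypothetical; records is the list built by the loop above):
import csv

# Python 2: open in binary mode for the csv module.
with open('output.csv', 'wb') as out:
    writer = csv.writer(out)
    writer.writerows(records)  # each (id, x, y) tuple becomes one row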
Here is a version of the loop you have for generating the contents.
I rewrote it so the ObjectID, X and Y contents end up on the same line.
It looks like that is what you want to do:
for f in file_list:
    print f
    txtfile = open(f, 'r')
    output[f] = []
    for line in txtfile:
        myline = ''
        if 'ObjectID =' in line:
            pos = line.rfind("ObjectID =") + len("ObjectID =")
            rest = line[pos:]
            # Here you set the delimiter after the ObjectID value. Can be ","
            numbers = rest.split(" ")
            if len(numbers) > 0:
                myline += numbers[0]  # strings have no append(); use += instead
        elif 'X =' in line:
            pos = line.rfind("X =") + len("X =")
            rest = line[pos:]
            # Here you set the delimiter after the X value. Can be ","
            numbers = rest.split(" ")
            if len(numbers) > 0:
                myline += numbers[0]
        elif 'Y =' in line:
            pos = line.rfind("Y =") + len("Y =")
            rest = line[pos:]
            # Here you set the delimiter after the Y value. Can be ","
            numbers = rest.split(" ")
            if len(numbers) > 0:
                myline += numbers[0]
        output[f].append(myline)
Note that you need to know which character (the delimiter in the code) separates the tag names you are looking for (e.g. ObjectID =) from the actual values you want to grab from the line.
Here is what you need. I did not have enough time to write the code for appending the result to a new file. Instead it just prints it, but you get the point.
import os.path

path = "path"
# getting the number of files in your folder
num_files = len([f for f in os.listdir(path)
                 if os.path.isfile(os.path.join(path, f))])

# function that returns your desired output for a given file
def file_head_ext(file_path, file_num):
    with open(file_path + "/" + file_num) as myfile:
        head = [next(myfile).split("=") for x in range(3)]
    formatted_head = [elm[1].replace("\n", '').replace(" ", "") for elm in head]
    return ",".join(formatted_head)

for filnum in range(1, num_files + 1):  # + 1 so the last file is included
    print(file_head_ext(path, str(filnum)))
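A sketch of the missing write-out step the answer mentions (the output name is hypothetical):
# Write each file's formatted head to one output file instead of printing.
with open('output.txt', 'w') as out:
    for filnum in range(1, num_files + 1):
        out.write(file_head_ext(path, str(filnum)) + '\n')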

Search text file for multiple strings and print out results to a new text file

I'm fairly new to Python programming and I'm trying to learn file I/O as best I can.
I am currently in the process of making a simple program to read from a text document and print out the result. So far I've been able to create this program with the help of many resources and questions on this website.
However, I'm curious how I can read a text document for multiple individual strings and save the resulting strings to a text document.
The program below is one I've made that allows me to search a text document for a keyword and print the results between those keywords into another text file. However, I can only do one set of starting and ending keywords per search:
from Tkinter import *
import tkSimpleDialog
import tkMessageBox
from tkFileDialog import askopenfilename
import re  # needed for re.search below

root = Tk()
w = Label(root, text="Configuration Inspector")
w.pack()
tkMessageBox.showinfo("Welcome", "This is version 1.00 of Configuration Inspector Text")
filename = askopenfilename() # Data Search Text File
outputfilename = askopenfilename() # Output Text File
with open(filename, "rb") as f_input:
    start_token = tkSimpleDialog.askstring("Serial Number", "What is the device serial number?")
    end_token = tkSimpleDialog.askstring("End Keyword", "What is the end keyword")
    reText = re.search("%s(.*?)%s" % (re.escape(start_token + ",SHOWALL"), re.escape(end_token)), f_input.read(), re.S)
    if reText:
        output = reText.group(1)
        fo = open(outputfilename, "wb")
        fo.write(output)
        fo.close()
        print output
    else:
        tkMessageBox.showinfo("Output", "Sorry that input was not found in the file")
        print "not found"
So what this program does is allow a user to select a text document, search that document for a beginning keyword and an end keyword, then print everything between those two keywords into a new text document.
What I am trying to achieve is to allow a user to select a text document, search that document for multiple sets of keywords, and print the results to the same output text file.
In other words, let's say I have the following text document:
something something something something
something something something something STARTkeyword1 something
data1
data2
data3
data4
data5
ENDkeyword1
something something something something
something something something something STARTkeyword2 something
data1
data2
data3
data4
data5
Data6
ENDkeyword2
something something something something
something something something something STARTkeyword3 something
data1
data2
data3
data4
data5
data6
data7
data8
ENDkeyword3
I want to be able to search this text document with 3 different starting keywords and 3 different ending keywords, then print what's in between to the same output text file.
So for example my output text document would look something like:
something
data1
data2
data3
data4
data5
ENDkeyword1
something
data1
data2
data3
data4
data5
Data6
ENDkeyword2
something
data1
data2
data3
data4
data5
data6
data7
data8
ENDkeyword3
One brute-force method I've tried is a loop that makes the user input a new keyword one at a time; however, whenever I try to write to the same output file, it overwrites the previous entry even when using append. Is there any way to let a user search a text document for multiple strings and print out the multiple results, with or without a loop?
----------------- EDIT:
Many thanks to all of you; with your tips I'm getting closer to a nicely finalized version. This is my current code:
def process(infile, outfile, keywords):
    keys = [[k[0], k[1], 0] for k in keywords]
    endk = None
    with open(infile, "rb") as fdin:
        with open(outfile, "wb") as fdout:
            for line in fdin:
                if endk is not None:
                    fdout.write(line)
                    if line.find(endk) >= 0:
                        fdout.write("\n")
                        endk = None
                else:
                    for k in keys:
                        index = line.find(k[0])
                        if index >= 0:
                            fdout.write(line[index + len(k[0]):].lstrip())
                            endk = k[1]
                            k[2] += 1
    if endk is not None:
        raise Exception(endk + " not found before end of file")
    return keys
from Tkinter import *
import tkSimpleDialog
import tkMessageBox
from tkFileDialog import askopenfilename

root = Tk()
w = Label(root, text="Configuration Inspector")
w.pack()
tkMessageBox.showinfo("Welcome", "This is version 1.00 of Configuration Inspector ")
infile = askopenfilename()
outfile = askopenfilename()
start_token = tkSimpleDialog.askstring("Serial Number", "What is the device serial number?")
end_token = tkSimpleDialog.askstring("End Keyword", "What is the end keyword")
process(infile, outfile, ((start_token + ",SHOWALL", end_token),))
So far it works. However, now it's time for the part I'm getting lost on: a multiple-string input separated by a delimiter. So if I had inputted
STARTKeyword1, STARTKeyword2, STARTKeyword3, STARTKeyword4
into the program prompt, I want to be able to separate those keywords and place them into the
process(infile, outfile, keywords)
function, so that the user is only prompted once while allowing multiple strings to search through the files. I was thinking of using a loop, or collecting the separated inputs into a list (a sketch follows below).
If this question drifts far from the original, I will close this one and open another, so I can give credit where credit is due.
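A minimal sketch of that splitting step, assuming the start keywords come in one comma-separated prompt and the end keywords in another, in matching order (the ",SHOWALL" suffix is kept from the code above):
starts = [s.strip() for s in start_token.split(',')]
ends = [e.strip() for e in end_token.split(',')]
# Pair STARTKeyword1 with ENDKeyword1, and so on, then hand them to process().
keywords = [(s + ",SHOWALL", e) for s, e in zip(starts, ends)]
process(infile, outfile, keywords)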
I would use a separate function that takes:
the path of the input file
the path of the output file
an iterable containing the (startkeyword, endkeyword) pairs
Then I would process the file line by line, copying lines found between a start and an end and counting how many times each pair has been found. That way the caller knows which pairs were found and how many times each.
Here is a possible implementation:
def process(infile, outfile, keywords):
    '''Search through infile for whatever is between a pair startkeyword (excluded)
    and endkeyword (included). Each chunk is copied to outfile and followed by
    an empty line.
    infile and outfile are strings representing file paths.
    keywords is an iterable containing pairs (startkeyword, endkeyword).
    Raises an exception if an endkeyword is not found before end of file.
    Returns a list of lists [startkeyword, endkeyword, nb of occurrences].'''
    keys = [[k[0], k[1], 0] for k in keywords]
    endk = None
    with open(infile, "r") as fdin:
        with open(outfile, "w") as fdout:
            for line in fdin:
                if endk is not None:
                    fdout.write(line)
                    if line.find(endk) >= 0:
                        fdout.write("\n")
                        endk = None
                else:
                    for k in keys:
                        index = line.find(k[0])
                        if index >= 0:
                            fdout.write(line[index + len(k[0]):].lstrip())
                            endk = k[1]
                            k[2] += 1
    if endk is not None:
        raise Exception(endk + " not found before end of file")
    return keys
When I try to write to the same output file, it will overwrite the previous entry.
Have you tried using append rather than write?
f = open('filename', 'a')
This could be the beginning of something
import os
directory = os.getcwd()
path = directory + "/" + "textdoc.txt"
newFileName = directory + '/' + "data" + '.txt'
newFob = open(newFileName, 'w')
keywords = [["STARTkeyword1","ENDkeyword1"],["STARTkeyword2","ENDkeyword2"]]
fob = open(path, 'r')
objectLines = fob.readlines()
startfound=False
for keyword in keywords:
for line in objectLines:
if startfound and keyword[1] in line:
startfound=False
if startfound:
newFob.write(line)
if keyword[0] in line:
startfound=True
newFob.close()
If a text file called textdoc.txt with the data you have provided is in the current directory, the script will create a file called data.txt there with the following output:
data1
data2
data3
data4
data5
data1
data2
data3
data4
data5
Data6
Ideally you would input more keywords, or allow the user to input some, and fill that up in the keywords list.
For each start/end pair, create an instance of a class, say Pair, with a feed(line) method, its start and end keys, and a buffer or list to store the data of interest.
Put all the Pair instances in a list and feed each instance every line. If an instance is not between its start and end keywords it stays inactive and forgets the data; otherwise feed() adds the line to its own buffer or line list, or even directly to your output file.
Write your file at the end.
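A rough sketch of that Pair idea (the class and file names are illustrative, not from the question; keywords are taken from the sample above):
class Pair(object):
    """Collects the lines found between its start and end keywords."""
    def __init__(self, start, end):
        self.start, self.end = start, end
        self.active = False   # are we currently between start and end?
        self.lines = []
    def feed(self, line):
        if self.active:
            self.lines.append(line)
            if self.end in line:
                self.active = False
        elif self.start in line:
            self.active = True

pairs = [Pair('STARTkeyword1', 'ENDkeyword1'),
         Pair('STARTkeyword2', 'ENDkeyword2')]
with open('input.txt') as fh:
    for line in fh:
        for pair in pairs:
            pair.feed(line)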
I am not 100% sure I understand the problem, but if I understand correctly, you can just keep a list of lists, one inner list per start/end keyword couple. For each word in the document, check whether it equals the first element of one of the lists (the start keyword). If it does, pop the keyword from that list (making the end keyword its first element) and start saving all subsequent words into a string (a different string for each start/end couple). Once you hit the end keyword, pop it from the list as well and remove the inner list that used to contain the start/end keywords from the surrounding list. At the end you should have 3 strings containing all the words between the different start/end keywords; now just print all 3 of them to the file.
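A rough sketch of that idea (names hypothetical; keywords from the question's sample):
pairs = [['STARTkeyword1', 'ENDkeyword1'],
         ['STARTkeyword2', 'ENDkeyword2'],
         ['STARTkeyword3', 'ENDkeyword3']]
captured = []      # one string per couple, in the order the couples close
current = None     # index of the couple we are currently inside
buf = []
for word in open('input.txt').read().split():
    if current is None:
        for i, pair in enumerate(pairs):
            if word == pair[0]:
                pair.pop(0)        # start found; end keyword is now pair[0]
                current, buf = i, []
                break
    elif word == pairs[current][0]:
        captured.append(' '.join(buf))
        pairs.pop(current)         # this couple is finished
        current = None
    else:
        buf.append(word)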
EDIT:
If the main problem is that you're not able to append to the file, but you're actually rewriting the file every time, try opening the file this way instead of the way you're doing it today:
fo = open(outputfilename, "ab")
From the python docs:
"The first argument is a string containing the filename. The second argument is another string containing a few characters describing the way in which the file will be used. mode can be 'r' when the file will only be read, 'w' for only writing (an existing file with the same name will be erased), and 'a' opens the file for appending; any data written to the file is automatically added to the end. 'r+' opens the file for both reading and writing. The mode argument is optional; 'r' will be assumed if it’s omitted."
I tested the code using the following example text file.
something something something something
something something something something
something something something something
something something something something
something something something something
something something something something STARTkeyword3 something
data3
data3
data3
data3
data3
ENDkeyword3
something something something something
something something something something
something something something something
something something something something STARTkeyword1 something
data1
data1
data1
ENDkeyword1
something something something something
something something something something
something something something something
something something something something
something something something something STARTkeyword2 something
data2
data2
data2
Data2
ENDkeyword2
something something something something
something something something something
something something something something
something something something something
something something something something
something something something something STARTkeyword3 something
data3
data3
data3
data3
data3
ENDkeyword3
Visit https://www.youtube.com/watch?v=D6LYa6X2Otg&feature=youtu.be
Here is the code:
## ---------------------------------------------------------------------------
## Created by: James P. Lopez
## Created date: 9/21/2020
## Modified date: 9/29/2020, 10/20/2020
## https://stackoverflow.com/questions/32097118/search-text-file-for-multiple-strings-and-print-out-results-to-a-new-text-file
## Search text file for multiple strings and print out results to a new text file
## ---------------------------------------------------------------------------
import time, datetime, string
############################################################################################
############ Adding date to end of output file name
start_c = datetime.datetime.now(); ##print start_c
c1 = (("%(start_c)s" % vars()).partition(' ')[0]); ##print c1
new_str = string.replace(c1, '-', '_'); new_str = "_" + new_str; ##print(new_str)
#### Path
path = "S:\\PythonProjects\\TextFileReadWrite"
Bslash = "\\"
#### Text files
#### Read file
##filename1 = "openfilename1" ## This is the read file
#### Below sample with exceptions
filename1 = "openfilenameexception1" ## This is the read file
#### Write file
outputfilename1 = "writefilename2" ## This is the write file
#### Text file extension
extT = ".txt"
#### Full Text file name
filename = ("%(path)s%(Bslash)s%(filename1)s%(extT)s" % vars()); print filename
outputfilename = ("%(path)s%(Bslash)s%(outputfilename1)s%(new_str)s%(extT)s" % vars())
print outputfilename
#### Sum rows in text file
with open(filename) as filename1:
    SumReadFile = sum(1 for _ in filename1)
filename1.close()
##print SumReadFile
############################################################################################
############ Create text file if it does not exist
############ or truncate if it does exist
outputfilename2 = open('%(outputfilename)s' % vars(), 'w')
outputfilename2.close()
#### Counters
foundstart = 0
CountAllStartKeys = 0
CountAllEndKeys = 0
CountAllBetweenData = 0
CountAllRecordsProcessed = 0
#### Set of keys
s1 = 'STARTkeyword1'; s2 = 'STARTkeyword2'; s3 = 'STARTkeyword3'
e1 = 'ENDkeyword1'; e2 = 'ENDkeyword2'; e3 = 'ENDkeyword3'
#### Opening output file append
outputFiles = open('%(outputfilename)s' % vars(), 'a')
SetOfKeys = [(s1, e1), (s2, e2), (s3, e3)]
for keys in SetOfKeys:
    print(keys)
    search1 = keys[0]  ## This is the start key
    search2 = keys[1]  ## This is the end key
    with open("%(filename)s" % vars(), "r") as readFile1:
        for line in readFile1:
            print line
            CountAllRecordsProcessed += 1
            if foundstart == 0:
                if search1 in line:
                    #### It will write the start key
                    print ("Yes found start key %(search1)s within line = %(line)s" % vars())
                    outputFiles.write(line)
                    foundstart += 1; CountAllStartKeys += 1
                    ## time.sleep(2)
                    continue
            if foundstart >= 1:
                if search2 in line:
                    #### It will write the end key
                    print ("Yes found end key %(search2)s within line = %(line)s\n" % vars())
                    outputFiles.write(line)
                    foundstart = 0; CountAllEndKeys += 1
                    ## time.sleep(2)
                elif search1 in line:
                    #### It will append to output text file and write no end key found
                    print ("\nATTENTION! No matching end key within line = %(line)s\n" % vars())
                    print ("\nHowever, found start key %(search1)s within line = %(line)s\n" % vars())
                    outputFiles.write("\nNo matching end key within line = %(line)s\n" % vars())
                    outputFiles.write("\nHowever, found start key %(search1)s within line = %(line)s\n" % vars())
                    outputFiles.write(line)
                    CountAllStartKeys += 1
                    ## time.sleep(5)
                    continue
                else:
                    #### It will write the rows between start and end key
                    print ("Yes, found between data = %(line)s" % vars())
                    outputFiles.write(line)
                    CountAllBetweenData += 1
                    ## time.sleep(2)
    readFile1.close()
outputFiles.close()
print "\nFinished Text File Read and Write"
print "\nTotal Number of Start Key Words Processed = " + str(CountAllStartKeys)
print "Total Number of End Key Words Processed = " + str(CountAllEndKeys)
print "\nTotal Number of Between Data Processed = " + str(CountAllBetweenData)
print "\nTotal Sum of Lines in Read File = " + str(SumReadFile)
NumberOfSetOfKeys = CountAllRecordsProcessed / SumReadFile; ##print NumberOfSetOfKeys
print "Total Number of Set of Keys = " + str(NumberOfSetOfKeys)
print "Total Number of Records Processed = " + str(CountAllRecordsProcessed)
print ("\n%(SumReadFile)s multiplied by %(NumberOfSetOfKeys)s = %(CountAllRecordsProcessed)s" % vars())
This program processes the read file only once, unlike my previous post, which processed the start and end keys one pair at a time for a total of three full passes. It has been tested with the sample data in my previous post.
Visit: https://youtu.be/PJjBftGhSNc
## ---------------------------------------------------------------------------
## Created by: James P. Lopez
## Created date: 9/21/2020
## Modified date: 10/1/2020, 10/20/2020
## https://stackoverflow.com/questions/32097118/search-text-file-for-multiple-strings-and-print-out-results-to-a-new-text-file
## Search text file for multiple strings and print out results to a new text file
## ---------------------------------------------------------------------------
import time, datetime, string
############################################################################################
############ Adding date to end of file name
start_c = datetime.datetime.now(); ##print start_c
c1 = (("%(start_c)s" % vars()).partition(' ')[0]); ##print c1
new_str = string.replace(c1, '-', '_'); new_str = "_" + new_str; ##print(new_str)
#### Path
path = "S:\\PythonProjects\\TextFileReadWrite"
Bslash = "\\"
#### Text files
#### Read file
##filename1 = "openfilename1" ## This is the read file
#### Below sample with exceptions
filename1 = "openfilenameexception1" ## This is the read file
#### Write file
outputfilename1 = "writefilename2" ## This is the write file
#### Full Text file name
filename = ("%(path)s%(Bslash)s%(filename1)s.txt" % vars()); print filename
outputfilename = ("%(path)s%(Bslash)s%(outputfilename1)s%(new_str)s.txt" % vars())
print outputfilename
#### Counters
foundstart = 0
CountAllStartKeys = 0
CountAllEndKeys = 0
CountAllBetweenData = 0
CountAllRecordsProcessed = 0
#### Start Key Not Found
SKNF = 0
#### Sum number of rows in text file
with open(filename) as filename1:
    SumReadFile = sum(1 for _ in filename1)
filename1.close()
print SumReadFile
#### Total set of keys
start1 = 'STARTkeyword1'; start2 = 'STARTkeyword2'; start3 = 'STARTkeyword3'
end1 = 'ENDkeyword1'; end2 = 'ENDkeyword2'; end3 = 'ENDkeyword3'
#### Count number of unique start and end keys
SK1 = 0; SK2 = 0; SK3 = 0; EK1 = 0; EK2 = 0; EK3 = 0
Keys = [start1, start2, start3, end1, end2, end3]
##print Keys
with open(filename) as filename1:
    for line in filename1:
        ## print line
        if any(word in line for word in Keys):
            if start1 in line:
                SK1 += 1
            elif start2 in line:
                SK2 += 1
            elif start3 in line:
                SK3 += 1
            elif end1 in line:
                EK1 += 1
            elif end2 in line:
                EK2 += 1
            elif end3 in line:
                EK3 += 1
filename1.close()
############################################################################################
############ Create if it does not exist or truncate if it does exist
outputfilename2 = open('%(outputfilename)s' % vars(), 'w')
outputfilename2.close()
### Opening output file to append data
outputFiles = open('%(outputfilename)s' % vars(), 'a')
#### We are only checking for the start keys
StartKeys = [start1, start2, start3]
#### Opening the read text file and processing it line by line
with open("%(filename)s" % vars(), "r") as readFile1:
    for line in readFile1:
        CountAllRecordsProcessed += 1
        #### We are checking if one of the StartKeys is in line
        if any(word in line for word in StartKeys):
            #### Setting the variables (s1 and e1) for the start and end keys
            if start1 in line:
                s1 = start1; e1 = end1; SKNF = 1; ##print ('%(s1)s , %(e1)s' % vars())
            elif start2 in line:
                s1 = start2; e1 = end2; SKNF = 1; ##print ('%(s1)s , %(e1)s' % vars())
            elif start3 in line:
                s1 = start3; e1 = end3; SKNF = 1; ##print ('%(s1)s , %(e1)s' % vars())
            ## time.sleep(2)
        if foundstart == 0 and SKNF <> 0:
            if s1 in line:
                #### It will append to output text file and write the start key
                print ("Yes found start key %(s1)s within line = %(line)s" % vars())
                outputFiles.write(line)
                foundstart += 1; CountAllStartKeys += 1
                ## time.sleep(2)
                continue
        if foundstart >= 1 and SKNF <> 0:
            if e1 in line:
                #### It will append to output text file and write the end key
                print ("Yes found end key %(e1)s within line = %(line)s" % vars())
                outputFiles.write(line)
                foundstart = 0; SKNF = 0; CountAllEndKeys += 1
                ## time.sleep(2)
            elif s1 in line:
                #### It will append to output text file and write no end key found
                print ("\nATTENTION! No matching end key within line = %(line)s\n" % vars())
                print ("\nHowever, found start key %(s1)s within line = %(line)s\n" % vars())
                outputFiles.write("\nNo matching end key within line = %(line)s\n" % vars())
                outputFiles.write("\nHowever, found start key %(s1)s within line = %(line)s\n" % vars())
                outputFiles.write(line)
                CountAllStartKeys += 1
                ## time.sleep(2)
                continue
            else:
                #### It will append to output text file and write the rows between start and end key
                print ("Yes found between data = %(line)s" % vars())
                outputFiles.write(line)
                CountAllBetweenData += 1
                ## time.sleep(2)
#### Closing read and write text files
readFile1.close()
outputFiles.close()
print "\nFinished Text File Read and Write"
print '\nTotal Number of Unique Start Keys'
print ("%(start1)s = " % vars()) + str(SK1)
print ("%(start2)s = " % vars()) + str(SK2)
print ("%(start3)s = " % vars()) + str(SK3)
print "Total Number of Start Key Words Processed = " + str(CountAllStartKeys)
print '\nTotal Number of Unique End Keys'
print ("%(end1)s = " % vars()) + str(EK1)
print ("%(end2)s = " % vars()) + str(EK2)
print ("%(end3)s = " % vars()) + str(EK3)
print "Total Number of End Key Words Processed = " + str(CountAllEndKeys)
print "\nTotal Number of Between Data Processed = " + str(CountAllBetweenData)
print "\nTotal Sum of Lines in Read File = " + str(SumReadFile)
print "Total Number of Records Processed = " + str(CountAllRecordsProcessed)

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a .txt file that contains a sequence of records, check for the records that I want, and copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
file = open("file.txt", 'r')
newFile = open("newFile.txt", "w")
content = file.read()
file.close()
# here I need to check whether a 03000 record with the characters 'TO' exists;
# if it does, copy the record set 00000-99999 to the new file.
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt", 'r') as inFile, open("newFile.txt", "w") as outFile:
    outFile.writelines(line for line in inFile
                       if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
    after = current = next(lines)
    for after in lines:
        yield prev, current, after
        prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
                   if triad[1].startswith("03000") and "TO" in triad[1])
import re

pat = ('^00000\|\d+\|\d+.*\n'
       '^03000\|TO\|\d+.*\n'
       '^99999\|\d+\|\d+.*\n'
       '|'
       '^AAAAA\|\d+\|\d+.*\n'
       '|'
       '^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat, re.MULTILINE)

with open('fifi.txt', 'r') as f,\
     open('newfifi.txt', 'w') as g:
    g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re

pat = ('(^00000\|\d+\|\d+.*\n'
       '(?:.*\n)+?'
       '^99999\|\d+\|\d+.*\n)'
       '|'
       '(^AAAAA\|\d+\|\d+.*\n'
       '|'
       '^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat, re.MULTILINE)

pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit, re.DOTALL | re.MULTILINE)

def yi(text):
    for g1, g2 in rag.findall(text):
        if g2:
            yield g2
        elif rig.match(g1):
            yield g1

with open('fifi.txt', 'r') as f,\
     open('newfifi.txt', 'w') as g:
    g.write(''.join(yi(f.read())))
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
file.close()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path, "r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path, "w")
# ---------------------------------------------------------------
# process file
temp = ''
temp_out = ''
good_write = False
bad_write = False
for line in file:
    if line[:5] == 'AAAAA':
        temp_out += line
    elif line[:5] == 'ZZZZZ':
        temp_out += line
    elif good_write:
        temp += line
        temp_out += temp
        temp = ''
        good_write = False
    elif bad_write:
        bad_write = False
        temp = ''
    elif line[:5] == '03000':
        if line[6:8] != 'TO':
            temp = ''
            bad_write = True
        else:
            good_write = True
            temp += line
            temp_out += temp
            temp = ''
    else:
        temp += line
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be Python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Whenever I have to parse text files I prefer to use regular expressions.
# You can also customize the matching criteria if you want to.
import re

what_is_being_searched = re.compile("^03000.*TO")

# don't use "file" as a variable name since it is (was?) a builtin function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
    for this_line in source_file:
        if what_is_being_searched.match(this_line):
            destination_file.write(this_line)
and for those who prefer a more compact representation:
import re

with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
    destination_file.writelines(this_line for this_line in source_file
                                if re.match("^03000.*TO", this_line))
code:
fileName = '1'
fil = open(fileName, 'r')
import string

## step 1: parse the file.
parsedFile = []
for i in fil:
    ## tuple1 = (1,2,3)
    firstPipe = i.find('|')
    secondPipe = i.find('|', firstPipe + 1)
    tuple1 = (i[:firstPipe],
              i[firstPipe + 1:secondPipe],
              i[secondPipe + 1:i.find('\n')])
    parsedFile.append(tuple1)
fil.close()

## search criteria:
searchFirst = '03000'
searchString = 'TO' ## can be changed if and when required

## step 2: use the parsed contents to write the new file
filout = open('newFile', 'w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ## write the first entry
for i in range(1, len(parsedFile)):
    if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
        for j in range(-1, 2, 1):
            stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
            filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ## write the last entry
filout.close()
I know this solution may be a bit long, but it is quite easy to understand and seems an intuitive way to do it, and I have already checked it with the data you provided; it works perfectly.
Please tell me if you need more explanation of the code and I will add it.
The tips (from Beasley and Joran elyase) are very interesting, but they only get the contents of the 03000 line. I would like to get the contents from the 00000 line through the 99999 line.
I even managed to do it here, but I am not satisfied; I wanted to make it cleaner.
See how I did it:
file = open(url, 'r')
newFile = open("newFile.txt", 'w')
lines = file.readlines()
file.close()
i = 0
lineTemp = []
for line in lines:
    lineTemp.append(line)
    if line[0:5] == '03000':
        state = line[21:23]
    if line[0:5] == '99999':
        if state == 'TO':
            newFile.writelines(lineTemp)
        lineTemp = []  # reset the buffer for the next register ("linhaTemp" in the original was a typo)
        i = i + 1
newFile.close()
Suggestions...
Thanks to all!
