Python extract text files - python

I have several test files kept in one directory
I want to go to each file and search some text "Text 1" and "Text 2" and print everything in front of this text in output file....
This I have done using python script.....
But next thing is I want only the first instance of "Text 1" and "Text 2" in each file. If I add break in the current script I am not able to print in out file..
Please guide me.. I am a python beginner...
import os
path = "D:\test"
in_files = os.listdir(path)
desc = open("desc.txt", "w")
print >> desc, "Mol_ID, Text1, Text2"
moldesc = ['Text1', 'Text2']
for f in in_files:
file = os.path.join(path, f)
text = open(file, "r")
hit_count = 0
hit_count1 = 0
for line in text:
if moldesc[0] in line:
Text1 = line.split()[-1]
if moldesc[1] in line:
Text2 = line.split()[-1]
print >> desc, f + "," + Text1 + "," + Text2
text.close()
print "Text extraction done !!!"

There are a couple of issues with your code:
Your text.close() should be at the same level as the for line in text loop.
The print >> desc statement is out of place: you should print only if both Text1 and Text2 are defined. You could set them as None just outside the for line in text loop, and test if they are both not None. (Alternatively, you could set hit_count0=1 in the if moldesc[0] test, hit_count1=1 in the if moldesc[1] and test for hit_count0 and hit_count1). In that case, print the output and use a break to escape the loop.
(so, in plain code:)
# Answer snippet (Python 2 `print >>` syntax); relies on `in_files`, `path`,
# `moldesc` and `desc` defined earlier in the answer's context.
for f in in_files:
    file = os.path.join(path, f)
    with open(file, "r") as text:
        hit_count = 0   # set once Text1 has been captured
        hit_count1 = 0  # set once Text2 has been captured
        for line in text:
            if moldesc[0] in line:
                Text1 = line.split()[-1]  # last whitespace-separated token
                hit_count = 1
            if moldesc[1] in line:
                Text2 = line.split()[-1]
                hit_count1 = 1
            if hit_count and hit_count1:
                # Both values seen: write one CSV row, then stop scanning
                # this file so only the first instance is reported.
                print >> desc, f + "," + Text1 + "," + Text2
                break
There's a third issue:
You mention wanting the text before Text1 ? Then you may want to use Text1 = line[:line.index(moldesc[0])] instead of your Text1 = line.split()[-1]...

I would go for an mmap-based approach, possibly using the csv module for the results file — something like the following (untested and rough around the edges: it needs better error handling, you may want to use mm.find() instead of a regex, and some of the code is copied verbatim from the OP... also, my computer's battery is about to die...)
import os
import csv
import re     # was missing: re.findall is used below
import mmap
from collections import defaultdict

PATH = r"D:\test"  # note 'r' prefix to escape '\t' interpretation
in_files = os.listdir(PATH)          # was os.listdir(path): NameError
fout = open('desc.txt', 'w')
csvout = csv.writer(fout)
csvout.writerow(['Mol_ID', 'Text1', 'Text2'])
dd = defaultdict(list)
for filename in in_files:
    fin = open(os.path.join(PATH, filename))  # was `f`: undefined name
    mm = mmap.mmap(fin.fileno(), 0, access=mmap.ACCESS_READ)
    # Find stuff.  A bytes pattern is required: mmap objects expose bytes.
    # (maybe use finditer depending on exact needs)
    matches = re.findall(br'(.*?)(Text[12])', mm)
    for text, matched in matches:
        dd[matched].append(text)
    # do something with dd - write output using csvout.writerow()...
    mm.close()
    fin.close()
# csv.writer has no close(); close the underlying file object instead
# (was: csvout.close() -> AttributeError).
fout.close()

Related

Lines missing in python

I am writing a code in python where I am removing all the text after a specific word but in output lines are missing. I have a text file in unicode which have 3 lines:
my name is test1
my name is
my name is test 2
What I want is to remove text after word "test" so I could get the output as below
my name is test
my name is
my name is test
I have written a code but it does the task but also removes the second line "my name is"
My code is below
# Rewrite test.txt so every line is cut right after the keyword; lines
# without the keyword are kept unchanged.
txt = ""
splitStr = "test"  # hoisted: constant, no need to rebind per line
with open(r"test.txt", 'r') as fp:
    for line in fp.readlines():
        index = line.find(splitStr)
        # was `index > 0`: find() returns -1 for "not found", so that test
        # also discarded lines where the keyword starts at column 0 AND
        # silently dropped every line without the keyword.
        if index != -1:
            txt += line[:index + len(splitStr)] + "\n"
        else:
            txt += line  # keep lines that do not contain the keyword
with open(r"test.txt", "w") as fp:
    fp.write(txt)
It looks like if there is no keyword found the index become -1.
So you are avoiding the lines w/o keyword.
I would modify your if by adding the condition as follows:
# Keep lines without the keyword; cut the rest right after the keyword.
txt = ""
with open(r"test.txt", 'r') as fp:
    for line in fp.readlines():
        splitStr = "test"
        index = line.find(splitStr)
        # was `if index > 0 ... elif index < 0`: a line whose keyword sits
        # at column 0 gives index == 0 and fell through BOTH branches,
        # silently disappearing from the output.
        if index != -1:
            txt += line[:index + len(splitStr)] + "\n"
        else:
            txt += line  # no keyword: copy the line through unchanged
with open(r"test.txt", "w") as fp:
    fp.write(txt)
No need to add \n because the line already contains it.
Your code does not append the line if the splitStr is not defined.
# Cut each line of test.txt just after the keyword; pass the others through.
keyword = "test"
pieces = []
with open(r"test.txt", 'r') as src:
    for line in src.readlines():
        pos = line.find(keyword)
        if pos == -1:
            # No keyword on this line: keep it exactly as read.
            pieces.append(line)
        else:
            # Keep everything up to and including the keyword.
            pieces.append(line[:pos + len(keyword)] + "\n")
txt = "".join(pieces)
with open(r"test.txt", "w") as dst:
    dst.write(txt)
In my solution I simulate the input file via io.StringIO. Compared to your code, my solution removes the else branch and uses only one += operator. Also, splitStr is set only once rather than on each iteration. This makes the code clearer and reduces possible sources of error.
import io

# Simulated input file for this example.
the_file = io.StringIO(
    "my name is test1\n"
    "my name is\n"
    "my name is test 2"
)

splitStr = "test"
processed = []
with the_file as fp:
    for line in fp.readlines():
        pos = line.find(splitStr)
        # Only rewrite lines that actually contain the keyword; the rest
        # are appended untouched.
        if pos != -1:
            line = line[:pos + len(splitStr)] + "\n"
        processed.append(line)
txt = "".join(processed)
print(txt)
When handling with files in Python 3 it is recommended to use pathlib for that like this.
import pathlib

file_path = pathlib.Path("test.txt")

# Read from the file.
with file_path.open('r') as fp:
    ...  # do something (placeholder; the original comment-only body was a syntax error)

# Write back to the file.
with file_path.open('w') as fp:
    ...  # do something (placeholder)
Suggestion:
# Suggestion fragment; `fp` is the already-open input file.
# NOTE(review): this keeps line[:i], i.e. it drops the keyword itself,
# unlike the asker's desired output which keeps "test" -- confirm intent.
for line in fp.readlines():
    i = line.find('test')
    if i != -1:
        line = line[:i]

Replace a line with a pattern

I am trying to replace a line when a pattern (only one pattern I have in that file) found with the below code, but it replaced whole content of the file.
Could you please advise or any better way with pathlib ?
import datetime


def insert_timestamp():
    """Update the current date on the serial-number line of a DNS file.

    Reads the file named by the module-level ``lab_net_file``, replaces
    every line containing ``'; serial number'`` with a tab, YYYYMMDD01
    and the marker, and writes the file back.
    """
    pattern = '; serial number'
    current_day = datetime.datetime.today().strftime('%Y%m%d')
    subst = "\t" + str(current_day) + "01" + " ; " + pattern
    print(current_day)
    # The original opened the file with "w+", which TRUNCATES it before
    # anything can be read -- that is why the whole file was wiped.
    # Read everything first, then rewrite.
    with open(lab_net_file, "r") as file:
        lines = file.readlines()
    with open(lab_net_file, "w") as file:
        for line in lines:
            file.write(line if pattern not in line else line.replace(pattern, subst))


lab_net_file = '/Users/kams/nameserver/10_15'
insert_timestamp()
What you would want to do is read the file, replace the pattern, and write to it again like this:
# Read the whole file first, then rewrite it.  `pattern`, `subst` and
# `lab_net_file` are assumed to be defined in the surrounding scope.
with open(lab_net_file, "r") as file:
    read = file.read()
# str.replace is a no-op when `pattern` is absent, so no if/else is needed.
read = read.replace(pattern, subst)
with open(lab_net_file, "w") as file:
    file.write(read)
The reason that you don't need to use if/else is because if there is no pattern inside read, then .replace won't do anything, and you don't need to worry about it. If pattern is inside read, then .replace will replace it throughout the entire string.
I am able to get the output I wanted with this block of code.
def insert_timestamp(self):
    """Update the current date in DNS files.

    Finds the existing " NNN ; serial number" line in ``lab_net_file``
    (module-level path) and replaces it with today's date plus the "01"
    revision suffix.  Does nothing if the pattern is absent.
    """
    pattern = re.compile(r'\s[0-9]*\s;\sserial number')
    current_day = datetime.datetime.today().strftime('%Y%m%d')
    subst = "\t" + str(current_day) + "01" + " ; " + 'serial number'
    with open(lab_net_file, "r") as file:
        reading_file = file.read()
    # was: pattern.search(...).group() -- AttributeError whenever the
    # serial-number line is missing.  Guard the no-match case instead.
    match = pattern.search(reading_file)
    if match is None:
        return
    reading_file = reading_file.replace(match.group(), subst)
    with open(lab_net_file, "w") as file:
        file.write(reading_file)
Thank you #Timmy

4 functions near identical, one works but the others dont no errors

I have 4 functions that should grab highlighted text from a specific docx file; scan4&lt;colour&gt; is the variable holding that text. These functions are near identical but search for and replace differently highlighted text. They print the same output, but they don't replace the text.
these are 2 functions of 4, the yellow one works while the green one doesnt
what the code does is it searches and replaces the text then encrypts then uses the encrypted string into the main document, the other does the exact same but searches for a different highlighted colour. I try to encrypt the first function works but the second one doesn't
This code searches for yellow encodes the text and then replaces it so the un-encrypted document shows the encrypted string of the contents
def securi1_key():
    """Load the security-level-1 Fernet key from disk into the global ``key1``."""
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi1.key", "rb")
    global key1
    key1 = file.read()
    file.close()


def rewrite_yellow():
    """Encrypt the yellow-highlighted text in document ``f1`` in place.

    Relies on globals set elsewhere: ``scan4yellow`` (bytes of the target
    text), ``f1`` (docx path), plus ``Fernet`` and ``docx`` imports.
    """
    securi1_key()
    save_yellow_text()
    # get key from file
    # NOTE(review): this handle is never read or closed -- the key is
    # already in `key1` via securi1_key(); this open() looks redundant.
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi1.key", "rb")
    texty = scan4yellow.decode("utf-8")
    encodedy = texty.encode()
    # encrypt message
    f = Fernet(key1)
    encryptedy = f.encrypt(encodedy)
    print(scan4yellow)
    print(scan4yellow.decode("utf-8"))
    document = docx.Document(f1)
    for paragraph in document.paragraphs:
        if scan4yellow.decode("utf-8") in paragraph.text:
            inline = paragraph.runs
            # loop over runs: the target text may live inside any one run
            for i in range(len(inline)):
                if scan4yellow.decode("utf-8") in inline[i].text:
                    text = inline[i].text.replace(scan4yellow.decode("utf-8"), encryptedy.decode("utf-8"))
                    inline[i].text = text
    document.save(f1)


def save_yellow_text():
    """Write the yellow text to a plaintext file, store an encrypted copy,
    then delete the plaintext file."""
    securi1_key()
    fp = f1
    p = Path(fp)
    filename1 = p.stem
    storedtexty = filename1 + " Yellow Text"
    storedtextencryptedy = storedtexty + ".encrypted"
    list_of_files = os.listdir("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/")
    if storedtexty in list_of_files:
        storedtexty = (storedtexty + "1")  # avoid clobbering an existing file
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtexty, "w+")
    file.write(scan4yellow.decode("utf-8"))
    # NOTE(review): `file` is not flushed/closed until the very end, yet the
    # same path is re-opened for reading below -- the read may see an empty
    # or partial file.  Confirm on the target platform.
    input_file1 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtexty)
    if storedtextencryptedy in list_of_files:
        storedtextencryptedy = (storedtextencryptedy + "1")
    output_file1 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextencryptedy)
    with open(input_file1, "rb") as f:
        data = f.read()
    fernet = Fernet(key1)
    encrypted = fernet.encrypt(data)
    with open(output_file1, "wb") as f:
        f.write(encrypted)
    file.close()
    os.remove(input_file1)  # drop the plaintext copy, keep only the encrypted one
this code should do the exact same but for the colour green :
def securi2_key():
    """Load the security-level-2 Fernet key from disk into the global ``key2``."""
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi2.key", "rb")
    global key2
    key2 = file.read()
    file.close()


def rewrite_green():
    """Encrypt the green-highlighted text in document ``f1`` in place.

    Mirror of ``rewrite_yellow`` using globals ``scan4green``, ``f1``,
    ``key2`` plus the ``Fernet`` and ``docx`` imports.
    """
    securi2_key()
    save_green_text()
    # get key from file
    # NOTE(review): this handle is never read or closed -- the key is
    # already in `key2` via securi2_key(); this open() looks redundant.
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Keys/Securi2.key", "rb")
    textg = scan4green.decode("utf-8")
    encodedg = textg.encode()
    print(encodedg)
    # encrypt message
    f = Fernet(key2)
    encryptedg = f.encrypt(encodedg)
    print(encryptedg)
    document = docx.Document(f1)
    for paragraph in document.paragraphs:
        if scan4green.decode("utf-8") in paragraph.text:
            inline = paragraph.runs
            # loop over runs: the target text may live inside any one run
            for i in range(len(inline)):
                if scan4green.decode("utf-8") in inline[i].text:
                    text = inline[i].text.replace(scan4green.decode("utf-8"), encryptedg.decode("utf-8"))
                    inline[i].text = text
    document.save(f1)


def save_green_text():
    """Write the green text to a plaintext file, store an encrypted copy,
    then delete the plaintext file (mirror of ``save_yellow_text``)."""
    securi2_key()
    fp = f1
    p = Path(fp)
    filename2 = p.stem
    storedtextg = filename2 + " Green Text"
    storedtextencryptedg = storedtextg + ".encrypted"
    list_of_files = os.listdir("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/")
    if storedtextg in list_of_files:
        storedtextg = (storedtextg + "1")  # avoid clobbering an existing file
    file = open("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextg, "w+")
    file.write(scan4green.decode("utf-8"))
    print(scan4green.decode("utf-8") + "tested1")
    # NOTE(review): as in the yellow variant, `file` stays unflushed while
    # the same path is re-read below -- the read may see an empty file.
    input_file2 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextg)
    if storedtextencryptedg in list_of_files:
        storedtextencryptedg = (storedtextencryptedg + "1")
    output_file2 = ("C:/Users/devff/PycharmProjects/SecurityLevels/Stored Text/" + storedtextencryptedg)
    with open(input_file2, "rb") as f:
        data = f.read()
    fernet = Fernet(key2)
    encrypted = fernet.encrypt(data)
    with open(output_file2, "wb") as f:
        f.write(encrypted)
    file.close()
    os.remove(input_file2)  # drop the plaintext copy, keep only the encrypted one
I should have some incoherent string replace the actual text and the actual text saved in another file encrypted, but all this does is work for the first yellow function but not the green function
Ideally it should take the text from the read in file, make a copy write it out to a file and encrypt that then take the string of encryption from that and replace it where it was in the read in file, but it only works for the yellow code while the green and the other code which are near identical do not work
I have found an answer, what I changed has nothing to do with the code I provided but when I read in the text it was scan4green = (word.find(tag_t).text.encode('utf-8').lower()). this apparently caused it not to work properly but the scan4yellow did work because i didn't have .lower() attached to the end.

Pick parts from a txt file and copy to another file with python

I'm in trouble here. I need to read a file. Txt file that contains a sequence of records, check the records that I want to copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|316 #begin register
03000|SP|467
99999|33|130 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Code:
# Question stub: slurp the whole input file, then (TODO) filter registers.
file = open("file.txt",'r')       # NOTE(review): `file` shadows a builtin name
newFile = open("newFile.txt","w")
content = file.read()
file.close()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!
# Stream the matching records straight from the input file to the output file.
with open("file.txt", 'r') as source, open("newFile.txt", "w") as sink:
    for record in source:
        if record.startswith("03000") and "TO" in record:
            sink.write(record)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
    """Yield (previous, current, following) triples over the iterator *lines*.

    The first triple carries ``prev`` (default ``None``) in the first slot;
    the final line is only ever yielded in the "following" position.
    """
    current = next(lines)  # empty input propagates an error, as before
    for after in lines:
        triple = (prev, current, after)
        yield triple
        # Slide the window forward by one line.
        prev = current
        current = after
And then do like before:
# Write each qualifying (prev, current, after) window as one chunk.
# NOTE(review): if the very first data line matches, triad[0] is None and
# ''.join(triad) raises TypeError -- confirm inputs never start with 03000.
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
                   if triad[1].startswith("03000") and "TO" in triad[1])
import re

# Raw strings throughout: without the r prefix, sequences such as \| and \d
# are invalid escape sequences (a SyntaxWarning on modern Python).
# The alternation matches either a full, strictly consecutive
# 00000 / 03000|TO / 99999 register, or the AAAAA header / ZZZZZ footer line.
pat = (r'^00000\|\d+\|\d+.*\n'
       r'^03000\|TO\|\d+.*\n'
       r'^99999\|\d+\|\d+.*\n'
       r'|'
       r'^AAAAA\|\d+\|\d+.*\n'
       r'|'
       r'^ZZZZZ\|\d+\|\d+.*')
rag = re.compile(pat, re.MULTILINE)
with open('fifi.txt', 'r') as f, \
     open('newfifi.txt', 'w') as g:
    g.write(''.join(rag.findall(f.read())))
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re

# Raw strings: without the r prefix, \| and \d are invalid escape sequences
# (SyntaxWarning on modern Python).
# Group 1 captures a whole 00000 ... 99999 register, tolerating extra lines
# in between; group 2 captures the AAAAA header or ZZZZZ footer line.
pat = (r'(^00000\|\d+\|\d+.*\n'
       r'(?:.*\n)+?'
       r'^99999\|\d+\|\d+.*\n)'
       r'|'
       r'(^AAAAA\|\d+\|\d+.*\n'
       r'|'
       r'^ZZZZZ\|\d+\|\d+.*)')
rag = re.compile(pat, re.MULTILINE)

# A captured register qualifies only if it contains a 03000|TO line.
pit = r'^00000\|.+?^03000\|TO\|\d+.+?^99999\|'
rig = re.compile(pit, re.DOTALL | re.MULTILINE)


def yi(text):
    """Yield header/footer lines plus every register containing 03000|TO."""
    for g1, g2 in rag.findall(text):
        if g2:
            yield g2          # header or footer: always kept
        elif rig.match(g1):
            yield g1          # register with a 03000|TO line


with open('fifi.txt', 'r') as f, \
     open('newfifi.txt', 'w') as g:
    g.write(''.join(yi(f.read())))
# Copy 03000|TO records only.  Context managers guarantee both handles are
# closed (the original never closed newFile, risking unflushed output, and
# shadowed the builtin name `file`).
with open("file.txt", 'r') as in_file:
    content = in_file.readlines()
with open("newFile.txt", "w") as new_file:
    new_file.writelines(filter(lambda x: x.startswith("03000") and "TO" in x, content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys
# ---------------------------------------------------------------
# Buffer one register at a time in `temp`; flush it into `temp_out`
# only once the register proves to contain a 03000|TO line.
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
file = open(file_path,"r")
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
output_file = open(output_file_path,"w")
# create output files
# ---------------------------------------------------------------
# process file
temp = ''            # lines of the register currently being scanned
temp_out = ''        # accumulated output text
good_write = False   # register accepted: also copy its closing 99999 line
bad_write = False    # register rejected: drop its closing line and buffer
for line in file:
    if line[:5] == 'AAAAA':
        temp_out += line          # file header: always kept
    elif line[:5] == 'ZZZZZ':
        temp_out += line          # file footer: always kept
    elif good_write:
        # Line immediately after an accepted 03000|TO line (the closer).
        temp += line
        temp_out += temp
        temp = ''
        good_write = False
    elif bad_write:
        # Line immediately after a rejected 03000 line: discard everything.
        bad_write = False
        temp = ''
    elif line[:5] == '03000':
        if line[6:8] != 'TO':
            temp = ''
            bad_write = True
        else:
            # Accepted: flush the buffered opener plus this line now,
            # and let the next iteration copy the closing line.
            good_write = True
            temp += line
            temp_out += temp
            temp = ''
    else:
        temp += line              # opening 00000 line or other body lines
output_file.write(temp_out)
output_file.close()
file.close()
Output:
AAAAA|12|120 #begin file
00000|46|150 #begin register
03000|TO|460
99999|35|436 #end register
00000|46|778 #begin register
03000|TO|478
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be python? These shell commands would do the same thing in a pinch.
# Header line, then matching records with one line of context either side,
# then the footer line.
# NOTE(review): grep -C inserts "--" separator lines between non-adjacent
# match groups -- confirm that is acceptable in the output file.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Regular expressions make the matching criteria easy to customize later.
import re

# Records of interest start with "03000" and mention "TO" later on the line.
target = re.compile("^03000.*TO")

# (Avoid `file` as a variable name: it shadows a builtin in Python 2.)
with open("file.txt", "r") as src, open("newFile.txt", "w") as dst:
    dst.writelines(row for row in src if target.match(row))
and for those who prefer a more compact representation:
import re

# Same filter, written as an explicit loop.
with open("file.txt", "r") as src, open("newFile.txt", "w") as dst:
    for row in src:
        if re.match("^03000.*TO", row):
            dst.write(row)
code:
fileName = '1'
fil = open(fileName,'r')
import string   # NOTE(review): unused import, and mid-file imports are unconventional
##step 1: parse the file into (field1, field2, field3) tuples.
parsedFile = []
for i in fil:
    ##tuple1 = (1,2,3)
    firstPipe = i.find('|')
    secondPipe = i.find('|',firstPipe+1)
    # NOTE(review): i.find('\n') is -1 on a final line without a trailing
    # newline, which would silently drop that line's last character.
    tuple1 = (i[:firstPipe],\
              i[firstPipe+1:secondPipe],\
              i[secondPipe+1:i.find('\n')])
    parsedFile.append(tuple1)
fil.close()
##search criteria:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required
##step 2: use the parsed contents to write the new file
filout = open('newFile','w')
stringToWrite = parsedFile[0][0] + '|' + parsedFile[0][1] + '|' + parsedFile[0][2] + '\n'
filout.write(stringToWrite) ##write the first entry (file header)
for i in range(1,len(parsedFile)):
    if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
        ##copy the record before, the matching record, and the record after
        for j in range(-1,2,1):
            stringToWrite = parsedFile[i+j][0] + '|' + parsedFile[i+j][1] + '|' + parsedFile[i+j][2] + '\n'
            filout.write(stringToWrite)
stringToWrite = parsedFile[-1][0] + '|' + parsedFile[-1][1] + '|' + parsedFile[-1][2] + '\n'
filout.write(stringToWrite) ##write the last entry (file footer)
filout.close()
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.
I tip (Beasley and Joran elyase) very interesting, but it only allows to get the contents of the line 03000. I would like to get the contents of the lines 00000 to line 99999.
I even managed to do here, but I am not satisfied, I wanted to make a more cleaner.
See how I did:
# Copy whole 00000..99999 registers whose 03000 line carries 'TO'.
# `url` is the input path, assumed to be defined by the caller's scope.
file = open(url, 'r')
newFile = open("newFile.txt", 'w')
lines = file.readlines()
file.close()
i = 0
state = ''     # was unbound: a 99999 line before any 03000 raised NameError
lineTemp = []  # buffer of the register currently being scanned
for line in lines:
    lineTemp.append(line)
    if line[0:5] == '03000':
        # NOTE(review): column positions 21:23 assume a fixed-width record;
        # in the posted sample 'TO' sits at columns 6:8 -- confirm format.
        state = line[21:23]
    if line[0:5] == '99999':
        if state == 'TO':
            newFile.writelines(lineTemp)
        # Reset the buffer after EVERY register.  The original reset a
        # misspelled name (`linhaTemp`) in the else branch only, so
        # rejected registers leaked into the next accepted one and
        # accepted registers were written again on the next match.
        lineTemp = []
        i = i + 1
newFile.close()
Suggestions...
Thanks to all!

Python CSV module, add column to the side, not the bottom

I am new in python, and I need some help. I made a python script that takes two columns from a file and copies them into a "new file". However, every now and then I need to add columns to the "new file". I need to add the columns on the side, not the bottom. My script adds them to the bottom. Someone suggested using CSV, and I read about it, but I can't make it in a way that it adds the new column to the side of the previous columns. Any help is highly appreciated.
Here is the code that I wrote:
import sys
import re

# Usage: script.py <input-file> <output-name>; appends columns 2 and 3 of
# every data line (after the last "2theta" header) to <output-name>.txt.
filetoread = sys.argv[1]
filetowrite = sys.argv[2]
newfile = str(filetowrite) + ".txt"

# Context managers close the files even on error (the original relied on
# explicit close() calls at the end).
with open(filetoread, "r") as openold:
    rline = openold.readlines()

# Find the LAST line containing the "2theta" header; the data follows it.
start = 0
for i in range(len(rline)):
    if "2theta" in rline[i]:
        start = i

# "a" preserves the original append behavior: repeated runs add to the
# bottom of the output file.
with open(newfile, "a") as opennew:
    for line in rline[start + 1:]:
        words = line.split()
        opennew.write(words[1] + " " + words[2] + "\n")
Here is the second code I wrote, using CSV:
import sys
import re
import csv
filetoread = sys.argv[1]
filetowrite = sys.argv[2]
newfile = str(filetowrite) + ".txt"
openold = open(filetoread,"r")
rline = openold.readlines()
number = int(len(rline))
start = 0
# Find the LAST line containing the "2theta" header; data follows it.
for i in range (len(rline)) :
    if "2theta" in rline[i] :
        start = i
words1 = []  # second-column values
words2 = []  # third-column values
for line in rline[start + 1 : number] :
    words = line.split()
    word1 = words[1]
    word2 = words[2]
    # NOTE(review): appending one-element LISTS makes csv emit "['value']"
    # strings -- append the bare strings instead; confirm intended output.
    words1.append([word1])
    words2.append([word2])
# NOTE(review): 'wb' is the Python 2 csv convention; on Python 3 this needs
# open(newfile, 'w', newline='').  `file` also shadows a builtin name.
with open(newfile, 'wb') as file:
    writer = csv.writer(file, delimiter= "\n")
    writer.writerow(words1)
    writer.writerow(words2)
These are some samples of input files:
https://dl.dropbox.com/u/63216126/file5.txt
https://dl.dropbox.com/u/63216126/file6.txt
My first script works "almost" great, except that it writes the new columns at the bottom and I need them at side of the previous columns.
The proper way to use writerow is to give it a single list that contains the data for all the columns.
# Build ONE row holding both column values, then emit it as a single CSV
# record: writerow expects the whole row as one list, which is what places
# the values side by side instead of on separate lines.
words.append(word1)
words.append(word2)
writer.writerow(words)

Categories