I am working on a fairly basic encoder/decoder where you can input your own text file (as a string) and your own encoder (also as a string: it must be a text file).
Here is my decoder function:
def cDecode(file_name, encoder='standard_encoder.txt', save_new=True):
    '''Decodes <'file_name'> with the reverse method of <'encoder'>.

    Parameters:
        file_name -- path of the encoded text file; '.txt' is appended when
                     it is missing.
        encoder   -- path of the encoder table, one 'plain: coded' pair per
                     line (like such: 'A: Ð'); '.txt' is appended when it is
                     missing. If the table cannot be opened,
                     'standard_encoder.txt' is used instead.
        save_new  -- True: write the result to a separate '..._decoded.txt'
                     file. False: replace <'file_name'> with the decoded text.

    Raises:
        TypeError -- an argument has the wrong type.
        NameError -- <'file_name'> could not be opened.

    NOTE (original author): "does not decode multi-lines correctly --
    everything goes on a single line." Every decoded line is written out
    with its own line break, so multi-line files keep their layout.
    '''
    import os

    if not isinstance(file_name, str) or not isinstance(encoder, str):
        raise TypeError("<'file_name'> and <'encoder'> must be of type <'str'>.")
    if not isinstance(save_new, bool):
        raise TypeError("<'save_new'> must be of type <'bool'>.")
    if file_name[-4:] != '.txt':
        file_name += '.txt'
    if encoder[-4:] != '.txt':
        encoder += '.txt'

    try:
        decoder_set = _load_decoder(encoder)
    except IOError:
        # The custom table is unreadable: fall back to the stock one.
        decoder_set = _load_decoder('standard_encoder.txt')

    # Here is the rest of the function: decode the input line by line.
    try:
        with open(file_name, 'r') as my_file:
            decoded_lines = [_decode_line(line, decoder_set) for line in my_file]
    except IOError:
        raise NameError(file_name + ' was not found. Decoding process terminated.')

    # 'w' rather than 'a+': decoding the same file twice must not append a
    # second copy of the text to an existing result.
    decoded_file_name = _decoded_name(file_name)
    with open(decoded_file_name, 'w') as decoded_file:
        decoded_file.writelines(decoded_lines)

    if not save_new:
        # In-place mode: the decoded copy takes over the original name.
        # os.replace also overwrites on Windows; os.rename is the fallback
        # on interpreters that do not have it.
        replace = getattr(os, 'replace', os.rename)
        replace(decoded_file_name, file_name)


def _load_decoder(encoder_path):
    '''Read an encoder table and return the reversed {coded: plain} mapping.'''
    decoder_set = {}
    with open(encoder_path, 'r') as encoding_file:
        for line in encoding_file:
            line_parts = line.split(': ')
            if len(line_parts) < 2:
                continue  # blank or malformed line: nothing to map
            my_key, my_value = line_parts[1], line_parts[0]
            # NOTE (original author): "I think the error is in here: I have to
            # remove the '\n' because every character (in the decoding file)
            # is on a new line." Only the first line break is dropped, which
            # affects the table key, never the text being decoded.
            my_key = my_key.replace('\n', '', 1)
            decoder_set[my_key] = my_value
    return decoder_set


def _decode_line(line, decoder_set):
    '''Map every character through decoder_set; unknown characters pass through.'''
    return ''.join(decoder_set.get(char, char) for char in line)


def _decoded_name(file_name):
    '''Build the output name: '<base>_decoded.txt' for '<base>_encoded.txt'.'''
    # Only strip the 12-character '_encoded.txt' suffix when it is really
    # there; otherwise just drop '.txt' instead of chopping the name blindly.
    if file_name.endswith('_encoded.txt'):
        base = file_name[:-12]
    else:
        base = file_name[:-4]
    return base + '_decoded' + file_name[-4:]
Say I have a multi-line text-file:
This is a test.
As is this one.
Good bye!
When encoded and then decoded afterward, it shows up like this: This is a test.As is this one.Good bye!.
How can I fix this? I'm expecting it to show up like:
This is a test.
As is this one.
Good bye!
Add a '\n' while writing back the line to file:
I am writing code in Python that removes all the text after a specific word, but some lines are missing from the output. I have a Unicode text file which has 3 lines:
my name is test1
my name is
my name is test 2
What I want is to remove text after word "test" so I could get the output as below
my name is test
my name is
my name is test
I have written code that does the task, but it also removes the second line, "my name is".
My code is below
# Truncate every line of test.txt right after the keyword and write the
# result back to the same file.
txt = ""
with open(r"test.txt", 'r') as fp:
    for line in fp.readlines():
        splitStr = "test"
        index = line.find(splitStr)
        # Only lines that contain the keyword (at a position after column 0)
        # are collected; a line without it (find() returns -1) is left out of
        # txt. This is the behaviour the question is asking about.
        if index > 0:
            txt += line[:index + len(splitStr)] + "\n"
with open(r"test.txt", "w") as fp:
    # Write the collected text back over the original file.
    fp.write(txt)
It looks like the index becomes -1 when the keyword is not found.
So you are skipping the lines without the keyword.
I would modify your if by adding the condition as follows:
# Truncate every line of test.txt right after the keyword; lines without the
# keyword are kept unchanged.
txt = ""
splitStr = "test"
with open(r"test.txt", 'r') as fp:
    for line in fp:
        index = line.find(splitStr)
        if index >= 0:
            # >= 0 (not > 0): a keyword at the very start of the line counts too.
            txt += line[:index + len(splitStr)] + "\n"
        else:
            # Keyword absent: keep the line as it is (it already ends in "\n").
            txt += line
with open(r"test.txt", "w") as fp:
    fp.write(txt)
No need to add \n because the line already contains it.
Your code does not append the line if splitStr is not found in it.
# Truncate every line of test.txt right after the keyword; lines without the
# keyword are kept unchanged.
txt = ""
splitStr = "test"
with open(r"test.txt", 'r') as fp:
    for line in fp:
        index = line.find(splitStr)
        if index != -1:
            txt += line[:index + len(splitStr)] + "\n"
        else:
            # Without this else, a truncated line would be followed by a
            # second, untruncated copy of itself.
            txt += line
with open(r"test.txt", "w") as fp:
    fp.write(txt)
In my solution I simulate the input file via io.StringIO. Compared to your code, my solution removes the else branch and uses only one += operator. Also, splitStr is set only once and not on each iteration. This makes the code clearer and reduces possible sources of error.
import io

# simulates a file for this example
the_file = io.StringIO("""my name is test1
my name is
my name is test 2""")

txt = ""
splitStr = "test"

with the_file as fp:
    # each line
    for line in fp:
        # find the keyword once; -1 means there is nothing to cut
        index = line.find(splitStr)
        if index != -1:
            # cut after 'splitStr' and add newline
            line = line[:index + len(splitStr)] + "\n"
        # append line to output
        txt += line
When working with files in Python 3, it is recommended to use pathlib, like this.
import pathlib
# Path object for the file that is read and then rewritten.
file_path = pathlib.Path("test.txt")
# read from file
with file_path.open('r') as fp:
# do something (placeholder: the reading code belongs here, inside the `with`)
# write back to the file
with file_path.open('w') as fp:
# do something (placeholder: the writing code belongs here, inside the `with`)
# NOTE(review): the loop below looks like a separate fragment -- it calls
# readlines() on `fp`, which needs a handle opened for reading; confirm where it belongs.
for line in fp.readlines():
# position of the keyword in the line, or -1 when it is absent
i = line.find('test')
if i != -1:
# keeps only the text before 'test'; the keyword itself is dropped as well
line = line[:i]
I am having an issue getting the train function to work correctly in Python. I cannot modify the def function. I am at the point where I need to read the second file, PosList, one line at a time and match the value of movieWordCount[z] against it in OpenPos. If the word is there, then I am good to increment column 2 of that line by one (columns are separated by a space). If it is not, then I need the else to append it to the end of the file. It does not work. It does not append the value if it is missing, and I am not sure if it will find the value if it is there. I have been stuck getting this to work for two days.
Here is my code segment I am working with:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
#Now use tokenize to split it apart by space and set to new array for me to call column2
print "not found"
lines.append(movieWordCount[z] + " 1" + "\n")
Here is my full code:
#Import Counter
import collections
from collections import Counter
#Was already here but pickle is used for data input and export
import math, os, pickle, re
class Bayes_Classifier:
def __init__(self, trainDirectory = "movie_reviews/"):
'''Checks for the 'iFileList' file listing and pickles one when it is absent.

NOTE(review): trainDirectory is not read anywhere in the visible body; the
directory names used below are hard-coded -- confirm whether that is intended.'''
#If file listing exists skip to train
if os.path.isfile('iFileList'):
print "file found"
#If file listing does not exist, build it
if not os.path.isfile('iFileList'):
print "no file"
newfile = 'iFileList'
tempList = set()
subDir = './movie_reviews'
for filenames in os.listdir(subDir):
my_sub_path = os.path.join(os.sep,subDir,filenames)
# NOTE: this pickles the literal string "filenames", not the loop variable.
self.save("filenames", "try3")
f = []
for fFileObj in os.walk("movie_reviews/"):
# NOTE(review): f is still the empty list at this point in the visible code -- confirm.
pickle.dump(f, open( "save.p", "wb" ))
self.save(f, "try4")
# Pickle tempList into the 'iFileList' file.
with open(newfile, 'wb') as fi:
pickle.dump(tempList, fi)
#print tempList
def train(self):
'''Trains the Naive Bayes Sentiment Classifier.

Reads 'iFileList' line by line, pulls a review file name out of selected
lines and, for names whose middle '-'-separated field is '5', tokenizes
the review under 'movie_reviews/' and looks each token up in 'PosList'.'''
print "File ready for training"
#Open iFileList to use as input for opening movie files
x = 0
OpenIFileList = open('iFileList','r')
print "iFileList now Open"
#Loop through the file
for line in OpenIFileList:
#print "Ready to read lines"
#print "reading line " + line
# Only lines after the first five, and only even-numbered ones, are inspected.
if x > 4:
if x % 2 == 0:
#print line
s = line
if '-' in s:
# The file name is taken as the text between the first pair of single quotes.
comp = s.split("'")
#print comp[2]
print comp[1] #This is what you need for the movie file
compValue1 = comp[1]
#Determine Positive/Negative.
#compType is the variable I am storing it to.
compType = compValue1.split("-",2)[1]
#print compType #Prints that middle value like 5 or 1
# This will do the work based on the value.
if compType == '5':
# print "you have a five" #Confirms the loop I am in.
#If file does not exist, create it
if not os.path.exists('PosList'):
print "no file"
file('PosList', 'w').close()
#Open file that needs to be reviewed for word count
compValue2 = "movie_reviews/" + compValue1
print compValue2 #Prints the directory and file path
OpenMovieList = open(compValue2,'r')
for commentLine in OpenMovieList:
commentPositive = commentLine.split(" ")
commentPositiveCounter = Counter(commentPositive)
#print commentPositiveCounter # " Comment Pos goes here"
#if commentLine != '' or commentLine != ' ':
#Get first word, second word, ....
if commentLine and (not commentLine.isspace()):
movieWordCount = self.tokenize(commentLine)
y = len(movieWordCount) #number of tokens in this line
print y
z = 0
#print movieWordCount[0] # Shows the zero position in the file.
while z < y:
print "position " + str(z) + " word is " + movieWordCount[z] # Shows the word we are at and position id
# PosList is reopened and read in full for every token.
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
# Each entry of `lines` is a whole line of PosList (readlines() keeps the
# line ending), so this is an exact whole-line comparison with the bare token.
if movieWordCount[z] in lines:
print "found"
# NOTE(review): an `else:` appears to be missing before the next line -- confirm.
print "not found"
z = z + 1
#Close the files
x += 1
#for line2 in OpenIFileList.readlines():
#for line in open('myfile','r').readlines():
#Save results
#Close the File List
def loadFile(self, sFilename):
    '''Given a file name, return the contents of the file as a string.

    The file is opened in text mode and closed again before returning,
    even when reading fails.'''
    with open(sFilename, "r") as f:
        return f.read()
def save(self, dObj, sFilename):
    '''Given an object and a file name, write the object to the file using pickle.

    Any existing file of that name is overwritten. Returns None.'''
    # Binary mode: pickle data is bytes, and "wb" works on Python 2 and 3.
    # The with-block guarantees the data is flushed and the handle closed.
    with open(sFilename, "wb") as f:
        pickle.Pickler(f).dump(dObj)
def load(self, sFilename):
    '''Given a file name, load and return the object stored in the file.

    SECURITY: unpickling can execute arbitrary code, so only load files
    that this program wrote itself.'''
    # Binary mode matches how pickle data is stored; the with-block closes
    # the handle even if unpickling raises.
    with open(sFilename, "rb") as f:
        return pickle.Unpickler(f).load()
def tokenize(self, sText):
    '''Given a string of text sText, returns a list of the individual tokens that
    occur in that string (in order).

    A token is either a maximal run of word characters (ASCII letters and
    digits, apostrophe, underscore, hyphen) or a single non-whitespace
    character outside that set. Whitespace only separates tokens.'''
    lTokens = []
    sToken = ""
    for c in sText:
        if re.match("[a-zA-Z0-9]", str(c)) is not None or c == "\'" or c == "_" or c == '-':
            # Still inside a word: keep growing the current token.
            sToken += c
        else:
            # A non-word character ends the current word, if there is one ...
            if sToken != "":
                lTokens.append(sToken)
                sToken = ""
            # ... and is a token of its own unless it is whitespace.
            if c.strip() != "":
                lTokens.append(str(c.strip()))
    # Flush the word that runs up to the end of the text.
    if sToken != "":
        lTokens.append(sToken)
    return lTokens
To open a file for writing, you can use
with open('PosList', 'w') as Open_Pos
As you are using the with form, you do not need to close the file; Python will do that for you at the end of the with-block.
So assuming that the way you add data to the lines variable is correct, you could remove the superfluous code OpenMovieList.close() and OpenPos.close(), and append 2 lines to your code:
with open("PosList") as OpenPos:
lines = OpenPos.readlines()
print lines
if movieWordCount[z] in lines:
print "found"
print "not found"
with open("PosList", "w") as OpenPos:
I ran into a curious problem while parsing json objects in large text files, and the solution I found doesn't really make much sense. I was working with the following script. It copies bz2 files, unzips them, then parses each line as a json object.
import os, sys, json
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Read the source and destination directories from the command line,
# or prompt for them.
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
args = sys.argv
extractDir = outputDir = ""
if (len(args) >= 2):
extractDir = args[1]
# NOTE(review): an `else:` appears to be missing before the prompt below -- confirm.
extractDir = raw_input('Directory to extract from: ')
if (len(args) >= 3):
outputDir = args[2]
# NOTE(review): an `else:` appears to be missing before the prompt below -- confirm.
outputDir = raw_input('Directory to output to: ')
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Copy each file to the output directory and unzip it there.
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Names of the tweet fields that are looked up in every parsed JSON object.
tweetModel = [u'id', u'text', u'lang', u'created_at', u'retweeted', u'retweet_count', u'in_reply_to_user_id', u'coordinates', u'place', u'hashtags', u'in_reply_to_status_id']
# Third element of the first os.walk() result: the plain files directly inside extractDir.
filenames = next(os.walk(extractDir))[2]
for file in filenames:
# NOTE(review): presumably non-.bz2 files are skipped here; the statement under this `if` is not visible -- confirm.
if file[-4:] != ".bz2":
# NOTE: directory and file names are pasted into the shell command unquoted.
os.system("cp " + extractDir + '/' + file + ' ' + outputDir)
os.system("bunzip2 " + outputDir + '/' + file)
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Parse the unzipped file one line at a time; each line is one JSON object.
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# file[:-4] is the file name with its last four characters (".bz2") removed.
input = open (outputDir + '/' + file[:-4], 'r')
output = open (outputDir + '/p_' + file[:-4], 'w+')
for line in input.readlines():
# NOTE(review): the `try:` that pairs with the `except ValueError` below is not visible -- confirm.
tweet = json.loads(line)
# enumerate() yields (index, field name) pairs: field[0] is the index, field[1] the name.
for field in enumerate(tweetModel):
if tweet.has_key(field[1]) and tweet[field[1]] != None:
# NOTE(review): presumably a separator is written before every field but the first; the statement is not visible -- confirm.
if field[0] != 0:
fieldData = tweet[field[1]]
# Non-unicode values are converted to unicode text.
if not isinstance(fieldData, unicode):
fieldData = unicode(str(fieldData), "utf-8")
except ValueError as e:
print ("Parse Error: " + str(e))
print line
line = input.readline()
print "Success! " + str(len(line))
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Remove the unzipped copy once it has been processed.
# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
os.system("rm " + outputDir + '/' + file[:-4])
While reading in certain lines in the for line in input.readlines(): loop, the lines would occasionally be truncated at inconsistent locations. Since the newline character was truncated as well, it would keep reading until it found the newline character at the end of the next json object. The result was an incomplete json object followed by a complete json object, all considered one line by the parser. I could not find the reason for this issue, but I did find that changing the loop to
filedata = input.read()
for line in filedata.splitlines():
worked. Does anyone know what is going on here?
After looking at the source code for file.readlines and string.splitlines, I think I see what's up. Note: this is Python 2.7 source code, so if you're using another version, this answer may or may not apply.
readlines uses the function Py_UniversalNewlineFread to test for a newline, whereas splitlines uses a constant, STRINGLIB_ISLINEBREAK, that just tests for \n or \r. I suspect Py_UniversalNewlineFread is picking up some character in the file stream as a line break when it is not really intended as one; it could be from the encoding, I don't know. But when you dump all that same data into a string, splitlines checks it against \r and \n, finds no match, and moves on until the real line break is encountered, so you get your intended line.
I'm in trouble here. I need to read a .txt file that contains a sequence of records, check which records I want, and copy them to a new file.
The file content is like this (this is just an example, the original file has more than 30 000 lines):
AAAAA|12|120 #begin file
00000|46|150 #begin register
99999|35|436 #end register
00000|46|316 #begin register
99999|33|130 #end register
00000|46|778 #begin register
99999|33|457 #end register
ZZZZZ|15|111 #end file
The records that begin with 03000 and have the characters 'TO' must be written to a new file. Based on the example, the file should look like this:
AAAAA|12|120 #begin file
00000|46|150 #begin register
99999|35|436 #end register
00000|46|778 #begin register
99999|33|457 #end register
ZZZZZ|15|111 #end file
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.read()
# here I need to check if the record exists 03000 characters 'TO', if it exists, copy the recordset 00000-99999 for the new file.
I did multiple searches and found nothing to help me.
Thank you!
with open("file.txt",'r') as inFile, open("newFile.txt","w") as outFile:
outFile.writelines(line for line in inFile
if line.startswith("03000") and "TO" in line)
If you need the previous and the next line, then you have to iterate inFile in triads. First define:
def gen_triad(lines, prev=None):
    '''Yield (previous, current, next) triples over consecutive items of lines.

    lines -- any iterable (an open file, a list, a generator, ...).
    prev  -- value used as "previous" for the first item (default None).

    One triple is produced for every item that has a successor, so the last
    item only ever appears in the "next" slot, and an input with fewer than
    two items yields nothing.
    '''
    lines = iter(lines)  # accept plain iterables, not only iterators
    try:
        current = next(lines)
    except StopIteration:
        # Empty input: finish quietly. Inside a generator an escaping
        # StopIteration would surface as RuntimeError (PEP 479).
        return
    for after in lines:
        yield prev, current, after
        prev, current = current, after
And then do like before:
outFile.writelines(''.join(triad) for triad in gen_triad(inFile)
if triad[1].startswith("03000") and "TO" in triad[1])
import re
pat = ('^00000\|\d+\|\d+.*\n'
rag = re.compile(pat,re.MULTILINE)
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
For files with additional lines between lines beginning with 00000, 03000 and 99999, I didn't find simpler code than this one:
import re
pat = ('(^00000\|\d+\|\d+.*\n'
rag = re.compile(pat,re.MULTILINE)
pit = ('^00000\|.+?^03000\|TO\|\d+.+?^99999\|')
rig = re.compile(pit,re.DOTALL|re.MULTILINE)
def yi(text):
    '''Yield the pieces of text selected by the module-level patterns.

    rag.findall(text) is expected to return (g1, g2) pairs. For each pair,
    g2 is yielded when it is non-empty; otherwise g1 is yielded, but only
    if rig matches at its start.
    '''
    # NOTE(review): assumes `rag` has exactly two capture groups, so that
    # findall() returns 2-tuples -- confirm against its definition.
    for g1, g2 in rag.findall(text):
        if g2:
            yield g2
        elif rig.match(g1):
            yield g1
with open('fifi.txt','r') as f,\
open('newfifi.txt','w') as g:
file = open("file.txt",'r')
newFile = open("newFile.txt","w")
content = file.readlines()
newFile.writelines(filter(lambda x:x.startswith("03000") and "TO" in x,content))
This seems to work. The other answers seem to only be writing out records that contain '03000|TO|' but you have to write out the record before and after that as well.
import sys

# ---------------------------------------------------------------
# Copy the file header, the file trailer and every three-line record whose
# middle (03000) line has 'TO' in its second field to '<input>.out'.
# ---------------------------------------------------------------
# import file
file_name = sys.argv[1]
file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name
# ---------------------------------------------------------------
# create output files
output_file_path = 'C:\\DATA_SAVE\\pick_parts\\' + file_name + '.out'
# ---------------------------------------------------------------
# process file
temp = ''          # lines of the record currently being read
temp_out = ''      # everything that will be written to the output file
good_write = False # the previous 03000 line matched: keep the next line too
bad_write = False  # the previous 03000 line did not match: drop the next line
# The with-block closes both handles even if processing fails.
with open(file_path, "r") as file, open(output_file_path, "w") as output_file:
    for line in file:
        if line[:5] == 'AAAAA':
            # file header: always copied
            temp_out += line
        elif line[:5] == 'ZZZZZ':
            # file trailer: always copied
            temp_out += line
        elif good_write:
            # closing line of a record that is being kept
            temp += line
            temp_out += temp
            temp = ''
            good_write = False
        elif bad_write:
            # closing line of a record that is being dropped
            bad_write = False
            temp = ''
        elif line[:5] == '03000':
            if line[6:8] != 'TO':
                # wrong code: forget the buffered opening line
                temp = ''
                bad_write = True
            else:
                # right code: flush the opening line together with this one
                good_write = True
                temp += line
                temp_out += temp
                temp = ''
        else:
            # opening line of a record: hold it until the 03000 line decides
            temp += line
    output_file.write(temp_out)
AAAAA|12|120 #begin file
00000|46|150 #begin register
99999|35|436 #end register
00000|46|778 #begin register
99999|33|457 #end register
ZZZZZ|15|111 #end file
Does it have to be python? These shell commands would do the same thing in a pinch.
head -1 inputfile.txt > outputfile.txt
grep -C 1 "03000|TO" inputfile.txt >> outputfile.txt
tail -1 inputfile.txt >> outputfile.txt
# Whenever I have to parse text files I prefer to use regular expressions
# You can also customize the matching criteria if you want to
import re

# Lines that start with 03000 and contain 'TO' somewhere after it.
what_is_being_searched = re.compile("^03000.*TO")

# don't use "file" as a variable name since it is (was?) a builtin
# function
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
    for this_line in source_file:
        if what_is_being_searched.match(this_line):
            # copy every matching line to the new file
            destination_file.write(this_line)
and for those who prefer a more compact representation:
import re
with open("file.txt", "r") as source_file, open("newFile.txt", "w") as destination_file:
destination_file.writelines(this_line for this_line in source_file
if re.match("^03000.*TO", this_line))
fileName = '1'
import string

##step 1: parse the file.
## Every line becomes a tuple (first field, second field, rest of the line).
parsedFile = []
with open(fileName, 'r') as fil:
    for i in fil:
        ##tuple1 = (1,2,3)
        firstPipe = i.find('|')
        secondPipe = i.find('|', firstPipe + 1)
        tuple1 = (i[:firstPipe],
                  i[firstPipe + 1:secondPipe],
                  i[secondPipe + 1:].rstrip('\n'))
        parsedFile.append(tuple1)

##search criterias:
searchFirst = '03000'
searchString = 'TO' ##can be changed if and when required

##step 2: used the parsed contents to write the new file
with open('newFile', 'w') as filout:
    if parsedFile:
        stringToWrite = '|'.join(parsedFile[0]) + '\n'
        filout.write(stringToWrite) ##to write the first entry
    ## Stop one short of the end so that i + 1 always exists; the last entry
    ## is the file trailer and is written separately below.
    for i in range(1, len(parsedFile) - 1):
        if parsedFile[i][1] == searchString and parsedFile[i][0] == searchFirst:
            ## write the matching line together with the line before and after it
            for j in range(-1, 2, 1):
                stringToWrite = '|'.join(parsedFile[i + j]) + '\n'
                filout.write(stringToWrite)
    if len(parsedFile) > 1:
        stringToWrite = '|'.join(parsedFile[-1]) + '\n'
        filout.write(stringToWrite) ##to write the last entry
I know that this solution may be a bit long. But it is quite easy to understand. And it seems an intuitive way to do it. And I have already checked this with the Data that you have provided and it works perfectly.
Please tell me if you need some more explanation on the code. I will definitely add the same.
The tips (from Beasley and Joran elyase) are very interesting, but they only retrieve the contents of the 03000 line. I would like to get the contents of every line from 00000 through 99999.
I did manage to do it here, but I am not satisfied; I wanted something cleaner.
See how I did:
# NOTE(review): `url` is not defined in the visible snippet -- presumably the input file path; confirm.
file = open(url,'r')
newFile = open("newFile.txt",'w')
lines = file.readlines()
i = 0
# NOTE(review): presumably the buffer for the lines of the current record; nothing visible appends to it -- confirm.
lineTemp = []
for line in lines:
if line[0:5] == '03000':
# characters 21-22 of the 03000 line are taken as the state code
state = line[21:23]
if line[0:5] == '99999':
if state == 'TO':
# NOTE(review): this assigns `linhaTemp`, while the list created above is `lineTemp` -- confirm which name is intended.
linhaTemp = []
i = i+1
Thanks to all!
I am (attempting) to write a program that searches through a hex file for instances of a hex string between two values, eg. Between D4135B and D414AC, incrementing between the first value until the second is reached- D4135B, D4135C, D4135D etc etc.
I have managed to get it to increment etc, but it’s the search part I am having trouble with.
This is the code I have so far, it's been cobbled together from other places and I need to make it somehow output all search hits into the output file (file_out)
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
def search_process(hx): # searching for two binary strings
# NOTE(review): as posted this is pseudo-code, not runnable Python: `If`, `Print` and
# `Return` are capitalised, the string on the Print line uses typographic quotes, and
# `Flag` differs in case from the global `FLAG`.
# NOTE(review): threeByteHex2 and threeByteHexPlusOne are not defined in the visible
# code, and nothing visible changes them inside the loop -- confirm.
global FLAG
while threeByteHexPlusOne != threeByteHex2: #Keep incrementing until second value reached
If Flag:
if hx.find(threeByteHex2) != -1:
FLAG = False #If threeByteHex = ThreeByteHexPlusOne, end search
Print (“Reached the end of the search”,hx.find(threeByteHexPlusOne))
If hx.find(threeByteHexPlusOne) != -1:
FLAG = True
Return -1 #If no results found
if __name__ == '__main__':
# NOTE(review): FILE_IN and FILE_OUT are not defined in the visible code, and the `try:`
# that pairs with the `except IOError` below is not visible -- confirm.
file_in = open(FILE_IN, "r") #opening input file
file_out = open(FILE_OUT, 'w') #opening output file
# NOTE: `file_in.read` without parentheses is the method object itself, not the file contents.
hx_read = file_in.read #read from input file
tmp = ''
found = ''
while hx_read: #reading from file till file is empty
hx_read = tmp + hx_read
pos = search_process(hx_read)
while pos != -1:
# NOTE(review): this assigns `hex_read`, while every other line uses `hx_read` -- confirm.
hex_read = hx_read[pos:]
if FLAG:
found = found + hx_read
pos = search_process(hx_read)
# NOTE(review): `bytes_read[]` is not valid Python and `bytes_read` is not defined in the visible code.
tmp = bytes_read[]
hx_read = file_in.read
file_out.write(found) #writing to output file
except IOError:
print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file 3 bytes at a time and if the 3-byte hex string is between the given hex bounds, it writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys
_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])
def _to_base_10_int(value):
    """Return the integer value of the hexadecimal string value."""
    return int(value, 16)


def get_bytes(hex_str):
    """Yield hex_str two characters (one byte) at a time.

    If hex_str has an odd length, the final chunk is a single character.
    """
    # Two characters equals one byte
    for i in range(0, len(hex_str), 2):
        yield hex_str[i:i+2]


def get_three_byte_hexes(hex_str):
    """Yield consecutive, non-overlapping three-byte groups of hex_str.

    Bytes left over at the end that do not fill a whole group are dropped.
    """
    byte_iter = get_bytes(hex_str)
    while True:
        try:
            three_byte_hex = next(byte_iter) + next(byte_iter) + next(byte_iter)
        except StopIteration:
            # Out of bytes: end the generator cleanly. An escaping
            # StopIteration would be turned into RuntimeError (PEP 479).
            return
        yield three_byte_hex


def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
    """Return the three-byte groups of hex_str whose value lies in the range.

    hex_str         -- hexadecimal text, two characters per byte.
    lower_bound_hex -- inclusive lower bound as a hex string, e.g. 'D4135B'.
    upper_bound_hex -- exclusive upper bound as a hex string, e.g. 'D414AC'.

    Groups are compared by numeric value, so letter case does not matter.
    The matches are returned as a list, in the order they occur in hex_str.
    """
    lower_bound = _to_base_10_int(lower_bound_hex)
    upper_bound = _to_base_10_int(upper_bound_hex)
    found = []
    for three_byte_hex in get_three_byte_hexes(hex_str):
        hex_value = _to_base_10_int(three_byte_hex)
        if lower_bound <= hex_value < upper_bound:
            found.append(three_byte_hex)
    return found
if __name__ == "__main__":
# Expect exactly two command-line arguments: <input_file> <output_file>.
# NOTE(review): the `try:` that pairs with the `except` below is not visible here -- confirm.
assert(len(sys.argv) == 3)
except AssertionError:
print _usage_string
# Read the whole input file as bytes.
file_contents = open(sys.argv[1], 'rb').read()
# Base64-decode the contents, then hex-encode the result (Python 2 `str.encode('hex')`).
hex_str = base64.decodestring(file_contents).encode('hex')
# Collect the three-byte groups that fall between the two bounds.
found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
if found:
with open(sys.argv[2], 'wb') as fout:
for _hex in found:
# NOTE(review): the loop body (presumably writing each hit to fout) is not visible here -- confirm.
Check out some more info on generators here