I am trying to write an annotator that loops through a list of names and tags a separate document wherever those names appear. A name can consist of one or two words.
The buffer in the program works: it recognises whether it needs to look at one or two lines of the file for tagging, and it tags when the name currently being checked matches the candidate exactly.
However, instead of trying every name in the list against each candidate, it takes only the name it has on that particular round of the loop and, if that name does not match the candidate, writes the line out and moves on to the next line (with the next name in the list). This obviously results in many names in the file not being tagged when they should be.
Below is my code:
import json
from tagging import import_names
def split_line(line):
"""Split a line into four parts, word, pos, lemma and tag."""
# TODO: Speak to Diana about the spaces in the vert file - do they mean
# anything?
line = line.strip().split()
if len(line) == 1:
word = line[0]
pos, lemma, tag = None, None, None
elif len(line) == 3:
word, pos, lemma = line
tag = ''
elif len(line) == 4:
word, pos, lemma, tag = line
return [word, pos, lemma, tag]
class MWUTagger(object):
"""Contains a buffer of lines split into word, pos, lemma, tag items."""
def __init__(self, f_in, f_out, n, gnrd_file, indicators=None):
"""Populate the buffer."""
# read the input vert file
self.f_in = open(f_in, 'r')
# populate the buffer (first n lines of the vert file)
self.buffer = []
for i in range(n):
self.buffer.append(split_line(self.f_in.readline()))
# read in list of names or save
self.names = import_names(gnrd_file)
# create the output vert file
self.f_out = f_out
def __iter__(self):
return self
def write_line(self):
"""Write out the oldest line in the buffer, and add a new line to the buffer."""
# write the oldest line from the buffer
tagged_line = self.buffer.pop(0)
tagged_line = [i for i in tagged_line if i]
with open(self.f_out, 'a') as f:
if tagged_line[0].startswith('<') and tagged_line[-1].endswith('>'):
f.write(' '.join(tagged_line) + '\n')
else:
f.write('\t'.join(tagged_line) + '\n')
def __next__(self):
"""write out the oldest line in the buffer and add a new line to the buffer"""
#write the oldest line from the buffer
self.write_line()
# add a new line to the buffer (found an example here https://bufferoverflow.com/a/14797993/1706564)
line = self.f_in.readline()
if line:
self.buffer.append(split_line(line))
else:
self.f_in.close()
self.flush()
raise StopIteration
def flush(self):
"""Write all remaining lines from buffer file to the output file"""
while self.buffer:
self.write_line()
def check_for_name(self, name):
"""Depending on length of name, check if the first n items in the buffer
match name."""
# check if tagged
if self.buffer[0][-1] == 'SCI':
return
name = name.strip().split()
name = [n + '-n' for n in name]
n = len(name)
# check if they match
candidate = [line[2] for line in self.buffer[:n]]
if name == candidate:
# edit the tags in the first n items in the buffer if they do
for i in range(n):
self.buffer[i][-1] += "SCI%i" % (i + 1)
# check if other names in the dictionary match
def main():
mwutagger = MWUTagger('zenodo_test_untag.vert', 'zenodomwutagged.vert', 2,'JSON_file_test.json')
while True:
try:
for name in mwutagger.names:
mwutagger.check_for_name(name)
mwutagger.__next__()
except StopIteration:
break
if __name__ == '__main__':
main ()
I am unsure whether I need to add something to the check_for_name method to say: if candidate != name, move on to the next name, and only write the line once the end of the list has been reached without a match; or whether that is simply not being handled adequately in the main method.
Can anyone give me advice on this?
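For clarity, the behaviour I want is: try every name against the current buffer first, and only write the oldest line and advance once all names have been checked. Something like the following sketch, assuming check_for_name and __next__ work as shown above:
def main():
    mwutagger = MWUTagger('zenodo_test_untag.vert', 'zenodomwutagged.vert', 2,
                          'JSON_file_test.json')
    while True:
        try:
            # try every name against the current buffer position first...
            for name in mwutagger.names:
                mwutagger.check_for_name(name)
            # ...and only then write the oldest line and read a new one
            next(mwutagger)
        except StopIteration:
            break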
I am new to Python and stuck with a log file in text format that has the following repetitive structure. I need to extract the data from the rows and turn it into columns, depending on the data, e.g.:
The first 50 lines are junk, like the snippet below (first six lines shown):
-------------------------------------------------------------
Logging to file xyz.
Char
1,
3
r
=
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3
----------------------------------------------
a 1
b 2
c 3
----------------------------------------------
Pid 0
Name SAB=1, XYZ=3, P_NO=546467
----------------------------------------------
Test_data_1 00001
Test_data_2 FOXABC
Test_data_3 SHEEP123
Country US
----------------------------------------------
Pid 0
Name SAB=1
----------------------------------------------
Sno 893489423
Log FileFormat
------------Continues for another million lines.
Now the required output is like below:
Required output format
PID, Name, a,b,c
0, "SAB=1, XYZ=3", 1,2,3
PID, Name , Test_data_1, Test_data_2, Test_data_3, Country
0, "SAB=1, XYZ=3, P_NO=546467", 00001, FOXABC, SHEEP123, US
Pid, Name, Sno
0, SAB=1, 893489423
I tried to write code but failed to get the desired results. My attempt was as below:
'''
fn=open(file_name,'r')
for i,line in enumerate(fn ):
if i >= 50 and "Name " in line: # for first 50 line deletion/or starting point
last_tag=line.split(",")[-1]
last_element=last_tag.split("=")[0]
print(last_element)
'''
Any help would be really appreciated.
Newly Discovered Structure
RBY Structure
The solution I came up with is a bit messy, but it works; check it out below:
import sys
import re
import StringIO
ifile = open(sys.argv[1],'r') #Input log file as command-line argument
ofile = open(sys.argv[1][:-4]+"_formatted.csv",'w') #output formatted log txt
stringOut = ""
i = 0
flagReturn = True
j = 0
reVal = re.compile("Pid[\s]+(.*)\nName[\s]+(.*)\n[-]+\<br\>(.*)\<br\>") #Regex pattern for separating the Pid & Name from the variables
reVar = re.compile("(.*)[ ]+(.*)") #Regex pattern for getting vars and their values
reVarStr = re.compile(">>> [0-9]+.(.*)=(.*)") #Regex Pattern for Struct
reVarStrMatch = re.compile("Struct(.*)+has(.*)+members:") #Regex pattern for Struct check
for lines in ifile.readlines():
if(i>8): #Omitting the first 9 lines of Garbage values
if(lines.strip()=="----------------------------------------------"): #Checking for separation between PID & Name group and the Var group
j+=1 #variable keeping track of whether we are inside the vars section or not (between two rows of hyphens)
flagReturn = not flagReturn #To print the variables in single line to easily separate them with regex pattern reVal
if(not flagReturn):
stringTmp = lines.strip()+"<br>" #adding break to the end of each vars line in order for easier separation
else:
stringTmp = lines #if not vars then save each line as is
stringOut += stringTmp #concatenating each lines to form the searchable string
i+=1 #incrementing for omitting lines (useless after i=8)
if(j==2): #Once a complete set of PIDs, Names and Vars have been collected
j=0 #Reset j
matchObj = reVal.match(stringOut) #Match for PID, Name & Vars
line1 = "Pid,Name,"
line2 = matchObj.group(1).strip()+",\""+matchObj.group(2)+"\","
buf = StringIO.StringIO(matchObj.group(3).replace("<br>","\n"))
structFlag = False
for line in buf.readlines(): #Separate each vars and add to the respective strings for writing to file
if(not (reVarStrMatch.match(line) is None)):
structFlag = True
elif(structFlag and (not (reVarStr.match(line) is None))):
matchObjVars = reVarStr.match(line)
line1 += matchObjVars.group(1).strip()+","
line2 += matchObjVars.group(2).strip()+","
else:
structFlag = False
matchObjVars = reVar.match(line)
try:
line1 += matchObjVars.group(1).strip()+","
line2 += matchObjVars.group(2).strip()+","
except:
line1 += line.strip()+","
line2 += " ,"
ofile.writelines(line1[:-1]+"\n")
ofile.writelines(line2[:-1]+"\n")
ofile.writelines("\n")
stringOut = "" #Reseting the string
ofile.close()
ifile.close()
EDIT
This is what I came up with to include the new pattern as well.
I suggest you do the following:
1. Run the parser script on a copy of the log file and see where it fails next.
2. Identify and write down the new pattern that broke the parser.
3. Delete all data in the newly identified pattern.
4. Repeat from Step 1 till all patterns have been identified.
5. Create an individual regular expression pattern for each type of pattern and call them from separate functions to write to the string.
EDIT 2
structFlag = False
RBYflag = False
for line in buf.readlines(): #Separate each vars and add to the respective strings for writing to file
if(not (reVarStrMatch.match(line) is None)):
structFlag = True
elif(structFlag and (not (reVarStr.match(line) is None))):
matchObjVars = reVarStr.match(line)
if(matchObjVars.group(1).strip()=="RBY" and not RBYFlag):
line1 += matchObjVars.group(1).strip()+","
line2 += matchObjVars.group(2).strip()+"**"
RBYFlag = True
elif(matchObjVars.group(1).strip()=="RBY"):
line2 += matchObjVars.group(2).strip()+"**"
else:
if(RBYFlag):
line2 = line2[:-2]
RBYFlag = False
line1 += matchObjVars.group(1).strip()+","
line2 += matchObjVars.group(2).strip()+","
else:
structFlag = False
if(RBYFlag):
line2 = line2[:-2]
RBYFlag = False
matchObjVars = reVar.match(line)
try:
line1 += matchObjVars.group(1).strip()+","
line2 += matchObjVars.group(2).strip()+","
except:
line1 += line.strip()+","
line2 += " ,"`
NOTE
This loop has become very bloated, and it would be better to create a separate function that identifies the type of data and returns a value accordingly; a rough sketch of that idea follows.
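As a rough sketch of that idea (the patterns and labels here are illustrative, not taken from the actual log format):
import re

# illustrative patterns only -- adapt them to the real log lines
RE_STRUCT_HDR = re.compile(r"Struct(.*)has(.*)members:")
RE_KEY_VALUE = re.compile(r"(\S+)\s+(.*)")

def classify(line):
    """Return a label describing what kind of log line this is."""
    if RE_STRUCT_HDR.match(line):
        return "struct_header"
    if RE_KEY_VALUE.match(line):
        return "key_value"
    return "unknown"

def handle(line, line1, line2):
    """Append one line's header and value to the output strings."""
    if classify(line) == "key_value":
        m = RE_KEY_VALUE.match(line)
        line1 += m.group(1).strip() + ","
        line2 += m.group(2).strip() + ","
    return line1, line2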
I have many lines like the following:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
>ENSG00000003137|ENST00000412253|CYP26B1|72370133;72362405|72370213;72362548|4025
AGCCTTTTTCTTCGACGATTTCCG
In this example ENSG00000003137 is the name and 4732, the last field, is the length. As you can see, some names are repeated but they have different lengths.
I want to make a new file in which I only keep the entries with the longest length, meaning the results would be like this:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
I have made this code to split but don't know how to make the file I want:
file = open("file.txt", "r")
for line in file:
    if line.startswith(">"):
        line = line.split("|")
You'll need to read the file twice; the first time round, track the largest size per entry:
largest = {}
with open(inputfile) as f:
for line in f:
if line.startswith('>'):
parts = line.split('|')
name, length = parts[0][1:], int(parts[-1])
largest[name] = max(length, largest.get(name, -1))
then write out the copy in a second pass, but only those sections whose name and length match the extracted largest length from the first pass:
with open(inputfile) as f, open(outputfile, 'w') as out:
copying = False
for line in f:
if line.startswith('>'):
parts = line.split('|')
name, length = parts[0][1:], int(parts[-1])
copying = largest[name] == length
if copying:
out.write(line)
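Doing this in two passes also means the sequence data never has to be held in memory: the first pass keeps only one integer per name in largest, and the second pass streams the matching sections straight to the output file.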
You have to do two types of handling in the loop: one that compares your length values, and one that stores the sequence (CGTA) data when it's needed. I wrote an example for you that reads those into dicts:
file = open("file.txt", "r")
myDict = {}
myValueDict = {}
action = 'remember'
geneDict = {}
for line in file:
if line.startswith(">"):
line = line.rstrip().split("|")
line_name = line[0]
line_number = int(line[-1])
if line_name in myValueDict:
if myValueDict[line_name] < line_number:
action = 'remember'
myValueDict[line_name] = line_number
myDict[line_name] = line
else:
action = 'forget'
else:
myDict[line_name] = line
myValueDict[line_name] = line_number
else:
if action == 'remember':
geneDict[line_name] = line.rstrip()
for key in myDict:
print(myDict[key])
for key in geneDict:
print(geneDict[key])
This ignores the lower-length items. You can now store those dicts any way you want.
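If you then want the kept records written to a new file in the same >name|...|length plus sequence layout, a minimal sketch using the dicts built above (the output file name is just an example):
with open("longest.txt", "w") as out:
    for key, header in myDict.items():
        out.write("|".join(header) + "\n")   # header was split on "|" above
        if key in geneDict:
            out.write(geneDict[key] + "\n")  # the sequence line that was kept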
I was given this code to transform an arff file. I had to download the numpy library, and now when I try to run it with my files it gives me KeyErrors such as:
" imgInfo[1][clstrDct[clstr]] += 1 # increment the cluster count
KeyError: 'cluster35\r'"
import numpy as np
def xfrm(arFil='KBcls-100-10-20'):
'''transform a clustered patch arff file to an image training / test file'''
global imgDct, clstrDct, num, clsts, lne
imgDct = {}
clstrDct = {}
with open(arFil + '.arff', 'r') as ptchFil:
while True: # find Cluster attribute
lne = ptchFil.readline()
if lne == '': return 'EOF bfore one'
if lne.lower().startswith('#attribute cluster'):
clsts = lne[lne.find('{')+1 : lne.find('}')].split(',')
num = len(clsts)
break
for i in range(len(clsts)): # map cluster names to integers 0+ w/ inverted mapping also
clstrDct[clsts[i]] = i
clstrDct[i] = clsts[i]
while True: # first patch data line
lne = ptchFil.readline()
if lne == '': return 'EOF bfore two'
if lne.startswith('#data'): break
while True:
lne = ptchFil.readline() # read through patch lines
if lne == '': break # EOF
if lne[-1] == '\n': lne=lne[:-1] # all end with \n except possibly the last line of the file
attrs = lne.split(',')
imgId = attrs[0]
clstr = attrs[-1]
cls = attrs[-2]
try: imgInfo = imgDct[imgId]
except KeyError:
imgInfo = [cls, np.zeros((num), dtype=int)] # new cluster counting array
imgDct[imgId] = imgInfo
imgInfo[1][clstrDct[clstr]] += 1 # increment the cluster count
with open(arFil + '-img.arff', 'w') as arFile:
arFile.write('% from {0:}.arff: {1:} patch clusters\n%\n'.format(arFil, num))
arFile.write('#relation Image-Patch-Clusters\n#attribute Image-ID numeric\n')
for i in range(num):
arFile.write('#attribute {} numeric\n'.format(clstrDct[i])) # cluster attributes
arFile.write('#attribute class {unknown, street, highway}\n#data')
for imid,iminfo in imgDct.items():
arFile.write('\n{}, '.format(imid))
for i in range(num):
arFile.write('{}, '.format(iminfo[1][i]))
arFile.write('{}'.format(iminfo[0]))
if __name__ == "__main__":
xfrm('Test1Clust')
readline includes the line ending along with the rest of the content. This means that you have an extra \r, \n, or \r\n at the end of every attrs[-1]. This is why there is a \r in "cluster35\r". You can get rid of it using strip():
clstr = attrs[-1].strip()
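A quick illustration with a made-up data line:
lne = "img01,street,cluster35\r\n"
if lne[-1] == '\n':
    lne = lne[:-1]                   # the loop above only removes the '\n'
attrs = lne.split(',')
print(repr(attrs[-1]))               # 'cluster35\r' -> KeyError as a dict key
print(repr(attrs[-1].strip()))       # 'cluster35'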
I've been trying to write lines to a file based on specific file names from the same directory, a search for those file names in another log file (given as an input), and the modified date of the files.
The output seems to be limited to under 80 characters per line.
def getFiles(flag, file):
if (flag == True):
file_version = open(file)
if file_version:
s = mmap.mmap(file_version.fileno(), 0, access=mmap.ACCESS_READ)
file_version.close()
file = open('AllModules.txt', 'wb')
for i, values in dict.items():
# search keys in version file
if (flag == True):
index = s.find(bytes(i))
if index > 0:
s.seek(index + len(i) + 1)
m = s.readline()
line_new = '{:>0} {:>12} {:>12}'.format(i, m, values)
file.write(line_new)
s.seek(0)
else:
file.write(i +'\n')
file.close()
if __name__ == '__main__':
dict = {}
for file in os.listdir(os.getcwd()):
if os.path.splitext(file)[1] == '.psw' or os.path.splitext(file)[1] == '.pkw':
time.ctime(os.path.getmtime(file))
dict.update({str(os.path.splitext(file)[0]).upper():time.strftime('%d/%m/%y')})
if (len(sys.argv) > 1) :
if os.path.exists(sys.argv[1]):
getFiles(True, sys.argv[1])
else:
getFiles(False, None)
The output is always like:
BW_LIB_INCL 13.1 rev. 259 [20140425 16:28]
16/05/14
The interpretation of the data is correct, but the formatting is not, as the time is put on the next line rather than on the same one.
This is happening to all the lines of my new file.
Could someone give me a hint?
m = s.readline() has a \n at the end of the line. Then you're doing .format(i, m, values), which writes m in the middle of the string.
I leave it as an exercise to the reader to find out what happens when you write such a line to a file. :-)
(hint: m = s.readline().rstrip('\n'))
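A quick illustration with made-up values:
i, m, values = 'BW_LIB_INCL', '13.1 rev. 259 [20140425 16:28]\n', '16/05/14'
print('{:>0} {:>12} {:>12}'.format(i, m, values))
# -> the '\n' inside m pushes values onto the next line
print('{:>0} {:>12} {:>12}'.format(i, m.rstrip('\n'), values))
# -> everything now stays on one line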
I am (attempting) to write a program that searches through a hex file for instances of a hex string between two values, e.g. between D4135B and D414AC, incrementing from the first value until the second is reached: D4135B, D4135C, D4135D, etc.
I have managed to get it to increment etc, but it’s the search part I am having trouble with.
This is the code I have so far; it's been cobbled together from other places, and I need to make it somehow output all search hits into the output file (file_out).
I have exceeded the limit of my Python understanding and I'm sure there's probably a much easier way of doing this. I would be very grateful for any help.
def search_process(hx): # searching for two binary strings
global FLAG
while threeByteHexPlusOne != threeByteHex2: #Keep incrementing until second value reached
If Flag:
if hx.find(threeByteHex2) != -1:
FLAG = False #If threeByteHex = ThreeByteHexPlusOne, end search
Print (“Reached the end of the search”,hx.find(threeByteHexPlusOne))
Else:
If hx.find(threeByteHexPlusOne) != -1:
FLAG = True
Return -1 #If no results found
if __name__ == '__main__':
try:
file_in = open(FILE_IN, "r") #opening input file
file_out = open(FILE_OUT, 'w') #opening output file
hx_read = file_in.read #read from input file
tmp = ''
found = ''
while hx_read: #reading from file till file is empty
hx_read = tmp + hx_read
pos = search_process(hx_read)
while pos != -1:
hex_read = hx_read[pos:]
if FLAG:
found = found + hx_read
pos = search_process(hx_read)
tmp = bytes_read[]
hx_read = file_in.read
file_out.write(found) #writing to output file
except IOError:
print('FILE NOT FOUND!!! Check your filename or directory/PATH')
Here's a program that looks through a hex string from a file three bytes at a time and, if the 3-byte hex string is between the given hex bounds, writes it to another file. It makes use of generators to make getting the bytes from the hex string a little cleaner.
import base64
import sys
_usage_string = 'Usage: python {} <input_file> <output_file>'.format(sys.argv[0])
def _to_base_10_int(value):
return int(value, 16)
def get_bytes(hex_str):
# Two characters equals one byte
for i in range(0, len(hex_str), 2):
yield hex_str[i:i+2]
def get_three_byte_hexes(hex_str):
bytes = get_bytes(hex_str)
while True:
try:
three_byte_hex = next(bytes) + next(bytes) + next(bytes)
except StopIteration:
break
yield three_byte_hex
def find_hexes_in_range(hex_str, lower_bound_hex, upper_bound_hex):
lower_bound = _to_base_10_int(lower_bound_hex)
upper_bound = _to_base_10_int(upper_bound_hex)
found = []
for three_byte_hex in get_three_byte_hexes(hex_str):
hex_value = _to_base_10_int(three_byte_hex)
if lower_bound <= hex_value < upper_bound:
found.append(three_byte_hex)
return found
if __name__ == "__main__":
try:
assert(len(sys.argv) == 3)
except AssertionError:
print _usage_string
sys.exit(2)
file_contents = open(sys.argv[1], 'rb').read()
hex_str = base64.decodestring(file_contents).encode('hex')
found = find_hexes_in_range(hex_str, 'D4135B', 'D414AC')
print('Found:')
print(found)
if found:
with open(sys.argv[2], 'wb') as fout:
for _hex in found:
fout.write(_hex)
Check out some more info on generators here
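Note that this answer is written for Python 2 (the print statement, base64.decodestring and str.encode('hex')). If you are on Python 3, the decoding step would look roughly like the following, assuming the input file really is base64-encoded as the original code implies:
import base64

with open('input.log', 'rb') as f:                  # file name is a placeholder
    file_contents = f.read()
# Python 3 replacement for decodestring(...).encode('hex')
hex_str = base64.decodebytes(file_contents).hex()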