How to parse this custom log file in Python3 - python

The log file is generated by a program written in C++.
Here is the demo log:
|Frame:0|NUMBER:0
|Frame:1|NUMBER:1|{INDEX:0|RECT:[11,24][31,43]}
|Frame:2|NUMBER:2|{INDEX:0|RECT:[11,24][31,43]}|{INDEX:1|RECT:[11,24][31,43]}
|Frame:3|NUMBER:0
I am trying to read these log files into a list, a dict, or a similar structure.
Here is the information that I hope to capture from the demo log above:
#frame, number, index, rect
[0, 0]
[1, 1, 0, 11,24,31,43]
[2, 2, 0, 11,24,31,43, 1, 11,24,31,43]
[3, 0]

Thanks to #Juan Facundo Peña.
This answer is based on his answer, with an improvement to the handling of duplicate keys.
import re
# Parse every "|Frame:..." line of 2.log into a dict that maps each field
# name to the list of integers found in that field.
# Each "{INDEX:n|RECT:[..][..]}" group is stored under its own key built from
# the index (e.g. "INDEX[0]"), so several groups on one line do not overwrite
# each other.
program_result = []
code_list = []
with open("2.log", "r") as f:
    for line in f:
        if not line.startswith("|Frame:"):
            continue
        code_dict = {}
        # Non-empty while the previous field was an INDEX: the next field
        # (its RECT) is then stored under this key.
        next_rect_idx_key = ""
        for parse in line.split("|"):
            split_line = parse.strip("{}").split(":")
            key = split_line[0]
            if not key:
                # Empty piece before the leading "|".
                continue
            data_as_integers = [int(s) for s in re.findall(r"\d+", split_line[-1])]
            if next_rect_idx_key:
                code_dict[next_rect_idx_key] = data_as_integers
                next_rect_idx_key = ""
            elif key == "INDEX":
                next_rect_idx_key = key + str(data_as_integers)
            else:
                code_dict[key] = data_as_integers
        print(code_dict)
        code_list.append(code_dict)

This can be solved using the re library.
import re
# Build one dict per log line: field name -> list of digit strings found in
# that field. A field name that repeats on a line keeps only its last value.
code_list = []
with open("log_file.log", "r") as f:
    logs = f.readlines()
for line in logs:
    code_dict = {}
    for field in line.split("|"):
        pieces = field.split(":")
        name = pieces[0]
        if name:
            code_dict[name] = re.findall(r"\d+", pieces[-1])
    code_list.append(code_dict)
You will end up with a list of dictionaries (i.e.:code_list), each of which contains both the key and the values in each line.
In line 3, you will have two "INDEX - RECT" dictionaries, but you can then split the whole logs list by "Frame" to understand what codes belong to what line (if needed).
If you only wish for the numbers, you can also try:
import re
# One list of digit strings per log line, in file order.
with open("log_file.log", "r") as f:
    logs = f.readlines()
code_list = [re.findall(r"\d+", line) for line in logs]
This approach will give you a list of lists, each of which contains the numbers found in a single line.
Edit: if you try to loop through a single string other than a file, try:
import re
code_list = []
logs = log_string.split("\n")
for line in logs:
# <<<business as usual>>>

Related

Removing lines that have strings with same hexadecimal values from a text file

I have a file in1.txt
info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"
Expected output: out.txt
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
I need only lines that have same info values and different data values to be written to out.txt.
but the current code removes all the line that have string data in it.
# Asker's current attempt: copies in.txt to out.txt, writing only the lines
# that do not contain the substring 'data'.
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
if 'data' not in line:
fout.write(line.strip()+'\n')
What I need is, for example: lines 1 and 2 have the same info="0x0000b573", and their data values "0x7" and "0x00000007" are numerically the same, so those lines should be removed.
You can use regex
import re
s = '''info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"'''

# Keep the lines whose info value (compared numerically, so case and leading
# zeros do not matter) occurs with more than one distinct data value.
line_pattern = re.compile(r'info="([^"]+)" data="([^"]+)" id="[^"]+"')

data_by_info = {}   # numeric info -> set of numeric data values seen with it
parsed_lines = []   # (numeric info, original line) in input order
for line in s.split('\n'):
    match = line_pattern.search(line)
    if match is None:
        # Lines that do not have the expected shape are ignored.
        continue
    info, data = (int(field, 16) for field in match.groups())
    parsed_lines.append((info, line))
    data_by_info.setdefault(info, set()).add(data)

# Grouping by info avoids assuming that lines come in exact pairs.
final_output = [line for info, line in parsed_lines if len(data_by_info[info]) > 1]
final_out_text = '\n'.join(final_output)
print(final_out_text)
# info="0x00010AC3" data="0x00000003" id="abc_16. PS"
# info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
You could try something like that too, I think
#!/usr/bin/python3
# Count how often each (info, data) pair occurs in in.dat, then write every
# pair that occurs exactly once to out.dat as "info=data".
# info is lower-cased and data is converted from hex to a decimal string, so
# differently written but equal values are counted together.
records = {}
with open("in.dat", "r") as fin:
    for line in fin:
        items = line.split(' ')
        if len(items) < 2:
            # Blank or malformed line: nothing to count.
            continue
        info = items[0].split('=')
        data = items[1].split('=')
        if len(info) < 2 or len(data) < 2:
            # Field without a "name=value" shape.
            continue
        key = info[1].strip('"').lower()
        value = str(int(data[1].strip('"'), 16))
        counts = records.setdefault(key, {})
        counts[value] = counts.get(value, 0) + 1

# Keep, per info key, only the data values that were seen exactly once.
out = {}
for key, counts in records.items():
    unique_values = [value for value, count in counts.items() if count == 1]
    if unique_values:
        out[key] = unique_values

with open("out.dat", "w") as fout:
    for key, values in out.items():
        for value in values:
            fout.write(f"{key}={value}\n")
Something like this could work:
# Copy in.txt to out.txt, keeping only the first line seen for each info value
# (the text between the first pair of double quotes).
found_info_values = set()
with open("in.txt", "r") as fin, open("out.txt", "w") as fout:
    for line in fin:
        info = line.split('"')[1]
        if info not in found_info_values:
            fout.write(line.strip() + '\n')
            # add() stores the whole string; "+=" on a list would have
            # stored its individual characters instead.
            found_info_values.add(info)

Filtering using separators in Python

I have many lines like the following:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
>ENSG00000003137|ENST00000412253|CYP26B1|72370133;72362405|72370213;72362548|4025
AGCCTTTTTCTTCGACGATTTCCG
In this example, ENSG00000003137 is the name and 4732, the last field, is the length. As you can see, some names are repeated, but with different lengths.
I want to make a new file in which I only have those with the longest length. meaning the results would be like this:
>ENSG00000003137|ENST00000001146|CYP26B1|72374964|72375167|4732
CGTCGTTAACCGCCGCCATGGCTCCCGCAGAGGCCGAGT
>ENSG00000001630|ENST00000003100|CYP51A1|91763679|91763844|3210
TCCCGGGAGCGCGCTTCTGCGGGATGCTGGGGCGCGAGCGGGACTGTTGACTAAGCTTCG
I have made this code to split but don't know how to make the file I want:
# Split every header line (the ones starting with ">") into its "|" fields.
# Plain ASCII quotes are required; typographic quotes are a syntax error.
with open("file.txt", "r") as file:
    for line in file:
        if line.startswith(">"):
            line = line.split("|")
You'll need to read the file twice; the first time round, track the largest size per entry:
# First pass: remember, for every name, the largest length found in a header.
largest = {}
with open(inputfile) as f:
    for line in f:
        if not line.startswith('>'):
            continue
        parts = line.split('|')
        name = parts[0][1:]        # first field without the leading ">"
        length = int(parts[-1])    # last field
        previous = largest.get(name, -1)
        largest[name] = max(length, previous)
then write out the copy in a second pass, but only those sections whose name and length match the extracted largest length from the first pass:
# Second pass: copy a header and the lines that follow it only when the
# header's length equals the largest length recorded for its name.
with open(inputfile) as f, open(outputfile, 'w') as out:
    copying = False
    for line in f:
        if line.startswith('>'):
            parts = line.split('|')
            name, length = parts[0][1:], int(parts[-1])
            # Decide once per header; the following sequence lines reuse it.
            copying = largest[name] == length
        if copying:
            out.write(line)
you have to do two types of handling in the loop, one that compares your 'length', and one that stores the CGTA when its needed. I wrote an example for you that reads those into dicts:
# For every name keep the header with the largest length and the sequence
# line that follows it, then print the kept headers and sequences.
myDict = {}        # name -> split header fields of the longest entry so far
myValueDict = {}   # name -> largest length seen so far
geneDict = {}      # name -> sequence line of the entry kept in myDict
action = 'remember'
with open("file.txt", "r") as file:
    for line in file:
        if line.startswith(">"):
            line = line.rstrip().split("|")
            line_name = line[0]
            line_number = int(line[-1])
            if line_name not in myValueDict or myValueDict[line_name] < line_number:
                # New name, or a longer entry for a known name. Setting the
                # action here also covers new names, so a 'forget' left over
                # from the previous header cannot drop their sequence.
                action = 'remember'
                myValueDict[line_name] = line_number
                myDict[line_name] = line
            else:
                action = 'forget'
        elif action == 'remember':
            geneDict[line_name] = line.rstrip()
for key in myDict:
    print(myDict[key])
for key in geneDict:
    print(geneDict[key])
this ignores the lower length items. you can now store those dicts any way you want.

How to format the output of dictionary in python

I would like to format the values of a dictionary in python. Here is the script that i have used to generate the output
# Pass 1: group the whitespace-split lines of no_dup.txt that start with 'E'
# by their first field.
entries = {}
entries1 = {}
with open('no_dup.txt', 'r') as fh_in:
for line in fh_in:
if line.startswith('E'):
line = line.strip()
line = line.split()
entry = line[0]
if entry in entries:
entries[entry].append(line)
else:
entries[entry] = [line]
# Write only the groups that hold exactly one line. format(val) writes the
# repr of a list of lists, which is where the brackets, quotes and commas in
# the output come from. (iteritems() exists only on Python 2 dicts.)
with open('no_dup_out.txt', 'w') as fh_out:
for kee, val in entries.iteritems():
if len(val) == 1:
fh_out.write("{} \n".format(val))
# Pass 2: re-read that output and group its lines by their second field.
with open('no_dup_out.txt', 'r') as fh_in2:
for line in fh_in2:
line = line.strip()
line = line.split()
entry = line[1]
if entry in entries1:
entries1[entry].append(line)
else:
entries1[entry] = [line]
# Again write only the groups with exactly one line, as a list repr.
with open('no_dup_out_final.txt', 'w') as fh_out2:
for kee, val in entries1.iteritems():
if len(val) == 1:
fh_out2.write("{} \n".format(val))
For example by running the above script i generated the following output
[["[['ENSGMOG00000003747',", "'ENSORLG00000006947']]"]]
[["[['ENSGMOG00000003752',", "'ENSORLG00000005385']]"]]
[["[['ENSGMOG00000003760',", "'ENSORLG00000005379']]"]]
[["[['ENSGMOG00000003748',", "'ENSORLG00000004636']]"]]
[["[['ENSGMOG00000003761',", "'ENSORLG00000005382']]"]]
And I would like to format it in such a way that I remove all the brackets, quotes and commas (e.g. ENSGMOG00000003747 ENSORLG00000006947) and output the rest as it is, in tab-delimited format. How can I do that?
If your list of lists is full_list, then you could have the following code give your desired output:
# One tab-joined string per entry of full_list: from every element of the
# entry's first item, take the text between the first pair of single quotes.
desired_list = []
for list_item in full_list:
    names = [element.split('\'')[1] for element in list_item[0]]
    desired_list.append('\t'.join(names))

python script that will filter data from file

I am writing a script to scrape data from a file (any format, e.g. CSV, text, JSON, HTML), match the resulting list against another file, and then replace those particular strings using the other file. Each file contains the same data. I would like to use a regular expression, because I want to extract the names written as %%string%% and store them in a list.
format of file
file1.txt
{
"alias": "%%demo%%",
"demo": "%%demo%%",
"dns_domain": "googlr.com",
"max_physical_memory": "%%maxmemory%%",
"dataset_uuid": "%%DS_UUID%%",
"nics": [
{
"nic_tag": "stub0",
"ip": "%%ip%%",
"netmask": "255.255.240.0",
"primary": "1"
}
]
}
I want to get all of the string in to the list between %%____%% sign
Python Code
import sys
import re
# Asker's attempt (Python 2). `list` shadows the built-in of the same name.
list = []
list1 = []
i = 0
for n in sys.argv[1:]:
#list = []
#list1 = []
print n
# NOTE(review): the same path n is opened for writing here and for reading on
# the next statement; mode "w" truncates the file as soon as it is opened.
input1 = open(n, "w")
#print input1
output = open(n,"r")
# Collect the text after the first "=" of every line.
for line1 in output:
s = line1.split("=",1)[1:2]
for m in s:
list1.append(m.strip())
# NOTE(review): input1 was opened in write mode but is iterated (read) here.
for line in input1:
a = re.findall(r"%%([^%^\n]+)%%", line)
for val in a:
list.append(val)
# NOTE(review): a slice [i:0] is empty for every i >= 0, so stext and rtext
# are empty lists; rtext is then passed to str.replace, which expects a string.
stext = list[i:0]
rtext = list1[i:0]
input1.write(line.replace(val, rtext))
i += 1
input1.close()
output.close()
print list and list2 , list2 having values from file2.txt
file2.txt
demo=somehost
demo=somehost2
maxmemory=1025
DS_UUID = 454s5da5d4a
ip=127.0.0.1
I want to replace the placeholders in file1 with the values from file2. Please check my code and let me know how this can be done.
It's easy to find data inside well-known markers using regular expressions:
>>> import re
>>> re.findall(r"%%([^%^\n]+)%%", "hello %%there%% how\n are %%you%%")
['there', 'you']
From your updated example, you can extend the list instead of adding sublists
import fileinput
import re
# Collect every %%name%% placeholder from the input read by fileinput (the
# files named on the command line, or standard input when none are given).
placeholder = re.compile(r"%%([^%^\n]+)%%")
array = []
for line in fileinput.input():
    # extend() keeps one flat list instead of one sub-list per line.
    array.extend(placeholder.findall(line))
print(array)
fileinput.close()
Thanks to all for your time. I finally achieved what I wanted; my code is below.
import sys
import re
file1 = 'file1.json'
file2 = 'test-var.txt'

# Replacement values: the text after the first "=" on each line of file2.
# Lines without "=" contribute nothing.
list2 = []
with open(file2, "r") as output:
    for line1 in output:
        for m in line1.split("=", 1)[1:2]:
            # Strip the trailing newline (and surrounding spaces) so it is
            # not inserted into the middle of a line of file1.
            list2.append(m.strip())

# Placeholder names in order of appearance; a name that is already contained
# in an earlier entry gets "1" appended so it is replaced separately.
list1 = []
txt = ''
with open(file1, "r") as input1:
    for line in input1:
        a = ''.join(re.findall(r"%%([^%^\n]+)%%", line))
        if a == '':
            # No placeholder on this line: copy it unchanged.
            txt = txt + line
            continue
        if any(a in s for s in list1):
            val = '%%' + a + "1" + '%%'
            line = line.replace('%%' + a + '%%', val)
            a = a + "1"
        txt = txt + line
        list1.append(a)

# The i-th placeholder name is replaced by the i-th value from file2.
for i, name in enumerate(list1):
    txt = txt.replace('%%' + name + '%%', list2[i])

print(txt)
with open(file1, "w") as output:
    output.write(txt)

Searching a file with the contents of another file python

I have a file that has a unique ID number on each line. I am trying to search a different file for the occurrences of these ID numbers and return the line where these id numbers are in the second file, in this case into an output file. I am new to programming and this is what I have so far.
# Asker's attempt (Python 2): collect the lines of GO.txt that contain an ID
# from readID.txt and write them to GOlines.txt.
outlist = []
with open('readID.txt', 'r') as readID, \
open('GOlines.txt', 'w') as output, \
open('GO.txt', 'r') as GO:
# x is the list of all lines of readID.txt, newlines included.
x = readID.readlines()
print x
for line in GO:
# NOTE(review): x[1:-1] is a slice of that list, not a single ID string, yet
# it is used as the left operand of `in` against the string `line`.
if x[1:-1] in line:
outlist.append(line)
outlist.append('\n')
# The same test and appends are repeated a second time below.
if x[1:-1] in line:
outlist.append(line)
outlist.append('\n')
print outlist
output.writelines(outlist)
The files look like this: readID.txt
00073810.1
00082422.1
00018647.1
00063072.1
GO.txt
#query GO reference DB reference family
HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
HumanDistalGut_READ_00043244.3 GO:0022625 TIGRFAM TIGR00001
HumanDistalGut_READ_00048644.4 GO:0000315 TIGRFAM TIGR00001
HumanDistalGut_READ_00067264.5 GO:0003735 TIGRFAM TIGR00001
The read ids match up with some but not all of the ids after READ...
#!/usr/bin/env python
# encoding: utf-8
import sys
import re
def extract_id(line):
    """Return the read ID that follows "READ_" in *line*, or None.

    The ID must be eight digits, a dot and one digit.

    input: HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
    returns: 00048904.2
    """
    result = re.search(r'READ_(\d{8}\.\d)', line)
    return result.group(1) if result is not None else None
def extract_go_num(line):
    """Return the seven digits that follow "GO:" in *line*, or None.

    input: HumanDistalGut_READ_00048904.2 GO:0006412 TIGRFAM TIGR00001
    returns: 0006412
    """
    result = re.search(r'GO:(\d{7})', line)
    return result.group(1) if result is not None else None
def main(argv=None):
    """Write the GO number of every GO.txt line whose read ID is in readID.txt.

    Reads the wanted IDs from readID.txt (one per line), scans GO.txt once and
    writes one GO number per matching line to GOLines.txt.

    argv defaults to sys.argv; it is accepted but not otherwise used.
    Returns None.
    """
    if argv is None:
        argv = sys.argv
    with open('readID.txt', 'r') as f:
        # Strip each line: readlines() keeps the trailing newline, and
        # extract_id() returns the bare ID, so unstripped entries never match.
        ids = frozenset(line.strip() for line in f)
    with open('GO.txt', 'r') as haystack, \
            open('GOLines.txt', 'w') as output:
        for line in haystack:
            if extract_id(line) not in ids:
                continue
            go_num = extract_go_num(line)
            # A matching line without a GO term is skipped instead of
            # failing on None + '\n'.
            if go_num is not None:
                output.write(go_num + '\n')


if __name__ == "__main__":
    sys.exit(main())
I'm trading memory overhead for an O(n) solution rather than O(n^2).
I'm using regular expressions to extract the ids and go numbers, but it's brittle if the number of digits change.
Maybe something like this:
# Copy to GOlines.txt every line of GO.txt that contains one of the IDs
# listed in readID.txt.
with open('readID.txt', 'r') as readID, open('GOlines.txt', 'w') as output, open('GO.txt', 'r') as GO:
    # Read the IDs first, without their newlines and skipping blank lines.
    # GO can only be iterated once, so it is the outer (single) loop.
    ids = [ID.strip() for ID in readID if ID.strip()]
    for line in GO:
        if any(ID in line for ID in ids):
            output.write(line)
If your files are small enough to fit in your memory.
with open('/somepath/GO.txt') as f:
    pool = f.readlines()
with open('/somepath/readID.txt') as f:
    tokens = f.readlines()
# strip spaces/new lines; drop blank entries, because an empty string is
# contained in every line and would match all of them
tokens = [t.strip() for t in tokens if t.strip()]
# (token, zero-based line number in GO.txt) for every line containing the token
found = [(t, lno) for t in tokens for (lno, pool_line) in enumerate(pool) if t in pool_line]
You could then print your found list into your outfile.

Categories