Storing the data into dictionary with count - python

I have 2 text files that look like the ones below:
input.txt
shoes
memory card
memory card
input1.txt
shoes
shoes
I want to store the data in a dictionary where the query is the key and the counts are the value. input.txt and input1.txt are my two inputs, and the desired format is word : [count of word in input.txt, count of word in input1.txt]
like below:-
op:-
shoes:[1,2]
memory card:[2,0]

Here is my approach with normal loops:
# Build word -> [count_in_first, count_in_second] from two newline-separated
# strings of search queries.
text1 = 'shoes\nmemory card\nmemory card'
text2 = 'shoes\nshoes'
# NOTE: renamed from `dict`, which shadowed the builtin type; `print` updated
# to the Python 3 function form used elsewhere in this document.
counts = {}
for word in text1.split('\n'):
    if word not in counts:
        counts[word] = [1, 0]  # first sighting: one hit in file 1, none in file 2
    else:
        counts[word][0] += 1
for word in text2.split('\n'):
    if word not in counts:
        counts[word] = [0, 1]  # first sighting via file 2
    else:
        counts[word][1] += 1
print(counts)

You could do as follows:
from collections import Counter

# Count each stripped line of both input files.
with open('input.txt', 'r') as f:
    first_counts = Counter(line.strip() for line in f)
with open('input1.txt', 'r') as f:
    second_counts = Counter(line.strip() for line in f)
# For every unique word seen in either file, record its per-file counts
# (0 when the word never appears in that file).
out_dict = {
    word: [first_counts.get(word, 0), second_counts.get(word, 0)]
    for word in set(first_counts) | set(second_counts)
}
print(out_dict)
The result is:
{'shoes': [1, 2], 'memory card': [2, 0]}

actually i am getting data from some json file.
import os
import json

# Zero-width-space marker that distinguishes autocomplete queries from typed ones.
auto_prefix = u'\u200B'
path1 = "/home/mine/new-log/1-search-logs-01-20/"
# query (utf-8 bytes, lowercased) -> [autocomplete_count, typed_count]
# NOTE: renamed from `dict`, which shadowed the builtin type.
query_counts = {}
for path, dirs, files in os.walk(path1):
    for data in files:
        # FIX: join with the directory os.walk is currently visiting.
        # The original hard-coded `path1 + data`, which raises
        # FileNotFoundError for any file inside a subdirectory.
        with open(os.path.join(path, data)) as text:
            for line in text:
                file2 = json.loads(line)
                try:
                    query = file2["appData"]["_request_params"]["params"]["q"][0]
                except KeyError:
                    # Log entries without a search query are skipped.
                    continue
                # Slot 0 counts queries carrying the autocomplete marker,
                # slot 1 counts plain queries.
                slot = 0 if auto_prefix in query else 1
                # replace() is a no-op when the marker is absent.
                query = query.replace(auto_prefix, "").encode('utf-8').lower()
                query_counts.setdefault(query, [0, 0])[slot] += 1
print(query_counts)

Related

Removing lines that have strings with same hexadecimal values from a text file

I have a file in1.txt
info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"
Expected output: out.txt
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
I need only lines that have same info values and different data values to be written to out.txt.
but the current code removes all the lines that contain the string "data".
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
if 'data' not in line:
fout.write(line.strip()+'\n')
What I need is, for example: line 1 and line 2 have the same info="0x0000b573", and their data values "0x7" and "0x00000007" are numerically equal, so those lines should be removed.
You can use regex
import re

# Sample input: each line has hex-valued info/data attributes and an id.
s = '''info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"'''
# Extract (info, data) hex-string pairs, one per line.
parsed_data = re.findall(r'info="([^"]+)" data="([^"]+)" id="[^"]+"', s, re.MULTILINE)
# Convert both hex fields to int (so "0x7" == "0x00000007") and append the
# original line index; sorting groups equal info values next to each other.
parsed_data = sorted([list(map(lambda x: int(x, 16), i)) + [index] for index,i in enumerate(parsed_data)])
# Walk the sorted list two at a time and keep the line indices of pairs whose
# data values differ.
# NOTE(review): this assumes every info value occurs EXACTLY twice — with an
# odd count the pairing misaligns and the last pair raises IndexError.
# TODO confirm that guarantee holds for the real input.
row_numbers = [j for i in [[parsed_data[i][-1], parsed_data[i+1][-1]] for i in range(0,len(parsed_data),2) if parsed_data[i][1] != parsed_data[i+1][1]] for j in i]
# Re-emit the original lines whose indices survived, preserving input order.
final_output = []
for index,line in enumerate(s.split('\n')):
    if index in row_numbers:
        final_output.append(line)
final_out_text = '\n'.join(final_output)
print(final_out_text)
# info="0x00010AC3" data="0x00000003" id="abc_16. PS"
# info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
You could try something like that too, I think
#!/usr/bin/python3
# Count how many times each (info, data) combination occurs, then write out
# only the values that are unique for their info key.
# FIX: the original drove all dictionary insertion through nested
# try/except KeyError blocks; setdefault/get expresses the same logic
# directly, and the unused pre-declared lists (items/info/data) are gone.
records = {}
with open("in.dat", "r") as fin:
    for line in fin:
        items = line.split(' ')
        # info value, normalized to lowercase (hex case differences collapse)
        key = items[0].split('=')[1].strip('"').lower()
        # data value, normalized via int() so "0x7" == "0x00000007"
        value = str(int(items[1].split('=')[1].strip('"'), 16))
        by_value = records.setdefault(key, {})
        by_value[value] = by_value.get(value, 0) + 1
# Keep only data values seen exactly once for their info key.
out = {}
for key, counts in records.items():
    for value, seen in counts.items():
        if seen == 1:
            out.setdefault(key, []).append(value)
with open("out.dat", "w") as fout:
    for key, values in out.items():
        for value in values:
            fout.write(f"{key}={value}\n")
Something like this could work:
# Write only the first line seen for each info value.
# FIX: the original did `found_info_values += info`, which extends a list
# with the string's individual CHARACTERS, so the full info string never
# matched and every line was written. A set gives correct membership and
# O(1) lookups.
found_info_values = set()
with open("in.txt", "r") as fin, open("out.txt", "w") as fout:
    for line in fin:
        # info is the text inside the first pair of double quotes
        info = line.split('"')[1]
        if info not in found_info_values:
            fout.write(line.strip() + '\n')
            found_info_values.add(info)

How to parse this custom log file in Python3

The log file is generated by a program written in C++.
Here is the demo log:
|Frame:0|NUMBER:0
|Frame:1|NUMBER:1|{INDEX:0|RECT:[11,24][31,43]}
|Frame:2|NUMBER:2|{INDEX:0|RECT:[11,24][31,43]}|{INDEX:1|RECT:[11,24][31,43]}
|Frame:3|NUMBER:0
I am trying to read those log files into a list/dict or etc.
Here is the information that I hope to capture from the demo log above:
#frame, number, index, rect
[0, 0]
[1, 1, 0, 11,24,31,43]
[2, 2, 0, 11,24,31,43, 1, 11,24,31,43]
[3, 0]
Thanks to @Juan Facundo Peña.
This answer is based on his answer, with some improvements to handle the duplicate keys.
import re

# Parse "|Frame:n|NUMBER:n|{INDEX:i|RECT:[..][..]}..." lines into one dict per
# frame; each INDEX's RECT is stored under a distinct key ("INDEX[i]") so
# repeated INDEX/RECT pairs no longer clobber each other.
# FIX: removed the unused `program_result` list and the `rect_idx` local that
# was re-assigned every iteration but never read.
code_list = []
with open("2.log", "r") as f:
    logs = f.readlines()
for line in logs:
    if not line.startswith("|Frame:"):
        continue
    parsed_line = line.split("|")
    code_dict = {}
    next_rect_idx_key = ""  # pending key for the RECT that follows an INDEX token
    for parse in parsed_line:
        split_line = parse.strip("{}").split(":")
        key = split_line[0]
        if not key:
            continue  # empty token from the leading '|'
        data_as_integers = [int(s) for s in re.findall(r"\d+", split_line[-1])]
        if next_rect_idx_key:
            # Previous token was INDEX:i — this token is its RECT payload.
            code_dict[next_rect_idx_key] = data_as_integers
            next_rect_idx_key = ""
        elif key == 'INDEX':
            # Defer storing until the matching RECT arrives.
            next_rect_idx_key = key + str(data_as_integers)
        else:
            code_dict[key] = data_as_integers
    print(code_dict)
    code_list.append(code_dict)
This can be solved using the re library.
import re

# One dict per log line: token name -> list of digit strings found after it.
code_list = []
with open("log_file.log", "r") as f:
    logs = f.readlines()
for raw in logs:
    entry = {}
    for token in raw.split("|"):
        parts = token.split(":")
        if parts[0]:  # skip the empty token produced by the leading '|'
            entry[parts[0]] = re.findall(r"\d+", parts[-1])
    code_list.append(entry)
You will end up with a list of dictionaries (i.e.:code_list), each of which contains both the key and the values in each line.
In line 3, you will have two "INDEX - RECT" dictionaries, but you can then split the whole logs list by "Frame" to understand what codes belong to what line (if needed).
If you only wish for the numbers, you can also try:
import re

# One list of digit strings per log line, in order of appearance.
with open("log_file.log", "r") as f:
    code_list = [re.findall(r"\d+", line) for line in f.readlines()]
This approach will give you a list of lists, each of which contains a single line.
Edit: if you try to loop through a single string other than a file, try:
import re
code_list = []
logs = log_string.split("\n")
for line in logs:
# <<<business as usual>>>

Count number of occurrences of a words list in multiple text files

I have a list of words :
words = ["hello","my","name"]
files = ["file1.txt","file2.txt"]
What I want is to count the number of occurrences of every single word of the list across all the text files.
My work so far:
import re

# FIX: the original appended one "word : count" entry per word PER FILE and
# silently swallowed open() failures with a bare `except: pass`, then used the
# stale (or undefined) handle anyway, and never closed any file. This version
# totals counts across all readable files and uses context managers.
# Expects `words` (list of words) and `files` (list of paths) defined earlier.
counts = {}
for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except OSError:
        continue  # skip unreadable files instead of crashing later
    with fichier:
        data = fichier.read()
    for wrd in words:
        hits = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
        counts[wrd] = counts.get(wrd, 0) + hits
occ = ["{} : {}".format(wrd, counts.get(wrd, 0)) for wrd in words]
with open("occurence.txt", "w", encoding="utf-8") as texto:
    for ww in occ:
        texto.write(ww + "\n")
So this code works fine with a single file, but when I try a list of files it only gives me the result of the last file.
Use json to store the count.
Ex:
import json
# Read Json
# NOTE(review): `words` and `re` must be defined earlier in the caller's
# script; they are not defined in this snippet.
with open('data_store.json') as jfile:
    data = json.load(jfile)
for wrd in words:
    # NOTE(review): `data` here is the object json.load returned (a dict for
    # the intended store), but re.finditer requires a string — this line
    # raises TypeError as written. Presumably the text to scan should be read
    # from the input files, not from the JSON store. TODO confirm intent.
    count = sum(1 for _ in re.finditer(r'\b%s\b' % re.escape(wrd), data))
    if wrd not in data:
        data[wrd] = 0
    data[wrd] += count # Increment Count
# Write Result to JSON
with open('data_store.json', "w") as jfile:
    json.dump(data, jfile)
Use a dictionary instead of a list:
import re

# Total occurrences of each word across every readable file.
occ = {} # Create an empty dictionary
words = ["hello", "my", "name"]
files = ["f1.txt", "f2.txt", "f3.txt" ]
for file in files:
    try:
        fichier = open(file, encoding="utf-8")
    except:
        pass  # unreadable file: skip (else-branch below never runs)
    else:
        data = fichier.read()
        for wrd in words:
            word_pattern = r'\b%s\b' % re.escape(wrd)
            count = sum(1 for _ in re.finditer(word_pattern, data))
            # accumulate across files; first sighting starts from 0
            occ[wrd] = occ.get(wrd, 0) + count
print(occ)
occ_list = [ f"{key} : {value}" for key, value in occ.items() ]

How to split a string into parts to populate a dict in a loop

I have a list of files I need to draw information from and I want to populate a dict with it. I know how to extract the information with a regular expression but the dict part mystifies me.
Code
import re
mylist = ['anna01','bobby03','dean120']
mydict = {}
Intended result
mydict = {'anna01': 1, 'bobby03':3, 'dean120':120}
The regex I use in the actual problem is :
# Collect .bmp files and pull the duty-cycle percentage out of each filename.
# NOTE(review): `path`, `files`, `get_burst_from_name`, `os` and `re` must be
# defined earlier in the asker's script; they are not defined in this snippet.
for file in os.listdir(path):
    if file.endswith('.bmp'):
        image_name = file
        files.append(os.path.join(path, file))
        print(os.path.join(path, file))
        print(get_burst_from_name(file))
        print(image_name)
        # match "_<digits>%" in the filename, e.g. "_50%"
        pattern = '_\d*%'
        # NOTE(review): [0] raises IndexError for filenames without a
        # "_NN%" marker — TODO confirm every .bmp name carries one.
        result = re.findall(pattern, file)[0]
        result = result.replace("_","")
        duty = result.replace("%","")
        print('duty=', duty)
where the key for a dict would be 'file' and the value would be 'duty'
i think maybe this little code will help you:
import re

# Map each entry to the integer value of its trailing digits,
# e.g. 'anna01' -> 1.
mylist = ['anna01','bobby03','dean120']
myDict = {}
# Name = everything before the digits, Number = the trailing digits.
pattern = '(?P<Name>[^\d]+)(?P<Number>\d+)'
# FIX: iterate the values directly — the original's enumerate() index
# was never used.
for value in mylist:
    searchedRegex = re.search(pattern, value)
    if searchedRegex:  # entries without digits are skipped
        number = searchedRegex.group("Number")
        myDict[value] = int(number)
print(myDict)
Output
{'anna01': 1, 'bobby03': 3, 'dean120': 120}
Exactly as you called.
And if you just want to have their names(without number), then you can use group("Name") as i prepared it in the regex.
# Split each entry into its alphabetic prefix and numeric suffix,
# e.g. 'anna01' -> mydict['anna'] = 1.
mylist = ['anna01','bobby03','dean120']
mydict = {}
for i in mylist:
    word = ''
    num = ''
    for j in i:
        if j.isalpha():
            word += j
        else:
            num += j
    # FIX: the manual leading-zero strip (`if num[0] == '0': num = num[1:]`)
    # was redundant — int() already ignores leading zeros — and it raised
    # IndexError for entries containing no digits at all.
    mydict[word] = int(num)
print(mydict)
Result
{'dean': 120, 'anna': 1, 'bobby': 3}
I think you'll get your desired output.
import re

# Key = leading non-digit run, value = first digit run as an int,
# e.g. 'anna01' -> {'anna': 1}.
mylist = ['anna01','bobby03','dean120']
mydict = {
    re.findall("\D+", entry)[0]: int(re.findall("\d+", entry)[0])
    for entry in mylist
}
print("mydict = {}".format(mydict))
Output
mydict = {'anna': 1, 'bobby': 3, 'dean': 120}

How to remove Duplicates in .txt file

I have a .txt file with the below entries:-
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
I am trying to remove the entire row (when duplicate entries are found) from this file whenever a word (say Apples) matches in that row (keeping in mind that the entry with the highest value stays).
What I presently do:-
1. Open the file in Excel.
2. Go to Data --> Remove Duplicates
The issue with this approach according to me is that I am not sure whether the end result gives me the data with highest values all the time or not.
So, How can it be programmatically (in python, preferably) done?
Here are 2 solutions one in Python and another in Nodejs without using third party libraries:
Python:
import re

# Keep the highest value per fruit; lines are "name<4 spaces>value".
fruit = {}
with open('data.txt', 'r') as file:
    for line in file:
        line = line.rstrip('\n')
        if not line:
            continue  # FIX: a trailing blank line crashed the 2-way unpack
        key, value = re.split(r'\s{4}', line)
        if key not in fruit or int(fruit[key]) < int(value):
            fruit[key] = value
# FIX: write "name value" lines directly. The original round-tripped the dict
# through json.dumps and two regex substitutions, which breaks whenever a
# name contains a stripped character or ", ".
with open('fruits.txt', 'w') as file:
    file.write('\n'.join('{} {}'.format(k, v) for k, v in fruit.items()))
Nodejs:
import fs from 'fs'
const file = fs.readFileSync('data.txt', 'utf8');
const lines = file.split('\n');
// name -> highest value seen (values kept as strings, compared numerically)
let fruit = {}
for (const line of lines) {
  // NOTE(review): assumes fields are separated by exactly 4 spaces — confirm
  // against the real file; a blank trailing line yields undefined key/value.
  const [key, value] = line.split(/\s{4}/)
  // keep the new value when the key is unseen or the stored value is smaller
  !fruit[key] || +fruit[key] < +value ? fruit[key] = value : null
}
// Serialize as "name value" lines by stripping JSON punctuation.
fruit = JSON.stringify(fruit)
  .replace(/["{}]/g, '')
  .replace(/:/g, ' ')
  .replace(/,/g, '\n')
fs.writeFileSync('fruits.txt', fruit)
The intuitive way is to use dictionaries:
# Keep the highest value seen for each word in "word value" lines.
# FIX: the original's `else` branch assigned unconditionally, so the LAST
# value in the file won instead of the maximum; it also never closed either
# file handle — `with` handles both.
my_dict = {}
with open('test.txt', 'r') as f:
    for line in f.readlines():
        s_line = line.split()
        # store when the word is new OR the stored value is smaller
        if s_line[0] not in my_dict or my_dict[s_line[0]] < int(s_line[1]):
            my_dict[s_line[0]] = int(s_line[1])
with open('test_no_duplicates.txt', 'w') as new_f:
    for key in my_dict:
        new_f.write(key + " " + str(my_dict[key]) + "\n")
That would probably work
from collections import defaultdict

# Keep the highest value per word, then write "word value" lines.
filename1 = ""
filename2 = ""
words = defaultdict(int)  # missing words default to 0, so any value wins
with open(filename1) as f1:
    for line in f1:
        word, value = line.strip().split()
        if int(value) > words[word]:
            words[word] = int(value)
with open(filename2, "w") as f2:
    for word, value in words.items():
        # FIX: write() needs a string — the original passed the raw
        # (word, value) tuple, which raises TypeError.
        f2.write(f"{word} {value}\n")
If you have pandas data frame then:
import pandas
# NOTE(review): `filepath` must be supplied by the caller; it is not defined
# in this snippet.
df = pandas.read_csv(filepath)
# Highest value per name.
# NOTE(review): assumes the parsed frame has columns named 'Name' and
# 'values' — the sample data is space-separated with no header, so a
# sep/names argument is probably required. TODO confirm the input format.
result = df.groupby('Name').agg({'values': 'max'})
print(result)
from pathlib import Path
import pandas as pd
import numpy as np

# Keep the row with the highest value per fruit, writing "Fruit Value" lines.
textFile = Path("./sample1.txt")
text = textFile.read_text()
# skip blank rows — a trailing newline otherwise yields [''] and x[1] crashes
entries = [x.split(" ") for x in text.split("\n") if x]
data = {
    "Fruits": [x[0] for x in entries],
    # FIX: convert to int. As strings the groupby max was LEXICOGRAPHIC —
    # max("51824", "5879") == "5879" — which is exactly the wrong output the
    # original produced ("Apples 5879" instead of "Apples 51824").
    "Values": [int(x[1]) for x in entries],
}
df = pd.DataFrame(data)
new_df = df.groupby(["Fruits"]).max()
new_df.reset_index(inplace=True)
np.savetxt("./out.txt", new_df.values, fmt='%s')
Example:
sample1.txt
Apples 51824
Oranges 131236
Peaches 6564
Apples 5879
Peaches 69878
out.txt
Apples 5879
Oranges 131236
Peaches 69878
Here's a quick solution in just a few lines, and outputs a nice and flat CSV file.
Code:
import pandas as pd

# Parse "fruit count" lines, keep the max count per fruit, emit flat CSV.
with open('apples.txt') as f:
    rows = [line.strip().split() for line in f.readlines()]
    text = [[name, int(count)] for name, count in rows]
frame = pd.DataFrame(text, columns=['fruit', 'count'])
deduped = frame.groupby('fruit').agg({'count': 'max'}).reset_index()
deduped.to_csv('apples_out.txt', index=False)
Output:
fruit,count
Apples,51824
Oranges,131236
Peaches,69878
Use dictionary to remember best value/line pair for each fruit:
# Remember, per fruit, the (value, original line) pair with the largest value,
# then print the surviving lines.
results = {}
with open('file.txt') as f:
    for raw in f:
        name, amount = raw.split()
        amount = int(amount)
        current = results.get(name)
        if current is None or current[0] < amount:
            results[name] = (amount, raw.strip())
print('\n'.join(pair[1] for pair in results.values()))

Categories