I am working on vector space model, data set consists of 50 text files. Iterating through them splitting into words and saving them in dictionary. Now i want to use nested dictionary like:
dictionary = { someword:    {Doc1: 23, Doc21: 2, Doc34: 3},
               otherword:   {Doc1: 23, Doc21: 2, Doc34: 3},
               anotherword: {Doc1: 23, Doc21: 2, Doc34: 3}
             }
but when I run my program, it not only replaces the document entry — it also fails to accumulate the frequency, i.e. it never adds up how many times 'someword' occurred in a particular document.
# Build terms = {word: {"DocN": count}} across the 50 corpus files.
# BUG FIX: the original called existence(), which always returned 0 for an
# already-seen word (it looked the doc key up in the wrong dict level), so
# terms[word] was reset to {} on every occurrence -- wiping earlier documents
# and pinning every count at 1.  A plain membership test plus dict.get()
# accumulates correctly and needs no helper at all.
for iterator in range(1, 51):
    f = open(directory + str(iterator) + ext, "r")
    for line in f.read().lower().split():
        line = getwords(line)
        for word in line:
            if check(word, stopwords) == 0:
                doc_key = "Doc" + str(iterator)
                if word not in terms:
                    terms[word] = {}
                # Increment this document's count, starting from 0.
                terms[word][doc_key] = terms[word].get(doc_key, 0) + 1
    f.close()
The existence function is:
def existence(tok, diction, iteration):
    """Return 1 if *tok* already has a count recorded for document *iteration*
    in the nested dict *diction* ({word: {"DocN": count}}), else 0.

    BUG FIX: the original tested ``temp in diction`` -- the top level, whose
    keys are words, not "DocN" strings -- so it always returned 0 and the
    caller kept resetting the word's entry.  The doc key must be looked up
    in the inner dict, diction[tok].
    """
    if tok in diction:
        temp = "Doc" + str(iteration)
        if temp in diction[tok]:
            return 1
    return 0
The result looks somewhat like this:
{'blunder': {'Doc1': 1}, 'by': {'Doc50': 1}, 'anton': {'Doc27': 1}, 'chekhov': {'Doc27': 1}, 'an': {'Doc50': 1}, 'illustration': {'Doc48': 1}, 'story': {'Doc48': 1}, 'author': {'Doc48': 1}, 'portrait'...
Do you want to know how many times each word appears in each file? This is easily accomplished with a defaultdict of Counters, courtesy of the collections module.
You've got the right idea I think, looping over the files, reading line by line and splitting into words. It's the counting part you need help with.
from collections import defaultdict, Counter
from string import punctuation

# One Counter per distinct word, keyed by the file the word appeared in:
# word_counter[word][filename] -> occurrence count.
fnames = ['1.txt', '2.txt', '3.txt', '4.txt', '5.txt']
word_counter = defaultdict(Counter)
for fname in fnames:
    with open(fname, 'r') as txt:
        for raw_line in txt:
            for token in raw_line.lower().strip().split():
                # Trim surrounding punctuation; skip tokens that were
                # nothing but punctuation.
                token = token.strip(punctuation)
                if token:
                    word_counter[token][fname] += 1
The data will look like this inside word_counter:
{
'within': {
'1.txt': 2,
},
'we': {
'1.txt': 3,
'2.txt': 2,
'3.txt': 2,
'4.txt': 2,
'5.txt': 4,
},
'do': {
'1.txt': 7,
'2.txt': 8,
'3.txt': 8,
'4.txt': 6,
'5.txt': 5,
},
...
}
Related
I'm trying to parse a huge collection of JSON files: around 60,000 files (ranging from 100 KB to 700 MB, 1.8 TB in total). I wrote this script to parse each JSON file, extract some features, and export them to a CSV file. It works, but it is extremely slow — some of the JSON files take more than 30 minutes to parse — and with my limited Python experience I couldn't make it faster. Is there any way to speed it up, given that I need to parse this huge collection soon? I'm posting a snippet of my code; I know it's a little rough.
And here is sample of my JSON files please feel free to check
https://gofile.io/d/vddzHY
# Stream features out of each JSON report with ijson and collect one row per
# file.  BUG FIX: the final banner used ``Print(...)`` (capital P), a
# NameError; the flattened indentation of the paste is also reconstructed.
count1 = 0
my_file_list = [f for f in glob.glob(r"E:\JsonOrgnized\Pach\*.json")]
final_result = []
for filename in my_file_list:
    try:
        with open(filename, 'r', encoding='utf8', errors='ignore') as f:
            row = {}
            # Each section below rewinds the file and re-streams it for one
            # JSON path -- simple, but it re-reads the file per feature.
            info = ijson.items(f, 'info')
            f.seek(0)
            for o in info:
                row['AA-Added'] = float(o.get('added'))
                row['AB-Started'] = float(o.get('started'))
                row['AC-Duration'] = o.get('duration')
                row['AD-Ended'] = float(o.get('ended'))
            f.seek(0)
            domains = ijson.items(f, 'network.domains.item')
            domain_count = 0
            for domain in domains:
                domain_count += 1
            row['AE-DomainCount'] = domain_count
            f.seek(0)
            signatures = ijson.items(f, 'signatures.item')
            signature_count = 0
            for signature in signatures:
                signature_count += 1
            row['AF-SignatureCount'] = signature_count
            f.seek(0)
            domains = ijson.items(f, 'behavior.generic.item')
            domain_count = 0
            for domain in domains:
                domain_count += 1
            row['AG-GenericCount'] = domain_count
            f.seek(0)
            apistats = ijson.items(f, 'behavior.apistats')
            apistat_count = 0
            for apistat in apistats:
                # Count the keys of the apistats mapping.
                for inner_apistat in apistat:
                    apistat_count += 1
            row['AH-ApistatCount'] = apistat_count
            f.seek(0)
            processes = ijson.items(f, 'behavior.processes.item')
            process_count = 0
            for process in processes:
                process_count += 1
            row['AI-ProcessCount'] = process_count
            f.seek(0)
            summaries = ijson.items(f, 'behavior.summary')
            summary_count = 0
            for summary in summaries:
                for inner_summary in summary:
                    summary_count += 1
            row['AJ-SummaryCount'] = summary_count
            f.seek(0)
            # Merge per-process apistat counters into the row.
            # NOTE(review): Counter addition over row values assumes they are
            # all numeric at this point -- confirm against the JSON schema.
            apistats_element = ijson.items(f, 'behavior.apistats')
            for inner_apistats in apistats_element:
                for index, inner_fields in inner_apistats.items():
                    row = dict(Counter(row) + Counter(inner_fields))
            row['AK-Filename'] = os.path.basename(filename)
    except Exception as e:
        # On any parse error, discard the partial row for this file.
        # print(f"Filename (unknown) has issue with {e}")
        row = {}
    if row:
        final_result.append(row)
        count1 += 1
        print("File Number", count1, "Is Finished!")
print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")
This seems to be a little faster and I think cleaner.
We will use one of the more "lower level" calls from ijson. and based on the paths we get take some sort of action.
We will store paths of interest and the actions to take when encountered in a little work dictionary.
import ijson
import os
def fn_set_value(row, key, value):
    """Store *value* under *key* in the *row* dict (overwrites any old value)."""
    row[key] = value
def fn_increment_count(row, key):
    """Bump the counter stored at *key* in *row*, treating a missing key as 0."""
    current = row.get(key, 0)
    row[key] = current + 1
# ---------------------
# When these keys (tuples) are encountered, we will take the corresponding action.
# Each key is an (ijson prefix, ijson event) pair; the value mutates `row`.
# ---------------------
work = {
    ("info.added", "number"): lambda row, value: fn_set_value(row, "AA-Added", value),
    ("info.started", "number"): lambda row, value: fn_set_value(row, "AB-Started", value),
    ("info.duration", "number"): lambda row, value: fn_set_value(row, "AC-Duration", value),
    ("info.ended", "number"): lambda row, value: fn_set_value(row, "AD-Ended", value),
    ("network.domains.item", "start_map"): lambda row, value: fn_increment_count(row, "AE-DomainCount"),
    ("signatures.item", "start_map"): lambda row, value: fn_increment_count(row, "AF-SignatureCount"),
    ("behavior.generic.item", "start_map"): lambda row, value: fn_increment_count(row, "AG-GenericCount"),
    ("behavior.apistats", "map_key"): lambda row, value: fn_increment_count(row, "AH-ApistatCount"),
    ("behavior.processes.item", "start_map"): lambda row, value: fn_increment_count(row, "AI-ProcessCount"),
    ("behavior.summary", "map_key"): lambda row, value: fn_increment_count(row, "AJ-SummaryCount"),
}
# ---------------------
# Your initial set of files
# ---------------------
my_file_list = [
    "d:/temp/foo/report1.json",
    "d:/temp/foo/report2.json",
    "d:/temp/foo/report3.json",
    "d:/temp/foo/report4.json",
    "d:/temp/foo/report5.json"
]
# ---------------------
final_result = []
for index, filename in enumerate(my_file_list):
    print(f"Processing file {index+1} from (unknown)")
    try:
        row = {}
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            # Single streaming pass over the file: dispatch every interesting
            # (prefix, event) pair to its action.
            for event in ijson.parse(f):
                key = (event[0], event[1])
                # Membership-test the dict itself; `.keys()` was redundant.
                if key in work:
                    work[key](row, event[2])  # use it to take an action on row
        row["AK-Filename"] = os.path.basename(filename)
        final_result.append(row)
    except Exception as e:
        print(f"\tUnable to process \"(unknown)\": {e}")
        # retry with ascii or having stripped out the bad character?
        pass
print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")
print(final_result)
This produce this result in a couple of seconds.
[
{
'AA-Added': Decimal('1631536343.897729'),
'AB-Started': Decimal('1631536440.728626'),
'AC-Duration': 21,
'AD-Ended': Decimal('1631536461.778441'),
'AE-DomainCount': 3,
'AF-SignatureCount': 5,
'AG-GenericCount': 3,
'AH-ApistatCount': 2,
'AI-ProcessCount': 3,
'AJ-SummaryCount': 14,
'AK-Filename': 'report1.json'
},
{
'AA-Added': Decimal('1631536343.90739'),
'AB-Started': Decimal('1631536461.849837'),
'AC-Duration': 12,
'AD-Ended': Decimal('1631536474.755813'),
'AE-DomainCount': 3,
'AF-SignatureCount': 2,
'AG-GenericCount': 2,
'AH-ApistatCount': 1,
'AI-ProcessCount': 2,
'AJ-SummaryCount': 2,
'AK-Filename': 'report2.json'
},
{
'AA-Added': Decimal('1631536343.962804'),
'AB-Started': Decimal('1631536692.972615'),
'AC-Duration': 312,
'AD-Ended': Decimal('1631537005.710977'),
'AE-DomainCount': 4,
'AF-SignatureCount': 36,
'AG-GenericCount': 13,
'AH-ApistatCount': 12,
'AI-ProcessCount': 13,
'AJ-SummaryCount': 22,
'AK-Filename': 'report3.json'
},
{
'AA-Added': Decimal('1631536344.049105'),
'AB-Started': Decimal('1631537026.918725'),
'AC-Duration': 316,
'AD-Ended': Decimal('1631537342.92093'),
'AE-DomainCount': 3,
'AF-SignatureCount': 16,
'AG-GenericCount': 4,
'AH-ApistatCount': 3,
'AI-ProcessCount': 4,
'AJ-SummaryCount': 16,
'AK-Filename': 'report4.json'
},
{
'AA-Added': Decimal('1631536344.112968'),
'AB-Started': Decimal('1631537322.81162'),
'AC-Duration': 14,
'AD-Ended': Decimal('1631537337.342377'),
'AE-DomainCount': 3,
'AF-SignatureCount': 1,
'AG-GenericCount': 2,
'AH-ApistatCount': 1,
'AI-ProcessCount': 2,
'AJ-SummaryCount': 7,
'AK-Filename': 'report5.json'
}
]
So I created a helper function to help my main function in extracting stuff from a dictionary...
and here is my code and function
def rdict(recipes):
    """Parse raw "Name:Item*Qty,Item*Qty" strings into a nested dict.

    Returns {recipe_name: {product: int(quantity)}}.
    """
    recipes_splitted = {}
    for entry in recipes:
        recipe_name, parts = entry.split(":")
        recipes_splitted[recipe_name] = {
            product: int(number)
            for product, number in (part.split('*') for part in parts.split(','))
        }
    return recipes_splitted
def extract(recipes, data):
    """For each requested recipe, format its mapping as "key:value, key:value".

    *data* is expected to already be a nested dict {recipe: {key: value}}.
    Returns a list with one formatted string per requested recipe.
    """
    result = []
    for name in recipes:
        pieces = [f"{key}:{value}" for key, value in data[name].items()]
        # ", ".join produces the same "a:1, b:2" string the manual
        # concatenation loop built.
        result.append(", ".join(pieces))
    return result
So what I'm trying to do is make sure the data in extract(recipes, data) goes through rdict(data), since rdict will convert data into a dictionary, which is what I need. However, when I tried doing for key in rdict(data[r]):, the output returns an error: "string is not subscriptable".
what should I do to successfully implement the changes??
Edit
So from my current code, here is a sample input..
print(extract(recipes = ['T-Bone', 'Green Salad1'],data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
))
and in order for my code to work, it has to be like this
print(extract(recipes = ['T-Bone', 'Green Salad1'], data = {'Pork Stew': {'Cabbage': 5, 'Carrot': 1, 'Fatty Pork': 10}, 'Green Salad1': {'Cabbage': 10, 'Carrot': 2, 'Pineapple': 5},'T-Bone': {'Carrot': 2, 'Steak Meat': 1}}))
So from the input, data should be changed from
data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
to
data = {'Pork Stew': {'Cabbage': 5, 'Carrot': 1, 'Fatty Pork': 10}, 'Green Salad1': {'Cabbage': 10, 'Carrot': 2, 'Pineapple': 5},'T-Bone': {'Carrot': 2, 'Steak Meat': 1}}
Convert the data to dict in extract().
# Sample input: the recipes to look up and the raw "Name:Item*Qty,..." strings
# that extract() will first convert into a nested dict.
recipes = ['T-Bone', 'Green Salad1']
data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
def rdict(recipes):
    """Build {recipe_name: {product: int(quantity)}} from raw recipe strings."""
    out = {}
    for raw in recipes:
        name, spec = raw.split(":")
        quantities = {}
        for chunk in spec.split(','):
            product, amount = chunk.split('*')
            quantities[product] = int(amount)
        out[name] = quantities
    return out
def extract(recipes, data):
    """Format the requested recipes as "key:value, key:value" strings.

    *data* arrives as a list of raw "Name:Item*Qty,..." strings and is first
    converted to a nested dict via rdict().
    """
    data = rdict(data)  # convert data to dict first
    formatted = []
    for name in recipes:
        pieces = [f"{ingredient}:{amount}" for ingredient, amount in data[name].items()]
        formatted.append(", ".join(pieces))
    return formatted
print(extract(recipes, data))
Output:
['Carrot:2, Steak Meat:1', 'Cabbage:10, Carrot:2, Pineapple:5']
Renamed rdict to parse_recipe, and modified it to return a tuple that is lighter and easier to process
In extract:
a) Build a dict of recipes: data_recipes
b) Build the result by getting the wanted recipes, with a guard against a missing recipe (which would map to an empty dict: {})
def parse_recipe(s):
    """Split one "Name:Ing*Qty,Ing*Qty" string into (name, {ingredient: qty}).

    Quantities are kept as (stripped) strings; names and ingredients are
    whitespace-stripped as well.
    """
    name, ingredient_spec = s.split(':')
    quantities = {}
    for item in ingredient_spec.split(','):
        ingredient, qty = item.split('*')
        quantities[ingredient.strip()] = qty.strip()
    return name.strip(), quantities
def extract(recipes, data):
    """Return {recipe: ingredients} for each requested recipe.

    *data* is a list of raw recipe strings; a requested recipe that is not
    present in *data* maps to an empty dict.
    """
    parsed = {}
    for raw in data:
        name, ingredients = parse_recipe(raw)
        parsed[name] = ingredients
    return {name: parsed.get(name, {}) for name in recipes}
# Ensure the nested levels exist before counting.
if filename not in dict1:
    dict1[filename] = {}
if transId not in dict1[filename]:
    dict1[filename][transId] = {}
# BUG FIX: the original only incremented when error_type already existed, so
# a brand-new error type was never written and counting never started.
# dict.get with a default of 0 handles both the first and later occurrences.
per_trans = dict1[filename][transId]
per_trans[error_type] = per_trans.get(error_type, 0) + 1
dict data is :
{'abc': {'ACE12345678': {'ERR-2': 2, 'ERR-3': 4}}}
where 'abc' is a filename, 'ACE12345678' a TransId, and 'ERR-2' an Error Type.
I would also like to add loglines for each transid(Eg: 'ACE12345678') so that the dict looks like as below :
{'abc': {'ACE12345678': {'ERR-2': 2, 'loglines': ['data1\n', 'data2\n', 'data3\n']}}} (and similarly for 'ERR-3').
Can someone help me getting this output.
you can add a new key loglines that holds all the lines in a list:
dict1 = {'abc': {'ACE12345678': {'ERR-2': 2}}}
filename = 'abc'
transID = 'ACE12345678'
error_type = 'ERR-2'
logline = 'data1\n'

# Walk (or create) the nested filename -> transID levels, then bump the
# error count and record the log line under a 'loglines' list.
per_trans = dict1.setdefault(filename, {}).setdefault(transID, {})
per_trans[error_type] = per_trans.get(error_type, 0) + 1
per_trans.setdefault('loglines', []).append(logline)
print(dict1)
output:
{'abc': {'ACE12345678': {'ERR-2': 3, 'loglines': ['data1\n']}}}
I have a json file with objects and a text file with several groups (Each group have 5 numbers and I have them in a list this way: the first number of each group are in list 1, the second number of each group, are in list 2, etc). I basically have to match each object of the json with each group I created. The problem is that Im getting as result the last element from the Json. The groups from the text file are created in the correct way.
This is my code:
import json
NUM_LIST = 5
index = 0
def report(a, b, c, d, e, index):
json_file = 'json_global.json'
json_data = open(json_file)
data = json.load(json_data)
i = 0
index = 0
item = 0
cmd = " "
ind = 0
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
index+= 1
print item, cmd, a, b, c, d, e
# Read the throughput file and distribute the numbers round-robin:
# the first number of each group goes to groups[0], the second to groups[1],
# and so on; the "Transactions/Sec for Group" header lines are skipped.
f = open("Output.txt", "r")
lines = [line.rstrip() for line in f if line != "\n"]
f.close()  # BUG FIX: the handle was never closed
NUM_LISTS = 5
groups = [[] for i in range(NUM_LISTS)]
listIndex = 0
for line in lines:
    if "Transactions/Sec for Group" not in line:
        groups[listIndex].append(float(line))
        listIndex += 1
        if listIndex == NUM_LISTS:
            listIndex = 0
value0 = groups[0]
value1 = groups[1]
value2 = groups[2]
value3 = groups[3]
value4 = groups[4]
for i in range(0, 5):
    a = value0[i]
    b = value1[i]
    c = value2[i]
    d = value3[i]
    e = value4[i]
    # BUG FIX: the original also did 'i += 1' here -- a no-op inside a
    # Python for loop that only invited confusion -- removed.
    report(a, b, c, d, e, index)
# NOTE(review): 'index' is the module-level 0 on every call; confirm whether
# report() should instead receive the loop variable 'i'.
The Json file looks like:
[
{
"item": 1,
"command": "AA"
},
{
"item": 2,
"command": "BB",
},
{
"item": 3,
"command": "CC",
},
{
"item": 4,
"command": "DD",
},
{
"item": 5,
"command": "EE",
}
]
The text file looks like this:
Transactions/Sec for Group = AA\CODE1\KK
1011.5032
2444.8864
2646.6893
2740.8531
2683.8178
Transactions/Sec for Group = BB\CODE1\KK
993.2360
2652.8784
3020.2740
2956.5260
3015.5910
Transactions/Sec for Group = CC\CODE1\KK
1179.5766
3271.5700
4588.2059
4174.6358
4452.6785
Transactions/Sec for Group = DD\CODE1\KK
1112.2567
3147.1466
4014.8404
3913.3806
3939.0626
Transactions/Sec for Group = EE\CODE1\KK
1205.8499
3364.8987
4401.1702
4747.4354
4765.7614
The logic in the body of the program works fine, and the groups come out OK — but instead of printing items 1 to 5 from the JSON file, every line shows item 5 with command EE. It should instead show items 1, 2, 3, 4, 5 with their commands.
My list 1 will have the numbers: 1011.5032, 993.2360, 1179.5766, 1112.2567, 1205.8499.
My list 2 will have the numbers: 2444.8864, 2652.8784, 3271.5700, 3147.1466,
The python version I'm using is 2.6
Based on your explanation it's hard to tell what you're trying to do -- do you mean the nested loop below? The inner loop executes 5 times, but in every iteration it overwrites the previous values for item and cmd.
# (quoted from the question, indentation lost in the paste) The inner loop
# overwrites item/cmd on every pass, and the outer 'for node in data' merely
# repeats the identical inner work once per node.
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
index+= 1
Try printing the values each time the inner loop executes:
# Suggested diagnostic (indentation lost in the paste): printing inside the
# inner loop shows each (item, cmd) pair instead of only the final one.
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
print item, cmd
index+= 1
I think this code is your problem:
# (quoted excerpt, indentation lost) After this runs, item is always 5 and
# cmd is always "EE" -- the values are clobbered on each iteration.
for node in data:
for i in range(0, 5):
item = data[i]['item']
cmd = data[i]['command']
Item will always be "5" and command will always be "EE" after this executes. Perhaps your indents are off for the code beneath it, and that code is supposed to be within the loop?
I want to split keys and values and display the dictionary result in the format shown below. I'm reading a file, splitting the data into a list, and then moving it into a dictionary.
Please help me to get the result.
INPUT FILE - commands.txt
login url=http://demo.url.net username=test#url.net password=mytester
create-folder foldername=demo
select-folder foldername=test123
logout
Expected result format
print result_dict
"0": {
"login": [
{
"url": "http://demo.url.net",
"username": "test#url.net",
"password": "mytester"
}
]
},
"1": {
"create-folder": {
"foldername": "demo"
}
},
"2": {
"select-folder": {
"foldername": "test-folder"
}
},
"3": {
"logout": {}
}
CODE
file=os.path.abspath('catalog/commands.txt')
list_output=[f.rstrip().split() for f in open(file).readlines()]
print list_output
counter=0
for data in list_output:
csvdata[counter]=data[0:]
counter=counter+1
print csvdata
for key,val in csvdata.iteritems():
for item in val:
if '=' in item:
key,value=item.split("=")
result[key]=value
print result
As a function:
from collections import defaultdict
from itertools import count
def read_file(file_path):
    """Parse a command file into {line_no: {command: {option: value}}}.

    Each non-empty line is "command key=value key=value ..."; line numbers
    are assigned sequentially to non-empty lines starting at 0.
    """
    result = defaultdict(dict)
    line_no = count()
    with open(file_path) as handle:
        for raw in handle:
            if not raw:
                continue
            tokens = raw.split()
            options = dict(token.split('=') for token in tokens[1:])
            result[next(line_no)][tokens[0]] = options
    return dict(result)
Better example and explanation:
s = """
login url=http://demo.url.net username=test#url.net password=mytester
create-folder foldername=demo
select-folder foldername=test123
logout
"""
from collections import defaultdict
from itertools import count

# Auto-numbered outer keys; each value maps command -> {option: value}.
result_dict = defaultdict(dict)
item = count()
# pretend you opened the file and are reading it line by line
for line in s.splitlines():
    if not line:
        continue  # skip the blank lines from the leading/trailing newlines
    command, *options = line.split()
    result_dict[next(item)][command] = dict(opt.split('=') for opt in options)
With pretty print:
>>> pprint(dict(result_dict))
{0: {'login': {'password': 'mytester',
'url': 'http://demo.url.net',
'username': 'test#url.net'}},
1: {'create-folder': {'foldername': 'demo'}},
2: {'select-folder': {'foldername': 'test123'}},
3: {'logout': {}}}
lines = ["login url=http://demo.url.net username=test#url.net password=mytester",
         "create-folder foldername=demo",
         "select-folder foldername=test123",
         "logout"]

result = {}
for no, line in enumerate(lines):
    tokens = line.split()
    pairs = [tok.split('=') for tok in tokens[1:]]
    payload = dict(pairs)
    # Wrap multi-option commands in a list, mirroring the asker's expected
    # output shape; single/zero-option commands stay a plain dict.
    result[str(no)] = {tokens[0]: [payload] if len(pairs) > 1 else payload}

import pprint
pprint.pprint(result)
Output:
{'0': {'login': [{'password': 'mytester',
'url': 'http://demo.url.net',
'username': 'test#url.net'}]},
'1': {'create-folder': {'foldername': 'demo'}},
'2': {'select-folder': {'foldername': 'test123'}},
'3': {'logout': {}}}
But are you sure you need the extra list inside the login value? If not, just change [dict(pairs)] if len(pairs) > 1 else dict(pairs) to dict(pairs).
r = dict()
f = open('commands.txt')
for i, line in enumerate(f.readlines()):
r[str(i)] = dict()
actions = line.split()
list_actions = {}
for action in actions[1:]:
if "=" in action:
k, v = action.split('=')
list_actions[k] = v
if len(actions[1:]) > 1:
r[str(i)][actions[0]] = [list_actions]
else:
r[str(i)][actions[0]] = list_actions
print r
This should work.