Extract features from a huge (60000-file) JSON directory into a CSV - Python

I'm trying to parse a huge collection of JSON files: around 60,000 files (ranging from 100 KB to 700 MB each), 1.8 TB in total. I made this script, which parses each JSON file, extracts some features, and exports them to a CSV file. It works fine, but it's extremely slow; some of the JSON files take more than 30 minutes to parse. I tried to make it faster but couldn't, due to my limited Python experience. Is there any way I can make it faster? I need to parse this huge collection soon. I'm posting a snippet of my code; I know it's a little dumb.
And here is a sample of my JSON files, please feel free to check:
https://gofile.io/d/vddzHY
import glob
import os
import ijson
from collections import Counter

count1 = 0
my_file_list = [f for f in glob.glob(r"E:\JsonOrgnized\Pach\*.json")]
final_result = []
for filename in my_file_list:
    try:
        with open(filename, 'r', encoding='utf8', errors='ignore') as f:
            row = {}

            # Top-level "info" object
            info = ijson.items(f, 'info')
            for o in info:
                row['AA-Added'] = float(o.get('added'))
                row['AB-Started'] = float(o.get('started'))
                row['AC-Duration'] = o.get('duration')
                row['AD-Ended'] = float(o.get('ended'))

            # Each of the following passes rewinds the file and re-parses it
            f.seek(0)
            domains = ijson.items(f, 'network.domains.item')
            domain_count = 0
            for domain in domains:
                domain_count += 1
            row['AE-DomainCount'] = domain_count

            f.seek(0)
            signatures = ijson.items(f, 'signatures.item')
            signature_count = 0
            for signature in signatures:
                signature_count += 1
            row['AF-SignatureCount'] = signature_count

            f.seek(0)
            generics = ijson.items(f, 'behavior.generic.item')
            generic_count = 0
            for generic in generics:
                generic_count += 1
            row['AG-GenericCount'] = generic_count

            f.seek(0)
            apistats = ijson.items(f, 'behavior.apistats')
            apistat_count = 0
            for apistat in apistats:
                for inner_apistat in apistat:
                    apistat_count += 1
            row['AH-ApistatCount'] = apistat_count

            f.seek(0)
            processes = ijson.items(f, 'behavior.processes.item')
            process_count = 0
            for process in processes:
                process_count += 1
            row['AI-ProcessCount'] = process_count

            f.seek(0)
            summaries = ijson.items(f, 'behavior.summary')
            summary_count = 0
            for summary in summaries:
                for inner_summary in summary:
                    summary_count += 1
            row['AJ-SummaryCount'] = summary_count

            f.seek(0)
            apistats_element = ijson.items(f, 'behavior.apistats')
            for inner_apistats in apistats_element:
                for index, inner_fields in inner_apistats.items():
                    row = dict(Counter(row) + Counter(inner_fields))

            row['AK-Filename'] = os.path.basename(filename)
    except Exception as e:
        # print(f"Filename {filename} has issue with {e}")
        row = {}
    if row:
        final_result.append(row)
    count1 += 1
    print("File Number", count1, "Is Finished!")
print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")

This seems to be a little faster and, I think, cleaner.
We will use one of the more "low level" calls from ijson, and based on the paths we encounter we will take some sort of action.
We will store the paths of interest, and the actions to take when they are encountered, in a little work dictionary.
import ijson
import os

def fn_set_value(row, key, value):
    row[key] = value

def fn_increment_count(row, key):
    row[key] = row.get(key, 0) + 1

# ---------------------
# When these keys (tuples) are encountered, we will take the corresponding action.
# ---------------------
work = {
    ("info.added", "number"): lambda row, value: fn_set_value(row, "AA-Added", value),
    ("info.started", "number"): lambda row, value: fn_set_value(row, "AB-Started", value),
    ("info.duration", "number"): lambda row, value: fn_set_value(row, "AC-Duration", value),
    ("info.ended", "number"): lambda row, value: fn_set_value(row, "AD-Ended", value),
    ("network.domains.item", "start_map"): lambda row, value: fn_increment_count(row, "AE-DomainCount"),
    ("signatures.item", "start_map"): lambda row, value: fn_increment_count(row, "AF-SignatureCount"),
    ("behavior.generic.item", "start_map"): lambda row, value: fn_increment_count(row, "AG-GenericCount"),
    ("behavior.apistats", "map_key"): lambda row, value: fn_increment_count(row, "AH-ApistatCount"),
    ("behavior.processes.item", "start_map"): lambda row, value: fn_increment_count(row, "AI-ProcessCount"),
    ("behavior.summary", "map_key"): lambda row, value: fn_increment_count(row, "AJ-SummaryCount"),
}

# ---------------------
# Your initial set of files
# ---------------------
my_file_list = [
    "d:/temp/foo/report1.json",
    "d:/temp/foo/report2.json",
    "d:/temp/foo/report3.json",
    "d:/temp/foo/report4.json",
    "d:/temp/foo/report5.json"
]

final_result = []
for index, filename in enumerate(my_file_list):
    print(f"Processing file {index+1} from {filename}")
    try:
        row = {}
        with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
            for prefix, event, value in ijson.parse(f):
                key = (prefix, event)
                if key in work:            # if the tuple is an interesting one
                    work[key](row, value)  # use it to take an action on row
        row["AK-Filename"] = os.path.basename(filename)
        final_result.append(row)
    except Exception as e:
        print(f"\tUnable to process \"{filename}\": {e}")
        # retry with ascii or having stripped out the bad character?

print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")
print(final_result)
This produces the result below in a couple of seconds.
[
    {
        'AA-Added': Decimal('1631536343.897729'),
        'AB-Started': Decimal('1631536440.728626'),
        'AC-Duration': 21,
        'AD-Ended': Decimal('1631536461.778441'),
        'AE-DomainCount': 3,
        'AF-SignatureCount': 5,
        'AG-GenericCount': 3,
        'AH-ApistatCount': 2,
        'AI-ProcessCount': 3,
        'AJ-SummaryCount': 14,
        'AK-Filename': 'report1.json'
    },
    {
        'AA-Added': Decimal('1631536343.90739'),
        'AB-Started': Decimal('1631536461.849837'),
        'AC-Duration': 12,
        'AD-Ended': Decimal('1631536474.755813'),
        'AE-DomainCount': 3,
        'AF-SignatureCount': 2,
        'AG-GenericCount': 2,
        'AH-ApistatCount': 1,
        'AI-ProcessCount': 2,
        'AJ-SummaryCount': 2,
        'AK-Filename': 'report2.json'
    },
    {
        'AA-Added': Decimal('1631536343.962804'),
        'AB-Started': Decimal('1631536692.972615'),
        'AC-Duration': 312,
        'AD-Ended': Decimal('1631537005.710977'),
        'AE-DomainCount': 4,
        'AF-SignatureCount': 36,
        'AG-GenericCount': 13,
        'AH-ApistatCount': 12,
        'AI-ProcessCount': 13,
        'AJ-SummaryCount': 22,
        'AK-Filename': 'report3.json'
    },
    {
        'AA-Added': Decimal('1631536344.049105'),
        'AB-Started': Decimal('1631537026.918725'),
        'AC-Duration': 316,
        'AD-Ended': Decimal('1631537342.92093'),
        'AE-DomainCount': 3,
        'AF-SignatureCount': 16,
        'AG-GenericCount': 4,
        'AH-ApistatCount': 3,
        'AI-ProcessCount': 4,
        'AJ-SummaryCount': 16,
        'AK-Filename': 'report4.json'
    },
    {
        'AA-Added': Decimal('1631536344.112968'),
        'AB-Started': Decimal('1631537322.81162'),
        'AC-Duration': 14,
        'AD-Ended': Decimal('1631537337.342377'),
        'AE-DomainCount': 3,
        'AF-SignatureCount': 1,
        'AG-GenericCount': 2,
        'AH-ApistatCount': 1,
        'AI-ProcessCount': 2,
        'AJ-SummaryCount': 7,
        'AK-Filename': 'report5.json'
    }
]
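
Neither snippet above actually writes the CSV. A minimal sketch of that last step, assuming final_result as built above (the output name features.csv is made up here; csv.DictWriter's restval fills in any column a row lacks, which matters because the apistats merge can add per-file keys):

import csv

# Header: the union of keys across all rows, since the Counter-merge step
# may introduce columns that only some files have.
fieldnames = sorted({key for row in final_result for key in row})

with open("features.csv", "w", newline="", encoding="utf-8") as out:
    writer = csv.DictWriter(out, fieldnames=fieldnames, restval="")
    writer.writeheader()
    writer.writerows(final_result)

If parsing itself is still the bottleneck, it may also be worth checking which ijson backend you are running; the C-accelerated one (import ijson.backends.yajl2_c as ijson) is usually much faster than the pure-Python backend, assuming the yajl library is installed.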

Related

Nested dictionary replacing previous value + key instead of appending

I am working on a vector space model; the data set consists of 50 text files. I iterate through them, splitting into words and saving them in a dictionary. Now I want to use a nested dictionary like:
dictionary = {someword: {Doc1: 23, Doc21: 2, Doc34: 3},
              someword: {Doc1: 23, Doc21: 2, Doc34: 3},
              someword: {Doc1: 23, Doc21: 2, Doc34: 3}}
but when I run my program, it not only replaces the document, it also fails to calculate the frequency by adding up how many times 'someword' occurred in a particular document.
for iterator in range(1, 51):
    f = open(directory + str(iterator) + ext, "r")
    for line in f.read().lower().split():
        line = getwords(line)
        for word in line:
            if check(word, stopwords) == 0:
                if existence(word, terms, iterator) != 1:
                    terms[word] = {}
                    terms[word]["Doc"+str(iterator)] = 1
                else:
                    terms[word]["Doc"+str(iterator)] = int(terms[word]["Doc"+str(iterator)]) + 1
    f.close()
The existence function is:
def existence(tok, diction, iteration):
    if tok in diction:
        temp = "Doc"+str(iteration)
        if temp in diction:
            return 1
        else:
            return 0
    else:
        return 0
The result looks somewhat like this:
{'blunder': {'Doc1': 1}, 'by': {'Doc50': 1}, 'anton': {'Doc27': 1}, 'chekhov': {'Doc27': 1}, 'an': {'Doc50': 1}, 'illustration': {'Doc48': 1}, 'story': {'Doc48': 1}, 'author': {'Doc48': 1}, 'portrait'...
Do you want to know how many times each word appears in each file? This is easily accomplished with a defaultdict of Counters, courtesy of the collections module.
You've got the right idea I think, looping over the files, reading line by line and splitting into words. It's the counting part you need help with.
from collections import defaultdict, Counter
from string import punctuation

fnames = ['1.txt', '2.txt', '3.txt', '4.txt', '5.txt']
word_counter = defaultdict(Counter)

for fname in fnames:
    with open(fname, 'r') as txt:
        for line in txt:
            words = line.lower().strip().split()
            for word in words:
                word = word.strip(punctuation)
                if word:
                    word_counter[word][fname] += 1
The data will look like this inside word_counter:
{
    'within': {
        '1.txt': 2,
    },
    'we': {
        '1.txt': 3,
        '2.txt': 2,
        '3.txt': 2,
        '4.txt': 2,
        '5.txt': 4,
    },
    'do': {
        '1.txt': 7,
        '2.txt': 8,
        '3.txt': 8,
        '4.txt': 6,
        '5.txt': 5,
    },
    ...
}
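Counter values also support convenient lookups and totals; a small usage sketch against the word_counter built above (expected values taken from the sample output):

print(word_counter['we']['2.txt'])        # 2 (missing files simply count as 0)
print(sum(word_counter['we'].values()))   # 13, total occurrences across all files
print(word_counter['do'].most_common(1))  # [('2.txt', 8)] (ties keep insertion order)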

Writing data to CSV from dictionaries with multiple values per key

Background
I am storing data in dictionaries. The dictionaries can be of different lengths, and in a particular dictionary there could be keys with multiple values. I am trying to spit the data out into a CSV file.
Problem/Solution
Image 1 shows how my actual output prints out. Image 2 shows how I would want my output to print; Image 2 is the desired output.
CODE
import csv
from itertools import izip_longest

e = {'Lebron': [25, 10], 'Ray': [40, 15]}
c = {'Nba': 5000}

def writeData():
    with open('file1.csv', mode='w') as csv_file:
        fieldnames = ['Player Name', 'Points', 'Assist', 'Company', 'Total Employes']
        writer = csv.writer(csv_file)
        writer.writerow(fieldnames)
        for employee, company in izip_longest(e.items(), c.items()):
            row = list(employee)
            row += list(company) if company is not None else ['', '']  # Write empty fields if no company
            writer.writerow(row)

writeData()
I am open to all solutions/suggestions that can help me get my desired output format.
For a much simpler answer, you just need to add one line of code to what you have:
row = [row[0]] + row[1]
so:
for employee, company in izip_longest(e.items(), c.items()):
    row = list(employee)
    row = [row[0]] + row[1]
    row += list(company) if company is not None else ['', '']  # Write empty fields if no company
from collections import defaultdict
values = defaultdict(dict)
values[Name1] = {Points: [], Assist: [], Company: blah, Total_Employees: 123}
For generating the output, traverse each item in values to get the names, and populate the other values using the keys in the nested dict.
Again, make sure that there are no multiple entries with the same name, or choose the one with unique entries in the defaultdict.
Demo for the example-
>>> from collections import defaultdict
>>> import csv
>>> values = defaultdict(dict)
>>> vals = [["Lebron", 25, 10, "Nba", 5000], ["Ray", 40, 15]]
>>> fields = ["Name", "Points", "Assist", "Company", "Total Employes"]
>>> for item in vals:
...     if len(item) == len(fields):
...         details = dict()
...         for j in range(1, len(fields)):
...             details[fields[j]] = item[j]
...         values[item[0]] = details
...     elif len(item) < len(fields):
...         details = dict()
...         for j in range(1, len(fields)):
...             if j+1 <= len(item):
...                 details[fields[j]] = item[j]
...             else:
...                 details[fields[j]] = ""
...         values[item[0]] = details
...
>>> values
defaultdict(<class 'dict'>, {'Lebron': {'Points': 25, 'Assist': 10, 'Company': 'Nba', 'Total Employes': 5000}, 'Ray': {'Points': 40, 'Assist': 15, 'Company': '', 'Total Employes': ''}})
>>> csv_file = open('file1.csv', 'w')
>>> writer = csv.writer(csv_file)
>>> for i in values:
...     row = [i]
...     for j in values[i]:
...         row.append(values[i][j])
...     writer.writerow(row)
...
23
13
>>> csv_file.close()
Contents of 'file1.csv':
Lebron,25,10,Nba,5000
Ray,40,15,,
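
If you also want the header row from your fieldnames, csv.DictWriter pairs naturally with the nested dict approach; a minimal sketch (assuming Python 2.7+ and reusing the values and fields from the demo above; restval='' fills any column a row is missing):

import csv

with open('file1.csv', 'wb') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fields, restval='')
    writer.writeheader()
    for name, details in values.items():
        row = dict(details)  # Points, Assist, Company, Total Employes
        row['Name'] = name
        writer.writerow(row)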

Create a list of lists of parameters from a file

Hi, I am trying to create a list of parameters from a file.
The final result should be something like:
param=[[field],[units],[height],[site]]
The problem is that the information is split across lines and some of the parameters do not have all the information.
#info in the file
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
units=m
...
so I would like to fill in all the fields in such a way that, if there is no information, it assigns 0 or ''.
Final result in the example:
param={field1:'m/s',70.4,'site1',field2:'',20.6,site2, field3:'m',0,''}
I know how to create a dictionary from a list of lists, but not how to set default values ('' for the string values and 0 for the numeric ones) in case some values are missing.
Thanks
You could group using a defaultdict:
from collections import defaultdict

with open("test.txt") as f:
    d = defaultdict(list)
    for line in map(str.rstrip, f):
        if line.startswith("["):
            d["fields"].append(line.strip("[]"))
        else:
            k, v = line.split("=")
            d[k].append(v)
Input:
[field1]
unit=m/s
height=70.4
site=site1
[field2]
height=20.6
site=site2
[field3]
unit=m
height=6.0
site=site3
Output:
defaultdict(<type 'list'>, {'fields': ['field1', 'field2', 'field3'],
'site': ['site1', 'site2', 'site3'], 'unit': ['m/s', 'm'],
'height': ['70.4', '20.6', '6.0']})
If you actually want to group by field, you can use itertools.groupby grouping on lines that start with [:
from itertools import groupby

with open("test.txt") as f:
    grps, d = groupby(map(str.rstrip, f), key=lambda x: x.startswith("[")), {}
    for k, v in grps:
        if k:
            k, v = next(v).strip("[]"), list(next(grps)[1])
            d[k] = v
print(d)
Output:
{'field2': ['height=20.6', 'site=site2'],
'field3': ['unit=m', 'height=6.0', 'site=site3'],
'field1': ['unit=m/s', 'height=70.4', 'site=site1']}
Each k is a line starting with [; we then call next on the grouper object to get all the lines up to the next line starting with [, or the EOF.
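To get from grouped lines to the requested lists with defaults, one option (a sketch, assuming the field-keyed dict d produced by the groupby version above, and that the file consistently uses the key unit rather than units) is:

# Build param = [[fields], [units], [heights], [sites]] from d, e.g.
# d = {'field1': ['unit=m/s', 'height=70.4', 'site=site1'], ...}
fields, units, heights, sites = [], [], [], []
for field, lines in d.items():
    entries = dict(line.split("=", 1) for line in lines)
    fields.append(field)
    units.append(entries.get("unit", ""))     # '' when the unit is missing
    heights.append(entries.get("height", 0))  # 0 when the height is missing
    sites.append(entries.get("site", ""))
param = [fields, units, heights, sites]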
This would fill in the missing information.
f = open('file.txt', 'r')
field, units, height, site = [], [], [], []
param = [field, units, height, site]
lines = f.readlines()
i = 0
while True:
    try:
        line1 = lines[i].rstrip()
        if line1.startswith('['):
            field.append(line1.strip('[]'))
        else:
            field.append(0)
            i -= 1
    except:
        field.append(0)
    try:
        line2 = lines[i+1].rstrip()
        if line2.startswith('unit') or line2.startswith('units'):
            units.append(line2.split('=')[-1])
        else:
            units.append('')
            i -= 1
    except:
        units.append('')
    try:
        line3 = lines[i+2].rstrip()
        if line3.startswith('height'):
            height.append(line3.split('=')[-1])
        else:
            height.append(0)
            i -= 1
    except:
        height.append(0)
    try:
        line4 = lines[i+3].rstrip()
        if line4.startswith('site'):
            site.append(line4.split('=')[-1])
        else:
            site.append('')
    except:
        site.append('')
        break
    i += 4
Output:
param:
[['field1', 'field2', 'field3'],
['m/s', '', 'm'],
['70.4', '20.6', 0],
['site1', 'site2', '']]

Loop is not working when I try to read a JSON file and a text file with Python

I have a JSON file with objects and a text file with several groups (each group has 5 numbers, and I keep them in lists this way: the first numbers of each group are in list 1, the second numbers of each group are in list 2, etc.). I basically have to match each object of the JSON with each group I created. The problem is that I'm getting the last element of the JSON as the result. The groups from the text file are created correctly.
This is my code:
import json

NUM_LIST = 5
index = 0

def report(a, b, c, d, e, index):
    json_file = 'json_global.json'
    json_data = open(json_file)
    data = json.load(json_data)
    i = 0
    index = 0
    item = 0
    cmd = " "
    ind = 0
    for node in data:
        for i in range(0, 5):
            item = data[i]['item']
            cmd = data[i]['command']
            index += 1
    print item, cmd, a, b, c, d, e

f = open("Output.txt", "r")
lines = [line.rstrip() for line in f if line != "\n"]
NUM_LISTS = 5
groups = [[] for i in range(NUM_LISTS)]
listIndex = 0
for line in lines:
    if "Transactions/Sec for Group" not in line:
        groups[listIndex].append(float(line))
        listIndex += 1
        if listIndex == NUM_LISTS:
            listIndex = 0

value0 = groups[0]
value1 = groups[1]
value2 = groups[2]
value3 = groups[3]
value4 = groups[4]

for i in range(0, 5):
    a = value0[i]
    b = value1[i]
    c = value2[i]
    d = value3[i]
    e = value4[i]
    i += 1
    report(a, b, c, d, e, index)
The JSON file looks like:
[
    {
        "item": 1,
        "command": "AA"
    },
    {
        "item": 2,
        "command": "BB"
    },
    {
        "item": 3,
        "command": "CC"
    },
    {
        "item": 4,
        "command": "DD"
    },
    {
        "item": 5,
        "command": "EE"
    }
]
The text file looks like this:
Transactions/Sec for Group = AA\CODE1\KK
1011.5032
2444.8864
2646.6893
2740.8531
2683.8178
Transactions/Sec for Group = BB\CODE1\KK
993.2360
2652.8784
3020.2740
2956.5260
3015.5910
Transactions/Sec for Group = CC\CODE1\KK
1179.5766
3271.5700
4588.2059
4174.6358
4452.6785
Transactions/Sec for Group = DD\CODE1\KK
1112.2567
3147.1466
4014.8404
3913.3806
3939.0626
Transactions/Sec for Group = EE\CODE1\KK
1205.8499
3364.8987
4401.1702
4747.4354
4765.7614
The logic in the body of the program works fine and the groups come out OK, but instead of getting items 1 through 5 from the JSON file, everything appears with item 5 and command EE. Items 1, 2, 3, 4, 5 should appear with their commands instead.
My list 1 will have the numbers: 1011.5032, 993.2360, 1179.5766, 1112.2567, 1205.8499.
My list 2 will have the numbers: 2444.8864, 2652.8784, 3271.5700, 3147.1466,
The python version I'm using is 2.6
Based on your explanation it's hard to tell what you're trying to do -- do you mean the nested loop below? The inner loop executes 5 times, but in every iteration it overwrites the previous values for item and cmd.
for node in data:
    for i in range(0, 5):
        item = data[i]['item']
        cmd = data[i]['command']
        index += 1
Try printing the values each time the inner loop executes:
for node in data:
    for i in range(0, 5):
        item = data[i]['item']
        cmd = data[i]['command']
        print item, cmd
        index += 1
I think this code is your problem:
for node in data:
    for i in range(0, 5):
        item = data[i]['item']
        cmd = data[i]['command']
Item will always be "5" and command will always be "EE" after this executes. Perhaps your indents are off for the code beneath it, and that code is supposed to be within the loop?
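
If the intent is one line per JSON object paired with the i-th value of each group, a sketch in the question's Python 2 style (reusing data and the groups list built above) would be:

# Pair object i with the i-th transaction value of each of the five groups
for i in range(0, 5):
    print data[i]['item'], data[i]['command'], \
        groups[0][i], groups[1][i], groups[2][i], groups[3][i], groups[4][i]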

Re-reading a CSV file in Python without loading it again

I made the following code, which works, but I want to improve it. I don't want to re-read the file, but if I delete sales_input.seek(0) it won't iterate through each row in sales. How can I improve this?
def computeCritics(mode, cleaned_sales_input="data/cleaned_sales.csv"):
    if mode == 1:
        print "creating customer.critics.recommendations"
        critics_output = open("data/customer/customer.critics.recommendations",
                              "wb")
        ID = getCustomerSet(cleaned_sales_input)
        sales_dict = pickle.load(open("data/customer/books.dict.recommendations",
                                      "r"))
    else:
        print "creating books.critics.recommendations"
        critics_output = open("data/books/books.critics.recommendations",
                              "wb")
        ID = getBookSet(cleaned_sales_input)
        sales_dict = pickle.load(open("data/books/users.dict.recommendations",
                                      "r"))
    critics = {}
    # make critics dict and pickle it
    for i in ID:
        with open(cleaned_sales_input, 'rb') as sales_input:
            sales = csv.reader(sales_input)  # read new
            for j in sales:
                if mode == 1:
                    if int(i) == int(j[2]):
                        sales_dict[int(j[6])] = 1
                else:
                    if int(i) == int(j[6]):
                        sales_dict[int(j[2])] = 1
            critics[int(i)] = sales_dict
    pickle.dump(critics, critics_output)
    print "done"
cleaned_sales_input looks like
6042772,2723,3546414,9782072488887,1,9.99,314968
6042769,2723,3546414,9782072488887,1,9.99,314968
...
where field 6 is the book ID and field 0 is the customer ID
I want to get a dict which looks like:
critics = {
    CustomerID1: {
        BookID1: 1,
        BookID2: 0,
        ........
        BookIDX: 0
    },
    CustomerID2: {
        BookID1: 0,
        BookID2: 1,
        ...
    }
}
or
critics = {
    BookID1: {
        CustomerID1: 1,
        CustomerID2: 0,
        ........
        CustomerIDX: 0
    },
    BookID2: {
        CustomerID1: 0,
        CustomerID2: 1,
        ...
        CustomerIDX: 0
    }
}
I hope this isn't too much information.
Here are some suggestions:
Let's first look at this code pattern:
for i in ID:
    for j in sales:
        if int(i) == int(j[2]):
notice that i is only being compared with j[2]. That's its only purpose in the loop. int(i) == int(j[2]) can only be True at most once for each i.
So, we can completely remove the for i in ID loop by rewriting it as
for j in sales:
    key = j[2]
    if key in ID:
Based on the function names getCustomerSet and getBookSet, it sounds as if
ID is a set (as opposed to a list or tuple). We want ID to be a set since
testing membership in a set is O(1) (as opposed to O(n) for a list or tuple).
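A quick illustration of that complexity difference (made-up sizes):

ids_list = list(range(100000))
ids_set = set(ids_list)

99999 in ids_set   # O(1): a single hash lookup
99999 in ids_list  # O(n): may scan the whole list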
Next, consider this line:
critics[int(i)] = sales_dict
There is a potential pitfall here. This line is assigning sales_dict to
critics[int(i)] for each i in ID. Each key int(i) is being mapped to the very same dict. As we loop through sales and ID, we are modifying sales_dict like this, for example:
sales_dict[int(j[6])] = 1
But this will cause all values in critics to be modified simultaneously, since all keys in critics point to the same dict, sales_dict. I doubt that is what you want.
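The aliasing is easy to demonstrate in isolation (made-up keys):

sales_dict = {}
critics = {1: sales_dict, 2: sales_dict}  # both keys point at the SAME dict
critics[1][42] = 1
print(critics[2])  # {42: 1} -- "both" entries changed, because there is only one dict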
To avoid this pitfall, we need to make copies of the sales_dict:
critics = {i:sales_dict.copy() for i in ID}
def computeCritics(mode, cleaned_sales_input="data/cleaned_sales.csv"):
    if mode == 1:
        filename = 'customer.critics.recommendations'
        path = os.path.join("data/customer", filename)
        ID = getCustomerSet(cleaned_sales_input)
        sales_dict = pickle.load(
            open("data/customer/books.dict.recommendations", "r"))
        key_idx, other_idx = 2, 6
    else:
        filename = 'books.critics.recommendations'
        path = os.path.join("data/books", filename)
        ID = getBookSet(cleaned_sales_input)
        sales_dict = pickle.load(
            open("data/books/users.dict.recommendations", "r"))
        key_idx, other_idx = 6, 2

    print "creating {}".format(filename)
    ID = {int(item) for item in ID}
    critics = {i: sales_dict.copy() for i in ID}
    with open(path, "wb") as critics_output:
        # make critics dict and pickle it
        with open(cleaned_sales_input, 'rb') as sales_input:
            sales = csv.reader(sales_input)  # read new
            for j in sales:
                key = int(j[key_idx])
                if key in ID:
                    other_key = int(j[other_idx])
                    critics[key][other_key] = 1
        pickle.dump(dict(critics), critics_output)
    print "done"
@unutbu's answer is better, but if you are stuck with this structure you can put the whole file in memory:
with open(cleaned_sales_input, 'rb') as sales_input:
    sales = list(csv.reader(sales_input))  # read the whole file into memory once
for i in ID:
    for j in sales:
        # do stuff
