Python string to excel rows - python

Hello I'm VERY new in python. I just have to do 1 thing with it.
When i print my string names, this is what comes up:
{'id': 1, 'xd_id': 2, 'name': 'nameea', 'description': 'somethingveryweird', 'again_id': 6, 'some_id': None, 'everything': False, 'is_ready': False, 'test_on': None, 'something': None, 'something': [], 'count_count': 28, 'other_count': 0, 'again_count': 0, 'new_count': 0, 'why_count': 0, 'custom_count': 0, 'custom2_count': 0, 'custom3_count': 0, 'custom4_count': 0, 'custom5_count': 0, 'custom_status6_count': 0, 'custom7_count': 0, 'lol_id': 7, 'wtf_id': None, 'numbers_on': 643346, 'something_by': 99, 'site': 'google.com'}
I would to get this info to excel with the left row being the "id": and the right being the 1. And all the info like this. for example. "site" on the left and "google.com" on the right. my current code adds all this info to the first row on the excel and i can't seem to find any tutorial for this. Thanks for all answers. My current code:
f = open('test.csv', 'w')
s = str(names)
f.write(s)
f.close()

if python is not going to be your key skill and only this task needs to be done, then here is the answer.
f = open('test.csv', 'w')
csvwt = csv.writer(f)
for x in names.items():
csvwt.writerow(x)
f.close()
if you want to write to an excel, then you have to do this,
workbook = xlsxwriter.Workbook('test.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0
for x in names.items():
worksheet.write(row, col, str(x[0]))
worksheet.write(row, col + 1, str(x[1]))
row += 1
workbook.close()

Related

Extract features from huge (60000) JSON file directrory into a CSV

I'm trying to parse a huge collection of JSON files. Around 60000 JSON file (size range 100 KB- 700 MB) total of 1.8 TB, so I made this script which parse JSON file and extract some features and export them in CSV file, it works fine but its extremely slow some of the JSON files take more than 30 minutes to be parsed, I tried to make it faster but I couldn't due to my short Python experience. Is there anyway I can make it faster because I need to parse these huge collection sooner. I'm posting a snippet of my code I know its a little dump.
And here is sample of my JSON files please feel free to check
https://gofile.io/d/vddzHY
count1=0
my_file_list = [f for f in glob.glob(r"E:\JsonOrgnized\Pach\*.json")]
final_result = []
for filename in my_file_list:
try:
with open(filename, 'r', encoding='utf8', errors='ignore') as f:
row = {}
info = ijson.items(f, 'info')
f.seek(0)
for o in info:
row['AA-Added']= float(o.get('added'))
row['AB-Started']= float(o.get('started'))
row['AC-Duration']= o.get('duration')
row['AD-Ended']= float(o.get('ended'))
f.seek(0)
domains = ijson.items(f, 'network.domains.item')
domain_count = 0
for domain in domains:
domain_count+=1
row['AE-DomainCount'] = domain_count
f.seek(0)
signatures = ijson.items(f, 'signatures.item')
signature_count = 0
for signature in signatures:
signature_count+=1
row['AF-SignatureCount'] = signature_count
f.seek(0)
domains = ijson.items(f, 'behavior.generic.item')
domain_count = 0
for domain in domains:
domain_count+=1
row['AG-GenericCount'] = domain_count
f.seek(0)
apistats = ijson.items(f, 'behavior.apistats')
apistat_count = 0
for apistat in apistats:
for inner_apistat in apistat:
apistat_count+=1
row['AH-ApistatCount'] = apistat_count
f.seek(0)
processes = ijson.items(f, 'behavior.processes.item')
process_count = 0
for process in processes:
process_count+=1
row['AI-ProcessCount'] = process_count
f.seek(0)
summaries = ijson.items(f, 'behavior.summary')
summary_count = 0
for summary in summaries:
for inner_summary in summary:
summary_count+=1
row['AJ-SummaryCount'] = summary_count
f.seek(0)
apistats_element = ijson.items(f, 'behavior.apistats')
for inner_apistats in apistats_element:
for index, inner_fields in inner_apistats.items():
row = dict(Counter(row)+Counter(inner_fields))
row['AK-Filename'] = os.path.basename(filename)
except Exception as e:
#pass
#print(f"Filename {filename} has issue with {e}")
row = {}
if row:
final_result.append(row)
count1+=1
print("File Number" , count1 , "Is Finished!")
Print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")
This seems to be a little faster and I think cleaner.
We will use one of the more "lower level" calls from ijson. and based on the paths we get take some sort of action.
We will store paths of interest and the actions to take when encountered in a little work dictionary.
import ijson
import os
def fn_set_value(row, key, value):
row[key] = value
def fn_increment_count(row, key):
row[key] = row.get(key, 0) + 1
# ---------------------
# When these keys (tuples) are encountered, we will take the corresponding action.
# ---------------------
work = {
("info.added", "number"): lambda row, value: fn_set_value(row, "AA-Added", value),
("info.started", "number"): lambda row, value: fn_set_value(row, "AB-Started", value),
("info.duration", "number"): lambda row, value: fn_set_value(row, "AC-Duration", value),
("info.ended", "number"): lambda row, value: fn_set_value(row, "AD-Ended", value),
("network.domains.item", "start_map"): lambda row, value: fn_increment_count(row, "AE-DomainCount"),
("signatures.item", "start_map"): lambda row, value: fn_increment_count(row, "AF-SignatureCount"),
("behavior.generic.item", "start_map"): lambda row, value: fn_increment_count(row, "AG-GenericCount"),
("behavior.apistats", "map_key"): lambda row, value: fn_increment_count(row, "AH-ApistatCount"),
("behavior.processes.item", "start_map"): lambda row, value: fn_increment_count(row, "AI-ProcessCount"),
("behavior.summary", "map_key"): lambda row, value: fn_increment_count(row, "AJ-SummaryCount"),
}
# ---------------------
# ---------------------
# Your initial set of files
# ---------------------
my_file_list = [
"d:/temp/foo/report1.json",
"d:/temp/foo/report2.json",
"d:/temp/foo/report3.json",
"d:/temp/foo/report4.json",
"d:/temp/foo/report5.json"
]
# ---------------------
final_result = []
for index, filename in enumerate(my_file_list):
print(f"Processing file {index+1} from {filename}")
try:
row = {}
with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
for i in ijson.parse(f):
key = (i[0], i[1])
if key in work.keys(): # if the tuple is an interesting one
work[key](row, i[2]) # use it to take an action on row
row["AK-Filename"] = os.path.basename(filename)
final_result.append(row)
except Exception as e:
print(f"\tUnable to process \"{filename}\": {e}")
# retry with ascii or having stripped out the bad character?
pass
print("<<<<<<<<<<<<<<<<<<<DONE>>>>>>>>>>>>>>>>>>")
print(final_result)
This produce this result in a couple of seconds.
[
{
'AA-Added': Decimal('1631536343.897729'),
'AB-Started': Decimal('1631536440.728626'),
'AC-Duration': 21,
'AD-Ended': Decimal('1631536461.778441'),
'AE-DomainCount': 3,
'AF-SignatureCount': 5,
'AG-GenericCount': 3,
'AH-ApistatCount': 2,
'AI-ProcessCount': 3,
'AJ-SummaryCount': 14,
'AK-Filename': 'report1.json'
},
{
'AA-Added': Decimal('1631536343.90739'),
'AB-Started': Decimal('1631536461.849837'),
'AC-Duration': 12,
'AD-Ended': Decimal('1631536474.755813'),
'AE-DomainCount': 3,
'AF-SignatureCount': 2,
'AG-GenericCount': 2,
'AH-ApistatCount': 1,
'AI-ProcessCount': 2,
'AJ-SummaryCount': 2,
'AK-Filename': 'report2.json'
},
{
'AA-Added': Decimal('1631536343.962804'),
'AB-Started': Decimal('1631536692.972615'),
'AC-Duration': 312,
'AD-Ended': Decimal('1631537005.710977'),
'AE-DomainCount': 4,
'AF-SignatureCount': 36,
'AG-GenericCount': 13,
'AH-ApistatCount': 12,
'AI-ProcessCount': 13,
'AJ-SummaryCount': 22,
'AK-Filename': 'report3.json'
},
{
'AA-Added': Decimal('1631536344.049105'),
'AB-Started': Decimal('1631537026.918725'),
'AC-Duration': 316,
'AD-Ended': Decimal('1631537342.92093'),
'AE-DomainCount': 3,
'AF-SignatureCount': 16,
'AG-GenericCount': 4,
'AH-ApistatCount': 3,
'AI-ProcessCount': 4,
'AJ-SummaryCount': 16,
'AK-Filename': 'report4.json'
},
{
'AA-Added': Decimal('1631536344.112968'),
'AB-Started': Decimal('1631537322.81162'),
'AC-Duration': 14,
'AD-Ended': Decimal('1631537337.342377'),
'AE-DomainCount': 3,
'AF-SignatureCount': 1,
'AG-GenericCount': 2,
'AH-ApistatCount': 1,
'AI-ProcessCount': 2,
'AJ-SummaryCount': 7,
'AK-Filename': 'report5.json'
}
]

Trouble in manipulating the data for treeview in tkinter

everyone. Let me first paste the code.
c.execute("SELECT * FROM c20 WHERE Position = 'chain';")
data1 = c.fetchall()
c.execute("SELECT * FROM c20 WHERE Position = 'center';")
data2 = c.fetchall()
c.execute("SELECT * FROM c20 WHERE Position = 'Total';")
data3 = c.fetchall()
data1 = p_mod.list_multiply(data, copies_data)
data2 = p_mod.list_multiply(data2, copies_data)
data3 = p_mod.list_multiply(data3, copies_data)
meta_data = [data1, data2, data3]
n = 0
while n != 3:
for i in meta_data:
my_tree.insert(parent="", index="end", iid=n, text=f"{n + 1}", values=i)
n += 1
if n == 3:
my_tree.pack(pady=20)
root1.mainloop()
This is the code where I need to fetch queries regarding a requirement and the output required is as follows:
conn = sqlite3.connect("userdata.db")
>>> c = conn.cursor()
>>> c.execute("SELECT * FROM c20 WHERE Position = 'chain';")
<sqlite3.Cursor object at 0x00000221DA432F80>
>>> data1 = c.fetchall()
>>> data1
[('chain', 100, 350, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]
I have also used a remote function named p_mod.list_multiply().
The function looks like this:
def list_multiply(list_input, number):
new_list = []
list_input = list(list_input)[0]
list_input1 = list_input[1 : -1]
for i in list_input1:
data = int(i) * number
new_list.append(data)
if list_input[0] == 'chain':
new_list.insert(0, 'chain')
elif list_input[0] == 'center':
new_list.insert(0, 'center')
elif list_input[0] == 'Total':
new_list.insert(0, 'Total')
new_list = tuple(new_list)
return new_list
Now the problem arises...
Whenever I try to run the code with same outputs(data1, data2,...) using the function remotely from the main code,
it runs successfully, but whenever I am trying to run the script inside the main program it gives me an error.
Error is as follows:
PS D:\RM INCORPORATION\RM Software DEV Company Pvt\Jewellery App> & C:/Users/ONE/AppData/Local/Programs/Python/Python39/python.exe "d:/RM INCORPORATION/RM Software DEV Company Pvt/Jewellery App/contact.py"
h
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\ONE\AppData\Local\Programs\Python\Python39\lib\tkinter\__init__.py", line 1884, in __call__
return self.func(*args)
File "d:\RM INCORPORATION\RM Software DEV Company Pvt\Jewellery App\contact.py", line 53, in select
data1 = p_mod.list_multiply(data, copies_data)
File "d:\RM INCORPORATION\RM Software DEV Company Pvt\Jewellery App\p_mod.py", line 15, in list_multiply
data = int(i) * number
ValueError: invalid literal for int() with base 10: 'h'
Let me show you the output used with the function remotely, from the main code...
PS D:\RM INCORPORATION\RM Software DEV Company Pvt\Jewellery App> & C:/Users/ONE/AppData/Local/Programs/Python/Python39/python.exe "d:/RM INCORPORATION/RM Software DEV Company Pvt/Jewellery App/p_mod.py"
('chain', 200, 700, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) ('center', 222, 826, 82, 124, 98, 70, 756, 2, 2, 6, 8, 24, 24, 16, 0, 0) ('Total', 422, 1526, 82, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1878, 70)
Then what is the problem dude?
Eagerly waiting someone's reply
You have overwritten list_input by the following line in list_multiply():
list_input = list(list_input)[0]
Therefore, list_input will be a string after that.
Just remove this line will solve the issue.
Also the following line:
list_input1 = list_input[1 : -1]
will not copy the last item of list_input into list_input1.
It should be
list_input1 = list_input[1:]
list_multiply() can be simplified as below:
def list_multiply(list_input, number):
new_list = tuple(int(x)*number for x in list_input[1:])
return (list_input[0],) + new_list

Count values from rowlist divided by key in nested dict

Here is an example dataset
Firstly, I try to create a dict from values in rows:
import csv
who = set()
figure = set()
date = set()
action = []
activity = {'play': 0, 'throw': 0, 'pin': 0, 'tap': 0}
with open(r'ShtrudelT.csv',
mode = 'r') as csv_file:
lines = csv_file.readlines()
for row in lines:
data = row.split(',')
who.add(data[1])
figure.add(data[2])
date.add(data[3][:7])
action.append(data[4].strip())
xdict = dict.fromkeys(who,
dict.fromkeys(figure,
dict.fromkeys(date, activity)))
The result is:
{'Googenhaim': {'Circle': {'2020-04': {'play': 0,'throw': 0, 'pin': 0, 'tap': 0},
'2020-06': {'play': 0, 'throw': 0, 'pin': 0, 'tap': 0},
'2020-05': {'play': 0, 'throw': 0, 'pin': 0, 'tap': 0}},
'Rectangle': {'2020-04': {'play': 0, 'throw': 0, 'pin': 0, 'tap': 0}...}
Secondly, I need to count actions divided by key to analyze data. For example, how many times Googenhaim use Circle by any type of action in every month.
Is there a solution without using Pandas?
import csv
count_dict = {}
with open(r'ShtrudelT.csv',
mode = 'r') as csv_file:
lines = csv_file.readlines()
for row in lines:
data = row.split(',')
key = data[1] + "_" + data[2] + "_" + data[3][:7] + "_" + data[4].strip()
if key in count_dict:
count_dict[key] += 1
else:
count_dict[key] = 1
print("\t".join(["Name", "Shape", "Month", "Action", "Count"]))
for element, count in count_dict.items():
items = element.split("_")
print("\t".join(items) + "\t" + str(count))
We use a dictionary where every key is the combination that we want to count. This combination is formed from name of the user, shape, month and the action. While processing every line, we form the key and store it in the dictionary. If it is encountered for the first time then we insert it or else we update the count.
After all the lines are processed, we can do any kind of post processing we want to do.
Hope that solves it.

xlrd original value of the cell

I'm reading xls file using xlrd. The problem is, when xlrd reading value like this "12/09/2012", i get result like this "xldate:41252.0". When I use xlrd.xldate_as_tuple, i get this result:
(2016, 12, 10, 0, 0, 0)
My code:
curr_row = -1
while curr_row < num_rows:
curr_row += 1
row = worksheet.row(curr_row)
for x in xrange(num_cols):
field_type = worksheet.cell_type(curr_row, x)
if field_type == 3: # this is date
field_value = worksheet.cell_value(curr_row, x)
print worksheet.cell(curr_row, x).value
print xlrd.xldate_as_tuple(field_value, 1)
Result:
41252.0
(2016, 12, 10, 0, 0, 0)
Both results are wrong for me. How can i get original cell value "12/09/2012" using xlrd ?
According to the docstring, you should pass your workbook's datemode to xldate_as_tuple as a second parameter:
from datetime import datetime
import xlrd
book = xlrd.open_workbook("test.xls")
sheet = book.sheet_by_index(0)
a1 = sheet.cell_value(rowx=0, colx=0)
print a1 # prints 41252.0
print xlrd.xldate_as_tuple(a1, 1) # prints (2016, 12, 10, 0, 0, 0)
a1_tuple = xlrd.xldate_as_tuple(a1, book.datemode)
print a1_tuple # prints (2012, 12, 9, 0, 0, 0)
a1_datetime = datetime(*a1_tuple)
print a1_datetime.strftime("%m/%d/%Y") # prints 12/09/2012

python: incrementing values in a tokyo cabinet store

I am using tcdb to hold a large key-value store. The keys are strings representing user IDs, the values are dicts of the form
{'coord':0,'node':0,'way':0,'relation':0}
The store is filled iterating over a data file that has coord, node, way and relation objects, each linked to a specific user. Here's my code for incrementing the fields:
def increment(self,uid,typ):
uid = str(uid)
type = str(typ)
try:
self.cache[uid][typ] += 1
except KeyError:
try:
self.cache[uid][typ] = 1
except KeyError:
try:
print 'creating record for %s' % uid
self.cache[uid] = {'coord':0,'node':0,'way':0,'relation':0}
except KeyError:
print 'something\'s messed up'
This does not work. I end up with a table that has all zero values:
def result(self):
print 'cache is now %i records' % len(self.cache)
for key in self.cache:
print key + ': ' + str(self.cache[key])
yields:
...
4951: {'node': 0, 'coord': 0, 'relation': 0, 'way': 0}
409553: {'node': 0, 'coord': 0, 'relation': 0, 'way': 0}
92274: {'node': 0, 'coord': 0, 'relation': 0, 'way': 0}
259040: {'node': 0, 'coord': 0, 'relation': 0, 'way': 0}
...
Why?
The last exception is never called.
EDIT This code in the first try block:
tempdict = self.cache[uid]
tempdict[typ] = tempdict.get(typ,0) + 1
self.cache[uid] = tempdict
instead of the original
self.cache[uid][typ] += 1
works, but looks really ugly to me.
After this line:
self.cache[uid] = {'coord':0,'node':0,'way':0,'relation':0}
Add this:
self.cache[uid][type] = 1
Also, please don't use type as a variable name as it hides the built-in of the same name.

Categories