I have the below code:
datedict = defaultdict(set)
with open('d:/info.csv', 'r') as csvfile:
filereader = csv.reader(csvfile, 'excel')
#passing the header
read_header = False
start_date=date(year=2009,month=1,day=1)
#print((seen_date - start_date).days)
tdic = {}
for row in filereader:
if not read_header:
read_header = True
continue
# reading the rest rows
name,id,firstseen = row[0],row[1],row[3]
try:
seen_date = datetime.datetime.strptime(firstseen, '%d/%m/%Y').date()
deltadays = (seen_date-start_date).days
deltaweeks = deltadays/7 + 1
key = name +'-'+id
currentvalue = tdic.get(key, [])
currentvalue.append(deltaweeks)
tdic[key] = currentvalue
except ValueError:
print('Date value error')
pass
tdic = dict((name, max(weeks) - min(weeks) + 1) for name, weeks in tdic.iteritems())
pprint.pprint(tdic)
in which I get the below result:
{'Mali-2': 20,
'Gooki-3': 6,
'Piata-4': 6,
'Goerge-5': 4,
'Samoo-6': 1,
'Marria-7': 2}
Now I would like to write and print the three items, name,id and weeks as separate columns in an excel file. Anyone knows how it is possible?
>>> with open('out.csv', 'w') as f:
w = csv.writer(f)
for k, v in tdic.iteritems():
name, id_ = k.split('-')
weeks = v
w.writerow([name, id_, weeks])
>>> with open('out.csv') as f:
print f.read()
Piata,4,6
Mali,2,20
Goerge,5,4
Gooki,3,6
Samoo,6,1
Marria,7,2
I however don't like the way you have done this, here are some suggestions for your code:
key = name +'-'+id
Instead of using string manipulation to create a key, use a tuple:
key = (name, id)
Change this line:
tdic = dict((name, max(weeks) - min(weeks) + 1) for name, weeks in tdic.iteritems())
to just say
tdic = dict((key, max(weeks) - min(weeks) + 1) for key, weeks in tdic.iteritems())
since now it is a key of (name, id_) we should reflect that (it's a minor thing but important)
Then the above code would simply be
>>> with open('out.csv', 'w') as f:
w = csv.writer(f)
for (name, id_), weeks in tdic.iteritems():
w.writerow([name, id_, weeks])
You can define a dict of dict like this:
a_dict = {key: { anotherKey: value}}
Or if the dict already exists:
a_dict[key] = {anotherkey: value}
print a_dict[key][anotherkey]
Related
I have a file in1.txt
info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"
Expected output: out.txt
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
I need only lines that have same info values and different data values to be written to out.txt.
but the current code removes all the line that have string data in it.
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
if 'data' not in line:
fout.write(line.strip()+'\n')
what i need is for eg: line 1 and line 2 is having same info="0x0000b573" and data is "0x7" & "0x00000007" which is same then remove that line.
You can use regex
import re
s = '''info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"'''
parsed_data = re.findall(r'info="([^"]+)" data="([^"]+)" id="[^"]+"', s, re.MULTILINE)
parsed_data = sorted([list(map(lambda x: int(x, 16), i)) + [index] for index,i in enumerate(parsed_data)])
row_numbers = [j for i in [[parsed_data[i][-1], parsed_data[i+1][-1]] for i in range(0,len(parsed_data),2) if parsed_data[i][1] != parsed_data[i+1][1]] for j in i]
final_output = []
for index,line in enumerate(s.split('\n')):
if index in row_numbers:
final_output.append(line)
final_out_text = '\n'.join(final_output)
print(final_out_text)
# info="0x00010AC3" data="0x00000003" id="abc_16. PS"
# info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
You could try something like that too, I think
#!/usr/bin/python3
records = {}
items = []
info = []
data = []
with open("in.dat", "r") as fin:
for line in fin:
items=line.split(' ')
info = items[0].split('=')
data = items[1].split('=')
try:
key = info[1].strip('"').lower()
value = str(int(data[1].strip('"'), 16))
records[key][value] += 1
except KeyError:
try:
records[key][value] = 1
except KeyError:
records[key] = {value: 1}
out = dict()
for key in records:
for value in records[key]:
if records[key][value] == 1:
try:
out[key].append(value)
except KeyError:
out[key] = [value]
with open("out.dat", "w") as fout:
for key in out:
for value in out[key]:
fout.write(f"{key}={value}\n")
Something like this could work:
found_info_values = []
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
info = line.split('"')[1]
if info not in found_info_values:
fout.write(line.strip()+'\n')
found_info_values += info
I want to define a function, that reads a table of a textfile as a dictionary and than use it for returning specific values. The keys are chemical symbols (like "He" for Helium,...). The values return their specific atom masses.
I don't understand, what I have to do...
The first five lines of the textfile read:
H,1.008
He,4.0026
Li,6.94
Be,9.0122
B,10.81
Here are my attempts: (I don't know where to place the parameter key so that I can define it)
def read_masses():
atom_masses = {}
with open["average_mass.csv") as f:
for line in f:
(key, value) = line.split(",")
atom_masses[key] = value
return(value)
m = read_masses("average_mass.csv)
print(m["N"]) #for the mass of nitrogen ```
once return has called, the code below it doesn't execute. What you need to return is the atom_masses not value and you have to place it outside the for loop
def read_masses(file):
atom_masses = {}
with open(file) as f:
for line in f:
(key, value) = line.split(",")
atom_masses[key] = value
return (atom_masses)
m = read_masses("average_mass.csv")
print(m["H"])
>>> 1.008
Try:
def read_masses(name):
data = {}
with open(name, "r") as f_in:
for line in map(str.strip, f_in):
if line == "":
continue
a, b = map(str.strip, line.split(",", maxsplit=1))
data[a] = float(b)
return data
m = read_masses("your_file.txt")
print(m.get("He"))
Prints:
4.0026
I am trying to convert txt file data to a Python dictionary before dumping it to JSON and then writing JSON data into avro. My text file contains unwanted data that I need to remove, I only need field names and field types:
mappings = ['name', 'type']
with open('xxx.txt', 'r') as fn:
dict1 = {}
names = []
types = []
for line in file.readlines():
names_lines = line.split()[0] # index 0 = names
names.append(names_lines)
types_lines = line.split()[1] # index 1 = types
types.append(types_lines)
id = 1
for d in fn:
desc = list(d.strip().split(' ', 1))
name = desc[0]
i = 0
dict2 = {}
while i < len(mappings):
dict2[mappings[i]] = desc[i]
i = i + 1
dict1[name] = dict2
id = id + 1
print(dict1)
1- My Error is:
name 'file' is not defined. Did you mean: 'filter'?***
replace file with fn:
for line in fn.readlines():
From what you described you wanted in the comments, try this:
records = []
with open('xxx.txt', 'r') as f:
for line in f:
name, type_, *_ = line.split()
records.append({"name": name, "type": type_})
print(records)
I'll try to look for help once more, so my base code is ready, in the very beginning, it converts all the negative values to 0, and after that, it does calculate the sum and cumulative values of the csv data:
import csv
from collections import defaultdict, OrderedDict
def convert(data):
try:
return int(data)
except ValueError:
return 0
with open('MonthData1.csv', 'r') as file1:
read_file = csv.reader(file1, delimiter=';')
delheader = next(read_file)
data = defaultdict(int)
for line in read_file:
valuedata = max(0, sum([convert(i) for i in line[1:5]]))
data[line[0].split()[0]] += valuedata
for key in OrderedDict(sorted(data.items())):
print('{};{}'.format(key, data[key]))
print("")
previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
print('{};{}'.format(key, value + sum(previous_values)))
previous_values.append(value)
This code prints:
1.5.2018 245
2.5.2018 105
4.5.2018 87
1.5.2018 245
2.5.2018 350
4.5.2018 437
That's how I want it to print the data. First the sum of each day, and then the cumulative value. My question is, how can I format this data so it can be written to a new csv file with the same format as it prints it? So the new csv file should look like this:
I have tried to do it myself (with dateime), and searched for answers but I just can't find a way. I hope to get a solution this time, I'd appreciate it massively.
The data file as csv: https://files.fm/u/2vjppmgv
Data file in pastebin https://pastebin.com/Tw4aYdPc
Hope this can be done with default libraries
Writing a CSV is simply a matter of writing values separated by commas (or semi-colons in this case. A CSV is a plain text file (a .txt if you will). You can read it and write using python's open() function if you'd like to.
You could actually get rid of the CSV module if you wish. I included an example of this in the end.
This version uses only the libraries that were available in your original code.
import csv
from collections import defaultdict, OrderedDict
def convert(data):
try:
return int(data)
except ValueError:
return 0
file1 = open('Monthdata1.csv', 'r')
file2 = open('result.csv', 'w')
read_file = csv.reader(file1, delimiter=';')
delheader = next(read_file)
data = defaultdict(int)
for line in read_file:
valuedata = max(0, sum([convert(i) for i in line[1:5]]))
data[line[0].split()[0]] += valuedata
for key in OrderedDict(sorted(data.items())):
file2.write('{};{}\n'.format(key, data[key]))
file2.write('\n')
previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
file2.write('{};{}\n'.format(key, value + sum(previous_values)))
previous_values.append(value)
file1.close()
file2.close()
There is a gotcha here, though. As I didn't import the os module (that is a default library) I used the characters \n to end the line. This will work fine under Linux and Mac, but under windows you should use \r\n. To avoid this issue you should import the os module and use os.linesep instead of \n.
import os
(...)
file2.write('{};{}{}'.format(key, data[key], os.linesep))
(...)
file2.write('{};{}{}'.format(key, value + sum(previous_values), os.linesep))
As a sidenote this is an example of how you could read your CSV without the need for the CSV module:
data = [i.split(";") for i in open('MonthData1.csv').read().split('\n')]
If you had a more complex CSV file, especially if it had strings that could have semi-colons within, you'd better go for the CSV module.
The pandas library, mentioned in other answers is a great tool. It will most certainly be able to handle any need you might have to deal with CSV data.
This code creates a new csv file with the same format as what's printed.
import pandas as pd #added
import csv
from collections import defaultdict, OrderedDict
def convert(data):
try:
return int(data)
except ValueError:
return 0
keys = [] #added
data_keys = [] #added
with open('MonthData1.csv', 'r') as file1:
read_file = csv.reader(file1, delimiter=';')
delheader = next(read_file)
data = defaultdict(int)
for line in read_file:
valuedata = max(0, sum([convert(i) for i in line[1:5]]))
data[line[0].split()[0]] += valuedata
for key in OrderedDict(sorted(data.items())):
print('{} {}'.format(key, data[key]))
keys.append(key) #added
data_keys.append(data[key]) #added
print("")
keys.append("") #added
data_keys.append("") #added
previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
print('{} {}'.format(key, value + sum(previous_values)))
keys.append(key) #added
data_keys.append(value + sum(previous_values)) #added
previous_values.append(value)
df = pd.DataFrame(data_keys,keys) #added
df.to_csv('new_csv_file.csv', header=False) #added
This is the version that does not use any imports at all
def convert(data):
try:
out = int(data)
except ValueError:
out = 0
return out ### try to avoid multiple return statements
with open('Monthdata1.csv', 'rb') as file1:
lines = file1.readlines()
data = [ [ d.strip() for d in l.split(';')] for l in lines[ 1 : : ] ]
myDict = dict()
for d in data:
key = d[0].split()[0]
value = max(0, sum([convert(i) for i in d[1:5]]))
try:
myDict[key] += value
except KeyError:
myDict[key] = value
s1=""
s2=""
accu = 0
for key in sorted( myDict.keys() ):
accu += myDict[key]
s1 += '{} {}\n'.format( key, myDict[key] )
s2 += '{} {}\n'.format( key, accu )
with open( 'out.txt', 'wb') as fPntr:
fPntr.write( s1 + "\n" + s2 )
This uses non-ordered dictionaries, though, such that sorted() may result in problems. So you actually might want to use datetime giving, e.g.:
import datetime
with open('Monthdata1.csv', 'rb') as file1:
lines = file1.readlines()
data = [ [ d.strip() for d in l.split(';')] for l in lines[ 1 : : ] ]
myDict = dict()
for d in data:
key = datetime.datetime.strptime( d[0].split()[0], '%d.%m.%Y' )
value = max(0, sum([convert(i) for i in d[1:5]]))
try:
myDict[key] += value
except KeyError:
myDict[key] = value
s1=""
s2=""
accu = 0
for key in sorted( myDict.keys() ):
accu += myDict[key]
s1 += '{} {}\n'.format( key.strftime('%d.%m.%y'), myDict[key] )
s2 += '{} {}\n'.format( key.strftime('%d.%m.%y'), accu )
with open( 'out.txt', 'wb') as fPntr:
fPntr.write( s1 + "\n" + s2 )
Note that I changed to the 2 digit year by using %y instead of %Y in the output. This formatting also adds a 0 to day and month.
The client includes 3 rows at the bottom that contain totals for me to reconcile against in my program. Only problem is that my program is exhausting the input file with readlines() before it can do anything else. Is there a way to keep the file from being exhausted during my get_recon_total function call?
#!/usr/bin/env python
# pre_process.py
import csv
import sys
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(in_obj)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
# Create a list to contain section header information
header_list = create_header_list(reader_list)
# Create dictionary that contains header list as the key,
# then all rows that match as a list of dictionaries.
master_dict = map_data(header_list, reader_list)
# Write data to processed file, create recon counts to compare
# to footer record
tot_cnt, rec_cnt, erec_cnt = write_data(master_dict, outfile, fieldnames)
print tot_cnt, rec_cnt, erec_cnt
def open_reader(file_obj):
'''
Uses DictReader from the csv module to take the first header line
as the fieldnames, then applies them to each element in the file.
Returns the DictReader object and the fieldnames being used (used
later when data is printed out with DictWriter.)
'''
reader = csv.DictReader(file_obj, delimiter=',')
return reader, reader.fieldnames
def create_header_list(in_obj):
p_id_list = []
for row in in_obj:
if (row['PEOPLE_ID'], row['DON_DATE']) not in p_id_list:
p_id_list.append((row['PEOPLE_ID'], row['DON_DATE']))
return p_id_list
def map_data(header_list, data_obj):
master_dict = {}
client_section_list = []
for element in header_list:
for row in data_obj:
if (row['PEOPLE_ID'], row['DON_DATE']) == element:
client_section_list.append(row)
element = list(element)
element_list = [client_section_list[0]['DEDUCT_AMT'],
client_section_list[0]['ND_AMT'],
client_section_list[0]['DEDUCT_YTD'],
client_section_list[0]['NONDEDUCT_YTD']
]
try:
element_list.append((float(client_section_list[0]['DEDUCT_YTD']) +
float(client_section_list[0]['NONDEDUCT_YTD'])
))
except ValueError:
pass
element.extend(element_list)
element = tuple(element)
master_dict[element] = client_section_list
client_section_list = []
return master_dict
def write_data(in_obj, outfile, in_fieldnames):
with open(outfile, 'wb') as writer_outfile:
writer = csv.writer(writer_outfile, delimiter=',')
dict_writer = csv.DictWriter(writer_outfile,
fieldnames=in_fieldnames,
extrasaction='ignore')
tot_cnt = 0
rec_cnt = 0
email_cnt = 0
for k, v in in_obj.iteritems():
writer_outfile.write(' -01- ')
writer.writerow(k)
rec_cnt += 1
for i, e in enumerate(v):
if v[i]['INT_CODE_EX0006'] != '' or v[i]['INT_CODE_EX0028'] != '':
email_cnt += 1
writer_outfile.write(' -02- ')
dict_writer.writerow(e)
tot_cnt += 1
return tot_cnt, rec_cnt, email_cnt
def get_recon_totals(in_obj):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in in_obj.readlines():
line = line.split(',')
if line[0] == 'T' and line[1] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Receipt Count':
print 'Receipt Count found.'
client_rec_cnt = line[2]
elif line[0] == 'T' and line[1] == 'Email Receipt Count':
print 'E-Receipt Count Found.'
client_erec_cnt = line[2]
return client_tot_cnt, client_rec_cnt, client_erec_cnt
if __name__ == '__main__':
main()
If your file is not very large, you can convert reader generator to a list of dcitonary , by calling list() on reader and then use it in your code instead of trying to read from the file directly.
Example -
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
with open(infile, 'rbU') as in_obj:
# Create reader object, get fieldnames for later on
reader, fieldnames = open_reader(in_obj)
reader_list = list(reader)
nav_tot_cnt, nav_rec_cnt, nav_erec_cnt = get_recon_totals(reader_list)
print nav_tot_cnt, nav_rec_cnt, nav_erec_cnt
# This switches the dictionary to a sorted list... necessary??
reader_list = sorted(reader_list, key=lambda key: (key['PEOPLE_ID'],
key['DON_DATE']))
.
.
def get_recon_totals(reader_list):
print in_obj
client_tot_cnt = 0
client_rec_cnt = 0
client_erec_cnt = 0
for line in reader_list: #line here is a dict
if line[<fieldname for first column>] == 'T' and line[<fieldname for secondcolumn>] == 'Total Amount':
print 'Total Amount found.'
client_tot_cnt = line[<fieldname for third column>]
.
. #continued like above
.
return client_tot_cnt, client_rec_cnt, client_erec_cnt