I'll try asking for help once more. My base code is ready: at the very beginning it converts all the negative values to 0, and after that it calculates the sum and the cumulative values of the CSV data:
import csv
from collections import defaultdict, OrderedDict

def convert(data):
    try:
        return int(data)
    except ValueError:
        return 0

with open('MonthData1.csv', 'r') as file1:
    read_file = csv.reader(file1, delimiter=';')
    delheader = next(read_file)
    data = defaultdict(int)
    for line in read_file:
        valuedata = max(0, sum([convert(i) for i in line[1:5]]))
        data[line[0].split()[0]] += valuedata

for key in OrderedDict(sorted(data.items())):
    print('{};{}'.format(key, data[key]))

print("")

previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
    print('{};{}'.format(key, value + sum(previous_values)))
    previous_values.append(value)
This code prints:
1.5.2018 245
2.5.2018 105
4.5.2018 87
1.5.2018 245
2.5.2018 350
4.5.2018 437
That's how I want the data printed: first the sum for each day, then the cumulative values. My question is: how can I write this data to a new CSV file in the same format as it is printed? So the new CSV file should contain the daily sums, a blank line, and then the cumulative values, exactly like the output above.
I have tried to do it myself (with datetime) and searched for answers, but I just can't find a way. I hope to get a solution this time; I'd appreciate it massively.
The data file as csv: https://files.fm/u/2vjppmgv
Data file in pastebin https://pastebin.com/Tw4aYdPc
I hope this can be done with the standard library only.
Writing a CSV is simply a matter of writing values separated by commas (or semicolons, in this case). A CSV is a plain text file (a .txt, if you will), so you can read and write it using Python's open() function if you'd like to.
You could actually get rid of the csv module entirely if you wish; I included an example of this at the end.
This version uses only the libraries that were available in your original code.
import csv
from collections import defaultdict, OrderedDict

def convert(data):
    try:
        return int(data)
    except ValueError:
        return 0

file1 = open('Monthdata1.csv', 'r')
file2 = open('result.csv', 'w')

read_file = csv.reader(file1, delimiter=';')
delheader = next(read_file)
data = defaultdict(int)
for line in read_file:
    valuedata = max(0, sum([convert(i) for i in line[1:5]]))
    data[line[0].split()[0]] += valuedata

for key in OrderedDict(sorted(data.items())):
    file2.write('{};{}\n'.format(key, data[key]))

file2.write('\n')

previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
    file2.write('{};{}\n'.format(key, value + sum(previous_values)))
    previous_values.append(value)

file1.close()
file2.close()
There is a gotcha here, though. As I didn't import the os module (which is part of the standard library), I used the characters \n to end each line. On Linux and Mac that is exactly what ends up in the file; on Windows, because the file is opened in text mode, Python translates \n to \r\n for you on write. If you want to be explicit about the line separator, you can import the os module and use os.linesep instead of \n.
import os
(...)
file2.write('{};{}{}'.format(key, data[key], os.linesep))
(...)
file2.write('{};{}{}'.format(key, value + sum(previous_values), os.linesep))
As a side note, this is an example of how you could read your CSV without the need for the csv module:
data = [i.split(";") for i in open('MonthData1.csv').read().split('\n')]
If you had a more complex CSV file, especially if it had strings that could have semi-colons within, you'd better go for the CSV module.
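For instance, here is a tiny made-up row where a quoted field contains a semicolon; csv.reader keeps it together as one field, while a naive split(';') breaks it apart:

import csv
from io import StringIO

# Hypothetical row: the quoted middle field contains a semicolon
raw = '1.5.2018 10:23;"first; second";5\n'

print(next(csv.reader(StringIO(raw), delimiter=';')))  # ['1.5.2018 10:23', 'first; second', '5']
print(raw.strip().split(';'))                          # ['1.5.2018 10:23', '"first', ' second"', '5']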
The pandas library, mentioned in other answers, is a great tool. It will most certainly be able to handle any need you might have when dealing with CSV data.
This code creates a new csv file with the same format as what's printed.
import pandas as pd  # added
import csv
from collections import defaultdict, OrderedDict

def convert(data):
    try:
        return int(data)
    except ValueError:
        return 0

keys = []       # added
data_keys = []  # added

with open('MonthData1.csv', 'r') as file1:
    read_file = csv.reader(file1, delimiter=';')
    delheader = next(read_file)
    data = defaultdict(int)
    for line in read_file:
        valuedata = max(0, sum([convert(i) for i in line[1:5]]))
        data[line[0].split()[0]] += valuedata

for key in OrderedDict(sorted(data.items())):
    print('{} {}'.format(key, data[key]))
    keys.append(key)             # added
    data_keys.append(data[key])  # added

print("")
keys.append("")       # added
data_keys.append("")  # added

previous_values = []
for key, value in OrderedDict(sorted(data.items())).items():
    print('{} {}'.format(key, value + sum(previous_values)))
    keys.append(key)                                # added
    data_keys.append(value + sum(previous_values))  # added
    previous_values.append(value)

df = pd.DataFrame(data_keys, keys)           # added
df.to_csv('new_csv_file.csv', header=False)  # added
This is the version that does not use any imports at all
def convert(data):
    try:
        out = int(data)
    except ValueError:
        out = 0
    return out  # try to avoid multiple return statements

with open('Monthdata1.csv', 'r') as file1:
    lines = file1.readlines()

data = [[d.strip() for d in l.split(';')] for l in lines[1:]]

myDict = dict()
for d in data:
    key = d[0].split()[0]
    value = max(0, sum([convert(i) for i in d[1:5]]))
    try:
        myDict[key] += value
    except KeyError:
        myDict[key] = value

s1 = ""
s2 = ""
accu = 0
for key in sorted(myDict.keys()):
    accu += myDict[key]
    s1 += '{} {}\n'.format(key, myDict[key])
    s2 += '{} {}\n'.format(key, accu)

with open('out.txt', 'w') as fPntr:
    fPntr.write(s1 + "\n" + s2)
Note, though, that the keys here are plain strings, so sorted() orders them lexicographically, which is not necessarily chronological order (e.g. '10.5.2018' sorts before '2.5.2018'). So you actually might want to use datetime, giving, e.g.:
import datetime

with open('Monthdata1.csv', 'r') as file1:
    lines = file1.readlines()

data = [[d.strip() for d in l.split(';')] for l in lines[1:]]

myDict = dict()
for d in data:
    key = datetime.datetime.strptime(d[0].split()[0], '%d.%m.%Y')
    value = max(0, sum([convert(i) for i in d[1:5]]))
    try:
        myDict[key] += value
    except KeyError:
        myDict[key] = value

s1 = ""
s2 = ""
accu = 0
for key in sorted(myDict.keys()):
    accu += myDict[key]
    s1 += '{} {}\n'.format(key.strftime('%d.%m.%y'), myDict[key])
    s2 += '{} {}\n'.format(key.strftime('%d.%m.%y'), accu)

with open('out.txt', 'w') as fPntr:
    fPntr.write(s1 + "\n" + s2)
Note that I switched to a two-digit year by using %y instead of %Y in the output. This formatting also zero-pads the day and month.
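For example, a quick illustration of the difference (purely for reference):

import datetime

d = datetime.datetime.strptime('4.5.2018', '%d.%m.%Y')
print(d.strftime('%d.%m.%Y'))  # 04.05.2018 (four-digit year, zero-padded day and month)
print(d.strftime('%d.%m.%y'))  # 04.05.18   (two-digit year)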
Related
I have a file in1.txt
info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"
Expected output: out.txt
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
I need only lines that have same info values and different data values to be written to out.txt.
But the current code removes every line that has the string "data" in it.
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
if 'data' not in line:
fout.write(line.strip()+'\n')
What I need is, for example: line 1 and line 2 have the same info="0x0000b573", and their data values "0x7" and "0x00000007" are the same number, so those lines should be removed.
You can use a regex:
import re
s = '''info="0x0000b573" data="0x7" id="sp. PCU(Si)"
info="0x0000b573" data="0x00000007" id="HI all. SHa"
info="0x00010AC3" data="0x00000003" id="abc_16. PS"
info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
info="0x205" data="0x00000010" id="cgc_15. PK"
info="0x205" data="0x10" id="cgsd_GH/BS (Scd)"'''
parsed_data = re.findall(r'info="([^"]+)" data="([^"]+)" id="[^"]+"', s, re.MULTILINE)
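# Convert the captured hex strings to integers and append each row's original line index, then sort so rows with the same info value end up adjacent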
parsed_data = sorted([list(map(lambda x: int(x, 16), i)) + [index] for index,i in enumerate(parsed_data)])
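# Walk the sorted rows two at a time (each info value appears exactly twice here) and collect the line numbers of pairs whose data values differ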
row_numbers = [j for i in [[parsed_data[i][-1], parsed_data[i+1][-1]] for i in range(0,len(parsed_data),2) if parsed_data[i][1] != parsed_data[i+1][1]] for j in i]
final_output = []
for index, line in enumerate(s.split('\n')):
    if index in row_numbers:
        final_output.append(line)
final_out_text = '\n'.join(final_output)
print(final_out_text)
# info="0x00010AC3" data="0x00000003" id="abc_16. PS"
# info="0x00010ac3" data="0x00000045" id="hB2_RC/BS (Spr)"
You could also try something like this, I think:
#!/usr/bin/python3
records = {}
items = []
info = []
data = []

with open("in.dat", "r") as fin:
    for line in fin:
        items = line.split(' ')
        info = items[0].split('=')
        data = items[1].split('=')
        try:
            key = info[1].strip('"').lower()
            value = str(int(data[1].strip('"'), 16))
            records[key][value] += 1
        except KeyError:
            try:
                records[key][value] = 1
            except KeyError:
                records[key] = {value: 1}

out = dict()
for key in records:
    for value in records[key]:
        if records[key][value] == 1:
            try:
                out[key].append(value)
            except KeyError:
                out[key] = [value]

with open("out.dat", "w") as fout:
    for key in out:
        for value in out[key]:
            fout.write(f"{key}={value}\n")
Something like this could work:
found_info_values = []
with open("in.txt", "r") as fin,open("out.txt", "w") as fout:
for line in fin:
info = line.split('"')[1]
if info not in found_info_values:
fout.write(line.strip()+'\n')
found_info_values += info
I am reading from a huge file (232MB) line by line.
First, I match each line against a regular expression.
Then, for each line, I write to a different city.txt file under the 'report' directory, according to the city name in that line. However, this process takes a while. I am wondering if there is any way of speeding it up?
Example of input file: (each column split by a \t)
2015-02-03 19:20 Sane Diebgo Music 692.08 Cash
Actually, I have tested the code both with and without writing to the different files (simply processing the large file and producing the two dicts), and the time difference is huge: 80% of the time is spent writing to the different files.
def processFile(file):
    pattern = re.compile(r"(\d{4}-\d{2}-\d{2})\t(\d{2}:\d{2})\t(.+)\t(.+)\t(\d+\.\d+|\d+)\t(\w+)\n")
    f = open(file)
    total_sale = 0
    city_dict = dict()
    categories_dict = dict()
    os.makedirs("report", exist_ok = True)
    for line in f:
        valid_entry = pattern.search(line)
        if valid_entry == None:
            print("Invalid entry: '{}'".format(line.strip()))
            continue
        else:
            entry_sale = float(valid_entry.group(5))
            total_sale += entry_sale
            city_dict.update({valid_entry.group(3) : city_dict.get(valid_entry.group(3), 0) + entry_sale})
            categories_dict.update({valid_entry.group(4) : categories_dict.get(valid_entry.group(4), 0) + entry_sale})
            filename = "report/" + valid_entry.group(3) + ".txt"
            if os.path.exists(filename):
                city_file = open(filename, "a")
                city_file.write(valid_entry.group(0))
                city_file.close()
            else:
                city_file = open(filename, "w")
                city_file.write(valid_entry.group(0))
                city_file.close()
    f.close()
    return (city_dict, categories_dict, total_sale)
The dictionary lookups and updates could be improved by using defaultdict:
from collections import defaultdict
city_dict = defaultdict(float)
categories_dict = defaultdict(float)
...
city = valid_entry.group(3)
category = valid_entry.group(4)
...
city_dict[city] += entry_sale
categories_dict[category] += entry_sale
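Put together, a self-contained sketch of the tallying with defaultdict (the sample line is made up, and the regex and group numbers are the ones from your processFile):

import re
from collections import defaultdict

pattern = re.compile(r"(\d{4}-\d{2}-\d{2})\t(\d{2}:\d{2})\t(.+)\t(.+)\t(\d+\.\d+|\d+)\t(\w+)\n")
sample = "2015-02-03\t19:20\tSane Diebgo\tMusic\t692.08\tCash\n"

city_dict = defaultdict(float)        # missing keys start at 0.0
categories_dict = defaultdict(float)
total_sale = 0

for line in [sample]:                 # in the real code this is the loop over the file
    valid_entry = pattern.search(line)
    if valid_entry is None:
        continue
    entry_sale = float(valid_entry.group(5))
    total_sale += entry_sale
    city_dict[valid_entry.group(3)] += entry_sale         # no .get()/.update() dance needed
    categories_dict[valid_entry.group(4)] += entry_sale

print(dict(city_dict), dict(categories_dict), total_sale)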
I have a dictionary of about 1500 emoji characters in a JSON file, and I wanted to import those into my Python code. I read the file and converted it to a Python dictionary, but now I have only 143 records. How can I import all the emoji into my code? This is my code:
import sys
import ast
file = open('emojidescription.json','r').read()
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
emoji_dictionary = ast.literal_eval(file.translate(non_bmp_map))
#word = word.replaceAll(",", " ");
keys = list(emoji_dictionary["emojis"][0].keys())
values = list(emoji_dictionary["emojis"][0].values())
file_write = open('output.txt','a')
print(len(keys))
for i in range(len(keys)):
    try:
        content = 'word = word.replace("{0}", "{1}")'.format(keys[i], values[i][0])
    except Exception as e:
        content = 'word = word.replace("{0}", "{1}")'.format(keys[i], '')
    #file.write()
    #print(keys[i],values[i])
    print(content)
file_write.close()
This is my input sample
{
"emojis": [
{
"👨🎓": ["Graduate"],
"©": ["Copy right"],
"®": ["Registered"],
"👨👩👧": ["family"],
"👩❤️💋👩": ["love"],
"™": ["trademark"],
"👨❤👨": ["love"],
"⌚": ["time"],
"⌛": ["wait"],
"⭐": ["star"],
"🐘": ["Elephant"],
"🐕": ["Cat"],
"🐜": ["ant"],
"🐔": ["cock"],
"🐓": ["cock"],
This is my result; the 143 is the number of emoji records.
143
word = word.replace("����", "family")
word = word.replace("Ⓜ", "")
word = word.replace("♥", "")
word = word.replace("♠", "")
word = word.replace("⌛", "wait")
I'm not sure why you're seeing only 143 records from an input of 1500 (your sample doesn't seem to display this behavior).
The setup doesn't seem to do anything useful, but what you're doing boils down to (simplified and skipping lots of details):
d = ..read json as python dict.
keys = d.keys()
values = d.values()
for i in range(len(keys)):
    key = keys[i]
    value = values[i]
and that should be completely correct. There are better ways to do this in Python, however, like using the zip function:
d = ..read json as python dict.
keys = d.keys()
values = d.values()
for key, value in zip(keys, values): # zip picks pair-wise elements
    ...
or simply asking the dict for its items:
for key, value in d.items():
    ...
The json module makes reading and writing json much simpler (and safer), and using the idiom from above the problem reduces to this:
import json
emojis = json.load(open('emoji.json', 'rb'))
with open('output.py', 'wb') as fp:
    for k, v in emojis['emojis'][0].items():
        val = u'word = word.replace("{0}", "{1}")\n'.format(k, v[0] if v else "")
        fp.write(val.encode('u8'))
Why do you replace all emojis with 0xfffd in the lines:
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
emoji_dictionary = ast.literal_eval(file.translate(non_bmp_map))
Just don't do this!
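A quick illustration of the damage (two arbitrary emoji): after translate, distinct emoji above the Basic Multilingual Plane all collapse into the same U+FFFD replacement character, so when the translated text is parsed back into a dict the duplicate keys overwrite each other, which is very likely where your missing records go.

import sys

non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

print(repr('🐘'.translate(non_bmp_map)))                           # '\ufffd'
print('🐘'.translate(non_bmp_map) == '🐜'.translate(non_bmp_map))  # True: two different emoji become the same key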
Using json:
import json
with open('emojidescription.json', encoding="utf8") as emojis:
    emojis = json.load(emojis)

with open('output.txt', 'a', encoding="utf8") as output:
    for emoji, text in emojis["emojis"][0].items():
        text = "" if not text else text[0]
        output.write('word = word.replace("{0}", "{1}")\n'.format(emoji, text))
I am doing text processing and using the readline() function as follows:
ifd = open(...)
for line in ifd:
    while (condition):
        do something...
        line = ifd.readline()
        condition = ....
        # Here, when the condition becomes false, I need to rewind the pointer so that the 'for' loop reads the same line again.
ifd.seek() followed by readline() is giving me just a '\n' character. How do I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
labtestnames = sorted(tmp)

# Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")

# read the header
header = ifd.readline()  # Do nothing with this line. Skip

# Write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl + "\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"

prevTup = (0, 0, 0)
rowComplete = 0
k = 0
for line in ifd:
    k = k + 1
    if (k == 200): break
    items = line.rstrip("\n").split("\t")
    if ((items[0] == '')):
        continue
    newline = list('' for i in range(lenFields))
    newline[0], newline[1], newline[3], newline[2], newline[4] = items[0], items[1], items[3], items[2], items[4]
    ltests = []
    ltvals = []
    while (cmp(prevTup, (items[0], items[1], items[3])) == 0):  # If the same mrn, lab_number and specimen_id then fill the same row. else create a new row.
        ltests.append(items[6])
        ltvals.append(items[7])
        pos = ifd.tell()
        line = ifd.readline()
        prevTup = (items[0], items[1], items[3])
        items = line.rstrip("\n").split("\t")
        rowComplete = 1
    if (rowComplete == 1):  # If the row is completed, prepare newline and write into outfile
        indices = [labtestnames.index(x) for x in ltests]
        j = 0
        ifd.seek(pos)
        for i in indices:
            newline[i + offset] = ltvals[j]
            j = j + 1
    if (rowComplete == 0):
        currTup = (items[0], items[1], items[3])
        ltests = items[6]
        ltvals = items[7]
        pos = ifd.tell()
        line = ifd.readline()
        items = line.rstrip("\n").split("\t")
        newTup = (items[0], items[1], items[3])
        if (cmp(currTup, newTup) == 0):
            prevTup = currTup
            ifd.seek(pos)
            continue
        else:
            indices = labtestnames.index(ltests)
            newline[indices + offset] = ltvals
            ofd.write(newline + "\n")
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
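A tiny illustration with made-up tuples of how groupby clusters only contiguous items with equal keys:

import itertools as IT

rows = [('a', 1), ('a', 2), ('b', 3), ('a', 4)]
for key, group in IT.groupby(rows, key=lambda r: r[0]):
    print(key, list(group))
# a [('a', 1), ('a', 2)]
# b [('b', 3)]
# a [('a', 4)]   <- a second 'a' group, because its rows were not adjacent

This is why groupby works here: the input file is assumed to list all the lines for one (mrn, specimen_id, lab_num) consecutively.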
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv

inFile = 'curious.dat'
outFile = 'curious.out'

def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])

fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()

with open(inFile, 'rb') as ifd:
    reader = csv.DictReader(ifd, delimiter='\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for key, group in IT.groupby(reader, key=mykey):
            new = {}
            row = next(group)
            for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
                new[key] = row[key]
            new[row['labtest']] = row['result_val']
            for row in group:
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    r = None
    while True:
        r = yield r or next(fp)
        if r:
            yield None

from random import randint

with open('filename') as fp:
    buf = buflines(fp)
    for line in buf:
        print line
        if randint(1, 100) > 80:
            print 'ONCE AGAIN::'
            buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator. On the next iteration you will be reading the same item once again.
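For instance, a minimal trace of the send-back behaviour, using a plain list in place of the file (hypothetical data):

lines = iter(['a\n', 'b\n', 'c\n'])
buf = buflines(lines)
print(repr(next(buf)))   # 'a\n'
buf.send('a\n')          # push the line back into the generator
print(repr(next(buf)))   # 'a\n' again
print(repr(next(buf)))   # 'b\n'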
I need a function that takes a file of 500 complaints and returns a dict with the number of the complaint as the key, and a tuple of (make of the car, date of the complaint, crash True or False, city, state) as the value.
ex) mydict("Complaints.txt")[416]
('CHRYSLER', datetime.date(1995, 1, 9), False, 'ARCADIA', 'FL')
So far I have:
from collections import defaultdict
import datetime
def fieldict(filename):
    with open(filename) as f:
        x = [line.split('\t')[0].strip() for line in f]  # list of complaint numbers
        y = line.split('\t')  # list of full complaints
        d = {}
        for j in x:
            Y = True
            N = False
            d[j] = tuple(y[2], datetime.date(y[7]), y[6], y[12], y[13])  # dict with number of complaint as key and tuple with index as values
        return d
y is the entire complaint broken up into a list, with the \t characters removed. If someone could point me in the right direction it would be much appreciated.
You could also lean on the csv module a bit (untested):
import csv
import datetime

def fieldict(filename):
    fullDict = {}
    with open(filename) as f:
        reader = csv.reader(f, delimiter='\t')
        for y in reader:
            fullDict[y[0].strip()] = (y[2], datetime.date(y[7]), y[6], y[12], y[13])
    return fullDict

if __name__ == "__main__":
    mydict = fieldict("Complaints.txt")
    print mydict[416]
If I am understanding you correctly, I think this is what you are looking for.
import datetime

def fieldict(filename):
    returnDict = {}
    with open(filename) as f:
        for line in f:
            lineList = line.split('\t')
            index = lineList[0].strip()
            complaint = (lineList[2], datetime.date(lineList[7]), lineList[6], lineList[12], lineList[13])
            returnDict[index] = complaint
    return returnDict

if __name__ == "__main__":
    mydict = fieldict("Complaints.txt")
    print mydict[416]