Long story short, I'm trying to calibrate a thermometer. I have a CSV from my reference thermometer with 16k records (CSV-A) and a CSV from the thermometer to be calibrated with about 52k records (CSV-B). I need to compare the records from CSV-B to CSV-A to find, for each CSV-A record, the time where CSV-B is closest to it, and store that time and value in a different array.
I believe I have the basic logic for checking the datetimes in place, but the problem is that I have to iterate through an array of 52,000 items 16,000 times. I've tried implementing both multiprocessing and multithreading, but the script has yet to finish running.
import numpy as np, csv, multiprocessing as mp
from datetime import datetime as d
from multiprocessing import Process, Pool

d_rt_t = []
d_rh_t = []
d_dt = []
d_dh = []
d_rt = []
d_rh = []

# create empty output files named with the current timestamp
nts = d.now().timestamp()
with open(f"calib_temp-{nts}.csv", 'w') as ctw:
    pass
with open(f"calib_humid-{nts}.csv", 'w') as chw:
    pass

def find_nearest(array, value):
    # linear scan: re-parses every timestamp on every call
    nearest = min(array, key=lambda x: abs(d.strptime(x[1], '%m/%d/%Y %H:%M:%S:%f') - d.strptime(value, '%Y-%m-%d %H:%M:%S')))
    return nearest

def comp_d_rt_t():
    for row in d_rt:
        pool = Pool()
        d_rt_t.append([pool.map(find_nearest, d_dt, row[1]), row[1]])

def comp_d_rh_t():
    for row in d_rh:
        d_rh_t.append([pool.map(find_nearest, d_dt, row[1]), row[1]])

#str2date = lambda x: d.strptime(x.decode("utf-8"), '%m/%d/%Y %H:%M:%S:%f')
#str2date2 = lambda x: d.strptime(x.decode("utf-8"), '%Y-%m-%d %H:%M:%S')

# read each input CSV into a list of [col0, col1] rows (col1 holds the timestamp)
with open("dht-temp.csv", 'r', newline='') as ddt:
    fr_dt = csv.reader(ddt, delimiter=',')
    for row in fr_dt:
        d_dt.append([row[0], row[1]])
    ddt.close
with open("dht-humid.csv", 'r', newline='') as ddh:
    fr_dh = csv.reader(ddh, delimiter=',')
    for row in fr_dh:
        d_dh.append([row[0], row[1]])
    ddh.close
with open("ref-temp.csv", 'r', newline='') as drt:
    fr_rt = csv.reader(drt, delimiter=',')
    for row in fr_rt:
        d_rt.append([row[0], row[1]])
    drt.close
with open("ref-humid.csv", 'r', newline='') as drh:
    fr_rh = csv.reader(drh, delimiter=',')
    for row in fr_rh:
        d_rh.append([row[0], row[1]])
    drh.close

p1 = Process(target=comp_d_rt_t, args=(d_dt, row[1]))
p2 = Process(target=comp_d_rh_t, args=(d_dh, row[1]))
p1.start()
p2.start()
p1.join()
p2.join()
print(d_rt_t)

with open(f"calib_temp-{nts}.csv", 'a', newline='') as ct:
    c = csv.writer(ct, delimiter=',')
    for row in d_rt_t:
        dt = np.where(d_dt == row[1])
        rt = np.where(d_rt == row[1])
        print(rt)
        c.writerow([dt[0], rt[0]])
with open(f"calib_humid-{nts}.csv", 'a', newline='') as ch:
    c = csv.writer(ch, delimiter=',')
    for row in d_rh_t:
        dh = np.where(d_dh == row[1])
        print(dh)
        rh = np.where(d_rh == row[1])
        print(rh)
        c.writerow([dh[0], rh[0]])
I've moved the for loops around a bit; before, they just called numpy.append, which in turn called the find_nearest method.
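For what it's worth, the quadratic scan (and the datetime re-parsing inside min) can be avoided without any multiprocessing: parse the 52k timestamps once, sort them, and binary-search each reference time. Below is a minimal sketch of that idea for the temperature pair, assuming the file names, column layout, and timestamp formats shown above; the humidity pair would be handled identically.

import csv
from bisect import bisect_left
from datetime import datetime

def load_sorted(path, fmt):
    # parse each timestamp once (column 1 holds the time string) and sort by time
    with open(path, newline='') as f:
        pairs = [(datetime.strptime(r[1], fmt), r) for r in csv.reader(f)]
    pairs.sort(key=lambda p: p[0])
    return pairs

def nearest(times, rows, when):
    # binary search for the insertion point, then pick the closer neighbour
    i = bisect_left(times, when)
    candidates = [j for j in (i - 1, i) if 0 <= j < len(times)]
    best = min(candidates, key=lambda j: abs(times[j] - when))
    return rows[best]

pairs = load_sorted("dht-temp.csv", '%m/%d/%Y %H:%M:%S:%f')
times = [t for t, _ in pairs]
rows = [r for _, r in pairs]

d_rt_t = []
with open("ref-temp.csv", newline='') as f:
    for ref in csv.reader(f):
        when = datetime.strptime(ref[1], '%Y-%m-%d %H:%M:%S')
        d_rt_t.append([nearest(times, rows, when), ref[1]])

That replaces roughly 16,000 × 52,000 datetime parses and comparisons with one parse per row plus about 16,000 binary searches.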
I'm sensing the vibrations of a machine live with an Arduino. For that I use an accelerometer, an Arduino Uno board, and a Python script that I wrote. The code is intended to read the sensor data from the serial port (g), calculate the root mean square of the sequence (RMS), and save the data in a CSV file. The problem I have is that my script stops showing and saving the data out of nowhere. Do you see any mistakes in the code? I couldn't identify any relationship between the different incidents, because each one happened at a different time. Here it is:
import csv
import time  # needed for the time.sleep(0.2) below
from math import sqrt
import serial
from itertools import count
import os

ruta = 'C:/Users/jabde/OneDrive/Documentos/Juan/PhD/Ensayos/Acelerómetro/archivo.csv'
nombre_archivo = input("Ingrese el nombre del archivo: ")
ruta_completa = os.path.join(os.path.dirname(ruta), nombre_archivo + '.csv')
os.chdir(os.path.dirname(ruta))

arduinoData = serial.Serial('com3', 115200)
fieldnames = ["t", "g", "RMS"]
i = 0
t = 0
g = 0.15
RMS = 0.1425
suma_cuadrados = 0

with open(ruta_completa, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()

while True:
    with open(ruta_completa, 'a', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        try:
            g = arduinoData.readline()
            g = float(g) / 800
            t = t + 0.2
            t = round(t, 1)
            i = i + 1
            cuadrados = g * g
            suma_cuadrados = suma_cuadrados + cuadrados
            RMS = suma_cuadrados / i
            RMS = sqrt(RMS)
            info = {
                "g": g,
                "t": t,
                "RMS": RMS
            }
            with open(ruta_completa, 'a', newline='') as csv_file:
                csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                csv_writer.writerow(info)
            print(t, g, RMS)
            time.sleep(0.2)
        except:
            pass
Thanks in advance!
PS: this is my first post, I'm a newbie :)
I thought it could be a space problem on my laptop, so I changed the path where the CSV file was being saved, but nothing changed.
I would start with something like this and see where it got me. Note that some of your code is commented out and/or mocked for testing on my part.
The idea is to handle exceptions via an outer loop while doing expected work in the inner loop.
import csv
import math
#import serial
#import os
import time
import random

#ruta = 'C:/Users/jabde/OneDrive/Documentos/Juan/PhD/Ensayos/Acelerómetro/archivo.csv'
#nombre_archivo = input("Ingrese el nombre del archivo: ")
#ruta_completa = os.path.join(os.path.dirname(ruta), nombre_archivo + '.csv')
ruta_completa = "out.csv"
#os.chdir(os.path.dirname(ruta))

#arduinoData = serial.Serial('com3', 115200)
class arduinoData:
    readline = lambda: 100_000 * random.random()

start_time = int(time.time())
suma_cuadrados = 0
i = 1

with open(ruta_completa, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["t", "g", "RMS"])

while True:
    try:
        with open(ruta_completa, 'a', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            while True:
                if random.randint(0, 10) == 10:
                    raise Exception("Test Exception")
                t = round(time.time() - start_time, 1)
                g = float(arduinoData.readline()) / 800
                suma_cuadrados += (g * g)
                RMS = math.sqrt(suma_cuadrados / i)
                row = [t, g, RMS]  # order matches the header written above
                csv_writer.writerow(row)
                print(row)
                i += 1
                time.sleep(0.2)
    except Exception as e:
        print(f"Error: {e}")
        print("\tTrying again in 5 seconds...")
        time.sleep(5)
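Note the design choice: any exception (a serial glitch, a locked file, a bad reading) unwinds out of the inner loop to the outer try, which reports it, waits five seconds, and reopens the file with a fresh writer. That keeps the expected work tight and makes failures visible, instead of silently swallowing them the way the original bare except: pass did.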
This function reads from a text file, re-formats the contents, and then writes them to a CSV. I'm trying to use threading to parallelize the for i in lines loop; it is the longest part of a larger script and takes up most of the run time, because the list lines contains thousands of elements. Can someone help me straighten this out? Doing this synchronously instead of in parallel takes a huge amount of time. I have seen many answers to similar questions, but so far I haven't understood them well enough to implement them correctly.
def sheets(i):
    # time format for spreadsheet
    dt_time = datetime.now().strftime('%m/%d|%H:%M')
    # for league name (NFL, NBA, NHL etc.) in list containing league names
    for league_name in leagues2:
        league_name = league_name.split('|')[0]
        with open(final_stats_path, 'r+') as lines:
            lines = lines.readlines()
        # i = one long string containing details about the event in the loop, e.g. sport, game day, game id, home team name
        for i in lines:
            i = i.split(',')
            minprice = i[6]
            totaltix = i[5]
            event_date = i[2]
            try:
                dayofweek = datetime.strptime(event_date, '%Y-%m-%d').strftime('%A')
            except:
                continue
            event_date = i[2][2:]
            event_date = str(event_date).split('-')
            event_date = event_date[1] + '/' + event_date[2]
            sport = i[4]
            event = i[1].replace('Basketball', '').replace('\n', '')
            away = i[8].replace('Basketball', '').replace('\n', '')
            eventid = i[0]
            event_home = i[9].replace('Basketball', '').replace('\n', '')
            event = event.split(' at ')[0]
            tixdata = str(totaltix)
            eventid = 'https://pro.stubhub.com/simweb/sim/services/priceanalysis?eventId=' + str(eventid) + '&sectionId=0'
            directory = root + '\data' + '\\' + sport + '\\'
            report = directory + 'report.xlsx'
            fname = directory + 'teams.txt'
            eventleague = sport
            f = open(directory + 'acronym.txt', 'r+')
            lines_2 = f.readlines()
            for qt in lines_2:
                qt = qt.split('-')
                compare = qt[1]
                if event_home in compare:
                    event_home = qt[0]
                else:
                    pass
            troop = []
            d = {
                'ID': eventid,
                'Date': event_date,
                'Day': dayofweek,
                'Away': away,
            }
            s = {
                'time': tixdata
            }
            numbers = event_home + '.txt'
            numbers_new = 'bk\\bk_' + numbers
            with open(directory + numbers_new, 'a+') as y:
                pass
            with open(directory + numbers, 'a+') as o:
                pass
            with open(directory + numbers, 'r+') as g:
                for row in g:
                    if str(eventid) in row:
                        #print('the event is in the list')
                        row_update = row.replace('}', ", '" + dt_time + "': '" + tixdata + "'}")
                        with open(directory + numbers_new, 'a+') as y:
                            y.write(row_update)
                        break
                else:
                    with open(directory + numbers, 'a+') as p:
                        #print('the event is not in the list')
                        p.write(str(d) + '\n')
                    with open(directory + numbers_new, 'a+') as n:
                        n.write(str(d) + '\n')
            sizefile = os.path.getsize(directory + numbers_new)
            if sizefile > 0:
                shutil.copy(directory + numbers_new, directory + numbers)
                open(directory + numbers_new, 'w').close()
            else:
                pass
            df = []
            with open(directory + numbers, 'r+') as t:
                for row in t:
                    b = eval(row)
                    dfs = df.append(b)
            df = pd.DataFrame(df)
            yark = list(df.columns)[:-5]
            zed = ['ID', 'Date', 'Day', 'Away']
            columns = zed + yark
            try:
                df = df[columns]
            except:
                pass
            df.index = range(1, 2*len(df)+1, 2)
            df = df.reindex(index=range(2*len(df)))
            writer = pd.ExcelWriter(directory + event_home + '.xlsx', engine='xlsxwriter')
            try:
                df.to_excel(writer, sheet_name=event_home)
            except:
                continue
            workbook = writer.book
            worksheet = writer.sheets[event_home]
            format1 = workbook.add_format({'num_format': '#,##0.00'})
            worksheet.set_column('A:ZZ', 18, format1)
            writer.save()

if __name__ == "__main__":
    pool = ThreadPool(8)  # Make the Pool of workers
    results = pool.map(sheets)  # Open the urls in their own threads
    pool.close()  # close the pool and wait for the work to finish
    pool.join()
    ##get_numbers()
    ##stats_to_csv()
    ##stats_to_html()
    #sheets()
Try changing the following line:
results = pool.map(sheets)
to:
results = pool.map(sheets, range(8))
As written, pool.map(sheets) raises a TypeError: map() takes a function and an iterable, and it calls sheets(i) once for each element of the iterable, so without one the pool has no work to hand to its eight threads.
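To illustrate the pattern in isolation, here is a self-contained toy sketch; the work function is a stand-in, not the sheets logic from the question, and ThreadPool is assumed to come from multiprocessing.pool:

from multiprocessing.pool import ThreadPool

def work(i):
    # stand-in for sheets(i); each call receives one element of the iterable
    return i * i

if __name__ == "__main__":
    with ThreadPool(8) as pool:
        results = pool.map(work, range(8))
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]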
There is a csv file, say A.csv, having content:
Place,Hotel,Food,Fare
Norway,Regal,NonVeg,5000
Poland,Jenny,Italiano,6000
Norway,Suzane,Vegeterian,4000
Norway,Regal,NonVeg,5000
I have to parse this CSV and produce output based on arguments passed at the command prompt.
Example 1:
mycode.py Place
Desired output is:
Place,Fare
Norway,14000
Poland,6000
Example 2:
mycode.py Place Hotel
Desired output is:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
So it is clear from the above examples that whatever columns you pass as arguments, the output groups rows that share those column values and sums their Fare column.
Below is my code. I am able to pass arguments and get output, but I am stuck on the sum of Fare. Can anyone help me with this?
import sys
import csv
import collections

d = collections.defaultdict(list)
Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
    reader = csv.reader(file)
    for row in reader:
        Data.append(row)
    for x in range(1, argv_len):
        Argvs.append(sys.argv[x])
    Argvs.append('Fare')
    for input in Argvs:
        for y in range(0, len(Data[0])):
            if(input == Data[0][y]):
                for z in range(1, len(Data)):
                    Result.append(Data[z][y])
                break
        Final.append(Result)
        Result = []
    New = []
    NewFinal = []
    for x in range(0, len(Final[0])):
        for y in range(0, len(Final)):
            New.append(Final[y][x])
        NewFinal.append(New)
        New = []
    out = {}
    for a in NewFinal:
        out.setdefault(a[0], []).append(int(a[-1]))
    with open("output.csv", "wb") as csv_file:
        writer = csv.writer(csv_file, dialect='excel', delimiter=',')
        writer.writerow(Argvs)
        for k, v in out.iteritems():
            writer.writerow((k, sum(v)))
except Exception, e:
    print str(e)
finally:
    file.close()
I edited the code and tried to group the rows. Now I am able to get the aggregate of the Fare, but not the desired output.
So when I am passing:
mycode.py Place Hotel
Instead of:
Place,Hotel,Fare
Norway,Regal,10000
Poland,Jenny,6000
Norway,Suzane,4000
I am getting:
Place,Hotel,Fare
Norway,14000
Poland,6000
Finally I managed to get my desired output.
Below I am sharing the final code.
import sys
import csv

Data = []
Result = []
Final = []
Argvs = []
argv_len = len(sys.argv)
index = 0
input = ''
file = open('A.csv', 'rb')
try:
    reader = csv.reader(file)
    for row in reader:
        Data.append(row)
    for x in range(1, argv_len):
        Argvs.append(sys.argv[x])
    Argvs.append('Fare')
    for input in Argvs:
        for y in range(0, len(Data[0])):
            if(input == Data[0][y]):
                for z in range(1, len(Data)):
                    Result.append(Data[z][y])
                break
        Final.append(Result)
        Result = []
    New = []
    NewFinal = []
    for x in range(0, len(Final[0])):
        for y in range(0, len(Final)):
            New.append(Final[y][x])
        NewFinal.append(New)
        New = []
    out = {}
    for a in NewFinal:
        count_val = a[-1]
        del a[-1]
        key_val = ','.join(a)
        out.setdefault(key_val.strip('"'), []).append(int(count_val))
    with open("output.csv", "wb") as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar=' ')
        writer.writerow(Argvs)
        for k, v in out.iteritems():
            writer.writerow((k, sum(v)))
except Exception, e:
    print str(e)
finally:
    file.close()
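For comparison, the same grouped sum can be written far more compactly with csv.DictReader and a defaultdict. This is only a sketch, written for Python 3 (the code above is Python 2) and assuming the A.csv layout from the question:

import sys
import csv
from collections import defaultdict

group_cols = sys.argv[1:]            # e.g. ['Place', 'Hotel']
totals = defaultdict(int)

with open('A.csv', newline='') as f:
    for row in csv.DictReader(f):
        key = tuple(row[c] for c in group_cols)  # group by the requested columns
        totals[key] += int(row['Fare'])

writer = csv.writer(sys.stdout)
writer.writerow(group_cols + ['Fare'])
for key, fare in totals.items():
    writer.writerow(list(key) + [fare])

Running mycode.py Place Hotel would then print the grouped totals shown in the desired output.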
Sorry if I do something wrong, I'm new here.
I've got a problem with my Python code.
I have a sorted list built from a dictionary. The sorted list looks like:
sorted_Dict = [('158124', 26708), ('146127', 12738), ('21068', 9949),
('274186', 8255), ('189509', 6550), ('165758', 5346), ...]
I now want to print them to an xls file which should look like:
x y
'158124' 26708
I have to plot it in Excel, but I also want to plot it in Python (which is not necessary, but cool), and I don't get how to do this. Here is my whole code. Thank you for any help.
Cheers
Sven
# -*- coding: iso-8859-1 -*-
from __future__ import division
import csv
import operator

def computeSoldProducts():
    catalog = csv.reader(open("data/catalog.csv", "r"))
    sales = csv.reader(open("data/sales_3yr.csv", "r"))
    output = open("output.csv", "a")
    catalogIDs = set()
    lineNumber = 0
    # read the catalog
    for line in catalog:
        id = line[0]
        if lineNumber <> 0:
            catalogIDs.add(eval(id))
        lineNumber = 1
    soldItems = set()
    lineNumber = 0
    # read the sales
    for line in sales:
        id = line[6]
        if lineNumber <> 0:
            soldItems.add(eval(id))
        lineNumber = 1
    print "anzahl Produkte:", len(catalogIDs)
    print "verkaufte Produkte", len(soldItems)
    notSoldIDs = catalogIDs - soldItems
    print len(notSoldIDs)
    catalog = csv.reader(open("data/catalog.csv", "r"))
    sales = csv.reader(open("data/sales_3yr.csv", "r"))
    soldDict = {}
    for k in catalog:
        soldDict[str(k[0])] = 0
    for item in sales:
        if str(item[6]) in soldDict:
            soldDict[str(item[6])] += 1
    sorted_soldDict = sorted(soldDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    print sorted_soldDict
    for k in sorted_soldDict:
        output.write(sorted_soldDict[k])
    print "done"

computeSoldProducts()
Straight from the docs for the csv module
import csv

with open('text.csv', 'wb') as csvfile:
    fwriter = csv.writer(csvfile)
    for x in sorted_list:
        fwriter.writerow(x)
You can then open this csv file in excel.
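As for the optional "plot it in Python" part of the question, here is a minimal matplotlib sketch; it assumes sorted_list holds (id, count) pairs like the sorted_Dict from the question:

import matplotlib.pyplot as plt

sorted_list = [('158124', 26708), ('146127', 12738), ('21068', 9949)]
ids, counts = zip(*sorted_list)          # split into x labels and y values

plt.bar(range(len(counts)), counts)      # one bar per product
plt.xticks(range(len(ids)), ids, rotation=90)
plt.xlabel('product id')
plt.ylabel('times sold')
plt.tight_layout()
plt.show()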
One alternative is to use my library pyexcel, documentation is here: http://pythonhosted.org//pyexcel/
import pyexcel

sorted_list_of_sets = ....
writer = pyexcel.Writer("output.csv")
writer.write_array(sorted_list_of_sets)
writer.close()
Your original solution becomes the following if pyexcel is used:
import pyexcel
import operator

def computeSoldProducts():
    catalog = pyexcel.SeriesReader("data/catalog.csv")
    sales = pyexcel.SeriesReader("data/sales_3yr.csv")
    print "anzahl Produkte:", catalog.number_of_rows()
    print "verkaufte Produkte", sales.number_of_rows()
    product_list = catalog.column_at(0)
    solditem_list = sales.column_at(6)
    soldOnes = []
    for item in solditem_list:
        if item not in soldOnes:
            soldOnes.append(item)
    notSoldIDs = catalog.number_of_rows() - len(soldOnes)
    print notSoldIDs
    print product_list
    print solditem_list
    # initialize the soldDict
    zeros_array = [0] * len(product_list)
    soldDict = dict(zip(product_list, zeros_array))
    for item in solditem_list:
        if item in product_list:
            soldDict[item] += 1
    sorted_soldDict = sorted(soldDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    print sorted_soldDict
    writer = pyexcel.Writer("output.csv")
    writer.write_row(["product", "number"])
    writer.write_array(sorted_soldDict)
    writer.close()
    print "done"

computeSoldProducts()
I am doing text processing using the readline() function, as follows:

ifd = open(...)
for line in ifd:
    while (condition):
        # do something...
        line = ifd.readline()
        condition = ....
        # Here, when the condition becomes false, I need to rewind the pointer so that the 'for' loop reads the same line again.

ifd.seek(-1, 1) followed by readline() is giving me just a '\n' character. How do I rewind the pointer so that the whole line is read again?
>>> ifd.seek(-1,1)
>>> line = ifd.readline()
>>> line
'\n'
Here is my code
labtestnames = sorted(tmp)

# Now read each line in the inFile and write into outFile
ifd = open(inFile, "r")
ofd = open(outFile, "w")

# read the header
header = ifd.readline()  # Do nothing with this line. Skip

# write header into the output file
nl = "mrn\tspecimen_id\tlab_number\tlogin_dt\tfluid"
offset = len(nl.split("\t"))
nl = nl + "\t" + "\t".join(labtestnames)
ofd.write(nl + "\n")
lenFields = len(nl.split("\t"))
print "Reading the input file and converting into modified file for further processing (correlation analysis etc..)"
prevTup = (0, 0, 0)
rowComplete = 0
k = 0
for line in ifd:
    k = k + 1
    if (k == 200): break
    items = line.rstrip("\n").split("\t")
    if ((items[0] == '')):
        continue
    newline = list('' for i in range(lenFields))
    newline[0], newline[1], newline[3], newline[2], newline[4] = items[0], items[1], items[3], items[2], items[4]
    ltests = []
    ltvals = []
    # If the same mrn, lab_number and specimen_id then fill the same row, else create a new row.
    while (cmp(prevTup, (items[0], items[1], items[3])) == 0):
        ltests.append(items[6])
        ltvals.append(items[7])
        pos = ifd.tell()
        line = ifd.readline()
        prevTup = (items[0], items[1], items[3])
        items = line.rstrip("\n").split("\t")
        rowComplete = 1
    if (rowComplete == 1):  # If the row is completed, prepare newline and write into outfile
        indices = [labtestnames.index(x) for x in ltests]
        j = 0
        ifd.seek(pos)
        for i in indices:
            newline[i + offset] = ltvals[j]
            j = j + 1
    if (rowComplete == 0):
        currTup = (items[0], items[1], items[3])
        ltests = items[6]
        ltvals = items[7]
        pos = ifd.tell()
        line = ifd.readline()
        items = line.rstrip("\n").split("\t")
        newTup = (items[0], items[1], items[3])
        if (cmp(currTup, newTup) == 0):
            prevTup = currTup
            ifd.seek(pos)
            continue
        else:
            indices = labtestnames.index(ltests)
            newline[indices + offset] = ltvals
            ofd.write(newline + "\n")
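The seek(-1, 1) attempt returns '\n' because seeking back one byte only rewinds past the previous line's newline, and more generally because iterating a file with for line in ifd uses an internal read-ahead buffer, so tell()/seek() no longer correspond to the line just returned (Python 3 text files even raise OSError if tell() is mixed with such a loop). A sketch of the usual workaround, driving the file with readline() only and remembering each line's start offset (needs_rereading and process are hypothetical placeholders):

with open('data.txt') as ifd:
    while True:
        pos = ifd.tell()            # offset where the next line starts
        line = ifd.readline()
        if not line:                # empty string means end of file
            break
        if needs_rereading(line):   # hypothetical condition
            ifd.seek(pos)           # rewind to the start of this line;
            continue                # the next readline() returns it again
        process(line)               # hypothetical per-line work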
The problem can be handled more simply using itertools.groupby. groupby can cluster all the contiguous lines that deal with the same mrn, specimen_id, and lab_num.
The code that does this is
for key, group in IT.groupby(reader, key = mykey):
where reader iterates over the lines of the input file, and mykey is defined by
def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])
Each row from reader is passed to mykey, and all rows with the same key are clustered together in the same group.
While we're at it, we might as well use the csv module to read each line into a dict (which I call row). This frees us from having to deal with low-level string manipulation like line.rstrip("\n").split("\t") and instead of referring to columns by index numbers (e.g. row[3]) we can write code that speaks in higher-level terms such as row['lab_num'].
import itertools as IT
import csv

inFile = 'curious.dat'
outFile = 'curious.out'

def mykey(row):
    return (row['mrn'], row['specimen_id'], row['lab_num'])

fieldnames = 'mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate'.split()
with open(inFile, 'rb') as ifd:
    reader = csv.DictReader(ifd, delimiter='\t')
    with open(outFile, 'wb') as ofd:
        writer = csv.DictWriter(
            ofd, fieldnames, delimiter='\t', lineterminator='\n')
        writer.writeheader()
        for key, group in IT.groupby(reader, key=mykey):
            new = {}
            row = next(group)
            for key in ('mrn', 'specimen_id', 'date', 'lab_num'):
                new[key] = row[key]
            new[row['labtest']] = row['result_val']
            for row in group:
                new[row['labtest']] = row['result_val']
            writer.writerow(new)
yields
mrn specimen_id date lab_num Bilirubin Lipase Calcium Magnesium Phosphate
4419529 1614487 26.2675 5802791G 0.1
3319529 1614487 26.2675 5802791G 0.3 153 8.1 2.1 4
5713871 682571 56.0779 9732266E 4.1
This seems to be a perfect use case for yield expressions. Consider the following example that prints lines from a file, repeating some of them at random:
def buflines(fp):
    r = None
    while True:
        r = yield r or next(fp)
        if r:
            yield None

from random import randint

with open('filename') as fp:
    buf = buflines(fp)
    for line in buf:
        print line
        if randint(1, 100) > 80:
            print 'ONCE AGAIN::'
            buf.send(line)
Basically, if you want to process an item once again, you send it back to the generator, and on the next iteration you will read the same item again.