Summing up datetimes without using pandas - python

I have a data set of rain fall in half hour intervals. I want to sum up the rainfall for each day and keep track of how many data points are summed per day to account for data gaps. Then I want to create a new file with a column for the date, a column for the rainfall, and a column for how many data points were available to sum for each day.
dailysum is my function that tries to do this, and get_data is my function for extracting the data.
from datetime import datetime

def get_data(avrains):
    print('opening {}'.format(avrains))
    with open(avrains, 'r') as rfile:
        header = rfile.readline()
        dates = []
        rainfalls = []
        for line in rfile:
            line = line.strip()
            row = line.split(',')
            d = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
            r = row[-1]
            dates.append(d)
            rainfalls.append(float(r))
    data = zip(dates, rainfalls)
    data = sorted(data)
    return data
def dailysum(rains):
    day_date = []
    rain_sum = []
    for i in rains:
        dayi = i[0]
        rainsi = i[1]
        for i in dayi:
            try:
                if dayi[i] == dayi[i+1]:
                    s = rains[i] + rains[i+1]
                    rain_sum.append(float(s))
            except:
                pass
            day_date.append(dayi[i])

There's a lot of ways to solve this, but I'll try to stay as close to your existing code as I can:
def get_data(avrains):
    """
    opens the file specified in avrains and returns a dictionary
    keyed by date, containing a 2-tuple of the total rainfall and
    the count of data points, like so:
        {
            date(2018, 11, 1): (0.25, 6),
            date(2018, 11, 2): (0.00, 5),
        }
    """
    print('opening {}'.format(avrains))
    rainfalls = dict()
    with open(avrains, 'r') as rfile:
        header = rfile.readline()
        for line in rfile:
            line = line.strip()
            row = line.split(',')
            # keep only the date part so every reading from one day shares a key
            d = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S').date()
            r = float(row[-1])
            try:
                daily_rainfall, daily_count = rainfalls[d]
                daily_rainfall += r
                daily_count += 1
                rainfalls[d] = (daily_rainfall, daily_count)
            except KeyError:
                # if we don't find that date in rainfalls, add it
                rainfalls[d] = (r, 1)
    return rainfalls
Now when you call get_data("/path/to/file"), you'll get back a dictionary. You can spit out the values with something like this:
foo = get_data("/path/to/file")
for (measure_date, (rainfall, observations)) in foo.items():
    print(measure_date, rainfall, observations)
(I will leave the formatting of the date, and any sorting or file-writing as an exercise :) )
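If you want a starting point for that exercise, here is a minimal sketch of the file-writing step; the output path and column headers are my own assumptions:

import csv

def write_daily_sums(rainfalls, out_path='daily_rainfall.csv'):
    # rainfalls is the dict returned by get_data()
    with open(out_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'rainfall', 'observations'])
        for d in sorted(rainfalls):            # sorted() puts the dates in order
            total, count = rainfalls[d]
            writer.writerow([d.isoformat(), total, count])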

Related

How would I split a list of tuples into chunks

I have a list that looks something like this:
weather_history=((year,month,day),precip,tmin,tmax)
I need to split it into one-year chunks, where each chunk is a list with one year's worth of data.
Please help!
all_years_data: List[Weather_List] = []
for line, p, n, x in weather_history:
    year = line[0]
    day = line[2]
    month = line[1]
    precip = p
    tmin = n
    tmax = x
    if year not in all_years_data:
        all_years_data.append(year)
This is my code so far. I've tried many different things to get each year's worth of data into one list, but I can't figure it out.
How about this?
A = [((1,2,3),4,5,6), ((10,20,30),40,50,60), ((100,200,300),400,500,600)]
B = [i[0][0] for i in A]
If your data is like this:
weather_history = ((2020,6,12),30,10,40)
you can index weather_history directly, without a for statement:
year = weather_history[0][0]
month = weather_history[0][1]
day = weather_history[0][2]
precip = weather_history[1]
tmin = weather_history[2]
tmax = weather_history[3]
if year not in all_years_data:
    all_years_data.append(year)
But if your data is like this:
weather_history = [((2020,6,12),30,10,40),((2021,6,12),30,10,40),((2022,6,12),30,10,40)]
you should loop over weather_history with a for statement:
for line in weather_history:
    year = line[0][0]
    month = line[0][1]
    day = line[0][2]
    precip = line[1]
    tmin = line[2]
    tmax = line[3]
    if year not in all_years_data:
        all_years_data.append(year)
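Both snippets above only collect the distinct years. If the goal is one chunk of records per year, grouping into a dict keyed by year is one way to do it; a sketch, assuming the list-of-tuples layout from the question:

from collections import defaultdict

weather_history = [((2020, 6, 12), 30, 10, 40), ((2021, 6, 12), 30, 10, 40)]

chunks = defaultdict(list)
for record in weather_history:
    year = record[0][0]              # first element of the (year, month, day) tuple
    chunks[year].append(record)

all_years_data = list(chunks.values())   # one list of records per year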

Trouble with dimensions in netcdf : index exceeds dimension bounds

I want to extract monthly temperature data from several netCDF files in different locations. Files are built as follows:
> print(data.variables.keys())
dict_keys(['lon', 'lat', 'time', 'tmp','stn'])
Files have names like "tmp_1901_1910.nc".
Here is the code I use:
import glob
import pandas as pd
import os
import numpy as np
import time
from netCDF4 import Dataset

os.chdir('PATH/data_tmp')

all_years = []
for file in glob.glob('*.nc'):
    data = Dataset(file, 'r')
    time_data = data.variables['time'][:]
    time = data.variables['time']
    year = str(file)[4:13]
    all_years.append(year)

# Empty pandas dataframe
year_start = min(all_years)
end_year = max(all_years)
date_range = pd.date_range(start=str(year_start[0:4]) + '-01-01', end=str(end_year[5:9]) + '-12-31', freq='M')
df = pd.DataFrame(0.0, columns=['Temp'], index=date_range)

# Defining the location, lat, lon based on the csv data
cities = pd.read_csv(r'PATH/cities_coordinates.csv', sep=',')
cities['city'] = cities['city'].map(str)

for index, row in cities.iterrows():
    location = row['code_nbs']
    location_latitude = row['lat']
    location_longitude = row['lon']

    # Sorting the list
    all_years.sort()
    for yr in all_years:
        # Reading in the data
        data = Dataset('tmp_' + str(yr) + '.nc', 'r')
        # Storing the lat and lon data of the netCDF file into variables
        lat = data.variables['lat'][:]
        lon = data.variables['lon'][:]
        # Squared difference between the specified lat, lon and the lat, lon of the netCDF
        sq_diff_lat = (lat - location_latitude)**2
        sq_diff_lon = (lon - location_longitude)**2
        # Retrieving the index of the min value for lat and lon
        min_index_lat = sq_diff_lat.argmin()
        min_index_lon = sq_diff_lon.argmin()
        # Accessing the temperature data
        tmp = data.variables['tmp']
        start = str(yr[0:4]) + '-01-01'
        end = str(yr[5:11]) + '-12-31'
        d_range = pd.date_range(start=start, end=end, freq='M')
        for t_index in np.arange(0, len(d_range)):
            print('Recording the value for: ' + str(d_range[t_index]))
            df.loc[d_range[t_index]]['Temp'] = tmp[min_index_lon, min_index_lat, t_index]
        df.to_csv(location + '.csv')
I obtain the following message while running the command df.loc[d_range[t_index]]['Temp']=tmp[min_index_lon, min_index_lat, t_index]
IndexError: index exceeds dimension bounds
I inspect the object's values and have:
print(d_range)
DatetimeIndex(['1901-01-31', '1901-02-28', '1901-03-31', '1901-04-30',
               '1901-05-31', '1901-06-30', '1901-07-31', '1901-08-31',
               '1901-09-30', '1901-10-31',
               ...
               '1910-03-31', '1910-04-30', '1910-05-31', '1910-06-30',
               '1910-07-31', '1910-08-31', '1910-09-30', '1910-10-31',
               '1910-11-30', '1910-12-31'],
              dtype='datetime64[ns]', length=120, freq='M')
On the first t_index within the loop, I have:
print(t_index)
0
print(d_range[t_index])
1901-01-31 00:00:00
print(min_index_lat)
259
print(min_index_lon)
592
I don't understand what went wrong with the dimensions.
Thank you for any help!
I assume you want to read in all the .nc data and map the closest city to it. For that, I suggest reading all the data first and afterwards calculating which city each location belongs to. The following code probably needs some adaptation to your data, but it should show the direction you could take to make the code more robust.
Step 1: Import your 'raw' data
e.g. into one or more DataFrames, depending on whether you can import all the data at once. If not, split steps 1 and 2 into chunks.
df_list = []
for file in glob.glob('*.nc'):
    data = Dataset(file, 'r')
    df_i = pd.DataFrame({
        'time': data.variables['time'][:],
        'lat': data.variables['lat'][:],
        'lon': data.variables['lon'][:],
        'tmp': data.variables['tmp'][:],
        'stn': data.variables['stn'][:],
        'year': str(file)[4:13],  # maybe not needed, as 'time' should have this info already, and [4:13] needs exactly this format
        'file_name': file,  # to track back the file
        # ... and more
    })
    df_list.append(df_i)
df = pd.concat(df_list, ignore_index=True)
Step 2: Map the locations
e.g. with groupby, but there are several other methods. Depending on the amount of data, I suggest using pandas or numpy routines over any Python loops; they are much faster.
df['city'] = None
gp = df.groupby(['lon', 'lat'])
for values_i, indexes_i in gp.groups.items():
    # Add your code to get the closest city
    # values_i[0] is 'lon'
    # values_i[1] is 'lat'
    # e.g.:
    diff_lon_lat = np.hypot(cities['lon'] - values_i[0], cities['lat'] - values_i[1])
    location = cities.loc[diff_lon_lat.idxmin(), 'code_nbs']
    # and add the parameters to the df
    df.loc[indexes_i, 'city'] = location
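As a follow-up to the speed remark: if the per-group Python loop becomes a bottleneck, the nearest-city lookup can be done in one vectorized pass over the unique coordinates. A minimal sketch, assuming the df and cities frames from above and that everything fits in memory:

import numpy as np

# unique (lon, lat) pairs present in the measurement data
coords = df[['lon', 'lat']].drop_duplicates().to_numpy()    # shape (m, 2)
city_coords = cities[['lon', 'lat']].to_numpy()             # shape (k, 2)

# pairwise distances via broadcasting, shape (m, k)
dists = np.hypot(coords[:, None, 0] - city_coords[None, :, 0],
                 coords[:, None, 1] - city_coords[None, :, 1])

# closest city code for each unique coordinate pair
nearest = cities['code_nbs'].to_numpy()[dists.argmin(axis=1)]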

Python: How to Filter Pandas Dataframe and plot incremental values

I have a problem with grouping data and plotting it over time to show incremental change.
The data structure is below in the incoming data and added to a pandas dataframe:
"DateTime","Classification", "Confidence"
What I want to do is show the unique values of classification and count how many times they occur every 5 minutes.
I then want to plot this in a graph that will update every 5 minutes showing the incremental values over time.
I have tried different approaches but I just can't get my head around it. The dataframe I can get is:
Index  class  count
0      Car    2
1      Truck  1
2      Boat   3
So I have 'Index', 'Class', 'Count'.
This I can get updated every 5 minutes or I can add this to a list containing
'TimeStamp','Dataframe', where the dataframe looks like above.
The chart output I would like is one line per class, in a different color per class, showing how many of each are in the dataframe every 5 minutes.
How can I do this with pandas and matplotlib in python?
I attach my junk code below just to show what I have been using as a starting point...
Support is most appreciated.
def CreateStats():
    print("Reading from file")
    fo = open("/home/User/Temp/test_data.txt", "r")
    df = pd.DataFrame(columns=['time', 'class', 'conf'])
    ndf = pd.DataFrame(columns=['class', 'class count'])
    pos = 0
    nPos = 0
    for t in range(1):
        fo.seek(0, 0)
        for line in fo:
            #print(str(datetime.now())+" - " + line)
            #time.sleep(1)
            splitted = line.split(";")
            df.loc[pos] = [datetime.now().strftime("%Y-%m-%d %H:%M:%S"), splitted[0], right(splitted[1], 1)]
            pos = pos + 1
        #time.sleep(1)
        df['time'] = pd.to_datetime(df['time'])
        ndf = df.groupby('class').agg({'class': ['count']}).reset_index()
        #ndf = df.groupby('class').count().reset_index()
        #ndf = df.groupby('class').agg('count').reset_index()
        #print(df.head())
        #newDf = [datetime.now(),ndf]
        print(ndf)
        #ndf.plot.scatter(x='class', y='time count')
        #plt.show()
    fo.close()
I found a way, though perhaps not the Python way:
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt

def CreateStats():
    print("Reading from file")
    aggDict = {}
    fo = open("/home/user/Temp/test_data.txt", "r")
    for t in range(20):
        fo.seek(0, 0)
        aggDict[t] = defaultdict(int)
        for line in fo:
            #print(str(datetime.now())+" - " + line)
            defect = line.split(";")
            aggDict[t][defect[0]] += 1
        if t > 0:
            for key in aggDict[t]:
                aggDict[t][key] += aggDict[t-1][key]
    print(aggDict)
    df = pd.DataFrame(aggDict)
    df2 = df.transpose()
    lines = df2.plot.line()
    plt.show()
{
0: defaultdict(<class 'int'>, { 'Bubbles': 2, 'Rabbits': 2, 'Cup': 1}),
1: defaultdict(<class 'int'>, {'Bubbles': 12, 'Rabbits': 10, 'Cup': 2}),
2: defaultdict(<class 'int'>, {'Bubbles': 62, 'Rabbits': 42, 'Cup': 3})}
The file used contains a two-column, semicolon-separated list of a type and a value. The value is not used in this code...
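For completeness, the counting and incremental plotting can also be done in pandas itself, which is closer to what the question originally asked for. A sketch, assuming a dataframe df with a datetime64 'time' column and a 'class' column like the one built above:

import pandas as pd
import matplotlib.pyplot as plt

counts = (df.groupby([pd.Grouper(key='time', freq='5min'), 'class'])
            .size()                 # occurrences per class per 5-minute bin
            .unstack(fill_value=0)  # one column per class
            .cumsum())              # running (incremental) totals over time

counts.plot()                       # one colored line per class
plt.show()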

sum two columns, calculate max, min and mean value in MapReduce

I have sample mapper code, shown below. The key is UCO and the value is TaxiTotal, which should be the sum of two columns, TaxiIn and TaxiOut. How do I sum the two columns?
My current solution, TaxiIn + TaxiOut, results in a concatenated string, like 333+444 = 333444. I need it to be 777. How do I write the code?
#! /usr/bin/env python
import sys

# -- Airline Data
# Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime, UniqueCarrier, FlightNum,
# TailNum, ActualElapsedTime, CRSElapsedTime, AirTime, ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn,
# TaxiOut, Cancelled, CancellationCode, Diverted, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay
for line in sys.stdin:
    line = line.strip()
    unpacked = line.split(",")
    Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime, UniqueCarrier, FlightNum, TailNum, ActualElapsedTime, CRSElapsedTime, AirTime, ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled, CancellationCode, Diverted, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay = line.split(",")
    UCO = "-".join([UniqueCarrier, Origin])
    results = [UCO, TaxiIn+TaxiOut]
    print("\t".join(results))
Convert TaxiIn + TaxiOut to:
int(TaxiIn) + int(TaxiOut)
See below example:
In [1612]: TaxiIn = '333'
In [1613]: TaxiOut = '444'
In [1614]: TaxiIn + TaxiOut
Out[1614]: '333444'
In [1615]: int(TaxiIn) + int(TaxiOut)
Out[1615]: 777
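If some fields can be blank or fractional (an assumption on my part; the sample values are clean integers), a small fallback parser keeps the mapper from crashing:

def to_num(s, default=0):
    """Parse a numeric field, falling back to a default on blank/bad input."""
    try:
        return float(s)
    except ValueError:
        return default

taxi_total = to_num(TaxiIn) + to_num(TaxiOut)   # 333.0 + 444.0 -> 777.0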
You can't numerically sum strings; to do that, convert str to int or float.
Your code should be:
results = [UCO, str(int(TaxiIn) + int(TaxiOut))]
print("\t".join(results))

getting max value from each column of the csv file

Would anybody help me solve the following problem? I have tried it on my own, and I have attached my solution as well. I used a 2-d list, but I want a different solution without a 2-d list, which should be more Pythonic.
Please suggest if any of you have another way of doing this.
Q) Consider Share prices for a N number of companies given for each month since year 1990 in a CSV file. Format of the file is as below with first line as header.
Year,Month,Company A, Company B,Company C, .............Company N
1990, Jan, 10, 15, 20, , ..........,50
1990, Feb, 10, 15, 20, , ..........,50
.
.
.
.
2013, Sep, 50, 10, 15............500
The solution should be in this format:
a) For each company, list the year and month in which the share price was highest.
Here is my answer using a 2-d list.
def generate_list(file_path):
    '''
    return list of list's containing file data.'''
    data_list = None  # local variable
    try:
        file_obj = open(file_path, 'r')
        try:
            gen = (line.split(',') for line in file_obj)  # generator, to generate one line each time until EOF (End of File)
            for j, line in enumerate(gen):
                if not data_list:
                    # if data_list is None then create list containing n empty lists, where n will be number of columns.
                    data_list = [[] for i in range(len(line))]
                if line[-1].find('\n'):
                    line[-1] = line[-1][:-1]  # to remove last list element's '\n' character
                # loop to convert numbers from string to float, and leave others as strings only
                for i, l in enumerate(line):
                    if i >= 2 and j >= 1:
                        data_list[i].append(float(l))
                    else:
                        data_list[i].append(l)
        except IOError, io_except:
            print io_except
        finally:
            file_obj.close()
    except IOError, io_exception:
        print io_exception
    return data_list
def generate_result(file_path):
    '''
    return list of tuples containing (max price, year, month,
    company name).
    '''
    data_list = generate_list(file_path)
    re = []  # list to store results in tuple format as follows: [(max_price, year, month, company_name), ....]
    if data_list:
        for i, d in enumerate(data_list):
            if i >= 2:
                m = max(data_list[i][1:])  # max_price for the company
                idx = data_list[i].index(m)  # getting index of max_price in the list
                yr = data_list[0][idx]  # getting year by using index of max_price in list
                mon = data_list[1][idx]  # getting month by using index of max_price in list
                com = data_list[i][0]  # getting company_name
                re.append((m, yr, mon, com))
    return re

if __name__ == '__main__':
    file_path = 'C:/Document and Settings/RajeshT/Desktop/nothing/imp/New Folder/tst.csv'
    re = generate_result(file_path)
    print 'result ', re
I have also tried to solve it with a generator, but in that case it gave the result for only one company, i.e. only one column.
p = 'filepath.csv'
f = open(p, 'r')
head = f.readline()
gen = ((float(line.split(',')[n]), line.split(',', 2)[0:2], head.split(',')[n]) for n in range(2, len(head.split(','))) for i, line in enumerate(f))
x = max((i for i in gen), key=lambda x: x[0])
print x
You can use the input data provided below, which is in CSV format:
year,month,company 1,company 2,company 3,company 4,company 5
1990,jan,201,245,243,179,133
1990,feb,228,123,124,121,180
1990,march,63,13,158,88,79
1990,april,234,68,187,67,135
1990,may,109,128,46,185,236
1990,june,53,36,202,73,210
1990,july,194,38,48,207,72
1990,august,147,116,149,93,114
1990,september,51,215,15,38,46
1990,october,16,200,115,205,118
1990,november,241,86,58,183,100
1990,december,175,97,143,77,84
1991,jan,190,68,236,202,19
1991,feb,39,209,133,221,161
1991,march,246,81,38,100,122
1991,april,37,137,106,138,26
1991,may,147,48,182,235,47
1991,june,57,20,156,38,245
1991,july,165,153,145,70,157
1991,august,154,16,162,32,21
1991,september,64,160,55,220,138
1991,october,162,72,162,222,179
1991,november,215,207,37,176,30
1991,december,106,153,31,247,69
The expected output is the following:
[(246.0, '1991', 'march', 'company 1'),
(245.0, '1990', 'jan', 'company 2'),
(243.0, '1990', 'jan', 'company 3'),
(247.0, '1991', 'december', 'company 4'),
(245.0, '1991', 'june', 'company 5')]
Thanks in advance...
Using collections.OrderedDict and collections.namedtuple:
import csv
from collections import OrderedDict, namedtuple

with open('abc1') as f:
    reader = csv.reader(f)
    tup = namedtuple('tup', ['price', 'year', 'month'])
    d = OrderedDict()
    names = next(reader)[2:]
    for name in names:
        # initialize the dict
        d[name] = tup(0, 'year', 'month')
    for row in reader:
        year, month = row[:2]  # Use year, month, *prices = row in py3.x
        for name, price in zip(names, map(int, row[2:])):  # map(int, prices) py3.x
            if d[name].price < price:
                d[name] = tup(price, year, month)
print d
Output:
OrderedDict([
('company 1', tup(price=246, year='1991', month='march')),
('company 2', tup(price=245, year='1990', month='jan')),
('company 3', tup(price=243, year='1990', month='jan')),
('company 4', tup(price=247, year='1991', month='december')),
('company 5', tup(price=245, year='1991', month='june'))])
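If you want the exact list-of-tuples output shown in the question, the dictionary converts in one line (this conversion is my addition, not part of the original answer):

result = [(float(t.price), t.year, t.month, name) for name, t in d.items()]
# [(246.0, '1991', 'march', 'company 1'), (245.0, '1990', 'jan', 'company 2'), ...]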
I wasn't entirely sure how you wanted to output so for now I just have it print the output to screen.
import os
import csv
import codecs

## Import data !!!!!!!!!!!! CHANGE TO APPROPRIATE PATH !!!!!!!!!!!!!!!!!
filename = os.path.expanduser("~/Documents/PYTHON/StackTest/tailor_raj/Workbook1.csv")

## Get useable data
data = [row for row in csv.reader(codecs.open(filename, 'rb', encoding="utf_8"))]

## Find number of rows
row_count = (sum(1 for row in data)) - 1

## Find number of columns
## Since this cannot be explicitly done, I set it to run through the columns on one row until it fails.
## Failure is caught by try/except so the program does not crash
columns_found = False
column_try = 1
while columns_found == False:
    column_try += 1
    try:
        identify_column = data[0][column_try]
    except:
        columns_found = True

## Set column count to discovered column count (1 before it failed)
column_count = column_try - 1

## Set which company we are checking (start with the first company listed; since indexing starts at 0, the first company is at 2, not 3)
companyIndex = 2

## This will keep all the company bests as single rows of text. I was not sure how you wanted to output them.
companyBest = []

## Set loop to go through each company
while companyIndex <= (column_count):
    ## For each new company reset the rowIndex and highestShare
    rowIndex = 1
    highestShare = rowIndex
    ## Set loop to go through each row
    while rowIndex <= row_count:
        ## Test if data point is above or equal to current max
        ## Currently set to use the most recent high point
        if int(data[highestShare][companyIndex]) <= int(data[rowIndex][companyIndex]):
            highestShare = rowIndex
        ## Move on to next row
        rowIndex += 1
    ## Company best = Company Name + year + month + value
    companyBest.append(str(data[0][companyIndex]) + ": " + str(data[highestShare][0]) + ", " + str(data[highestShare][1]) + ", " + str(data[highestShare][companyIndex]))
    ## Move on to next company
    companyIndex += 1

for item in companyBest:
    print item
Be sure to change your filename path to one more appropriate.
Output is currently displayed like this:
Company A: 1990, Nov, 1985
Company B: 1990, May, 52873
Company C: 1990, May, 3658
Company D: 1990, Nov, 156498
Company E: 1990, Jul, 987
No generator unfortunately but small code size, especially in Python 3:
from operator import itemgetter
from csv import reader

with open('test.csv') as f:
    year, month, *data = zip(*reader(f))

for pricelist in data:
    name = pricelist[0]
    prices = map(int, pricelist[1:])
    i, price = max(enumerate(prices), key=itemgetter(1))
    print(name, price, year[i+1], month[i+1])
In Python 2.x you can do the same thing, just slightly more clumsily, using the following (and the different print statement):
with open('test.csv') as f:
    columns = zip(*reader(f))
year, month = columns[:2]
data = columns[2:]
Okay, I came up with some gruesome generators! It also makes use of lexicographic tuple comparison and reduce to compare consecutive lines:
from functools import reduce  # only in Python 3
import csv

def group(year, month, *prices):
    return ((int(p), year, month) for p in prices)

def compare(a, b):
    return map(max, zip(a, group(*b)))

def run(fname):
    with open(fname) as f:
        r = csv.reader(f)
        names = next(r)[2:]
        return zip(names, reduce(compare, r, group(*next(r))))

list(run('test.csv'))
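Iterating the result yields one (name, (price, year, month)) pair per company; a quick check of the output (my addition) could look like this:

for name, (price, year, month) in run('test.csv'):
    print(name, price, year, month)
# e.g.: company 1 246 1991 march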
