Create CSV file from another - python

I have a csv file as shown below:
19/04/2015 00:00 180 187 85 162 608 61
19/04/2015 01:00 202 20 26 70 171 61
19/04/2015 02:00 20 40 40 11 40 810
19/04/2015 03:00 20 80 81 24 0 86
19/04/2015 04:00 25 30 70 91 07 50
19/04/2015 05:00 80 611 691 70 790 37
19/04/2015 06:00 199 69 706 70 790 171
19/04/2015 07:00 80 81 90 192 57 254
19/04/2015 08:00 40 152 454 259 52 151
Each row is in the same cell in the file.
I'm trying to make it look like this:
19/04/2015 00:00 180
19/04/2015 00:10 187
19/04/2015 00:20 85
19/04/2015 00:30 162
19/04/2015 00:40 608
19/04/2015 00:50 61
19/04/2015 01:00 202
etc..
Explanation:
Each row starts with a date in dd/MM/yyyy HH:mm format, followed by 6 values, one value per 10 minutes.
In the second presentation, I wanted to have the date of each value with the exact time with minutes.
Here is what I've tried so far:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys, getopt
import tarfile
import re
import pandas as pd
import tempfile
import shutil
import collections
import urllib
import numpy
import logging
import csv
csvFile = "testfile.csv"
data = []
# Minute offsets for the six readings in each row (one reading per 10 minutes).
minutes = ['00', '10', '20', '30', '40', '50']
with open(csvFile, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        # The whole line lives in a single cell; collapse every run of
        # whitespace into ';' and split into individual fields.
        rowlist = re.sub(r"\s+", ";", row[0].strip()).split(';')
        # Pad short rows so there are always date, time and six values.
        # Pad with the string '0' so all values share one type.
        while len(rowlist) < 8:
            rowlist.append('0')
        date, hour = rowlist[0], rowlist[1].split(':')[0]
        # Emit one (timestamp, value) row per reading, stepping the
        # minutes by 10 for each successive value.
        for m, value in zip(minutes, rowlist[2:]):
            data.append(['{0} {1}:{2}'.format(date, hour, m), value])
df = pd.DataFrame(data)
# index=False/header=False so the output file contains only the data rows.
df.to_csv('example.csv', index=False, header=False)
But this code didn't give me the desired result.
Any suggestions?

Okay I'm going to be explaining a lot in this one.
I highly recommend using datetime objects if you are going to deal with dates, because that is exactly what they exist for in the first place. Convert them into datetime objects and you can do lots and lots of manipulations.
This a complete working code for you. I'll explain all of the concepts in depth!.
Input:
19/04/2015 00:00, 180 , 187 , 85 , 162 , 608 , 61
19/04/2015 01:00, 202 , 20 , 26 , 70 , 171 , 61
19/04/2015 02:00, 20 , 40 , 40 , 11 , 40 , 810
The code:
import csv
from datetime import datetime,timedelta
# Read the raw rows, expand each into six (timestamp, value) pairs
# (10 minutes apart), and write them out space-delimited.
list_of_list = []
with open("old_file.csv","r+") as my_csv:
    for line in my_csv:
        # Strip the newline, drop embedded blanks, split on the commas.
        line = line.strip().replace(" ",'').split(',')
        list_of_list.append(line)
for item in list_of_list:
    # First field, e.g. '19/04/201500:00', becomes a datetime object.
    dt = datetime.strptime(item[0],'%d/%m/%Y%H:%M')
    item[0]=dt
fin_list = []
for item in list_of_list:
    # One timestamp per element, 10 minutes apart; zipping with item[1:]
    # simply discards the surplus timestamp generated for the date field.
    temp_list = [item[0]+timedelta(minutes=10*(i)) for i,x in enumerate(item)]
    my_list = [list(a) for a in zip(temp_list,item[1:])]
    fin_list.extend(my_list)
for item in fin_list:
    # Back to the 'dd/mm/YYYY HH:MM' string form for writing.
    # (The original also called item[0].replace('"','') here, but str.replace
    # returns a new string that was discarded — it was a no-op and is removed.)
    item[0] = datetime.strftime(item[0],"%d/%m/%Y %H:%M")
print(fin_list)
with open("new_file.csv","w+") as my_csv:
    # quotechar=" " stops the writer from wrapping the timestamp
    # (which contains the space delimiter) in quote characters.
    csvWriter = csv.writer(my_csv,delimiter=' ',quotechar = " ")
    csvWriter.writerows(fin_list)
output:
19/04/2015 00:00 180
19/04/2015 00:10 187
19/04/2015 00:20 85
19/04/2015 00:30 162
19/04/2015 00:40 608
19/04/2015 00:50 61
19/04/2015 01:00 202
19/04/2015 01:10 20
19/04/2015 01:20 26
19/04/2015 01:30 70
19/04/2015 01:40 171
19/04/2015 01:50 61
19/04/2015 02:00 20
19/04/2015 02:10 40
19/04/2015 02:20 40
19/04/2015 02:30 11
19/04/2015 02:40 40
19/04/2015 02:50 810
1) See, I'm taking each row and turning it into a list, while also stripping and replacing all the whitespace, \n, \r
line = line.strip().replace(" ",'').split(',')
list_of_list.append(line)
output after this:
['19/04/201500:00', '180', '187', '85', '162', '608']
2) dt = datetime.strptime(item[0],'%d/%m/%Y%H:%M') what's this? the strptime from datetime takes a string and converts it into a datetime object which you can manipulate easily.
Example:
>>> datetime.strptime('19/04/201500:00','%d/%m/%Y%H:%M')
>>> datetime.datetime(2015, 4, 19, 0, 0)
>>> datetime.strptime('19/04/2015 00:00','%d/%m/%Y %H:%M') #notice how this is different from above!
>>> datetime.datetime(2015, 4, 19, 0, 0)
>>> datetime.strptime('Apr 19 2015 12:00','%b %d %Y %H:%M')
>>> datetime.datetime(2015, 4, 19, 12, 0)
Can you see how it transformed? Once you change it into a datetime object you can then easily add minutes,days,hours,months anything you want with it!.
But to add them you need a timedelta object. Consider it like this to an integer you add an integer same way to datetime add timedelta.
[item[0]+timedelta(minutes=10*(i)) for i,x in enumerate(item)]
You might be thinking: what is this? enumerate of an iterable (list, string, tuple, etc.) gives two things: i and element, where i runs 0, 1, 2, 3, ... up to the last index of the iterable (here, a list). So the first i, x would be 0, item[0]; the next i, x would be 1, item[1]; and so on.
So the list comprehension just adds 0,10,20 ,30,40,.. minutes to every datetime object.
Each item would be the below,
[datetime.datetime(2015, 4, 19, 0, 0), '180']
And finally after extend you get this:
[[datetime.datetime(2015, 4, 19, 0, 0), '180'],
[datetime.datetime(2015, 4, 19, 0, 10), '187'],
[datetime.datetime(2015, 4, 19, 0, 20), '85'],
[datetime.datetime(2015, 4, 19, 0, 30), '162'],
[datetime.datetime(2015, 4, 19, 0, 40), '608'],
[datetime.datetime(2015, 4, 19, 0, 50), '61']]
How beautiful?
Now again convert the datetime objects to string using this,
item[0] = datetime.strftime(item[0],"%d/%m/%Y %H:%M")
So strftime converts it into the desired format! And at last, write them to the new csv file using the csv writer.
Note: This would print dates along with quotes by default!. Which you didn't want in your output so use quotechar = " " to remove them.

This should work:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
in_name = 'test.csv'
out_name = 'sample.csv'
# Expand each input row (date, hour, six readings) into six output rows,
# advancing the minutes in steps of ten.
with open(in_name, 'rb') as infile, open(out_name, 'wb') as out_file:
    for raw_line in infile:
        fields = raw_line.split()
        row_date = fields[0]
        row_time = fields[1]
        readings = fields[2:]
        # Keep only the hour; the minutes are regenerated below.
        hour_part, _ = row_time.split(':')
        for offset, reading in zip(range(0, 60, 10), readings):
            out_file.write('{date} {hours}:{minutes:02d} {value:>5}\n'.format(
                date=row_date, hours=hour_part, minutes=offset, value=reading
            ))
You also had a lot of unused imports; they do not noticeably affect performance, but removing them keeps the code clean.

Related

How would I concatenate items into a master dataframe that I receive from a requests.get()

I have an api call that is returning the date and adjusted close prices for given tickers in a for loop.
There are 1100 unique tickers x 252 days, and I want to create a dataframe that's 1100x252 with the index as date.
The problem is that I can only query the api one ticker at a time, and it returns below (a sample of first several rows) which is for AAPL:
[{'date': '2020-01-02T00:00:00.000Z', 'adjClose': 73.4677943274},
{'date': '2020-01-03T00:00:00.000Z', 'adjClose': 72.7535410914},
{'date': '2020-01-06T00:00:00.000Z', 'adjClose': 73.3332603275},
{'date': '2020-01-07T00:00:00.000Z', 'adjClose': 72.9883640731},
{'date': '2020-01-08T00:00:00.000Z', 'adjClose': 74.1624789816},
{'date': '2020-01-09T00:00:00.000Z', 'adjClose': 75.7377498172},
{'date': '2020-01-10T00:00:00.000Z', 'adjClose': 75.908974908}]
What I am trying to create is a loop that extracts the adjClose, and merges them to a master dataframe, of sorts.
I am currently looping through via:
# NOTE(review): .unique is not called — this should be data.ticker.unique();
# as written it builds a list from the bound method object, not the tickers.
tickers = list(data.ticker.unique)
for ticker in tickers:
# NOTE(review): 'reqests' is presumably a typo for 'requests', and the
# f-string interpolates {tickers} (the whole list) where {ticker} (the
# current loop variable) was intended — confirm against the working answer.
api_call = reqests.get(f'api_call_from_site.com/stonkz&ticker={tickers}')
df = pd.DataFrame(api_call.json())
I then want to concatenate the adjClose from each specific api call to said "master dataframe," but I have no idea where to start. This would look like the below:
NOTE: values for AAPL (and other tickers) would resemble above.
date AAPL TSLA AMD NVDA etc
2020-01-02 75 110 65 205 100
2020-01-03 76 111 66 206 101
2020-01-04 77 112 67 207 102
2020-01-05 78 113 68 208 103
2020-01-06 79 114 69 209 104
2020-01-07 80 115 70 210 105
2020-01-08 81 116 71 211 106
2020-01-09 82 117 72 212 107
2020-01-10 83 118 73 213 108
Any and all help is appreciated, and thank you in advance.
First idea is create list of Series by append and then join in concat:
# Collect one Series per ticker, then join them all column-wise at the end.
L = []
for ticker in tickers:
# NOTE(review): 'reqests' looks like a typo for 'requests' — confirm the import.
api_call = reqests.get(f'api_call_from_site.com/stonkz&ticker={ticker}')
# Index by date, keep only the adjusted close, and name the Series
# after the ticker so it becomes that ticker's column.
df = pd.DataFrame(api_call.json()).set_index('date')['adjClose'].rename(ticker)
L.append(df)
# axis=1 aligns the Series on their shared date index, one column per ticker.
df_big = pd.concat(L, axis=1)
Or create list of dictionaries out and last call DataFrame with transpose:
# Build one {date: adjClose} dict per ticker, then a single DataFrame at the end.
out = []
for ticker in tickers:
# NOTE(review): 'reqests' looks like a typo for 'requests' — confirm the import.
api_call = reqests.get(f'api_call_from_site.com/stonkz&ticker={ticker}').json()
d = {x['date']:x['adjClose'] for x in api_call}
out.append(d)
# Rows come out keyed by ticker, so transpose to get dates as the index.
df_big = pd.DataFrame(out, index=tickers).T

How to update HDF5 table with partial data?

I'm wondering how one might update an HDF5 table when one only has partial data? For example, suppose the following df is stored in an HDF5 table.
import pandas as pd
# Sample frame: 4 daily rows with MultiIndex columns of
# (ticker, field) pairs for three tickers (mkf, tdf, ghi).
df = pd.DataFrame([
[100,90,80,70,36,45],
[101,78,65,88,55,78],
[92,77,42,79,43,32],
[103,98,76,54,45,65]],
index = pd.date_range(start='2022-01-01', periods=4)
)
df.columns = pd.MultiIndex.from_tuples(
(("mkf", "Open"),
("mkf", "Close"),
("tdf", "Open"),
("tdf","Close"),
("ghi","Open"),
("ghi", "Close"))
)
df
mkf tdf ghi
Open Close Open Close Open Close
2022-01-01 100 90 80 70 36 45
2022-01-02 101 78 65 88 55 78
2022-01-03 92 77 42 79 43 32
2022-01-04 103 98 76 54 45 65
store = pd.HDFStore('store.h5')
# append stores the frame in 'table' format, which fixes the column schema —
# later appends must match that schema exactly.
store.append('data', df)
Next, suppose I obtain partial data (e.g. data for mkf and tdf but not ghi).
# Partial update: same 4-day shape but only the mkf and tdf columns
# (ghi is absent), starting where the stored frame left off.
df1 = pd.DataFrame([
[70,80,90,70],
[91,68,45,88],
[92,47,32,79],
[43,38,77,74]],
index = pd.date_range(start='2022-01-05', periods=4)
)
df1.columns = pd.MultiIndex.from_tuples((("mkf", "Open"),
("mkf", "Close"),
("tdf", "Open"),
("tdf","Close"),
)
)
df1
mkf tdf
Open Close Open Close
2022-01-05 70 80 90 70
2022-01-06 91 68 45 88
2022-01-07 92 47 32 79
2022-01-08 43 38 77 74
How can I update store? I tried the following but got a ValueError:
store.append('data',df1)
ValueError: cannot match existing table structure for [(mkf, Open),(mkf, Close),(tdf, Open),(tdf, Close),(ghi, Open),(ghi, Close)] on appending data

How to iterate pandas Dataframe month-wise to satisfy demand over time

Suppose I have a dataframe df
pd demand mon1 mon2 mon3
abc1 137 46 37 31
abc2 138 33 37 50
abc3 120 38 47 46
abc4 149 39 30 30
abc5 129 33 42 42
abc6 112 30 45 43
abc7 129 43 33 45
I want to satisfy the demand of each pd month-wise. I am generating some random numbers which indicate satisfied demand. For example, for pd abc1, demand is 137, say I have produced 42 units for mon1, but mon1 demand is 46. Hence revised dataframe would be
pd demand mon2 mon3
abc1 137 - 42= 95 37 + 4 (Unsatisfied demand for previous month) 31
Then it will run for mon2 and so on. In this way, I would like to capture, how much demand would be satisfied for each pd (excess or unsatisfied).
My try:
import pandas as pd
import random
mon = ['mon1', 'mon2', 'mon3']
for i in df['pd'].values.tolist():
# One-row frame for the current product.
t = df.loc[df['pd'] == i, :]
# Walk the month columns (everything after 'pd' and 'demand').
for m in t.columns[2:]:
y = t[m].iloc[0]
# Random "produced units" between 20 and 70 for this month.
n = random.randint(20, 70)
# NOTE(review): t is a slice of df, so this assignment likely does not
# write back into df (SettingWithCopy warning) — presumably df itself
# should be updated, and the unmet demand carried to the next month.
t['demand'] = t['demand'].iloc[0] - n
Not finding the logic exactly.

Iterating over pandas rows to get minimum

Here is my dataframe:
Date cell tumor_size(mm)
25/10/2015 113 51
22/10/2015 222 50
22/10/2015 883 45
20/10/2015 334 35
19/10/2015 564 47
19/10/2015 123 56
22/10/2014 345 36
13/12/2013 456 44
What I want to do is compare the size of the tumors detected on the different days. Let's consider the cell 222 as an example; I want to compare its size to different cells but detected on earlier days e.g. I will not compare its size with cell 883, because they were detected on the same day. Or I will not compare it with cell 113, because it was detected later on.
As my dataset is very large, I have to iterate over the rows. If I explain it in a non-pythonic way:
for the cell 222:
get_size_distance(absolute value):
(50 - 35 = 15), (50 - 47 = 3), (50 - 56 = 6), (50 - 36 = 14), (44 - 36 = 8)
get_minimum = 3; I got this value when comparing with cell 564, so I will record 564 as the pair for cell 222
Then do it for the cell 883
The resulting output should look like this:
Date cell tumor_size(mm) pair size_difference
25/10/2015 113 51 222 1
22/10/2015 222 50 123 6
22/10/2015 883 45 456 1
20/10/2015 334 35 345 1
19/10/2015 564 47 456 3
19/10/2015 123 56 456 12
22/10/2014 345 36 456 8
13/12/2013 456 44 NaN NaN
I will really appreciate your help
It's not pretty, but I believe it does the trick
# For each cell, find among cells detected on EARLIER dates the one whose
# tumor size is closest, and record that cell id and the size difference.
a = pd.read_clipboard()
# Cut off last row since it was a faulty date. You can skip this.
df = a.copy().iloc[:-1]
# Convert to real datetimes so earlier/later comparisons work.
df['Date'] = df.Date.apply(lambda x: datetime.strptime(x, '%d/%m/%Y'))
# sort_values returns a NEW frame (it is not in-place) — the original
# discarded the result, making the call a no-op; assign it back.
df = df.sort_values('Date', ascending=False)
# Rename column so attribute access works ('(mm)' is not a valid identifier).
df = df.rename(columns={"tumor_size(mm)": 'tumor_size'})
# Parallel lists collecting, per row, the closest-size earlier cell
# ('pair') and the absolute size difference.
pairs = []
diffs = []
# Loop over all unique dates.
for date in df.Date.unique():
    # Only cells detected strictly earlier than the current date qualify.
    compare_df = df.loc[df.Date < date].copy()
    # Loop over each cell for this date and find the minimum.
    for row in df.loc[df.Date == date].itertuples():
        if compare_df.empty:
            # No earlier cells available: record NaNs.
            pairs.append(float('nan'))
            diffs.append(float('nan'))
        else:
            # Absolute size difference to every earlier cell; keep the minimum.
            compare_df['size_diff'] = abs(compare_df.tumor_size - row.tumor_size)
            row_of_interest = compare_df.loc[compare_df.size_diff == compare_df.size_diff.min()]
            pairs.append(row_of_interest.cell.values[0])
            diffs.append(row_of_interest.size_diff.values[0])
df['pair'] = pairs
df['size_difference'] = diffs
returns:
Date cell tumor_size pair size_difference
0 2015-10-25 113 51 222.0 1.0
1 2015-10-22 222 50 564.0 3.0
2 2015-10-22 883 45 564.0 2.0
3 2015-10-20 334 35 345.0 1.0
4 2015-10-19 564 47 345.0 11.0
5 2015-10-19 123 56 345.0 20.0
6 2014-10-22 345 36 NaN NaN

data extraction and its summation using python

I have a following data representation in an text file called as
data.txt
03/05/2016 11:00 50
03/05/2016 11:10 10
03/05/2016 11:20 30
03/05/2016 11:30 40
03/05/2016 11:40 40
03/05/2016 11:50 50
03/05/2016 11:60 70
03/05/2016 12:00 25
03/05/2016 12:10 69
03/05/2016 12:20 25
03/05/2016 12:30 59
03/05/2016 12:40 25
03/05/2016 12:50 29
03/05/2016 12:60 25
I want to perform certain mathematical operation such that i can obtain the end result as
03/05/2016 11:00 - 12:00 290
03/05/2016 12:00 - 13:00 257
where this result is stored in another text file say data1.txt
here 290 is the sum of data from 11:00 to 12:00 and 257 is the sum of the data from 12:00 to 13:00
I want to write this code in python 2.7
How can I achieve this?
**UPDATED**
import time
import datetime
# Runs forever: every ~10 minutes, append one "dd/mm/YYYY HH:MM <total>"
# line to data.txt containing the sensor readings summed over that window.
while 1:
final_sensorvalue = 0
st_time = time.time()
crntdatetime = 0.0
# Accumulate readings for 600 seconds (10 minutes), sampling every ~2 s.
while ((time.time() - st_time) < 600.0):
sensorvalue = 10 # read sensor value
final_sensorvalue = final_sensorvalue + sensorvalue
time.sleep(2)
# Append the windowed total with a timestamp (Python 2 print statement).
f = open('data.txt','a')
crntdatetime = datetime.datetime.now()
timestamp = crntdatetime.strftime("%d/%m/%Y %H:%M")
outstring = str(timestamp)+" "+str(final_sensorvalue)+ "\n"
print outstring
f.write(outstring)
f.close()
time.sleep(2)
You could convert the lines to Counter objects where the key is date & hour ('03/05/2016 11') and value is number as int. Then you could add all the Counter objects together, sort the items and write them to a file:
from collections import Counter
import re
# One Counter per line, keyed by 'date hour' (everything before the first
# ':'), valued by the trailing integer; summing the Counters adds together
# all values that fall in the same hour bucket.
with open('test.txt') as f:
res = sum((Counter({x.group(1): int(x.group(2))})
for x in (re.search('(.*?):.*\s(\d+)', line) for line in f) if x),
Counter())
# Write 'dd/mm/YYYY HH:00 - (HH+1):00 total' lines in sorted key order.
# (iteritems is Python 2; on Python 3 this would be res.items().)
with open('output.txt', 'w') as f:
f.writelines('{0}:00 - {1}:00 {2}\n'.format(k, int(k.split()[-1]) + 1, v)
for k, v in sorted(res.iteritems()))
Contents of output.txt:
03/05/2016 11:00 - 12:00 290
03/05/2016 12:00 - 13:00 257
You can try like this :
# NOTE(review): the file handle is never closed — prefer a with-statement.
fo = open("data.txt","r")
lines = fo.readlines()
#print lines
# Hourly totals keyed by 'date_HH:00-(HH+1):00'.
d={}
# NOTE(review): the step of 2 skips every other line — this assumes the
# source file has a blank line between records; use step 1 otherwise.
for i in range(0,len(lines),2):
l = lines[i].split()
# Build the hour-range label; 23:00 wraps around to 0:00.
if int(l[1].split(":")[0]) != 23:
time = l[1].split(":")[0] + ":00-" + str(int(l[1].split(":")[0])+1) +":00"
else:
time = l[1].split(":")[0] + ":00-0:00"
#key = l[0]+"_"+l[1].split(":")[0]
key = l[0]+"_"+time
# Accumulate the third column (the reading) into the hour bucket.
if key in d:
d[key] = int(d[key]) + int(l[2])
else:
d[key] = int(l[2])
print d
>>>
{'03/05/2016_11:00-12:00': 290, '03/05/2016_12:00-13:00': 257}

Categories