Why is my second thread not executing at all?

I have a GUI that takes input values. When the button on the GUI is pressed, it receives a data block from my server, puts it into a queue, and uses threads to call two functions, recvData() and calculate_threshold(). The first function keeps receiving data blocks from the server and putting them in the queue, while the second continuously removes data blocks from the queue and performs some calculations on them before writing the results to a file. It keeps doing this as long as the queue is not empty.
Below is my client code:
import socket
import turtle
#import time
import queue
import threading
from tkinter import *

class GUI:
    entries = []

    def __init__(self, master):
        self.master = master
        master.title("Collision Detection")
        self.buff_data = queue.Queue()
        self.t1 = threading.Thread(target = self.recvData)
        self.t2 = threading.Thread(target=self.calculate_threshold)
        self.entries = []
        self.host = '127.0.0.1'
        self.port = 5000
        self.s = socket.socket()
        self.s.connect((self.host, self.port))
        self.create_GUI()

    def create_GUI(self):
        self.input_label = Label(root, text = "Input all the gratings set straight wavelength values in nm")
        self.input_label.grid(row = 0)
        self.core_string = "Core "
        self.label_col_inc = 0
        self.entry_col_inc = 1
        self.core_range = range(1, 5)
        for y in self.core_range:
            self.core_text = self.core_string + str(y) + '_' + '25'
            self.core_label = Label(root, text = self.core_text)
            self.entry = Entry(root)
            self.core_label.grid(row=1, column=self.label_col_inc, sticky=E)
            self.entry.grid(row=1, column=self.entry_col_inc)
            self.entries.append(self.entry)
            self.label_col_inc += 2
            self.entry_col_inc += 2
        self.threshold_label = Label(root, text = "Threshold in nm")
        self.entry_threshold = Entry(root)
        self.threshold_label.grid(row = 2, sticky = E)
        self.entry_threshold.grid(row = 2, column = 1)
        self.light_label = Label(root, text = 'Status')
        self.light_label.grid(row = 3, column = 3)
        self.canvas = Canvas(root, width = 150, height = 50)
        self.canvas.grid(row = 4, column = 3)
        # Green light
        self.green_light = turtle.RawTurtle(self.canvas)
        self.green_light.shape('circle')
        self.green_light.color('grey')
        self.green_light.penup()
        self.green_light.goto(0,0)
        # Red light
        self.red_light = turtle.RawTurtle(self.canvas)
        self.red_light.shape('circle')
        self.red_light.color('grey')
        self.red_light.penup()
        self.red_light.goto(40,0)
        self.data_button = Button(root, text = "Get data above threshold", command = self.getData)
        self.data_button.grid(row = 5, column = 0)
        len_message = self.s.recv(4)
        print('len_message', len_message)
        bytes_length = int(len_message.decode('utf-8')) # for the self-made server
        recvd_data = self.s.recv(bytes_length)
        print('data', recvd_data)
        self.buff_data.put(recvd_data)
        #print('buffer', self.buff_data)
        self.t1.start()
        self.t2.start()

    def recvData(self):
        len_message = self.s.recv(4)
        print('len_message', len_message)
        while len_message:
            bytes_length = int(len_message.decode('utf-8')) # for the self-made server
            recvd_data = self.s.recv(bytes_length)
            print('data', recvd_data)
            self.buff_data.put(recvd_data)
            len_message = self.s.recv(4)
            print('len_message', len_message)
        else:
            print('out of loop')
            self.s.close()

    def calculate_threshold(self):
        while not self.buff_data.empty:
            rmv_data = self.buff_data.get()
            stringdata = rmv_data.decode('utf-8')
            rep_str = stringdata.replace(",", ".")
            splitstr = rep_str.split()
            # received wavelength values
            inc = 34
            wav_threshold = []
            for y in self.entries:
                straight_wav = float(y.get())
                wav = float(splitstr[inc])
                wav_diff = wav - straight_wav
                if wav_diff < 0:
                    wav_diff = wav_diff * (-1)
                wav_threshold.append(wav_diff)
                inc += 56
            threshold = float(self.entry_threshold.get())
            # writing into the file
            data = []
            inc1 = 0
            col1 = 2
            col2 = 6
            data.insert(0, (str(splitstr[0])))
            data.insert(1, (str(splitstr[1])))
            for x in wav_threshold:
                if (x > threshold):
                    self.red_light.color('red')
                    self.green_light.color('grey')
                    data.insert(col1, (str(splitstr[34 + inc1])))
                    data.insert(col2,(str(x)))
                else:
                    self.red_light.color('grey')
                    self.green_light.color('green')
                    data.insert(col1,'-')
                    data.insert(col2,'-')
                inc1 += 56
                col1 += 1
                col2 += 1
            self.write_file(data)

    # function to write into the file
    def write_file(self,data):
        with open("Output.txt", "a") as text_file:
            text_file.write('\t'.join(data[0:]))
            text_file.write('\n')

if __name__ == '__main__':
    root = Tk()
    gui = GUI(root)
    root.mainloop()
My server code is:
import socket
import threading
import os

def Main():
    host = '127.0.0.1'
    port = 5000
    s = socket.socket()
    s.bind((host,port))
    s.listen(5)
    print("Server started")
    while True:
        c,addr = s.accept()
        print("Client connected ip:<" + str(addr) + ">")
        c.sendall('1685 2020/03/02 14:42:05 318301 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/03 14:42:05 318302 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/04 14:42:05 318303 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/05 14:42:05 318411 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.close()

if __name__ == '__main__':
    Main()
But as far as I can see, my second thread is not running. Can someone point out where I am going wrong? Please help!

Tkinter is single-threaded, so a Thread is of little to no use here: widgets should only be updated from the main thread.
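A common pattern is to let the worker thread only fill the queue and have the Tk main loop poll it with after(). Below is a minimal, self-contained sketch of that idea; the widget names, the fake worker, and the 100 ms interval are illustrative, not taken from the question's code:

import queue
import threading
import tkinter as tk

buff = queue.Queue()

def worker():
    # background thread: only produce data, never touch Tkinter widgets here
    for i in range(5):
        buff.put('block %d' % i)

def poll_queue():
    # runs on the Tk main loop, so updating widgets here is safe
    try:
        while True:
            block = buff.get_nowait()
            label.config(text=block)
    except queue.Empty:
        pass
    root.after(100, poll_queue)  # check the queue again in 100 ms

root = tk.Tk()
label = tk.Label(root, text='waiting...')
label.pack()
threading.Thread(target=worker, daemon=True).start()
poll_queue()
root.mainloop()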

Related

Locating column values in a pandas dataframe with conditions

We have a dataframe (df_source):
Unnamed: 0 DATETIME DEVICE_ID COD_1 DAT_1 COD_2 DAT_2 COD_3 DAT_3 COD_4 DAT_4 COD_5 DAT_5 COD_6 DAT_6 COD_7 DAT_7
0 0 200520160941 002222111188 35 200408100500.0 12 200408100400 16 200408100300 11 200408100200 19 200408100100 35 200408100000 43
1 19 200507173541 000049000110 00 190904192701.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 20 200507173547 000049000110 00 190908185501.0 08 190908185501 NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 21 200507173547 000049000110 00 190908205601.0 08 190908205601 NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 22 200507173547 000049000110 00 190909005800.0 08 190909005800 NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
159 775 200529000843 000049768051 40 200529000601.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
160 776 200529000843 000049015792 00 200529000701.0 33 200529000701 NaN NaN NaN NaN NaN NaN NaN NaN NaN
161 779 200529000843 000049180500 00 200529000601.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
162 784 200529000843 000049089310 00 200529000201.0 03 200529000201 61 200529000201 NaN NaN NaN NaN NaN NaN NaN
163 786 200529000843 000049768051 40 200529000401.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
We calculated values_cont, a Series of value counts, for a subset:
v_subset = ['COD_1', 'COD_2', 'COD_3', 'COD_4', 'COD_5', 'COD_6', 'COD_7']
values_cont = pd.value_counts(df_source[v_subset].values.ravel())
We obtained as result (values, counter):
00 134
08 37
42 12
40 12
33 3
11 3
03 2
35 2
43 2
44 1
61 1
04 1
12 1
60 1
05 1
19 1
34 1
16 1
Now, the question is:
How to locate values in columns corresponding to counter, for instance:
How to locate:
df['DEVICE_ID'] # corresponding with values ('00') and counter ('134')
df['DEVICE_ID'] # corresponding with values ('08') and counter ('37')
...
df['DEVICE_ID'] # corresponding with values ('16') and counter ('1')
I believe you need DataFrame.melt with an aggregated join for the IDs and GroupBy.size for the counts.
This implementation will result in a dataframe with a column (value) for the CODES, all the associated DEVICE_IDs, and the count of ids associated with each code.
This is an alternative to values_cont in the question.
v_subset = ['COD_1', 'COD_2', 'COD_3', 'COD_4', 'COD_5', 'COD_6', 'COD_7']
df = (df_source.melt(id_vars='DEVICE_ID', value_vars=v_subset)
               .dropna(subset=['value'])
               .groupby('value')
               .agg(DEVICE_ID=('DEVICE_ID', ','.join), count=('value', 'size'))
               .reset_index())
print (df)
value DEVICE_ID count
0 00 000049000110,000049000110,000049000110,0000490... 7
1 03 000049089310 1
2 08 000049000110,000049000110,000049000110 3
3 11 002222111188 1
4 12 002222111188 1
5 16 002222111188 1
6 19 002222111188 1
7 33 000049015792 1
8 35 002222111188,002222111188 2
9 40 000049768051,000049768051 2
10 43 002222111188 1
11 61 000049089310 1
# print DEVICE_ID for CODES == '03'
print(df.DEVICE_ID[df.value == '03'])
[out]:
1 000049089310
Name: DEVICE_ID, dtype: object
Given the question as related to df_source, to select specific parts of the dataframe, use Pandas: Boolean Indexing
# to return all rows where COD_1 is '00'
df_source[df_source.COD_1 == '00']
# to return only the DEVICE_ID column where COD_1 is '00'
df_source['DEVICE_ID'][df_source.COD_1 == '00']
You can use boolean indexing with df.loc (rather than df.iloc, which indexes by integer position and does not accept a boolean Series) to pick out the rows that match on a column, then select the column of interest from that result. Note that a counter such as 134 is a count produced by value_counts, not a value stored in the frame, so there is no column to match it against. There may be a more pythonic way to do this.
df2 = df.loc[df['COD_1'] == '00']
df_out = df2['DEVICE_ID']
Here's more info on indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

How can I read each row till the first occurrence of NaN?

From an Excel file I want to read each row and process it independently.
Here is how the data looks in the Excel file:
12 32 45 67 89 54 23 56 78 98
34 76 34 89 34 3
76 34 54 12 43 78 56
76 56 45 23 43 45 67 76 67 8
87 9 9 0 89 90 6 89
23 90 90 32 23 34 56 9 56 87
23 56 34 3 5 8 7 6 98
32 23 34 6 65 78 67 87 89 87
12 23 34 32 43 67 45
343 76 56 7 8 9 4
But when I read it through pandas, the remaining columns are filled with NaN.
The data after reading with pandas looks like:
0 12 32 45 67 89 54 23.0 56.0 78.0 98.0
1 34 76 34 89 34 3 NaN NaN NaN NaN
2 76 34 54 12 43 78 56.0 NaN NaN NaN
3 76 56 45 23 43 45 67.0 76.0 67.0 8.0
4 87 9 9 0 89 90 6.0 89.0 NaN NaN
5 23 90 90 32 23 34 56.0 9.0 56.0 87.0
6 23 56 34 3 5 8 7.0 6.0 98.0 NaN
7 32 23 34 6 65 78 67.0 87.0 89.0 87.0
8 12 23 34 32 43 67 45.0 NaN NaN NaN
9 343 76 56 7 8 9 4.0 5.0 8.0 68.0
Here it can be seen that the remaining columns of each row are filled with NaN, which I don't want.
Nor do I want to replace them with some other value or drop the whole rows containing NaN.
How can I read the columns of each row till the first occurrence of NaN?
For example, the second row in pandas is 34 76 34 89 34 3 NaN NaN NaN NaN,
so my desired output would read only 34 76 34 89 34 3.
My preference is pandas, but if that is not possible, is there any other way of doing it, like with some other library?
Any resource or reference will be helpful.
Thanks
While calling the pd.read_excel function, try setting keep_default_na=False. This will avoid the default NaN values while reading.
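A minimal sketch of how that could look for this data, assuming the sheet has no header row (the file name is a placeholder). With keep_default_na=False, empty cells come back as empty strings instead of NaN, so each row can be cut at the first empty cell:

import pandas as pd

df = pd.read_excel('data.xlsx', header=None, keep_default_na=False)  # 'data.xlsx' is hypothetical
rows = []
for row in df.values.tolist():
    values = []
    for cell in row:
        if cell == '':      # stop at the first empty cell
            break
        values.append(cell)
    rows.append(values)
print(rows[1])  # e.g. [34, 76, 34, 89, 34, 3]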

replace() works on a str but does not work on an object dtype column

ab = '1 234'
ab = ab.replace(" ", "")
ab
'1234'
It's easy to use replace() to get rid of the whitespace, but not when I have a column of a pandas dataframe:
gbpusd['Profit'] = gbpusd['Profit'].replace(" ", "")
gbpusd['Profit'].head()
3 7 000.00
4 6 552.00
11 4 680.00
14 3 250.00
24 1 700.00
Name: Profit, dtype: object
But it didn't work; I googled many times and found no solutions...
gbpusd['Profit'].sum()
TypeError: can only concatenate str (not "int") to str
As the whitespace is still there, I cannot do further analysis, like sum().
The thing is harder than I thought: the raw data is
gbpusd.head()
Ticket Open Time Type Volume Item Price S / L T / P Close Time Price.1 Commission Taxes Swap Profit
84 50204109.0 2019.10.24 09:56:32 buy 0.5 gbpusd 1.29148 0.0 0.0 2019.10.24 09:57:48 1.29179 0 0.0 0.0 15.5
85 50205025.0 2019.10.24 10:10:13 buy 0.5 gbpusd 1.29328 0.0 0.0 2019.10.24 15:57:02 1.29181 0 0.0 0.0 -73.5
86 50207371.0 2019.10.24 10:34:10 buy 0.5 gbpusd 1.29236 0.0 0.0 2019.10.24 15:57:18 1.29197 0 0.0 0.0 -19.5
87 50207747.0 2019.10.24 10:40:32 buy 0.5 gbpusd 1.29151 0.0 0.0 2019.10.24 15:57:24 1.29223 0 0.0 0.0 36
88 50212252.0 2019.10.24 11:47:14 buy 1.5 gbpusd 1.28894 0.0 0.0 2019.10.24 15:57:12 1.29181 0 0.0 0.0 430.5
when I did
gbpusd['Profit'] = gbpusd['Profit'].str.replace(" ", "")
gbpusd['Profit']
84 NaN
85 NaN
86 NaN
87 NaN
88 NaN
89 NaN
90 NaN
91 NaN
92 NaN
93 NaN
94 NaN
95 NaN
96 NaN
97 NaN
98 NaN
99 NaN
100 NaN
101 NaN
102 NaN
103 NaN
104 NaN
105 NaN
106 NaN
107 NaN
108 NaN
109 NaN
110 NaN
111 NaN
112 NaN
113 NaN
...
117 4680.00
118 NaN
119 NaN
120 NaN
121 NaN
122 NaN
123 NaN
124 NaN
125 NaN
126 NaN
127 NaN
128 NaN
129 NaN
130 -2279.00
131 -2217.00
132 -2037.00
133 -5379.00
134 -1620.00
135 -7154.00
136 -4160.00
137 1144.00
138 NaN
139 NaN
140 NaN
141 -1920.00
142 7000.00
143 3250.00
144 NaN
145 1700.00
146 NaN
Name: Profit, Length: 63, dtype: object
The whitespace is replaced, but some data which had no spaces is NaN now... someone else may have the same problem...
You also need to use the .str accessor:
gbpusdprofit = gbpusd['Profit'].str.replace(" ", "")
Output:
0 7000.00
1 6552.00
2 4680.00
3 3250.00
4 1700.00
Name: Profit, dtype: object
and for sum:
gbpusd['Profit'].str.replace(" ", "").astype('float').sum()
Result:
23182.0
You can do the replace, the conversion to float, and the sum in a one-liner:
gbpusd['Profit'].str.replace(' ', "").astype(float).sum()
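As the update in the question shows, .str methods return NaN for elements that are not actually strings, so on a column that mixes floats and strings it may help to force everything to str first. A hedged sketch:

# assumption: gbpusd['Profit'] mixes floats (e.g. 36) and strings (e.g. '7 000.00')
profit = (gbpusd['Profit']
          .astype(str)                          # make every element a string
          .str.replace(' ', '', regex=False)    # drop the thousands separators
          .astype(float))
print(profit.sum())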

Pandas groupby, resample, calculate pct_change and then store the result back into the original-frequency dataframe

I have a dataframe of daily stock data, which is indexed by a datetimeindex.
There are multiple stock entries, thus there are duplicate datetimeindex values.
I am looking for a way to:
Group the dataframe by the stock symbol
Resample the prices for each symbol group into monthly price frequency data
Perform a pct_change calculation on each symbol group monthly price
Store it as a new column 'monthly_return' in the original dataframe.
I have been able to manage the first three operations. Storing the result in the original dataframe is where I'm having some trouble.
To illustrate this, I created a toy dataset which includes a 'dummy' index (idx) column which I use to assist creation of the desired output later on in the third code block.
import random
import pandas as pd
import numpy as np
PER = 62  # PER is not defined in the original question; 62 matches the sample output below
datelist = pd.date_range('2018-01-01', periods=PER).to_pydatetime().tolist() * 2
ids = [random.choice(['A', 'B']) for i in range(len(datelist))]
prices = random.sample(range(200), len(datelist))
idx = range(len(datelist))
df1 = pd.DataFrame(data=list(zip(idx, ids, prices)), index=datelist, columns='idx label prices'.split())
print(df1.head(10))
df1
idx label prices
2018-01-01 0 B 40
2018-01-02 1 A 190
2018-01-03 2 A 159
2018-01-04 3 A 25
2018-01-05 4 A 89
2018-01-06 5 B 164
...
2018-01-31 30 A 102
2018-02-01 31 A 117
2018-02-02 32 A 120
2018-02-03 33 B 75
2018-02-04 34 B 170
...
Desired Output
idx label prices monthly_return
2018-01-01 0 B 40 0.000000
2018-01-02 1 A 190 0.000000
2018-01-03 2 A 159 0.000000
2018-01-04 3 A 25 0.000000
2018-01-05 4 A 89 0.000000
2018-01-06 5 B 164 0.000000
...
2018-01-31 30 A 102 -0.098039
2018-02-01 31 A 117 0.000000
2018-02-02 32 A 120 0.000000
...
2018-02-26 56 B 152 0.000000
2018-02-27 57 B 2 0.000000
2018-02-28 58 B 49 -0.040816
2018-03-01 59 B 188 0.000000
...
2018-01-28 89 A 88 0.000000
2018-01-29 90 A 26 0.000000
2018-01-30 91 B 128 0.000000
2018-01-31 92 A 144 -0.098039
...
2018-02-26 118 A 92 0.000000
2018-02-27 119 B 111 0.000000
2018-02-28 120 B 34 -0.040816
...
What I have tried so far is:
dfX = df1.copy(deep=True)
dfX = df1.groupby('label').resample('M')['prices'].last().pct_change(1).shift(-1)
print(dfX)
Which outputs:
label
A 2018-01-31 -0.067961
2018-02-28 -0.364583
2018-03-31 0.081967
B 2018-01-31 1.636364
2018-02-28 -0.557471
2018-03-31 NaN
This is quite close to what I would like to do; however, I am only getting pct_change data back on end-of-month dates, which is awkward to store back in the original dataframe (df1) as a new column.
Something like this doesn't work:
dfX = df1.copy(deep=True)
dfX['monthly_return'] = df1.groupby('label').resample('M')['prices'].last().pct_change(1).shift(-1)
As it yields the error:
TypeError: incompatible index of inserted column with frame index
I have considered 'upsampling' the monthly_return data back into a daily series; however, this would likely cause the same error mentioned above, since the original dataset could be missing dates (such as weekends). Additionally, resetting the index to clear this error would still create problems, as the grouped dfX does not have the same number of rows/frequency as the original df1, which is of daily frequency.
I have a hunch that this can be done by using multi-indexing and dataframe merging; however, I am unsure how to go about doing so. (A rough sketch of that idea appears after the solution below.)
This generates my desired output, but it isn't as clean a solution as I was hoping for.
df1 is generated the same as before (code given in question):
idx label prices
2018-01-01 0 A 145
2018-01-02 1 B 86
2018-01-03 2 B 141
...
2018-01-25 86 B 12
2018-01-26 87 B 71
2018-01-27 88 B 186
2018-01-28 89 B 151
2018-01-29 90 A 161
2018-01-30 91 B 143
2018-01-31 92 B 88
...
Then:
def fun(x):
    dates = x.date
    x = x.set_index('date', drop=True)
    x['monthly_return'] = x.resample('M').last()['prices'].pct_change(1).shift(-1)
    x = x.reindex(dates)
    return x

dfX = df1.copy(deep=True)
dfX.reset_index(inplace=True)
dfX.columns = 'date idx label prices'.split()
dfX = dfX.groupby('label').apply(fun).droplevel(level='label')
print(dfX)
Which outputs the desired result (unsorted):
idx label prices monthly_return
date
2018-01-01 0 A 145 NaN
2018-01-06 5 A 77 NaN
2018-01-08 7 A 48 NaN
2018-01-09 8 A 31 NaN
2018-01-11 10 A 20 NaN
2018-01-12 11 A 27 NaN
2018-01-14 13 A 109 NaN
2018-01-15 14 A 166 NaN
2018-01-17 16 A 130 NaN
2018-01-18 17 A 139 NaN
2018-01-19 18 A 191 NaN
2018-01-21 20 A 164 NaN
2018-01-22 21 A 112 NaN
2018-01-23 22 A 167 NaN
2018-01-25 24 A 140 NaN
2018-01-26 25 A 42 NaN
2018-01-30 29 A 107 NaN
2018-02-04 34 A 9 NaN
2018-02-07 37 A 84 NaN
2018-02-08 38 A 23 NaN
2018-02-10 40 A 30 NaN
2018-02-12 42 A 89 NaN
2018-02-15 45 A 79 NaN
2018-02-16 46 A 115 NaN
2018-02-19 49 A 197 NaN
2018-02-21 51 A 11 NaN
2018-02-26 56 A 111 NaN
2018-02-27 57 A 126 NaN
2018-03-01 59 A 135 NaN
2018-03-03 61 A 28 NaN
2018-01-01 62 A 120 NaN
2018-01-03 64 A 170 NaN
2018-01-05 66 A 45 NaN
2018-01-07 68 A 173 NaN
2018-01-08 69 A 158 NaN
2018-01-09 70 A 63 NaN
2018-01-11 72 A 62 NaN
2018-01-12 73 A 168 NaN
2018-01-14 75 A 169 NaN
2018-01-15 76 A 142 NaN
2018-01-17 78 A 83 NaN
2018-01-18 79 A 96 NaN
2018-01-21 82 A 25 NaN
2018-01-22 83 A 90 NaN
2018-01-23 84 A 59 NaN
2018-01-29 90 A 161 NaN
2018-02-01 93 A 150 NaN
2018-02-04 96 A 85 NaN
2018-02-06 98 A 124 NaN
2018-02-14 106 A 195 NaN
2018-02-16 108 A 136 NaN
2018-02-17 109 A 134 NaN
2018-02-18 110 A 183 NaN
2018-02-19 111 A 32 NaN
2018-02-24 116 A 102 NaN
2018-02-25 117 A 72 NaN
2018-02-27 119 A 38 NaN
2018-03-02 122 A 137 NaN
2018-03-03 123 A 171 NaN
2018-01-02 1 B 86 NaN
2018-01-03 2 B 141 NaN
2018-01-04 3 B 189 NaN
2018-01-05 4 B 60 NaN
2018-01-07 6 B 1 NaN
2018-01-10 9 B 87 NaN
2018-01-13 12 B 44 NaN
2018-01-16 15 B 147 NaN
2018-01-20 19 B 92 NaN
2018-01-24 23 B 81 NaN
2018-01-27 26 B 190 NaN
2018-01-28 27 B 24 NaN
2018-01-29 28 B 116 NaN
2018-01-31 30 B 98 1.181818
2018-02-01 31 B 121 NaN
2018-02-02 32 B 110 NaN
2018-02-03 33 B 66 NaN
2018-02-05 35 B 4 NaN
2018-02-06 36 B 13 NaN
2018-02-09 39 B 114 NaN
2018-02-11 41 B 16 NaN
2018-02-13 43 B 174 NaN
2018-02-14 44 B 78 NaN
2018-02-17 47 B 144 NaN
2018-02-18 48 B 14 NaN
2018-02-20 50 B 133 NaN
2018-02-22 52 B 156 NaN
2018-02-23 53 B 159 NaN
2018-02-24 54 B 177 NaN
2018-02-25 55 B 43 NaN
2018-02-28 58 B 19 -0.338542
2018-03-02 60 B 127 NaN
2018-01-02 63 B 2 NaN
2018-01-04 65 B 97 NaN
2018-01-06 67 B 8 NaN
2018-01-10 71 B 54 NaN
2018-01-13 74 B 106 NaN
2018-01-16 77 B 74 NaN
2018-01-19 80 B 188 NaN
2018-01-20 81 B 172 NaN
2018-01-24 85 B 51 NaN
2018-01-25 86 B 12 NaN
2018-01-26 87 B 71 NaN
2018-01-27 88 B 186 NaN
2018-01-28 89 B 151 NaN
2018-01-30 91 B 143 NaN
2018-01-31 92 B 88 1.181818
2018-02-02 94 B 75 NaN
2018-02-03 95 B 103 NaN
2018-02-05 97 B 82 NaN
2018-02-07 99 B 128 NaN
2018-02-08 100 B 123 NaN
2018-02-09 101 B 52 NaN
2018-02-10 102 B 18 NaN
2018-02-11 103 B 21 NaN
2018-02-12 104 B 50 NaN
2018-02-13 105 B 64 NaN
2018-02-15 107 B 185 NaN
2018-02-20 112 B 125 NaN
2018-02-21 113 B 108 NaN
2018-02-22 114 B 132 NaN
2018-02-23 115 B 180 NaN
2018-02-26 118 B 67 NaN
2018-02-28 120 B 192 -0.338542
2018-03-01 121 B 58 NaN
Perhaps there is a more concise and pythonic way of doing this.
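Picking up the multi-indexing/merging hunch from the question, here is a hedged sketch of how the month-end returns could be merged back onto the daily frame. It reuses df1 and the pct_change/shift logic from the question; the monthly frame and the month_end helper column are names introduced here for illustration:

monthly = (df1.groupby('label').resample('M')['prices'].last()
              .pct_change(1).shift(-1)  # same calculation as in the question
              .rename('monthly_return')
              .rename_axis(['label', 'month_end'])
              .reset_index())
dfX = df1.rename_axis('date').reset_index()
# map each daily row to its month end, then pull in the matching monthly return
dfX['month_end'] = dfX['date'] + pd.offsets.MonthEnd(0)
dfX = (dfX.merge(monthly, on=['label', 'month_end'], how='left')
          .drop(columns='month_end'))
# keep the value only on actual month-end rows, as in the desired output
dfX.loc[~dfX['date'].dt.is_month_end, 'monthly_return'] = np.nan
# optionally fill the remaining NaN with 0 to match the zeros shown above:
# dfX['monthly_return'] = dfX['monthly_return'].fillna(0)

Note that, as in the question's own attempt, pct_change here runs over the concatenated month-end series, so the boundary between labels is not handled; a per-group pct_change (e.g. .groupby(level='label').pct_change()) would avoid that.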

Python: Expand a dataframe row-wise based on datetime

I have a dataframe like this:
ID Date Value
783 C 2018-02-23 0.704
580 B 2018-08-04 -1.189
221 A 2018-08-10 -0.788
228 A 2018-08-17 0.038
578 B 2018-08-02 1.188
What I want is to expand the dataframe based on the Date column back to one month earlier, fill ID with the same person, and fill Value with NaN until the last observation.
The expected result is similar to this:
ID Date Value
0 C 2018/01/24 nan
1 C 2018/01/25 nan
2 C 2018/01/26 nan
3 C 2018/01/27 nan
4 C 2018/01/28 nan
5 C 2018/01/29 nan
6 C 2018/01/30 nan
7 C 2018/01/31 nan
8 C 2018/02/01 nan
9 C 2018/02/02 nan
10 C 2018/02/03 nan
11 C 2018/02/04 nan
12 C 2018/02/05 nan
13 C 2018/02/06 nan
14 C 2018/02/07 nan
15 C 2018/02/08 nan
16 C 2018/02/09 nan
17 C 2018/02/10 nan
18 C 2018/02/11 nan
19 C 2018/02/12 nan
20 C 2018/02/13 nan
21 C 2018/02/14 nan
22 C 2018/02/15 nan
23 C 2018/02/16 nan
24 C 2018/02/17 nan
25 C 2018/02/18 nan
26 C 2018/02/19 nan
27 C 2018/02/20 nan
28 C 2018/02/21 nan
29 C 2018/02/22 nan
30 C 2018/02/23 1.093
31 B 2018/07/05 nan
32 B 2018/07/06 nan
33 B 2018/07/07 nan
34 B 2018/07/08 nan
35 B 2018/07/09 nan
36 B 2018/07/10 nan
37 B 2018/07/11 nan
38 B 2018/07/12 nan
39 B 2018/07/13 nan
40 B 2018/07/14 nan
41 B 2018/07/15 nan
42 B 2018/07/16 nan
43 B 2018/07/17 nan
44 B 2018/07/18 nan
45 B 2018/07/19 nan
46 B 2018/07/20 nan
47 B 2018/07/21 nan
48 B 2018/07/22 nan
49 B 2018/07/23 nan
50 B 2018/07/24 nan
51 B 2018/07/25 nan
52 B 2018/07/26 nan
53 B 2018/07/27 nan
54 B 2018/07/28 nan
55 B 2018/07/29 nan
56 B 2018/07/30 nan
57 B 2018/07/31 nan
58 B 2018/08/01 nan
59 B 2018/08/02 nan
60 B 2018/08/03 nan
61 B 2018/08/04 0.764
62 A 2018/07/11 nan
63 A 2018/07/12 nan
64 A 2018/07/13 nan
65 A 2018/07/14 nan
66 A 2018/07/15 nan
67 A 2018/07/16 nan
68 A 2018/07/17 nan
69 A 2018/07/18 nan
70 A 2018/07/19 nan
71 A 2018/07/20 nan
72 A 2018/07/21 nan
73 A 2018/07/22 nan
74 A 2018/07/23 nan
75 A 2018/07/24 nan
76 A 2018/07/25 nan
77 A 2018/07/26 nan
78 A 2018/07/27 nan
79 A 2018/07/28 nan
80 A 2018/07/29 nan
81 A 2018/07/30 nan
82 A 2018/07/31 nan
83 A 2018/08/01 nan
84 A 2018/08/02 nan
85 A 2018/08/03 nan
86 A 2018/08/04 nan
87 A 2018/08/05 nan
88 A 2018/08/06 nan
89 A 2018/08/07 nan
90 A 2018/08/08 nan
91 A 2018/08/09 nan
92 A 2018/08/10 2.144
93 A 2018/07/18 nan
94 A 2018/07/19 nan
95 A 2018/07/20 nan
96 A 2018/07/21 nan
97 A 2018/07/22 nan
98 A 2018/07/23 nan
99 A 2018/07/24 nan
100 A 2018/07/25 nan
101 A 2018/07/26 nan
102 A 2018/07/27 nan
103 A 2018/07/28 nan
104 A 2018/07/29 nan
105 A 2018/07/30 nan
106 A 2018/07/31 nan
107 A 2018/08/01 nan
108 A 2018/08/02 nan
109 A 2018/08/03 nan
110 A 2018/08/04 nan
111 A 2018/08/05 nan
112 A 2018/08/06 nan
113 A 2018/08/07 nan
114 A 2018/08/08 nan
115 A 2018/08/09 nan
116 A 2018/08/10 nan
117 A 2018/08/11 nan
118 A 2018/08/12 nan
119 A 2018/08/13 nan
120 A 2018/08/14 nan
121 A 2018/08/15 nan
122 A 2018/08/16 nan
123 A 2018/08/17 0.644
124 B 2018/07/03 nan
125 B 2018/07/04 nan
126 B 2018/07/05 nan
127 B 2018/07/06 nan
128 B 2018/07/07 nan
129 B 2018/07/08 nan
130 B 2018/07/09 nan
131 B 2018/07/10 nan
132 B 2018/07/11 nan
133 B 2018/07/12 nan
134 B 2018/07/13 nan
135 B 2018/07/14 nan
136 B 2018/07/15 nan
137 B 2018/07/16 nan
138 B 2018/07/17 nan
139 B 2018/07/18 nan
140 B 2018/07/19 nan
141 B 2018/07/20 nan
142 B 2018/07/21 nan
143 B 2018/07/22 nan
144 B 2018/07/23 nan
145 B 2018/07/24 nan
146 B 2018/07/25 nan
147 B 2018/07/26 nan
148 B 2018/07/27 nan
149 B 2018/07/28 nan
150 B 2018/07/29 nan
151 B 2018/07/30 nan
152 B 2018/07/31 nan
153 B 2018/08/01 nan
154 B 2018/08/02 -0.767
The source data can be created as below:
import pandas as pd
from itertools import chain
import numpy as np
df_1 = pd.DataFrame({
    'ID': list(chain.from_iterable([['A'] * 365, ['B'] * 365, ['C'] * 365])),
    'Date': pd.date_range(start='2018-01-01', end='2018-12-31').tolist() * 3,
    'Value': np.random.randn(365 * 3)
})
df_1 = df_1.sample(5, random_state = 123)
Thanks for the advice!
You can create another DataFrame with the previous months, then join them together with concat and create a DatetimeIndex, so it is possible to use groupby with resample('D') (daily) to add all the values in between:
df_2 = df_1.assign(Date = df_1['Date'] - pd.DateOffset(months=1) + pd.DateOffset(days=1),
                   Value = np.nan)
df = (pd.concat([df_2, df_1], sort=False)
        .reset_index()
        .set_index('Date')
        .groupby('index', sort=False)
        .resample('D')
        .ffill()
        .reset_index(level=1)
        .drop('index', 1)
        .rename_axis(None))
print (df)
Date ID Value
783 2018-01-24 C NaN
783 2018-01-25 C NaN
783 2018-01-26 C NaN
783 2018-01-27 C NaN
783 2018-01-28 C NaN
.. ... .. ...
578 2018-07-29 B NaN
578 2018-07-30 B NaN
578 2018-07-31 B NaN
578 2018-08-01 B NaN
578 2018-08-02 B 0.562684
[155 rows x 3 columns]
Another solution with a list comprehension and concat; at the end, back-filling the index and ID columns is necessary. This solution works only if there are no missing values in the original ID column:
offset = pd.DateOffset(months=1) + pd.DateOffset(days=1)
df = pd.concat([df_1.iloc[[i]].reset_index().set_index('Date').reindex(pd.date_range(d - offset, d))
                for i, d in enumerate(df_1['Date'])], sort=False)
df = (df.assign(index = df['index'].bfill().astype(int), ID = df['ID'].bfill())
        .rename_axis('Date')
        .reset_index()
        .set_index('index')
        .rename_axis(None))
print (df)
Date ID Value
783 2018-01-24 C NaN
783 2018-01-25 C NaN
783 2018-01-26 C NaN
783 2018-01-27 C NaN
783 2018-01-28 C NaN
.. ... .. ...
578 2018-07-29 B NaN
578 2018-07-30 B NaN
578 2018-07-31 B NaN
578 2018-08-01 B NaN
578 2018-08-02 B 1.224345
[155 rows x 3 columns]
We can create a date range in the "Date" column, then explode it.
Then group the "Value" column by the index and set all values to NaN except the last.
Finally, reset the index.
def drange(t):
    return pd.date_range(t - pd.DateOffset(months=1) + pd.DateOffset(days=1), t, freq="D", normalize=True)

df["Date"] = df["Date"].transform(drange)
ID Date Value
index
783 C DatetimeIndex(['2018-01-24', '2018-01-25', '20... 0.704
580 B DatetimeIndex(['2018-07-05', '2018-07-06', '20... -1.189
221 A DatetimeIndex(['2018-07-11', '2018-07-12', '20... -0.788
228 A DatetimeIndex(['2018-07-18', '2018-07-19', '20... 0.038
578 B DatetimeIndex(['2018-07-03', '2018-07-04', '20... 1.188
df= df.reset_index(drop=True).explode(column="Date")
ID Date Value
0 C 2018-01-24 0.704
0 C 2018-01-25 0.704
0 C 2018-01-26 0.704
0 C 2018-01-27 0.704
0 C 2018-01-28 0.704
.. .. ... ...
4 B 2018-07-29 1.188
4 B 2018-07-30 1.188
4 B 2018-07-31 1.188
4 B 2018-08-01 1.188
4 B 2018-08-02 1.188
df["Value"]= df.groupby(level=0)["Value"].transform(lambda v: [np.nan]*(len(v)-1)+[v.iloc[0]])
df= df.reset_index(drop=True)
ID Date Value
0 C 2018-01-24 NaN
1 C 2018-01-25 NaN
2 C 2018-01-26 NaN
3 C 2018-01-27 NaN
4 C 2018-01-28 NaN
.. .. ... ...
150 B 2018-07-29 NaN
151 B 2018-07-30 NaN
152 B 2018-07-31 NaN
153 B 2018-08-01 NaN
154 B 2018-08-02 1.188
