Why is my second thread not executing at all? - python
I have a GUI that takes values and, when its button is pressed, receives a data block from my server, puts it into a queue, and uses threads to call two functions, recvData() and calculate_threshold(). The first function keeps receiving data blocks from the server and putting them in the queue, while the second continuously removes data blocks from the queue and performs some calculations on them before writing the results to a file. It keeps doing this as long as the queue is not empty.
Below is my client code:
import socket
import turtle
#import time
import queue
import threading
from tkinter import *

class GUI:
    entries = []

    def __init__(self, master):
        self.master = master
        master.title("Collision Detection")
        self.buff_data = queue.Queue()
        self.t1 = threading.Thread(target = self.recvData)
        self.t2 = threading.Thread(target = self.calculate_threshold)
        self.entries = []
        self.host = '127.0.0.1'
        self.port = 5000
        self.s = socket.socket()
        self.s.connect((self.host, self.port))
        self.create_GUI()

    def create_GUI(self):
        self.input_label = Label(root, text = "Input all the gratings set straight wavelength values in nm")
        self.input_label.grid(row = 0)
        self.core_string = "Core "
        self.label_col_inc = 0
        self.entry_col_inc = 1
        self.core_range = range(1, 5)
        for y in self.core_range:
            self.core_text = self.core_string + str(y) + '_' + '25'
            self.core_label = Label(root, text = self.core_text)
            self.entry = Entry(root)
            self.core_label.grid(row = 1, column = self.label_col_inc, sticky = E)
            self.entry.grid(row = 1, column = self.entry_col_inc)
            self.entries.append(self.entry)
            self.label_col_inc += 2
            self.entry_col_inc += 2
        self.threshold_label = Label(root, text = "Threshold in nm")
        self.entry_threshold = Entry(root)
        self.threshold_label.grid(row = 2, sticky = E)
        self.entry_threshold.grid(row = 2, column = 1)
        self.light_label = Label(root, text = 'Status')
        self.light_label.grid(row = 3, column = 3)
        self.canvas = Canvas(root, width = 150, height = 50)
        self.canvas.grid(row = 4, column = 3)
        # Green light
        self.green_light = turtle.RawTurtle(self.canvas)
        self.green_light.shape('circle')
        self.green_light.color('grey')
        self.green_light.penup()
        self.green_light.goto(0, 0)
        # Red light
        self.red_light = turtle.RawTurtle(self.canvas)
        self.red_light.shape('circle')
        self.red_light.color('grey')
        self.red_light.penup()
        self.red_light.goto(40, 0)
        self.data_button = Button(root, text = "Get data above threshold", command = self.getData)
        self.data_button.grid(row = 5, column = 0)
        len_message = self.s.recv(4)
        print('len_message', len_message)
        bytes_length = int(len_message.decode('utf-8'))  # for the self-made server
        recvd_data = self.s.recv(bytes_length)
        print('data', recvd_data)
        self.buff_data.put(recvd_data)
        #print('buffer', self.buff_data)
        self.t1.start()
        self.t2.start()

    def recvData(self):
        len_message = self.s.recv(4)
        print('len_message', len_message)
        while len_message:
            bytes_length = int(len_message.decode('utf-8'))  # for the self-made server
            recvd_data = self.s.recv(bytes_length)
            print('data', recvd_data)
            self.buff_data.put(recvd_data)
            len_message = self.s.recv(4)
            print('len_message', len_message)
        else:
            print('out of loop')
            self.s.close()

    def calculate_threshold(self):
        while not self.buff_data.empty:
            rmv_data = self.buff_data.get()
            stringdata = rmv_data.decode('utf-8')
            rep_str = stringdata.replace(",", ".")
            splitstr = rep_str.split()
            # received wavelength values
            inc = 34
            wav_threshold = []
            for y in self.entries:
                straight_wav = float(y.get())
                wav = float(splitstr[inc])
                wav_diff = wav - straight_wav
                if wav_diff < 0:
                    wav_diff = wav_diff * (-1)
                wav_threshold.append(wav_diff)
                inc += 56
            threshold = float(self.entry_threshold.get())
            # writing into the file
            data = []
            inc1 = 0
            col1 = 2
            col2 = 6
            data.insert(0, (str(splitstr[0])))
            data.insert(1, (str(splitstr[1])))
            for x in wav_threshold:
                if (x > threshold):
                    self.red_light.color('red')
                    self.green_light.color('grey')
                    data.insert(col1, (str(splitstr[34 + inc1])))
                    data.insert(col2, (str(x)))
                else:
                    self.red_light.color('grey')
                    self.green_light.color('green')
                    data.insert(col1, '-')
                    data.insert(col2, '-')
                inc1 += 56
                col1 += 1
                col2 += 1
            self.write_file(data)

    # function to write into the file
    def write_file(self, data):
        with open("Output.txt", "a") as text_file:
            text_file.write('\t'.join(data[0:]))
            text_file.write('\n')

if __name__ == '__main__':
    root = Tk()
    gui = GUI(root)
    root.mainloop()
My server code is:
import socket
import threading
import os
def Main():
    host = '127.0.0.1'
    port = 5000
    s = socket.socket()
    s.bind((host, port))
    s.listen(5)
    print("Server started")
    while True:
        c, addr = s.accept()
        print("Client connected ip:<" + str(addr) + ">")
        c.sendall('1685 2020/03/02 14:42:05 318301 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/03 14:42:05 318302 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/04 14:42:05 318303 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.sendall('1685 2020/03/05 14:42:05 318411 4 1 25 0 0 0 0 1513,094 1516,156 1519,154 1521,969 1525,029 1527,813 1530,921 1533,869 1536,740 1539,943 1542,921 1545,879 1548,843 1551,849 1554,760 1557,943 1560,782 1563,931 1566,786 1569,751 1572,690 1575,535 1578,638 1581,755 1584,759 41 39 33 39 48 44 49 55 61 58 64 55 68 74 68 59 57 74 61 68 58 64 54 47 46 2 25 0 0 0 0 1512,963 1515,935 1518,857 1521,849 1524,655 1527,577 1530,332 1533,233 1536,204 1539,488 1542,571 1545,725 1549,200 1552,430 1555,332 1558,484 1561,201 1564,285 1567,001 1569,870 1572,758 1575,491 1578,512 1581,547 1584,405 48 43 37 42 57 54 59 62 67 58 71 59 77 82 82 64 71 88 77 79 72 73 63 49 50 3 25 0 0 0 0 1513,394 1516,517 1519,536 1522,082 1525,428 1527,963 1531,288 1534,102 1536,659 1539,757 1542,707 1545,627 1548,389 1551,459 1554,406 1557,986 1560,667 1564,103 1567,036 1570,144 1573,189 1575,888 1579,185 1582,323 1585,338 35 36 32 37 57 58 61 64 75 73 70 62 61 62 59 51 52 64 58 62 70 70 64 54 55 4 25 0 0 0 0 1512,658 1515,752 1518,797 1521,707 1524,744 1527,627 1530,871 1534,002 1537,086 1540,320 1543,217 1546,010 1548,660 1551,385 1554,253 1557,074 1560,193 1563,116 1566,043 1568,963 1571,855 1574,957 1577,954 1581,128 1584,273 43 42 39 40 56 50 56 62 65 54 59 62 75 79 73 63 67 77 73 75 68 62 54 51 51 100 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN'.encode())
        c.close()

if __name__ == '__main__':
    Main()
But I see that my second thread is not running. Can someone point out where I am going wrong? Please help!
Tkinter is single-threaded, so threads are of little use for driving the GUI, and calculate_threshold updates the turtle lights from a worker thread, which is not safe. The more immediate problem is the loop condition: while not self.buff_data.empty: never enters the loop, because Queue.empty is a method and, without the parentheses, the bound method object is always truthy, making the condition always False. The second thread therefore returns as soon as it starts. It should at least be self.buff_data.empty().
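Note, though, that even empty() is racy: the queue can be momentarily empty while recvData is still producing. A minimal sketch of a consumer loop that blocks on the queue instead, assuming recvData is extended to put a None sentinel into the queue when the socket closes (the sentinel is an addition, not part of the original code):

def calculate_threshold(self):
    while True:
        rmv_data = self.buff_data.get()   # blocks until an item is available
        if rmv_data is None:              # sentinel pushed by recvData on shutdown
            break
        # ... decode the block, compare against the entries, and write_file(data) as before ...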
Related
Locating column values in a pandas dataframe with conditions
We have a dataframe (df_source):

     Unnamed: 0      DATETIME     DEVICE_ID COD_1           DAT_1 COD_2         DAT_2 COD_3         DAT_3 COD_4         DAT_4 COD_5         DAT_5 COD_6         DAT_6 COD_7 DAT_7
0             0  200520160941  002222111188    35  200408100500.0    12  200408100400    16  200408100300    11  200408100200    19  200408100100    35  200408100000    43
1            19  200507173541  000049000110    00  190904192701.0   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
2            20  200507173547  000049000110    00  190908185501.0    08  190908185501   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
3            21  200507173547  000049000110    00  190908205601.0    08  190908205601   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
4            22  200507173547  000049000110    00  190909005800.0    08  190909005800   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
...         ...           ...           ...   ...             ...   ...           ...   ...           ...   ...           ...   ...           ...   ...           ...   ...
159         775  200529000843  000049768051    40  200529000601.0   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
160         776  200529000843  000049015792    00  200529000701.0    33  200529000701   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
161         779  200529000843  000049180500    00  200529000601.0   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN
162         784  200529000843  000049089310    00  200529000201.0    03  200529000201    61  200529000201   NaN           NaN   NaN           NaN   NaN           NaN   NaN
163         786  200529000843  000049768051    40  200529000401.0   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN           NaN   NaN

We calculated values_cont, a dict, for a subset:

v_subset = ['COD_1', 'COD_2', 'COD_3', 'COD_4', 'COD_5', 'COD_6', 'COD_7']
values_cont = pd.value_counts(df_source[v_subset].values.ravel())

We obtained as result (values, counter):

00    134
08     37
42     12
40     12
33      3
11      3
03      2
35      2
43      2
44      1
61      1
04      1
12      1
60      1
05      1
19      1
34      1
16      1

Now, the question is: how to locate the values in columns corresponding to the counter, for instance:

df['DEVICE_ID'] # corresponding with values ('00') and counter ('134')
df['DEVICE_ID'] # corresponding with values ('08') and counter ('37')
...
df['DEVICE_ID'] # corresponding with values ('16') and counter ('1')
I believe you need DataFrame.melt with an aggregate join for the IDs and GroupBy.size for the counts. This implementation results in a dataframe with a column (value) for the CODES, all the associated DEVICE_IDs, and the count of ids associated with each code. It is an alternative to values_cont in the question.

v_subset = ['COD_1', 'COD_2', 'COD_3', 'COD_4', 'COD_5', 'COD_6', 'COD_7']

df = (df_source.melt(id_vars='DEVICE_ID', value_vars=v_subset)
               .dropna(subset=['value'])
               .groupby('value')
               .agg(DEVICE_ID=('DEVICE_ID', ','.join), count=('value', 'size'))
               .reset_index())

print(df)

   value                                          DEVICE_ID  count
0     00  000049000110,000049000110,000049000110,0000490...      7
1     03                                       000049089310      1
2     08            000049000110,000049000110,000049000110      3
3     11                                       002222111188      1
4     12                                       002222111188      1
5     16                                       002222111188      1
6     19                                       002222111188      1
7     33                                       000049015792      1
8     35                          002222111188,002222111188      2
9     40                          000049768051,000049768051      2
10    43                                       002222111188      1
11    61                                       000049089310      1

# print DEVICE_ID for CODES == '03'
print(df.DEVICE_ID[df.value == '03'])

[out]:
1    000049089310
Name: DEVICE_ID, dtype: object

Given the question as related to df_source, to select specific parts of the dataframe, use pandas Boolean Indexing:

# to return all rows where COD_1 is '00'
df_source[df_source.COD_1 == '00']

# to return only the DEVICE_ID column where COD_1 is '00'
df_source['DEVICE_ID'][df_source.COD_1 == '00']
You can use boolean indexing with df.loc to search out rows that match based on columns (note that .iloc selects by position and does not take a boolean mask, and that the codes are strings, so compare against '00' rather than the integer 00). Then from those rows you can select the column of interest and output it. There may be a more pythonic way to do this.

df2 = df.loc[df['COD_1'] == '00']
df3 = df2.loc[df2['DAT_1'] == 134]
df_out = df3['DEVICE_ID']

Here's more info on indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html
How can I read each row till the first occurrence of NaN?
From an excel file I want to read each row and use it independently for processing. Here is how the data looks in the excel file:

12 32 45 67 89 54 23 56 78 98
34 76 34 89 34 3
76 34 54 12 43 78 56
76 56 45 23 43 45 67 76 67 8
87 9 9 0 89 90 6 89
23 90 90 32 23 34 56 9 56 87
23 56 34 3 5 8 7 6 98
32 23 34 6 65 78 67 87 89 87
12 23 34 32 43 67 45
343 76 56 7 8 9 4

But when I read it through pandas, the remaining columns are filled with NaN. The data after reading with pandas looks like:

0   12  32  45  67  89  54   23.0   56.0   78.0   98.0
1   34  76  34  89  34   3    NaN    NaN    NaN    NaN
2   76  34  54  12  43  78   56.0    NaN    NaN    NaN
3   76  56  45  23  43  45   67.0   76.0   67.0    8.0
4   87   9   9   0  89  90    6.0   89.0    NaN    NaN
5   23  90  90  32  23  34   56.0    9.0   56.0   87.0
6   23  56  34   3   5   8    7.0    6.0   98.0    NaN
7   32  23  34   6  65  78   67.0   87.0   89.0   87.0
8   12  23  34  32  43  67   45.0    NaN    NaN    NaN
9  343  76  56   7   8   9    4.0    5.0    8.0   68.0

Here it can be seen that the remaining columns of each row are filled with NaN, which I don't want; nor do I want to replace them with some other value or drop the whole rows containing NaN. How can I read the columns of each row till the first occurrence of NaN? For example, the second row in pandas is

34 76 34 89 34 3 NaN NaN NaN NaN

so my desired output is that it reads only

34 76 34 89 34 3

My preference is pandas, but if that is not possible, is there any other way of doing it, like with some other library? Any resource or reference will be helpful. Thanks!
When calling the pd.read_excel function, try setting keep_default_na=False. This avoids the default NaN conversion while reading, so empty cells come through as empty strings.
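A minimal sketch of how that can be combined with cutting each row at the first empty cell (the file name and the header-less layout are assumptions, not from the question):

import pandas as pd
from itertools import takewhile

# With keep_default_na=False, empty cells are read as '' instead of NaN
df = pd.read_excel("data.xlsx", header=None, keep_default_na=False)

# For each row, keep the values only up to the first empty cell
rows = [list(takewhile(lambda v: v != "", row))
        for row in df.itertuples(index=False)]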
replace works on str but does not work on object dtype
ab = '1 234'
ab = ab.replace(" ", "")
ab
'1234'

It's easy to use replace() to get rid of the whitespace in a plain string, but when I have a column of a pandas dataframe:

gbpusd['Profit'] = gbpusd['Profit'].replace(" ", "")
gbpusd['Profit'].head()

3     7 000.00
4     6 552.00
11    4 680.00
14    3 250.00
24    1 700.00
Name: Profit, dtype: object

it didn't work. I googled many times but found no solutions. Since the whitespace is still there, I cannot do further analysis, like sum():

gbpusd['Profit'].sum()
TypeError: can only concatenate str (not "int") to str

The thing is harder than I thought: the raw data is

gbpusd.head()

    Ticket      Open Time            Type  Volume  Item    Price    S / L  T / P  Close Time           Price.1  Commission  Taxes  Swap  Profit
84  50204109.0  2019.10.24 09:56:32  buy   0.5     gbpusd  1.29148  0.0    0.0    2019.10.24 09:57:48  1.29179  0           0.0    0.0   15.5
85  50205025.0  2019.10.24 10:10:13  buy   0.5     gbpusd  1.29328  0.0    0.0    2019.10.24 15:57:02  1.29181  0           0.0    0.0   -73.5
86  50207371.0  2019.10.24 10:34:10  buy   0.5     gbpusd  1.29236  0.0    0.0    2019.10.24 15:57:18  1.29197  0           0.0    0.0   -19.5
87  50207747.0  2019.10.24 10:40:32  buy   0.5     gbpusd  1.29151  0.0    0.0    2019.10.24 15:57:24  1.29223  0           0.0    0.0   36
88  50212252.0  2019.10.24 11:47:14  buy   1.5     gbpusd  1.28894  0.0    0.0    2019.10.24 15:57:12  1.29181  0           0.0    0.0   430.5

When I did

gbpusd['Profit'] = gbpusd['Profit'].str.replace(" ", "")
gbpusd['Profit']

84     NaN
85     NaN
86     NaN
87     NaN
88     NaN
89     NaN
90     NaN
91     NaN
92     NaN
93     NaN
94     NaN
95     NaN
96     NaN
97     NaN
98     NaN
99     NaN
100    NaN
101    NaN
102    NaN
103    NaN
104    NaN
105    NaN
106    NaN
107    NaN
108    NaN
109    NaN
110    NaN
111    NaN
112    NaN
113    NaN
...
117    4680.00
118    NaN
119    NaN
120    NaN
121    NaN
122    NaN
123    NaN
124    NaN
125    NaN
126    NaN
127    NaN
128    NaN
129    NaN
130    -2279.00
131    -2217.00
132    -2037.00
133    -5379.00
134    -1620.00
135    -7154.00
136    -4160.00
137    1144.00
138    NaN
139    NaN
140    NaN
141    -1920.00
142    7000.00
143    3250.00
144    NaN
145    1700.00
146    NaN
Name: Profit, Length: 63, dtype: object

The whitespace is replaced, but some data which had no space is NaN now... someone may have the same problem...
You also need to use the .str accessor:

gbpusdprofit = gbpusd['Profit'].str.replace(" ", "")

Output:

0    7000.00
1    6552.00
2    4680.00
3    3250.00
4    1700.00
Name: Profit, dtype: object

And for the sum:

gbpusd['Profit'].str.replace(" ", "").astype('float').sum()

Result:

23182.0
You can strip the spaces, convert to float, and sum in a one-liner:

gbpusd['Profit'].str.replace(" ", "").astype(float).sum()
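Neither snippet explains the NaNs in the question, though: .str.replace returns NaN for any element that is not a string, and an object column like this one typically holds a mix of real numbers and strings. A sketch of a more defensive version, assuming every entry is either a number or a numeric string with spaces as thousands separators:

# Cast everything to str first so .str.replace applies to every element,
# then convert to float for arithmetic
profit = gbpusd['Profit'].astype(str).str.replace(" ", "").astype(float)
print(profit.sum())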
Pandas groupby, resample, calculate pct_change and then store the result back into the original-frequency dataframe
I have a dataframe of daily stock data, which is indexed by a datetimeindex. There are multiple stock entries, thus there are duplicate datetimeindex values. I am looking for a way to:

1. Group the dataframe by the stock symbol
2. Resample the prices for each symbol group into monthly price frequency data
3. Perform a pct_change calculation on each symbol group's monthly price
4. Store it as a new column 'monthly_return' in the original dataframe.

I have been able to manage the first three operations. Storing the result in the original dataframe is where I'm having some trouble. To illustrate this, I created a toy dataset which includes a 'dummy' index (idx) column which I use to assist creation of the desired output later on in the third code block.

import random
import pandas as pd
import numpy as np

datelist = pd.date_range(pd.datetime(2018, 1, 1), periods=PER).to_pydatetime().tolist() * 2
ids = [random.choice(['A', 'B']) for i in range(len(datelist))]
prices = random.sample(range(200), len(datelist))
idx = range(len(datelist))

df1 = pd.DataFrame(data=zip(idx, ids, prices), index=datelist, columns='idx label prices'.split())
print(df1.head(10))

df1

            idx label  prices
2018-01-01    0     B      40
2018-01-02    1     A     190
2018-01-03    2     A     159
2018-01-04    3     A      25
2018-01-05    4     A      89
2018-01-06    5     B     164
...
2018-01-31   30     A     102
2018-02-01   31     A     117
2018-02-02   32     A     120
2018-02-03   33     B      75
2018-02-04   34     B     170
...

Desired Output:

            idx label  prices  monthly_return
2018-01-01    0     B      40        0.000000
2018-01-02    1     A     190        0.000000
2018-01-03    2     A     159        0.000000
2018-01-04    3     A      25        0.000000
2018-01-05    4     A      89        0.000000
2018-01-06    5     B     164        0.000000
...
2018-01-31   30     A     102       -0.098039
2018-02-01   31     A     117        0.000000
2018-02-02   32     A     120        0.000000
...
2018-02-26   56     B     152        0.000000
2018-02-27   57     B       2        0.000000
2018-02-28   58     B      49       -0.040816
2018-03-01   59     B     188        0.000000
...
2018-01-28   89     A      88        0.000000
2018-01-29   90     A      26        0.000000
2018-01-30   91     B     128        0.000000
2018-01-31   92     A     144       -0.098039
...
2018-02-26  118     A      92        0.000000
2018-02-27  119     B     111        0.000000
2018-02-28  120     B      34       -0.040816
...

What I have tried so far is:

dfX = df1.copy(deep=True)
dfX = df1.groupby('label').resample('M')['prices'].last().pct_change(1).shift(-1)
print(dfX)

Which outputs:

label
A  2018-01-31   -0.067961
   2018-02-28   -0.364583
   2018-03-31    0.081967
B  2018-01-31    1.636364
   2018-02-28   -0.557471
   2018-03-31         NaN

This is quite close to what I would like to do; however, I am only getting pct_change data back on end-of-month dates, which is annoying to store back in the original dataframe (df1) as a new column. Something like this doesn't work:

dfX = df1.copy(deep=True)
dfX['monthly_return'] = df1.groupby('label').resample('M')['prices'].last().pct_change(1).shift(-1)

as it yields the error:

TypeError: incompatible index of inserted column with frame index

I have considered 'upsampling' the monthly_return data back into a daily series; however, this could likely end up causing the same error mentioned above, since the original dataset could be missing dates (such as weekends). Additionally, resetting the index to clear this error would still create problems, as the grouped dfX does not have the same number of rows/frequency as the original df1, which is of daily frequency. I have a hunch that this can be done by using multi-indexing and dataframe merging; however, I am unsure how to go about doing so.
This generates my desired output, but it isn't as clean a solution as I was hoping for. df1 is generated the same as before (code given in question):

            idx label  prices
2018-01-01    0     A     145
2018-01-02    1     B      86
2018-01-03    2     B     141
...
2018-01-25   86     B      12
2018-01-26   87     B      71
2018-01-27   88     B     186
2018-01-28   89     B     151
2018-01-29   90     A     161
2018-01-30   91     B     143
2018-01-31   92     B      88
...

Then:

def fun(x):
    dates = x.date
    x = x.set_index('date', drop=True)
    x['monthly_return'] = x.resample('M').last()['prices'].pct_change(1).shift(-1)
    x = x.reindex(dates)
    return x

dfX = df1.copy(deep=True)
dfX.reset_index(inplace=True)
dfX.columns = 'date idx label prices'.split()
dfX = dfX.groupby('label').apply(fun).droplevel(level='label')
print(dfX)

Which outputs the desired result (unsorted):

            idx label  prices  monthly_return
date
2018-01-01    0     A     145             NaN
2018-01-06    5     A      77             NaN
2018-01-08    7     A      48             NaN
2018-01-09    8     A      31             NaN
2018-01-11   10     A      20             NaN
2018-01-12   11     A      27             NaN
2018-01-14   13     A     109             NaN
2018-01-15   14     A     166             NaN
2018-01-17   16     A     130             NaN
2018-01-18   17     A     139             NaN
2018-01-19   18     A     191             NaN
2018-01-21   20     A     164             NaN
2018-01-22   21     A     112             NaN
2018-01-23   22     A     167             NaN
2018-01-25   24     A     140             NaN
2018-01-26   25     A      42             NaN
2018-01-30   29     A     107             NaN
2018-02-04   34     A       9             NaN
2018-02-07   37     A      84             NaN
2018-02-08   38     A      23             NaN
2018-02-10   40     A      30             NaN
2018-02-12   42     A      89             NaN
2018-02-15   45     A      79             NaN
2018-02-16   46     A     115             NaN
2018-02-19   49     A     197             NaN
2018-02-21   51     A      11             NaN
2018-02-26   56     A     111             NaN
2018-02-27   57     A     126             NaN
2018-03-01   59     A     135             NaN
2018-03-03   61     A      28             NaN
2018-01-01   62     A     120             NaN
2018-01-03   64     A     170             NaN
2018-01-05   66     A      45             NaN
2018-01-07   68     A     173             NaN
2018-01-08   69     A     158             NaN
2018-01-09   70     A      63             NaN
2018-01-11   72     A      62             NaN
2018-01-12   73     A     168             NaN
2018-01-14   75     A     169             NaN
2018-01-15   76     A     142             NaN
2018-01-17   78     A      83             NaN
2018-01-18   79     A      96             NaN
2018-01-21   82     A      25             NaN
2018-01-22   83     A      90             NaN
2018-01-23   84     A      59             NaN
2018-01-29   90     A     161             NaN
2018-02-01   93     A     150             NaN
2018-02-04   96     A      85             NaN
2018-02-06   98     A     124             NaN
2018-02-14  106     A     195             NaN
2018-02-16  108     A     136             NaN
2018-02-17  109     A     134             NaN
2018-02-18  110     A     183             NaN
2018-02-19  111     A      32             NaN
2018-02-24  116     A     102             NaN
2018-02-25  117     A      72             NaN
2018-02-27  119     A      38             NaN
2018-03-02  122     A     137             NaN
2018-03-03  123     A     171             NaN
2018-01-02    1     B      86             NaN
2018-01-03    2     B     141             NaN
2018-01-04    3     B     189             NaN
2018-01-05    4     B      60             NaN
2018-01-07    6     B       1             NaN
2018-01-10    9     B      87             NaN
2018-01-13   12     B      44             NaN
2018-01-16   15     B     147             NaN
2018-01-20   19     B      92             NaN
2018-01-24   23     B      81             NaN
2018-01-27   26     B     190             NaN
2018-01-28   27     B      24             NaN
2018-01-29   28     B     116             NaN
2018-01-31   30     B      98        1.181818
2018-02-01   31     B     121             NaN
2018-02-02   32     B     110             NaN
2018-02-03   33     B      66             NaN
2018-02-05   35     B       4             NaN
2018-02-06   36     B      13             NaN
2018-02-09   39     B     114             NaN
2018-02-11   41     B      16             NaN
2018-02-13   43     B     174             NaN
2018-02-14   44     B      78             NaN
2018-02-17   47     B     144             NaN
2018-02-18   48     B      14             NaN
2018-02-20   50     B     133             NaN
2018-02-22   52     B     156             NaN
2018-02-23   53     B     159             NaN
2018-02-24   54     B     177             NaN
2018-02-25   55     B      43             NaN
2018-02-28   58     B      19       -0.338542
2018-03-02   60     B     127             NaN
2018-01-02   63     B       2             NaN
2018-01-04   65     B      97             NaN
2018-01-06   67     B       8             NaN
2018-01-10   71     B      54             NaN
2018-01-13   74     B     106             NaN
2018-01-16   77     B      74             NaN
2018-01-19   80     B     188             NaN
2018-01-20   81     B     172             NaN
2018-01-24   85     B      51             NaN
2018-01-25   86     B      12             NaN
2018-01-26   87     B      71             NaN
2018-01-27   88     B     186             NaN
2018-01-28   89     B     151             NaN
2018-01-30   91     B     143             NaN
2018-01-31   92     B      88        1.181818
2018-02-02   94     B      75             NaN
2018-02-03   95     B     103             NaN
2018-02-05   97     B      82             NaN
2018-02-07   99     B     128             NaN
2018-02-08  100     B     123             NaN
2018-02-09  101     B      52             NaN
2018-02-10  102     B      18             NaN
2018-02-11  103     B      21             NaN
2018-02-12  104     B      50             NaN
2018-02-13  105     B      64             NaN
2018-02-15  107     B     185             NaN
2018-02-20  112     B     125             NaN
2018-02-21  113     B     108             NaN
2018-02-22  114     B     132             NaN
2018-02-23  115     B     180             NaN
2018-02-26  118     B      67             NaN
2018-02-28  120     B     192       -0.338542
2018-03-01  121     B      58             NaN

Perhaps there is a more concise and pythonic way of doing this.
Python: Expand a dataframe row-wise based on datetime
I have a dataframe like this:

    ID       Date  Value
783  C 2018-02-23  0.704
580  B 2018-08-04 -1.189
221  A 2018-08-10 -0.788
228  A 2018-08-17  0.038
578  B 2018-08-02  1.188

What I want is to expand the dataframe based on the Date column back to 1 month earlier, filling ID with the same person and filling Value with nan until the last observation. The expected result is similar to this:

    ID        Date  Value
0    C  2018/01/24    nan
1    C  2018/01/25    nan
2    C  2018/01/26    nan
3    C  2018/01/27    nan
4    C  2018/01/28    nan
5    C  2018/01/29    nan
6    C  2018/01/30    nan
7    C  2018/01/31    nan
8    C  2018/02/01    nan
9    C  2018/02/02    nan
10   C  2018/02/03    nan
11   C  2018/02/04    nan
12   C  2018/02/05    nan
13   C  2018/02/06    nan
14   C  2018/02/07    nan
15   C  2018/02/08    nan
16   C  2018/02/09    nan
17   C  2018/02/10    nan
18   C  2018/02/11    nan
19   C  2018/02/12    nan
20   C  2018/02/13    nan
21   C  2018/02/14    nan
22   C  2018/02/15    nan
23   C  2018/02/16    nan
24   C  2018/02/17    nan
25   C  2018/02/18    nan
26   C  2018/02/19    nan
27   C  2018/02/20    nan
28   C  2018/02/21    nan
29   C  2018/02/22    nan
30   C  2018/02/23  1.093
31   B  2018/07/05    nan
32   B  2018/07/06    nan
33   B  2018/07/07    nan
34   B  2018/07/08    nan
35   B  2018/07/09    nan
36   B  2018/07/10    nan
37   B  2018/07/11    nan
38   B  2018/07/12    nan
39   B  2018/07/13    nan
40   B  2018/07/14    nan
41   B  2018/07/15    nan
42   B  2018/07/16    nan
43   B  2018/07/17    nan
44   B  2018/07/18    nan
45   B  2018/07/19    nan
46   B  2018/07/20    nan
47   B  2018/07/21    nan
48   B  2018/07/22    nan
49   B  2018/07/23    nan
50   B  2018/07/24    nan
51   B  2018/07/25    nan
52   B  2018/07/26    nan
53   B  2018/07/27    nan
54   B  2018/07/28    nan
55   B  2018/07/29    nan
56   B  2018/07/30    nan
57   B  2018/07/31    nan
58   B  2018/08/01    nan
59   B  2018/08/02    nan
60   B  2018/08/03    nan
61   B  2018/08/04  0.764
62   A  2018/07/11    nan
63   A  2018/07/12    nan
64   A  2018/07/13    nan
65   A  2018/07/14    nan
66   A  2018/07/15    nan
67   A  2018/07/16    nan
68   A  2018/07/17    nan
69   A  2018/07/18    nan
70   A  2018/07/19    nan
71   A  2018/07/20    nan
72   A  2018/07/21    nan
73   A  2018/07/22    nan
74   A  2018/07/23    nan
75   A  2018/07/24    nan
76   A  2018/07/25    nan
77   A  2018/07/26    nan
78   A  2018/07/27    nan
79   A  2018/07/28    nan
80   A  2018/07/29    nan
81   A  2018/07/30    nan
82   A  2018/07/31    nan
83   A  2018/08/01    nan
84   A  2018/08/02    nan
85   A  2018/08/03    nan
86   A  2018/08/04    nan
87   A  2018/08/05    nan
88   A  2018/08/06    nan
89   A  2018/08/07    nan
90   A  2018/08/08    nan
91   A  2018/08/09    nan
92   A  2018/08/10  2.144
93   A  2018/07/18    nan
94   A  2018/07/19    nan
95   A  2018/07/20    nan
96   A  2018/07/21    nan
97   A  2018/07/22    nan
98   A  2018/07/23    nan
99   A  2018/07/24    nan
100  A  2018/07/25    nan
101  A  2018/07/26    nan
102  A  2018/07/27    nan
103  A  2018/07/28    nan
104  A  2018/07/29    nan
105  A  2018/07/30    nan
106  A  2018/07/31    nan
107  A  2018/08/01    nan
108  A  2018/08/02    nan
109  A  2018/08/03    nan
110  A  2018/08/04    nan
111  A  2018/08/05    nan
112  A  2018/08/06    nan
113  A  2018/08/07    nan
114  A  2018/08/08    nan
115  A  2018/08/09    nan
116  A  2018/08/10    nan
117  A  2018/08/11    nan
118  A  2018/08/12    nan
119  A  2018/08/13    nan
120  A  2018/08/14    nan
121  A  2018/08/15    nan
122  A  2018/08/16    nan
123  A  2018/08/17  0.644
124  B  2018/07/03    nan
125  B  2018/07/04    nan
126  B  2018/07/05    nan
127  B  2018/07/06    nan
128  B  2018/07/07    nan
129  B  2018/07/08    nan
130  B  2018/07/09    nan
131  B  2018/07/10    nan
132  B  2018/07/11    nan
133  B  2018/07/12    nan
134  B  2018/07/13    nan
135  B  2018/07/14    nan
136  B  2018/07/15    nan
137  B  2018/07/16    nan
138  B  2018/07/17    nan
139  B  2018/07/18    nan
140  B  2018/07/19    nan
141  B  2018/07/20    nan
142  B  2018/07/21    nan
143  B  2018/07/22    nan
144  B  2018/07/23    nan
145  B  2018/07/24    nan
146  B  2018/07/25    nan
147  B  2018/07/26    nan
148  B  2018/07/27    nan
149  B  2018/07/28    nan
150  B  2018/07/29    nan
151  B  2018/07/30    nan
152  B  2018/07/31    nan
153  B  2018/08/01    nan
154  B  2018/08/02 -0.767

The source data can be created as below:

import pandas as pd
from itertools import chain
import numpy as np

df_1 = pd.DataFrame({
    'ID' : list(chain.from_iterable([['A'] * 365, ['B'] * 365, ['C'] * 365])),
    'Date' : pd.date_range(start = '2018-01-01', end = '2018-12-31').tolist() +
             pd.date_range(start = '2018-01-01', end = '2018-12-31').tolist() +
             pd.date_range(start = '2018-01-01', end = '2018-12-31').tolist(),
    'Value' : np.random.randn(365 * 3)
})
df_1 = df_1.sample(5, random_state = 123)

Thanks for the advice!
You can create another DataFrame with the previous months, then join them together by concat and create a DatetimeIndex, so it is possible to use groupby with resample by D (days) to add all the values in between:

df_2 = df_1.assign(Date = df_1['Date'] - pd.DateOffset(months=1) + pd.DateOffset(days=1),
                   Value = np.nan)

df = (pd.concat([df_2, df_1], sort=False)
        .reset_index()
        .set_index('Date')
        .groupby('index', sort=False)
        .resample('D')
        .ffill()
        .reset_index(level=1)
        .drop('index', 1)
        .rename_axis(None))

print(df)

          Date ID     Value
783 2018-01-24  C       NaN
783 2018-01-25  C       NaN
783 2018-01-26  C       NaN
783 2018-01-27  C       NaN
783 2018-01-28  C       NaN
..         ... ..       ...
578 2018-07-29  B       NaN
578 2018-07-30  B       NaN
578 2018-07-31  B       NaN
578 2018-08-01  B       NaN
578 2018-08-02  B  0.562684

[155 rows x 3 columns]

Another solution uses a list comprehension with concat, but it then needs back filling of the index and ID columns; this works if there are no missing values in the original ID column:

offset = pd.DateOffset(months=1) + pd.DateOffset(days=1)
df = pd.concat([df_1.iloc[[i]].reset_index().set_index('Date').reindex(pd.date_range(d - offset, d))
                for i, d in enumerate(df_1['Date'])], sort=False)

df = (df.assign(index = df['index'].bfill().astype(int), ID = df['ID'].bfill())
        .rename_axis('Date')
        .reset_index()
        .set_index('index')
        .rename_axis(None))

print(df)

          Date ID     Value
783 2018-01-24  C       NaN
783 2018-01-25  C       NaN
783 2018-01-26  C       NaN
783 2018-01-27  C       NaN
783 2018-01-28  C       NaN
..         ... ..       ...
578 2018-07-29  B       NaN
578 2018-07-30  B       NaN
578 2018-07-31  B       NaN
578 2018-08-01  B       NaN
578 2018-08-02  B  1.224345

[155 rows x 3 columns]
We can create a date range in the "Date" column, then explode it. Then group the "Value" column by the index and set all values to nan but the last. Finally, reset the index.

def drange(t):
    return pd.date_range(t - pd.DateOffset(months=1) + pd.DateOffset(days=1),
                         t, freq="D", normalize=True)

df["Date"] = df["Date"].transform(drange)

    ID                                               Date  Value
783  C  DatetimeIndex(['2018-01-24', '2018-01-25', '20...  0.704
580  B  DatetimeIndex(['2018-07-05', '2018-07-06', '20... -1.189
221  A  DatetimeIndex(['2018-07-11', '2018-07-12', '20... -0.788
228  A  DatetimeIndex(['2018-07-18', '2018-07-19', '20...  0.038
578  B  DatetimeIndex(['2018-07-03', '2018-07-04', '20...  1.188

df = df.reset_index(drop=True).explode(column="Date")

   ID       Date  Value
0   C 2018-01-24  0.704
0   C 2018-01-25  0.704
0   C 2018-01-26  0.704
0   C 2018-01-27  0.704
0   C 2018-01-28  0.704
.. ..        ...    ...
4   B 2018-07-29  1.188
4   B 2018-07-30  1.188
4   B 2018-07-31  1.188
4   B 2018-08-01  1.188
4   B 2018-08-02  1.188

df["Value"] = df.groupby(level=0)["Value"].transform(lambda v: [np.nan] * (len(v) - 1) + [v.iloc[0]])
df = df.reset_index(drop=True)

     ID       Date  Value
0     C 2018-01-24    NaN
1     C 2018-01-25    NaN
2     C 2018-01-26    NaN
3     C 2018-01-27    NaN
4     C 2018-01-28    NaN
..   ..        ...    ...
150   B 2018-07-29    NaN
151   B 2018-07-30    NaN
152   B 2018-07-31    NaN
153   B 2018-08-01    NaN
154   B 2018-08-02  1.188