How to extract unique rows depending on a rule? - python

I have a dataframe like this:
id_1, date_1, id_2, date_2
I need the rows where (date_1 + 15 days) < date_2, and where this rule matches I need only the first occurrence. Just using a boolean mask does not solve the problem, so I think maybe I need to use some kind of for index, row in df.iterrows(): and create a new dataframe.

import pandas as pd
from datetime import timedelta

df = pd.DataFrame(data=dd)
df[['date_1', 'date_2']] = df[['date_1', 'date_2']].apply(pd.to_datetime)
df['date_1_15'] = df['date_1'] + timedelta(15)

def apply_mask(row):
    if row['date_1_15'] < row['date_2']:
        row['mask'] = True
    else:
        row['mask'] = False
    return row

df = df.apply(lambda row: apply_mask(row), axis=1)
dx = df.loc[df['mask'] == True]
dx = dx.groupby(['date_1']).first()
dx['mask_first'] = True
dx = dx.reset_index()
dx = dx[['date_1', 'date_2', 'mask_first']]
df = pd.merge(df, dx, on=['date_1', 'date_2'], how='outer')

import pandas as pd
from datetime import timedelta

df = pd.DataFrame(data={'id_1': [1, 2, 3, 4],
                        'date1': ['2018-01-10', '2018-02-05', '2018-02-20', '2018-02-21'],
                        'date2': ['2018-01-11', '2018-02-15', '2018-02-27', '2018-02-22']})
df[['date1', 'date2']] = df[['date1', 'date2']].apply(pd.to_datetime)
df['date1_15'] = df['date1'] + timedelta(15)
df = df.loc[df['date1_15'] < df['date2']].head(1)
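If "first occurrence" is meant per group (the original attempt groups on date_1) rather than for the whole frame, a groupby variant along these lines should work; this is a sketch against the unfiltered example frame above, using the answer's column names:
# keep only the rows that satisfy the rule, then take the first match per date1
matches = df.loc[df['date1'] + pd.Timedelta(days=15) < df['date2']]
first_per_group = matches.sort_values('date2').groupby('date1', as_index=False).first()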

Related

Have two dataframes with datetimes, how do I get the count of rows in the last 7 days?

import pandas as pd
l1 = ["2021-11-15","2021-11-13","2021-11-10","2021-05-28","2021-06-02","2021-06-02","2021-11-02"]
l2 = ["2021-11-11","2021-03-02","2021-11-05","2021-05-20","2021-05-01","2021-06-01","2021-04-08"]
# convert to datetime
l1 = pd.to_datetime(l1)
l2 = pd.to_datetime(l2)
# put in dataframes
df1 = pd.DataFrame(l1)
df2 = pd.DataFrame(l2)
df1.columns = ['0']
df2.columns = ['0']
df1 = df1.set_index('0')
df2 = df2.set_index('0')
# sort ascending
df1 = df1.sort_index()
df2 = df2.sort_index()
How can I get a COUNT from each dataframe based on the number of rows that are within the last 7 days?
You can slice between two timestamps and then get the number of rows with .shape[0]:
def get_count_last_7_days(df):
    stop = df.index.max()
    start = stop - pd.Timedelta('7D')
    return df.loc[start:stop].shape[0]
count1 = get_count_last_7_days(df1)
count2 = get_count_last_7_days(df2)
import pandas as pd
import numpy as np
from datetime import date, timedelta
x = (date.today() - timedelta(days=100))
y = (date.today() - timedelta(days=7))
z = date.today()
dates = pd.date_range(x, periods=100)
d = np.arange(1, 101)
df = pd.DataFrame(data=d, index=pd.DatetimeIndex(dates))
df = df.sort_index()
last_seven_days = df.loc[y:z]
print(last_seven_days.count())
Your "last 7 days" is ambiguous; I assume it is calculated from the current time:
from datetime import datetime, timedelta

today = datetime.today()
week_ago = today - timedelta(days=7)
Since you already set the date as the index, you can use .loc directly, but you can also use a mask:
df = df1.loc[week_ago:today]
# or
df = df1[(df1.index > week_ago) & (df1.index < today)]
To get the row count, you can use the shape accessor or sum the boolean mask:
count = df1.loc[week_ago:today].shape[0]
# or
count = sum((df1.index > week_ago) & (df1.index < today))
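For completeness, a small sketch (assuming "last 7 days" means relative to now, and the same df1/df2 built above) that works directly on the DatetimeIndex of both frames:
import pandas as pd

def count_last_7_days(df, now=None):
    # count rows whose index falls in the 7 days up to `now`
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    mask = (df.index > now - pd.Timedelta(days=7)) & (df.index <= now)
    return int(mask.sum())

counts = {name: count_last_7_days(d) for name, d in {'df1': df1, 'df2': df2}.items()}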

Pandas Dataframe masking issues: referring to previous rows and selecting values

I am new to pandas, and I'm trying to use vectorisation instead of iterating over a DataFrame. I am not able to get the results I want; I need help with the more complicated masking and selection statements.
This is my code:
import random
from datetime import datetime, timedelta
import pandas as pd

dates = []
temp = []
press = []
vel = []
fmt = '%Y-%m-%d %H:%M:%S'
stime = datetime.strptime('2020-01-06 10:28:16', fmt)
etime = datetime.strptime('2020-04-10 03:43:12', fmt)
td = etime - stime
l = set([random.random() for x in range(0, 1000)])
dates = [((td * x) + stime) for x in random.sample(list(l), 100)]
for i in range(100):
    press.append(random.uniform(14, 95.5))
    temp.append(random.uniform(-15, 45))
    vel.append(random.uniform(50, 153))
measurements = {
    'date': dates,
    'pressure': press,
    'velocity': vel,
    'temperature': temp
}
df = pd.DataFrame(measurements)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df = df.sort_index()

df2 = pd.DataFrame()
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df.shift(1)['temperature'] > 0
df2['temp_inc'] = df2['temp_inc'].replace({True: 1, False: 0})
# need to fetch velocity where pressure has increased from previous row, else 0
press_up_mask = df.where((df['pressure'] - df.shift(1)['pressure']) > 0)
#df2['press_spike_velocity'] = df[press_up_mask]['velocity']
# Need to perform calc based on 'temp_inc' column: if 'temp_inc' column is 1: calculate pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
df2['boyle_fact'] = df[temp_inc_mask]['pressure'] * df[temp_inc_mask]['velocity']
# Get some stats
df2['short_max_temp'] = df['temperature'].rolling(3).max()
df2['long_min_pressure'] = df['pressure'].rolling(30).min()
print(df.head())
print(df2.head())
How do I correctly calculate the columns 'press_spike_velocity' and 'boyle_fact'?
Starting from the computations:
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df['temperature'].shift(1) > 0
# cast to int type instead of replace
df2['temp_inc'] = df2['temp_inc'].astype(int)
# need to fetch velocity where pressure has increased from previous row, else 0
# build a boolean mask (not df.where()) so it can be inverted with ~
press_up_mask = (df['pressure'] - df['pressure'].shift(1)) > 0
# set column to velocity, then zero out the rows where pressure did not increase
df2['press_spike_velocity'] = df['velocity'].copy()
df2.loc[~press_up_mask, 'press_spike_velocity'] = 0
# if 'temp_inc' is 1: pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
# same masking approach as above
df2['boyle_fact'] = df['pressure'] * df['velocity']
df2.loc[~temp_inc_mask, 'boyle_fact'] = 0
This is the simplest way to solve the problem with minimal changes to the code itself. If you dig into pandas further you can probably do this in one or two fewer lines via in-place operations, but I'm not sure how much performance or readability you would gain from that.
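A slightly more compact sketch of the same two columns, using numpy.where (assuming the df and df2 frames defined above):
import numpy as np

# velocity where pressure rose versus the previous row, else 0
df2['press_spike_velocity'] = np.where(df['pressure'].diff() > 0, df['velocity'], 0)
# pressure * velocity where temperature rose versus the previous row, else 0
df2['boyle_fact'] = np.where(df['temperature'].diff() > 0, df['pressure'] * df['velocity'], 0)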

Simulating autofit in Python using xlsxwriter

I tried to follow what was suggested here, but it seems that only a few columns get affected when I try either of the following functions:
def get_col_widths(dataframe):
    # First we find the maximum length of the index column
    idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
    # Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
    return [idx_max] + [max([len(str(s)) for s in dataframe[col].values] + [len(col)]) for col in dataframe.columns]

for i, width in enumerate(get_col_widths(df)):
    worksheet.set_column(i, i, width)

def set_column_width(df):
    length_list = [len(x) for x in df.columns]
    for i, width in enumerate(length_list):
        worksheet.set_column(i, i, width)
I may not be using the functions above correctly. Thus here is my entire code:
import openpyxl
import sql_utils as sql
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from pandas.tseries.holiday import USFederalHolidayCalendar
import pyodbc
import pandas as pd
import datetime
import numpy as np
import xlsxwriter
from openpyxl import load_workbook
import sys
import win32com.client as win32
from win32com.client import Dispatch

cal = USFederalHolidayCalendar()
# pylint: disable=no-member
cnxn = pyodbc.connect(sql.connection_string)

# Set dates for the queries
cutoff_date = datetime.date(2019, 6, 30)
output_file = ".\\pa-jpm-reporting\\output\\jpm_morgans_report.xlsx"
if len(sys.argv) > 1:
    cutoff_date = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
    output_file = sys.argv[2]
writer = pd.ExcelWriter(output_file)

def get_first_business_day_of_month(start_date, end_date):
    return [
        get_business_day(d).date()
        for d in pd.date_range(start_date, end_date, freq="BMS")
    ]

def get_business_day(date):
    while date.isoweekday() > 5 or date in cal.holidays():
        date += timedelta(days=1)
    return date

# Get as_of_date and archive_date
archive_date = get_first_business_day_of_month(cutoff_date, cutoff_date + relativedelta(days=+10))[0]
as_of_date = datetime.date(archive_date.year, 1, 1)

# Pull data
def get_sql(cutoff_date, archive_date, as_of_date):
    return sql.jpm_rate_query.format(cutoff_date, archive_date, as_of_date)

def get_sql2(cutoff_date, archive_date, as_of_date):
    return sql.jpm_query.format(cutoff_date, archive_date, as_of_date)

# Get data into a pandas dataframe
def get_dataframe(cutoff_date, archive_date, as_of_date):
    cnxn.execute(get_sql(cutoff_date, archive_date, as_of_date))
    data = pd.read_sql(get_sql2(cutoff_date, archive_date, as_of_date), cnxn)
    return data

df = get_dataframe(cutoff_date, archive_date, as_of_date)
df['ModEffectiveDate'] = pd.to_datetime(df['ModEffectiveDate'])
df['StepDate1'] = pd.to_datetime(df['StepDate1'])

# Fix step date
def date_check(date1, date2):
    if date1 > date2:
        return 'X'
    else:
        return ' '

df['Step date prior to mod'] = df.apply(lambda x: date_check(x['ModEffectiveDate'], x['StepDate1']), axis=1)

# Fix step check
cols = df.filter(regex='StepRate').columns
df['Later Step Rate not higher than previous'] = ' '
for i, col in enumerate(cols):
    if i <= len(cols) - 2:
        df['Later Step Rate not higher than previous'] = np.where(df[col] > df[cols[i+1]], 'X', df['Later Step Rate not higher than previous'])
    else:
        break

# Format the dates
df['CutoffDate'].astype('datetime64[ns]')
df['CutoffDate'] = pd.to_datetime(df['CutoffDate']).dt.strftime("%m/%d/%Y")
df['ModEffectiveDate'].astype('datetime64[ns]')
df['ModEffectiveDate'] = pd.to_datetime(df['ModEffectiveDate']).dt.strftime("%m/%d/%Y")
df['StepDate1'].astype('datetime64[ns]')
df['StepDate1'] = pd.to_datetime(df['StepDate1']).dt.strftime("%m/%d/%Y")
df = df.replace('NaT', '', regex=True)

# Clean up (remove) zero step rates that follow the last true step
cols = df.filter(regex='StepRate').columns
for i, col in enumerate(cols):
    df[col] = df[col].replace(0, '', regex=True)

# Remove repeated loan number shown immediately before the step dates
df = df.drop(['loanid'], axis=1)

# Add color to column
def highlight(s):
    same = s == df['CutoffDate']
    return ['background-color: lightblue' if x else '' for x in same]

df.style.highlight_null(null_color='green')
df.to_excel(writer, index=False)
workbook = writer.book
worksheet = writer.sheets['Sheet1']

# Freeze 1st row
worksheet.freeze_panes(1, 0)

# Place dollar amounts where relevant
num_format = workbook.add_format({'num_format': '#,##0.00'})

# Find the columns (numbers) with float64 type and set the number format
nametypemap = df.dtypes.apply(lambda x: x.name).to_dict()
for i, (k, v) in enumerate(nametypemap.items()):
    # print("index: {}, key: {}, value: {}".format(i, k, v))
    if v == 'float64':
        worksheet.set_column(i, i, 12, num_format)

def get_col_widths(dataframe):
    # First we find the maximum length of the index column
    idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
    # Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
    return [idx_max] + [max([len(str(s)) for s in dataframe[col].values] + [len(col)]) for col in dataframe.columns]

for i, width in enumerate(get_col_widths(df)):
    worksheet.set_column(i, i, width)

def set_column_width(df):
    length_list = [len(x) for x in df.columns]
    for i, width in enumerate(length_list):
        worksheet.set_column(i, i, width)

set_column_width(df)
writer.save()
Please let me know if there is a way of doing this. I am also having a hard time trying to highlight a column in the dataframe. But I will make another separate post for that.
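Two things in the posted code may explain why only some columns change: set_column_width(df) is called last and overwrites the content-based widths with header-only lengths, and because the frame is written with index=False, the leading idx_max entry from get_col_widths shifts every width one column to the right. A hedged sketch of the adjustment (get_col_widths_no_index is an illustrative helper, assuming the same worksheet and df as above):
def get_col_widths_no_index(dataframe):
    # width of each data column only, since the sheet is written with index=False
    return [max([len(str(s)) for s in dataframe[col].values] + [len(col)])
            for col in dataframe.columns]

# apply the content-based widths and do not call set_column_width() afterwards,
# otherwise it overwrites these with the shorter header-only widths
for i, width in enumerate(get_col_widths_no_index(df)):
    worksheet.set_column(i, i, width)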

Assign custom categories to json data - pandas

I want to assign labels to raw data instead of getting new indicator columns from get_dummies. I want something like this:
json_input:
[{"id": 100, "vehicle_type": "Car", "time": "2017-04-06 01:39:43", "zone": "A", "type": "Checked"},
 {"id": 101, "vehicle_type": "Truck", "time": "2017-04-06 02:35:45", "zone": "B", "type": "Unchecked"},
 {"id": 102, "vehicle_type": "Truck", "time": "2017-04-05 03:20:12", "zone": "A", "type": "Checked"},
 {"id": 103, "vehicle_type": "Car", "time": "2017-04-04 10:05:04", "zone": "C", "type": "Unchecked"}]
Result:
id , vehicle_type, time_range, zone, type
100, 0 , 1 , 1 , 1
101, 1 , 1 , 2 , 0
102, 1 , 2 , 1 , 1
103, 0 , 3 , 3 , 0
TS = time stamp. Columns: vehicle_type and type are binary; time_range is 1 -> (TS1-TS2), 2 -> (TS3-TS4), 3 -> (TS5-TS6); zone is categorical (1, 2, or 3).
I want to auto-assign these labels when I feed the flattened JSON to a dataframe in pandas. Is this possible? (I do not want zone_1, type_1, vehicle_type_3 indicator columns from get_dummies.) If it is not possible with pandas, please suggest a Python library for this automation.
Here is what I could come up with; I do not know what time ranges you are looking for.
import datetime
import io
import pandas as pd
import numpy as np
df_string='[{"id":100,"vehicle_type":"Car","time":"2017-04-06 01:39:43","zone":"A","type":"Checked"},{"id":101,"vehicle_type":"Truck","time":"2017-04-06 02:35:45","zone":"B","type":"Unchecked"},{"id":102,"vehicle_type":"Truck","time":"2017-04-05 03:20:12","zone":"A","type":"Checked"},{"id":103,"vehicle_type":"Car","time":"2017-04-04 10:05:04","zone":"C","type":"Unchecked"}]'
df = pd.read_json(io.StringIO(df_string))
df['zone'] = pd.Categorical(df.zone)
df['vehicle_type'] = pd.Categorical(df.vehicle_type)
df['type'] = pd.Categorical(df.type)
df['zone_int'] = df.zone.cat.codes
df['vehicle_type_int'] = df.vehicle_type.cat.codes
df['type_int'] = df.type.cat.codes
df.head()
Edit
Here is what I could come up with
import datetime
import io
import math
import pandas as pd
#Taken from http://stackoverflow.com/questions/13071384/python-ceil-a-datetime-to-next-quarter-of-an-hour
def ceil_dt(dt, num_seconds=900):
    nsecs = dt.minute*60 + dt.second + dt.microsecond*1e-6
    delta = math.ceil(nsecs / num_seconds) * num_seconds - nsecs
    return dt + datetime.timedelta(seconds=delta)
df_string='[{"id":100,"vehicle_type":"Car","time":"2017-04-06 01:39:43","zone":"A","type":"Checked"},{"id":101,"vehicle_type":"Truck","time":"2017-04-06 02:35:45","zone":"B","type":"Unchecked"},{"id":102,"vehicle_type":"Truck","time":"2017-04-05 03:20:12","zone":"A","type":"Checked"},{"id":103,"vehicle_type":"Car","time":"2017-04-04 10:05:04","zone":"C","type":"Unchecked"}]'
df = pd.read_json(io.StringIO(df_string))
df['zone'] = pd.Categorical(df.zone)
df['vehicle_type'] = pd.Categorical(df.vehicle_type)
df['type'] = pd.Categorical(df.type)
df['zone_int'] = df.zone.cat.codes
df['vehicle_type_int'] = df.vehicle_type.cat.codes
df['type_int'] = df.type.cat.codes
df['time'] = pd.to_datetime(df.time)
df['dayofweek'] = df.time.dt.dayofweek
df['month_int'] = df.time.dt.month
df['year_int'] = df.time.dt.year
df['day'] = df.time.dt.day
df['date'] = df.time.apply(lambda x: x.date())
df['month'] = df.date.apply(lambda x: datetime.date(x.year, x.month, 1))
df['year'] = df.date.apply(lambda x: datetime.date(x.year, 1, 1))
df['hour'] = df.time.dt.hour
df['mins'] = df.time.dt.minute
df['seconds'] = df.time.dt.second
df['time_interval_3hour'] = df.hour.apply(lambda x : math.floor(x/3)+1)
df['time_interval_6hour'] = df.hour.apply(lambda x : math.floor(x/6)+1)
df['time_interval_12hour'] = df.hour.apply(lambda x : math.floor(x/12)+1)
df['weekend'] = df.dayofweek.apply(lambda x: x>4)
df['ceil_quarter_an_hour'] = df.time.apply(lambda x: ceil_dt(x))
df['ceil_half_an_hour'] = df.time.apply(lambda x: ceil_dt(x, num_seconds=1800))
df.head()
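Since the TS1-TS6 boundaries are not given in the question, here is a hedged sketch of how arbitrary time-of-day ranges could be mapped to the integer time_range labels with pd.cut; the boundary hours below are placeholders to be replaced with the real ranges:
import pandas as pd

# placeholder boundaries: 00-08h -> 1, 08-16h -> 2, 16-24h -> 3
df['time_range'] = pd.cut(df['time'].dt.hour, bins=[0, 8, 16, 24], labels=[1, 2, 3], right=False)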

appending to a pandas dataframe

I want to make a pandas dataframe with two columns: read_id and score.
I am using the following code:
import operator
import pandas as pd
import Bio.SeqIO
from Bio import pairwise2

reads_array = []
for x in Bio.SeqIO.parse("inp.fasta", "fasta"):
    reads_array.append(x)

columns = ["read_id", "score"]
df = pd.DataFrame(columns=columns)
df = df.fillna(0)

for x in reads_array:
    alignments = pairwise2.align.globalms("ACTTGAT", str(x.seq), 2, -1, -.5, -.1)
    sorted_alignments = sorted(alignments, key=operator.itemgetter(2), reverse=True)
    read_id = x.name
    score = sorted_alignments[0][2]
    df['read_id'] = read_id
    df['score'] = score
But this does not work. Can you suggest a way of generating the dataframe df?
At the top, make sure you have:
import numpy as np
Then replace the code you shared with:
reads_array = []
for x in Bio.SeqIO.parse("inp.fasta", "fasta"):
    reads_array.append(x)

df = pd.DataFrame(np.zeros((len(reads_array), 2)), columns=["read_id", "score"])
for index, x in enumerate(reads_array):
    alignments = pairwise2.align.globalms("ACTTGAT", str(x.seq), 2, -1, -.5, -.1)
    sorted_alignments = sorted(alignments, key=operator.itemgetter(2), reverse=True)
    read_id = x.name
    score = sorted_alignments[0][2]
    df.loc[index, 'read_id'] = read_id
    df.loc[index, 'score'] = score
The main problem with your original code comes down to two things:
1) Your dataframe had 0 rows.
2) df['column_name'] refers to the entire column, not a single cell, so when you execute df['column_name'] = value, every cell in that column gets set to that value.
df['read_id'] and df['score'] are Series. So if you want to iterate over reads_array, calculate some values, and assign them to df's columns, try the following (.ix has been removed from recent pandas, so use .loc with row and column in a single indexer to avoid chained assignment):
for i, x in enumerate(reads_array):
    ...
    df.loc[i, 'read_id'] = read_id
    df.loc[i, 'score'] = score
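As an aside, a sketch of a more idiomatic pattern (assuming the same Biopython objects as above) is to collect plain records in a list and build the DataFrame once, which avoids per-cell assignment entirely:
import operator
import pandas as pd
from Bio import pairwise2

records = []
for x in reads_array:
    alignments = pairwise2.align.globalms("ACTTGAT", str(x.seq), 2, -1, -.5, -.1)
    best = max(alignments, key=operator.itemgetter(2))  # alignment with the highest score
    records.append({"read_id": x.name, "score": best[2]})

df = pd.DataFrame(records, columns=["read_id", "score"])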
