Python DataFrame: add column as matrix

I need to save matrix results in one DataFrame.
To do that, I split the matrix, create a new DataFrame for each iteration, and append it to the target DataFrame.
I don't know if this is a good way to do it or not.
What about performance?
import pandas as pd
import numpy as np

def generate_Matrix_as_dataframe(productname, variableName, results):
    df = pd.DataFrame(results)
    dimension = len(results[0])
    # collect each row of the matrix into a single cell holding the whole row
    df['Values'] = pd.Series(df.fillna('').values.tolist())
    # convert to array
    df['Values'] = df['Values'].apply(lambda x: np.array(x))
    df_results = df[df.columns.drop([i for i in range(dimension)])]
    df_results = df_results.reset_index()
    df_results = df_results.rename(columns={"index": "Generation"})
    df_results['Depth'] = df_results.index + 1
    df_results['ProductName'] = productname
    df_results['VariableName'] = variableName
    return df_results[['ProductName', 'VariableName', 'Depth', 'Values']]

df_results_ifrs17 = pd.DataFrame(columns=['ProductName', 'VariableName', 'Depth', 'Values'])

products = ['P1', 'P2']
variables = ['V1', 'V2']
nbrproduct = 1
nbvariables = 1
for p in products:
    for v in variables:
        value = np.ones((nbrproduct, nbvariables), dtype=np.int32)
        df_results = generate_Matrix_as_dataframe(p, v, value)
        # DataFrame.append is deprecated (removed in pandas 2.0); see the sketch below
        df_results_ifrs17 = df_results_ifrs17.append(df_results, ignore_index=True)
        nbvariables = nbvariables + 1
        print(value)
    nbrproduct = nbrproduct + 1

print(df_results_ifrs17)
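Regarding performance: appending to df_results_ifrs17 inside the loop copies the whole accumulated frame on every iteration, so it gets slower as the frame grows, and DataFrame.append itself was deprecated in pandas 1.4 and removed in 2.0. Below is a minimal sketch of the usual alternative, reusing generate_Matrix_as_dataframe and the lists defined above: collect the per-iteration frames in a plain list and concatenate once at the end.

frames = []  # one small DataFrame per (product, variable) pair
nbrproduct = 1
nbvariables = 1
for p in products:
    for v in variables:
        value = np.ones((nbrproduct, nbvariables), dtype=np.int32)
        frames.append(generate_Matrix_as_dataframe(p, v, value))
        nbvariables += 1
    nbrproduct += 1

# a single concat at the end avoids repeatedly copying the growing frame
df_results_ifrs17 = pd.concat(frames, ignore_index=True)
print(df_results_ifrs17)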

Related

Remove non-numeric rows from dataframe

I have a dataframe of patients and their gene expressions. It has this format:
Patient_ID | gene1 | gene2 | ... | gene10000
p1 0.142 0.233 ... bla
p2 0.243 0.243 ... -0.364
...
p4000 1.423 bla ... -1.222
As you can see, the dataframe contains noise: some cells hold values other than a float.
I want to remove every row that has a non-numeric value in any column.
I've managed to do this using apply and pd.to_numeric like this:
cols = df.columns[1:]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df = df.dropna()
The problem is that it takes forever to run, and I need a more efficient way of achieving this.
EDIT: To reproduce something like my data:
arr = np.random.random_sample((3000,10000))
df = pd.DataFrame(arr, columns=['gene' + str(i) for i in range(10000)])
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(10000)], columns=['Patient_ID']),df],axis = 1)
df['gene0'][2] = 'bla'
df['gene9998'][4] = 'bla'
You were right, it is worth trying numpy :)
I got a 30-60x faster version (the bigger the array, the larger the improvement):
- Convert to a numpy array (.values)
- Iterate through all rows
- Try to convert each row to a row of floats
- If the conversion fails (some non-numeric value is present), record this in a boolean array
- Filter the DataFrame based on that array
Code:
import pandas as pd
import numpy as np
from line_profiler_pycharm import profile


def op_version(df):
    cols = df.columns[1:]
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
    return df.dropna()


def np_version(df):
    keep = np.full(len(df), True)
    for idx, row in enumerate(df.values[:, 1:]):
        try:
            row.astype(float)  # np.float is deprecated; plain float behaves the same here
        except ValueError:
            keep[idx] = False
            pass  # maybe it's better to store a to_remove list, depends on the data
    return df[keep]


@profile
def main():
    arr = np.random.random_sample((3000, 5000))
    df = pd.DataFrame(arr, columns=['gene' + str(i) for i in range(5000)])
    df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(3000)],
                                 columns=['Patient_ID']), df], axis=1)
    df['gene0'][2] = 'bla'
    df['gene998'][4] = 'bla'

    df2 = df.copy()
    df = op_version(df)
    df2 = np_version(df2)


main()
Note that I decreased the number of columns so it is more feasible for tests.
Also, I fixed a small bug in your example; instead of:
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(10000)], columns=['Patient_ID']),df],axis = 1)
I think it should be:
df = pd.concat([pd.DataFrame(['p' + str(i) for i in range(3000)], columns=['Patient_ID']),df],axis = 1)

Find minima and maxima of DataFrame by chronological order

I have a pandas data frame from which I extract local minima and maxima values. It works well so far, but the problem is: how can I place them into one list in chronological order (by Date)? They are currently separated into two lists, and I only want a single list of price values in chronological order.
import pandas as pd
import numpy as np
import yfinance
from scipy.signal import argrelextrema
import matplotlib.dates as mpl_dates


def extract_data():
    ticker = 'GBPJPY=X'
    ticker = yfinance.Ticker(ticker)
    start_date = '2022-09-25'
    end_date = '2022-10-08'
    df = ticker.history(interval='1h', start=start_date, end=end_date)
    df['Date'] = pd.to_datetime(df.index)
    df['Date'] = df['Date'].apply(mpl_dates.date2num)
    df = df.loc[:, ['Date', 'Open', 'High', 'Low', 'Close']]

    # Call function to find Min-Max Extrema
    find_extrema(df)


def find_extrema(df):
    n = 10  # number of points to be checked before and after
    # Find local peaks
    df['min'] = df.iloc[argrelextrema(df.Close.values, np.less_equal,
                                      order=n)[0]]['Close']
    df['max'] = df.iloc[argrelextrema(df.Close.values, np.greater_equal,
                                      order=n)[0]]['Close']
    min_values_list = []
    max_values_list = []

    # Add min values to list
    for item in df['min']:
        check_NaN = np.isnan(item)  # check if value is empty
        if check_NaN == True:
            pass
        else:
            min_values_list.append(item)

    # Add max values to list
    for item in df['max']:
        check_NaN = np.isnan(item)  # check if value is empty
        if check_NaN == True:
            pass
        else:
            max_values_list.append(item)

    print(f"Min: {min_values_list}")
    print(f"Max: {max_values_list}")


extract_data()
Option 1
First, use df.to_numpy to convert the min and max columns to a np.array.
Then get rid of all the NaN values by selecting from the array with np.logical_not applied to the boolean mask created with np.isnan. Because boolean indexing flattens the two-column array row by row (i.e. date by date), the surviving values come out in chronological order.
arr = df[['min','max']].to_numpy()
value_list = arr[np.logical_not(np.isnan(arr))].tolist()
print(value_list)
[159.7030029296875,
154.8979949951172,
160.7830047607422,
165.43800354003906,
149.55799865722656,
162.80499267578125,
156.6529998779297,
164.31900024414062,
156.125,
153.13499450683594,
161.3520050048828,
156.9340057373047,
162.52200317382812,
155.7740020751953,
160.98500061035156,
161.83700561523438]
Option 2
Rather more cumbersome:
n = 10
# get the indices for `min` and `max` in two arrays
_min = argrelextrema(df.Close.values, np.less_equal, order=n)[0]
_max = argrelextrema(df.Close.values, np.greater_equal, order=n)[0]
# create columns (assuming you need this for other purposes as well)
df['min'] = df.iloc[_min]['Close']
df['max'] = df.iloc[_max]['Close']
# create lists for `min` and `max`
min_values_list = df['min'].dropna().tolist()
max_values_list = df['max'].dropna().tolist()
# join the lists
value_list2 = min_values_list + max_values_list
value_idxs = _min.tolist() + _max.tolist()
# finally, sort `value_list2` based on `value_idxs`
value_list2 = [x for _, x in sorted(zip(value_idxs, value_list2))]
# check if result is the same:
value_list2 == value_list
# True
Assuming that you have max and min columns, what about something like this?
df['max_or_min'] = np.where(df['max'].notna(), df['max'], df['min'])
min_max_values = df['max_or_min'].dropna().values.tolist()

pandas dataframe creating columns with loop

I'm trying to add new columns and fill them with data using for loops: take data from the Price column and insert 1000 rows into a new dataframe column; after 1000 Price values, start a new column for the next 1000, and so on.
import pandas as pd
import matplotlib.pyplot as plt

data_frame = pd.read_csv('candle_data.csv', names=['Time', 'Symbol', 'Side', 'Size', 'Price',
                                                   '1', '2', '3', '4', '5'])
price_df = pd.DataFrame()
count_tick = 0
count_candle = 0

for price in data_frame['Price']:
    if count_tick < 1000:
        price_df[count_candle] = price
        count_tick += 1
    elif count_tick == 1000:
        count_tick = 0
        count_candle += 1

price_df.head()
It's not necessary to loop through the data frame; you can use slicing to achieve this. Look at the sample code below. I have loaded a DataFrame with 100 rows and create column 'col3' from the first 50 rows of 'col1', and after that column 'col4' from the next 50 rows of 'col1'. You could modify the code below to point to your columns and the values that you want.
import pandas as pd
import numpy as np

if __name__ == '__main__':
    col1 = np.linspace(0, 100, 100)
    col2 = np.linspace(100, 200, 100)
    dict = {'col1': col1, 'col2': col2}
    df = pd.DataFrame(dict)
    df['col3'] = df['col1'][0:50]
    df['col4'] = df['col1'][50:100]
    print(df)
Solution 2, based on added info from the comments:
import pandas as pd
import numpy as np

if __name__ == '__main__':
    pd.set_option('display.width', 100000)
    pd.set_option('display.max_columns', 500)

    # partition size; for this example I have taken a low volume of 20
    part_size = 20

    # number generation for the data frame
    col1 = np.linspace(0, 100, 100)
    col2 = np.linspace(100, 200, 100)

    # create initial data frame
    dict = {'col1': col1, 'col2': col2}
    df = pd.DataFrame(dict)
    len = df.shape[0]

    # tells you how many new columns you need
    rec = int(len / part_size)
    _ = {}

    # initialize slicing variables
    low = 0
    high = part_size
    print(len)

    for i in range(rec):
        if high >= len:
            _['col_name_here{0}'.format(i)] = df[low:]['col1']
            break
        else:
            _['col_name_here{0}'.format(i)] = df[low:high]['col1']
            low = high
            high += part_size

    df = df.assign(**_)
    print(df)
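Applied to the original question, the same no-loop idea can also be written with a single reshape of the Price values. This is only a sketch under the assumption that candle_data.csv has the columns listed in the question and that Price parses as numeric; the last column is padded with NaN when the row count is not a multiple of 1000.

import numpy as np
import pandas as pd

data_frame = pd.read_csv('candle_data.csv',
                         names=['Time', 'Symbol', 'Side', 'Size', 'Price',
                                '1', '2', '3', '4', '5'])

chunk = 1000
prices = data_frame['Price'].to_numpy()

# pad to a multiple of `chunk`, then reshape so each column holds 1000 consecutive prices
n_cols = int(np.ceil(len(prices) / chunk))
padded = np.full(n_cols * chunk, np.nan)
padded[:len(prices)] = prices

price_df = pd.DataFrame(padded.reshape(n_cols, chunk).T)
print(price_df.head())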

How to remove automatically added back ticks while using explode() in pyspark?

I want to add a new column with some expression as defined here (https://www.mien.in/2018/03/25/reshaping-dataframe-using-pivot-and-melt-in-apache-spark-and-pandas/#pivot-in-spark). While doing so, my explode() call changes the column names being looked up by adding backticks (`) at the beginning and the end of each column name, which then gives the error:
Cannot resolve column name `Column_name` from [Column_name, Column_name2]
I tried reading the documentation and a few other questions on SO, but they don't address this issue.
I tried logging the different steps, in order to give the reader some clarity.
The error is at the line:
_tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
The output of explode(...) is available here (https://pastebin.com/LU9p53th).
The function snippet is:
def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)
Whereas the whole code is:
import sys
from datetime import datetime
from itertools import chain
from typing import Iterable

from pyspark.context import SparkContext
from pyspark.sql import (DataFrame, DataFrameReader, DataFrameWriter, Row,
                         SparkSession)
from pyspark.sql.functions import *
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql.types import *

spark = SparkSession.builder.appName('navydish').getOrCreate()

last_correct_constant = 11
output_file = "april19_1.csv"
input_file_name = "input_for_aviral.csv"


def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)


def getrows(df, rownums=None):
    return df.rdd.zipWithIndex().filter(
        lambda x: x[1] in rownums).map(lambda x: x[0])


df = spark.read.csv(
    input_file_name,
    header=True
)

df2 = df
for _col in df.columns:
    if _col.startswith("_c"):
        df = df.drop(_col)
        if int(_col.split("_c")[-1]) > last_correct_constant:
            df2 = df2.drop(_col)
    else:
        # removes the reqd cols, keeps the messed up ones only.
        df2 = df2.drop(_col)

actual_cols = getrows(df2, rownums=[0]).collect()[0].asDict()
keys_de_cols = actual_cols.keys()

# df2 = df2.select([col(x).alias("right_" + str(x)) for x in right_cols])
df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])

periods = []
periods_cols = getrows(df, rownums=[0]).collect()[0].asDict()
for k, v in periods_cols.items():
    if v not in periods:
        periods.append(v)
# periods = list(set(periods))

expected_columns_from_df = [
    'Value Offtake(000 Rs.)',
    'Sales Volume (Volume(LITRES))'
]

for _col in df.columns:
    if _col.startswith('Value Offtake(000 Rs.)') or _col.startswith('Sales Volume (Volume(LITRES))'):
        continue
    df = df.drop(_col)

df2 = df2.withColumn("id", monotonically_increasing_id())
df = df.withColumn("id", monotonically_increasing_id())
df = df2.join(df, "id", "inner").drop("id")

print("After merge, cols of final dataframe are: ")
for _col in df.columns:
    print(_col)

# creating a list of all constant columns
id_vars = []
for i in range(len(df.columns)):
    if i < 12:
        id_vars.append(df.columns[i])

# creating a list of Values from expected columns
value_vars = []
for _col in df.columns:
    if _col.startswith(expected_columns_from_df[0]):
        value_vars.append(_col)

value_vars = id_vars + value_vars
print("Sending this value vars to melt:")
print(value_vars)

# the name of the column in the resulting DataFrame, Value Offtake(000 Rs.)
var_name = expected_columns_from_df[0]
# final value for which we want to melt, Periods
value_name = "Periods"

df = melt_df(
    df,
    id_vars, value_vars,
    var_name, value_name
)

print("The final headers of the resultant dataframe are: ")
print(df.columns)
The whole error is here (https://pastebin.com/9cUupTy3).
I understand one would need the data, but if someone could clarify how explode works here in a way that the extra unwanted backticks (`) can be avoided, I can work from there.
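Not a direct fix, but since the question asks how explode works here, below is a minimal, self-contained sketch of the same array/struct/explode melt pattern on a toy DataFrame (the column names id/jan/feb and the values are made up for illustration). Note that the backticks in the error message are just how Spark quotes the column name it failed to resolve; in my experience the lookup usually fails because the name passed to col() (often one containing spaces, dots or parentheses) does not exactly match a column of the frame being melted, rather than because explode itself rewrites the names.

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, col, explode, lit, struct

spark = SparkSession.builder.appName("melt_demo").getOrCreate()

# toy wide-format frame; the names are hypothetical
wide = spark.createDataFrame([("A", 1.0, 2.0), ("B", 3.0, 4.0)],
                             ["id", "jan", "feb"])

# array<struct<variable, value>>, built exactly like _vars_and_vals above
pairs = array(*(struct(lit(c).alias("variable"), col(c).alias("value"))
                for c in ["jan", "feb"]))

long_df = (wide
           .withColumn("_vars_and_vals", explode(pairs))
           .select("id",
                   col("_vars_and_vals")["variable"].alias("variable"),
                   col("_vars_and_vals")["value"].alias("value")))
long_df.show()  # one row per (id, month) pair, i.e. the wide frame melted to long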

My loop always skips the first index

Every time I create a loop like this, I have a problem with the first item.
For example:
dfd = quandl.get("FRED/DEXBZUS")
dfe = quandl.get("ECB/EURBRL")
df = [dfd, dfe]
dps = []
for i in df:
I only get the second dataframe's values.
Using this:
dfd = quandl.get("FRED/DEXBZUS")
df = [dfd]
dps = []
for i in df:
I got this:
Empty DataFrame
Columns: []
Index: []
And if I use this (repeating the first one):
dfd = quandl.get("FRED/DEXBZUS")
dfe = quandl.get("ECB/EURBRL")
df = [dfd, dfd, dfe]
dps = []
for i in df:
I get both dataframes correctly.
Example:
import quandl
import pandas as pd
#import matplotlib
import matplotlib.pyplot as plt

dfd = quandl.get("FRED/DEXBZUS")
dfe = quandl.get("ECB/EURBRL")
df = [dfd, dfe]
dps = []

for i in df:
    df1 = i.reset_index()
    results = pd.DataFrame(df1)
    results = results.rename(columns={'Date': 'ds', 'Value': 'y'})
    dps = pd.DataFrame(dps.append(results))

print(dps)
Empty DataFrame
Columns: []
Index: []
ds y
0 2008-01-02 2.6010
1 2008-01-03 2.5979
2 2008-01-04 2.5709
3 2008-01-07 2.6027
4 2008-01-08 2.5796
UPDATE
As Bruno suggested, it is related to this function:
dps = pd.DataFrame(dps.append(results))
How do I append all the datasets into one data frame?
If you create a dataframe like result = pd.DataFrame(df1) and don't give columns, then by default it takes the first row as the columns, and later you are renaming the columns that were created by default.
So please create it as pd.DataFrame(df1, columns=[column_list]).
Then the first row will not be skipped.
# this will print every element in df
for i in df:
    print(i)
Also,
for dfIndex, i in enumerate(df):
    print(i)
    print(dfIndex)  # this will print the index of i in df
Note that indexes start at 0, not 1.
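For what it's worth, the empty first result comes from dps = pd.DataFrame(dps.append(results)): on the first pass dps is still a plain list, list.append returns None, and pd.DataFrame(None) is an empty frame, so the first dataset's rows are thrown away. A minimal sketch of how I would collect everything into one DataFrame (same quandl calls and column renaming as above):

import pandas as pd
import quandl

dfd = quandl.get("FRED/DEXBZUS")
dfe = quandl.get("ECB/EURBRL")

frames = []
for i in [dfd, dfe]:
    results = i.reset_index().rename(columns={'Date': 'ds', 'Value': 'y'})
    frames.append(results)  # list.append mutates the list in place; no reassignment needed

dps = pd.concat(frames, ignore_index=True)  # one frame with both datasets stacked
print(dps)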
