I want to add a new column with an expression, as described here (https://www.mien.in/2018/03/25/reshaping-dataframe-using-pivot-and-melt-in-apache-spark-and-pandas/#pivot-in-spark). While doing so, my explode() step causes the column names to be looked up with backticks (`) added at the beginning and at the end of each name, which then gives this error:
Cannot resolve column name `Column_name` from [Column_name, Column_name2]
I tried reading the documentation and a few other questions on SO, but they don't address this issue.
I logged the different steps in order to give the reader some clarity.
The error is at the line:
_tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
The output of explode(...) is available here (https://pastebin.com/LU9p53th).
The function snippet is:
def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)
The whole code is:
import sys
from datetime import datetime
from itertools import chain
from typing import Iterable

from pyspark.context import SparkContext
from pyspark.sql import (DataFrame, DataFrameReader, DataFrameWriter, Row,
                         SparkSession)
from pyspark.sql.functions import *
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql.types import *

spark = SparkSession.builder.appName('navydish').getOrCreate()

last_correct_constant = 11
output_file = "april19_1.csv"
input_file_name = "input_for_aviral.csv"


def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)


def getrows(df, rownums=None):
    return df.rdd.zipWithIndex().filter(
        lambda x: x[1] in rownums).map(lambda x: x[0])


df = spark.read.csv(
    input_file_name,
    header=True
)
df2 = df
for _col in df.columns:
    if _col.startswith("_c"):
        df = df.drop(_col)
        if int(_col.split("_c")[-1]) > last_correct_constant:
            df2 = df2.drop(_col)
    else:
        # removes the reqd cols, keeps the messed up ones only.
        df2 = df2.drop(_col)

actual_cols = getrows(df2, rownums=[0]).collect()[0].asDict()
keys_de_cols = actual_cols.keys()

# df2 = df2.select([col(x).alias("right_" + str(x)) for x in right_cols])
df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])

periods = []
periods_cols = getrows(df, rownums=[0]).collect()[0].asDict()
for k, v in periods_cols.items():
    if v not in periods:
        periods.append(v)
# periods = list(set(periods))

expected_columns_from_df = [
    'Value Offtake(000 Rs.)',
    'Sales Volume (Volume(LITRES))'
]

for _col in df.columns:
    if _col.startswith('Value Offtake(000 Rs.)') or _col.startswith('Sales Volume (Volume(LITRES))'):
        continue
    df = df.drop(_col)

df2 = df2.withColumn("id", monotonically_increasing_id())
df = df.withColumn("id", monotonically_increasing_id())
df = df2.join(df, "id", "inner").drop("id")

print("After merge, cols of final dataframe are: ")
for _col in df.columns:
    print(_col)

# creating a list of all constant columns
id_vars = []
for i in range(len(df.columns)):
    if i < 12:
        id_vars.append(df.columns[i])

# creating a list of Values from expected columns
value_vars = []
for _col in df.columns:
    if _col.startswith(expected_columns_from_df[0]):
        value_vars.append(_col)
value_vars = id_vars + value_vars

print("Sending this value vars to melt:")
print(value_vars)

# the name of the column in the resulting DataFrame, Value Offtake(000 Rs.)
var_name = expected_columns_from_df[0]
# final value for which we want to melt, Periods
value_name = "Periods"

df = melt_df(
    df,
    id_vars, value_vars,
    var_name, value_name
)
print("The final headers of the resultant dataframe are: ")
print(df.columns)
The whole error is available here (https://pastebin.com/9cUupTy3).
I understand one would need the data to reproduce this, but if someone could clarify how explode() works and how the extra unwanted backticks (`) can be avoided, I can work from there.
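For context while reading the pastebins: a minimal sketch of one possible workaround, assuming the backticks come from special characters in the column names (the expected columns here contain dots, e.g. 'Value Offtake(000 Rs.)', and Spark's col() parses a dot as struct-field access). Escaping each raw name in backticks before building the struct makes Spark resolve it literally; this is a sketch, not a verified fix:

from pyspark.sql.functions import array, col, lit, struct

# sketch: escape the raw column name so dots/parentheses are taken literally
_vars_and_vals = array(*(
    struct(
        lit(c).alias(var_name),
        col("`{}`".format(c)).alias(value_name)  # backtick-quoted lookup
    )
    for c in value_vars))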
I am able to write a function that merges columns into a new column, but I fail to change an int column into float before converting it to string for merging.
I hope that in the new merged column, those integers would have a trailing ".00000" from the float conversion.
In the end I am trying to use the merged column as the key for joining two vaex DataFrames on multiple keys/columns. Since vaex seems to take only one column/key when joining, I need to make the combined column the key.
The int-to-float conversion is for the case where a column is int in one vaex DataFrame and float in the other.
The code is below.
The function new_column_by_column_merging works, but new_column_by_column_merging2 does not. I am wondering if there is any way to make it work.
import vaex
import pandas as pd
import numpy as np
def new_column_by_column_merging(df, columns=None):
    if columns is None:
        columns = df.get_column_names()
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    df['merged_column_key'] = np.array([''] * len(df))
    for col in columns:
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df


def new_column_by_column_merging2(df, columns=None):
    if columns is None:
        columns = df.get_column_names()
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    df['merged_column_key'] = np.array([''] * len(df))
    for col in columns:
        try:
            df[col] = df[col].astype('float')
        except:
            print('fail to convert to float')
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df
pandas_df = pd.DataFrame({'Name': ['Tom', 'Joseph', 'Krish', 'John'], 'Last Name': ['Johnson', 'Cameron', 'Biden', 'Washington'], 'Age': [20, 21, 19, 18], 'Weight': [60.0, 61.0, 62.0, 63.0]})
print('pandas_df is')
print(pandas_df)
df = vaex.from_pandas(df=pandas_df, copy_index=False)
df1 = new_column_by_column_merging(df, ['Name', 'Age', 'Weight'])
print('new_column_by_column_merging returns')
print(df1)
df2 = new_column_by_column_merging2(df, ['Name', 'Age', 'Weight'])
print('new_column_by_column_merging2 returns')
print(df2)
It looks like the vaex expression system does not always play nicely with try/except checks, so you need to be careful with the dtypes. One way of handling this:
import vaex
import numpy as np  # needed for np.array below

df = vaex.datasets.titanic()  # dataframe for testing


def new_column_by_column_merging2(df, columns=None):
    if columns is None:
        columns = df.get_column_names()
    if type(columns) is str:
        df['merged_column_key'] = df[columns]
        return df
    df['merged_column_key'] = np.array([''] * len(df))
    for col in columns:
        if df[col].is_string():
            pass
        else:
            df[col] = df[col].astype('float')
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df


new_column_by_column_merging2(df)  # should work
Basically I modified the try/except statement to explicitly check for strings (since they can't be converted to floats). You might have to extend that check to cover other types like datetimes if needed. Hope this helps.
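If it helps, a sketch of extending that check to datetimes, with the assumptions flagged: is_string() is the real vaex API used above, but the _is_datetime helper below is hypothetical glue that just inspects the dtype's string form, and whether astype('string') formats datetimes the way you want is something to verify:

import numpy as np

def _is_datetime(expr):
    # hypothetical helper: numpy datetime64-backed columns mention
    # 'datetime' in their dtype's repr
    return 'datetime' in str(expr.dtype)

def merge_key_column(df, columns):
    df['merged_column_key'] = np.array([''] * len(df))
    for col in columns:
        if df[col].is_string() or _is_datetime(df[col]):
            pass  # already text-representable; skip the float cast
        else:
            df[col] = df[col].astype('float')
        df['merged_column_key'] = df['merged_column_key'] + '_' + df[col].astype('string')
    return df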
I need to save matrix results in one DataFrame.
To do that, I split the matrix, create a new DataFrame for each iteration, and append it to the target DataFrame.
I don't know if this is a good way to do it or not. What about performance?
import pandas as pd
import numpy as np


def generate_Matrix_as_dataframe(productname, variableName, results):
    # df_results = pd.DataFrame({'Values': result})
    df = pd.DataFrame(results)
    dimension = len(results[0])
    df['Values'] = pd.Series(df.fillna('').values.tolist())
    # convert to Array
    df['Values'] = df['Values'].apply(lambda x: np.array(x))
    df_results = df[df.columns.drop([i for i in range(dimension)])]
    df_results = df_results.reset_index()
    df_results = df_results.rename(columns={"index": "Generation"})
    df_results['Depth'] = df_results.index + 1
    df_results['ProductName'] = productname
    df_results['VariableName'] = variableName
    return df_results[['ProductName', 'VariableName', 'Depth', 'Values']]


df_results_ifrs17 = pd.DataFrame(columns=['ProductName', 'VariableName', 'Depth', 'Values'])
products = ['P1', 'P2']
variables = ['V1', 'V2']
nbrproduct = 1
nbvariables = 1
for p in products:
    for v in variables:
        value = np.ones((nbrproduct, nbvariables), dtype=np.int32)
        df_results = generate_Matrix_as_dataframe(p, v, value)
        df_results_ifrs17 = df_results_ifrs17.append(df_results, ignore_index=True)
        nbvariables = nbvariables + 1
        print(value)
    nbrproduct = nbrproduct + 1
print(df_results_ifrs17)
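On the performance question: appending to the target DataFrame inside the loop copies the accumulated frame on every iteration (quadratic cost), and DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. A sketch of the usual alternative, keeping the loops above unchanged but collecting the pieces in a list and concatenating once:

import numpy as np
import pandas as pd

frames = []
nbrproduct = 1
nbvariables = 1
for p in products:
    for v in variables:
        value = np.ones((nbrproduct, nbvariables), dtype=np.int32)
        frames.append(generate_Matrix_as_dataframe(p, v, value))
        nbvariables = nbvariables + 1
    nbrproduct = nbrproduct + 1

# one concatenation at the end instead of one copy per iteration
df_results_ifrs17 = pd.concat(frames, ignore_index=True)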
I am importing a .txt file via read_table and get a DataFrame similar to
d = ['89278 5857', '1.000e-02', '1.591184e-02', '2.100053e-02', '89300 5857', '4.038443e-01', '4.037924e-01', '4.037336e-01']
df = pd.DataFrame(data = d)
and would like to reorganize it into
r = {'89278 5857': [1.000e-02, 1.591184e-02, 2.100053e-02], '89300 5857': [4.038443e-01, 4.037924e-01, 4.037336e-01]}
rf = pd.DataFrame(data = r)
The .txt file is typically 50k+ rows with an unknown number of '89278 5857' type values.
Thanks!
You can use itertools.groupby:
from itertools import groupby

data, cur_group = {}, None
for v, g in groupby(df[0], lambda k: " " in k):
    if v:
        cur_group = []
        data[next(g)] = cur_group
    else:
        cur_group.extend(g)

df = pd.DataFrame(data)
print(df)
Prints:
89278 5857 89300 5857
0 1.000e-02 4.038443e-01
1 1.591184e-02 4.037924e-01
2 2.100053e-02 4.037336e-01
Assuming what delineates the start of the next group is a space (and that the single column is named 'value'), here is what I would do:
import numpy

df.assign(
    key=lambda df: numpy.where(
        df['value'].str.contains(' '),  # what defines each group
        df['value'],
        numpy.nan
    ),
).fillna(
    method='ffill'  # copy the group label down until the next group starts
).loc[
    lambda df: df['value'] != df['key']  # remove the rows that kicked off each group
].assign(
    idx=lambda df: df.groupby('key').cumcount()  # get a row number for each group
).pivot(
    index='idx',  # pivot into the wide format
    columns='key',
    values='value'
).astype(float)  # turn values into numbers instead of strings
And I get:
key 89278 5857 89300 5857
idx
0 0.010000 0.403844
1 0.015912 0.403792
2 0.021001 0.403734
I'm trying to write a Python function that does One-Hot encoding in-place, but I'm having trouble finding a way to do the concat operation in-place at the end. It appears to make a copy of my DataFrame for the concat output, and I am unable to assign this to the DataFrame that I passed by reference.
How can this be done?
def one_hot_encode(df, col: str):
    """One-Hot encode inplace. Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    df.drop(col, axis=1, inplace=True)
    df[:] = pd.concat([df.iloc[:, :insert_loc], insert_data, df.iloc[:, insert_loc:]], axis=1)  # Doesn't take effect outside function
I don't think you can pass function arguments by reference in Python (see: How do I pass a variable by reference?).
Instead, what you can do is just return the modified df from your function and assign the result to the original df:
def one_hot_encode(df, col: str):
    ...
    return df

...

df = one_hot_encode(df, col)
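A tiny demo of the semantics at play, with made-up names: Python passes object references by value, so rebinding the parameter inside a function is invisible to the caller, while mutating the object it refers to is visible:

import pandas as pd

def rebind(df):
    df = df.drop(columns=['a'])  # rebinds the local name only

def mutate(df):
    df.drop(columns=['a'], inplace=True)  # mutates the shared object

frame = pd.DataFrame({'a': [1], 'b': [2]})
rebind(frame)
print(list(frame.columns))  # ['a', 'b'] -- caller unaffected
mutate(frame)
print(list(frame.columns))  # ['b'] -- caller sees the change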
To make the change take effect outside the function, we have to change the object that was passed in rather than replace its name (inside the function) with a new object.
To assign the new columns, you can use
df[insert_data.columns] = insert_data
instead of the concat.
That doesn't take advantage of your careful insert order though.
To retain your order, we can reindex the data frame (keeping in mind that reindex returns a new frame rather than modifying in place):
df.reindex(columns=cols)
where cols is the combined list of columns in order:
cols = cols[:insert_loc] + list(insert_data.columns) + cols[insert_loc:]
Putting it all together,
import pandas as pd


def one_hot_encode(df, col: str):
    """One-Hot encode inplace. Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    cols = list(df.columns)
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    cols = cols[:insert_loc] + list(insert_data.columns) + cols[insert_loc:]
    df[insert_data.columns] = insert_data
    df.reindex(columns=cols)  # note: returns a new frame; the caller's column order is unchanged
    df.drop(col, axis=1, inplace=True)


import seaborn
diamonds = seaborn.load_dataset("diamonds")
col = "color"
one_hot_encode(diamonds, "color")
assert("color" not in diamonds.columns)
assert(len([c for c in diamonds.columns if c.startswith("color")]) == 8)
df.insert is in-place, but it can only insert one column at a time. It might not be worth the reorder.
def one_hot_encode2(df, col: str):
    """One-Hot encode inplace. Includes NAN.

    Keyword arguments:
    df (DataFrame) -- the DataFrame object to modify
    col (str) -- the column name to encode
    """
    cols = list(df.columns)
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    for offset, newcol in enumerate(insert_data.columns):
        # insert expects a Series/array, not a one-column DataFrame
        df.insert(loc=insert_loc + offset, column=newcol, value=insert_data[newcol])
    df.drop(col, axis=1, inplace=True)


import seaborn
diamonds = seaborn.load_dataset("diamonds")
col = "color"
one_hot_encode2(diamonds, "color")
assert("color" not in diamonds.columns)
assert(len([c for c in diamonds.columns if c.startswith("color")]) == 8)
assert([i for i, c in enumerate(diamonds.columns) if c.startswith("color")][0] == 2)
The scope of a function's variables is limited to that function. Simply include a return statement at the end of the function to get your modified dataframe as output; calling the function will then return the modified dataframe. Also, while assigning the new (dummy) columns, use df instead of df[:], as you are changing the dimensions of the original dataframe.
def one_hot_encode(df, col: str):
    insert_loc = df.columns.get_loc(col)
    insert_data = pd.get_dummies(df[col], prefix=col + '_', dummy_na=True)
    df.drop(col, axis=1, inplace=True)
    df = pd.concat([df.iloc[:, :insert_loc], insert_data, df.iloc[:, insert_loc:]], axis=1)
    return df
Now, to see the modified dataframe, call the function and assign its result to a new or existing dataframe, as below:
df = one_hot_encode(df, '<any column name>')
I have a QTableWidget in editable mode in which the user enters integer input. How can I generate a list of the data entered in this table, so as to perform operations on it? Here is my manual code for that:
def dataframe_generation_from_table(self, table):
    number_of_rows = table.rowCount()
    number_of_columns = table.columnCount()
    tmp_df = pd.DataFrame({'Date': [], str(self.final_lvl_of_analysis): [], 'Value': []})
    for i in range(0, number_of_rows):
        for j in range(0, number_of_columns):
            tmp_item = table.item(i, j)
            tmp_df2 = pd.DataFrame({'Date': [pd.to_datetime(table.horizontalHeaderItem(j).data())], str(self.final_lvl_of_analysis): [str(table.verticalHeaderItem(i).data())], 'Value': [float(tmp_item.data(0))]})
            print tmp_df2
            tmp_df.update(tmp_df2, join='left', overwrite=False)
    return tmp_df
Also, I am using the following code for the QTableWidget generation:
self.pd_table = QtGui.QTableWidget(self.groupBox_19)
self.pd_table.setObjectName(_fromUtf8("pd_table"))
self.pd_table.setColumnCount(0)
self.pd_table.setRowCount(0)
My specs are : pandas 0.18.1 , PyQt 4 and Python 2.7
I think you're overcomplicating it a little with the updates/joins. The simplest approach is to create the full-size DataFrame first (filled with NaN) and then assign the data to this:
def dataframe_generation_from_table(self, table):
    number_of_rows = table.rowCount()
    number_of_columns = table.columnCount()
    tmp_df = pd.DataFrame(
        columns=['Date', str(self.final_lvl_of_analysis), 'Value'],  # Fill columns
        index=range(number_of_rows)  # Fill rows
    )
    for i in range(number_of_rows):
        for j in range(number_of_columns):
            # .ix works on the stated pandas 0.18.1; it is deprecated in
            # later versions, where .iloc would be used instead
            tmp_df.ix[i, j] = table.item(i, j).data()
    return tmp_df
The above code assigns data to its location by numerical index, so position 1,1 in the QTableWidget will end up at 1,1 in the DataFrame. This way you don't need to worry about the column headers when moving data. If you want to change the column names, you can do that when creating the DataFrame, by changing the values passed into the columns= parameter.
If you want to change a column to DateTime format, you should be able to do this in a single operation after the loop with:
tmp_df['Date'] = pd.to_datetime( tmp_df['Date'] )
The change from .data() to .text() eliminated the ValueError.
def saveFile(self):
    df = pd.DataFrame()
    savePath = QtGui.QFileDialog.getSaveFileName(None, "Blood Hound",
                                                 "Testing.csv", "CSV files (*.csv)")
    rows = self.tableWidget.rowCount()
    columns = self.tableWidget.columnCount()
    for i in range(rows):
        for j in range(columns):
            df.loc[i, j] = str(self.tableWidget.item(i, j).text())
    df.to_csv(savePath, header=None, index=0)
# creates a new df from qtable's dimensions,
# copies qtable (data & headers) to the df and returns the df
import pandas


# @staticmethod  # uncomment if this lives inside a class
def write_qtable_to_df(table):
    col_count = table.columnCount()
    row_count = table.rowCount()
    headers = [str(table.horizontalHeaderItem(i).text()) for i in range(col_count)]
    # df indexing is slow, so use lists
    df_list = []
    for row in range(row_count):
        df_list2 = []
        for col in range(col_count):
            table_item = table.item(row, col)
            df_list2.append('' if table_item is None else str(table_item.text()))
        df_list.append(df_list2)
    df = pandas.DataFrame(df_list, columns=headers)
    return df
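A hypothetical usage sketch (the widget contents and header names are made up, and PyQt4 is assumed per the question's specs; the function is called at module level, so the decorator above stays commented out):

import sys
from PyQt4 import QtGui

app = QtGui.QApplication(sys.argv)
table = QtGui.QTableWidget(2, 2)
table.setHorizontalHeaderLabels(['a', 'b'])
for row in range(2):
    for col in range(2):
        table.setItem(row, col, QtGui.QTableWidgetItem(str(row * 2 + col)))

df = write_qtable_to_df(table)
print(df)
#    a  b
# 0  0  1
# 1  2  3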