I have a QTableWidget in editable mode in which the user enters integer input. How can I generate a list of the data entered in this table so I can perform operations on it? Here is my manual code for that:
def dataframe_generation_from_table(self, table):
    number_of_rows = table.rowCount()
    number_of_columns = table.columnCount()
    tmp_df = pd.DataFrame({'Date': [], str(self.final_lvl_of_analysis): [], 'Value': []})
    for i in range(0, number_of_rows):
        for j in range(0, number_of_columns):
            tmp_item = table.item(i, j)
            tmp_df2 = pd.DataFrame({'Date': [pd.to_datetime(table.horizontalHeaderItem(j).data())],
                                    str(self.final_lvl_of_analysis): [str(table.verticalHeaderItem(i).data())],
                                    'Value': [float(tmp_item.data(0))]})
            print tmp_df2
            tmp_df.update(tmp_df2, join='left', overwrite=False)
    return tmp_df
Also, I am using the following code for the QTableWidget generation:
self.pd_table = QtGui.QTableWidget(self.groupBox_19)
self.pd_table.setObjectName(_fromUtf8("pd_table"))
self.pd_table.setColumnCount(0)
self.pd_table.setRowCount(0)
My specs are: pandas 0.18.1, PyQt4 and Python 2.7.
I think you're overcomplicating it a little with the updates/joins. The simplest approach is to create the full-size DataFrame first (filled with NaN) and then assign the data to it:
def dataframe_generation_from_table(self, table):
    number_of_rows = table.rowCount()
    number_of_columns = table.columnCount()
    tmp_df = pd.DataFrame(
        columns=['Date', str(self.final_lvl_of_analysis), 'Value'],  # fill columns
        index=range(number_of_rows)  # fill rows
    )
    for i in range(number_of_rows):
        for j in range(number_of_columns):
            # .ix works on the stated pandas 0.18.x; on modern pandas use .iloc instead
            tmp_df.ix[i, j] = table.item(i, j).data()
    return tmp_df
The above code assigns data to its location by the numerical index, so position (1, 1) in the QTableWidget will end up at (1, 1) in the DataFrame. This way you don't need to worry about the column headers when moving data. If you want to change the column names you can do that when creating the DataFrame, by changing the values passed into the columns= parameter.
If you want to change a column to DateTime format, you should be able to do this in a single operation after the loop with:
tmp_df['Date'] = pd.to_datetime( tmp_df['Date'] )
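Table items generally come back as strings, so the Value column may also need coercing after the loop. A minimal sketch using pd.to_numeric (the errors='coerce' choice, which turns non-numeric cells into NaN rather than raising, is an assumption about what you want):

# hedged sketch: coerce the string values pulled from the table into numbers,
# mapping anything non-numeric to NaN instead of raising a ValueError
tmp_df['Value'] = pd.to_numeric(tmp_df['Value'], errors='coerce')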
The change from .data() to .text() eliminated the ValueError.
def saveFile(self):
    df = pd.DataFrame()
    savePath = QtGui.QFileDialog.getSaveFileName(None, "Blood Hound",
                                                 "Testing.csv", "CSV files (*.csv)")
    rows = self.tableWidget.rowCount()
    columns = self.tableWidget.columnCount()
    for i in range(rows):
        for j in range(columns):
            df.loc[i, j] = str(self.tableWidget.item(i, j).text())
    df.to_csv(savePath, header=None, index=0)
# creates a new df from the qtable's dimensions,
# copies the qtable (data & headers) to the df and returns the df
@staticmethod
def write_qtable_to_df(table):
    col_count = table.columnCount()
    row_count = table.rowCount()
    headers = [str(table.horizontalHeaderItem(i).text()) for i in range(col_count)]
    # df indexing is slow, so build nested lists and construct the frame once
    df_list = []
    for row in range(row_count):
        df_list2 = []
        for col in range(col_count):
            table_item = table.item(row, col)
            # empty cells come back as None, so substitute an empty string
            df_list2.append('' if table_item is None else str(table_item.text()))
        df_list.append(df_list2)
    df = pandas.DataFrame(df_list, columns=headers)
    return df
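For completeness, a minimal usage sketch of the helper above (the widget attribute name self.tableWidget and the output path are assumptions, not from the original answer):

# hypothetical call site: dump the whole widget into a DataFrame, then to CSV;
# if the function lives on a class as a @staticmethod, call it via that class
df = write_qtable_to_df(self.tableWidget)
df.to_csv('table_dump.csv', index=False)  # 'table_dump.csv' is a placeholder path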
I have an Excel sheet whose column headers contain a dynamic prefix, like "s.FName", "g.LName", "Age", "Address", "P.CAR", "S.Licsence" etc. (about 100 columns).
The problem is that some of the columns do not have a prefix, so if I use the code below those column headers end up empty. I tried pathlib as well but it doesn't work.
dataset = pd.read_excel(fileloc)
df = pd.DataFrame(dataset)
df.columns = df.columns.str.split('.').str[1]
So, is there any way I can put a condition in the third line of that code?
I also tried the code below, but it gives an error:
dataset = pd.read_excel(fileloc)
df = pd.DataFrame(dataset)
colindex = 0
for (columnName, columndata) in df.iteritems():
    if str(columnName).__contains__('.'):
        df.insert(colindex, str(columnName[0]).split('.')[1], columndata.value, True)
    else:
        df.insert(colindex, str(columnName[0]).split('.')[0], columndata.value, True)
EDIT
===Solution===
This may not be the optimal solution or the correct way to do it, but other solutions are always welcome.
collist = []
dataset = pd.read_excel(fileloc)
df = pd.DataFrame(dataset)
# df.columns = df.columns.str.split('.').str[-1]
for col in dataset.head(0):
    if str(col).__contains__('.'):
        ncol = str(col).split('.')[1]
        collist.append(str(ncol))
        # df.rename(columns={col: ncol for col, ncol in zip(col, ncol)}, inplace=True)
    else:
        ncol = str(col)
        collist.append(str(ncol))
        # df.rename(columns={col: ncol for col, ncol in zip(str(col), ncol)}, inplace=True)
col_rename_dict = {i: j for i, j in zip(dataset.head(0), collist)}
df.rename(columns=col_rename_dict, inplace=True)
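For what it's worth, the commented-out one-liner in the snippet above should handle both cases on its own: .str[-1] takes the last piece of the split, which is the part after the dot when a prefix exists and the unchanged name when there is no dot. A minimal sketch:

# 's.FName' -> 'FName', but 'Age' -> 'Age', because splitting a name with no
# '.' returns a one-element list whose last item is the original name
df.columns = df.columns.str.split('.').str[-1]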
So I know my code isn't that close to right, but I am trying to loop through a list of CSVs, line by line, to create a new CSV in which each line lists all the CSVs that met a condition. The first column in every CSV is "date"; I want to list the names of all CSVs where data["entry"] > 3 on that date, with date still being the first column.
Update: What I'm trying to do is, for each CSV, make a list of each date the condition was met, and on those days append file_name to that row/rows of the new CSV.
### create list from dir
listdrs = os.listdir('c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/SentdexTutorial/stock_dfs/')
### append full path to list
string = 'c:/Users/17409/AppData/Local/Programs/Python/Python38/Indicators/SentdexTutorial/stock_dfs/'
listdrs_path = [string + x for x in listdrs]
complete_string = ' is complete'
listdrs_confirmation = [x + complete_string for x in listdrs]
# print(listdrs_path)

### start loop: for each "file" in listdrs run the 2 functions below and overwrite the saved csv.
for file_path in listdrs_path:
    data = pd.read_csv(file_path, index_col=0)

    ########################################
    #### function 1
    def get_price_hist(ticker):
        # Put stock price data in dataframe
        data = pd.read_csv(file_path)
        # listdr = os.listdir('Users\17409\AppData\Local\Programs\Python\Python38\Indicators\Sentdex Tutorial\stock_dfs')
        ## print(listdr)
        # Convert date to timestamp and make index
        data.index = data["date"].apply(lambda x: pd.Timestamp(x))
        data.drop("date", axis=1, inplace=True)
        return data

    ## create new table and append data
    data = data[data.Entry > 3]
    for date in data.date:
        new_table[date].append(file_path)
    new_table_data = data.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())], columns=['date', 'table names'])
    print(new_table_data)
I would do something like this. You need to modify the following snippet according to your needs.
import pandas as pd
from glob import glob
from collections import defaultdict

# create and save some random data
df1 = pd.DataFrame({'date': [1, 2, 3], 'entry': [4, 3, 2]})
df2 = pd.DataFrame({'date': [1, 2, 3], 'entry': [1, 2, 4]})
df3 = pd.DataFrame({'date': [1, 2, 3], 'entry': [3, 1, 5]})
df1.to_csv('table1.csv')
df2.to_csv('table2.csv')
df3.to_csv('table3.csv')

# read all the csvs
tables = glob('*.csv')
new_table = defaultdict(list)

# create the new table
for table in tables:
    df = pd.read_csv(table)
    df = df[df.entry > 2]
    for date in df.date:
        new_table[date].append(table)

new_table_df = pd.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())], columns=['date', 'table names'])
print(new_table_df)
   date             table names
0     1  table3.csv,table1.csv
1     2              table1.csv
2     3  table2.csv,table3.csv
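If the goal is a new CSV rather than printed output, the summary frame can be written straight to disk; a one-line sketch (the filename is a placeholder, not from the original answer):

new_table_df.to_csv('summary.csv', index=False)  # 'summary.csv' is a placeholder path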
I had some issues with the other code; here is the final solution I was able to come up with.
if 'Entry' in data:
    ## create new table and append data
    data = data[data.Entry > 3]
    if 'date' in data:
        for date in data.date:
            if date not in new_table:
                new_table[date] = []
            new_table[date].append(
                pd.DataFrame({'FileName': [file_name], 'Entry': [int(data[data.date == date].Entry)]}))
        new_table
    elif 'Date' in data:
        for date in data.Date:
            if date not in new_table:
                new_table[date] = []
            new_table[date].append(
                pd.DataFrame({'FileName': [file_name], 'Entry': [int(data[data.Date == date].Entry)]}))

# sorted(new_table, key=lambda x: x[0])

def find_max(tbl):
    new_table_data = {}
    for date in sorted(tbl.keys()):
        merged_dt = pd.concat(tbl[date])
        max_entry_v = max(list(merged_dt.Entry))
        tbl_names = list(merged_dt[merged_dt.Entry == max_entry_v].FileName)
        new_table_data[date] = tbl_names
    return new_table_data

new_table_data = find_max(tbl=new_table)

# df = pd.DataFrame(new_table, columns=['date', 'tickers'])
# df.to_csv(input_path, index=False, header=True)
# find_max(new_table)
# new_table_data = pd.DataFrame([(k, ','.join(new_table[k])) for k in sorted(new_table.keys())],
#                               columns=['date', 'table names'])
print(new_table_data)
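If you then want that dict back as a flat table, as the commented-out lines above hint at, a hedged sketch along the same pattern (the output filename is a placeholder):

# new_table_data maps each date to the list of file names that hit the max Entry
result_df = pd.DataFrame(
    [(date, ','.join(names)) for date, names in sorted(new_table_data.items())],
    columns=['date', 'tickers'])
result_df.to_csv('max_entries.csv', index=False)  # 'max_entries.csv' is a placeholder path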
I have a dataframe and I want to create some new columns that contain the growth of the original columns.
First, I append the new columns to the dataframe, filling them with NaN values.
Then, for every row I check if the previous row corresponds to the previous year, and if it does I want to fill the new column with the growth of the variable. Otherwise I just leave the NaN value.
Here is my code:
for index, row in df.iterrows():
    if df.loc[index, 'year'] == df.loc[index - 1, 'year'] + 1 and df.loc[index, 'name'] == df.loc[index - 1, 'name']:
        df.loc[index, k:] = (df.loc[index, 1:k-1] / df.loc[index-1, 1:k-1]) - 1
Where k is the column index of the first new "growth" column that I created.
The problem with this code is that it leaves the new columns with NaN values, without making any change. Did I do anything wrong?
Thanks
df.sort_values('year', inplace=True)
growth_cols = [<your-growth-cols>]
new_cols = [x + "_growth" for x in growth_cols]
growth_df = df[growth_cols] / df[growth_cols].shift(1)
growth_df.rename(columns=dict(zip(growth_cols, new_cols)), inplace=True)
df = pd.concat([df, growth_df], axis=1)
df['gap'] = df.year.diff()
for col in new_cols:
    df[col] = df[col] * df['gap']
    df[col].replace(0, np.nan, inplace=True)
df.drop('gap', axis=1, inplace=True)
EDIT (based on updated question):
You would need to change the line
df['gap'] = df.year.diff()
to:
df['gap'] = df.groupby('name')['year'].diff()
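Put together on a toy frame, the vectorized idea looks something like the sketch below; the data and column names are made up, and it uses pct_change, which computes exactly the (current / previous) - 1 growth from the question:

import numpy as np
import pandas as pd

df = pd.DataFrame({'name': ['a', 'a', 'a', 'b', 'b'],
                   'year': [2000, 2001, 2003, 2000, 2001],
                   'sales': [10.0, 12.0, 15.0, 8.0, 10.0]})
df.sort_values(['name', 'year'], inplace=True)

# growth vs. the previous row within each name: (current / previous) - 1
df['sales_growth'] = df.groupby('name')['sales'].pct_change()

# keep the growth only where the previous row is exactly the previous year
gap = df.groupby('name')['year'].diff()
df.loc[gap != 1, 'sales_growth'] = np.nan
print(df)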
I want to add a new column with an expression as defined here (https://www.mien.in/2018/03/25/reshaping-dataframe-using-pivot-and-melt-in-apache-spark-and-pandas/#pivot-in-spark). While doing so, the explode() step changes the column names it looks up by adding backticks (`) at the beginning and end of each column name, which then gives this error:
Cannot resolve column name `Column_name` from [Column_name, Column_name2]
I tried reading the documentation and a few other questions on SO, but they don't address this issue.
I have logged the different steps in order to give the reader some clarity.
The error is at the line:
_tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
The output of explode(...) is available here (https://pastebin.com/LU9p53th).
The function snippet is:
def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)
Whereas the whole code is:
import sys
from datetime import datetime
from itertools import chain
from typing import Iterable

from pyspark.context import SparkContext
from pyspark.sql import (DataFrame, DataFrameReader, DataFrameWriter, Row,
                         SparkSession)
from pyspark.sql.functions import *
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql.types import *

spark = SparkSession.builder.appName('navydish').getOrCreate()

last_correct_constant = 11
output_file = "april19_1.csv"
input_file_name = "input_for_aviral.csv"


def melt_df(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    print("Value name is {} and value vars is {}".format(
        value_name, value_vars
    ))
    # df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    print("Explode: ")
    print(explode(_vars_and_vals))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    print("_tmp:")
    print(_tmp)
    sys.exit()
    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)


def getrows(df, rownums=None):
    return df.rdd.zipWithIndex().filter(
        lambda x: x[1] in rownums).map(lambda x: x[0])


df = spark.read.csv(
    input_file_name,
    header=True
)

df2 = df
for _col in df.columns:
    if _col.startswith("_c"):
        df = df.drop(_col)
        if int(_col.split("_c")[-1]) > last_correct_constant:
            df2 = df2.drop(_col)
    else:
        # removes the reqd cols, keeps the messed up ones only.
        df2 = df2.drop(_col)

actual_cols = getrows(df2, rownums=[0]).collect()[0].asDict()
keys_de_cols = actual_cols.keys()

# df2 = df2.select([col(x).alias("right_" + str(x)) for x in right_cols])
df2 = df2.select([col(k).alias(actual_cols[k]) for k in keys_de_cols])

periods = []
periods_cols = getrows(df, rownums=[0]).collect()[0].asDict()
for k, v in periods_cols.items():
    if v not in periods:
        periods.append(v)
# periods = list(set(periods))

expected_columns_from_df = [
    'Value Offtake(000 Rs.)',
    'Sales Volume (Volume(LITRES))'
]

for _col in df.columns:
    if _col.startswith('Value Offtake(000 Rs.)') or _col.startswith('Sales Volume (Volume(LITRES))'):
        continue
    df = df.drop(_col)

df2 = df2.withColumn("id", monotonically_increasing_id())
df = df.withColumn("id", monotonically_increasing_id())
df = df2.join(df, "id", "inner").drop("id")

print("After merge, cols of final dataframe are: ")
for _col in df.columns:
    print(_col)

# creating a list of all constant columns
id_vars = []
for i in range(len(df.columns)):
    if i < 12:
        id_vars.append(df.columns[i])

# creating a list of Values from expected columns
value_vars = []
for _col in df.columns:
    if _col.startswith(expected_columns_from_df[0]):
        value_vars.append(_col)
value_vars = id_vars + value_vars

print("Sending this value vars to melt:")
print(value_vars)

# the name of the column in the resulting DataFrame, Value Offtake(000 Rs.)
var_name = expected_columns_from_df[0]
# final value for which we want to melt, Periods
value_name = "Periods"

df = melt_df(
    df,
    id_vars, value_vars,
    var_name, value_name
)

print("The final headers of the resultant dataframe are: ")
print(df.columns)
The whole error is here (https://pastebin.com/9cUupTy3).
I understand one would normally need the data, but if someone could clarify how explode works in a way that the extra unwanted backticks (`) can be avoided, I can work from there.
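One thing that may help: the backticks are just Spark's way of quoting column names that contain special characters such as dots and parentheses, both of which appear in headers like 'Value Offtake(000 Rs.)', and resolution fails when the quoted name no longer matches an actual column. A hedged workaround sketch, renaming the offending columns to safe names before calling melt_df (the replacement scheme is an assumption, not from the original post):

# replace the characters that force Spark to backtick-quote a column name,
# so col()/explode() can resolve every name without quoting surprises
safe_names = {c: c.replace('.', '_').replace('(', '_').replace(')', '_')
              for c in df.columns}
for old, new in safe_names.items():
    if old != new:
        df = df.withColumnRenamed(old, new)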