Python custom method set to new variable changes old variable - python

I have created a class with two methods, NRG_load and NRG_flat. The first loads a CSV, converts it into a DataFrame and applies some filtering; the second takes this DataFrame and, after creating two columns, it melts the DataFrame to pivot it.
I am trying out these methods with the following code:
nrg105 = eNRG.NRG_load('nrg_105a.tsv')
nrg105_flat = eNRG.NRG_flat(nrg105, '105')
where eNRG is the class, and '105' as second argument is needed to run an if-loop within the method to create the aforementioned columns.
The behaviour I cannot explain is that the second line - the one with the NRG_flat method - changes the nrg105 values.
Note that if I only run the NRG_load method, I get the expected DataFrame.
What is the behaviour I am missing? Because it's not the first time I apply a syntax like that, but I never had problems, so I don't know where I should look at.
Thank you in advance for all of your suggestions.
EDIT: as requested, here is the class' code:
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 16 15:22:21 2019
#author: CAPIZZI Filippo Antonio
"""
import pandas as pd
from FixFilename import FixFilename as ff
from SplitColumn import SplitColumn as sc
from datetime import datetime as ddt
class EurostatNRG:
# This class includes the modules needed to load and filter
# the Eurostat NRG files
# Default countries' lists to be used by the functions
COUNTRIES = [
'EU28', 'AL', 'AT', 'BE', 'BG', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL',
'ES', 'FI', 'FR', 'GE', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV',
'MD', 'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'SE', 'SI', 'SK',
'TR', 'UA', 'UK', 'XK'
]
# Default years of analysis
YEARS = list(range(2005, int(ddt.now().year) - 1))
# NOTE: the 'datetime' library will call the current year, but since
# the code is using the 'range' function, the end years will be always
# current-1 (e.g. if we are in 2019, 'current year' will be 2018).
# Thus, I have added "-1" because the end year is t-2.
INDIC_PROD = pd.read_excel(
'./Datasets/VITO/map_nrg.xlsx',
sheet_name=[
'nrg105a_indic', 'nrg105a_prod', 'nrg110a_indic', 'nrg110a_prod',
'nrg110'
],
convert_float=True)
def NRG_load(dataset, countries=COUNTRIES, years=YEARS, unit='ktoe'):
# This module will load and refine the NRG dataset,
# preparing it to be filtered
# Fix eventual flags
dataset = ff.fix_flags(dataset)
# Load the dataset into a DataFrame
df = pd.read_csv(
dataset,
delimiter='\t',
encoding='utf-8',
na_values=[':', ': ', ' :'],
decimal='.')
# Clean up spaces from the column names
df.columns = df.columns.str.strip()
# Removes the mentioned column because it's not needed
if 'Flag and Footnotes' in df.columns:
df.drop(columns=['Flag and Footnotes'], inplace=True)
# Split the first column into separate columns
df = sc.nrg_split_column(df)
# Rename the columns
df.rename(
columns={
'country': 'COUNTRY',
'fuel_code': 'KEY_PRODUCT',
'nrg_code': 'KEY_INDICATOR',
'unit': 'UNIT'
},
inplace=True)
# Filter the dataset
df = EurostatNRG.NRG_filter(
df, countries=countries, years=years, unit=unit)
return df
def NRG_filter(df, countries, years, unit):
# This module will filter the input DataFrame 'df'
# showing only the 'countries', 'years' and 'unit' selected
# First, all of the units not of interest are removed
df.drop(df[df.UNIT != unit.upper()].index, inplace=True)
# Then, all of the countries not of interest are filtered out
df.drop(df[~df['COUNTRY'].isin(countries)].index, inplace=True)
# Finally, all of the years not of interest are removed,
# and the columns are rearranged according to the desired output
main_cols = ['KEY_INDICATOR', 'KEY_PRODUCT', 'UNIT', 'COUNTRY']
cols = main_cols + [str(y) for y in years if y not in main_cols]
df = df.reindex(columns=cols)
return df
def NRG_flat(df, name):
# This module prepares the DataFrame to be flattened,
# then it gives it as output
# Assign the indicators and products' names
if '105' in name: # 'name' is the name of the dataset
# Creating the 'INDICATOR' column
indic_dic = dict(
zip(EurostatNRG.INDIC_PROD['nrg105a_indic'].KEY_INDICATOR,
EurostatNRG.INDIC_PROD['nrg105a_indic'].INDICATOR))
df['INDICATOR'] = df['KEY_INDICATOR'].map(indic_dic)
# Creating the 'PRODUCT' column
prod_dic = dict(
zip(
EurostatNRG.INDIC_PROD['nrg105a_prod'].KEY_PRODUCT.astype(
str), EurostatNRG.INDIC_PROD['nrg105a_prod'].PRODUCT))
df['PRODUCT'] = df['KEY_PRODUCT'].map(prod_dic)
elif '110' in name:
# Creating the 'INDICATOR' column
indic_dic = dict(
zip(EurostatNRG.INDIC_PROD['nrg110a_indic'].KEY_INDICATOR,
EurostatNRG.INDIC_PROD['nrg110a_indic'].INDICATOR))
df['INDICATOR'] = df['KEY_INDICATOR'].map(indic_dic)
# Creating the 'PRODUCT' column
prod_dic = dict(
zip(
EurostatNRG.INDIC_PROD['nrg110a_prod'].KEY_PRODUCT.astype(
str), EurostatNRG.INDIC_PROD['nrg110a_prod'].PRODUCT))
df['PRODUCT'] = df['KEY_PRODUCT'].map(prod_dic)
# Delete che columns 'KEY_INDICATOR' and 'KEY_PRODUCT', and
# rearrange the columns in the desired order
df.drop(columns=['KEY_INDICATOR', 'KEY_PRODUCT'], inplace=True)
main_cols = ['INDICATOR', 'PRODUCT', 'UNIT', 'COUNTRY']
year_cols = [y for y in df.columns if y not in main_cols]
cols = main_cols + year_cols
df = df.reindex(columns=cols)
# Pivot the DataFrame to have it in flat format
df = df.melt(
id_vars=df.columns[:4], var_name='YEAR', value_name='VALUE')
# Convert the 'VALUE' column into float numbers
df['VALUE'] = pd.to_numeric(df['VALUE'], downcast='float')
# Drop rows that have no indicators (it means they are not in
# the Excel file with the products of interest)
df.dropna(subset=['INDICATOR', 'PRODUCT'], inplace=True)
return df
EDIT 2: if this could help, this is the error I receive when using the EurostatNRG class in IPython:
[autoreload of EurostatNRG failed: Traceback (most recent call last):
File
"C:\Users\CAPIZZIF\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\extensions\autoreload.py",
line 244, in check
superreload(m, reload, self.old_objects) File "C:\Users\CAPIZZIF\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\extensions\autoreload.py",
line 394, in superreload
update_generic(old_obj, new_obj) File "C:\Users\CAPIZZIF\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\extensions\autoreload.py",
line 331, in update_generic
update(a, b) File "C:\Users\CAPIZZIF\AppData\Local\Continuum\anaconda3\lib\site-packages\IPython\extensions\autoreload.py",
line 279, in update_class
if (old_obj == new_obj) is True: File "C:\Users\CAPIZZIF\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\generic.py",
line 1478, in nonzero
.format(self.class.name)) ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or
a.all(). ]

I managed to find the culprit.
In the NRG_flat method, the lines:
df['INDICATOR'] = df['KEY_INDICATOR'].map(indic_dic)
...
df['PRODUCT'] = df['KEY_PRODUCT'].map(indic_dic)
mess up the copies of the df DataFrame, thus I had to change them with the Pandas assign method:
df = df.assign(INDICATOR=df.KEY_INDICATOR.map(prod_dic))
...
df = df.assign(PRODUCT=df.KEY_PRODUCT.map(prod_dic))
I do not get any more error.
Thank you for replying!

Related

add Columns and reorder them

first data frame :
Index([ 'AvailabilityZone', 'CreateTime', 'Encrypted', 'Size',
'SnapshotId', 'State', 'VolumeId', 'Iops', 'VolumeType',
'MultiAttachEnabled', 'KmsKeyId', 'instanceId', 'name','Attachments']
dtype='object')
Second data frame :
Index(['Attachments', 'AvailabilityZone', 'CreateTime', 'Size',
'SnapshotId', 'VolumeId', 'Iops', 'Tags', 'VolumeType',
'KmsKeyId', 'instanceId', 'name'],
dtype='object')
I am calling API to pull data but i am getting columns in different order and sometimes columns are present and sometimes columns are not present
Example : In first data frame i have 'MultiAttachEnabled' and 'State' but i second dataframe we don't have those columns. I want to change the order columns as well and remove some of the columns like Tags and Encrypted
In Final csv file i want to get :
Attachments,
AvailabilityZone ,
CreateTime,
KmsKeyId,
Size,
SnapshotId,
State,
VolumeId,
Iops,
VolumeType,
MultiAttachEnabled,
instanceId,
Throughput.
You can try the following where you add missing columns and order column name wise.
import numpy as np
# Required columns
columns = ['Attachments', 'AvailabilityZone', 'CreateTime', 'KmsKeyId', 'Size', 'SnapshotId', 'State', 'VolumeId', 'Iops', 'VolumeType', 'MultiAttachEnabled', 'instanceId', 'Throughput']
# Get missing columns
missing_columns = set(columns).difference(set(df.columns))
# Add missing columns
for i in missing_columns:
df[i] = np.nan
# Reorder column
df = df.reindex(sorted(df.columns), axis=1)

Add a calculated column to a pivot table in pandas

Hi I am trying to create new columns to a multi-indexed pandas pivot table to do a countif statement (similar to excel) depending if a level of the index contains a specific string. This is the sample data:
df = pd.DataFrame({'City': ['Houston', 'Austin', 'Hoover','Adak','Denver','Houston','Adak','Denver'],
'State': ['Texas', 'Texas', 'Alabama','Alaska','Colorado','Texas','Alaska','Colorado'],
'Name':['Aria', 'Penelope', 'Niko','Susan','Aria','Niko','Aria','Niko'],
'Unit':['Sales', 'Marketing', 'Operations','Sales','Operations','Operations','Sales','Operations'],
'Assigned':['Yes','No','Maybe','No','Yes','Yes','Yes','Yes']},
columns=['City', 'State', 'Name', 'Unit','Assigned'])
pivot=df.pivot_table(index=['City','State'],columns=['Name','Unit'],values=['Assigned'],aggfunc=lambda x:', '.join(set(x)),fill_value='')
and this is the desired output (in screenshot). Thanks in advance!
try:
temp = pivot[('Mango', 'Aria', 'Sales')].str.len()>0
pivot['new col'] = temp.astype(int)
the result:
Based on your edit:
import numpy as np
temp = pivot.xs('Sales', level=2, drop_level=False, axis = 1).apply(lambda x: np.sum([1 if y!='' else 0 for y in x]), axis = 1)
pivot[('', 'total sales', 'count how many...')]=temp

Faster way to iterate over columns in pandas

I have the following task.
I have this data:
import pandas
import numpy as np
data = {'name': ['Todd', 'Chris', 'Jackie', 'Ben', 'Richard', 'Susan', 'Joe', 'Rick'],
'phone': [912341.0, np.nan , 912343.0, np.nan, 912345.0, 912345.0, 912347.0, np.nan],
' email': ['todd#gmail.com', 'chris#gmail.com', np.nan, 'ben#gmail.com', np.nan ,np.nan , 'joe#gmail.com', 'rick#gmail.com'],
'most_visited_airport': ['Heathrow', 'Beijing', 'Heathrow', np.nan, 'Tokyo', 'Beijing', 'Tokyo', 'Heathrow'],
'most_visited_place': ['Turkey', 'Spain',np.nan , 'Germany', 'Germany', 'Spain',np.nan , 'Spain']
}
df = pandas.DataFrame(data)
What I have to do is for every feature column (most_visited_airport etc.) and its values (Heathrow, Beijing, Tokyo) I have to generate personal information and output it to a file.
E.g. If we look at most_visited_airport and Heathrow
I need to output three files containing the names, emails and phones of the people who visited the airport the most.
Currently, I have this code to do the operation for both columns and all the values:
columns_to_iterate = [ x for x in df.columns if 'most' in x]
for each in df[columns_to_iterate]:
values = df[each].dropna().unique()
for i in values:
df1 = df.loc[df[each]==i,'name']
df2 = df.loc[df[each]==i,' email']
df3 = df.loc[df[each]==i,'phone']
df1.to_csv(f'{each}_{i}_{df1.name}.csv')
df2.to_csv(f'{each}_{i}_{df2.name}.csv')
df3.to_csv(f'{each}_{i}_{df3.name}.csv')
Is it possible to do this in a more elegant and maybe faster way? Currently I have small dataset but not sure if this code will perform well with big data. My particular concern are the nested loops.
Thank you in advance!
You could replace the call to unique with a groupby, which would not only get the unique values, but split up the dataframe for you:
for column in df.filter(regex='^most'):
for key, group in df.groupby(column):
for attr in ('name', 'phone', 'email'):
group['name'].dropna().to_csv(f'{column}_{key}_{attr}.csv')
You can do it this way.
cols = df.filter(regex='most').columns.values
def func_current_cols_to_csv(most_col):
place = [i for i in df[most_col].dropna().unique().tolist()]
csv_cols = ['name', 'phone', ' email']
result = [df[df[most_col] == i][j].dropna().to_csv(f'{most_col}_{i}_{j}.csv', index=False) for i in place for j in
csv_cols]
return result
[func_current_cols_to_csv(i) for i in cols]
also in the options when writing to csv, you can leave the index, but do not forget to reset it before writing.

how to apply a class function to replace NaN for mean within a subset of pandas df columns?

The class is composed of a set of attributes and functions including:
Attributes:
df : a pandas dataframe.
numerical_feature_names: df columns with a numeric value.
label_column_names: df string columns to be grouped.
Functions:
mean(nums): takes a list of numbers as input and returns the mean
fill_na(df, numerical_feature_names, label_columns): takes class attributes as inputs and returns a transformed df.
And here's the class:
class PLUMBER():
def __init__(self):
################# attributes ################
self.df=df
# specify label and numerical features names:
self.numerical_feature_names=numerical_feature_names
self.label_column_names=label_column_names
##################### mean ##############################
def mean(self, nums):
total=0.0
for num in nums:
total=total+num
return total/len(nums)
############ fill the numerical features ##################
def fill_na(self, df, numerical_feature_names, label_column_names):
# declaring parameters:
df=self.df
numerical_feature_names=self.numerical_feature_names
label_column_names=self.label_column_names
# now replacing NaN with group mean
for numerical_feature_name in numerical_feature_names:
df[numerical_feature_name]=df.groupby([label_column_names]).transform(lambda x: x.fillna(self.mean(x)))
return df
When trying to apply it to a pandas df:
if __name__=="__main__":
# initialize class
plumber=PLUMBER()
# replace NaN with group mean
df=plumber.fill_na(df=df, numerical_feature_names=numerical_feature_names, label_column_names=label_column_names)
The next error arises:
ValueError: Grouper and axis must be same length
data and class parameters
import pandas as pd
d={'month': ['01/01/2020', '01/02/2020', '01/03/2020', '01/01/2020', '01/02/2020', '01/03/2020'],
'country': ['Japan', 'Japan', 'Japan', 'Poland', 'Poland', 'Poland'],
'level':['A01', 'A01', 'A01', 'A00','A00', 'A00'],
'job title':['Insights Manager', 'Insights Manager', 'Insights Manager', 'Sales Director', 'Sales Director', 'Sales Director'],
'number':[np.nan, 450, 299, np.nan, 19, 29],
'age':[np.nan, 30, 28, np.nan, 29, 18]}
df=pd.DataFrame(d)
# headers
column_names=df.columns.values.tolist()
column_names= [column_name.strip() for column_name in column_names]
# label_column_names (to be grouped)
label_column_names=['country', 'level', 'job title']
# numerical_features:
numerical_feature_names = [x for x in column_names if x not in label_column_names]
numerical_feature_names.remove('month')
How could I change the class in order to get the transformed df (i.e. the one that replaces np.nan with it's group mean)?
First the error is because label_column_names is already a list, so in the groupby you don't need the [] around it. so it should be df.groupby(label_column_names)... instead of df.groupby([label_column_names])...
Now, to actually solve you problem, in the function fill_na of your class, replace the loop for (you don't need it actually) by
df[numerical_feature_names] = (
df[numerical_feature_names]
.fillna(
df.groupby(label_column_names)
[numerical_feature_names].transform('mean')
)
)
in which you fillna the columns numerical_feature_names by the result of the groupy.tranform with the mean of these columns

Finding the min of a column across multiple lists in python

I need to find the minimum and maximum of a given a column from a csv file and currently the value is a string but I need it to be an integer, right now my output after I have split all the lines into lists looks like this
['FRA', 'Europe', 'France', '14/06/2020', '390', '10\n']
['FRA', 'Europe', 'France', '11/06/2020', '364', '27\n']
['FRA', 'Europe', 'France', '12/06/2020', '802', '28\n']
['FRA', 'Europe', 'France', '13/06/2020', '497', '24\n']
And from that line along with its many others I want to find the minimum of the
5th column and currently when I do
min(column[4])
It just gives the min of each individual list which is just the number in that column rather than grouping them all up and getting that minimum.
P.S: I am very new to python and coding in general, I also have to do this without any importing of modules.
For you Azro.
def main(csvfile,country,analysis):
infile = csvfile
datafile = open(infile, "r")
country = country.capitalize()
if analysis == "statistics":
for line in datafile.readlines():
column = line.split(",")
if column[2] == country:
You may use pandas that allows to read csv file and manipulate them as DataFrame, then it's very easy to retrieve a min/max from a column
import pandas as pd
df = pd.read_csv("test.txt", sep=',')
mini = df['colName'].min()
maxi = df['colName'].max()
print(mini, maxi)
Then if you have already read your data in a list of lists, you max use builtin min and max
# use rstrip() when reading line, to remove leading \n
values = [
['FRA', 'Europe', 'France', '14/06/2020', '390', '10'],
['FRA', 'Europe', 'France', '14/06/2020', '395', '10']
]
mini = min(values, key=lambda x: int(x[4]))[4]
maxi = max(values, key=lambda x: int(x[4]))[4]
Take a look at the library pandas and especially the DataFrame class. This is probably the go-to method for handling .csv files and tabular data in general.
Essentially, your code would be something like this:
import pandas as pd
df = pd.read_csv('my_file.csv') # Construct a DataFrame from a csv file
print(df.columns) # check to see which column names the dataframe has
print(df['My Column'].min())
print(df['My Column'].max())
There are shorter ways to do this. But this example goes step by step:
# After you read a CSV file, you'll have a bunch of rows.
rows = [
['A', '390', '...'],
['B', '750', '...'],
['C', '207', '...'],
]
# Grab a column that you want.
col = [row[1] for row in rows]
# Convert strings to integers.
vals = [int(s) for s in col]
# Print max.
print(max(vals))

Categories