Pandas: cannot import name adjoin - python

From Wes:
def side_by_side(*objs, **kwds):
from pandas.core.common import adjoin
space = kwds.get('space', 4)
reprs = [repr(obj).split('\n') for obj in objs]
print adjoin(space, *reprs)
Apply below:
import pandas as pd
df1 = pd.DataFrame(np.random.rand(10,3))
df2 = pd.DataFrame(np.random.rand(10,3))
side_by_side(df1, df2)
Throws error:
ImportError Traceback (most recent call last)
<ipython-input-25-2674cd8a228c> in <module>()
3
4
----> 5 side_by_side(df1, df2)
<ipython-input-24-9f441ebc9cb3> in side_by_side(*objs, **kwds)
1 def side_by_side(*objs, **kwds):
----> 2 from pandas.core.common import adjoin
3 space = kwds.get('space', 4)
4 reprs = [repr(obj).split('\n') for obj in objs]
5 print adjoin(space, *reprs)
ImportError: cannot import name adjoin

I guess this function has been moved to pandas.formats.printing:
In [69]: from pandas.formats.printing import adjoin
UPDATE: as already mentioned by @debo, for pandas 0.20.0+ use:
from pandas.io.formats.printing import adjoin

Changed for pandas version 0.20.*
from pandas.io.formats.printing import adjoin

Related

Merge datasets using pandas

Below I have code which was provided to me in order to join 2 datasets.
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
df= pd.read_csv("student/student-por.csv")
ds= pd.read_csv("student/student-mat.csv")
print("before merge")
print(df)
print(ds)
print("After merging:")
dq = pd.merge(df,ds,by=c("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"))
print(dq)
I get this error:
Traceback (most recent call last):
File "/Users/PycharmProjects/datamining/main.py", line 15, in <module>
dq = pd.merge(df, ds,by=c ("school","sex","age","address","famsize","Pstatus","Medu","Fedu","Mjob","Fjob","reason","nursery","internet"))
NameError: name 'c' is not defined
Any help would be great, I've tried messing about with it for a while. I believe the 'by=c' is the issue.
Thanks
Hi 👋🏻 Hope you are doing well!
The error happens because of the c(...) call in the arguments of the merge function: c is R syntax and is not defined in Python. Also, the merge function has a different signature: it has no by argument; use on instead, which accepts a column name or a list of column names 🙂 So in summary it should be something similar to this:
import pandas as pd
df = pd.read_csv("student/student-por.csv")
ds = pd.read_csv("student/student-mat.csv")
print("Before merge.")
print(df)
print(ds)
print("After merge.")
dq = pd.merge(
left=df,
right=ds,
on=[
"school",
"sex",
"age",
"address",
"famsize",
"Pstatus",
"Medu",
"Fedu",
"Mjob",
"Fjob",
"reason",
"nursery",
"internet",
],
)
print(dq)
Docs: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

TypeError: 'DataFrame' object is not callable when dropping a column

In cols, I want to drop the class column in the breast_cancer_df, which corresponds with the last column. But I'm getting 'DataFrame' object is not callable error.
import numpy as np
import seaborn as sns; sns.set(style="ticks", color_codes=True)
import sklearn.datasets
import pandas as pd
breast_cancer = sklearn.datasets.load_breast_cancer()
breast_cancer_df = pd.DataFrame(
data= np.c_[breast_cancer.data, [breast_cancer.target_names[v] for v in breast_cancer.target]],
columns= list(breast_cancer.feature_names).append('class'))
cols = breast_cancer_df(columns=breast_cancer_df.columns[-1], axis=1, inplace=True)
breast_cancer_df[cols] = breast_cancer_df[cols].apply(pd.to_numeric)
g = sns.pairplot(breast_cancer_df, hue='class')
> --------------------------------------------------------------------------- TypeError Traceback (most recent call
> last) <ipython-input-224-76d71f9352d8> in <module>
> 3 data= np.c_[breast_cancer.data, [breast_cancer.target_names[v] for v in breast_cancer.target]],
> 4 columns= list(breast_cancer.feature_names).append('class'))
> ----> 5 cols = breast_cancer_df(columns=breast_cancer_df.columns[-1], axis=1, inplace=True)
> 6 breast_cancer_df[cols] = breast_cancer_df[cols].apply(pd.to_numeric)
> 7 g = sns.pairplot(breast_cancer_df, hue='class')
>
> TypeError: 'DataFrame' object is not callable
The syntax/logic is wrong at several levels: cols = breast_cancer_df(columns=breast_cancer_df.columns[-1], axis=1, inplace=True)
A dataframe is not a function, you cannot use breast_cancer_df(...). I imagine you want to slice here: breast_cancer_df[...] or use the drop method: breast_cancer_df.drop(...)
In drop(...) no need to combine columns=... and axis=1
If you use inplace=True in drop(...), the call returns None, so there is no point in capturing the result in cols.
Use:
breast_cancer_df = breast_cancer_df.iloc[:, :-1]
Or:
breast_cancer_df = breast_cancer_df.drop(columns=['class'])
NB. Better not use inplace as it might get deprecated in the future

How can I get rid of "IndexError: string index out of range"

I am trying to find neighbor nodes in a graph network. I have imported data in coa_train and now trying to find neighbor nodes.
import matplotlib.pyplot as plt
from math import isclose
from sklearn.decomposition import PCA
import os
import networkx as nx
import numpy as np
import pandas as pd
#from stellargraph import StellarGraph, datasets
#from stellargraph.data import EdgeSplitter
from collections import Counter
import multiprocessing
#from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
%matplotlib inline
def readTrainingData(tr):
    """Read whitespace-separated node pairs from a text file.

    Args:
        tr: Path of the file to read; each non-blank line holds two tokens.

    Returns:
        A list of ``(a1, a2)`` string tuples, one per non-blank line, in
        file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tokens.
    """
    trainingData = []
    with open(tr) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no pair.
                continue
            a1, a2 = fields
            trainingData.append((a1, a2))
    return trainingData
coa_train = readTrainingData("training.txt")
coa_train
[('8193', '16056'),
('24578', '21968'),
('24578', '18297'),
('24578', '16770'),
('24578', '17038'),
('8195', '2072'),
('8195', '20568'),
----------------------
import collections
def getNeighbors(data):
    """Build an undirected adjacency mapping from an iterable of pairs.

    Args:
        data: Iterable of two-element sequences ``(a, b)``. Pass the list
            object itself, not its name as a string: iterating a string
            yields single characters, and indexing one with ``[1]`` raises
            IndexError.

    Returns:
        A ``collections.defaultdict(set)`` mapping each node to the set of
        nodes it appears in a pair with.
    """
    neighbors = collections.defaultdict(set)
    for pair in data:
        # Record the edge in both directions so lookups work from either end.
        neighbors[pair[0]].add(pair[1])
        neighbors[pair[1]].add(pair[0])
    return neighbors
coa_neighbors= getNeighbors("coa_train")
Here, I am getting an error like:
IndexError Traceback (most recent call last)
<ipython-input-41-c775c56181f7> in <module>
13 return neighbors
14
---> 15 coa_neighbors= getNeighbors("coa_train")
16
<ipython-input-41-c775c56181f7> in getNeighbors(data)
5 for pair in data:
6
----> 7 neighbors[pair[0]].add(pair[1])
8
9 neighbors[pair[1]].add(pair[0])
IndexError: string index out of range
I can't see any reason for this error, as I believe the 0 and 1 indexes in the coa_train data are valid.
You pass a string into the function
getNeighbors("coa_train")
but it should be the variable instead
getNeighbors(coa_train)
With a string as argument, the loop
for pair in data:
will give you single characters. And with a single character, you can't do pair[1] any more.
you are passing string parameter to the function
coa_neighbors= getNeighbors("coa_train")
hence your,
for pair in data:
iterates over the individual characters of your string, which is why it raises the index out of range error. Try this:
coa_neighbors= getNeighbors(coa_train)
i hope it works.

using library inside python function

Problem: Importing a Python file (EDA.py) into a Jupyter notebook. The Python file uses pandas and has an "import pandas as pd" in it, but in Jupyter I get the error that pd is not defined.
Python file:
<EDA.py>
def eda_df(df):
import pandas as pd
print('=================Unique Values============================')
unique_series = df.apply(pd.Series.nunique).sort_values()
print(unique_series)
Jupyter Notebook:
import EDA
train = pd.read_csv(r'.\kaggle\housing\house-prices-advanced-regression-techniques\train.csv')
eda_df(train)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-269-86ee9695b171> in <module>
----> 1 eda_df(train)
~\iCloudDrive\Adnan PC\Data Science\Jupyter NB\EDA.py in eda_df(df)
13 print('Features missing more than 40% data: ',len(missing_data_list))
14 print(missing_data_list)
---> 15 print('=================Unique Values============================')
16 unique_series = df.apply(pd.Series.nunique).sort_values()
17 unique_list = unique_series[unique_series<15].index.to_list()
NameError: name 'pd' is not defined
You just need to import pandas as pd:
import pandas as pd
def eda_df(df):
    """Return the number of unique values in each column, sorted ascending.

    Relies on ``pd`` being imported at module level.

    Args:
        df: The DataFrame to summarise.

    Returns:
        A Series indexed by column name holding each column's count of
        unique values, ordered from fewest to most.
    """
    unique_series = df.apply(pd.Series.nunique).sort_values()
    return unique_series

How to remove every possible accents from a column in python

I am new to Python. I have a data frame with a column named 'Name'. The column contains different types of accents, and I am trying to remove them. For example, rubén => ruben, zuñiga => zuniga, etc. I wrote the following code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unicodedata
data=pd.read_csv('transactions.csv')
data.head()
nm=data['Name']
normal = unicodedata.normalize('NFKD', nm).encode('ASCII', 'ignore')
I am getting error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-41-1410866bc2c5> in <module>()
1 nm=data['Name']
----> 2 normal = unicodedata.normalize('NFKD', nm).encode('ASCII', 'ignore')
TypeError: normalize() argument 2 must be unicode, not Series
The reason why it is giving you that error is because normalize requires a string for the second parameter, not a list of strings. I found an example of this online:
unicodedata.normalize('NFKD', u"Durrës Åland Islands").encode('ascii','ignore')
'Durres Aland Islands'
Try this for one column:
nm = nm.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
Try this for multiple columns:
# Pick the object-dtype columns.
obj_cols = data.select_dtypes(include=['O']).columns
# Index with data[...], not data.loc[...]: .loc[labels] looks the names up in
# the row index and raises KeyError for column names.
data[obj_cols] = data[obj_cols].apply(lambda x: x.str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8'))
Try this for one column:
df[column_name] = df[column_name].apply(lambda x: unicodedata.normalize(u'NFKD', str(x)).encode('ascii', 'ignore').decode('utf-8'))
Change the column name according to your data columns.

Categories