Making a feature table in pandas - python

I have two folders containing files, where each file looks like this:
file1 file2
a 32 b 32
b 23 d 12
c 28 r 7
Note that not every letter has to appear in every file, and the letters can be in any order.
Now I want to create a table of this format-
a b c d........r s t.... class
32 23 28 0
0 32 0 12 7 0 0 ... 0
...................................... 1
Each row contains the value of each letter as given in the file, or 0 if the letter is not present. The class is 0 for files in the first folder and 1 for files in the second folder.
My attempt-
import os
import pandas as pd
dir_list = "........","........"] #CHANGE INPUT PATH
df = pd.DataFrame(columns=['class'])
count=0
for l in dir_list:
for root, dirs, files in os.walk(l):
for name in files:
outfile2 = open(root+"/"+name,'r')
line = outfile2.readline()
print(name)
count+=1
while line:
words=line.split(" ")
if words[0] not in df.columns:
df[words[0]]=words[1]
elif words[0] in df.columns:
df.iloc[count-1][words[0]]=words[1]
line = outfile2.readline()
if l==" ":
df[count-1]['class']='M'
else:
df[count-1]['class']='B'
df=df.fillna(0)
print(df)
error-
Traceback (most recent call last):
File "E:\anaconda\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "new.py", line 34, in <module>
df[count-1]['class']='M'
File "E:\anaconda\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "E:\anaconda\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "E:\anaconda\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "E:\anaconda\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "E:\anaconda\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0

Related

Error with fit_transform in scikit-learn for a movie recommender system

This is the full code that I have tried to run.
It runs fine except for an error in the second line from the bottom:
count_matrix = count.fit_transform(df['bag_of_words'])
I also don't know where this bag_of_words column comes from. Please suggest a code edit.
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')
df = df[['Title','Genre','Director','Actors','Plot']]
df.head()
# initializing the new column
df['Key_words'] = ""
for index, row in df.iterrows():
plot = row['Plot']
r = Rake()
r.extract_keywords_from_text(plot)
key_words_dict_scores = r.get_word_degrees()
row['Key_words'] = list(key_words_dict_scores.keys())
df.drop(columns = ['Plot'], inplace = True)
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
The error is as follows
Traceback (most recent call last):
File "C:\Python38\lib\site-packages\pandas\core\indexes\base.py", line 2897, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'bag_of_words'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "movie2.py", line 36, in <module>
count_matrix = count.fit_transform(df['bag_of_words'])
File "C:\Python38\lib\site-packages\pandas\core\frame.py", line 2995, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Python38\lib\site-packages\pandas\core\indexes\base.py", line 2899, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 107, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 131, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1607, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1614, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'bag_of_words'
Please tell me what to do?

During handling of the above exception, another exception occurred

So basically, what I'm trying to do is read a column from a CSV file into an array and then do calculations with that array. I have successfully read 'rawSunlightData' from the CSV file, but every time I try to select a value from the 'rawSunlightData' array I get the error [During handling of the above exception, another exception occurred]. I can print the whole rawSunlightData, but I can't print individual values like rawSunlightData[0].
cleanSunlightData = []
rawSunlightData = pd.read_csv('Average daily sunlight per month.csv', header = None)
rawSunlightData = rawSunlightData.drop(rawSunlightData.columns[[0]], axis=1)
print(rawSunlightData[0])
i = 0
while i <= len(rawSunlightData):
arrayDivider = []
m = 0
while m < 12:
x = i + m
print(x)
m += 1
i += 12
the error message is
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3078, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 958, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 964, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/kennethwong/Desktop/Singapore crop yield /Downloaded data/Data cleaner.py", line 67, in <module>
cleanSunlightData()
File "/Users/kennethwong/Desktop/Singapore crop yield /Downloaded data/Data cleaner.py", line 46, in cleanSunlightData
print(rawSunlightData[0])
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/frame.py", line 2688, in __getitem__
return self._getitem_column(key)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/frame.py", line 2695, in _getitem_column
return self._get_item_cache(key)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/generic.py", line 2489, in _get_item_cache
values = self._data.get(item)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/internals.py", line 4115, in get
loc = self.items.get_loc(item)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 3080, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 140, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 162, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 958, in pandas._libs.hashtable.Int64HashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 964, in pandas._libs.hashtable.Int64HashTable.get_item
KeyError: 0
It's okay, guys, I found out why. I'm still new to coding, so I make mistakes. When you pull data from a CSV file and store it in a DataFrame, it is NOT an array! You have to convert it to an array first, for example with the DataFrame's to_records() method (rawSunlightData.to_records()).

Pandas reading html table

import pandas as pd
import pandas_datareader.data as web
coins = pd.read_html('https://coinmarketcap.com/')
for name in coins[0][1][1:]:
print(name)
Results in the error message below. When I print coins, I get the complete table, but when I try and get specific info it gives me this error message. I know this format works as I have copied it exactly from other exercises I have been learning from, and have just changed the website. Many thanks.
C:\Users\AppData\Local\Programs\Python\Python36-32\python.exe C:/Users/Desktop/python_work/crypto/crypto_corr.py
Traceback (most recent call last):
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\indexes\base.py", line 2525, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 1
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/Desktop/python_work/crypto/crypto_corr.py", line 6, in <module>
for name in coins[0][1][1:]:
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2139, in __getitem__
return self._getitem_column(key)
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2146, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\generic.py", line 1842, in _get_item_cache
values = self._data.get(item)
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\internals.py", line 3843, in get
loc = self.items.get_loc(item)
File "C:\Users\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\indexes\base.py", line 2527, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 117, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 139, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\hashtable_class_helper.pxi", line 1265, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas\_libs\hashtable_class_helper.pxi", line 1273, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 1
Process finished with exit code 1
If df is a dataframe, indexing like df[column] looks for columns called column. In your case, coins[0] is a dataframe, which does not have a column 1. However, it does have a column Name, so to print all names do the following:
df = coins[0]
for name in df['Name']:
print(name)

pandas._libs.hashtable.PyObjectHashTable.get_item KeyError: 0

I am trying to map emotions from one dataset to another and drop everything that is bigger than 6 in the current dataset. How should I fix this error?
import pandas as pd
csv_file = 'sample.csv'
count = 1
my_filtered_csv = pd.read_csv(csv_file, usecols=['subDirectory_filePath', 'expression'])
#my_filtered_csv['expression'] = my_filtered_csv['expression'].map({ '0':'6', '1':'3', '2':'4', '3':'5', '4':'2', '5':'1', '6':'0'})
df = pd.DataFrame(columns=['subDirectory_filePath', 'expression'])
print(my_filtered_csv.dtypes.index)
filtered_csv = my_filtered_csv[my_filtered_csv.expression <= 6 ]
for i in range(len(filtered_csv['expression'])):
if filtered_csv['expression'][i]==0:
filtered_csv['expression'][i] = 6
elif filtered_csv['expression'][i]==1:
filtered_csv['expression'][i] = 3
elif filtered_csv['expression'][i]==2:
filtered_csv['expression'][i] = 4
elif filtered_csv['expression'][i]==3:
filtered_csv['expression'][i] = 5
elif filtered_csv['expression'][i]==4:
filtered_csv['expression'][i] = 2
elif filtered_csv['expression'][i]==5:
filtered_csv['expression'][i] = 1
elif filtered_csv['expression'][i]==6:
filtered_csv['expression'][i] = 0
print(len(my_filtered_csv))
print('****')
for val in df['expression']:
print(val)
emotion_map = { '0':'6', '1':'3', '2':'4', '3':'5', '4':'2', '5':'1', '6':'0'}
print(emotion_map)
for key, value in emotion_map.items():
print(key,' : ', value)
'''
affectnet
0: Neutral,
1: Happiness,
2: Sadness,
3: Surprise,
4: Fear,
5: Disgust,
6: Anger,
7: Contempt,
8: None,
9: Uncertain,
10: No-Face
FER13
(0=Angry, 1=Disgust, 2=Fear, 3=Happy, 4=Sad, 5=Surprise, 6=Neutral).
0-->6
1-->3
2-->4
3-->5
4-->2
5-->1
6-->0
'''
Error is:
Index(['subDirectory_filePath', 'expression'], dtype='object')
Traceback (most recent call last):
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 778, in _set_with_engine
self.index._engine.set_value(values, key, value)
File "pandas/_libs/index.pyx", line 116, in pandas._libs.index.IndexEngine.set_value
File "pandas/_libs/index.pyx", line 124, in pandas._libs.index.IndexEngine.set_value
File "pandas/_libs/index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1210, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1218, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2442, in get_loc
return self._engine.get_loc(key)
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1210, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1218, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 719, in setitem
self._set_with_engine(key, value)
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 781, in _set_with_engine
values[self.index.get_loc(key)] = value
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2444, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 1210, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 1218, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/mona/CS585/project/affnet/emotion_map.py", line 17, in <module>
df['expression'][i] = 3
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 771, in __setitem__
setitem(key, value)
File "/Users/mona/anaconda/lib/python3.6/site-packages/pandas/core/series.py", line 728, in setitem
values[key] = value
IndexError: index 0 is out of bounds for axis 0 with size 0
Process finished with exit code 1
A few lines of the CSV are:
,subDirectory_filePath,expression
0,689/737db2483489148d783ef278f43f486c0a97e140fc4b6b61b84363ca.jpg,1
1,392/c4db2f9b7e4b422d14b6e038f0cdc3ecee239b55326e9181ee4520f9.jpg,0
2,468/21772b68dc8c2a11678c8739eca33adb6ccc658600e4da2224080603.jpg,0
3,944/06e9ae8d3b240eb68fa60534783eacafce2def60a86042f9b7d59544.jpg,1
4,993/02e06ee5521958b4042dd73abb444220609d96f57b1689abbe87c024.jpg,8
I think this error comes from your [i] notation, which is trying to look for the DataFrame index value of 0, which doesn't exist. Try replacing every instance of [i] with .iloc[i].
Also, you should be able to replace the for loop with much more compact, readable, and less error-prone code, especially since you define emotion_map but use it only for output. Try changing the mapping from strings to integers with emotion_map = { 0:6, 1:3, 2:4, 3:5, 4:2, 5:1, 6:0}, then move it to just under filtered_csv = ..., and replace that for loop with
filtered_csv['expression'] = filtered_csv['expression'].replace(emotion_map)
With a pandas DataFrame or Series, use .iloc when you want positional access: plain [0] looks up the label 0, which raises KeyError when no such label exists.
I faced the same issue when there was only single element & I was trying to access that single element by prefixing 0......
Easy thing to do is also to convert your dataframe to an numpy array
df=np.array(df)
then you can use your for loop without any problems
In my situation the error was raised because I was trying to create the Series inside the for loop.
The solution in that case is to create the Series outside the for loop and then populate it inside the loop as needed.

KeyError: 0 when changing time format of data

I have a column of dates formatted as "%d%m%Y", like "15022016".
I need to convert them to "%Y-%m-%d", like "2016-02-15".
The data frame have 911,462 rows, and the code is as below:
for i in range(0,911462):
df['Date'][i]=datetime.datetime.strftime(datetime.datetime.strptime(df['Date'][i],"%d%m%Y"),"%Y-%m-%d")
Then I met with error as below:
Traceback (most recent call last):
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\indexes\base.py", line 2393, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)
File "pandas\_libs\index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)
File "pandas\_libs\hashtable_class_helper.pxi", line 1207, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)
File "pandas\_libs\hashtable_class_helper.pxi", line 1215, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)
KeyError: 0
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<input>", line 2, in <module>
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2062, in __getitem__
return self._getitem_column(key)
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2074, in _getitem_column
result = result[key]
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2062, in __getitem__
return self._getitem_column(key)
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\frame.py", line 2069, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\generic.py", line 1534, in _get_item_cache
values = self._data.get(item)
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\internals.py", line 3590, in get
loc = self.items.get_loc(item)
File "C:\Users\liangfan\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pandas\core\indexes\base.py", line 2395, in get_loc
return self._engine.get_loc(self._maybe_cast_indexer(key))
File "pandas\_libs\index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5239)
File "pandas\_libs\index.pyx", line 154, in pandas._libs.index.IndexEngine.get_loc (pandas\_libs\index.c:5085)
File "pandas\_libs\hashtable_class_helper.pxi", line 1207, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20405)
File "pandas\_libs\hashtable_class_helper.pxi", line 1215, in pandas._libs.hashtable.PyObjectHashTable.get_item (pandas\_libs\hashtable.c:20359)
KeyError: 0
I checked the raw data in Excel and it all looks fine, so there should be no problem with the raw data.
It's quite weird that the KeyError is 0. I have no idea what's wrong or how to deal with it.
Thanks for reading and waiting for your help! :)
You need pandas.to_datetime with parameter format:
df = pd.DataFrame({'Date':[15022016,15022016]})
print (df)
Date
0 15022016
1 15022016
df['Date'] = pd.to_datetime(df['Date'], format='%d%m%Y')
print (df)
Date
0 2016-02-15
1 2016-02-15
print (df['Date'].dtype)
datetime64[ns]

Categories