I'm trying to learn about multiprocessing and pools to process some tweets I've got in a MySQL DB. Here is the code and error messages.
import multiprocessing
import sqlalchemy
import pandas as pd
import config
from nltk import tokenize as token
q = multiprocessing.Queue()
engine = sqlalchemy.create_engine(config.sqlConnectionString)
def getRow(pandasSeries):
    """Tokenize one tweet and append one row per token to ``tweet_tokens``.

    Args:
        pandasSeries: a single tweet row (a pandas Series) that provides the
            labels ID, BODY, AUTHOR, RETWEET_COUNT, FAVORITE_COUNT,
            FOLLOWERS_COUNT, FRIENDS_COUNT and PUBLISHED_AT.

    Returns:
        None. The token rows are written through the module-level ``engine``
        with ``if_exists='append'``. A tweet that produces no tokens writes
        nothing.
    """
    tweetTokenizer = token.TweetTokenizer()
    body = pandasSeries.loc['BODY']
    print(body, "\n", type(body))

    # These values are the same on every token row of the tweet: read them once.
    tweet_id = pandasSeries.loc['ID']
    author = pandasSeries.loc['AUTHOR']
    retweet = pandasSeries.loc['RETWEET_COUNT']
    fav = pandasSeries.loc['FAVORITE_COUNT']
    followers = pandasSeries.loc['FOLLOWERS_COUNT']
    friends = pandasSeries.loc['FRIENDS_COUNT']
    published_at = pandasSeries.loc['PUBLISHED_AT']

    records = [
        {
            'id': tweet_id,
            'tweet': tok,
            'author': author,
            'retweet': retweet,
            'fav': fav,
            'followers': followers,
            'friends': friends,
            'published_at': published_at,
        }
        for tok in tweetTokenizer.tokenize(body)
    ]
    if not records:
        # Nothing to store; avoid sending an empty, column-less frame to SQL.
        return

    # Build the frame in one step. DataFrame.append copied the whole frame on
    # every call (quadratic in the token count) and was removed in pandas 2.0.
    df = pd.DataFrame(
        records,
        columns=['id', 'tweet', 'author', 'retweet', 'fav', 'followers',
                 'friends', 'published_at'],
    )
    df.to_sql(name="tweet_tokens", con=engine, if_exists='append')
# Script entry point: load the tweet table and hand its rows to a worker pool.
if __name__ == '__main__':
##LOADING SQL INTO DATAFRAME##
databaseData = pd.read_sql_table(config.tweetTableName, engine)
# Pool of 6 worker processes.
pool = multiprocessing.Pool(6)
# NOTE(review): iterrows() yields (row label, Series) pairs, so `row` is a 2-tuple.
for row in databaseData.iterrows():
print(row)
# NOTE(review): map() iterates over that 2-tuple, so getRow is called once with
# the row label and once with the Series. The label has no .loc, hence the
# AttributeError shown in the output below.
pool.map(getRow, row)
pool.close()
# q is the module-level multiprocessing.Queue; this block never puts anything on it.
q.close()
q.join_thread()
"""
OUPUT
C:\Users\Def\Anaconda3\python.exe C:/Users/Def/Dropbox/Dissertation/testThreadCopy.py
(0, ID 3247
AUTHOR b'Elon Musk News'
RETWEET_COUNT 0
FAVORITE_COUNT 0
FOLLOWERS_COUNT 20467
FRIENDS_COUNT 14313
BODY Elon Musk Takes an Adorable 5th Grader's Idea ...
PUBLISHED_AT 2017-03-03 00:00:01
Name: 0, dtype: object)
Elon Musk Takes an Adorable 5th Grader's
<class 'str'>
multiprocessing.pool.RemoteTraceback:
Traceback (most recent call last):
File "C:\Users\Def\Anaconda3\lib\multiprocessing\pool.py", line 119, in worker
result = (True, func(*args, **kwds))
File "C:\Users\Def\Anaconda3\lib\multiprocessing\pool.py", line 44, in mapstar
return list(map(*args))
File "C:\Users\Def\Dropbox\Dissertation\testThreadCopy.py", line 16, in getRow
print(pandasSeries.loc['BODY'], "\n", type(pandasSeries.loc['BODY']))
AttributeError: 'numpy.int64' object has no attribute 'loc'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "C:/Users/Def/Dropbox/Dissertation/testThreadCopy.py", line 34, in <module>
pool.map(getRow, row)
File "C:\Users\Def\Anaconda3\lib\multiprocessing\pool.py", line 260, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\Def\Anaconda3\lib\multiprocessing\pool.py", line 608, in get
raise self._value
AttributeError: 'numpy.int64' object has no attribute 'loc'
Process finished with exit code 1
"""
What I don't understand is why it prints out the first Series and then crashes. And why does it say that pandasSeries.loc['BODY'] is of type numpy.int64 when the printout says that it is of type string? I'm sure I've gone wrong in a number of other places; if you can see where, please point it out.
Thanks.
When I construct a simple dataframe:
frame
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
and iterate twice I get:
for row in databaseData.iterrows():
for i in row:
print(i, type(i))
That inner loop produces 2 items, a row index/label, and a Series with the values.
0 <class 'numpy.int64'>
0 0
1 1
2 2
3 3
Name: 0, dtype: int32 <class 'pandas.core.series.Series'>
Your map does the same, sending a numeric index to one process (which produces the error), and a series to another.
If I use pool.map without the for row:
pool.map(getRow, databaseData.iterrows())
then getRow receives a 2 element tuple.
def getRow(aTuple):
rowlbl, rowSeries = aTuple
print(rowSeries)
...
Your print(row) shows this tuple; it's just harder to see because the Series part is multiline. If I add a \n it might be clearer
(0, # row label
ID 3247 # multiline Series
AUTHOR b'Elon Musk News'
RETWEET_COUNT 0
....
Name: 0, dtype: object)
Related
This is my code, I am trying to get months for a new column
import pandas as pd
df = pd.read_excel("..\Data.xlsx")
df.head(4)
p = df["Month"][0]
p[0:3]
I don't know what the issue is here, but it was working well for other datasets with the same attributes.
Dataset:
Month Passengers
0 1995-01-01 112
1 1995-02-01 118
2 1995-03-01 132
3 1995-04-01 129
4 1995-05-01 121
P.S.: In the Excel data set the month values are in Jan-1995, Feb-1995 format; pandas changed them to YYYY-MM-DD format when reading the file.
Traceback (most recent call last):
File "C:\Users\sreen\AppData\Local\Temp/ipykernel_27276/630478717.py", line 1, in <module>
p[0:3]
TypeError: 'Timestamp' object is not subscriptable
Maybe you need to write p = df["Month"]? In your current code, p is the first value of the Month column, i.e. a single Timestamp, so p[0:3] tries to slice a Timestamp, which can't be subscripted.
This shall work for you:
df.rename(columns = {'Month':'Date'}, inplace = True)
df['Month'] = pd.DatetimeIndex(df['Date']).month
I'm working on a ML project to predict answer times in stack overflow based on tags. Sample data:
Unnamed: 0 qid i qs qt tags qvc qac aid j as at
0 1 563355 62701.0 0 1235000081 php,error,gd,image-processing 220 2 563372 67183.0 2 1235000501
1 2 563355 62701.0 0 1235000081 php,error,gd,image-processing 220 2 563374 66554.0 0 1235000551
2 3 563356 15842.0 10 1235000140 lisp,scheme,subjective,clojure 1047 16 563358 15842.0 3 1235000177
3 4 563356 15842.0 10 1235000140 lisp,scheme,subjective,clojure 1047 16 563413 893.0 18 1235001545
4 5 563356 15842.0 10 1235000140 lisp,scheme,subjective,clojure 1047 16 563454 11649.0 4 1235002457
I'm stuck at the data cleaning process. I intend to create a new column named 'time_taken' which stores the difference between the at and qt columns.
Code:
import pandas as pd
import numpy as np
df = pd.read_csv("answers.csv")
df['time_taken'] = 0
print(type(df.time_taken))
for i in range(0,263541):
val = df.qt[i]
qtval = val.item()
val = df.at[i]
atval = val.item()
df.time_taken[i] = qtval - atval
I'm getting this error:
Traceback (most recent call last):
File "<ipython-input-39-9384be9e5531>", line 1, in <module>
val = df.at[0]
File "D:\Softwares\Anaconda\lib\site-packages\pandas\core\indexing.py", line 2080, in __getitem__
return super().__getitem__(key)
File "D:\Softwares\Anaconda\lib\site-packages\pandas\core\indexing.py", line 2027, in __getitem__
return self.obj._get_value(*key, takeable=self._takeable)
TypeError: _get_value() missing 1 required positional argument: 'col'
The problem here lies in the indexing of df.at
Types of df.at and df.qt are
<class 'pandas.core.indexing._AtIndexer'>
<class 'pandas.core.series.Series'> respectively.
I'm an absolute beginner in data science and do not have enough experience with pandas and numpy.
There is, to put it mildly, an easier way to do this.
df['time_taken'] = df['at'] - df.qt
The _AtIndexer issue comes up because .at is already a pandas attribute (the scalar accessor), so df.at returns that indexer instead of your column. For this reason, avoid giving columns names that clash with existing DataFrame attributes or methods. You can get around it by indexing with df['at'] instead of df.at.
Besides that, this operation — if I'm understanding it — can be done with one short line vs. a long for loop.
I have a function which calculates the mode of columns of a pandas dataframe:
def my_func(df):
    """Print the mode of every column of *df*, one column after another."""
    for label in df.columns:
        print(df[label].mode())
But I would like to make it more generic so that I can change which statistic I calculate e.g. mean, max,... I tried to pass the method mode() as an argument to my function:
def my_func(df, pandas_stat):
for col in df.columns:
stat = df[col].pandas_stat()
print(stat)
having referred to: How do I pass a method as a parameter in Python
However this doesn't seem to work for me.
Using a simple example:
> A
a b
0 1.0 2.0
1 2.0 4.0
2 2.0 6.0
3 3.0 NaN
4 NaN 4.0
5 3.0 NaN
6 2.0 6.0
7 4.0 6.0
It doesn't recognise the command mode:
> my_func(A, mode)
Traceback (most recent call last):
File "<ipython-input-332-c137de83a530>", line 1, in <module>
my_func(A, mode)
NameError: name 'mode' is not defined
so I tried pd.DataFrame.mode:
> my_func(A, pd.DataFrame.mode)
Traceback (most recent call last):
File "<ipython-input-334-dd913410abd0>", line 1, in <module>
my_func(A, pd.DataFrame.mode)
File "<ipython-input-329-8acf337bce92>", line 3, in my_func
stat = df[col].pandas_stat()
File "/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py", line 4376, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'pandas_stat'
Is there a way to pass the mode function?
You can use the built-in getattr together with the __name__ attribute to do so, but I guess it makes your code somewhat unclear. Maybe a better approach exists.
df = pd.DataFrame({'col1': list(range(5)), 'col2': list(range(5, 0, -1))})
df
Out:
col1 col2
0 0 5
1 1 4
2 2 3
3 3 2
4 4 1
Define my_func this way and apply it to df:
def my_func(df, pandas_stat):
    """Print one statistic per column of *df*.

    *pandas_stat* is a callable such as ``pd.DataFrame.mean``. Only its
    ``__name__`` is used, to look up the same-named method on each column.
    """
    for label in df.columns:
        column = df[label]
        method = getattr(column, pandas_stat.__name__)
        print(method())
my_func(df, pd.DataFrame.mean)
Out
2.0
3.0
Explanation: pd.DataFrame.mean has a __name__ attribute whose value is 'mean'. getattr looks up the attribute with that name on the object you give it (here each column, df[col]), and then you can call the result.
You can even pass arguments, if you need to:
def my_func(df, pandas_stat, *args, **kwargs):
    """Print the result of one method call per column of *df*.

    The method is looked up on each column by ``pandas_stat.__name__`` and is
    called with the extra positional and keyword arguments unchanged.
    """
    for label in df.columns:
        column = df[label]
        bound = getattr(column, pandas_stat.__name__)
        print(bound(*args, **kwargs))
my_func(df, pd.DataFrame.apply, lambda x: x ** 2)
Out:
0 0
1 1
2 4
3 9
4 16
Name: col1, dtype: int64
0 25
1 16
2 9
3 4
4 1
Name: col2, dtype: int64
But I repeat, I guess this approach is a little confusing.
Edit
About an error:
> my_func(A, pd.DataFrame.mode)
Traceback (most recent call last):
File "<ipython-input-334-dd913410abd0>", line 1, in <module>
my_func(A, pd.DataFrame.mode)
File "<ipython-input-329-8acf337bce92>", line 3, in my_func
stat = df[col].pandas_stat()
File "/anaconda3/envs/py36/lib/python3.6/site-packages/pandas/core/generic.py", line 4376, in __getattr__
return object.__getattribute__(self, name)
AttributeError: 'Series' object has no attribute 'pandas_stat'
When df[col].pandas_stat() is executed, the dot operator invokes the __getattribute__ method of the object on its left (here the Series df[col]). It is analogous to getattr, but it receives self as its first argument automatically.
So the second argument is the name of the attribute, which is literally 'pandas_stat' in your code. Execution fails because a pandas Series has no attribute with that name.
If you provide the correct name of an actual method ('mean', 'apply', and so on) to getattr, it finds that method through the object's class (the class __dict__ is where methods are listed) and returns it bound to the object, so you can call it with the (*args, **kwargs) syntax.
You can do this with getattr:
def my_func(df, pandas_stat):
    """Print and return one statistic per column of *df*.

    Args:
        df: the DataFrame whose columns are summarised.
        pandas_stat: name of the method to call on each column, as a string
            (for example ``"max"`` or ``"mean"``).

    Returns:
        A dict mapping each column label to the value that was printed for
        it, so that ``result = my_func(df, "max")`` holds the statistics
        instead of None.
    """
    results = {}
    for col in df.columns:
        # getattr only looks the method up; the empty parentheses are
        # required to call the method.
        results[col] = getattr(df[col], pandas_stat)()
        print(results[col])
    return results
df_max = my_func(df, "max")
I am new to python and even newer to pandas, but relatively well versed in R. I am using Anaconda, with Python 3.5 and pandas 0.18.1. I am trying to read in an excel file as a dataframe. The file admittedly is pretty... ugly. There is a lot of empty space, missing headers, etc. (I am not sure if this is the source of any issues)
I create the file object, then find the appropriate sheet, then try to read that sheet as a dataframe:
xl = pd.ExcelFile(allFiles[i])
sName = [s for s in xl.sheet_names if 'security exposure' in s.lower()]
df = xl.parse(sName)
df
Results:
{'Security exposure - 21 day lag': Percent of Total Holdings \
0 KMNFC vs. 3 Month LIBOR AUD
1 04-OCT-16
2 Australian Dollar
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 Long/Short Net Exposure
9 Total
10 NaN
11 Long
12 NaN
13 NaN
14 NaN
15 NaN
16 NaN
17 NaN
(This goes on for 20-30 more rows and 5-6 more columns)
I am using Anaconda, and Spyder, which has a 'Variable Explorer'. It shows the variable df to be a dict of the DataFrame type:
However, I cannot use iloc:
df.iloc[:,1]
Traceback (most recent call last):
File "<ipython-input-77-d7b3e16ccc56>", line 1, in <module>
df.iloc[:,1]
AttributeError: 'dict' object has no attribute 'iloc'
Any thoughts? What am I missing?
EDIT:
To be clear, what I am really trying to do is reference the first column of the df. In R this would be df[,1]. Looking around it seems to be not a very popular way to do things, or not the 'correct' way. I understand why indexing by column names, or keys, is better, but in this situation, I really just need to index the dataframes by column numbers. Any working method of doing that would be greatly appreciated.
EDIT (2):
Per a suggestion, I tried 'read_excel', with the same results:
df = pd.ExcelFile(allFiles[i]).parse(sName)
df.loc[1]
Traceback (most recent call last):
File "<ipython-input-90-fc40aa59bd20>", line 2, in <module>
df.loc[1]
AttributeError: 'dict' object has no attribute 'loc'
df = pd.read_excel(allFiles[i], sheetname = sName)
df.loc[1]
Traceback (most recent call last):
File "<ipython-input-91-72b8405c6c42>", line 2, in <module>
df.loc[1]
AttributeError: 'dict' object has no attribute 'loc'
The problem was here:
sName = [s for s in xl.sheet_names if 'security exposure' in s.lower()]
which returned a single element list. I changed it to the following:
sName = [s for s in xl.sheet_names if 'security exposure' in s.lower()][0]
which returns a string, and the code then performs as expected.
All thanks to ayhan for pointing this out.
I want to import a txt file, and do a few basic actions on it.
For some reason I keep getting an unhashable type error, not sure what the issue is:
# Read a tab-separated text file into a DataFrame, then try to take rows 0:639
# and print them (Python 2 print statement).
# filepath: path of the file passed to pd.read_csv.
def loadAndPrepData(filepath):
import pandas as pd
# Set the pandas 'display.width' option to 200 before printing.
pd.set_option('display.width',200)
dataFrame = pd.read_csv(filepath,header=0,sep='\t') #set header to first row and sep by tab
# NOTE(review): [0:639,:] hands a tuple of slices to DataFrame.__getitem__; the
# traceback below shows it being looked up as a column key, which raises
# "TypeError: unhashable type".
df = dataFrame[0:639,:]
print df
filepath = 'United States Cancer Statistics, 1999-2011 Incidencet.txt'
loadAndPrepData(filepath)
Traceback:
Traceback (most recent call last):
File "C:\Users\Michael\workspace\UCIIntrotoPythonDA\src\Michael_Madani_week3.py", line 16, in <module>
loadAndPrepData(filepath)
File "C:\Users\Michael\workspace\UCIIntrotoPythonDA\src\Michael_Madani_week3.py", line 12, in loadAndPrepData
df = dataFrame[0:639,:]
File "C:\Users\Michael\Anaconda\lib\site-packages\pandas\core\frame.py", line 1797, in __getitem__
return self._getitem_column(key)
File "C:\Users\Michael\Anaconda\lib\site-packages\pandas\core\frame.py", line 1804, in _getitem_column
return self._get_item_cache(key)
File "C:\Users\Michael\Anaconda\lib\site-packages\pandas\core\generic.py", line 1082, in _get_item_cache
res = cache.get(item)
TypeError: unhashable type
The problem is that the item getter ([]) needs a hashable key. When you provide it with [:] this is fine, because a single slice is handled as a row slice. But [:,:] passes a tuple of slices, which is looked up as a column key and cannot be hashed, so you get this error.
pd.DataFrame({"foo":range(1,10)})[:,:]
TypeError: unhashable type
While this works just fine:
pd.DataFrame({"foo":range(1,10)})[:]
However, you should be using an explicit indexer no matter how you want to slice: .loc for label-based selection or .iloc for position-based selection.
pd.DataFrame({"foo":range(1,10)}).loc[:,:]
foo
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9