Searching a Pandas series using a string produces a KeyError - python

I'm trying to use df[df['col'].str.contains("string")] (described in these two SO questions: 1 & 2) to select rows based on a partial string match. Here's my code:
import requests
import json
import pandas as pd
import datetime
url = "http://api.turfgame.com/v4/zones/all" # get request returns .json
r = requests.get(url)
df = pd.read_json(r.content) # create a df containing all zone info
print df[df['region'].str.contains("Uppsala")].head()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-23-55bbf5679808> in <module>()
----> 1 print df[df['region'].str.contains("Uppsala")].head()
C:\Users\User\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
1670 if isinstance(key, (Series, np.ndarray, list)):
1671 # either boolean or fancy integer index
-> 1672 return self._getitem_array(key)
1673 elif isinstance(key, DataFrame):
1674 return self._getitem_frame(key)
C:\Users\User\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\core\frame.pyc in _getitem_array(self, key)
1714 return self.take(indexer, axis=0, convert=False)
1715 else:
-> 1716 indexer = self.ix._convert_to_indexer(key, axis=1)
1717 return self.take(indexer, axis=1, convert=True)
1718
C:\Users\User\AppData\Local\Enthought\Canopy32\User\lib\site-packages\pandas\core\indexing.pyc in _convert_to_indexer(self, obj, axis, is_setter)
1083 if isinstance(obj, tuple) and is_setter:
1084 return {'key': obj}
-> 1085 raise KeyError('%s not in index' % objarr[mask])
1086
1087 return indexer
KeyError: '[ nan nan nan ..., nan nan nan] not in index'
I don't understand why I get a KeyError, because df.columns returns:
Index([u'dateCreated', u'id', u'latitude', u'longitude', u'name', u'pointsPerHour', u'region', u'takeoverPoints', u'totalTakeovers'], dtype='object')
So the key is in the list of columns, and opening the page in a browser I can find 739 instances of 'Uppsala'.
The column I'm searching holds nested JSON objects that look like this: {"id":200,"name":"Scotland","country":"gb"}. Do I have to do something special to search between the '{}' characters? Could somebody explain where I've made my mistake(s)?

Looks to me like your region column contains dictionaries, which aren't really supported as elements, and so .str isn't working. One way to solve the problem is to promote the region dictionaries to columns in their own right, maybe with something like:
>>> region = pd.DataFrame(df.pop("region").tolist())
>>> df = df.join(region, rsuffix="_region")
after which you have
>>> df.head()
dateCreated id latitude longitude name pointsPerHour takeoverPoints totalTakeovers country id_region name_region
0 2013-06-15T08:00:00+0000 14639 55.947079 -3.206477 GrandSquare 1 185 32 gb 200 Scotland
1 2014-06-15T20:02:37+0000 31571 55.649181 12.609056 Stenringen 1 185 6 dk 172 Hovedstaden
2 2013-06-15T08:00:00+0000 18958 54.593570 -5.955772 Hospitality 0 250 1 gb 206 Northern Ireland
3 2013-06-15T08:00:00+0000 18661 53.754283 -1.526638 LanshawZone 0 250 0 gb 202 Yorkshire & The Humber
4 2013-06-15T08:00:00+0000 17424 55.949285 -3.144777 NoDogsZone 0 250 5 gb 200 Scotland
and
>>> df[df["name_region"].str.contains("Uppsala")].head()
dateCreated id latitude longitude name pointsPerHour takeoverPoints totalTakeovers country id_region name_region
28 2013-07-16T18:53:48+0000 20828 59.793476 17.775389 MoraStenRast 5 125 536 se 142 Uppsala
59 2013-02-08T21:42:53+0000 14797 59.570418 17.482116 BålWoods 3 155 555 se 142 Uppsala
102 2014-06-19T12:00:00+0000 31843 59.617637 17.077094 EnaAlle 5 125 168 se 142 Uppsala
328 2012-09-24T20:08:22+0000 11461 59.634438 17.066398 BluePark 6 110 1968 se 142 Uppsala
330 2014-08-28T20:00:00+0000 33695 59.867027 17.710792 EnbackensBro 4 140 59 se 142 Uppsala
(A hack workaround would be df["region"].apply(str).str.contains("Uppsala"), but I think it's best to clean the data right at the start.)
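If your pandas version provides json_normalize, another way to "clean the data right at the start" is to flatten the nested objects while loading the JSON rather than repairing the DataFrame afterwards. A rough sketch, assuming r.json() returns a list of zone dicts shaped like the sample above (so the flattened column names such as region.name are assumptions):
import requests
import pandas as pd
r = requests.get("http://api.turfgame.com/v4/zones/all")
# json_normalize expands nested dicts into dotted columns, e.g. region.id, region.name, region.country
df = pd.json_normalize(r.json())  # on older pandas: pandas.io.json.json_normalize
print(df[df["region.name"].str.contains("Uppsala")].head())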

Python Pandas apply qcut to grouped by level 0 of multi-index in multi-index dataframe

I have a multi-index dataframe in pandas (date and entity_id), and for each date/entity I have observations of a number of variables (A, B ...). My goal is to create a dataframe with the same shape but where the values are replaced by their decile scores.
My test data looks like this:
I want to apply qcut to each column, grouped by level 0 of the multi-index; the issue I have is assembling the resulting DataFrame.
This code
def qcut_sub_index(df_with_sub_index):
    # create empty return value same shape as passed dataframe
    df_return = pd.DataFrame()
    for date, sub_df in df_with_sub_index.groupby(level=0):
        df_return = df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False, duplicates='drop')))
    print(df_return)
    return df_return

print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
returns
A
as_at_date entity_id
2008-01-27 2928 0
2932 3
3083 6
3333 9
2008-02-27 2928 3
2935 9
3333 0
3874 6
2008-03-27 2928 1
2932 2
2934 0
2936 9
2937 4
2939 9
2940 7
2943 3
2944 0
2945 8
2946 6
2947 5
2949 4
B
as_at_date entity_id
2008-01-27 2928 9
2932 6
3083 0
3333 3
2008-02-27 2928 6
2935 0
3333 3
3874 9
2008-03-27 2928 0
2932 9
2934 2
2936 8
2937 7
2939 6
2940 3
2943 1
2944 4
2945 9
2946 5
2947 4
2949 0
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-104-72ff0e6da288> in <module>
11
12
---> 13 print(df_values.apply(lambda x: qcut_sub_index(x), axis=0))
~\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7546 kwds=kwds,
7547 )
-> 7548 return op.get_result()
7549
7550 def applymap(self, func) -> "DataFrame":
~\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
178 return self.apply_raw()
179
--> 180 return self.apply_standard()
181
182 def apply_empty_result(self):
~\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
272
273 # wrap results
--> 274 return self.wrap_results(results, res_index)
275
276 def apply_series_generator(self) -> Tuple[ResType, "Index"]:
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results(self, results, res_index)
313 # see if we can infer the results
314 if len(results) > 0 and 0 in results and is_sequence(results[0]):
--> 315 return self.wrap_results_for_axis(results, res_index)
316
317 # dict of scalars
~\Anaconda3\lib\site-packages\pandas\core\apply.py in wrap_results_for_axis(self, results, res_index)
369
370 try:
--> 371 result = self.obj._constructor(data=results)
372 except ValueError as err:
373 if "arrays must all be same length" in str(err):
~\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
76 # figure out the index, if necessary
77 if index is None:
---> 78 index = extract_index(arrays)
79 else:
80 index = ensure_index(index)
~\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in extract_index(data)
385
386 if not indexes and not raw_lengths:
--> 387 raise ValueError("If using all scalar values, you must pass an index")
388
389 if have_series:
ValueError: If using all scalar values, you must pass an index
So something is preventing the second application of the lambda function.
I'd appreciate your help; thanks for taking a look.
P.S. If this can be done implicitly without using apply, I'd love to hear about it. Thanks.
Your solution appears overcomplicated. Your terminology is non-standard: multi-indexes have levels, so the task is better stated as qcut() by level 0 of the multi-index (rather than in terms of sub-frames, which are not a pandas concept).
Bringing it all back together:
use the **kwargs approach to pass arguments to assign() for all columns in the data frame
groupby(level=0) is as_at_date
transform() to get a row back for every entry in the index
import datetime as dt
import numpy as np
import pandas as pd

s = 12
df = pd.DataFrame({"as_at_date": np.random.choice(pd.date_range(dt.date(2020, 1, 27), periods=3, freq="M"), s),
                   "entity_id": np.random.randint(2900, 3500, s),
                   "A": np.random.random(s),
                   "B": np.random.random(s) * (10 ** np.random.randint(8, 10, s))
                   }).sort_values(["as_at_date", "entity_id"])
df = df.set_index(["as_at_date", "entity_id"])
df2 = df.assign(**{c: df.groupby(level=0)[c].transform(lambda x: pd.qcut(x, 10, labels=False))
                   for c in df.columns})
df
A B
as_at_date entity_id
2020-01-31 2926 0.770121 2.883519e+07
2943 0.187747 1.167975e+08
2973 0.371721 3.133071e+07
3104 0.243347 4.497294e+08
3253 0.591022 7.796131e+08
3362 0.810001 6.438441e+08
2020-02-29 3185 0.690875 4.513044e+08
3304 0.311436 4.561929e+07
2020-03-31 2953 0.325846 7.770111e+08
2981 0.918461 7.594753e+08
3034 0.133053 6.767501e+08
3355 0.624519 6.318104e+07
df2
A B
as_at_date entity_id
2020-01-31 2926 7 0
2943 0 3
2973 3 1
3104 1 5
3253 5 9
3362 9 7
2020-02-29 3185 9 9
3304 0 0
2020-03-31 2953 3 9
2981 9 6
3034 0 3
3355 6 0
Using concat inside an iteration over the original dataframe does the trick, but is there a smarter way to do this?
Thanks
def qcut_sub_index(df_with_sub_index):
    # create empty return value same shape as passed dataframe
    df_return = pd.DataFrame()
    for date, sub_df in df_with_sub_index.groupby(level=0):
        df_return = df_return.append(pd.DataFrame(pd.qcut(sub_df, 10, labels=False,
                                                           duplicates='drop')))
    return df_return

df_x = pd.DataFrame()
for (columnName, columnData) in df_values.iteritems():
    df_x = pd.concat([df_x, qcut_sub_index(columnData)], axis=1, join="outer")
df_x
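Regarding the P.S. about doing this without apply: DataFrameGroupBy.transform applies the function to every column of every group, so the per-column loop can probably be dropped entirely. A minimal sketch, assuming df_values is the original frame with the two-level (as_at_date, entity_id) index:
df_x = df_values.groupby(level=0).transform(
    lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
This returns one decile column per original column in a single call, which is essentially what the assign() version above does column by column.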

How to fix TypeError: data type not understood with a datetime object in Pandas

I am working with a date column in pandas, and I want to have just the year and month as a separate column.
I achieved that by:
df1["month"] = pd.to_datetime(Table_A_df['date']).dt.to_period('M')
Printing it looks like this:
df1["month"]
Out:
0 2017-03
1 2017-03
2 2017-03
3 2017-03
4 2017-03
...
79638 2018-03
79639 2018-03
79640 2018-03
79641 2018-03
79642 2018-03
Name: month, Length: 79643, dtype: period[M]
My customer id looks like this:
0 5094298f068196c5349d43847de5afc9125cf989
1 NaN
2 NaN
3 433fdf385e33176cf9b0d67ecf383aa928fa261c
4 NaN
...
79638 6836d8cdd9c6c537c702b35ccd972fae58070004
79639 bbc08d8abad5e699823f2f0021762797941679be
79640 39b5fdd28cb956053d3e4f3f0b884fb95749da8a
79641 3342d5b210274b01e947cc15531ad53fbe25435b
79642 b3f02d0768c0ba8334047d106eb759f3e80517ac
Name: customer_id, Length: 79643, dtype: object
Now trying to groupby customer id and transform the data.
user_groups = df1.groupby("customer_id")["month"]
df1["Cohort_month"] = user_groups.transform("min")
I get the following error:
TypeError: data type not understood
Complete error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-108-107e17f9a489> in <module>
----> 1 df1["Cohort_month"] = user_groups.transform("min")
C:\Users\Public\Anaconda\lib\site-packages\pandas\core\groupby\generic.py in transform(self, func, *args, **kwargs)
475 # result to the whole group. Compute func result
476 # and deal with possible broadcasting below.
--> 477 result = getattr(self, func)(*args, **kwargs)
478 return self._transform_fast(result, func)
479
C:\Users\Public\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in f(self, **kwargs)
1375 # try a cython aggregation if we can
1376 try:
-> 1377 return self._cython_agg_general(alias, alt=npfunc, **kwargs)
1378 except DataError:
1379 pass
C:\Users\Public\Anaconda\lib\site-packages\pandas\core\groupby\groupby.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
887
888 result, agg_names = self.grouper.aggregate(
--> 889 obj._values, how, min_count=min_count
890 )
891
C:\Users\Public\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in aggregate(self, values, how, axis, min_count)
568 ) -> Tuple[np.ndarray, Optional[List[str]]]:
569 return self._cython_operation(
--> 570 "aggregate", values, how, axis, min_count=min_count
571 )
572
C:\Users\Public\Anaconda\lib\site-packages\pandas\core\groupby\ops.py in _cython_operation(self, kind, values, how, axis, min_count, **kwargs)
560 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
561 elif is_datetimelike and kind == "aggregate":
--> 562 result = result.astype(orig_values.dtype)
563
564 return result, names
TypeError: data type not understood
This was working before, when I had 1 as the day, but when I made it just year and month I started getting this error. Is there a fix for this?
It's working for the sample you shared, so I'm not sure where the issue is; are there any missing values in your month column?
df['month'] = pd.to_datetime(df['month']).dt.to_period('M')
user_groups = df.groupby("customer_id")["month"]
df["Cohort_month"] = user_groups.transform("min")
print(df)
customer_id month Cohort_month
0 5094298f068196c5349d43847de5afc9125cf989 2017-03 2017-03
1 NaN 2017-03 NaT
2 NaN 2017-03 NaT
3 433fdf385e33176cf9b0d67ecf383aa928fa261c 2017-03 2017-03
4 NaN 2017-03 NaT
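If there are no missing values and it still fails, one possibility is that your pandas version's cython "min" aggregation does not cope with the period[M] dtype. A hedged workaround sketch (column names as in your snippet): run the transform on plain timestamps and convert back to monthly periods afterwards.
# Period[M] -> datetime64[ns] for the groupby, then back to Period[M]
month_ts = df1["month"].dt.to_timestamp()
df1["Cohort_month"] = (month_ts.groupby(df1["customer_id"])
                               .transform("min")
                               .dt.to_period("M"))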

Applying a function to a small dataframe: shape mismatch: value array of shape (4,) could not be broadcast

I'm trying to apply an API result to two columns.
My dummy dataframe is below. Unfortunately this is not very reproducible, as I'm using an API with a key and password... this is just to give you an idea of the dimensions.
But I hope maybe somebody can spot an obvious problem.
I am pinging mobile numbers using an API, and recording the information to the columns phone_ping and phone_reason (whether they are active/inactive & other info)
df = pd.DataFrame(columns=['name', 'mobile', 'phone_ping', 'phone_reason'], index=range(0, 4))
df['name'] = ['Jim Bob', 'Joe Bloggs', 'Chuck Norris', 'Jane Doe']
df['mobile'] = ['2222588', '+352654600810', pd.np.nan, '+123456']
print(df)
name mobile phone_ping phone_reason
0 Jim Bob 2222588 NaN NaN
1 Joe Bloggs +352654600810 NaN NaN
2 Chuck Norris NaN NaN NaN
3 Jane Doe +123456 NaN NaN
So the API checks the phone number, and returns a message such as success along with another message such as deliverable. These are the two columns phone_ping and phone_reason that I am filling.
Function below to apply to the df:
def pingmobile(phone):
    # only ping when a phone number is present
    if pd.notnull(phone):
        # Perform GET request
        response = requests.get(url).json()
        # record status and message
        status = response['status']
        message = response['error_text']
        # return it to my two columns
        return pd.DataFrame([(status, message)])
    else:
        # if there is no phone number, just return NaN
        return pd.DataFrame([(pd.np.nan, pd.np.nan)])
Applying the function, which should return the data to columns ['phone_ping','phone_reason']:
df.loc[:,['phone_ping','phone_reason']] = df.loc[:,'mobile'].apply(lambda x: pingmobile(x))
Expected results:
name mobile phone_ping phone_reason
0 Jim Bob 2222588 Fail Invalid Number
1 Joe Bloggs +352654600810 Success Deliverable
2 Chuck Norris NaN NaN NaN
3 Jane Doe +123456 Fail Invalid Number
Error code:
ValueError Traceback (most recent call last)
<ipython-input-318-7e506c1da8e2> in <module>
----> 1 df.loc[:,['phone_ping','phone_reason']] = df.loc[:,'mobile'].apply(lambda x: pingmobile(x))
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
200 key = com.apply_if_callable(key, self.obj)
201 indexer = self._get_setitem_indexer(key)
--> 202 self._setitem_with_indexer(indexer, value)
203
204 def _validate_key(self, key, axis: int):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
577 # actually do the set
578 self.obj._consolidate_inplace()
--> 579 self.obj._data = self.obj._data.setitem(indexer=indexer, value=value)
580 self.obj._maybe_update_cacher(clear=True)
581
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in setitem(self, **kwargs)
558
559 def setitem(self, **kwargs):
--> 560 return self.apply("setitem", **kwargs)
561
562 def putmask(self, **kwargs):
~\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
436 kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy)
437
--> 438 applied = getattr(b, f)(**kwargs)
439 result_blocks = _extend_blocks(applied, result_blocks)
440
~\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in setitem(self, indexer, value)
936 # set
937 else:
--> 938 values[indexer] = value
939
940 # coerce and try to infer the dtypes of the result
ValueError: shape mismatch: value array of shape (4,) could not be broadcast to indexing result of shape (2,4)
Try to return a Series instead of a DataFrame:
def pingmobile(phone):
    if pd.notnull(phone):
        response = requests.get(url).json()
        return pd.Series({"status": response["status"],
                          "message": response["error_text"]})
    return pd.Series({"status": pd.np.nan,
                      "message": pd.np.nan})
Then, just apply your function:
df[["phone_ping", "phone_reason"]] = df.mobile.apply(pingmobile)

Python selecting items by comparing values in a table using dictionary

I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). The second column (sseqid) repeats with different values in the 11th and 12th columns, which are evalue and bitscore, respectively.
The ones that I would like to get are those with the lowest evalue and the highest bitscore (when the evalues are the same; the rest of the columns can be ignored, and the data is down below).
So, I have written a short piece of code which uses the second column as a key for the dictionary. I can get five different items from the second column, with lists of qseqid+evalue and qseqid+bitscore.
Here is the code:
#!usr/bin/python
filename = "data.txt"
readfile = open(filename, "r")
d = dict()
for i in readfile.readlines():
    i = i.strip()
    i = i.split("\t")
    d.setdefault(i[1], []).append([i[0], i[10]])
    d.setdefault(i[1], []).append([i[0], i[11]])
for x in d:
    print(x, d[x])
readfile.close()
But, I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid.
Is there any good logic to solve the problem?
The data.txt file (including the header row, and with » representing tab characters):
qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732
And here's a little more readable version of the table's contents:
0 1 2 3 4 5 6 7 8 9 10 11
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
ACLA_022040 TBB 32.71 431 258 8 39 468 24 423 2.00E-76 240
ACLA_024600 TBB 80 435 87 0 1 435 1 435 0 729
ACLA_031860 TBB 39.74 453 251 3 1 447 1 437 1.00E-121 357
ACLA_046030 TBB 75.81 434 105 0 1 434 1 434 0 704
ACLA_072490 TBB 41.7 446 245 3 4 447 3 435 2.00E-120 353
ACLA_010400 EF1A 27.31 249 127 8 69 286 9 234 3.00E-13 61.6
ACLA_015630 EF1A 22 491 255 17 186 602 3 439 8.00E-19 78.2
ACLA_016510 EF1A 26.23 122 61 4 21 127 9 116 2.00E-08 46.2
ACLA_023300 EF1A 29.31 447 249 12 48 437 3 439 2.00E-45 155
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0 801
ACLA_074730 CALM 23.13 147 101 4 6 143 2 145 7.00E-08 41.2
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1.00E-13 55.1
ACLA_016630 CALM 23.9 159 106 5 58 216 4 147 5.00E-12 51.2
ACLA_031930 RPB2 36.87 1226 633 24 121 1237 26 1219 0 734
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0 1691
ACLA_082370 RPB2 27.69 1228 667 37 31 1132 35 1167 7.00E-110 365
ACLA_061960 ACT 28.57 147 95 5 146 284 69 213 3.00E-12 57.4
ACLA_068200 ACT 28.73 463 231 13 16 471 4 374 1.00E-53 176
ACLA_069960 ACT 24.11 141 97 4 581 718 242 375 9.00E-09 46.2
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0 732
Since you're a Python newbie, I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library, which makes working with tabular data much simpler.
Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too.
import pandas as pd
df = pd.read_csv("acla1.dat", sep="\t")
df = df.sort_values(["evalue", "bitscore"], ascending=[True, False])
df_new = df.groupby("sseqid", as_index=False).first()
which produces
>>> df_new
sseqid qseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
0 ACT ACLA_095800 91.73 375 31 0 1 375 1 375 0.000000e+00 732.0
1 CALM ACLA_096170 29.33 150 96 4 34 179 2 145 1.000000e-13 55.1
2 EF1A ACLA_028450 85.55 443 63 1 1 443 1 442 0.000000e+00 801.0
3 RPB2 ACLA_065630 65.79 1257 386 14 1 1252 4 1221 0.000000e+00 1691.0
4 TBB ACLA_024600 80.00 435 87 0 1 435 1 435 0.000000e+00 729.0
Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
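For comparison, an equivalent formulation (just a sketch, assuming the same tab-separated data.txt from the question): after the same sort, drop_duplicates keeps only the first row per sseqid, which by construction is the one with the lowest evalue and, among ties, the highest bitscore.
import pandas as pd
df = pd.read_csv("data.txt", sep="\t")
best = (df.sort_values(["evalue", "bitscore"], ascending=[True, False])
          .drop_duplicates("sseqid")
          .sort_values("sseqid"))
print(best)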
#!usr/bin/python
import csv

DATA = "data.txt"

class Sequence:
    def __init__(self, row):
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        return (
            "{qseqid}\t"
            "{sseqid}\t"
            "{pident}\t"
            "{length}\t"
            "{mismatch}\t"
            "{gapopen}\t"
            "{qstart}\t"
            "{qend}\t"
            "{sstart}\t"
            "{send}\t"
            "{evalue}\t"
            "{bitscore}"
        ).format(**self.__dict__)

def entries(fname, header_rows=1, dtype=list, **kwargs):
    with open(fname) as inf:
        incsv = csv.reader(inf, **kwargs)
        # skip header rows
        for i in range(header_rows):
            next(incsv)
        for row in incsv:
            yield dtype(row)

def main():
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # see if a sequence with the same sseqid already exists
        prev = bestseq.get(seq.sseqid, None)
        if (
            prev is None
            or seq.evalue < prev.evalue
            or (seq.evalue == prev.evalue and seq.bitscore > prev.bitscore)
        ):
            bestseq[seq.sseqid] = seq
    # display selected sequences
    keys = sorted(bestseq)
    for key in keys:
        print(bestseq[key])

if __name__ == "__main__":
    main()
which results in
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0.0 732.0
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1e-13 55.1
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0.0 801.0
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0.0 1691.0
ACLA_024600 TBB 80.0 435 87 0 1 435 1 435 0.0 729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and is less awkward-looking than the usual dict['fieldname'] syntax otherwise required.
import csv
from collections import defaultdict, namedtuple
from itertools import imap
from operator import itemgetter

data_file_name = 'data.txt'
DELIMITER = '\t'
ssqeid_dict = defaultdict(list)

# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else
                  (itemgetter(col.strip()), 1)) for col in columns]
    def comparer(left, right):
        for fn, mult in comparers:
            result = cmp(fn(left), fn(right))
            if result:
                return mult * result
        else:
            return 0
    return sorted(items, cmp=comparer)

# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

with open(data_file_name, 'rb') as data_file:
    reader = csv.DictReader(data_file, delimiter=DELIMITER)
    format_spec = '\t'.join([('{%s}' % field) for field in reader.fieldnames])
    for rec in (AttrDict(r) for r in reader):
        # Convert the two sort fields to numeric values for proper ordering.
        rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
        ssqeid_dict[rec.sseqid].append(rec)

for ssqeid in sorted(ssqeid_dict):
    # Sort each group of recs with same ssqeid. The first record after sorting
    # will be the one sought that has the lowest evalue and highest bitscore.
    selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
    print format_spec.format(**selected)
Output (» represents tabs):
ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
readfile = open(filename, 'r')
d = dict()
sseqid = []
lines = []
for i in readfile.readlines():
    sseqid.append(i.rsplit()[1])
    lines.append(i.rsplit())
sorted_sseqid = sorted(set(sseqid))
sdqDict = {}
key = None
for sorted_ssqd in sorted_sseqid:
    key = sorted_ssqd
    evalue = []
    bitscore = []
    qseid = []
    for line in lines:
        if key in line:
            evalue.append(line[10])
            bitscore.append(line[11])
            qseid.append(line[0])
    sdqDict[key] = [qseid, evalue, bitscore]
print sdqDict
print 'TBB LOWEST EVALUE' + '---->' + min(sdqDict['TBB'][1])
## I think you can do the list manipulation below to find out the qseqid
readfile.close()

Pandas Merge Error: MemoryError

Problem:
I'm trying to merge two relatively small datasets together, but the merge raises a MemoryError. I have two datasets of aggregates of country trade data that I'm trying to merge on the keys year and country, so the data needs to be placed in a particular way. This unfortunately makes the use of concat and its performance benefits impossible, as seen in the answer to this question: MemoryError on large merges with pandas in Python.
Here's the setup:
The attempted merge:
df = merge(df, i, left_on=['year', 'ComTrade_CC'], right_on=["Year","Partner Code"])
Basic data structure:
i:
Year Reporter_Code Trade_Flow_Code Partner_Code Classification Commodity Code Quantity Unit Code Supplementary Quantity Netweight (kg) Value Estimation Code
0 2003 381 2 36 H2 070951 8 1274 1274 13810 0
1 2003 381 2 36 H2 070930 8 17150 17150 30626 0
2 2003 381 2 36 H2 0709 8 20493 20493 635840 0
3 2003 381 1 36 H2 0507 8 5200 5200 27619 0
4 2003 381 1 36 H2 050400 8 56439 56439 683104 0
df:
mporter cod CC ComTrade_CC Distance_miles
0 110 215 215 757 428.989
1 110 215 215 757 428.989
2 110 215 215 757 428.989
3 110 215 215 757 428.989
4 110 215 215 757 428.989
Error Traceback:
MemoryError Traceback (most recent call last)
<ipython-input-10-8d6e9fb45de6> in <module>()
1 for i in c_list:
----> 2 df = merge(df, i, left_on=['year', 'ComTrade_CC'], right_on=["Year","Partner Code"])
/usr/local/lib/python2.7/dist-packages/pandas-0.12.0rc1_309_g9fc8636-py2.7-linux-x86_64.egg/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
36 right_index=right_index, sort=sort, suffixes=suffixes,
37 copy=copy)
---> 38 return op.get_result()
39 if __debug__:
40 merge.__doc__ = _merge_doc % '\nleft : DataFrame'
/usr/local/lib/python2.7/dist-packages/pandas-0.12.0rc1_309_g9fc8636-py2.7-linux-x86_64.egg/pandas/tools/merge.pyc in get_result(self)
193 copy=self.copy)
194
--> 195 result_data = join_op.get_result()
196 result = DataFrame(result_data)
197
/usr/local/lib/python2.7/dist-packages/pandas-0.12.0rc1_309_g9fc8636-py2.7-linux-x86_64.egg/pandas/tools/merge.pyc in get_result(self)
693 if klass in mapping:
694 klass_blocks.extend((unit, b) for b in mapping[klass])
--> 695 res_blk = self._get_merged_block(klass_blocks)
696
697 # if we have a unique result index, need to clear the _ref_locs
/usr/local/lib/python2.7/dist-packages/pandas-0.12.0rc1_309_g9fc8636-py2.7-linux-x86_64.egg/pandas/tools/merge.pyc in _get_merged_block(self, to_merge)
706 def _get_merged_block(self, to_merge):
707 if len(to_merge) > 1:
--> 708 return self._merge_blocks(to_merge)
709 else:
710 unit, block = to_merge[0]
/usr/local/lib/python2.7/dist-packages/pandas-0.12.0rc1_309_g9fc8636-py2.7-linux-x86_64.egg/pandas/tools/merge.pyc in _merge_blocks(self, merge_chunks)
728 # Should use Fortran order??
729 block_dtype = _get_block_dtype([x[1] for x in merge_chunks])
--> 730 out = np.empty(out_shape, dtype=block_dtype)
731
732 sofar = 0
MemoryError:
Thanks for your thoughts!
In case anyone coming across this question still has similar trouble with merge, you can probably get concat to work by renaming the relevant columns in the two dataframes to the same names, setting them as a MultiIndex (i.e. df = df.set_index(['A','B'])), and then using concat to join them.
UPDATE
Example:
df1 = pd.DataFrame({'A':[1, 2], 'B':[2, 3], 'C':[3, 4]})
df2 = pd.DataFrame({'A':[1, 2], 'B':[2, 3], 'D':[7, 8]})
both = pd.concat([df1.set_index(['A','B']), df2.set_index(['A','B'])], axis=1).reset_index()
df1
A B C
0 1 2 3
1 2 3 4
df2
A B D
0 1 2 7
1 2 3 8
both
A B C D
0 1 2 3 7
1 2 3 4 8
I haven't benchmarked the performance of this approach, but it didn't get the memory error and worked for my applications.
