creating a dataframe using lists - python

I am trying to create a dataframe that looks like my Excel sheet, but I can't figure out how to do so. Here is the code I am attempting to use:
import pandas as pd
ls_super = ['Supernatant',total_volume_super_ml, Total_activity_super, total_protein_super_mg, specific_activity_sup, 100, 1]
df3 = pd.DataFrame(ls_super, columns =['Sample','Total Volume','Total activity','Total protein','Specific Activity','percent yield,','purification'
])
Here is the error message:
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_125580/4224246098.py in <module>
20
21 # list of strings
---> 22 df3 = pd.DataFrame(ls_super, columns =['Sample','Total Volume','Total activity','Total protein','Specific Activity','percent yield,','purification'
23 ])
24
~/.local/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
709 )
710 else:
--> 711 mgr = ndarray_to_mgr(
712 data,
713 index,
~/.local/lib/python3.8/site-packages/pandas/core/internals/construction.py in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
322 )
323
--> 324 _check_values_indices_shape_match(values, index, columns)
325
326 if typ == "array":
~/.local/lib/python3.8/site-packages/pandas/core/internals/construction.py in _check_values_indices_shape_match(values, index, columns)
391 passed = values.shape
392 implied = (len(index), len(columns))
--> 393 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
394
395
ValueError: Shape of passed values is (7, 1), indices imply (7, 7)

Problem: the DataFrame() constructor insists on interpreting the one-dimensional Python list ls_super as 1 column and 7 rows ("Shape of passed values is (7, 1)"), as opposed to 1 row with 7 columns (which would have shape (1, 7)).
Solution: add a second set of brackets ([]) around your definition of the ls_super list. In other words, make ls_super a two-dimensional list. The DataFrame constructor then sees a single value in the first dimension, and seven values in the second dimension, producing the desired shape of (1, 7).
ls_super = [['Supernatant',1, 2, 3, 4, 100, 1]]
df3 = pd.DataFrame(ls_super,
columns=['Sample', 'Total Volume', 'Total activity', 'Total protein', 'Specific Activity', 'percent yield', 'purification'])
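With the placeholder values 1, 2, 3, 4 standing in for the measured quantities, this now yields the desired single-row frame, roughly:
>>> df3
        Sample  Total Volume  Total activity  Total protein  Specific Activity  percent yield  purification
0  Supernatant             1               2              3                  4            100             1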

import pandas as pd
first_row_list = [
"Supernatant",
total_volume_super_ml,
Total_activity_super,
total_protein_super_mg,
specific_activity_sup,
100,
1
]
columns = [
"Sample",
"Total Volume",
"Total Activity",
"Total protein",
"Specific Activity",
"percent yield",
"purification"
]
# wrap each value in a one-element list so every column holds a single row
d = dict(zip(columns, [[f] for f in first_row_list]))
df = pd.DataFrame(d)
or
d = {'Sample': ["Supernatant"],
'Total Volume': [total_volume_super_ml],
'Total Activity': [Total_activity_super],
'Total protein': [total_protein_super_mg],
'Specific Activity': [specific_activity_sup],
'percent yield': [100],
'purification': [1]}
df = pd.DataFrame(d)
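If wrapping every value in a one-element list feels noisy, a dict of plain scalars works too, as long as you pass an explicit index so pandas knows there is one row. A minimal sketch, reusing the same (hypothetical) variables:
d = {'Sample': "Supernatant",
     'Total Volume': total_volume_super_ml,
     'Total Activity': Total_activity_super,
     'Total protein': total_protein_super_mg,
     'Specific Activity': specific_activity_sup,
     'percent yield': 100,
     'purification': 1}
df = pd.DataFrame(d, index=[0])  # index=[0] declares a single row of scalars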

In pandas, the DataFrame input can be a list of lists:
import pandas as pd
from random import uniform
total_volume_super_ml = uniform(0,1)
Total_activity_super = uniform(0,1)
total_protein_super_mg = uniform(0,1)
specific_activity_sup = uniform(0,1)
ls_super = ['Supernatant',total_volume_super_ml, Total_activity_super, total_protein_super_mg, specific_activity_sup, 100, 1]
df3 = pd.DataFrame([ls_super], columns =['Sample','Total Volume','Total activity','Total protein','Specific Activity','percent yield','purification'])
df3

Related

Pandas: assign a column value based on a different column's value in the previous row

I have a df like this:
  Product   Month  Order  Supply  Available
0  xx-xxx  202107    718    1531        813
1  xx-xxx  202108    668       0       -668
2  xx-xxx  202109   5030    2161      -2869
3  xx-xxx  202110    667       0       -667
and the resultDF I want needs to look like this:
  Product   Month  Order  Supply  Available
0  xx-xxx  202107    718    1531        813
1  xx-xxx  202108    668     813        145
2  xx-xxx  202109   5030    2306      -2724
3  xx-xxx  202110    667   -2724      -3391
So, except for the first row, I want the Supply value to be the previous row's Available value (from resultDF) added to the current Supply value (from df), and Available to then be Supply minus Order. E.g. for row 3 in resultDF, Supply (2306) is generated by adding Available (145, row 2) from resultDF to Supply (2161, row 3) from df, and Available is then simply Supply - Order. Can anyone help me with how to generate resultDF?
Use cumsum: the running total of Supply minus the running total of Order is exactly the Available column, and Supply is then just Available plus Order.
df["Available"] = df["Supply"].cumsum() - df["Order"].cumsum()
df["Supply"] = df["Available"] + df["Order"]
>>> df
product Month Order Supply Available
0 xx-xxx 202107 718 1531.0 813.0
1 None 202108 668 813.0 145.0
2 None 202109 5030 2306.0 -2724.0
3 None 202110 667 -2724.0 -3391.0
Use cumsum to compute the right values, assuming:
you want the fix applied per product
your rows are already ordered by (Product, Month)
# Setup
data = {'Product': ['xx-xxx', 'xx-xxx', 'xx-xxx', 'xx-xxx'],
'Month': [202107, 202108, 202109, 202110],
'Order': [718, 668, 5030, 667],
'Supply': [1531, 0, 2161, 0],
'Available': [813, -668, -2869, -667]}
df = pd.DataFrame(data)
df[['Supply', 'Available']] = df.groupby('Product').apply(
    lambda x: pd.DataFrame({
        'Supply': x['Order'] + x['Supply'].cumsum() - x['Order'].cumsum(),
        'Available': x['Supply'].cumsum() - x['Order'].cumsum()
    }))
Output:
>>> df
Product Month Order Supply Available
0 xx-xxx 202107 718 1531 813
1 xx-xxx 202108 668 813 145
2 xx-xxx 202109 5030 2306 -2724
3 xx-xxx 202110 667 -2724 -3391
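The apply above returns one frame per group; the same numbers can be had with grouped cumulative sums alone. A minimal sketch, assuming the Setup data above:
# running totals of Supply and Order within each product
cum = df.groupby('Product')[['Supply', 'Order']].cumsum()
df['Available'] = cum['Supply'] - cum['Order']  # running supply minus running orders
df['Supply'] = df['Available'] + df['Order']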

How can I speed up this iteration?

I have a dataframe with over ten million rows containing 2 columns 'left_index' and 'right_index'.
'left_index' is the index of a value and 'right_index' contains indexes of rows that have a possible match.
The problem is that this contains duplicate matches (Ex: 0,1 and 1,0).
I want to filter this dataframe and only keep one combination of each match.
I'm using a list here for an example.
In: [(0,1), (1,0), (3,567)]
Out: [(0,1), (3, 567)]
The code below produces what I want, but it is very slow. Is there a faster way to solve this?
lst2 = []
for i in lst1:
    if i in lst2:
        lst1.remove(i)
    else:
        lst2.append((i[1], i[0]))
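Two things make this quadratic: the i in lst2 membership test scans a list, and lst1 is mutated while being iterated, which can also skip elements. A minimal pure-Python fix, as a sketch, canonicalizes each pair and tracks what has been seen in a set:
seen = set()
result = []
for pair in lst1:
    key = tuple(sorted(pair))  # (0, 1) and (1, 0) map to the same key
    if key not in seen:
        seen.add(key)
        result.append(pair)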
Using numpy to keep the first occurrence of a non-unique array:
import numpy as np
lst1 = [(1,0), (0,1), (2, 5), (3,567), (5,2)]
arr = np.array(lst1)
# sort within each pair so (1, 0) and (0, 1) become identical rows, then keep
# the index of the first occurrence of each unique row
result = arr[np.unique(np.sort(arr), return_index=True, axis=0)[1]]
>>> result
array([[  1,   0],
       [  2,   5],
       [  3, 567]])
I believe pandas saves you from using a loop.
import pandas as pd
df = pd.DataFrame([
[(0, 0), (0, 0), 123],
[(0, 0), (0, 1), 234],
[(1, 0), (0, 1), 345],
[(1, 1), (0, 1), 456],
], columns=['left_index', 'right_index', 'value'])
print(df)
left_index right_index value
0 (0, 0) (0, 0) 123
1 (0, 0) (0, 1) 234
2 (1, 0) (0, 1) 345
3 (1, 1) (0, 1) 456
df['left_index_set'] = df['left_index'].apply(set)
df['right_index_set'] = df['right_index'].apply(set)
I am not sure what you need after this point. If you want to filter out the duplicates, do the following:
df = df[df['left_index_set'] != df['right_index_set']]
df_final1= df[['left_index', 'right_index', 'value']]
print(df_final1)
left_index right_index value
1 (0, 0) (0, 1) 234
3 (1, 1) (0, 1) 456
However, if you do not want to filter the dataframe but to modify it:
df.loc[df['left_index_set'] != df['right_index_set'], 'right_index'] = None # None, '' or what you want. It's up to you
df_final2 = df[['left_index', 'right_index', 'value']]
print(df_final2)
left_index right_index value
0 (0, 0) (0, 0) 123
1 (0, 0) None 234
2 (1, 0) (0, 1) 345
3 (1, 1) None 456
You mention the data is in a dataframe and the question is tagged pandas, so we can let numpy do this work for us using vectorization.
First, since you did not provide a way to create the data, I generated a dataframe per your description using:
import numpy as np
import pandas
def build_dataframe():
    def rand_series():
        """Create a series of 1 million random integers in range [0, 9999]."""
        return (np.random.rand(1000000) * 10000).astype('int')
    data = pandas.DataFrame({
        'left_index': rand_series(),
        'right_index': rand_series()
    })
    return data.set_index('left_index')
data = build_dataframe()
Since (0,1) is the same as (1,0) per your requirements, let's just create an index that holds the values in sorted order. First, create two new columns with the minimum and maximum of the left and right indices:
data['min_index'] = np.minimum(data.index, data.right_index)
data['max_index'] = np.maximum(data.index, data.right_index)
print(data)
right_index min_index max_index
left_index
4270 438 438 4270
1277 9378 1277 9378
20 7080 20 7080
4646 6623 4646 6623
3280 4481 3280 4481
... ... ... ...
3656 2492 2492 3656
2345 210 210 2345
9241 1934 1934 9241
369 8362 369 8362
5251 6047 5251 6047
[1000000 rows x 3 columns]
Then we can reset the index to these two new columns (really we just want a multi-index, and this is one way of getting it for us).
data = data.reset_index().set_index(keys=['min_index', 'max_index'])
print(data)
left_index right_index
min_index max_index
438 4270 4270 438
1277 9378 1277 9378
20 7080 20 7080
4646 6623 4646 6623
3280 4481 3280 4481
... ... ...
2492 3656 3656 2492
210 2345 2345 210
1934 9241 9241 1934
369 8362 369 8362
5251 6047 5251 6047
[1000000 rows x 2 columns]
Then we just want the unique values of the index. This is the operation that takes the most time, but should still be significantly faster than the naive implementation using lists.
unique = data.index.unique()
print (unique)
MultiIndex([( 438, 4270),
(1277, 9378),
( 20, 7080),
(4646, 6623),
(3280, 4481),
(4410, 9367),
(1864, 7881),
( 516, 3287),
(1678, 6946),
(1253, 7890),
...
(6669, 9527),
(1095, 8866),
( 455, 7800),
(2862, 8587),
(8221, 9808),
(2492, 3656),
( 210, 2345),
(1934, 9241),
( 369, 8362),
(5251, 6047)],
names=['min_index', 'max_index'], length=990197)
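Since the data is already a dataframe, the same sort-then-deduplicate idea can also be written with drop_duplicates. A sketch, assuming plain integer columns named as in the question ('lo' and 'hi' are hypothetical helper columns):
import numpy as np
# canonicalize each pair so (0, 1) and (1, 0) compare equal
df[['lo', 'hi']] = np.sort(df[['left_index', 'right_index']].values, axis=1)
deduped = df.drop_duplicates(subset=['lo', 'hi'])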

Wrong number of items and mismatched shape of passed values when merging dtype('O') and 'float64' arrays into a dataframe

I expect a dataframe with 111 columns and one row, where the column names come from column1 and the first row comes from column2.
Here's my data column1
Index(['Col 1', 'Col 2', 'Col 3',
...
'Col 110', 'Col 111'],
dtype='object', length=111)
Here's my data column2
array([0.06308672, 0.02825317, 0.05428668, 0.00368413, 0.00282469,
...
0.00129729])
Additional information:
column1.shape is (111,)
column1.dtype is dtype('O')
column2.shape is (111,)
column2.dtype is dtype('float64')
Here's what I did
pd.DataFrame(column2, columns = column1)
And the result is
ValueError Traceback (most recent call last)
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in create_block_manager_from_blocks(blocks, axes)
1661 blocks = [
-> 1662 make_block(values=blocks[0], placement=slice(0, len(axes[0])))
1663 ]
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/blocks.py in make_block(values, placement, klass, ndim, dtype)
2721
-> 2722 return klass(values, ndim=ndim, placement=placement)
2723
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/blocks.py in __init__(self, values, placement, ndim)
129 if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values):
--> 130 raise ValueError(
131 f"Wrong number of items passed {len(self.values)}, "
ValueError: Wrong number of items passed 1, placement implies 111
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-226-5a79c428cd8e> in <module>
----> 1 pd.DataFrame(column2, columns = column1)
/opt/conda/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
495 mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
496 else:
--> 497 mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
498
499 # For data is list-like, or Iterable (will consume into list)
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/construction.py in init_ndarray(values, index, columns, dtype, copy)
232 block_values = [values]
233
--> 234 return create_block_manager_from_blocks(block_values, [columns, index])
235
236
/opt/conda/lib/python3.8/site-packages/pandas/core/internals/managers.py in create_block_manager_from_blocks(blocks, axes)
1670 blocks = [getattr(b, "values", b) for b in blocks]
1671 tot_items = sum(b.shape[0] for b in blocks)
-> 1672 raise construction_error(tot_items, blocks[0].shape[1:], axes, e)
1673
1674
ValueError: Shape of passed values is (111, 1), indices imply (111, 111)
You should pass column1 as the index parameter; specifying it in columns is what throws that error. Try this:
df = pd.DataFrame(column2 , index = column1)
The above code stores column1 as the index; if you don't want that and prefer proper column names, try this:
df = pd.DataFrame(column2 , index = column1).reset_index()
df.columns = ['column1', 'column2']
You need brackets around column2:
df = pd.DataFrame([column2], columns = column1)
print(df)
Col 1 Col 2 Col 3 Col 4 Col 5 Col 6 Col 7 ... Col 105 Col 106 Col 107 Col 108 Col 109 Col 110 Col 111
0 0.933107 0.932057 0.681201 0.421621 0.159919 0.333099 0.79994 ... 0.540713 0.882822 0.372501 0.632753 0.684067 0.105904 0.492178
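Equivalently, you can reshape the 1-D array into one row of 111 columns before handing it to the constructor; a sketch:
df = pd.DataFrame(column2.reshape(1, -1), columns=column1)  # shape (1, 111) matches the 111 column labels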

How to split several columns' data in pandas?

I have a dataframe which looks like this:
df = pd.DataFrame({'hard': [['525', '21']], 'soft': [['1525', '221']], 'set': [['5245', '271']], 'purch': [['925', '201']], \
'mont': [['555', '621']], 'gest': [['536', '251']], 'memo': [['825', '241']], 'raw': [['532', '210']]})
df
Out:
gest hard memo mont purch raw set soft
0 [536, 251] [525, 21] [825, 241] [555, 621] [925, 201] [532, 210] [5245, 271] [1525, 221]
I need to split all of the columns like this:
df1 = pd.DataFrame()
df1['gest_pos'] = df.gest.str[0].astype(int)
df1['gest_size'] = df.gest.str[1].astype(int)
df1['hard_pos'] = df.hard.str[0].astype(int)
df1['hard_size'] = df.hard.str[1].astype(int)
df1
gest_pos gest_size hard_pos hard_size
0 536 251 525 21
I have more than 70 columns, and my method takes a lot of space and time. Is there an easier way to do this?
Thanks!
Different approach:
df2 = pd.DataFrame()
for column in df:
    df2['{}_pos'.format(column)] = df[column].str[0].astype(int)
    df2['{}_size'.format(column)] = df[column].str[1].astype(int)
print(df2)
You can use nested list comprehension with flattening and then create new DataFrame by constructor:
L = [[y for x in z for y in x] for z in df.values.tolist()]
#if want filter first 2 values per each list
#L = [[y for x in z for y in x[:2]] for z in df.values.tolist()]
#https://stackoverflow.com/a/45122198/2901002
def mygen(lst):
    for item in lst:
        yield item + '_pos'
        yield item + '_size'
df = pd.DataFrame(L, columns = list(mygen(df.columns))).astype(int)
print (df)
hard_pos hard_size soft_pos soft_size set_pos set_size purch_pos purch_size \
0 525 21 1525 221 5245 271 925 201
mont_pos mont_size gest_pos gest_size memo_pos memo_size raw_pos raw_size
0 555 621 536 251 825 241 532 210
You can use NumPy operations to construct your list of columns and flatten out your series of lists:
import numpy as np
from itertools import chain
# create column label array
cols = np.repeat(df.columns, 2).values
cols[::2] += '_pos'
cols[1::2] += '_size'
# create data array
arr = np.array([list(chain.from_iterable(i)) for i in df.values]).astype(int)
# combine with pd.DataFrame constructor
res = pd.DataFrame(arr, columns=cols)
Result:
print(res)
gest_pos gest_size hard_pos hard_size memo_pos memo_size mont_pos \
0 536 251 525 21 825 241 555
mont_size purch_pos purch_size raw_pos raw_size set_pos set_size \
0 621 925 201 532 210 5245 271
soft_pos soft_size
0 1525 221
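For completeness, the loop approach further up can be collapsed into a single dict comprehension; a sketch, assuming every cell holds a two-element list of digit strings:
df1 = pd.DataFrame({f'{col}_{suffix}': df[col].str[i].astype(int)
                    for col in df.columns
                    for i, suffix in enumerate(('pos', 'size'))})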

compute the difference of all possible rows

Based on a selection ds of a dataframe d with:
{'x': d.x, 'y': d.y, 'a': d.a, 'b': d.b, 'c': d.c, 'n': d.n}
Having n rows, x ranges from 0 to n-1. The column n is needed since it's a selection and indices need to be kept for a later query.
How do you efficiently compute the difference between each pair of rows (e.g. a_0, a_1, etc.) for each column (a, b, c) without losing the row information (e.g. a new column holding the indices of the rows that were used)?
MWE
Sample selection ds:
x y a b c n
554.607085 400.971878 9789 4151 6837 146
512.231450 405.469524 8796 3811 6596 225
570.427284 694.369140 1608 2019 2097 291
Desired output:
dist: the Euclidean distance, math.hypot(x2 - x1, y2 - y1)
da, db, dc: absolute differences per column, e.g. for da: np.abs(a1 - a2)
ns: a string combining the n values of the two rows used
the result would look like:
dist da db dc ns
42.61365102824963 993 340 241 146-225
293.82347069813255 8181 2132 4740 146-291
.. .. .. .. 225-291
You can use itertools.combinations() to generate the pairs:
Read data first:
import pandas as pd
from io import StringIO
import numpy as np
text = """ x y a b c n
554.607085 400.971878 9789 4151 6837 146
512.231450 405.469524 8796 3811 6596 225
570.427284 694.369140 1608 2019 2097 291"""
df = pd.read_csv(StringIO(text), delim_whitespace=True)
Create the index and calculate the results:
from itertools import combinations
index = np.array(list(combinations(range(df.shape[0]), 2)))
df1, df2 = [df.iloc[idx].reset_index(drop=True) for idx in index.T]
res = pd.concat([
np.hypot(df1.x - df2.x, df1.y - df2.y),
df1[["a", "b", "c"]] - df2[["a", "b", "c"]],
df1.n.astype(str) + "-" + df2.n.astype(str)
], axis=1)
res.columns = ["dist", "da", "db", "dc", "ns"]
res
the output:
dist da db dc ns
0 42.613651 993 340 241 146-225
1 293.823471 8181 2132 4740 146-291
2 294.702805 7188 1792 4499 225-291
This approach makes good use of pandas and the underlying numpy capabilities, but the matrix manipulations are a little hard to keep track of. Note that pd.Panel was deprecated in pandas 0.20 and removed in 1.0, so this only runs on older pandas versions:
import pandas as pd, numpy as np
ds = pd.DataFrame(
[
[554.607085, 400.971878, 9789, 4151, 6837, 146],
[512.231450, 405.469524, 8796, 3811, 6596, 225],
[570.427284, 694.369140, 1608, 2019, 2097, 291]
],
columns = ['x', 'y', 'a', 'b', 'c', 'n']
)
def concat_str(*arrays):
    result = arrays[0]
    for arr in arrays[1:]:
        result = np.core.defchararray.add(result, arr)
    return result
# Make a panel with one item for each column, with a square data frame for
# each item, showing the differences between all row pairs.
# This creates perpendicular matrices of values based on the underlying numpy arrays;
# then numpy broadcasts them along the missing axis when calculating the differences
p = pd.Panel(
    (ds.values[np.newaxis,:,:] - ds.values[:,np.newaxis,:]).transpose(),
    items=['d'+c for c in ds.columns], major_axis=ds.index, minor_axis=ds.index
)
# calculate euclidian distance
p['dist'] = np.hypot(p['dx'], p['dy'])
# create strings showing row relationships
p['ns'] = concat_str(ds['n'].values.astype(str)[:,np.newaxis], '-', ds['n'].values.astype(str)[np.newaxis,:])
# remove unneeded items
del p['dx'], p['dy'], p['dn']
# convert to frame
diffs = p.to_frame().reindex_axis(['dist', 'da', 'db', 'dc', 'ns'], axis=1)
diffs
This gives:
dist da db dc ns
major minor
0 0 0.000000 0 0 0 146-146
1 42.613651 993 340 241 146-225
2 293.823471 8181 2132 4740 146-291
1 0 42.613651 -993 -340 -241 225-146
1 0.000000 0 0 0 225-225
2 294.702805 7188 1792 4499 225-291
2 0 293.823471 -8181 -2132 -4740 291-146
1 294.702805 -7188 -1792 -4499 291-225
2 0.000000 0 0 0 291-291
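On modern pandas, where pd.Panel no longer exists, the same all-pairs table can be built with a cross join (how='cross' requires pandas >= 1.2). A sketch against the ds frame above:
import numpy as np
import pandas as pd
# pair every row with every other row, then keep each unordered pair once
pairs = ds.merge(ds, how='cross', suffixes=('_1', '_2'))
pairs = pairs[pairs['n_1'] < pairs['n_2']]
res = pd.DataFrame({
    'dist': np.hypot(pairs['x_1'] - pairs['x_2'], pairs['y_1'] - pairs['y_2']),
    'da': (pairs['a_1'] - pairs['a_2']).abs(),
    'db': (pairs['b_1'] - pairs['b_2']).abs(),
    'dc': (pairs['c_1'] - pairs['c_2']).abs(),
    'ns': pairs['n_1'].astype(str) + '-' + pairs['n_2'].astype(str),
})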
