How to append data efficiently into a pandas dataframe using a for loop - python

I am trying to get data from a Python script, store it in a list, and then create a dataframe out of it.
But this creates a different DataFrame for each item in the for loop; how do I avoid that and create a single DataFrame?
Code:
from __future__ import (absolute_import, division, print_function)
import getpass
import ssl
from pyVim.connect import SmartConnect
from pyVmomi import vim
from ssl import CERT_NONE, PROTOCOL_TLSv1_2, SSLContext
import pandas as pd
import numpy as np
from tabulate import tabulate
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('expand_frame_repr', False)
s = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
s.verify_mode = ssl.CERT_NONE
userid = input("please enter your wbi userid (ex. abx#example.com):")
p = getpass.getpass("password:")
vcenter = ["vcenter-oracle.com", "vcenter.simplivity.com"]
for instance in vcenter:
    try:
        c = SmartConnect(host=instance, user=userid, pwd=p, sslContext=s)
    except Exception as e:
        print(e)
        continue
    content = c.content
    obj_ds = content.viewManager.CreateContainerView(content.rootFolder, [vim.Datastore], True)
    # Lists
    a_list = []
    b_list = []
    c_list = []
    d_list = []
    # for loop
    for z in obj_ds.view:
        a_list.append(instance)
        b_list.append(z)
        c_list.append(int(z.summary.capacity / (1024 * 1024 * 1024)))
        d_list.append(int(z.summary.freeSpace / (1024 * 1024 * 1024)))
    # dataframe
    df = pd.DataFrame({'A': a_list, 'B': b_list, 'C': c_list, 'D': d_list})
I also tried the following for the for-loop section, but the result is the same:
raw_data = []
for z in obj_ds.view:
    vc_data = instance, z, int(z.summary.capacity / (1024 * 1024 * 1024)), int(z.summary.freeSpace / (1024 * 1024 * 1024))
    raw_data.append(vc_data)
df = pd.DataFrame(raw_data, columns=['Vcenter', 'DS', 'TDS', 'FDS'])
print(df)
Output:
A B C D
0 vcenter-oracle.com 'vim.Datastore:datastore-357' 439 430
1 vcenter-oracle.com 'vim.Datastore:datastore-311' 439 430
2 vcenter-oracle.com 'vim.Datastore:datastore-306' 439 430
3 vcenter-oracle.com 'vim.Datastore:datastore-262' 20480 7030
4 vcenter-oracle.com 'vim.Datastore:datastore-356' 439 430
5 vcenter-oracle.com 'vim.Datastore:datastore-465' 52 46
6 vcenter-oracle.com 'vim.Datastore:datastore-94' 5836 1850
7 vcenter-oracle.com 'vim.Datastore:datastore-122' 11646 3592
8 vcenter-oracle.com 'vim.Datastore:datastore-89' 52 46
9 vcenter-oracle.com 'vim.Datastore:datastore-83' 52 46
10 vcenter-oracle.com 'vim.Datastore:datastore-149' 52 46
A B C D
0 vcenter.simplivity.com 'vim.Datastore:datastore-143230' 1945 501
1 vcenter.simplivity.com 'vim.Datastore:datastore-52354' 5120 2096
2 vcenter.simplivity.com 'vim.Datastore:datastore-142927' 274 271
3 vcenter.simplivity.com 'vim.Datastore:datastore-143231' 2048 987
4 vcenter.simplivity.com 'vim.Datastore:datastore-878' 553 549
5 vcenter.simplivity.com 'vim.Datastore:datastore-877' 553 552
6 vcenter.simplivity.com 'vim.Datastore:datastore-74327' 1500 949
7 vcenter.simplivity.com 'vim.Datastore:datastore-142929' 274 271
8 vcenter.simplivity.com 'vim.Datastore:datastore-708' 4677 1933
Expected:
A B C D
0 vcenter-oracle.com 'vim.Datastore:datastore-357' 439 430
1 vcenter-oracle.com 'vim.Datastore:datastore-311' 439 430
2 vcenter-oracle.com 'vim.Datastore:datastore-306' 439 430
3 vcenter-oracle.com 'vim.Datastore:datastore-262' 20480 7030
4 vcenter-oracle.com 'vim.Datastore:datastore-356' 439 430
5 vcenter-oracle.com 'vim.Datastore:datastore-465' 52 46
6 vcenter-oracle.com 'vim.Datastore:datastore-94' 5836 1850
7 vcenter-oracle.com 'vim.Datastore:datastore-122' 11646 3592
8 vcenter-oracle.com 'vim.Datastore:datastore-89' 52 46
9 vcenter-oracle.com 'vim.Datastore:datastore-83' 52 46
10 vcenter-oracle.com 'vim.Datastore:datastore-149' 52 46
11 vcenter.simplivity.com 'vim.Datastore:datastore-143230' 1945 501
12 vcenter.simplivity.com 'vim.Datastore:datastore-52354' 5120 2096
13 vcenter.simplivity.com 'vim.Datastore:datastore-142927' 274 271
14 vcenter.simplivity.com 'vim.Datastore:datastore-143231' 2048 987
15 vcenter.simplivity.com 'vim.Datastore:datastore-878' 553 549
16 vcenter.simplivity.com 'vim.Datastore:datastore-877' 553 552
17 vcenter.simplivity.com 'vim.Datastore:datastore-74327' 1500 949
18 vcenter.simplivity.com 'vim.Datastore:datastore-142929' 274 271
19 vcenter.simplivity.com 'vim.Datastore:datastore-708' 4677 1933

A little tweak is needed, as mentioned in the previous answer: you need to declare the list before the first loop, because on every instance change raw_data gets reinitialized to an empty list.
Try the code below; it should work for you.
s = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
s.verify_mode = ssl.CERT_NONE
userid = input("please enter your wbi userid (ex. abx#example.com):")
p = getpass.getpass("password:")
vcenter = ["vcenter-oracle.com", "vcenter.simplivity.com"]
# Place your list construct here
raw_data = []
# First loop
for instance in vcenter:
    try:
        c = SmartConnect(host=instance, user=userid, pwd=p, sslContext=s)
    except Exception as e:
        print(e)
        continue
    content = c.content
    obj_ds = content.viewManager.CreateContainerView(content.rootFolder, [vim.Datastore], True)
    # Second loop
    for z in obj_ds.view:
        vc_data = instance, z, int(z.summary.capacity / (1024 * 1024 * 1024)), int(z.summary.freeSpace / (1024 * 1024 * 1024))
        raw_data.append(vc_data)
# Create the DataFrame and process the columns.
# Keep the DataFrame outside the for loop.
df = pd.DataFrame(raw_data, columns=['Vcenter', 'DS', 'TDS', 'FDS'])
print(df)

# new df to concat every instance
final_df = pd.DataFrame()
for instance in vcenter:
    try:
        c = SmartConnect(host=instance, user=userid, pwd=p, sslContext=s)
    except Exception as e:
        print(e)
        continue
    content = c.content
    obj_ds = content.viewManager.CreateContainerView(content.rootFolder, [vim.Datastore], True)
    # Lists
    a_list = []
    b_list = []
    c_list = []
    d_list = []
    # for loop
    for z in obj_ds.view:
        a_list.append(instance)
        b_list.append(z)
        c_list.append(int(z.summary.capacity / (1024 * 1024 * 1024)))
        d_list.append(int(z.summary.freeSpace / (1024 * 1024 * 1024)))
    # dataframe
    df = pd.DataFrame({'A': a_list, 'B': b_list, 'C': c_list, 'D': d_list})
    # append in final_df
    final_df = pd.concat([final_df, df])
# reset index
final_df = final_df.reset_index(drop=True)
print(final_df)
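As an aside, calling pd.concat inside the loop copies the accumulated frame on every iteration, which becomes quadratic as the number of instances grows. The usual pattern is to collect the per-instance frames in a list and concatenate once at the end; here is a minimal sketch with hypothetical stand-in data (no vCenter connection):
import pandas as pd

# Hypothetical per-instance data standing in for the pyVmomi results.
per_instance = {
    'vcenter-oracle.com': [('ds-1', 439, 430), ('ds-2', 52, 46)],
    'vcenter.simplivity.com': [('ds-3', 1945, 501)],
}

frames = []
for instance, datastores in per_instance.items():
    rows = [(instance, ds, cap, free) for ds, cap, free in datastores]
    frames.append(pd.DataFrame(rows, columns=['A', 'B', 'C', 'D']))

# One concat at the end instead of one per iteration.
final_df = pd.concat(frames, ignore_index=True)
print(final_df)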

There are two for loops in your code, and you create the pandas DataFrame inside the first loop, which is why a new DataFrame is generated for every instance. You need to initialize the DataFrame before the first loop, or build it outside the first loop, to get a single DataFrame.
Here is the corrected code:
df_rows = []
# first for loop
for instance in vcenter:
    try:
        c = SmartConnect(host=instance, user=userid, pwd=p, sslContext=s)
    except Exception as e:
        print(e)
        continue
    content = c.content
    obj_ds = content.viewManager.CreateContainerView(content.rootFolder, [vim.Datastore], True)
    # second for loop
    for z in obj_ds.view:
        # build one list per datastore row; it must be reinitialized
        # inside this loop, otherwise the same list keeps growing
        row = [instance,
               z,
               int(z.summary.capacity / (1024 * 1024 * 1024)),
               int(z.summary.freeSpace / (1024 * 1024 * 1024))]
        df_rows.append(row)
df = pd.DataFrame(df_rows, columns=['A', 'B', 'C', 'D'])
print(df)
Hope this has helped.

Related

How to simplify data.table logic and make it doable in pandas?

I have a dataframe with multiple columns of numerical values. I want a new column that compares the values of the other columns and assigns the name of the winning column as a label. I already have this logic worked out in R, but I'm wondering how to do it easily in Python. Can anyone show me how to add a new column that, for each row, compares the values of several columns and holds the name of the column with the max value? Any idea?
reproducible example
This is a 100% working, reproducible example in R:
library(data.table)
df <- data.frame(a = sample(seq(1:10), size = 10),
                 b = sample(LETTERS[1:10], size = 10),
                 cnt = sample(seq(1:100), size = 5),
                 RECENT_MOV = sample(seq(1:1000), size = 10),
                 RETIRED = sample(seq(1:200), size = 10),
                 SERV_EMPL = sample(seq(1:500), size = 10),
                 SUB_BUS = sample(seq(1:2000), size = 10),
                 WORK_HOME = sample(seq(1:1200), size = 10))
dt <- as.data.table(df)
write.csv(dt, "sample.csv")
label = c("RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS","WORK_HOME")
df$category <- NA_character_
df[, row_ind:= 1:nrow(df)]
df[cnt > 2, category := names(which.max(.SD[, label, with = FALSE])), by = row_ind]
current output is:
> dput(dt)
structure(list(a = c(5L, 10L, 1L, 6L, 7L, 3L, 2L, 8L, 4L, 9L),
b = c("E", "A", "D", "H", "J", "F", "G", "I", "C", "B"),
cnt = c(13L, 88L, 45L, 92L, 70L, 13L, 88L, 45L, 92L, 70L),
RECENT_MOV = c(70L, 195L, 620L, 572L, 354L, 648L, 798L, 657L,
233L, 672L), RETIRED = c(189L, 195L, 191L, 88L, 148L, 186L,
39L, 78L, 158L, 55L), SERV_EMPL = c(65L, 151L, 415L, 383L,
255L, 207L, 210L, 470L, 181L, 188L), SUB_BUS = c(894L, 829L,
1798L, 502L, 897L, 1461L, 744L, 1991L, 260L, 1697L), WORK_HOME = c(553L,
739L, 454L, 137L, 435L, 1042L, 316L, 697L, 517L, 1158L),
category = c("SUB_BUS", "SUB_BUS", "SUB_BUS", "RECENT_MOV",
"SUB_BUS", "SUB_BUS", "RECENT_MOV", "SUB_BUS", "WORK_HOME",
"SUB_BUS"), row_ind = 1:10), row.names = c(NA, -10L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000015a64b61ef0>)
my current python attempt
import pandas as pd
df=pd.read_csv("sample.csv", index_col=None, header=0)
label = ["RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS","WORK_HOME"]
df['category'] = pd.NA
df['row_ind'] = range(1, len(df) + 1)
However, I am having trouble expressing this line in a pythonic way:
df[cnt > 2, category := names(which.max(.SD[, label, with = FALSE])), by = row_ind]
Basically, this line says: create a new column called category by comparing the columns in label and, whichever column has the max value in a row, assign that column's name as the value of category. How can I do this easily in Python?
logic translation:
df[cnt > 2, category := names(which.max(.SD[, label, with = FALSE])), by = row_ind]
This line says: first filter by the cnt column where cnt > 2, then compare the values of df[["RECENT_MOV", "RETIRED", "SERV_EMPL", "SUB_BUS", "WORK_HOME"]] row-wise, pick the column with the highest value in each row, and assign that column's name as the value: df['category'] = col_name_with_highest_value_in_each_row.
desirable output
This is the desired output that I want to produce in python:
a b cnt RECENT_MOV RETIRED SERV_EMPL SUB_BUS WORK_HOME category row_ind
1 5 E 13 70 189 65 894 553 SUB_BUS 1
2 10 A 88 195 195 151 829 739 SUB_BUS 2
3 1 D 45 620 191 415 1798 454 SUB_BUS 3
4 6 H 92 572 88 383 502 137 RECENT_MOV 4
5 7 J 70 354 148 255 897 435 SUB_BUS 5
6 3 F 13 648 186 207 1461 1042 SUB_BUS 6
7 2 G 88 798 39 210 744 316 RECENT_MOV 7
8 8 I 45 657 78 470 1991 697 SUB_BUS 8
9 4 C 92 233 158 181 260 517 WORK_HOME 9
10 9 B 70 672 55 188 1697 1158 SUB_BUS 10
This is actually really simple with pandas. Have a list of the columns to search in, and then use idxmax with axis=1:
# Filter out rows where `cnt` is less than or equal to 2
df = df[df['cnt'] > 2].copy()  # .copy() avoids a SettingWithCopyWarning on the assignments below
# Determine category for each row
search_cols = ['RECENT_MOV', 'RETIRED', 'SERV_EMPL', 'SUB_BUS', 'WORK_HOME']
df['category'] = df[search_cols].idxmax(axis=1)
# Assign row indexes
df['row_ind'] = df.index
Output:
>>> df
a b cnt RECENT_MOV RETIRED SERV_EMPL SUB_BUS WORK_HOME category row_ind
1 1 C 76 452 62 55 115 247 RECENT_MOV 1
2 7 E 14 50 165 337 1165 810 SUB_BUS 2
3 2 A 46 523 167 423 784 707 SUB_BUS 3
4 3 H 3 38 144 473 745 437 SUB_BUS 4
5 5 I 59 743 127 261 351 190 RECENT_MOV 5
6 8 J 76 143 49 470 1612 935 SUB_BUS 6
7 4 D 14 818 101 418 1919 314 SUB_BUS 7
8 6 F 46 714 9 446 1432 938 SUB_BUS 8
9 10 B 3 585 160 14 107 489 RECENT_MOV 9
10 9 G 59 814 73 449 937 287 SUB_BUS 10
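To see what idxmax(axis=1) does in isolation, here is a minimal sketch with made-up values: for each row it returns the name of the (first) column holding that row's maximum.
import pandas as pd

toy = pd.DataFrame({'RECENT_MOV': [70, 798], 'SUB_BUS': [894, 744]})
print(toy.idxmax(axis=1))
# 0       SUB_BUS
# 1    RECENT_MOV
# dtype: object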

Replace blank value in dataframe based on another column condition

I have many blanks in a merged data set and I want to fill them based on a condition.
My current code looks like this:
import pandas as pd
import csv
import numpy as np
pd.set_option('display.max_columns', 500)
# Read all files into pandas dataframes
Jan = pd.read_csv(r'C:\~\Documents\Jan.csv')
Feb = pd.read_csv(r'C:\~\Documents\Feb.csv')
Mar = pd.read_csv(r'C:\~\Documents\Mar.csv')
Jan=pd.DataFrame({'Department':['52','5','56','70','7'],'Item':['2515','254','818','','']})
Feb=pd.DataFrame({'Department':['52','56','765','7','40'],'Item':['2515','818','524','','']})
Mar=pd.DataFrame({'Department':['7','70','5','8','52'],'Item':['45','','818','','']})
all_df_list = [Jan, Feb, Mar]
appended_df = pd.concat(all_df_list)
df = appended_df
df.to_csv(r"C:\~\Documents\SallesDS.csv", index=False)
Data set:
df
Department Item
52 2515
5 254
56 818
70
7 50
52 2515
56 818
765 524
7
40
7 45
70
5 818
8
52
What I want is to fill each empty cell in Item with the value corresponding to its Department.
So if Department is 52 and Item is empty, it should be filled with 2515;
if Department is 7 and Item is empty, fill it with 45;
and the result should look like this
df
Department Item
52 2515
5 254
56 818
70
7 50
52 2515
56 818
765 524
7 45
40
7 45
70
5 818
8
52 2515
I tried the following methods but none of them worked.
1
df.loc[(df['Item'].isna()) & (df['Department'].str.contains(52)), 'Item'] = 2515
df.loc[(df['Item'].isna()) & (df['Department'].str.contains(7)), 'Item'] = 45
2
df["Item"] = df["Item"].fillna(df["Department"])
df = df.replace({"Item":{"52":"2515", "7":"45"}})
Both either return an error or do nothing.
Answer:
Hi, I have used the below code and it worked:
b = [52]
df.Item = np.where(df.Department.isin(b), df.Item.fillna(2515), df.Item)
a = [7]
df.Item = np.where(df.Department.isin(a), df.Item.fillna(45), df.Item)
Hope it helps someone who faces the same issue.
The following solution first creates a map of each department and its maximum corresponding item (assuming there is one), and then matches that item to any department with a blank item. Note that in your data frame, the empty items are an empty string ("") and not NaN.
Create a map:
values = df.groupby('Department').max()
values['Item'] = values['Item'].apply(lambda x: np.nan if x == "" else x)
values = values.dropna().reset_index()
Department Item
0 5 818
1 52 2515
2 56 818
3 7 45
4 765 524
Then use df.apply():
df['Item'] = df.apply(lambda x: values[values['Department'] == x['Department']]['Item'].values if x['Item'] == "" else x['Item'], axis=1)
In this case, the new values will have brackets around them. They can be removed with str.replace():
df['Item'] = df['Item'].astype(str).str.replace(r"[\[\]']", "", regex=True)
The result:
Department Item
0 52 2515
1 5 254
2 56 818
3 70
4 7 45
0 52 2515
1 56 818
2 765 524
3 7 45
4 40
0 7 45
1 70
2 5 818
3 8
4 52 2515
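A more direct variant of the same map-then-fill idea (my sketch, assuming as above that the blanks are empty strings) builds the Department-to-Item map with groupby and applies it with Series.map:
import numpy as np
import pandas as pd

df = pd.DataFrame({'Department': ['52', '5', '70', '7', '52'],
                   'Item': ['2515', '254', '', '45', '']})

# Treat empty strings as missing, then fill from a Department -> Item map.
df['Item'] = df['Item'].replace('', np.nan)
mapping = df.dropna(subset=['Item']).groupby('Department')['Item'].first()
df['Item'] = df['Item'].fillna(df['Department'].map(mapping))
print(df)
Departments with no known Item simply stay blank, matching the expected output above.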

Sampling from static data set to create dataframe, ignore index in Python

I am trying to create some random samples (of a given size) from a static dataframe. The goal is to create a column for each sample, with every sample drawn being the same size. I'm expecting to see multiple columns of the same length (i.e. the sample size) in the fully sampled dataframe, but maybe append isn't the right way to go. Here is the code:
# create sample dataframe
target_df = pd.DataFrame(np.arange(1000))
target_df.columns=['pl']
# create the sampler:
sample_num = 5
sample_len = 10
df_max_row = len(target_df) - sample_len
for i in range(sample_num):
    rndm_start = np.random.choice(df_max_row, 1)[0]
    rndm_end = rndm_start + sample_len
    slicer = target_df.iloc[rndm_start:rndm_end]['pl']
    sampled_df = sampled_df.append(slicer, ignore_index=True)
sampled_df = sampled_df.T
The output of this is shown in the pic below; the red line shows the index I want to remove.
The desired output is shown below that. How do I make this happen?
Thanks!
I would create each new column using
sampled_df[i] = slicer.reset_index(drop=True)
Eventually I would use str(i) for the column name, because later it is simpler to select a column using a string than a number.
import pandas as pd
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
df_max_row = len(target_df) - sample_len
sampled_df = pd.DataFrame()
for i in range(1, sample_num + 1):
    start = random.randint(0, df_max_row)
    end = start + sample_len
    slicer = target_df[start:end]['pl']
    sampled_df[str(i)] = slicer.reset_index(drop=True)
sampled_df.index += 1
print(sampled_df)
Result:
1 2 3 4 5
1 735 396 646 534 769
2 736 397 647 535 770
3 737 398 648 536 771
4 738 399 649 537 772
5 739 400 650 538 773
6 740 401 651 539 774
7 741 402 652 540 775
8 742 403 653 541 776
9 743 404 654 542 777
10 744 405 655 543 778
But to create really random values I would first shuffle the values
np.random.shuffle(target_df['pl'])
and then I don't have to use random to select start.
shuffle changes the original column in place, so it can't be assigned to a new variable.
This way no value is repeated across samples.
import pandas as pd
#import numpy as np
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
sampled_df = pd.DataFrame()
#np.random.shuffle(target_df['pl'])
random.shuffle(target_df['pl'])
for i in range(1, sample_num + 1):
    start = i * sample_len
    end = start + sample_len
    slicer = target_df[start:end]['pl']
    sampled_df[str(i)] = slicer.reset_index(drop=True)
sampled_df.index += 1
print(sampled_df)
Result:
1 2 3 4 5
1 638 331 171 989 170
2 22 643 47 136 764
3 969 455 211 763 194
4 859 384 174 552 566
5 221 829 62 926 414
6 4 895 951 967 381
7 758 688 594 876 873
8 757 691 825 693 707
9 235 353 34 699 121
10 447 81 36 682 251
If values can repeat then you could use
sampled_df[str(i)] = target_df['pl'].sample(n=sample_len, ignore_index=True)
import pandas as pd
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
sampled_df = pd.DataFrame()
for i in range(1, sample_num + 1):
    sampled_df[str(i)] = target_df['pl'].sample(n=sample_len, ignore_index=True)
sampled_df.index += 1
print(sampled_df)
EDIT
You may also get the shuffled values as a numpy array and use reshape, then convert back to a DataFrame with many columns. Later you can select some of the columns.
import pandas as pd
import random
target_df = pd.DataFrame({'pl': range(1000)})
# create the sampler:
sample_num = 5
sample_len = 10
random.shuffle(target_df['pl'])
sampled_df = pd.DataFrame(target_df['pl'].values.reshape([sample_len,-1]))
sampled_df = sampled_df.iloc[:, 0:sample_num]
sampled_df.index += 1
print(sampled_df)
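For what it's worth, the same reshape idea can avoid mutating target_df by using np.random.permutation, which returns a shuffled copy (my variant, not from the answer above):
import numpy as np
import pandas as pd

target_df = pd.DataFrame({'pl': range(1000)})
sample_num = 5
sample_len = 10

# permutation returns a shuffled copy and leaves target_df untouched
shuffled = np.random.permutation(target_df['pl'].to_numpy())
sampled_df = pd.DataFrame(
    shuffled[:sample_num * sample_len].reshape(sample_len, sample_num),
    columns=[str(i) for i in range(1, sample_num + 1)])
sampled_df.index += 1
print(sampled_df)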

How to get an exact chunk from a python iterator?

I was reading a super big CSV file (10 GB) using pandas, and read_csv(filename, chunksize=chunksize) returns an iterator (assume it is named reader). Now I want to get an exact chunk, because I only want a certain few lines (for example, the CSV file I read has 1,000,000,000 lines, and I want line number 50,000,000 and the 1000 lines after it). What should I do other than traverse the iterator until it reaches the chunk I want?
Here is my former code:
def get_lines_by_chunk(file_name, line_beg, line_end, chunk_size=-1):
    func_name = 'get_lines_by_chunk'
    line_no = get_file_line_no(file_name)
    if chunk_size < 0:
        chunk_size = get_chunk_size(line_no, line_beg, line_end)
    reader = pd.read_csv(file_name, chunksize=chunk_size)
    data = pd.DataFrame({})
    flag = 0
    for chunk in reader:
        line_before = flag * chunk_size
        flag = flag + 1
        line_after = flag * chunk_size
        if line_beg >= line_before and line_beg <= line_after:
            if line_end >= line_after:
                temp = chunk[line_beg - line_before : chunk_size]
                data = pd.concat([data, temp], ignore_index=True)
            else:
                temp = chunk[line_beg - line_before : line_end - line_before]
                data = pd.concat([data, temp], ignore_index=True)
                return data
        elif line_end <= line_after and line_end >= line_before:
            temp = chunk[0 : line_end - line_before]
            data = pd.concat([data, temp], ignore_index=True)
            return data
        elif line_beg < line_before and line_end > line_after:
            # chunk lies fully inside [line_beg, line_end]: keep accumulating
            temp = chunk[0 : chunk_size]
            data = pd.concat([data, temp], ignore_index=True)
    return data
If you need to read your CSV file with differently-sized chunks you can use iterator=True:
Assuming we have a 1000-row DF (see the setup section at the end for how it was generated):
In [103]: reader = pd.read_csv(fn, iterator=True)
In [104]: reader.get_chunk(5)
Out[104]:
a b
0 1 8
1 2 28
2 3 85
3 4 56
4 5 29
In [105]: reader.get_chunk(3)
Out[105]:
a b
5 6 55
6 7 16
7 8 96
NOTE: get_chunk can't skip data, it will continuously read data with specified chunk sizes
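That said, since the reader is an ordinary Python iterator of chunks, itertools.islice can at least express "give me chunk number n" without a hand-written loop. Note that pandas still parses all the skipped chunks, so this is a convenience, not a speed-up (a sketch, using the fn from the setup section below):
from itertools import islice

import pandas as pd

reader = pd.read_csv(fn, chunksize=1000)
# zero-based: this yields the 6th chunk of 1000 rows (rows 5000-5999)
chunk_6 = next(islice(reader, 5, 6))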
If you want to read only rows 100 - 110:
In [106]: cols = pd.read_csv(fn, nrows=1).columns.tolist()
In [107]: cols
Out[107]: ['a', 'b']
In [109]: pd.read_csv(fn, header=None, skiprows=100, nrows=10, names=cols)
Out[109]:
a b
0 100 52
1 101 15
2 102 74
3 103 10
4 104 35
5 105 73
6 106 48
7 107 49
8 108 1
9 109 56
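Scaled up to the numbers in the question (row 50,000,000 plus the 1000 rows after it), the same skiprows/nrows trick might look like this sketch (file name hypothetical; skiprows also skips the header line, hence re-supplying the column names):
import pandas as pd

fn = 'big.csv'  # hypothetical 10 GB file
cols = pd.read_csv(fn, nrows=1).columns.tolist()  # read just the header cheaply
wanted = pd.read_csv(fn, header=None, names=cols,
                     skiprows=50_000_000, nrows=1001)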
But if you can use the HDF5 format, it will be much easier and faster.
Let's save it as HDF5 first:
In [110]: df.to_hdf('c:/temp/test.h5', 'mydf', format='t', data_columns=True, complib='blosc', complevel=9)
Now we can read it by index positions as follows:
In [113]: pd.read_hdf('c:/temp/test.h5', 'mydf', start=99, stop=109)
Out[113]:
a b
99 100 52
100 101 15
101 102 74
102 103 10
103 104 35
104 105 73
105 106 48
106 107 49
107 108 1
108 109 56
or querying (SQL like):
In [115]: pd.read_hdf('c:/temp/test.h5', 'mydf', where="a >= 100 and a <= 110")
Out[115]:
a b
99 100 52
100 101 15
101 102 74
102 103 10
103 104 35
104 105 73
105 106 48
106 107 49
107 108 1
108 109 56
109 110 23
Setup:
In [99]: df = pd.DataFrame({'a':np.arange(1, 1001), 'b':np.random.randint(0, 100, 1000)})
In [100]: fn = r'C:\Temp\test.csv'
In [101]: df.to_csv(fn, index=False)
In [102]: df.shape
Out[102]: (1000, 2)

Python selecting items by comparing values in a table using dictionary

I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). The second column (sseqid) repeats, with different values in the 11th and 12th columns, which are evalue and bitscore, respectively.
The ones that I would like to get are those having the lowest evalue and the highest bitscore (when evalues are the same; the rest of the columns can be ignored, and the data is down below).
So, I have made a short piece of code which uses the second column as a key for the dictionary. I can get five different items from the second column, with lists of qseqid + evalue and qseqid + bitscore.
Here is the code:
#!/usr/bin/python
filename = "data.txt"
readfile = open(filename, "r")
d = dict()
for i in readfile.readlines():
    i = i.strip()
    i = i.split("\t")
    d.setdefault(i[1], []).append([i[0], i[10]])
    d.setdefault(i[1], []).append([i[0], i[11]])
for x in d:
    print(x, d[x])
readfile.close()
But, I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid.
Is there any good logic to solve the problem?
The data.txt file (including the header row and with » representing tab characters):
qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732
And here's a little more readable version of the table's contents:
0 1 2 3 4 5 6 7 8 9 10 11
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
ACLA_022040 TBB 32.71 431 258 8 39 468 24 423 2.00E-76 240
ACLA_024600 TBB 80 435 87 0 1 435 1 435 0 729
ACLA_031860 TBB 39.74 453 251 3 1 447 1 437 1.00E-121 357
ACLA_046030 TBB 75.81 434 105 0 1 434 1 434 0 704
ACLA_072490 TBB 41.7 446 245 3 4 447 3 435 2.00E-120 353
ACLA_010400 EF1A 27.31 249 127 8 69 286 9 234 3.00E-13 61.6
ACLA_015630 EF1A 22 491 255 17 186 602 3 439 8.00E-19 78.2
ACLA_016510 EF1A 26.23 122 61 4 21 127 9 116 2.00E-08 46.2
ACLA_023300 EF1A 29.31 447 249 12 48 437 3 439 2.00E-45 155
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0 801
ACLA_074730 CALM 23.13 147 101 4 6 143 2 145 7.00E-08 41.2
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1.00E-13 55.1
ACLA_016630 CALM 23.9 159 106 5 58 216 4 147 5.00E-12 51.2
ACLA_031930 RPB2 36.87 1226 633 24 121 1237 26 1219 0 734
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0 1691
ACLA_082370 RPB2 27.69 1228 667 37 31 1132 35 1167 7.00E-110 365
ACLA_061960 ACT 28.57 147 95 5 146 284 69 213 3.00E-12 57.4
ACLA_068200 ACT 28.73 463 231 13 16 471 4 374 1.00E-53 176
ACLA_069960 ACT 24.11 141 97 4 581 718 242 375 9.00E-09 46.2
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0 732
Since you're a Python newbie I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library, which makes working with tabular data much simpler.
Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too.
import pandas as pd
df = pd.read_csv("acla1.dat", sep="\t")
df = df.sort_values(["evalue", "bitscore"], ascending=[True, False])
df_new = df.groupby("sseqid", as_index=False).first()
which produces
>>> df_new
sseqid qseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
0 ACT ACLA_095800 91.73 375 31 0 1 375 1 375 0.000000e+00 732.0
1 CALM ACLA_096170 29.33 150 96 4 34 179 2 145 1.000000e-13 55.1
2 EF1A ACLA_028450 85.55 443 63 1 1 443 1 442 0.000000e+00 801.0
3 RPB2 ACLA_065630 65.79 1257 386 14 1 1252 4 1221 0.000000e+00 1691.0
4 TBB ACLA_024600 80.00 435 87 0 1 435 1 435 0.000000e+00 729.0
Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
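An equivalent variant (my sketch, not from the answer above): after the same sort, drop_duplicates keeps only the first row per sseqid.
import pandas as pd

df = pd.read_csv("acla1.dat", sep="\t")
# lowest evalue first, ties broken by highest bitscore; keep one row per sseqid
df_new = (df.sort_values(["evalue", "bitscore"], ascending=[True, False])
            .drop_duplicates("sseqid")
            .reset_index(drop=True))
print(df_new)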
#!/usr/bin/python
import csv

DATA = "data.txt"

class Sequence:
    def __init__(self, row):
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        return (
            "{qseqid}\t"
            "{sseqid}\t"
            "{pident}\t"
            "{length}\t"
            "{mismatch}\t"
            "{gapopen}\t"
            "{qstart}\t"
            "{qend}\t"
            "{sstart}\t"
            "{send}\t"
            "{evalue}\t"
            "{bitscore}"
        ).format(**self.__dict__)

def entries(fname, header_rows=1, dtype=list, **kwargs):
    with open(fname) as inf:
        incsv = csv.reader(inf, **kwargs)
        # skip header rows
        for i in range(header_rows):
            next(incsv)
        for row in incsv:
            yield dtype(row)

def main():
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # see if a sequence with the same sseqid already exists
        prev = bestseq.get(seq.sseqid, None)
        if (
            prev is None
            or seq.evalue < prev.evalue
            or (seq.evalue == prev.evalue and seq.bitscore > prev.bitscore)
        ):
            bestseq[seq.sseqid] = seq
    # display selected sequences
    keys = sorted(bestseq)
    for key in keys:
        print(bestseq[key])

if __name__ == "__main__":
    main()
which results in
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0.0 732.0
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1e-13 55.1
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0.0 801.0
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0.0 1691.0
ACLA_024600 TBB 80.0 435 87 0 1 435 1 435 0.0 729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and is less awkward-looking than the usual dict['fieldname'] syntax otherwise required.
import csv
from collections import defaultdict, namedtuple
from itertools import imap
from operator import itemgetter

data_file_name = 'data.txt'
DELIMITER = '\t'
ssqeid_dict = defaultdict(list)

# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else
                  (itemgetter(col.strip()), 1)) for col in columns]
    def comparer(left, right):
        for fn, mult in comparers:
            result = cmp(fn(left), fn(right))
            if result:
                return mult * result
        else:
            return 0
    return sorted(items, cmp=comparer)

# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

with open(data_file_name, 'rb') as data_file:
    reader = csv.DictReader(data_file, delimiter=DELIMITER)
    format_spec = '\t'.join([('{%s}' % field) for field in reader.fieldnames])
    for rec in (AttrDict(r) for r in reader):
        # Convert the two sort fields to numeric values for proper ordering.
        rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
        ssqeid_dict[rec.sseqid].append(rec)

for ssqeid in sorted(ssqeid_dict):
    # Sort each group of recs with same ssqeid. The first record after sorting
    # will be the one sought that has the lowest evalue and highest bitscore.
    selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
    print format_spec.format(**selected)
Output (» represents tabs):
ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
readfile = open(filename, 'r')
d = dict()
sseqid = []
lines = []
for i in readfile.readlines():
    sseqid.append(i.rsplit()[1])
    lines.append(i.rsplit())
sorted_sseqid = sorted(set(sseqid))
sdqDict = {}
key = None
for sorted_ssqd in sorted_sseqid:
    key = sorted_ssqd
    evalue = []
    bitscore = []
    qseid = []
    for line in lines:
        if key in line:
            evalue.append(line[10])
            bitscore.append(line[11])
            qseid.append(line[0])
    sdqDict[key] = [qseid, evalue, bitscore]
print sdqDict
print 'TBB LOWEST EVALUE' + '---->' + min(sdqDict['TBB'][1])
## I think you can do the list manipulation below to find out the qseqid
readfile.close()
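For completeness, here is one way the hinted-at list manipulation might look (my sketch, not part of the original answer): per sseqid, compare evalue and bitscore as floats, breaking evalue ties with the higher bitscore. It works on the sdqDict built above.
# Pick, per sseqid, the qseqid of the line with the lowest evalue,
# breaking ties with the highest bitscore.
for key in sorted(sdqDict):
    if key == 'sseqid':  # skip the header row that was read in as data
        continue
    qseid, evalue, bitscore = sdqDict[key]
    best = min(zip(qseid, evalue, bitscore),
               key=lambda r: (float(r[1]), -float(r[2])))
    print("{} ----> {}".format(key, best[0]))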
