find max duplicate values in a list - python

I have a file containing multiple lines, each consisting of a student code followed by that student's answers, e.g.:
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
Now I have to find which questions were skipped by the most students and report, for each of them, the question number, how many students skipped it, and what percentage of the students skipped it.
I split each line, looped over the answers and added every skipped question number to a list, but then got stuck finding the values with the most duplicates in that list (there can be more than one).
This is some expected output:
Question that most people answer incorrectly: 10 - 4 - 0.20, 14 - 4 - 0.20, 16 - 4 - 0.20, 19 - 4 - 0.20, 22 - 4 - 0.20. The format is a - b - c, where a is the question number, b is how many students skipped it, and c is the fraction of all students in the class that this represents. Here five questions (10, 14, 16, 19 and 22) are tied for the most skipped, each with 4 skips.
Edited:
I put all skipped questions in a list and tried to find which question numbers have the most duplicates, like this:
def find_max_count(list):
    """Return the items that occur most often in ``list``.

    Every item tied for the highest number of occurrences is returned
    exactly once, in order of first appearance. An empty input yields an
    empty list.

    Args:
        list: Sequence of hashable items (e.g. skipped question numbers).
            The parameter name shadows the builtin but is kept so existing
            callers are unaffected.

    Returns:
        A list of the distinct items whose count equals the maximum count.
    """
    from collections import Counter

    counts = Counter(list)
    if not counts:
        return []
    # The earlier version stored the bound method ``list.count`` instead of
    # the computed count, which is what triggered the int-vs-method TypeError.
    max_count = max(counts.values())
    # Counter keeps first-insertion order, so ties come out in the order in
    # which they first appear in the input.
    return [item for item, count in counts.items() if count == max_count]
but there is an error:
TypeError: '>' not supported between instances of 'int' and 'builtin_function_or_method'

Start by accumulating a dictionary of all responses to each question and a list of all skipped answers:
from collections import defaultdict

responses = defaultdict(list)  # question number -> every answer given to it
skipped = []  # question number of each skipped answer, one entry per skip

# `data` is expected to hold the raw text of the answer file, one
# "student_id,answer,answer,..." record per line.
for record in data.splitlines():
    # The first field is the student id; the remaining fields are the answers.
    student_id, *answers = record.split(',')
    for question_number, answer in enumerate(answers, start=1):
        responses[question_number].append(answer)
        # An empty field means the student skipped that question.
        if answer == '':
            skipped.append(question_number)
Next perform the analysis:
from statistics import multimode

# multimode returns every value tied for most frequent, so ties for the
# most-skipped question are all reported.
print('Most skipped questions:', multimode(skipped))
print('Answer for questions with more than two or more skips')
for question, answers in responses.items():
    # An empty string marks a skipped answer.
    if answers.count('') >= 2:
        print(f'Question {question}: {answers}')
This outputs:
Most skipped questions: [2, 5]
Answer for questions with more than two or more skips
Question 2: ['', 'A', '', 'C', 'A', 'A', 'A']
Question 5: ['C', 'C', 'C', '', 'C', '', 'C']
I'm not certain this is what you wanted (a target output wasn't shown), but this should get you started with the key techniques for the analysis. In particular, the multimode function is super helpful for identifying the most frequent occurrences, including ties for first place. Also, defaultdict is super useful for transposing the data from answers by student to answers by question.

Let's get a dictionary with the student id and the answers.
# Raw answer sheet: one record per line, student id first, then one field per
# question (an empty field is a skipped question).
data = """
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
"""
# Map each student id (first field) to the list of that student's answers.
info = dict(
    (fields[0], fields[1:])
    for fields in (row.split(',') for row in data.strip().split('\n'))
)
# {'N00000047': ['B', '', 'D', 'C', 'C', 'B', 'D', 'D', 'C', 'C', 'D', '', 'A', 'B', 'D', 'C', '', 'D', 'A', 'C', '', 'D', 'B', 'D', 'C'],
# 'N00000048': ['B', 'A', 'D', 'D', 'C', 'B', '', 'D', 'C', 'C', 'D', 'B', 'A', 'B', 'A', 'D', 'B', 'D', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000049': ['A', '', 'D', 'D', 'C', 'B', 'D', '', 'C', 'C', 'D', 'B', '', 'B', 'A', 'C', 'C', 'D', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000050': ['', 'C', '', 'D', '', 'D', 'D', 'A', 'C', 'A', 'A', 'B', 'A', 'B', 'A', 'D', 'B', 'D', 'A', 'C', 'D', 'A', 'B', 'D', 'D'],
# 'N00000051': ['B', 'A', 'B', '', 'C', 'B', 'D', 'A', 'C', 'C', 'D', 'D', 'A', 'B', 'A', 'C', 'B', 'C', 'A', '', 'A', 'A', 'B', 'D', 'B'],
# 'N00000052': ['B', 'A', 'D', 'D', '', 'B', 'D', 'A', 'D', '', 'D', 'B', 'A', 'B', 'A', 'C', 'B', 'C', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000053': ['B', 'A', 'D', 'D', 'C', 'B', 'D', 'A', 'C', 'C', 'D', 'B', 'B', 'B', 'C', 'C', 'B', 'D', 'A', 'C', 'A', 'C', 'A', 'D', 'D']}
Now, we can use collections.Counter to count up answers.
from collections import Counter
# Map each student id (first field of a record) to a Counter tallying how
# often each answer value, including '' for a skip, appears on that record.
info = dict(
    (fields[0], Counter(fields[1:]))
    for fields in (row.split(',') for row in data.strip().split('\n'))
)
# {'N00000047': Counter({'D': 8, 'C': 7, 'B': 4, '': 4, 'A': 2}),
# 'N00000048': Counter({'D': 8, 'B': 6, 'A': 6, 'C': 4, '': 1}),
# 'N00000049': Counter({'D': 7, 'C': 6, 'A': 5, 'B': 4, '': 3}),
# 'N00000050': Counter({'D': 8, 'A': 7, 'B': 4, '': 3, 'C': 3}),
# 'N00000051': Counter({'B': 7, 'A': 7, 'C': 5, 'D': 4, '': 2}),
# 'N00000052': Counter({'A': 7, 'D': 7, 'B': 6, 'C': 3, '': 2}),
# 'N00000053': Counter({'D': 7, 'C': 7, 'B': 6, 'A': 5})}
From here, finding the statistical data you're looking for should be much easier. For instance:
(Requires Python 3.8+ for := operator.)
# Build one summary dict per record, keyed by the first field of the line
# (the student id, even though the loop variable is named `q`):
#   'all'        - Counter of every answer value on that line
#   'skipped'    - number of empty-string answers on that line
#   'percentage' - skipped / number of answers, i.e. a fraction, not a percent
# The walrus bindings `c` and `s` let later entries reuse earlier results.
{q: {'all': (c := Counter(a)),
'skipped': (s := c['']),
'percentage': s / len(a)}
for line in data.strip().split('\n')
for q, *a in [line.split(',')]}
# {'N00000047': {'all': Counter({'D': 8, 'C': 7, 'B': 4, '': 4, 'A': 2}), 'skipped': 4, 'percentage': 0.16},
# 'N00000048': {'all': Counter({'D': 8, 'B': 6, 'A': 6, 'C': 4, '': 1}), 'skipped': 1, 'percentage': 0.04},
# 'N00000049': {'all': Counter({'D': 7, 'C': 6, 'A': 5, 'B': 4, '': 3}), 'skipped': 3, 'percentage': 0.12},
# 'N00000050': {'all': Counter({'D': 8, 'A': 7, 'B': 4, '': 3, 'C': 3}), 'skipped': 3, 'percentage': 0.12},
# 'N00000051': {'all': Counter({'B': 7, 'A': 7, 'C': 5, 'D': 4, '': 2}), 'skipped': 2, 'percentage': 0.08},
# 'N00000052': {'all': Counter({'A': 7, 'D': 7, 'B': 6, 'C': 3, '': 2}), 'skipped': 2, 'percentage': 0.08},
# 'N00000053': {'all': Counter({'D': 7, 'C': 7, 'B': 6, 'A': 5}), 'skipped': 0, 'percentage': 0.0}}

Something like?:
cat skipped.csv
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
import csv
from collections import Counter
with open("skipped.csv", "r", newline="") as csv_file:
    reader = csv.reader(csv_file)
    l = []
    for line in reader:
        # csv.reader yields an empty list for a blank line; skip it rather
        # than failing on pop(0).
        if not line:
            continue
        # The first field is the student id; it is stored under the key "q".
        d = {"q": line.pop(0)}
        ct = Counter(line)
        # In Python 3.10+ you can do ct.total() instead of below.
        q_sum = sum(ct.values())
        skipped = ct['']
        # A record with an id but no answers has nothing to divide by.
        perc = skipped / q_sum if q_sum else 0.0
        d.update({"skipped": skipped, "percentage": perc})
        l.append(d)
# Most-skipped first; list.sort is stable, so ties keep their file order.
l.sort(key=lambda x: x['skipped'], reverse=True)
l
[{'q': 'N00000047', 'skipped': 4, 'percentage': 0.16},
{'q': 'N00000049', 'skipped': 3, 'percentage': 0.12},
{'q': 'N00000050', 'skipped': 3, 'percentage': 0.12},
{'q': 'N00000051', 'skipped': 2, 'percentage': 0.08},
{'q': 'N00000052', 'skipped': 2, 'percentage': 0.08},
{'q': 'N00000048', 'skipped': 1, 'percentage': 0.04},
{'q': 'N00000053', 'skipped': 0, 'percentage': 0.0}]

Related

Data wrangling in Python, calculate value from some conditions

I have a dataframe in Python below:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
df.head()
I want to convert the string values (C, F and Z) into numbers under this condition: if a value in columns CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, ..., CRDACCT_DLQ_CYC_12_MNTH_AGO is:
C = 0
F = 0
Z = 0
else value = value
#Convert value
df = df.replace({'C': 0, 'F': 0, 'Z': 0,' ':0}).astype(int)
Then I want to create a new column named MSD. MSD stands for Month Since Delinquent. It is calculated by checking each of the 12 columns CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, ... up to CRDACCT_DLQ_CYC_12_MNTH_AGO, in order, with these conditions:
If value in CRDACCT_DLQ_CYC_1_MNTH_AGO > 1 then MSD = 1, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_2_MNTH_AGO > 1 then MSD = 2, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_3_MNTH_AGO > 1 then MSD = 3, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_4_MNTH_AGO > 1 then MSD = 4, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_5_MNTH_AGO > 1 then MSD = 5, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_6_MNTH_AGO > 1 then MSD = 6, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_7_MNTH_AGO > 1 then MSD = 7, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_8_MNTH_AGO > 1 then MSD = 8, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_9_MNTH_AGO > 1 then MSD = 9, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_10_MNTH_AGO > 1 then MSD = 10, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_11_MNTH_AGO > 1 then MSD = 11, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_12_MNTH_AGO > 1 then MSD = 12, otherwise MSD=0
Note: if the value is 1 or 0, then MSD = 0.
For example:
index 0, MSD =1,because value 3 > 1 is in CRDACCT_DLQ_CYC_1_MNTH_AGO
(we no need to check CRDACCT_DLQ_CYC_2_MNTH_AGO > 1 because we have
found month since delinquent in CRDACCT_DLQ_CYC_1_MNTH_AGO) , hence
MSD is in 1 MNTH AGO
index 1, MSD=1,
index 2, MSD=2,
index 3, MSD=2, because value 3 > 1 is in
CRDACCT_DLQ_CYC_2_MNTH_AGO, hence MSD is in 2 MNTH AGO
index 4, MSD=2
Note: if, after checking all 12 columns CRDACCT_DLQ_CYC_1_MNTH_AGO, ..., CRDACCT_DLQ_CYC_12_MNTH_AGO with those conditions, no value is greater than 1 (for example, all values are 0), then MSD should be 0.
In general, the idea is to find the first of the 12 columns whose value is > 1 and take MSD from its name: for CRDACCT_DLQ_CYC_x_MNTH_AGO, MSD = x.
It ain't pretty but this one-liner should do the trick ;)
df['MSD'] = (df > 1).astype(int).apply(lambda row: int(row.idxmax().split('_')[3]) if row.sum() >=1 else 0, axis=1)
Basically: check which values are over 1, take for each row the first column that is above 1 (the MSD as you defined it), and don't forget the edge case where no column qualifies, in which case MSD is 0.

Create a List in Python from Value of Max Data

I want to create a list in Python from the dataset below:
import pandas as pd
# Twelve monthly delinquency-cycle columns with 24 records each; entries are
# either integers or one of the letter codes 'C', 'F', 'Z'.
df = pd.DataFrame({
    'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
    'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
    'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
    'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
    'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
    'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
    'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
    'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
    'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
With some data wrangling, I transpose the data and convert the values with this code:
#Transpose data
dfT = pd.DataFrame(df.T).reset_index(inplace=False)
dfT
#Data converting
df = df.replace({'C': -1, 'F': -2, 'Z': -3}).astype(int).T
df
The data frame look like this:
For example,
#in column 0, max value is 8,
#in column 1, max value is 8,
#in column 2, max value is 7,
.....
and so on until column 23.
The final result that I expect is a list that consists of the maximum value from each column:
max_val = [8,8,7,6,5,4,3,2,6,5,...,2,2,2,2,2,2,2,2,2,2]
You can try this:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
#Transpose data
dfT = pd.DataFrame(df.T).reset_index(inplace=False)
#Data converting
df = df.replace({'C': -1, 'F': -2, 'Z': -3}).astype(int).T
max_val = list(df.max())
print(max_val)
Output:
[8, 8, 7, 6, 5, 4, 3, 2, 6, 5, 4, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2]

Data Wrangling in Python to Create a List

I have a dataframe in Python below:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
df.head()
I want to transform those values (string value: C, F, and Z) into some categories with this condition:
if values in column CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, ......., CRDACCT_DLQ_CYC_12_MNTH_AGO consist:
C = -1
F = -2
Z = -3
else value = value
Then I transpose the table to identify Month Since Dlq (MSD).
dfT =pd.DataFrame(df.T).reset_index(inplace=False)
dfT
I want to create a list named MSD. For each index, MSD is the first month whose value is greater than 1 (value > 1). For example, at index 2, CRDACCT_DLQ_CYC_1_MNTH_AGO = C, which after conversion is -1 and is not greater than 1. Next, is CRDACCT_DLQ_CYC_2_MNTH_AGO greater than 1? CRDACCT_DLQ_CYC_2_MNTH_AGO = 3, which is greater than 1. Hence the MSD is 2, because the value was found in CRDACCT_DLQ_CYC_2_MNTH_AGO. See the detailed flow chart and overview table for the identification.
The MSD value is between 1 and 12 depends on i in CRDACCT_DLQ_CYC_i_MNTH_AGO, for i = 1,2,3,...,12.
So the final result is a MSD list with 24 value, identified for each index 0 -23.
Is this what you are looking for?
# From your dataframe
# Transpose so each original record becomes a column, coerce the letter codes
# to NaN, and flag the values strictly greater than 1 (the question defines
# MSD with "value > 1", so gt is used rather than ge). idxmax then picks, per
# record, the first month column that is flagged, and the month number is
# extracted from that column name.
# NOTE: a record with no value above 1 is presumably reported as the first
# month, since idxmax returns the first label when nothing is flagged.
MSD = (
    df.T.apply(pd.to_numeric, errors='coerce')
    .gt(1)
    .idxmax(axis=0)
    .str.extract(r'CYC_(\d+)_MNTH', expand=False)
    .astype(int)
    .tolist()
)
print(MSD)
# Output:
[1, 1, 2, 2, 2, 2, 2, 8, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 8, 2, 2, 6, 2]

Finding the second level keys of a multi key dictionary Python?

I have a multi key dict in the following format. I am trying to access the list of the second level keys, however, it is returning the keys in the format of a dict_keys list. What I am trying to get is ['a', 'b', 'c', 'd', 'e', 'f']
dictTest={}
dictTest[1]={'a':1, 'b':2}
dictTest[2]={'c':1, 'd':2}
dictTest[3]={'e':1, 'f':2}
print(dictTest)
print(list([dictTest[i].keys() for i in dictTest.keys()]))
{1: {'a': 1, 'b': 2}, 2: {'c': 1, 'd': 2}, 3: {'e': 1, 'f': 2}}
[dict_keys(['a', 'b']), dict_keys(['c', 'd']), dict_keys(['e', 'f'])]
You could use itertools.chain in combination with mapping dict.keys to all the dicts values:
from itertools import chain
dictTest = {1: {'a': 1, 'b': 2}, 2: {'c': 1, 'd': 2}, 3: {'e': 1, 'f': 2}}
print(list(chain(*map(dict.keys, dictTest.values()))))
['a', 'b', 'c', 'd', 'e', 'f']
>>> [v2 for v1 in dictTest.values() for v2 in v1]
['a', 'b', 'c', 'd', 'e', 'f']
Try this:
# sum([list(b.keys()) for b in dictTest.values()], [])
# syntax improvement by #wwii
sum([list(b) for b in dictTest.values()], [])
Output:
['a', 'b', 'c', 'd', 'e', 'f']

Created nested/recursive list

How can I create a list recursively?
I have this list:
l = ['a', 'b', 'new', 'c', 'd', 'new', 'z', 'x', 'c', 'fin', 'f', 'fin',
'g', 'l', 'new', 'z', 'x', 'c', 'fin', 'j']
The expected output is:
r = ['a', 'b', ['c', 'd', ['z', 'x', 'c'], 'f'], 'g', 'l', ['z', 'x', 'c'], 'j']
What I have tried so far:
def asd(l, index=0):
    """Build a nested list from the flat list ``l``, starting at ``index``.

    ``'new'`` opens a sub-list and ``'fin'`` closes the current one; every
    other item is copied into the list that is currently open.

    Args:
        l: Flat list of items containing ``'new'`` / ``'fin'`` markers.
        index: Position in ``l`` at which to start reading.

    Returns:
        A ``(nested, next_index)`` tuple: the list built at this level and
        the position of the first item that has not been consumed yet.
    """
    r = []
    # Walk by position (not over a slice) so that, after a recursive call has
    # consumed a sub-list, this level resumes right after its closing 'fin'.
    while index < len(l):
        item = l[index]
        index += 1
        if item == 'fin':
            # Close this level; the marker itself is not part of the result.
            break
        if item == 'new':
            item, index = asd(l, index)
        r.append(item)
    # Always return the same (list, index) shape so callers can unpack it.
    return r, index
r, index = asd(l)
I cannot understand how to make it work. Can anyone help me?
This is a non-recursive solution that can create your list, parsing in one pass without any need for costly index() operations:
l = ['a', 'b', 'new', 'c', 'd', 'new', 'f', 'fin', 'g', 'fin', 'j']
rv = []
# Stack of the lists that are currently open; plain elements are always
# added to the last one, and rv (the result) sits at the bottom.
curr = [rv]
for elem in l:
    if elem == "new":
        # create a new list, put it at end of curr
        curr.append([])
        # add that list to the one before
        curr[-2].append(curr[-1])
    elif elem == "fin":
        # done, remove from curr
        curr.pop()
    else:
        curr[-1].append(elem)
print(rv)
Output:
['a', 'b', ['c', 'd', ['f'], 'g'], 'j']
l = ['a', 'b', 'new', '1', '2', '3', 'fin', 'c', 'new', 'x', 'y', 'z', 'fin',]
leads to
['a', 'b', ['1', '2', '3'], 'c', ['x', 'y', 'z']]
Note that this code is not foolproof against unbalanced or misplaced 'new'/'fin' markers; you need to add that checking yourself.
Edited to make it more concise after Matthieu's comment.
Here is a straightforward recursive solution, using a deque as a stack data structure from which you can popleft the leftmost element in O(1).
Algorithm
from collections import deque
def nest(lst):
    """Return ``lst`` as a nested list, treating 'new'/'fin' items as brackets.

    ``lst`` itself is not modified; its items are copied into a deque first.
    """
    return _nest(deque(lst))


def _nest(deq):
    """Consume items from the left of ``deq`` and build one nesting level.

    Stops at the first 'fin' (which is consumed) or when ``deq`` is empty,
    and returns the list built for this level.
    """
    result = []
    while deq:
        x = deq.popleft()
        if x == 'fin':
            break
        elif x == 'new':
            # The recursive call consumes everything up to the matching 'fin'.
            result.append(_nest(deq))
        else:
            result.append(x)
    return result
Tests
# Sample inputs covering empty, flat, nested, adjacent and deeply nested cases.
tests = [
    [],
    [1, 2, 3],
    [1, 2, 'new', 3, 4, 'fin', 5],
    [1, 2, 'new', 3, 4, 'fin', 5, 6, 'new', 7, 'fin'],
    ['new', 'fin', 'new', 'fin', 'new', 'new', 'fin', 'fin'],
    ['new', 1, 2, 'fin'],
    [1, 2, 3, 'new', 4, 'new', 5, 6, 'fin', 7, 8, 'fin', 9, 10, 'new', 11, 'fin', 12, 13]
]

# Print the nested form of each sample, one per line.
for test in tests:
    print(nest(test))
Output
[]
[1, 2, 3]
[1, 2, [3, 4], 5]
[1, 2, [3, 4], 5, 6, [7]]
[[], [], [[]]]
[[1, 2]]
[1, 2, 3, [4, [5, 6], 7, 8], 9, 10, [11], 12, 13]
You can use a stack instead, pushing and popping lists as you go through the input:
def parse(l):
    """Turn a flat list with 'new'/'fin' markers into a nested list.

    ``"new"`` opens a sub-list, ``"fin"`` closes the innermost open one, and
    every other item is appended to the innermost open list. A sub-list that
    is never closed is kept (implicitly closed at the end of the input)
    instead of being silently discarded.

    Args:
        l: Iterable of items, with ``"new"`` / ``"fin"`` acting as brackets.

    Returns:
        The nested list.

    Raises:
        IndexError: If a ``"fin"`` appears with no open sub-list to close.
    """
    root = []
    stack = [root]  # innermost open list is stack[-1]
    for i in l:
        if i == "new":
            # Attach the child to its parent right away so its contents
            # survive even if the matching "fin" never arrives.
            child = []
            stack[-1].append(child)
            stack.append(child)
        elif i == "fin":
            if len(stack) == 1:
                raise IndexError("'fin' without a matching 'new'")
            stack.pop()
        else:
            stack[-1].append(i)
    return root
Recursive alternative :
def asd(l):
    """Recursively nest the flat list ``l`` using 'new'/'fin' as brackets.

    The first ``'new'`` is matched with its balancing ``'fin'``; the items
    between them are nested recursively, and the remainder of the list is
    processed the same way. If the ``'new'`` is never closed, everything
    after it is treated as belonging to the sub-list.

    Args:
        l: Flat list of items.

    Returns:
        The nested list. When ``l`` contains no ``'new'`` it is returned
        as-is (the same object).
    """
    if 'new' not in l:
        return l
    index_new = l.index('new')
    keyword = 1  # number of currently unmatched 'new' markers
    # Default for an unclosed 'new': the sub-list runs to the end of the
    # input, so no trailing element is dropped and the name is always bound.
    index_fin = len(l)
    for position, e in enumerate(l[index_new + 1:], index_new + 1):
        if e == 'new':
            keyword += 1
        elif e == 'fin':
            keyword -= 1
            if not keyword:
                index_fin = position
                break
    return l[:index_new] + [asd(l[index_new + 1:index_fin])] + asd(l[index_fin + 1:])
Input :
['a', 'b', 'new', 'c', 'd', 'new', 'z', 'x', 'c', 'fin', 'f', 'fin',
'g', 'l', 'new', 'z', 'x', 'c', 'fin', 'j']
Output :
['a', 'b', ['c', 'd', ['z', 'x', 'c'], 'f'], 'g', 'l', ['z', 'x', 'c'], 'j']

Categories