Create a List in Python from Value of Max Data - python

I want to create a list in Python from the dataset below:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
I did some data wrangling, transposing the data and converting the values with this code:
#Transpose data
dfT = pd.DataFrame(df.T).reset_index(inplace=False)
dfT
#Data converting
df = df.replace({'C': -1, 'F': -2, 'Z': -3}).astype(int).T
df
The data frame looks like this:
For example,
#in column 0, max value is 8,
#in column 1, max value is 8,
#in column 2, max value is 7,
.....
and so on until column 23.
The final result I expect is a list that consists of the maximum value from each column:
max_val = [8,8,7,6,5,4,3,2,6,5,...,2,2,2,2,2,2,2,2,2,2]

You can try this:
import pandas as pd

# One column per "months ago" bucket, one row per account (24 accounts).
# Cells are either a numeric delinquency cycle or a status letter (C / F / Z).
df = pd.DataFrame({
    'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
    'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
    'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
    'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
    'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
    'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
    'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
    'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
    'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
    'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})

# Transpose data (kept for inspection; the max computation below does not use it)
dfT = pd.DataFrame(df.T).reset_index(inplace=False)

# Data converting: map the status letters to negative sentinels so every cell
# is an int, then transpose so each account becomes a column.
df = df.replace({'C': -1, 'F': -2, 'Z': -3}).astype(int).T

# Column-wise maximum, one value per account. tolist() converts the NumPy
# scalars to plain Python ints, so the list prints as [8, 8, ...] on every
# NumPy version instead of [np.int64(8), ...].
max_val = df.max().tolist()
print(max_val)
Output:
[8, 8, 7, 6, 5, 4, 3, 2, 6, 5, 4, 3, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2]

Related

find max duplicate values in a list

I have a file containing multiple lines in format student code and followed by some answer. e.g
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
And now I have to find which questions were skipped by the most students, reporting for each one the question number, how many students skipped it, and what percentage of the students skipped it.
I split each line, looped over the answers, and added every skipped question number to a list, but then I got stuck finding the values with the most duplicates in that list (there can be more than one such value).
This is some expected output:
Question that most people answer incorrectly: 10 - 4 - 0.20, 14 - 4 - 0.20, 16 - 4 - 0.20, 19 - 4 - 0.20, 22 - 4 - 0.20. In format : a - b - c which a is question number, b is how much student was skipped, c is it take how many percentage of total student in class. There are 3 question have the most skipped is 10, 14, 19 and 22 and they all have 4 skipped.
Edited:
I put all skipped question in a list and count for which question have a largest duplicate like this:
# Intended to collect the items of `list` whose occurrence count equals the
# highest occurrence count found in `list`.
# NOTE: the parameter name `list` shadows the built-in type inside this function.
def find_max_count(list):
item_with_max_count = []
max_count = 0
for item in list:
item_count = list.count(item)
if item_count > max_count:
# BUG (source of the TypeError quoted below): this stores the bound method
# `list.count` itself instead of the number `item_count`, so the next
# `item_count > max_count` comparison is int vs. builtin method.
max_count = list.count
# Second pass: appends item1 once per occurrence, so an item that appears
# k times is appended k times when its count matches max_count.
for item1 in list:
if list.count(item1) == max_count:
item_with_max_count.append(item1)
return item_with_max_count
but there is an error:
TypeError: '>' not supported between instances of 'int' and 'builtin_function_or_method'
Start by accumulating a dictionary of all responses to each question and a list of all skipped answers:
from collections import defaultdict

responses = defaultdict(list)  # question number -> every answer given to it
skipped = []  # one entry (the question number) per blank answer

for row in data.splitlines():
    fields = row.split(',')
    # fields[0] is the student id; everything after it is an answer.
    for number, given in enumerate(fields[1:], start=1):
        responses[number].append(given)
        if not given:
            skipped.append(number)
Next perform the analysis:
from statistics import multimode

# multimode() returns every value tied for most frequent, so ties are kept.
print('Most skipped questions:', multimode(skipped))
print('Answer for questions with more than two or more skips')
for number, given in responses.items():
    if given.count('') < 2:
        continue
    print(f'Question {number}: {given}')
This outputs:
Most skipped questions: [2, 5]
Answer for questions with more than two or more skips
Question 2: ['', 'A', '', 'C', 'A', 'A', 'A']
Question 5: ['C', 'C', 'C', '', 'C', '', 'C']
I'm not certain this is what you wanted (a target output wasn't shown), but this should get you started with the key techniques for the analysis. In particular, the multimode function is super helpful in identifying the most frequent occurrences, including ties for first place. Also, defaultdict is super useful for transposing the data from answers by student to answers by question.
Let's get a dictionary with the student id and the answers.
data = """
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
"""
# Map each student id (first field of a line) to the list of that
# student's answers (all remaining fields; '' marks a skipped question).
info = dict(
    (fields[0], fields[1:])
    for fields in (line.split(',') for line in data.strip().split('\n'))
)
# {'N00000047': ['B', '', 'D', 'C', 'C', 'B', 'D', 'D', 'C', 'C', 'D', '', 'A', 'B', 'D', 'C', '', 'D', 'A', 'C', '', 'D', 'B', 'D', 'C'],
# 'N00000048': ['B', 'A', 'D', 'D', 'C', 'B', '', 'D', 'C', 'C', 'D', 'B', 'A', 'B', 'A', 'D', 'B', 'D', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000049': ['A', '', 'D', 'D', 'C', 'B', 'D', '', 'C', 'C', 'D', 'B', '', 'B', 'A', 'C', 'C', 'D', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000050': ['', 'C', '', 'D', '', 'D', 'D', 'A', 'C', 'A', 'A', 'B', 'A', 'B', 'A', 'D', 'B', 'D', 'A', 'C', 'D', 'A', 'B', 'D', 'D'],
# 'N00000051': ['B', 'A', 'B', '', 'C', 'B', 'D', 'A', 'C', 'C', 'D', 'D', 'A', 'B', 'A', 'C', 'B', 'C', 'A', '', 'A', 'A', 'B', 'D', 'B'],
# 'N00000052': ['B', 'A', 'D', 'D', '', 'B', 'D', 'A', 'D', '', 'D', 'B', 'A', 'B', 'A', 'C', 'B', 'C', 'A', 'C', 'A', 'A', 'B', 'D', 'D'],
# 'N00000053': ['B', 'A', 'D', 'D', 'C', 'B', 'D', 'A', 'C', 'C', 'D', 'B', 'B', 'B', 'C', 'C', 'B', 'D', 'A', 'C', 'A', 'C', 'A', 'D', 'D']}
Now, we can use collections.Counter to count up answers.
from collections import Counter

# Same mapping as before, but each student's answers are tallied:
# student id -> Counter of answer letters ('' counts skipped questions).
info = dict(
    (fields[0], Counter(fields[1:]))
    for fields in (line.split(',') for line in data.strip().split('\n'))
)
# {'N00000047': Counter({'D': 8, 'C': 7, 'B': 4, '': 4, 'A': 2}),
# 'N00000048': Counter({'D': 8, 'B': 6, 'A': 6, 'C': 4, '': 1}),
# 'N00000049': Counter({'D': 7, 'C': 6, 'A': 5, 'B': 4, '': 3}),
# 'N00000050': Counter({'D': 8, 'A': 7, 'B': 4, '': 3, 'C': 3}),
# 'N00000051': Counter({'B': 7, 'A': 7, 'C': 5, 'D': 4, '': 2}),
# 'N00000052': Counter({'A': 7, 'D': 7, 'B': 6, 'C': 3, '': 2}),
# 'N00000053': Counter({'D': 7, 'C': 7, 'B': 6, 'A': 5})}
From here, finding the statistical data you're looking for should be much easier. For instance:
(Requires Python 3.8+ for := operator.)
# Builds one summary dict per input line, keyed by the line's first field
# (bound to `q` here; in the sample data that field is the student code).
# The inner dict's values are evaluated top to bottom, so `c` (the Counter of
# the line's answers) is bound before `s` reads c[''], and `s` (the number of
# blank answers) is bound before the percentage divides it by len(a).
{q: {'all': (c := Counter(a)),
'skipped': (s := c['']),
'percentage': s / len(a)}
for line in data.strip().split('\n')
for q, *a in [line.split(',')]}
# {'N00000047': {'all': Counter({'D': 8, 'C': 7, 'B': 4, '': 4, 'A': 2}), 'skipped': 4, 'percentage': 0.16},
# 'N00000048': {'all': Counter({'D': 8, 'B': 6, 'A': 6, 'C': 4, '': 1}), 'skipped': 1, 'percentage': 0.04},
# 'N00000049': {'all': Counter({'D': 7, 'C': 6, 'A': 5, 'B': 4, '': 3}), 'skipped': 3, 'percentage': 0.12},
# 'N00000050': {'all': Counter({'D': 8, 'A': 7, 'B': 4, '': 3, 'C': 3}), 'skipped': 3, 'percentage': 0.12},
# 'N00000051': {'all': Counter({'B': 7, 'A': 7, 'C': 5, 'D': 4, '': 2}), 'skipped': 2, 'percentage': 0.08},
# 'N00000052': {'all': Counter({'A': 7, 'D': 7, 'B': 6, 'C': 3, '': 2}), 'skipped': 2, 'percentage': 0.08},
# 'N00000053': {'all': Counter({'D': 7, 'C': 7, 'B': 6, 'A': 5}), 'skipped': 0, 'percentage': 0.0}}
Something like?:
cat skipped.csv
N00000047,B,,D,C,C,B,D,D,C,C,D,,A,B,D,C,,D,A,C,,D,B,D,C
N00000048,B,A,D,D,C,B,,D,C,C,D,B,A,B,A,D,B,D,A,C,A,A,B,D,D
N00000049,A,,D,D,C,B,D,,C,C,D,B,,B,A,C,C,D,A,C,A,A,B,D,D
N00000050,,C,,D,,D,D,A,C,A,A,B,A,B,A,D,B,D,A,C,D,A,B,D,D
N00000051,B,A,B,,C,B,D,A,C,C,D,D,A,B,A,C,B,C,A,,A,A,B,D,B
N00000052,B,A,D,D,,B,D,A,D,,D,B,A,B,A,C,B,C,A,C,A,A,B,D,D
N00000053,B,A,D,D,C,B,D,A,C,C,D,B,B,B,C,C,B,D,A,C,A,C,A,D,D
import csv
from collections import Counter

with open("skipped.csv", "r", newline="") as csv_file:
    l = []
    for fields in csv.reader(csv_file):
        # The first field is the id; what remains in `fields` are the answers.
        entry = {"q": fields.pop(0)}
        tally = Counter(fields)
        # In Python 3.10+ tally.total() can replace the sum() below.
        n_skipped = tally['']
        entry["skipped"] = n_skipped
        entry["percentage"] = n_skipped / sum(tally.values())
        l.append(entry)

# Most-skipped first; sort() is stable, so ties keep file order.
l.sort(key=lambda entry: entry['skipped'], reverse=True)
l
[{'q': 'N00000047', 'skipped': 4, 'percentage': 0.16},
{'q': 'N00000049', 'skipped': 3, 'percentage': 0.12},
{'q': 'N00000050', 'skipped': 3, 'percentage': 0.12},
{'q': 'N00000051', 'skipped': 2, 'percentage': 0.08},
{'q': 'N00000052', 'skipped': 2, 'percentage': 0.08},
{'q': 'N00000048', 'skipped': 1, 'percentage': 0.04},
{'q': 'N00000053', 'skipped': 0, 'percentage': 0.0}]

Data wrangling in Python, calculate value from some conditions

I have a dataframe in Python below:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
df.head()
I want to convert those values (string value: C, F, and Z) into some categories with this condition: if values in column CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, ......., CRDACCT_DLQ_CYC_12_MNTH_AGO consist:
C = 0
F = 0
Z = 0
else value = value
#Convert value
df = df.replace({'C': 0, 'F': 0, 'Z': 0,' ':0}).astype(int)
Then, I want to create a new column with the name of MSD. MSD stands for Month Since Delinquent. MSD is calculated by identifying each of 12 columns CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, .......up until CRDACCT_DLQ_CYC_12_MNTH_AGO with this kind of condition:
If value in CRDACCT_DLQ_CYC_1_MNTH_AGO > 1 then MSD = 1, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_2_MNTH_AGO > 1 then MSD = 2, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_3_MNTH_AGO > 1 then MSD = 3, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_4_MNTH_AGO > 1 then MSD = 4, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_5_MNTH_AGO > 1 then MSD = 5, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_6_MNTH_AGO > 1 then MSD = 6, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_7_MNTH_AGO > 1 then MSD = 7, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_8_MNTH_AGO > 1 then MSD = 8, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_9_MNTH_AGO > 1 then MSD = 9, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_10_MNTH_AGO > 1 then MSD = 10, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_11_MNTH_AGO > 1 then MSD = 11, otherwise MSD=0 or
If value in CRDACCT_DLQ_CYC_12_MNTH_AGO > 1 then MSD = 12, otherwise MSD=0
Note: otherwise if value 1 and 0, then MSD = 0.
For example:
index 0, MSD =1,because value 3 > 1 is in CRDACCT_DLQ_CYC_1_MNTH_AGO
(we no need to check CRDACCT_DLQ_CYC_2_MNTH_AGO > 1 because we have
found month since delinquent in CRDACCT_DLQ_CYC_1_MNTH_AGO) , hence
MSD is in 1 MNTH AGO
index 1, MSD=1,
index 2, MSD=2,
index 3, MSD=2, because value 3 > 1 is in
CRDACCT_DLQ_CYC_2_MNTH_AGO, hence MSD is in 2 MNTH AGO
index 4, MSD=2
Note: by checking each 12 columns with those conditions, If all values = 0 in each column CRDACCT_DLQ_CYC_1_MNTH_AGO, .....and CRDACCT_DLQ_CYC_12_MNTH_AGO, then MSD should be = 0.
Generally it is to check value > 1 in each 12 columns then determine the MSD value based on column name CRDACCT_DLQ_CYC_x_MNTH_AGO, x will be the value of MSD if > 1.
It ain't pretty but this one-liner should do the trick ;)
# `df > 1` flags the cells that count as delinquent. For each row, idxmax()
# returns the label of the first maximum, i.e. the first flagged column, and
# field 3 of the '_'-split label ('CRDACCT_DLQ_CYC_<n>_MNTH_AGO') is <n>.
# Rows with no flagged cell (row.sum() == 0) get MSD = 0.
df['MSD'] = (df > 1).astype(int).apply(lambda row: int(row.idxmax().split('_')[3]) if row.sum() >=1 else 0, axis=1)
basically - check which values are over 1, get the first column for each row which is above one (the MSD as you defined it), and don't forget to check the edge case when it is 0.

How can I rotate a list right and left?

I have been trying to rotate a list left and right in python
def rotate(l, r):
return l[r:] + l[:r]
l = eval(input())
r = int(input())
print(rotate(l, r))
but if i give input list as ['A','B','C','D',1,2,3,4,5] and r = -34 I'm getting output as
['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
but actual output is this :
['C', 'D', 1, 2, 3, 4, 5, 'A', 'B']
Can anyone tell how can I do it?
First you could use print() and test it for different values
def rotate(l, r):
return l[r:] + l[:r]
l = ['A','B','C','D',1,2,3,4,5]
print('len(l):', len(l))
for r in range(0, -34, -1):
print(f"{r:3}", rotate(l, r))
And you see
len(l): 9
0 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-1 [5, 'A', 'B', 'C', 'D', 1, 2, 3, 4]
-2 [4, 5, 'A', 'B', 'C', 'D', 1, 2, 3]
-3 [3, 4, 5, 'A', 'B', 'C', 'D', 1, 2]
-4 [2, 3, 4, 5, 'A', 'B', 'C', 'D', 1]
-5 [1, 2, 3, 4, 5, 'A', 'B', 'C', 'D']
-6 ['D', 1, 2, 3, 4, 5, 'A', 'B', 'C']
-7 ['C', 'D', 1, 2, 3, 4, 5, 'A', 'B']
-8 ['B', 'C', 'D', 1, 2, 3, 4, 5, 'A']
-9 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-10 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-11 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-12 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-13 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-14 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-15 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-16 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-17 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-18 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-19 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-20 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-21 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-22 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-23 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-24 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-25 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-26 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-27 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-28 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-29 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-30 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-31 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-32 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-33 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
When the absolute value of r is bigger than len(l), it doesn't work as you would expect.
It gets empty list + full list or full list + empty list
The same problem is with +34 and -34.
Because rotating by r=len(l), r=len(l)*2, ..., r=len(l)*n gives back the same list, you can use modulo (r % len(l)) to get a value smaller than len(l) and get what you need.
def rotate(l, r):
    """Return a copy of ``l`` rotated by ``r`` positions.

    A positive ``r`` moves the first ``r`` items to the end (rotate left);
    a negative ``r`` moves the last ``-r`` items to the front (rotate right).
    ``r`` may be any integer: it is reduced modulo ``len(l)`` first, so
    ``rotate(l, -34)`` on a 9-item list behaves like ``rotate(l, 2)``.
    """
    if not l:
        # Nothing to rotate, and r % 0 would raise ZeroDivisionError.
        return l[:]
    r = r % len(l)  # Python's % always yields 0 <= r < len(l) here
    return l[r:] + l[:r]
l = ['A','B','C','D',1,2,3,4,5]
print('len(l):', len(l))
for r in range(0, -34, -1):
print(f"{r:3}", rotate(l, r))
Result:
len(l): 9
0 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-1 [5, 'A', 'B', 'C', 'D', 1, 2, 3, 4]
-2 [4, 5, 'A', 'B', 'C', 'D', 1, 2, 3]
-3 [3, 4, 5, 'A', 'B', 'C', 'D', 1, 2]
-4 [2, 3, 4, 5, 'A', 'B', 'C', 'D', 1]
-5 [1, 2, 3, 4, 5, 'A', 'B', 'C', 'D']
-6 ['D', 1, 2, 3, 4, 5, 'A', 'B', 'C']
-7 ['C', 'D', 1, 2, 3, 4, 5, 'A', 'B']
-8 ['B', 'C', 'D', 1, 2, 3, 4, 5, 'A']
-9 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-10 [5, 'A', 'B', 'C', 'D', 1, 2, 3, 4]
-11 [4, 5, 'A', 'B', 'C', 'D', 1, 2, 3]
-12 [3, 4, 5, 'A', 'B', 'C', 'D', 1, 2]
-13 [2, 3, 4, 5, 'A', 'B', 'C', 'D', 1]
-14 [1, 2, 3, 4, 5, 'A', 'B', 'C', 'D']
-15 ['D', 1, 2, 3, 4, 5, 'A', 'B', 'C']
-16 ['C', 'D', 1, 2, 3, 4, 5, 'A', 'B']
-17 ['B', 'C', 'D', 1, 2, 3, 4, 5, 'A']
-18 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-19 [5, 'A', 'B', 'C', 'D', 1, 2, 3, 4]
-20 [4, 5, 'A', 'B', 'C', 'D', 1, 2, 3]
-21 [3, 4, 5, 'A', 'B', 'C', 'D', 1, 2]
-22 [2, 3, 4, 5, 'A', 'B', 'C', 'D', 1]
-23 [1, 2, 3, 4, 5, 'A', 'B', 'C', 'D']
-24 ['D', 1, 2, 3, 4, 5, 'A', 'B', 'C']
-25 ['C', 'D', 1, 2, 3, 4, 5, 'A', 'B']
-26 ['B', 'C', 'D', 1, 2, 3, 4, 5, 'A']
-27 ['A', 'B', 'C', 'D', 1, 2, 3, 4, 5]
-28 [5, 'A', 'B', 'C', 'D', 1, 2, 3, 4]
-29 [4, 5, 'A', 'B', 'C', 'D', 1, 2, 3]
-30 [3, 4, 5, 'A', 'B', 'C', 'D', 1, 2]
-31 [2, 3, 4, 5, 'A', 'B', 'C', 'D', 1]
-32 [1, 2, 3, 4, 5, 'A', 'B', 'C', 'D']
-33 ['D', 1, 2, 3, 4, 5, 'A', 'B', 'C']
BTW:
Without modulo you would have to use for-loops with [1:], [:1] or [-1:], [:-1] - but that needs many moves, so it may need more time and memory (though for a small list the difference is not noticeable).
def rotate(l, r):
    """Rotate ``l`` by ``r`` positions, one step at a time (no modulo).

    ``r >= 0`` rotates left ``r`` times; ``r < 0`` rotates right ``-r`` times.
    Each step builds a new list, so the cost grows with ``abs(r)``; for
    ``r == 0`` the input object itself is returned.
    """
    if r >= 0:
        for _ in range(0, r, 1):
            l = l[1:] + l[:1]  # move the first item to the end
    else:
        for _ in range(0, r, -1):
            l = l[-1:] + l[:-1]  # move the last item to the front
    return l
l = ['A','B','C','D',1,2,3,4,5]
print('len(l):', len(l))
#for r in range(0, -34, -1):
# print(f"{r:3}", rotate(l, r))
for r in range(0, 34, 1):
print(f"{r:3}", rotate(l, r))
The same with one for-loop
def rotate(l, r):
    """Rotate ``l`` by ``r`` positions using a single step loop.

    The step ``s`` is +1 for a left rotation (``r >= 0``) and -1 for a right
    rotation, so the same slice expression serves both directions:
    ``l[1:] + l[:1]`` or ``l[-1:] + l[:-1]``.
    """
    if r >= 0:
        s = 1
    else:
        s = -1
    # range(0, r, s) runs abs(r) times in either direction.
    for _ in range(0, r, s):
        l = l[s:] + l[:s]
    return l
If r can be bigger than the length of the list, you need to add the modulo operator, as @tim-roberts mentioned:
def rotate(l, r):
    """Return a copy of ``l`` rotated left by ``r`` positions.

    ``r`` is reduced modulo ``len(l)``, so values larger than the list and
    negative values (which rotate right) are both handled.
    """
    if not l:
        # An empty sequence has nothing to rotate; r % 0 would raise.
        return l[:]
    r = r % len(l)
    return l[r:] + l[:r]
Outputs
l = [1,2,3]
print(rotate(l,0))
[1, 2, 3]
print(rotate(l,1))
[2, 3, 1]
print(rotate(l,-1))
[3, 1, 2]
print(rotate(l,4))
[2, 3, 1]
print(rotate(l,-4))
[3, 1, 2]
(personally I'd also turn around the rotation direction, using e.g. -r)

Data Wrangling in Python to Create a List

I have a dataframe in Python below:
import pandas as pd
df = pd.DataFrame({
'CRDACCT_DLQ_CYC_1_MNTH_AGO' : [3, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_2_MNTH_AGO': [4, 3, 3, 3, 3, 3, 2, 0, 5, 4, 3, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2],
'CRDACCT_DLQ_CYC_3_MNTH_AGO': [8, 7, 6, 5, 4, 3, 2, 'F', 'F', 0, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'C', 'F', 'F'],
'CRDACCT_DLQ_CYC_4_MNTH_AGO' : [0, 2, 'F', 'F', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'F', 'C', 'F'],
'CRDACCT_DLQ_CYC_5_MNTH_AGO' : [2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_6_MNTH_AGO' : [2, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 0, 2, 0, 2, 0],
'CRDACCT_DLQ_CYC_7_MNTH_AGO' : [3, 3, 2, 'C', 'C', 'C', 'F', 0, 6, 5, 4, 3, 2, 2, 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C'],
'CRDACCT_DLQ_CYC_8_MNTH_AGO' : [5, 4, 4, 3, 3, 2, 3, 2, 2, 2, 1, 2, 0, 2, 'C', 'C', 0, 2, 2, 2, 'C', 'C', 0, 'Z'],
'CRDACCT_DLQ_CYC_9_MNTH_AGO' : [2, 2, 'C', 0, 2, 0, 2, 'C', 'C', 'C', 'C', 'C', 0, 3, 2, 'C', 'F', 'C', 'F', 'F', 'F', 'F', 'F', 'F'],
'CRDACCT_DLQ_CYC_10_MNTH_AGO' : [5, 4, 3, 2, 3, 2, 0, 2, 0, 2, 'C', 'C', 'F', 2, 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'C'],
'CRDACCT_DLQ_CYC_11_MNTH_AGO' : [4, 3, 2, 'F', 2, 0, 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z', 'Z'],
'CRDACCT_DLQ_CYC_12_MNTH_AGO' : ['F', 8, 7, 6, 5, 4, 3, 2, 'C', 'C', 'C', 0, 2, 'C', 'C', 0, 2, 0, 3, 2, 'C', 'C', 'F', 2]
})
df.head()
I want to transform those values (string value: C, F, and Z) into some categories with this condition:
if values in column CRDACCT_DLQ_CYC_1_MNTH_AGO, CRDACCT_DLQ_CYC_2_MNTH_AGO, ......., CRDACCT_DLQ_CYC_12_MNTH_AGO consist:
C = -1
F = -2
Z = -3
else value = value
Then I transpose the table to identify Month Since Dlq (MSD).
dfT =pd.DataFrame(df.T).reset_index(inplace=False)
dfT
I want to create a list with the name of MSD. MSD is identified for value if it is greater than 1 ( value > 1). For example, in index 2 CRDACCT_DLQ_CYC_1_MNTH_AGO = C or after it has changed = -1 which is not greater than 1. Then, check CRDACCT_DLQ_CYC_2_MNTH_AGO is greater than 1? CRDACCT_DLQ_CYC_2_MNTH_AGO = 3 is greater than 1. Hence, the MSD is 2 because it's in CRDACCT_DLQ_CYC_2_MNTH_AGO. Detail flow chart & overview table for identification .
The MSD value is between 1 and 12 depends on i in CRDACCT_DLQ_CYC_i_MNTH_AGO, for i = 1,2,3,...,12.
So the final result is a MSD list with 24 value, identified for each index 0 -23.
Is this what you are looking for?
# From your dataframe (original orientation, status letters still in place)
numeric = df.apply(pd.to_numeric, errors='coerce')  # 'C' / 'F' / 'Z' become NaN
delinquent = numeric.gt(1)  # strictly greater than 1; NaN compares as False
# First flagged column of every row, then the month number out of its label.
first_hit = delinquent.idxmax(axis=1)
month = first_hit.str.extract(r'CYC_(\d+)_MNTH', expand=False).astype(int)
# idxmax() falls back to the first column when a row has no True at all,
# which would wrongly report month 1; such rows get 0 instead.
MSD = month.where(delinquent.any(axis=1), 0).tolist()
print(MSD)
# Output:
[1, 1, 2, 2, 2, 2, 2, 8, 2, 2, 2, 2, 7, 2, 2, 2, 2, 2, 2, 8, 2, 2, 6, 2]

Natural Join Implementation Python

I am working on implementing natural join in python. The first two lines show the tables attributes and the next two lines each tables' tuples or rows.
Expected Output:
[['A', 1, 'A', 'a', 'A'],
['A', 1, 'A', 'a', 'Y'],
['A', 1, 'Y', 'a', 'A'],
['A', 1, 'Y', 'a', 'Y'],
['S', 2, 'B', 'b', 'S']]
And what I got:
[['A', 1, 'A', 'a', 'A', 'Y'],
['A', 1, 'A', 'a', 'A', 'Y']]
I have looked through the code and everything seems to be right, I would appreciate any help.
t1atts = ('A', 'B', 'C', 'D')
t2atts = ('B', 'D', 'E')
t1tuples = [['A', 1, 'A', 'a'],
['B', 2, 'Y', 'a'],
['Y', 4, 'B', 'b'],
['A', 1, 'Y', 'a'],
['S', 2, 'B', 'b']]
t2tuples = [[1, 'a', 'A'],
[3, 'a', 'B'],
[1, 'a', 'Y'],
[2, 'b', 'S'],
[3, 'b', 'E']]
def findindices(t1atts, t2atts):
t1index=[]
t2index=[]
for index, att in enumerate(t1atts):
for index2, att2 in enumerate(t2atts):
if att == att2:
t1index.append(index)
t2index.append(index2)
return t1index, t2index
def main():
tpl=0; tpl2=0; i=0; j=0; count=0; result=[]
t1index, t2index = findindices(t1atts, t2atts)
for tpl in t1tuples:
while tpl2 in range(len(t2tuples)):
i=0; j=0
while (i in range(len(t1index))) and (j in range(len(t2index))):
if tpl[t1index[i]] != t2tuples[tpl2][t2index[j]]:
i=len(t1index)
j=len(t1index)
else:
count+=1
i+=1
j+=1
if count == len(t1index):
extravals = [val for index, val in enumerate(t2tuples[tpl2]) if index not in t2index]
temp = tpl
tpl += extravals
result.append(tpl)
tpl = temp
count=0
tpl2+=1
print result
Here's what I came up with. I'd do some more refactoring, etc before calling it done
import pprint

t1atts = ('A', 'B', 'C', 'D')
t2atts = ('B', 'D', 'E')
t1tuples = [
    ['A', 1, 'A', 'a'],
    ['B', 2, 'Y', 'a'],
    ['Y', 4, 'B', 'b'],
    ['A', 1, 'Y', 'a'],
    ['S', 2, 'B', 'b']]
t2tuples = [
    [1, 'a', 'A'],
    [3, 'a', 'B'],
    [1, 'a', 'Y'],
    [2, 'b', 'S'],
    [3, 'b', 'E']]

t1columns = set(t1atts)
t2columns = set(t2atts)
# Attribute name -> position inside a row of the respective table.
t1map = {k: i for i, k in enumerate(t1atts)}
t2map = {k: i for i, k in enumerate(t2atts)}
join_on = t1columns & t2columns
# Table-2 attributes that are not join keys, kept in declaration order.
# (A set difference would make the output column order arbitrary as soon
# as table 2 contributes two or more extra columns.)
diff = [name for name in t2atts if name not in join_on]


def match(row1, row2):
    """Return True when both rows agree on every shared attribute."""
    return all(row1[t1map[rn]] == row2[t2map[rn]] for rn in join_on)


# Nested-loop join: each matching pair yields the table-1 row followed by
# the table-2 values of the non-shared attributes.
results = []
for t1row in t1tuples:
    for t2row in t2tuples:
        if match(t1row, t2row):
            results.append(t1row + [t2row[t2map[rn]] for rn in diff])

pprint.pprint(results)
And I get the expected results:
[['A', 1, 'A', 'a', 'A'],
['A', 1, 'A', 'a', 'Y'],
['A', 1, 'Y', 'a', 'A'],
['A', 1, 'Y', 'a', 'Y'],
['S', 2, 'B', 'b', 'S']]
OK, here is the solution; please verify it and let me know if it works for you.
I changed the naming a little bit to make it easier for me to understand:
#!/usr/bin/python
table1 = ('A', 'B', 'C', 'D')
table2 = ('B', 'D', 'E')
row1 = [['A', 1, 'A', 'a'],
        ['B', 2, 'Y', 'a'],
        ['Y', 4, 'B', 'b'],
        ['A', 1, 'Y', 'a'],
        ['S', 2, 'B', 'b']]
row2 = [[1, 'a', 'A'],
        [3, 'a', 'B'],
        [1, 'a', 'Y'],
        [2, 'b', 'S'],
        [3, 'b', 'E']]


def findindices(table1, table2):
    """Return two parallel lists with the positions of the shared attributes.

    The n-th entry of the first list and the n-th entry of the second list
    refer to the same attribute name, in ``table1`` and ``table2``
    respectively. Attributes are taken in ``table1`` order so the result is
    deterministic.
    """
    shared = [att for att in table1 if att in table2]
    tup_index1 = [table1.index(x) for x in shared]
    tup_index2 = [table2.index(x) for x in shared]
    return tup_index1, tup_index2


def main():
    """Natural-join ``row1`` with ``row2`` and return the joined rows.

    Each result row is the ``row1`` tuple followed by every ``row2`` value
    whose attribute is not one of the join attributes.
    """
    final_lol = list()
    tup_index1, tup_index2 = findindices(table1, table2)
    # Materialised as a list: on Python 3 a bare zip() is a one-shot
    # iterator and would be empty for every pair after the first one.
    merge_tup = list(zip(tup_index1, tup_index2))
    # Positions of the table-2 attributes that table 1 does not already have.
    extra_cols = [i for i in range(len(table2)) if i not in tup_index2]
    for tup1 in row1:
        for tup2 in row2:
            if all(tup1[i] == tup2[j] for i, j in merge_tup):
                final_lol.append(list(tup1) + [tup2[i] for i in extra_cols])
    return final_lol


if __name__ == '__main__':
    import pprint
    pprint.pprint(main())
Output:
[['A', 1, 'A', 'a', 'A'],
['A', 1, 'A', 'a', 'Y'],
['A', 1, 'Y', 'a', 'A'],
['A', 1, 'Y', 'a', 'Y'],
['S', 2, 'B', 'b', 'S']]

Categories