I have a data frame containing a time series of integers. I want to group it by year and, for each year, count how many runs of consecutive entries with the same sign have a sum of absolute values greater than or equal to 5.
>>> import pandas as pd
>>> import numpy as np
>>> l = [1, -1, -4, 2, 2, 4, 5, 1, -3, -4]
>>> idx1 = pd.date_range('2019-01-01',periods=5)
>>> idx2 = pd.date_range('2020-01-01',periods=5)
>>> idx = idx1.union(idx2)
>>> df = pd.DataFrame(l, index=idx, columns=['a'])
>>> df
            a
2019-01-01  1
2019-01-02 -1
2019-01-03 -4    # 2019 count = 1: abs(-1) + abs(-4) = 5 >= 5
2019-01-04  2
2019-01-05  2
2020-01-01  4
2020-01-02  5
2020-01-03  1    # 2020 count = 1: abs(4) + abs(5) + abs(1) = 10 >= 5
2020-01-04 -3
2020-01-05 -4    # 2020 count = 2: abs(-3) + abs(-4) = 7 >= 5
The desired output is:
2019 1
2020 2
My approach to solve this problem is to chain groupby and apply. Below are the implementations of the functions I created to pass to groupby and apply respectively.
>>> def get_year(x):
        return x.year
>>> def count(group, t=5):
        c = 0  # counter
        s = 0  # running sum of consecutive values with the same sign
        for i in range(1, len(group)):
            if np.sign(group['a'].iloc[i-1]) == np.sign(group['a'].iloc[i]):
                if s == 0:
                    s = group['a'].iloc[i-1] + group['a'].iloc[i]
                else:
                    s += group['a'].iloc[i]
                if i == (len(group) - 1):
                    # streak runs to the end of the group: count it only if it meets the threshold
                    return c + 1 if abs(s) >= t else c
            elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) >= t):
                # if the streak of same-sign values is broken and abs(s) >= t, increment c and reset s
                c += 1
                s = 0
            elif (np.sign(group['a'].iloc[i-1]) != np.sign(group['a'].iloc[i])) and (abs(s) < t):
                # if the streak of same-sign values is broken and abs(s) < t, just reset s
                s = 0
        return c
>>> by_year = df.groupby(get_year)
>>> by_year.apply(count)
2019 1
2020 2
My question is:
Is there a more "pythonic" implementation of the above count function that produces the desired result but doesn't rely on for loops?
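One possible vectorized approach (my own sketch, not part of the original post): label each run of consecutive same-sign values with a run id derived from sign changes, sum the absolute values per year and run, and count the runs that meet the threshold. The size check mirrors the loop above, which only counts runs of at least two entries.
def count_runs(df, t=5):
    sign = np.sign(df['a'])
    run_id = (sign != sign.shift()).cumsum()        # new id at every sign change
    g = df['a'].abs().groupby([df.index.year, run_id])
    # a run counts if it has at least two entries and its absolute sum is >= t
    return ((g.sum() >= t) & (g.size() >= 2)).groupby(level=0).sum()

count_runs(df)
# 2019    1
# 2020    2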
I'm using Python 3. I would like to remove incorrect ids from my dataframe column.
Example:
d = {'name': ['a', 'b', 'c', 'd'], 'id': [9356622,9030321,9408530, 1112200]}
df = pd.DataFrame(data=d)
I need to verify each id by multiplying each of the first six digits by a factor of 2 to 7 corresponding to its position, counted from right to left. For example, for id 9356622:
(9×7) + (3×6) + (5×5) + (6×4) + (6×3) + (2×2) = 152. The last digit of this sum is 2, which matches the last digit of the id 9356622, so this id is correct. In other words, I need to compare the last digit of the sum with the last digit of the id after performing this calculation.
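For illustration, here is a minimal single-id version of that check (my own sketch; the function name is made up):
def id_is_valid(n):
    digits = [int(d) for d in str(n)]
    # weight the first six digits by 7, 6, 5, 4, 3, 2 and compare the last
    # digit of the sum with the seventh digit of the id
    total = sum(d * w for d, w in zip(digits[:6], range(7, 1, -1)))
    return total % 10 == digits[6]

id_is_valid(9356622)  # True
id_is_valid(1112200)  # False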
Input data:
>>> df
name id
0 a 9356622
1 b 9030321
2 c 9408530
3 d 1112200
Explode the id numbers to digits:
df1 = df['id'].astype(str).map(list).apply(pd.Series).astype(int)
>>> df1
0 1 2 3 4 5 6
0 9 3 5 6 6 2 2 # 152 -> modulo(10) = 2 -> True
1 9 0 3 0 3 2 1 # 91 -> modulo(10) = 1 -> True
2 9 4 0 8 5 3 0 # 140 -> modulo(10) = 0 -> True
3 1 1 1 2 2 0 0 # 32 -> modulo(10) = 2 -> False
Now check your math operation:
>>> df1.iloc[:, :6].mul(range(7, 1, -1)).sum(axis=1).mod(10) == df1.iloc[:, 6]
0 True
1 True
2 True
3 False
dtype: bool
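To actually remove the rows with invalid ids (a small follow-up, not part of the original answer), the boolean Series above can be used as a mask on the original df:
>>> valid = df1.iloc[:, :6].mul(range(7, 1, -1)).sum(axis=1).mod(10) == df1.iloc[:, 6]
>>> df[valid]
  name       id
0    a  9356622
1    b  9030321
2    c  9408530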
Another option is a regex-based check on the id as a string:
import re

def fun_IMO(string):
    try:
        pattern = r"([0-9]{7})"
        regexFinder = re.compile(pattern)
        string = string.lower()
        res = regexFinder.search(string)
        if res.groups():
            try:
                numberIMO = res.groups()[0]
                numberIMO_calc = (int(numberIMO[0])*7 + int(numberIMO[1])*6 + int(numberIMO[2])*5
                                  + int(numberIMO[3])*4 + int(numberIMO[4])*3 + int(numberIMO[5])*2)
                # the last digit of the weighted sum must equal the last digit of the id
                return str(numberIMO_calc)[-1] == numberIMO[6]
            except Exception as e:
                return e
    except Exception as e:
        return e
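A possible way to apply it to the dataframe column (my own note; the ids have to be converted to strings first, and only rows returning True are kept):
df[df['id'].astype(str).apply(fun_IMO)]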
I have a dataframe like this:
ID, Values
1 10, 11, 12, 13
2 14
3 15, 16, 17, 18
I want to create a new dataframe like this:
ID Col1 Col2
1 10 11
1 11 12
1 12 13
2 14
3 15 16
3 16 17
3 17 18
Please help me figure out how to do this.
Note: the entries in the Values column of the input df are of str type.
Use a list comprehension with flattening, with one small change to the linked recipe - if i > 0: becomes if i == 2: - so that one-element values are handled correctly:
from collections import deque

#https://stackoverflow.com/a/36586925
def chunks(iterable, chunk_size=2, overlap=1):
    # we'll use a deque to hold the values because it automatically
    # discards any extraneous elements if it grows too large
    if chunk_size < 1:
        raise Exception("chunk size too small")
    if overlap >= chunk_size:
        raise Exception("overlap too large")
    queue = deque(maxlen=chunk_size)
    it = iter(iterable)
    i = 0
    try:
        # start by filling the queue with the first group
        for i in range(chunk_size):
            queue.append(next(it))
        while True:
            yield tuple(queue)
            # after yielding a chunk, get enough elements for the next chunk
            for i in range(chunk_size - overlap):
                queue.append(next(it))
    except StopIteration:
        # if the iterator is exhausted, yield any remaining elements
        i += overlap
        if i == 2:
            yield tuple(queue)[-i:]
L = [[x] + list(z) for x, y in zip(df['ID'], df['Values']) for z in (chunks(y.split(', ')))]
df = pd.DataFrame(L, columns=['ID','Col1','Col2']).fillna('')
print (df)
ID Col1 Col2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18
I tried a slightly different approach: a function that returns the numbers in pairs from the initial comma-separated string.
def pairup(mystring):
    """Function to return paired up list from string"""
    mylist = mystring.split(',')
    if len(mylist) == 1:
        return [mylist]
    splitlist = []
    for index, item in enumerate(mylist):
        try:
            splitlist.append([mylist[index], mylist[index + 1]])
        except IndexError:
            pass
    return splitlist
Now let's create the new data frame.
# https://stackoverflow.com/a/39955283/3679377
new_df = df[['ID']].join(
    df.Values.apply(lambda x: pd.Series(pairup(x)))
      .stack()
      .apply(lambda x: pd.Series(x))
      .fillna("")
      .reset_index(level=1, drop=True),
    how='left').reset_index(drop=True)
new_df.columns = ['ID', 'Col 1', 'Col 2']
Here's the output of print(new_df).
ID Col 1 Col 2
0 1 10 11
1 1 11 12
2 1 12 13
3 2 14
4 3 15 16
5 3 16 17
6 3 17 18
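For comparison, a simpler sketch (my own suggestion, not from either answer) that builds the pairs with zip over adjacent elements, assuming df is the original input frame:
rows = []
for id_, vals in zip(df['ID'], df['Values'].str.split(', ')):
    if len(vals) == 1:
        rows.append([id_, vals[0], ''])  # single value: leave Col2 empty
    else:
        rows.extend([id_, a, b] for a, b in zip(vals, vals[1:]))

pd.DataFrame(rows, columns=['ID', 'Col1', 'Col2'])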
I'm looking to find the max run of consecutive zeros in a DataFrame with the result grouped by user. I'm interested in running the RLE on usage.
sample input:
user--day--usage
A-----1------0
A-----2------0
A-----3------1
B-----1------0
B-----2------1
B-----3------0
Desired output:
user---longest_run
A - - - - 2
B - - - - 1
mydata <- mydata[order(mydata$user, mydata$day),]
user <- unique(mydata$user)
d2 <- data.frame(matrix(NA, ncol = 2, nrow = length(user)))
names(d2) <- c("user", "longest_no_usage")
d2$user <- user
for (i in user) {
  if (0 %in% mydata$usage[mydata$user == i]) {
    run <- rle(mydata$usage[mydata$user == i]) # run-length encoding
    d2$longest_no_usage[d2$user == i] <- max(run$length[run$values == 0])
  } else {
    d2$longest_no_usage[d2$user == i] <- 0 # some users did not have no-usage days
  }
}
d2 <- d2[order(-d2$longest_no_usage),]
This works in R, but I want to do the same thing in Python, and I'm totally stumped.
First use groupby with size, grouping by the user column, the usage column, and a helper Series that labels runs of consecutive values:
print (df)
user day usage
0 A 1 0
1 A 2 0
2 A 3 1
3 B 1 0
4 B 2 1
5 B 3 0
6 C 1 1
df1 = (df.groupby([df['user'],
                   df['usage'].rename('val'),
                   df['usage'].ne(df['usage'].shift()).cumsum()])
         .size()
         .to_frame(name='longest_run'))
print (df1)
longest_run
user val usage
A 0 1 2
1 2 1
B 0 3 1
5 1
1 4 1
C 1 6 1
Then filter only the zero rows, take the max per user, and reindex to append the users that have no zero runs:
df2 = (df1.query('val == 0')
          .max(level=0)
          .reindex(df['user'].unique(), fill_value=0)
          .reset_index())
print (df2)
user longest_run
0 A 2
1 B 1
2 C 0
Detail:
print (df['usage'].ne(df['usage'].shift()).cumsum())
0 1
1 1
2 2
3 3
4 4
5 5
6 6
Name: usage, dtype: int32
Get the max number of consecutive zeros in a series:
def max0(sr):
    # label positions with a cumulative sum of the non-zero indicator; each zero keeps
    # the label of the preceding non-zero value, so the most frequent label marks the
    # longest zero run (plus the non-zero element that started it, hence the -1)
    counts = (sr != 0).cumsum().value_counts()
    return counts.max() - (0 if counts.idxmax() == 0 else 1)

max0(pd.Series([1, 0, 0, 0, 0, 2, 3]))
4
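To answer the original per-user question with it (my own addition, assuming df has user and usage columns as in the question):
df.groupby('user')['usage'].apply(max0)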
I think the following does what you are looking for, where the consecutive_zero function is an adaptation of the top answer here.
Hope this helps!
import pandas as pd
from itertools import groupby
df = pd.DataFrame([['A', 1], ['A', 0], ['A', 0], ['B', 0], ['B', 1], ['C', 2]],
                  columns=["user", "usage"])

def len_iter(items):
    return sum(1 for _ in items)

def consecutive_zero(data):
    x = list(len_iter(run) for val, run in groupby(data) if val == 0)
    if len(x) == 0:
        return 0
    else:
        return max(x)
df.groupby('user').apply(lambda x: consecutive_zero(x['usage']))
Output:
user
A 2
B 1
C 0
dtype: int64
If you have a large dataset and speed is essential, you might want to try the high-performance pyrle library.
Setup:
# pip install pyrle
# or
# conda install -c bioconda pyrle
import numpy as np
np.random.seed(0)
import pandas as pd
from pyrle import Rle
size = int(1e7)
number = np.random.randint(2, size=size)
user = np.random.randint(5, size=size)
df = pd.DataFrame({"User": np.sort(user), "Number": number})
df
# User Number
# 0 0 0
# 1 0 1
# 2 0 1
# 3 0 0
# 4 0 1
# ... ... ...
# 9999995 4 1
# 9999996 4 1
# 9999997 4 0
# 9999998 4 0
# 9999999 4 1
#
# [10000000 rows x 2 columns]
Execution:
for u, udf in df.groupby("User"):
    r = Rle(udf.Number)
    is_0 = r.values == 0
    print("User", u, "Max", np.max(r.runs[is_0]))
# (Wall time: 1.41 s)
# User 0 Max 20
# User 1 Max 23
# User 2 Max 20
# User 3 Max 22
# User 4 Max 23
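If you need the result as a pandas object rather than printed output, one possible variation (my own sketch) collects the per-user maxima into a Series:
maxima = {}
for u, udf in df.groupby("User"):
    r = Rle(udf.Number)
    maxima[u] = r.runs[r.values == 0].max()

longest_runs = pd.Series(maxima, name="longest_run")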
I'm trying to bin a sample of observations into n discrete groups, then combine these groups until each subgroup has a minimum of 6 members. So far, I've generated bins and grouped my DataFrame into them:
# df is a DataFrame containing 135 measurements
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
grp.size()
1 4
2 1
3 2
4 3
5 2
6 8
7 7
8 6
9 19
10 12
11 13
12 12
13 7
14 12
15 12
16 2
17 3
18 6
19 3
21 1
So I can see that I need to combine groups 1 - 3, 3 - 5, and 16 - 21, while leaving the others intact, but I don't know how to do this programmatically.
You can do this:
df = pd.DataFrame(np.random.random_integers(1,200,135), columns=['heights'])
bins = np.linspace(df.heights.min(), df.heights.max(), 21)
grp = df.groupby(np.digitize(df.heights, bins))
sizes = grp.size()
def f(vals, max):
    sum = 0
    group = 1
    for v in vals:
        sum += v
        if sum <= max:
            yield group
        else:
            group += 1
            sum = v
            yield group
# I've changed 6 to 30 for the example because I don't have your original dataset
grp.size().groupby([g for g in f(sizes, 30)])
And if you do print(grp.size().groupby([g for g in f(sizes, 30)]).cumsum()) you will see that the cumulative sums are grouped as expected.
Also if you want to group the original values you can do something like:
dat = np.random.random_integers(0,200,135)
dat = np.array([78,116,146,111,147,78,14,91,196,92,163,144,107,182,58,89,77,134,
83,126,94,70,121,175,174,88,90,42,93,131,91,175,135,8,142,166,
1,112,25,34,119,13,95,182,178,200,97,8,60,189,49,94,191,81,
56,131,30,107,16,48,58,65,78,8,0,11,45,179,151,130,35,64,
143,33,49,25,139,20,53,55,20,3,63,119,153,14,81,93,62,162,
46,29,84,4,186,66,90,174,55,48,172,83,173,167,66,4,197,175,
184,20,23,161,70,153,173,127,51,186,114,27,177,96,93,105,169,158,
83,155,161,29,197,143,122,72,60])
df = pd.DataFrame({'heights':dat})
bins = np.digitize(dat,np.linspace(0,200,21))
grp = df.heights.groupby(bins)
m = 15  # you should put 6 here, the minimum
s = 0
c = 1
def f(x):
    global c, s
    res = pd.Series([c]*x.size, index=x.index)
    s += x.size
    if s > m:
        s = 0
        c += 1
    return res
g = grp.apply(f)
print(df.groupby(g).size())
# another way of doing the same, just a matter of taste
m = 15  # you should put 6 here, the minimum
s = 0
c = 1
def f2(x):
    global c, s
    res = [c]*x.size  # here is the main difference with f
    s += x.size
    if s > m:
        s = 0
        c += 1
    return res
g = grp.transform(f2)  # call it this way
print(df.groupby(g).size())
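For reference, a variant without the global counters (my own sketch, reusing bins and grp from the snippet above): precompute one label per bin from the bin sizes, then map the labels back onto the rows.
def merge_small_bins(sizes, min_size=6):
    # assign the same label to consecutive bins until their combined size
    # reaches min_size, then start a new label
    labels, current, total = [], 1, 0
    for s in sizes:
        labels.append(current)
        total += s
        if total >= min_size:
            current += 1
            total = 0
    return pd.Series(labels, index=sizes.index)

labels = merge_small_bins(grp.size(), min_size=6)
print(df.groupby(labels.loc[bins].values).size())

As with f and f2, the trailing label can still end up with fewer than min_size members; handling that would need one extra merge step at the end.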