calculate churn based on 21 days interval - python

I am calculating churn by combining a pandas dataframe of git logs with the `git show` command for a particular commit, to see exactly where the changes were made in terms of lines of code (LOC). However, I have not been able to calculate churn based on age, i.e. to count churn only when an engineer rewrites or deletes their own code that is less than 3 weeks old.
This is what I have done so far for a dataframe like the one below, processed commit by commit.
git logs dataframe
sha timestamp date author message body age insertion deletion filepath churn merges
1 1 cae635054 Sat Jun 26 14:51:23 2021 -0400 2021-06-26 18:51:23+00:00 Andrew Clark `act`: Resolve to return value of scope function (#21759) When migrating some internal tests I found it annoying that I couldn't -24 days +12:21:32.839997
2 21 cae635054 Sat Jun 26 14:51:23 2021 -0400 2021-06-26 18:51:23+00:00 Andrew Clark `act`: Resolve to return value of scope function (#21759) When migrating some internal tests I found it annoying that I couldn't -24 days +12:21:32.839997 31.0 0.0 packages/react-reconciler/src/__tests__/ReactIsomorphicAct-test.js 31.0
3 22 cae635054 Sat Jun 26 14:51:23 2021 -0400 2021-06-26 18:51:23+00:00 Andrew Clark `act`: Resolve to return value of scope function (#21759) When migrating some internal tests I found it annoying that I couldn't -24 days +12:21:32.839997 1.0 1.0 packages/react-test-renderer/src/ReactTestRenderer.js 0.0
4 23 cae635054 Sat Jun 26 14:51:23 2021 -0400 2021-06-26 18:51:23+00:00 Andrew Clark `act`: Resolve to return value of scope function (#21759) When migrating some internal tests I found it annoying that I couldn't -24 days +12:21:32.839997 24.0 14.0 packages/react/src/ReactAct.js 10.0
5 25 e2453e200 Fri Jun 25 15:39:46 2021 -0400 2021-06-25 19:39:46+00:00 Andrew Clark act: Add test for bypassing queueMicrotask (#21743) Test for fix added in #21740 -25 days +13:09:55.839997 50.0 0.0 packages/react-reconciler/src/__tests__/ReactIsomorphicAct-test.js 50.0
6 27 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 4.0 5.0 packages/react-devtools-shared/src/__tests__/FastRefreshDevToolsIntegration-test.js -1.0
7 28 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 4.0 4.0 packages/react-devtools-shared/src/__tests__/componentStacks-test.js 0.0
8 29 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 12.0 12.0 packages/react-devtools-shared/src/__tests__/console-test.js 0.0
9 30 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 7.0 6.0 packages/react-devtools-shared/src/__tests__/editing-test.js 1.0
10 31 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 47.0 42.0 packages/react-devtools-shared/src/__tests__/inspectedElement-test.js 5.0
11 32 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 7.0 6.0 packages/react-devtools-shared/src/__tests__/ownersListContext-test.js 1.0
12 33 73ffce1b6 Thu Jun 24 22:42:44 2021 -0400 2021-06-25 02:42:44+00:00 Brian Vaughn DevTools: Update tests to fix warnings/errors (#21748) Some new ones had slipped in (e.g. deprecated ReactDOM.render message from 18) -26 days +20:12:53.839997 22.0 21.0 packages/react-devtools-shared/src/__tests__/profilerContext-test.js 1.0
churn calculation
# Collect each distinct commit hash from the "sha" column of the git-log dataframe.
commits = df["sha"].unique().tolist()
# NOTE(review): `await` and `self` imply this loop lives inside an async method that is not shown here.
for commit in commits:
# calculate_churn returns a (contribution, churn) pair; each iteration overwrites the previous pair.
contribution, churn = await self.calculate_churn(commit)
async def calculate_churn(self, stream):
    """Return ``(contribution, churn)`` for the commit identified by *stream*.

    Runs ``git show --format= --unified=0 --no-prefix <stream>`` from inside
    ``app/git/react.git`` (resolved against the current working directory)
    and feeds the output to ``get_loc``.

    Args:
        stream: Commit identifier interpolated into the ``git show`` command.

    Returns:
        A ``(contribution, churn)`` tuple as computed by ``get_loc``.

    Raises:
        ValueError: If the git directory does not exist.
    """
    previous_base_dir = os.path.abspath("")
    git_dir = os.path.join(previous_base_dir, "app/git/react.git")
    try:
        os.chdir(git_dir)
    except FileNotFoundError as e:
        raise ValueError(e) from e
    try:
        cmd = f"git show --format= --unified=0 --no-prefix {stream}"
        results = get_proc_out([cmd])
        _files, contribution, churn = get_loc(results)
    finally:
        # Always go back to the previous path, even when git or the parser
        # raises; otherwise every later call resolves the repository path
        # against the wrong directory.
        os.chdir(previous_base_dir)
    return contribution, churn
def is_new_file(result, file):
    """Return the destination path if *result* is a ``+++`` header line.

    Args:
        result: One line of ``git show`` output.
        file: The path currently being processed.

    Returns:
        The path named on a ``+++ <path>`` line, otherwise *file* unchanged.
    """
    if not result.startswith("+++"):
        return file
    # Split on the FIRST space so that paths containing spaces stay intact
    # (splitting on the last space would keep only the final word).
    _marker, sep, path = result.partition(" ")
    if not sep:
        return result
    # git appends a tab after names that contain spaces; drop it.
    return path.rstrip("\t")
def is_loc_change(result, loc_changes):
    """Return the range part of a hunk header, or *loc_changes* unchanged.

    A hunk header looks like ``@@ -l,s +l,s @@`` in git output; the ``##``
    spelling previously matched here is still accepted.

    Args:
        result: One line of ``git show`` output.
        loc_changes: The range string of the hunk currently being processed.

    Returns:
        ``"-l,s +l,s"`` when *result* is a hunk header, else *loc_changes*.
    """
    for marker in ("@@", "##"):
        if result.startswith(marker):
            body = result[len(marker):]
            closing = body.find(marker)
            # Anything after the closing marker is trailing context text;
            # a missing closing marker must not chop the last character.
            if closing != -1:
                body = body[:closing]
            return body.strip()
    return loc_changes
def _parse_range(token):
    """Split one hunk range such as ``-13,0`` or ``+6`` into ``(start, count)``.

    The leading sign is dropped; a missing ``,count`` means a count of 1.
    """
    start, sep, count = token[1:].partition(",")
    return int(start), (int(count) if sep else 1)


def get_loc_change(loc_changes):
    """Convert a hunk range string into a ``{line: count}`` mapping.

    Args:
        loc_changes: Range string such as ``"-1,5 +1,4"`` or ``"-6 +6"``.

    Returns:
        ``{start: new_count - old_count}`` when both sides start on the same
        line, otherwise ``{old_start: old_count, new_start: new_count}``.
    """
    old_token, _, new_token = loc_changes.partition(" ")
    old_start, old_count = _parse_range(old_token)  # removals, e.g. -1,5
    new_start, new_count = _parse_range(new_token)  # additions, e.g. +1,4
    if old_start == new_start:
        return {old_start: new_count - old_count}
    return {old_start: old_count, new_start: new_count}
def get_loc(results):
    """Accumulate per-file line changes, contribution and churn.

    Args:
        results: Iterable of lines from ``git show --unified=0`` output.

    Returns:
        ``[files, contribution, churn]`` where ``files`` maps each path to a
        ``{line: count}`` dict, ``contribution`` sums the absolute counts of
        lines seen for the first time, and ``churn`` sums the absolute counts
        of lines already recorded for that file.
    """
    files = {}
    contribution = 0
    churn = 0
    current_file = ""
    loc_changes = ""
    for result in results:
        new_file = is_new_file(result, current_file)
        if new_file != current_file:
            current_file = new_file
            files.setdefault(current_file, {})
            # A hunk header is detected by "range string changed", so forget
            # the previous file's last range; otherwise an identical first
            # range in this file (e.g. "-1 +1") would be silently skipped.
            loc_changes = ""
            continue
        new_loc_changes = is_loc_change(result, loc_changes)  # "" or "-6 +6" or "-13,0 +14,2"
        if new_loc_changes == loc_changes:
            continue
        loc_changes = new_loc_changes
        # setdefault also covers a hunk that shows up before any file header.
        line_counts = files.setdefault(current_file, {})
        for loc, delta in get_loc_change(loc_changes).items():  # {2: 0} or {8: 0, 9: 1}
            if loc in line_counts:
                # the same line was touched again: that is churn
                line_counts[loc] += delta
                churn += abs(delta)
            else:
                line_counts[loc] = delta
                contribution += abs(delta)
    return [files, contribution, churn]
How can I reuse this code but count churn only when the changed code is less than 3 weeks old?

The only practical way to do this is to iterate through the DataFrame, and because that sucks with pandas, it almost always means you have the wrong data structure. If you're not doing numerical analysis, and it looks like you aren't, then just keep a simple list of dicts. Pandas has its shining points, but it's not a universal database.
Here's the rough code you'd need, although I'm glossing over details:
from datetime import timedelta

# Go through the df row by row and report every file whose previously seen
# change lies within three weeks of the current row's change.
CHURN_WINDOW = timedelta(days=21)
lastdate = {}
for index, row in df.iterrows():
    filepath = row['filepath']
    # Commit-level rows carry no file path (NaN); they cannot be churn.
    if not isinstance(filepath, str):
        continue
    previous = lastdate.get(filepath)
    # abs() keeps the check valid whether the log is sorted newest-first or
    # oldest-first; a signed difference is always "< 21 days" when negative.
    if previous is not None and abs(previous - row['date']) < CHURN_WINDOW:
        print( "Last change to", filepath, "was within three weeks" )
    lastdate[filepath] = row['date']

Related

Instantiating python dataframe in a for loop

I am trying to create a condition using for loop and if statement for a python dataframe object. In order to accurately specify which row from the data table to extract upon a specific condition, I searched the row index, and created an argument to specify the location before the for loop. The specifics looks something like this:
import pandas as pd
# Path of the CSV export to parse.
input_csv_file = "./CSV/Officers_and_Shareholders.csv"
# Skip the first 10 lines of the file and skip lines pandas cannot parse instead of raising.
df = pd.read_csv(input_csv_file, skiprows=10, on_bad_lines='skip')
# Replace missing values with empty strings, modifying df in place.
df.fillna('', inplace=True)
# df.drop([0, 3], inplace=True)
# Overwrite the column labels with these six names.
df.columns = ['Nama', 'Jabatan', 'Alamat', 'Klasifikasi Saham', 'Jumlah Lembar Saham', 'Total']
# print(df.shape)
# print(df.columns)
# print(df.iloc[:53])
# shareholders = df.iloc[24:42]
# print(shareholders)
# officers = df.iloc[0:23]
# print(officers)
# Keep only the rows whose Total value is not the string '-'.
dataframe = df.query("Total.ne('-')")
# Intended to print the shareholder rows (positions 24-41) when a condition on the data holds.
# NOTE(review): `column` is a column label (a string) while df.iloc expects integer positions,
# and object() accepts no arguments — this condition looks like it raises before comparing; confirm.
# NOTE(review): comparing against a whole DataFrame inside `if` presumably has no single truth value — verify.
def get_shareholder_by_row_index():
for column in df.columns:
if object(df.iloc[column][:53]) == dataframe:
shareholders = df.iloc[24:42]
print(shareholders)
# elif object(df[:53][column]) != dataframe:
# officers = df.iloc[0:23]
# print(officers)
Because the format of the CSV file is not proper, I forced dataframe to re-create a header on top of the original CSV file, which I indicate under df.columns. The df.iloc[24:42] and df.iloc[0:23] are able to specifically locate the data range in the dataframe, but it doesn't return so when instantiated inside the for loop. Objectively, I want to create a function where if the row under the column Total is empty (-), then return the officers, but if the row under the column Total is not empty, then return shareholders. In this case, how should I modify the for loop and the if statement?
The desired output for shareholders will be:
24 PT CTCORP INFRASTRUKTUR D INDONESIA, ... Rp. 3.200.000.000
25 Nomor SK :- I ...
26 JalanKaptenPierreTendeanKavling12-14A ...
27 PT INTRERPORT PATIMBAN AGUNG, ... Rp. 2.900.000.000
28 Nomor SK :- ...
29 ...
30 ...
31 ...
32 ...
33 ...
34 PT PATIMBAN MAJU BERSAMA, ... Rp. 2.900.000.000
35 Nomor SK :AHU- ...
36 0061318.AH.01.01.TAHUN 2021 ...
37 Tanggal SK :30 September 2021 ...
38 ...
39 ...
40 PT TERMINAL PETIKEMAS ... Rp. 1.000.000.000
41 SURABAYA, ...
42 Nomor SK :- ...
and for the officers, it will return:
Nama ... Total
1 NIK: 3171060201830005 ...
2 NPWP: 246383541071000 ...
3 TTL: Jakarta, 02 Januari 1983 ...
5 NIK: 1271121011700003 ...
6 NPWP: 070970173112000 ...
7 TTL: Bogor, 10 November 1970 ...
8 ARLAN SEPTIA ANANDA ...
9 RASAM, ...
10 NIK: 3174051209620003 ...
11 NPWP: 080878200013000 ...
12 TTL: Jakarta, 12 September ...
13 1962 ...
15 NIK: 3171011605660004 ...
16 NPWP: 070141650093000 ...
17 TTL: Jakarta, 16 Mei 1966 ...
18 FUAD RIZAL, ...
21 PURNOMO, UTAMA RASRINIK: 3578032408610001 ...
22 NPWP: 097468813615000 ...
23 TTL: SLEMAN, 24 Agustus 1961 ...
Shareholders and officers will be printed with respect to the index (row number).
If this is not the desired answer, please add a little more detail.
def get_shareholder_by_row_index():
    """Print every row of ``df`` labelled as officer or shareholder.

    A row whose ``Total`` is empty (``''`` after ``fillna``, or ``'-'``) is
    an officer; a row with a Total value is a shareholder. Afterwards the
    empty ``Total`` entries among the first 53 rows are printed.
    """
    empty_markers = ('', '-')
    # enumerate gives the row POSITION, which is what df.iloc needs; indexing
    # df["Total"][i] by label breaks as soon as the index is not 0..n-1.
    for i, total in enumerate(df["Total"]):
        if total in empty_markers:
            print(i," officers")
        else:
            print(i," shareholders")
        print(df.iloc[i])
        # what ever your code is, will be here
    # this will give you the indices where the row under Total is empty;
    # the mask is built from the same slice it filters, so both stay aligned
    first_rows = df["Total"].iloc[:53]
    print(first_rows[first_rows.isin(empty_markers)])

How to find the time to iterate over two array?

The function is passed a dictionary containing three lists with timestamps (time in seconds):
lesson - the beginning and end of the lesson
pupil - intervals of pupil presence
tutor - intervals of the teacher's presence
The intervals are arranged as follows - it is always a list of an even number of items. Even indices (starting from 0) are the time of entry to the lesson, and odd ones are the time of leaving the lesson.
How can I calculate the time when both student and teacher are present in class at the same time? That is, the time of their crossing
# Sample input: each list holds timestamps in seconds; even indices are
# entry times and odd indices are the matching leave times.
dc = {
'lesson': [1594663200, 1594666800],
'pupil': [1594663340, 1594663389, 1594663390, 1594663395, 1594663396, 1594666472],
'tutor': [1594663290, 1594663430, 1594663443, 1594666473]}
My solution, I could only calculate the total time spent by each
def total_presence(times):
    """Return the summed length of the intervals in a flat timestamp list.

    Args:
        times: ``[enter0, leave0, enter1, leave1, ...]`` in seconds.

    Returns:
        The sum of ``leave - enter`` over all pairs (0 for an empty list).
    """
    # Even indices are entry times and odd indices the matching leave times.
    return sum(leave - enter for enter, leave in zip(times[::2], times[1::2]))


lesson_times = dc['lesson']
pupil_times = dc['pupil']
tutor_times = dc['tutor']

# Total time each person spent in the lesson (not yet the time they overlap).
total_time_pupil = total_presence(pupil_times)
total_time_tutor = total_presence(tutor_times)
I had written this when you posted the first time. This solves your problem.
import time
from collections import deque

dc = {
    'lesson': [1594663200, 1594666800],
    'pupil': [1594663340, 1594663389, 1594663390, 1594663395, 1594663396, 1594666472],
    'tutor': [1594663290, 1594663430, 1594663443, 1594666473]}

# Sweep over both event streams in time order. Every event toggles the
# presence of its owner; the overlap is the time between the moment both
# are present and the next event.
# deque gives O(1) removal from the front (list.pop(0) shifts the whole list).
puptimes = deque(dc['pupil'])
tuttimes = deque(dc['tutor'])
pupil_in = False
tutor_in = False
last = None  # start of the current overlap, None while not both present
together = 0

# Once either stream is exhausted that person has left for good, so no
# further overlap is possible and the sweep can stop.
while puptimes and tuttimes:
    # Pick the event to come next.
    if puptimes[0] < tuttimes[0]:
        evt = puptimes.popleft()
        pupil_in = not pupil_in
    else:
        evt = tuttimes.popleft()
        tutor_in = not tutor_in
    tc = time.ctime(evt)
    if pupil_in and tutor_in:
        print( tc, "Both are in the room." )
        last = evt
        continue
    if last is not None:
        print( tc, "No longer both in, together time =", evt-last )
        together += evt-last
        last = None
    if pupil_in:
        print( tc, "Pupil is in the room alone" )
    elif tutor_in:
        print( tc, "Tutor is in the room alone" )
    else:
        print( tc, "Room is empty" )

print( "Total time together:", together, "seconds" )
Output:
[timr@Tims-Pro:~/src]$ python x.py
Mon Jul 13 11:01:30 2020 Tutor is in the room alone
Mon Jul 13 11:02:20 2020 Both are in the room.
Mon Jul 13 11:03:09 2020 No longer both in, together time = 49
Mon Jul 13 11:03:09 2020 Tutor is in the room alone
Mon Jul 13 11:03:10 2020 Both are in the room.
Mon Jul 13 11:03:15 2020 No longer both in, together time = 5
Mon Jul 13 11:03:15 2020 Tutor is in the room alone
Mon Jul 13 11:03:16 2020 Both are in the room.
Mon Jul 13 11:03:50 2020 No longer both in, together time = 34
Mon Jul 13 11:03:50 2020 Pupil is in the room alone
Mon Jul 13 11:04:03 2020 Both are in the room.
Mon Jul 13 11:54:32 2020 No longer both in, together time = 3029
Mon Jul 13 11:54:32 2020 Tutor is in the room alone
Total time together: 3117 seconds
[timr@Tims-Pro:~/src]$

Python - Parsing a text file into a csv file

I have a text file that is output from a command that I ran with Netmiko to retrieve data from a Cisco WLC of things that are causing interference on our WiFi network. I stripped out just what I needed from the original 600k lines of code down to a couple thousand lines like this:
AP Name.......................................... 010-HIGH-FL4-AP04
Microwave Oven 11 10 -59 Mon Dec 18 08:21:23 2017
WiMax Mobile 11 0 -84 Fri Dec 15 17:09:45 2017
WiMax Fixed 11 0 -68 Tue Dec 12 09:29:30 2017
AP Name.......................................... 010-2nd-AP04
Microwave Oven 11 10 -61 Sat Dec 16 11:20:36 2017
WiMax Fixed 11 0 -78 Mon Dec 11 12:33:10 2017
AP Name.......................................... 139-FL1-AP03
Microwave Oven 6 18 -51 Fri Dec 15 12:26:56 2017
AP Name.......................................... 010-HIGH-FL3-AP04
Microwave Oven 11 10 -55 Mon Dec 18 07:51:23 2017
WiMax Mobile 11 0 -83 Wed Dec 13 16:16:26 2017
The goal is to end up with a csv file that strips out the 'AP Name ...' and puts what is left on the same line as the rest of the information in the next line. The problem is some have two lines below the AP name and some have 1 or none. I have been at it for 8 hours and cannot find the best way to make this happen.
This is the latest version of code that I was trying to use, any suggestions for making this work? I just want something I can load up in excel and create a report with:
# Convert the raw WLC interference dump into comma-separated text: the current
# AP name is accumulated in _temp together with the fields of the lines below it.
# NOTE(review): outfile_name is not defined in this snippet — presumably set earlier; confirm.
with open(outfile_name, 'w') as out_file:
with open('wlc-interference_raw.txt', 'r')as in_file:
#Variables
_ap_name = ''
_temp = ''
# NOTE(review): _flag is never read in this snippet.
_flag = False
for i in in_file:
if 'AP Name' in i:
#write whatever was put in the temp file to disk because new ap now
#add another temp variable in case an ap has more than 1 interferer and check if new AP name
out_file.write(_temp)
out_file.write('\n')
#print(_temp)
# NOTE(review): str.lstrip treats its argument as a set of characters, not a literal
# prefix, so leading characters of the AP name that belong to that set are removed too.
_ap_name = i.lstrip('AP Name.......................................... ')
_ap_name = _ap_name.rstrip('\n')
_temp = _ap_name
#print(_temp)
# Separator and column-header lines carry no data and are ignored.
elif '----' in i:
pass
elif 'Class Type' in i:
pass
else:
# Append every whitespace-separated field of the data line, each preceded by a comma.
line_split = i.split()
for x in line_split:
_temp += ','
_temp += x
_temp += '\n'
# NOTE(review): _temp is only written when the next 'AP Name' line is seen; nothing in
# this snippet writes the last AP's data after the loop — confirm.
I think your best option is to read all lines of the file, then split into sections starting with AP Name. Then you can work on parsing each section.
Example
import re

s = """AP Name.......................................... 010-HIGH-FL4-AP04
Microwave Oven 11 10 -59 Mon Dec 18 08:21:23 2017
WiMax Mobile 11 0 -84 Fri Dec 15 17:09:45 2017
WiMax Fixed 11 0 -68 Tue Dec 12 09:29:30 2017
AP Name.......................................... 010-2nd-AP04
Microwave Oven 11 10 -61 Sat Dec 16 11:20:36 2017
WiMax Fixed 11 0 -78 Mon Dec 11 12:33:10 2017
AP Name.......................................... 139-FL1-AP03
Microwave Oven 6 18 -51 Fri Dec 15 12:26:56 2017
AP Name.......................................... 010-HIGH-FL3-AP04
Microwave Oven 11 10 -55 Mon Dec 18 07:51:23 2017
WiMax Mobile 11 0 -83 Wed Dec 13 16:16:26 2017"""


class AP:
    """
    A class holding each section of the parsed file
    """

    def __init__(self):
        self.header = ""   # the raw 'AP Name....' line
        self.content = []  # the data lines that follow the header


def extract_ap_name(header):
    """Return the AP name from an ``AP Name..... <name>`` header line.

    The literal prefix and its dot leader are removed with a regex;
    ``str.lstrip("AP Name.")`` would strip a *set of characters* and also eat
    the start of names such as ``Admin-AP01``.
    """
    return re.sub(r"^AP Name\.*\s*", "", header).strip()


def parse_sections(lines):
    """Group *lines* into ``AP`` sections, one per ``AP Name`` header.

    Lines that appear before the first header are ignored; an input without
    any header yields an empty list.
    """
    parsed = []
    current = None
    for line in lines:
        # Starting new section
        if line.startswith('AP Name'):
            current = AP()
            current.header = line
            parsed.append(current)
        elif current is not None:
            current.content.append(line)
    return parsed


sections = parse_sections(s.split('\n'))  # Or pass an open file object

for section in sections:
    ap_name = extract_ap_name(section.header)
    for line in section.content:
        if not line.strip():
            continue
        # You can extract the date separately, if needed
        # Splitting on more than one space using a regex
        fields = [field for field in re.split(r'\s\s+', line.strip()) if field]
        print(",".join([ap_name, *fields]))
Output
010-HIGH-FL4-AP04,Microwave Oven,11,10,-59,Mon Dec 18 08:21:23 2017
010-HIGH-FL4-AP04,WiMax Mobile,11,0,-84,Fri Dec 15 17:09:45 2017
010-HIGH-FL4-AP04,WiMax Fixed,11,0,-68,Tue Dec 12 09:29:30 2017
010-2nd-AP04,Microwave Oven,11,10,-61,Sat Dec 16 11:20:36 2017
010-2nd-AP04,WiMax Fixed,11,0,-78,Mon Dec 11 12:33:10 2017
139-FL1-AP03,Microwave Oven,6,18,-51,Fri Dec 15 12:26:56 2017
010-HIGH-FL3-AP04,Microwave Oven,11,10,-55,Mon Dec 18 07:51:23 2017
010-HIGH-FL3-AP04,WiMax Mobile,11,0,-83,Wed Dec 13 16:16:26 2017
Tip:
You don't need Python to write the CSV, you can output to a file using the command line
python script.py > output.csv

Copy row data into another column description from text file

I am having trouble with a complicated problem that I will try my best to describe.
I have a text file that has the following information
Customer Name: Zack Customer Number:12345
10.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
10.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
Customer Name: Larry Customer Number:00099
1.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
1.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
Customer Name: James Customer Number:99999
5.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
5.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order
And So on..... in the same format.
What I want to do, is add the Customer Name value, "Zack" and Customer Number "12345" to the end of the line and get rid of the current format.
Eventually ending up with this new format.
10.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order Zach 12345
10.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order Zack 12345
1.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order Larry 00099
1.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order Larry 00099
5.4 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order James 99999
5.5 2014556 - FSV Poly -1.50 Feb 16 6 Each Unit Order James 99999
My thinking is something like... if line starts with a number then add the last customer Name value and customer Number value to the end of the line???
Is this even possible?
Thank you so much for your time!
Here is my code:
# NOTE(review): `re` is imported but not used in this snippet.
import re
# NOTE(review): the file is opened without `with` and is never closed in this snippet.
file = open('Orders.txt', 'r')
for line in file:
# Print only customer header lines and lines whose first character is a digit,
# with leading whitespace removed.
if line.__contains__('Customer Name') or line[0].isdigit():
print(line.lstrip())
And here is a better example of the data:
First of all, be careful when opening a file: you then have to call file.close()!
What you could do is iterate through the file and change the customer Name and number every time you find one
Here is my take:
customer_number = 0
customer_name = ''
with open('Orders.txt', 'r') as file:  # Usually it's how files are opened
    for line in file:
        if not line.strip():
            continue  # nothing to print for blank lines
        if 'Customer Name' in line:
            # Change customer number and name
            # Quite hacky way to do it: splitting
            # "Customer Name: Zack Customer Number:12345" on 'Customer' gives
            # ['', ' Name: Zack ', ' Number:12345\n'], so the useful parts
            # are at index 1 and 2 (index 0 is the empty text before the
            # first 'Customer').
            listline = line.split('Customer')
            customer_name = listline[1].strip()
            customer_number = listline[2].strip() if len(listline) > 2 else ''
        else:
            print('%s, %s, %s' % (line.strip(), customer_name, customer_number))
Note that at the end of the line will be written 'Name: yourcustomername', and 'Number: yourcustomernumber' with this method.
You can quite easily get only name and number of this but I'll let you find out yourself :)
I'd also recommend you to save your changes in a file rather than simply print them.

effective backup strategy with storage reuse but without duplicates

I'm trying to figure out how to implement an automatic backup file naming/recycling strategy that keeps older backup files but with decreasing frequency over time. The basic idea is that it would be possible to remove at maximum one file when adding a new one, but I was not successful implementing this from scratch.
That's why I started to try out the Grandfather-Father-Son pattern, but there is not a requirement to stick to this. I started my experiments using a single pool, but I failed more than once, so I started again from this more descriptive approach using four pools, one for each frequency:[1]
import datetime

t = datetime.datetime(2001, 1, 1, 5, 0, 0)  # start at 1st of Jan 2001, at 5:00 am
d = datetime.timedelta(days=1)
days = []
weeks = []
months = []
years = []


def _keep(pool, stamp, limit):
    """Append *stamp* to *pool* and drop the oldest entry beyond *limit*."""
    pool.append(stamp)
    if len(pool) > limit:
        del pool[0]


def pool_it(t):
    """File the backup timestamp *t* into every pool it qualifies for."""
    _keep(days, t, 7)            # keep not more than seven daily backups
    if t.weekday() == 6:
        _keep(weeks, t, 5)       # ...not more than 5 weekly backups
    if t.day == 28:
        _keep(months, t, 12)     # ... limit monthly backups
        if t.month == 12:
            _keep(years, t, 10)  # ... limit yearly backups...


# Simulate 4505 consecutive daily backups.
for i in range(4505):
    pool_it(t)
    t += d

no = 0


def print_pool(pool, rt):
    """Print one numbered line per entry of *pool* with its day offset to *rt*.

    The numbering continues across calls through the module-level ``no``.
    """
    global no
    print("----")
    for no, stamp in enumerate(pool, start=no + 1):
        print(f"{no:3} {stamp.strftime('%Y-%m-%d %a')} {(stamp - rt).days}")


print_pool(years, t)
print_pool(months, t)
print_pool(weeks, t)
print_pool(days, t)
The output shows that there are duplicates, marked with * and **
----
1 2003-12-28 Sun -3414
2 2004-12-28 Tue -3048
3 2005-12-28 Wed -2683
4 2006-12-28 Thu -2318
5 2007-12-28 Fri -1953
6 2008-12-28 Sun -1587
7 2009-12-28 Mon -1222
8 2010-12-28 Tue -857
9 2011-12-28 Wed -492
10 2012-12-28 Fri -126 *
----
11 2012-05-28 Mon -340
12 2012-06-28 Thu -309
13 2012-07-28 Sat -279
14 2012-08-28 Tue -248
15 2012-09-28 Fri -217
16 2012-10-28 Sun -187
17 2012-11-28 Wed -156
18 2012-12-28 Fri -126 *
19 2013-01-28 Mon -95
20 2013-02-28 Thu -64
21 2013-03-28 Thu -36
22 2013-04-28 Sun -5 **
----
23 2013-03-31 Sun -33
24 2013-04-07 Sun -26
25 2013-04-14 Sun -19
26 2013-04-21 Sun -12
27 2013-04-28 Sun -5 **
----
28 2013-04-26 Fri -7
29 2013-04-27 Sat -6
30 2013-04-28 Sun -5 **
31 2013-04-29 Mon -4
32 2013-04-30 Tue -3
33 2013-05-01 Wed -2
34 2013-05-02 Thu -1
...which is not a big problem. What I'm getting from it is daily backups in the last week, weekly backups for the last month, monthly backups for the last year, and yearly backups for 10 years. The amount of files is always limited to 10+12+5+7=34.
My ideal solution would
create files with human-readable names including timestamps (i.e. xyz-yyyy-mm-dd.bak)
use only one pool (store/remove files within one folder)
recycle targeted, that is, would not delete more than one file a day
(naturally) not contain any duplicates
Do you have a trivial solution at hand or a suggestion where to learn more about it?
[1] I used python as to better understand/communicate my question, but the question is about the algorithm.
As a committer of pyExpireBackups, I can point you to the ExpirationRule implementation of my solution (source below and in the GitHub repo).
See https://wiki.bitplan.com/index.php/PyExpireBackups for the documentation.
An example run would lead to:
keeping 7 files for dayly backup
keeping 6 files for weekly backup
keeping 8 files for monthly backup
keeping 4 files for yearly backup
expiring 269 files dry run
# 1✅: 0.0 days( 5 GB/ 5 GB)→./sql_backup.2022-04-02.tgz
# 2✅: 3.0 days( 5 GB/ 9 GB)→./sql_backup.2022-03-30.tgz
# 3✅: 4.0 days( 5 GB/ 14 GB)→./sql_backup.2022-03-29.tgz
# 4✅: 5.0 days( 5 GB/ 18 GB)→./sql_backup.2022-03-28.tgz
# 5✅: 7.0 days( 5 GB/ 23 GB)→./sql_backup.2022-03-26.tgz
# 6✅: 9.0 days( 5 GB/ 27 GB)→./sql_backup.2022-03-24.tgz
# 7✅: 11.0 days( 5 GB/ 32 GB)→./sql_backup.2022-03-22.tgz
# 8❌: 15.0 days( 5 GB/ 37 GB)→./sql_backup.2022-03-18.tgz
# 9❌: 17.0 days( 5 GB/ 41 GB)→./sql_backup.2022-03-16.tgz
# 10✅: 18.0 days( 5 GB/ 46 GB)→./sql_backup.2022-03-15.tgz
# 11❌: 19.0 days( 5 GB/ 50 GB)→./sql_backup.2022-03-14.tgz
# 12❌: 20.0 days( 5 GB/ 55 GB)→./sql_backup.2022-03-13.tgz
# 13❌: 22.0 days( 5 GB/ 59 GB)→./sql_backup.2022-03-11.tgz
# 14❌: 23.0 days( 5 GB/ 64 GB)→./sql_backup.2022-03-10.tgz
# 15✅: 35.0 days( 4 GB/ 68 GB)→./sql_backup.2022-02-26.tgz
# 16❌: 37.0 days( 4 GB/ 73 GB)→./sql_backup.2022-02-24.tgz
# 17❌: 39.0 days( 4 GB/ 77 GB)→./sql_backup.2022-02-22.tgz
# 18❌: 40.0 days( 5 GB/ 82 GB)→./sql_backup.2022-02-21.tgz
# 19✅: 43.0 days( 4 GB/ 86 GB)→./sql_backup.2022-02-18.tgz
...
class ExpirationRule():
    '''
    an expiration rule keeps a file only when it is at least ``freq`` days
    older than the previously kept file and marks every other file as expired
    '''

    def __init__(self,name,freq:float,minAmount:int):
        '''
        constructor

        Args:
            name(str): name of this rule
            freq(float): the frequency in days
            minAmount(int): the minimum of files to keep around

        Raises:
            ValueError: if minAmount is negative
        '''
        if minAmount<0:
            raise ValueError(f"{minAmount} {name} is invalid - {name} must be >=0")
        self.name=name
        self.ruleName=name # will later be changed by a sideEffect in getNextRule e.g. from "week" to "weekly"
        self.freq=freq
        self.minAmount=minAmount
        # same state as reset(None), so apply() also works before any reset()
        self.kept=0
        self.startAge=0

    def reset(self,prevFile:"BackupFile"):
        '''
        reset my state with the given previous File

        Args:
            prevFile: BackupFile - the file to anchor my startAge with (may be None)
        '''
        self.kept=0
        self.startAge=0 if prevFile is None else prevFile.ageInDays

    def apply(self,file:"BackupFile",prevFile:"BackupFile",debug:bool)->bool:
        '''
        apply me to the given file taking the previously kept File prevFile (which might be None) into account

        Args:
            file(BackupFile): the file to apply this rule for
            prevFile(BackupFile): the previous file to potentially take into account
            debug(bool): if True show debug output

        Returns:
            bool: True once at least minAmount files have been kept by this rule
        '''
        if prevFile is not None:
            ageDiff=file.ageInDays - prevFile.ageInDays
            keep=ageDiff>=self.freq
        else:
            # without a previous file the file is always kept
            ageDiff=file.ageInDays - self.startAge
            keep=True
        if keep:
            self.kept+=1
        else:
            file.expire=True
        if debug:
            print(f"Δ {ageDiff}({ageDiff-self.freq}) days for {self.ruleName}({self.freq}) {self.kept}/{self.minAmount}{file}")
        return self.kept>=self.minAmount

Categories