pandas creating new table from two tables - python

I have to join two tables and create a table with dates, but my code is way too long and I believe that I did it the long way. Apparently the solution to this only had 22 lines. Is there another, shorter way to approach this problem? Here is the question
HERE IS MY CODE; again, I believe it is too long and I think there is a shorter way to do this.
import numpy as np
import pandas as pd
import datetime
#YOUR CODE GOES HERE#
# Full English month names in calendar order; a name's position + 1 is its
# month number.
_MONTH_NAMES = (
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
)
_MONTH_NUMBERS = {name: number for number, name in enumerate(_MONTH_NAMES, start=1)}


def get_month(i):
    """Return the month number (1-12) for a full English month name.

    Args:
        i: Month name spelled exactly like ``"January"`` ... ``"December"``
            (case-sensitive, no surrounding whitespace).

    Returns:
        The month number as an ``int``: 1 for January through 12 for December.

    Raises:
        ValueError: If ``i`` is not one of the twelve recognised names.
    """
    try:
        return _MONTH_NUMBERS[i]
    except KeyError:
        # Fail here with a clear message instead of handing None on to
        # datetime.date(), which would fail later with a less helpful error.
        raise ValueError(f"unknown month name: {i!r}") from None
def get_reformatted_date(s):
    """Format a date-like object as day-month-year text, e.g. ``02-Jan-15``.

    ``s`` only needs to provide ``strftime``; the result is the two-digit
    day, the abbreviated month name and the two-digit year joined by dashes.
    """
    output_format = "%d-%b-%y"
    return s.strftime(output_format)
tab1 = pd.read_csv("data1.csv")
tab2 = pd.read_csv("data2.csv")

# Every column is reversed so both tables are walked from their last row to
# their first.
# NOTE(review): the merge below only yields a newest-first result if each CSV
# is stored oldest-first, as in the sample data - confirm for real inputs.
tab1_tweets = tab1['Tweet'].tolist()[::-1]
tab2_tweets = tab2['Tweet'].tolist()[::-1]

# Build each row's date once. data1.csv stores the month as a number, while
# data2.csv stores its English name and needs get_month().
tab1_dates = [
    datetime.date(year, month, day)
    for year, month, day in zip(tab1['Year'].tolist(),
                                tab1['Month'].tolist(),
                                tab1['Day'].tolist())
][::-1]
tab2_dates = [
    datetime.date(year, get_month(month), day)
    for year, month, day in zip(tab2['Year'].tolist(),
                                tab2['Month'].tolist(),
                                tab2['Day'].tolist())
][::-1]

all_dates = []
all_tweets = []
tab1_count = 0
tab2_count = 0

# Two-pointer merge: keep taking whichever table currently holds the later
# date. On equal dates the tab2 row is taken first.
while tab1_count < len(tab1_dates) and tab2_count < len(tab2_dates):
    if tab1_dates[tab1_count] > tab2_dates[tab2_count]:
        all_dates.append(tab1_dates[tab1_count])
        all_tweets.append(tab1_tweets[tab1_count])
        tab1_count += 1
    else:
        all_dates.append(tab2_dates[tab2_count])
        all_tweets.append(tab2_tweets[tab2_count])
        tab2_count += 1

# At most one table still has rows left; append them in their existing order.
all_dates.extend(tab1_dates[tab1_count:])
all_tweets.extend(tab1_tweets[tab1_count:])
all_dates.extend(tab2_dates[tab2_count:])
all_tweets.extend(tab2_tweets[tab2_count:])

table_data = {'Date': all_dates, 'Tweet': all_tweets}
df = pd.DataFrame(table_data)
df['Date'] = df['Date'].apply(get_reformatted_date)
print(df)
data1.csv is
Tweet Month Day Year
Hello World 6 2 2013
I want ice-cream! 7 23 2013
Friends will be friends 9 30 2017
Done with school 12 12 2017
the data2.csv is
Month Day Year Hour Tweet
January 2 2015 12 Happy New Year
March 21 2016 7 Today is my final
May 30 2017 23 Summer is about to begin
July 15 2018 11 Ocean is still cold

I think that you can theoretically do this whole thing in one line:
# Read both files, asking read_csv to combine the Year/Month/Day columns into
# a single 'Date' column, stack the two results and sort them newest-first.
# The [['Date', 'Tweet']] selection applies to the data2.csv result only.
# NOTE(review): recent pandas releases deprecate combining columns through
# parse_dates - confirm against the pandas version in use.
finaldf = (pd.concat([pd.read_csv('data1.csv',
parse_dates={'Date':['Year', 'Month', 'Day']}),
pd.read_csv('data2.csv',
parse_dates={'Date':['Year', 'Month', 'Day']})
[['Date', 'Tweet']]])
.sort_values('Date', ascending=False))
But for the sake of readability, it's better to split it into a few lines:
# Load each file with its Year/Month/Day columns combined into one 'Date'
# column.
# NOTE(review): recent pandas releases deprecate combining columns through
# parse_dates - confirm against the pandas version in use.
df1 = pd.read_csv('data1.csv', parse_dates={'Date':['Year', 'Month','Day']})
df2 = pd.read_csv('data2.csv', parse_dates={'Date':['Year', 'Month','Day']})
# Keep only 'Date' and 'Tweet' from df2, stack it under df1 and sort the
# combined rows newest-first.
finaldf = (pd.concat([df1, df2[['Date', 'Tweet']]])
.sort_values('Date', ascending=False))
I think that for what you're trying to do, the main things to read up about are the parse_dates argument of pandas read_csv, and pd.concat to concatenate dataframes
Edit: in order to get the dates in the correct format as you have in your example output, you can call this after the code above, using Series.dt.strftime():
# Replace the datetime values with day-abbreviated month-two digit year text
# (the '%d-%b-%y' pattern); the column holds strings afterwards.
finaldf['Date'] = finaldf['Date'].dt.strftime('%d-%b-%y')

Related

How can I convert a list in a DF column into comma separated values?

Right now I have a DataFrame that outputs to a CSV file. If I were to print out the "Affected IP Address" column, it would look like this:
['10.0.7.248']
['10.0.7.248', '10.0.8.56']
['10.0.6.72']
['10.0.6.72', '10.0.5.46']
['10.0.9.126']
['10.0.9.126', '10.0.7.248']
['10.0.9.126', '10.0.7.248', '10.0.8.56']
['10.0.6.72']
['10.0.6.72', '10.0.5.46']
['10.0.9.126']
['10.0.9.126', '10.0.7.248']
['10.0.9.126', '10.0.7.248', '10.0.8.56']
Each value in that column is a list and some are just single IPs and some are multiple. Is there any way I can omit the brackets and the apostrophes from the output? I would prefer it to output like this if possible:
10.0.7.248
10.0.7.248, 10.0.8.56
10.0.6.72
10.0.6.72, 10.0.5.46
10.0.9.126
10.0.9.126, 10.0.7.248
10.0.9.126, 10.0.7.248, 10.0.8.56
This gets written to a CSV file, so I'm not sure how to omit those characters so that only the IPs remain, separated by commas.
Here's my script:
def main():
    """Load the scan CSV and hand its rows to ``get_scan_results``."""
    get_scan_results(open_csv())
def open_csv():
    """Read ``<csv_filename>.csv`` and return all of its rows.

    Returns:
        A list with one list of strings per CSV row, in file order. The
        first row is included; nothing is skipped here.
    """
    with open(f"{csv_filename}.csv", newline='') as source:
        return list(csv.reader(source))
def _risk_exposure(score_text):
    """Map a CVSS score given as text to its 'Risk Exposure' label."""
    score = float(score_text)
    if score < 4.0:
        return "Low"
    if score < 7:
        return "Moderate"
    return "High"


def get_scan_results(data):
    """Group scan rows by vulnerability ID and export one row per ID to CSV.

    Args:
        data: Rows as lists of 11 strings each (ip, host, os, vulnerability
            title, vulnerability id, CVSSv2, CVSSv3, description, proof,
            solution, CVEs). ``data[0]`` is skipped.

    Side effects:
        Prints the accumulated IP list after every row and writes
        ``<exported_csv_filename>.csv`` without the index column.

    Raises:
        ValueError: If a row does not have exactly 11 fields, or a non-blank
            CVSS field is not a number.
    """
    # One entry per vulnerability ID; its keys become the CSV column names.
    new_dict = {}
    for ip, _host, _os, vuln_title, vuln_id, cvss2, cvss3, descr, _proof, solu, _cves in data[1:]:
        # Prefer the CVSSv3 score, fall back to CVSSv2, else report "Null".
        if cvss3.strip():
            s = _risk_exposure(cvss3)
        elif cvss2.strip():
            s = _risk_exposure(cvss2)
        else:
            s = "Null"
        # setdefault keeps the first record seen for an ID, so later rows
        # with the same ID only contribute their IP address.
        vuln_data = new_dict.setdefault(vuln_id, {"Name": vuln_title, "Description": descr, "Source of Discovery": csv_filename, "Vulnerability ID": vuln_id, "Affected IP Address": [], "Solution": solu, "Risk Exposure": s })
        vuln_data["Affected IP Address"].append(ip)
        print (vuln_data["Affected IP Address"])
    # Creates DF object and exports to CSV
    df = pd.DataFrame(new_dict.values())
    df.to_csv(f"{exported_csv_filename}.csv", index=False)
if __name__ == "__main__":
main()
Here are few ways to do it
If you post the data as a code, it takes away the assumptions I made in creating the data frame to answer the question.
# if its of type list, then iterate through and concat with ','
df['IP'].apply(lambda x: ', '.join(x))
0 10.0.7.248
1 10.0.7.248,10.0.8.56
2 10.0.6.72
3 10.0.6.72,10.0.5.46
4 10.0.9.126
5 10.0.9.126,10.0.7.248
6 10.0.9.126,10.0.7.248,10.0.8.56
7 10.0.6.72
8 10.0.6.72,10.0.5.46
9 10.0.9.126
10 10.0.9.126,10.0.7.248
11 10.0.9.126,10.0.7.248,10.0.8.56
OR
# if its of type string, replace brackets and apostrophes
df['IP'].replace(r"'|\]|\[","", regex=True)
0 10.0.7.248
1 10.0.7.248, 10.0.8.56
2 10.0.6.72
3 10.0.6.72, 10.0.5.46
4 10.0.9.126
5 10.0.9.126, 10.0.7.248
6 10.0.9.126, 10.0.7.248, 10.0.8.56
7 10.0.6.72
8 10.0.6.72, 10.0.5.46
9 10.0.9.126
10 10.0.9.126, 10.0.7.248
11 10.0.9.126, 10.0.7.248, 10.0.8.56

Optimise a function with numerous conditions that depends on the previous row in a Python dataframe

I have the following dataframe:
country_ID
ID
direction
date
ESP_1
0
IN
2021-02-28
ENG
0
IN
2021-03-03
ENG
0
OUT
2021-03-04
ESP_2
0
IN
2021-03-05
FRA
1
OUT
2021-03-07
ENG
1
OUT
2021-03-09
ENG
1
OUT
2021-03-10
ENG
2
IN
2021-03-13
I have implemented the following functionality:
def create_columns_analysis(df):
    """Add the visit counters to ``df`` and process its rows one by one.

    Args:
        df: DataFrame with at least a ``country_ID`` column. Rows are
            addressed as ``df.loc[i, ...]`` for ``i`` in ``range(len(df))``,
            so the index must be the default 0..n-1 labels.

    Returns:
        The same ``df`` object, modified in place: ``visit_ESP``,
        ``visit_ENG`` and ``visit_FRA`` are initialised to 0, then each row
        whose ``country_ID`` is ``'ENG'`` is handed to ``country_ID_ENG``.
    """
    df['visit_ESP'] = 0
    df['visit_ENG'] = 0
    df['visit_FRA'] = 0
    # IDs already seen; shared with the per-country handlers.
    list_ids = []
    for i in range(len(df)):
        if df.loc[i,'country_ID'] == 'ENG':
            country_ID_ENG(df, i, list_ids)
        else:
            # case country_ID = {FRA, ESP_1, ESP_2}
            # other methods not specified
            pass
    return df
For each row with a specific country_ID, a similarly structured function is applied.
I would like to optimise or simplify the code of the country_ID_ENG function. The country_ID_ENG function is defined as follows:
# Fills in the analysis columns for row `i` of `df`; create_columns_analysis
# calls this for rows whose country_ID is 'ENG'.
#   df       - DataFrame, modified in place through df.loc[i, ...]; rows are
#              addressed by label and the previous row is read as df.loc[i-1].
#   i        - label of the row being processed.
#   list_ids - list of IDs already seen; the row's ID is appended to it the
#              first time that ID is met.
# Columns written: visit_ENG, visit_FRA, visit_ESP, error and mean_time.
# There is no return statement; all results are written into `df`.
# NOTE(review): the indentation of this listing was lost, so which
# assignments belong to which branch cannot be read from the text alone -
# restore it from the original source before running.
def country_ID_ENG(df, i, list_ids):
# If it is the first time the ID is detected
if df.loc[i,'ID'] not in list_ids:
# It adds up to one visit regardless of the direction of the ID
df.loc[i,'visit_ENG'] = 1
# Add the ID to the read list
list_ids.append(df.loc[i, 'ID'])
# Assigns the error column a start message
df.loc[i,'error'] = 'ERROR:1'
# If it is not the first time it detects that ID
else:
# Saves the information of the previous row
prev_row = df.loc[i-1]
# If the current row direction is 'IN'
if df.loc[i,'direction'] == 'IN':
# Add a visit
df.loc[i,'visit_ENG'] = 1
# Behaviour dependent on the previous row
# If the current row direction is 'IN' and previous row is 'IN'
if prev_row['direction'] == 'IN':
if prev_row['country_ID'] == 'FRA':
df.loc[i,'error'] = 'ERROR:0'
elif prev_row['country_ID'] in ['ESP_1','ESP_2']:
df.loc[i,'error'] = 'ERROR:2'
df.loc[i,'visit_FRA'] = 1
else:
df.loc[i,'error'] = 'ERROR:3'
# If the current row direction is 'IN' and previous row is 'OUT'
else:
if prev_row['country_ID'] == 'ENG':
df.loc[i,'error'] = 'ERROR:0'
elif prev_row['country_ID'] in ['FRA','ESP_2']:
df.loc[i,'error'] = 'ERROR:4'
df.loc[i,'visit_FRA'] = 1
else:
df.loc[i,'error'] = 'ERROR:5'
df.loc[i,'visit_ESP'] = 1
df.loc[i,'visit_FRA'] = 1
# If the current row direction is 'OUT'
else:
# If the current row direction is 'OUT' and previous row is 'IN'
if prev_row['direction'] == 'IN':
# If it detects an output before an input of the same 'country_ID',
# it calculates the visit time
if prev_row['country_ID'] == 'ENG':
# Difference between this row's date and the previous row's date.
df.loc[i,'mean_time'] = df.loc[i,'date']-prev_row['date']
df.loc[i,'error'] = 'ERROR:0'
elif prev_row['country_ID'] in ['ESP_1','ESP_2']:
df.loc[i,'error'] = 'ERROR:6'
df.loc[i,'visit_FRA'] = 1
df.loc[i,'visit_ENG'] = 1
else:
df.loc[i,'error'] = 'ERROR:7'
df.loc[i,'visit_ENG'] = 1
# If the current row direction is 'OUT' and previous row is 'OUT'
else:
df.loc[i,'visit_ENG'] = 1
if prev_row['country_ID'] == 'ENG':
df.loc[i,'error'] = 'ERROR:8'
elif prev_row['country_ID'] in ['FRA','ESP_2']:
df.loc[i,'error'] = 'ERROR:9'
df.loc[i,'visit_FRA'] = 1
else:
df.loc[i,'error'] = 'ERROR:10'
df.loc[i,'visit_ESP'] = 1
df.loc[i,'visit_FRA'] = 1
The above function uses the information from the current row and the previous row (if any) to create new columns for visit_ENG, visit_ESP, visit_FRA, mean_time and error.
For the example dataframe the function, applying the function country_ID_ENG to rows whose country_ID is equal to ENG, should return the following result:
country_ID
ID
direction
date
visit_ENG
visit_FRA
visit_ESP
mean_time
error
ESP_1
0
IN
2021-02-28
-
-
-
-
-
ENG
0
IN
2021-03-03
0
1
0
NaN
ERROR:2
ENG
0
OUT
2021-03-04
0
0
0
1 days
ERROR:0
ESP_2
0
IN
2021-03-05
-
-
-
-
-
FRA
1
OUT
2021-03-07
-
-
-
-
-
ENG
1
OUT
2021-03-09
1
1
0
NaN
ERROR:9
ENG
1
OUT
2021-03-10
1
0
0
NaN
ERROR:8
ENG
2
IN
2021-03-13
1
0
0
NaN
ERROR:1
The function is very long, and the other functions for rows with country_ID equal to ESP or FRA will have the same complexity. I would like you to help me to simplify or optimise the code of this function to also take it into account when defining the country_ID_ESP and country_ID_FRA functions. I appreciate your help.
I recently had to accomplish something similar. My solution was to create a custom class to iterate over the rows, moving some of the logic out of the loop and into the class. It's not a complete solution, but it is enough to work with.
main.py
import pandas as pd
DATA= {
'country_id': ['ESP_1', 'FRA', 'ENG', 'FRA'],
'ID': [0, 1, 2, 0, ],
'direction': ['IN', 'IN', 'OUT', 'OUT'],
'date': ['2021-02-28', '2021-02-28', '2021-02-28', '2021-02-28']
}
class CountryIDs:
    """Iterate over a DataFrame's rows while tracking the IDs recorded so far."""

    def __init__(self, df: pd.DataFrame):
        self._df = df
        self._list_ids = []
        self._country_ids = []

    def __iter__(self):
        """Yield ``(row, recorded_ids)`` for every row from ``itertuples()``."""
        for row in self._df.itertuples():
            yield row, self._list_ids

    def update_list_ids(self, new_value):
        """Record the ``ID`` and ``country_id`` of the given row tuple."""
        # Rebind to new lists instead of appending in place, so a list that
        # __iter__ has already handed out is never mutated afterwards.
        self._list_ids = self._list_ids + [new_value.ID]
        self._country_ids = self._country_ids + [new_value.country_id]

    def get_list(self):
        """Return ``[recorded IDs, recorded country ids]``."""
        return [self._list_ids, self._country_ids]
def start():
    """Record the first occurrence of every ID in ``DATA`` and print the result.

    Walks the rows of ``pd.DataFrame(DATA)`` through ``CountryIDs``; a row is
    recorded only if its ``ID`` has not been recorded before. Finally prints
    ``[recorded IDs, recorded country ids]``.
    """
    country_data = CountryIDs(pd.DataFrame(DATA))
    for named_tuple, list_ids in country_data:
        if named_tuple.ID not in list_ids:
            country_data.update_list_ids(named_tuple)
    # The tracker is bound to `country_data`; there is no `ids` name here.
    print(country_data.get_list())
if __name__ == '__main__':
start()
result
[[0, 1, 2], ['ESP_1', 'FRA', 'ENG']]

Trying to make Alarm Clock with Python, but it is not working

don't know where I got stuck. Seems fine for me.
Maybe the loop part doesn't work properly? Let me know if something is wrong.
import datetime
import time
# Read the alarm time as integers and combine it into one datetime, so it can
# be compared with the clock directly. Comparing strftime() text such as "07"
# with the integer 7 is never equal, which is why the alarm never fired.
setyear = int(input("Year"))
setmonth = int(input("Month"))
setday = int(input("Day"))
sethour = int(input("Hour"))
setminute = int(input("Minute"))
setsecond = int(input("Second"))
# Raises ValueError straight away if the entered values are not a real date.
alarm_time = datetime.datetime(setyear, setmonth, setday, sethour, setminute, setsecond)

while True:
    time.sleep(1)
    # ">=" instead of "==": sleep(1) can drift past the exact second, and an
    # equality test would then miss the alarm for good.
    if datetime.datetime.today() >= alarm_time:
        print("Alarm!")
        break
    else:
        print("NO")
I believe it is because the types don't match, and therefore the values are interpreted differently.
For example:
an input of "0" for the seconds won't match the returned seconds value of "00". I removed your int()
conversions at the start and it now works. You just have to enter the proper digits:
import time
import datetime
# Take a single clock snapshot and split it into zero-padded text fields.
# Six separate today() calls could straddle a second boundary and produce a
# combination of fields that never existed.
year, month, day, hour, minute, second = (
    datetime.datetime.today().strftime("%Y %m %d %H %M %S").split()
)

# Inputs stay strings, but are padded like the strftime() fields so that
# typing "0" matches "00" and "7" matches "07".
setyear = input("Year").strip().zfill(4)
setmonth = input("Month").strip().zfill(2)
setday = input("Day").strip().zfill(2)
sethour = input("Hour").strip().zfill(2)
setminute = input("Minute").strip().zfill(2)
setsecond = input("Second").strip().zfill(2)
alarm_fields = (setyear, setmonth, setday, sethour, setminute, setsecond)

while True:
    print(year,'-',month,'-',day,' ',hour,':',minute,':',second)
    time.sleep(1)
    year, month, day, hour, minute, second = (
        datetime.datetime.today().strftime("%Y %m %d %H %M %S").split()
    )
    if (year, month, day, hour, minute, second) == alarm_fields:
        print("Alarm!")
        break
    else:
        print("NO")
Output:
Year2020
Month11
Day12
Hour14
Minute12
Second00
2020 - 11 - 12 14 : 10 : 56
NO
2020 - 11 - 12 14 : 11 : 08
NO
....
2020 - 11 - 12 14 : 11 : 59
Alarm!

Cycling through a date

I have defined day and month in a previous part of the code. I need this program to run when day is less than 7. I am asking for an input of spendings in the last 7 days.
The problem: If the input of day is 4, for example, 4 - 4(days) is day 0 (does not exist), 4 - 5 is day -1, 4 - 6 is day -2, 4 - 7 is day -3. I need it to be day 4 - 4 = 31 or 30 or 28 or 29 (that is already defined), 4 - 5 = 30, 4 - 6 = 29.
I know that this is poorly structured, I apologize, English is not my first language. Will try to make it clear if it is not understood like this.
listOfSpendings = []
x = 0
while x < 7:
    # Work out the day being asked about without modifying `day` or `month`,
    # so every pass starts again from the original date.
    askDay = day - x
    askMonth = month
    if askDay <= 0:
        # Day 0 is the last day of the previous month, day -1 the one before.
        # NOTE(review): assumes `monthlenght` is the length of the month
        # before `month` - confirm where it is defined.
        askDay += monthlenght
        # The month before January is December, not month 0.
        askMonth = month - 1 if month > 1 else 12
    print ("How many liters did you spend on the day", askDay, "of", askMonth)
    spendings = input()
    # Keep asking until the answer is a non-negative whole number.
    while True:
        try:
            spendings = int(spendings)
            if spendings < 0:
                # The check rejects negative numbers, so the prompt says so.
                spendings = input ("Insert a value >= 0")
            else:
                break
        except ValueError:
            spendings = input("Incorrect value, correct")
    x = x+1
    listOfSpendings.append(spendings)
sumSpendings = sum (listOfSpendings)
Your code as it is will run into negative numbers with months as well. Using the datetime library that was suggested you can do the following:
from datetime import datetime, timedelta
list_of_spendings = []
# Month number
# Read the clock once so all seven days are counted back from the same
# moment, even if the loop body takes a long time.
today = datetime.now()
for day in (today - timedelta(x) for x in range(7)):
    print('How many liters did you spend on the day {} of {}'.format(day.day, day.month))
    #Rest of your code
OR
# Month name
# Read the clock once so all seven days are counted back from the same
# moment, even if the loop body takes a long time.
today = datetime.now()
for day in (today - timedelta(x) for x in range(7)):
    print('How many liters did you spend on the day {} of {}'.format(day.day, day.strftime('%B')))
    #Rest of your code
OR
# Short month name
# Read the clock once so all seven days are counted back from the same
# moment, even if the loop body takes a long time.
today = datetime.now()
for day in (today - timedelta(x) for x in range(7)):
    print('How many liters did you spend on the day {} of {}'.format(day.day, day.strftime('%b')))
    #Rest of your code
This is how I figured it out. "Dia" and "mes" are previously defined.
Translations:
listaGastos = listOfSpendings
gastos = spendings
dia = day
mes = month
DuracMes = monthlenght
diaC = currentDay
mesC = currentMonth
Code:
# Days in each month, January to December.
# NOTE(review): February is fixed at 28 days, so leap years are not handled.
DuracMes = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
listaGastos = []
x = 0
# Remember the starting day and month; `dia` and `mes` are overwritten below.
diaC = dia
mesC = mes
while x < 7:
    dia = diaC - x
    if dia <= 0:
        # Stepped back into the previous month. The month before January is
        # December (12), not month 0.
        mes = mesC - 1 if mesC > 1 else 12
        # dia is 0 or negative here, so this counts back from the month's end.
        dia = DuracMes [mes - 1] + dia
    print("Quantos litros de soro fisiologico foram gastos no dia", dia, "de", mes)
    gastos = input()
    # Keep asking until the answer is a non-negative whole number.
    while True:
        try:
            gastos = int (gastos)
            if gastos < 0:
                gastos = input ("Introduza um valor diferente de 0: ")
            else:
                break
        except ValueError:
            gastos = input ("Nao inseriu um valor adequado, quantos litros de soro fisiologico foram gastos nos ultimos sete dias? ")
    x += 1
    listaGastos.append(gastos)
sumGastos = sum (listaGastos)

Python XML Parsing - need to correct while loop

Fairly new to Python. I'm parsing an XML file and the following code returns undesired results. I can understand why I'm getting my results - there are two escalations in the XML for this deal and I'm getting results for each set. I need help updating my code to only return the monthly rent for each escalation in the XML:
<RentEscalations>
<RentEscalation ID="354781">
<BeginIn>7</BeginIn>
<Escalation>3.8</Escalation>
<RecurrenceInterval>12</RecurrenceInterval>
<EscalationType>bump</EscalationType>
</RentEscalation>
<RentEscalation ID="354782">
<BeginIn>61</BeginIn>
<Escalation>1.0</Escalation>
<RecurrenceInterval>12</RecurrenceInterval>
<EscalationType>bump</EscalationType>
</RentEscalation>
</RentEscalations>
The rent starts at $3.00/sqft for the first 6 months. This XML block shows that, for each 12 months (RecurrenceInterval), the rent will be $6.80/sqft ($3.00 base + $3.80 escalation). The following twelve months will be $10.60 ($6.80 + 3.80). Each year, the amount per square foot will increase by $3.80 until the 61st month in the term. At that point, the rent will increase by $1.00/sqft for the remainder of the term. The entire term of the lease is 120 months.
My results include 114 results based on the first escalation (3.80/sqft) followed by 114 rows showing as if the rent starts at $3.00/sqft incrementing by $1.00/sqft each year.
Any help is appreciated!
import xml.etree.ElementTree as ET
import pyodbc
import dateutil.relativedelta as rd
import datetime as dt
# Parses the deal export and walks Deals > Deal > DealTerms/DealTerm, printing
# one line of rent data per billing month for a single deal.
# NOTE(review): the indentation of this listing was lost, so the nesting of
# the loops and branches below cannot be read from the text - restore it from
# the original source before running.
tree = ET.parse('C:\\FileLocation\\DealData.xml')
root = tree.getroot()
for deal in root.findall("Deals"):
for dl in deal.findall("Deal"):
dealid = dl.get("DealID")
for dts in dl.findall("DealTerms/DealTerm"):
dtid = dts.get("ID")
darea = float(dts.find("RentableArea").text)
dterm = int(dts.find("LeaseTerm").text)
# Base rent entries of this deal term.
for brrent in dts.findall("BaseRents/BaseRent"):
brid = brrent.get("ID")
rent = float(brrent.find("Rent").text)
darea = float(dts.find("RentableArea").text)
per = brrent.find("Period").text
# CommencementDate is sliced by position into year, month and day
# (presumably YYYY-MM-DD text - confirm against the XML).
dtstart = dts.find("CommencementDate").text
startyr = int(dtstart[0:4])
startmo = int(dtstart[5:7])
startday = int(dtstart[8:])
start = dt.date(startyr, startmo, startday)
end = start + rd.relativedelta(months=dterm)
# An empty Duration element counts as 0 months.
if brrent.find("Duration").text is None:
duration = 0
else:
duration = int(brrent.find("Duration").text)
# Months of the lease term left after the base-rent duration.
termbal = dterm - duration
# One pass per RentEscalation element of this deal term.
for resc in dts.findall("RentEscalations/RentEscalation"):
rescid = resc.get("ID")
esctype = resc.find("EscalationType").text
begmo = int(resc.find("BeginIn").text)
esc = float(resc.find("Escalation").text)
intrvl = int(resc.find("RecurrenceInterval").text)
if intrvl != 0:
pers = termbal / intrvl
else:
pers = 0
# The escalation starts begmo - 1 months after the commencement date.
escst = start + rd.relativedelta(months=begmo - 1)
i = 0
x = begmo
# The rate restarts from the base rent here.
newrate = rent
# i counts months from escst up to termbal.
while i < termbal:
billdt = escst + rd.relativedelta(months=i)
# Convert the escalated rate to a monthly amount according to the unit in
# Period.
if per == "rsf/year":
monthlyamt = (newrate + esc) * darea / 12.0
if per == "month":
monthlyamt = newrate + esc
if per == "year":
monthlyamt = (newrate + esc) / 12.0
if per == "rsf/month":
monthlyamt = (newrate + esc) * darea
try:
# On every intrvl-th month the rate is raised by esc.
if i % intrvl == 0:
level = x + 1
newrent = monthlyamt
x += 1
newrate += esc
else:
level = x
# intrvl == 0 makes the modulo above raise; stop this escalation's loop.
except ZeroDivisionError:
break
i += 1
# Only this one deal is printed.
if dealid == "1254278":
print(dealid, dtid, rescid, dterm, darea, escst, rent, intrvl, esctype, termbal, \
monthlyamt, billdt, pers, level, newrate, newrent)

Categories