I have a dataframe containing hotel prices separated by the date of the listing and
I would like to plot the median of those prices each in a monthly bar.
So I first want to group the dates by month, then calculate the median price for each month, and then plot those medians in a bar chart.
Can you please show me how to do that? (Python beginner here)
Thank you in advance.
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# Load calendar.csv into a DataFrame.
calendar_df = pd.read_csv("calendar.csv")
# Preview the first rows.
# NOTE(review): `calendar_df_kurz` is not defined anywhere in this snippet; it is
# presumably a shortened copy of `calendar_df` built in code that is not shown.
# Confirm, or use `calendar_df.head()` instead.
calendar_df_kurz.head()
Related
Can someone help me create a scatter plot? I have written the following code; however, the result is not the scatter plot that I expected, as all the data are concentrated on only 3 values of the x-variable.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy.stats import skew
from warnings import filterwarnings
filterwarnings('ignore')
# Load the transactions table (read_csv gives it a default 0..n-1 row index).
df_transactions = pd.read_csv('transactions.csv')
# Aggregate with sum() per `days_after_open` and keep the `revenue` totals.
# The resulting Series is indexed by the `days_after_open` values, not by row number.
daily_revenue= df_transactions.groupby("days_after_open").sum()['revenue']
# NOTE(review): pandas aligns this assignment on index labels, so row i receives the
# total of day i (NaN when no such day exists) instead of the total of that row's own
# day. This is the likely cause of the unexpected plot. A per-row total would be
# df_transactions.groupby("days_after_open")["revenue"].transform("sum") - confirm.
df_transactions["daily_revenue"] = daily_revenue
# x: day offset of every transaction row; y: the (misaligned) daily revenue column.
x = df_transactions["days_after_open"]
y = df_transactions["daily_revenue"]
# Semi-transparent markers so overlapping points remain visible.
plt.scatter(x,y,alpha=0.2)
plt.xlabel("Days After Open (days)")
plt.ylabel("Daily Reveue ($)")
# Write the figure to disk under the base name "plot".
plt.savefig("plot")
dataframe image
Please define 'daily_revenue' as follows before moving on to the scatter plot.
# Take the `daily_revenue` column of the transactions table as the y-values.
y = df_transactions["daily_revenue"]
I've got two datasets with exactly the same data, but they look different when plotted the same way. One is an .xlsx file and the other is a .csv file.
Here are the two codes:
For the CSV:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Read the semicolon-separated file; row 0 supplies the column names.
daten = pd.read_csv(
    r"Path\Übungsdaten.csv",
    sep=";",
    header=0,
)

# Report how many rows were loaded and which columns are present.
row_count = len(daten)
print("Total rows: {0}".format(row_count))
print(daten.columns)

# Scatter Policy over InsuredValue. A single positional argument to xlim/ylim
# sets only the lower bound (left / bottom) and leaves the upper bound alone.
plt.scatter(x=daten['InsuredValue'], y=daten['Policy'])
plt.xlim(left=2500000)
plt.ylim(bottom=100100)
plt.show()
And for the xlsx:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.cluster import KMeans
# Read the first sheet of the workbook with pandas' default settings.
daten = pd.read_excel(r"Path\Übungsdaten.xlsx")

# Report how many rows were loaded.
row_count = len(daten)
print("Total rows: {0}".format(row_count))

# Scatter Policy over InsuredValue. Only the lower axis bounds are set here;
# the upper bounds stay as matplotlib chose them.
plt.scatter(x=daten['InsuredValue'], y=daten['Policy'])
plt.xlim(left=2500000)
plt.ylim(bottom=100100)
plt.show()
Here are the two Plots:
csv with plt.xlim(2500000) plt.ylim(100100)
and the csv without restrictions:
and finally the .xlsx plot:
My question is, first of all: why is there a black bar at the bottom of the first two plots? (I'm guessing this is every single value of "InsuredValue".) And how can I bring the CSV plot to the same ratio as the xlsx plot?
Thank you very much
I had to convert the "InsuredValue" column to int with the following code:
# DataFrame.astype returns a new DataFrame and leaves `daten` itself unchanged,
# so the result has to be assigned back for the int conversion to take effect.
daten = daten.astype({'InsuredValue': 'int'})
I am getting started in data analysis and have run into a problem with an exercise taken from Kaggle (file 'ENB.csv'). I import my data, compute the correlations, and create a new column in my dataframe that sums my target variables.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import validation_curve
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import VotingClassifier
# Load ENB.csv and replace its header with descriptive column names.
# The spelling "orientaion" is kept as-is so existing references to that column keep working.
df = pd.read_csv('ENB.csv')
df.columns = [
    "relative_compactness", "surface_area", "wall_area", "roof_area",
    "overall_height", "orientaion", "glazing_area", "glazing_area_dist",
    "heating_load", "cooling_load",
]
df.head()

# Pearson correlation matrix (pandas' default method), computed once and reused
# for the heatmap instead of calling df.corr() a second time.
corr = df.corr(method='pearson')
plt.figure(figsize=(20, 10))
# Trailing semicolon suppresses the Axes repr in notebook output.
sns.heatmap(corr, annot=True, cmap='Greens');

# Total charge per building = heating load + cooling load.
# No placeholder column is needed first: this assignment creates the column,
# and any earlier placeholder value would be overwritten immediately anyway.
df['total_charges'] = df['heating_load'] + df['cooling_load']
I have to create a new variable 'charges_classes' that splits the buildings into 4 distinct classes, labelled 0, 1, 2 and 3, according to the 3 quantiles of the newly created variable. I have searched but cannot find a solution. Can someone help? Here is what I did:
# One-hot encode `total_charges`: get_dummies produces one indicator column per
# distinct value of the column, which is not a split into 4 quantile-based classes.
charge_classes = pd.get_dummies(df['total_charges'])
# Bare expression: displays the result when it is the last line of a notebook cell.
charge_classes
You could use qcut:
# Bin `total_charges` into 4 equal-frequency groups (cut at the 3 quartiles).
# labels=False returns the integer bin codes 0..3 instead of interval labels.
df['charge_classes'] = pd.qcut(df['total_charges'], 4, labels=False)
from datetime import time
from numpy import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Load the data set, then separate the single feature column from the labels.
df = pd.read_csv("spam.csv")
feature_col = ['total']
X, y = df[feature_col], df.target

# Fit KMeans with one cluster on the feature matrix; fit() returns the estimator
# itself, so construction and fitting can be chained.
clf_km = KMeans(n_clusters=1).fit(X)

# Inspect the fitted centroid(s) and the cluster index assigned to every row.
clf_km.cluster_centers_
clf_km.labels_
I am trying to implement KMeans clustering, but I don't know how to plot the original clusters and the new ones created by KMeans. I want one scatter plot for the original clusters and another for the new part of the CSV file.
I'm trying to work around this issue that I am facing here.
#import libraries
from __future__ import division
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import plotly.offline as pyoff
import plotly.graph_objs as go
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# Initialise plotly's offline notebook mode.
pyoff.init_notebook_mode()

# Load the CSV (Latin-1 encoded) and parse the invoice timestamps into datetimes.
tx_data = pd.read_csv(
    r'C:\Users\aayus\OneDrive\Desktop\Aayu\College Project\OnlineRetail.csv',
    encoding='latin1',
)
tx_data['InvoiceDate'] = tx_data['InvoiceDate'].pipe(pd.to_datetime)
tx_data

# Keep only the United Kingdom rows and renumber the index from zero.
uk_rows = tx_data.query("Country=='United Kingdom'")
tx_uk = uk_rows.reset_index(drop=True)
tx_uk
Everything works perfectly up to here, but as soon as I add the following section of code, it raises an error.
# Create the 3-month and 6-month dataframes:
#   tx_3m: UK rows with 2011-03-01 <= InvoiceDate < 2011-06-01
#   tx_6m: UK rows with 2011-06-01 <= InvoiceDate < 2011-12-01
# NOTE(review): InvoiceDate was converted with pd.to_datetime (datetime64[ns]), while
# date(...) builds plain datetime.date objects; the pandas version in use rejects
# that comparison, which is the error reported for this snippet. Comparing against
# pd.Timestamp(2011, 6, 1), or against tx_uk.InvoiceDate.dt.date, avoids it.
tx_3m = tx_uk[(tx_uk.InvoiceDate < date(2011,6,1)) & (tx_uk.InvoiceDate >= date(2011,3,1))].reset_index(drop=True)
tx_6m = tx_uk[(tx_uk.InvoiceDate >= date(2011,6,1)) & (tx_uk.InvoiceDate < date(2011,12,1))].reset_index(drop=True)
The error is "Invalid comparison between dtype=datetime64[ns] and date"
I am still new to numpy and pandas, so would really appreciate the help.
Thank you, guys.
.dt.date converts the datetime Series to datetime.date values, which can be compared with date(2011,6,1),
e.g., tx_uk.InvoiceDate.dt.date < date(2011,6,1) will work. Alternatively, keep the column as datetime64 and compare it against pd.Timestamp(2011, 6, 1).