Trying to plot and regroup 2 dataframes

Trying to plot and regroup 2 dataframes - python

Here is my code :
# Question snippet: parse the sample CSV, then try to combine the full frame
# with its DISPLAY-only subset and plot the result.
import pandas as pd
import io
data="""
;Barcode;Created;Hash;Modified;Tag;Tag2
0;9780735711020;2019-02-22T22:35:06.628Z;None;2019-02-22T22:35:06.628Z;NEED_PICS;
1;3178041328890;2019-02-22T22:37:44.546Z;None;2019-02-22T22:37:44.546Z;DISPLAY;
2;8718951129597;2019-02-23T04:53:17.925Z;None;2019-02-23T04:53:17.925Z;DISPLAY;
3;3770006053078;2019-02-23T05:25:56.454Z;None;2019-02-23T05:25:56.454Z;DISPLAY;
4;3468080404892;2019-02-23T05:26:39.923Z;None;2019-02-23T05:26:39.923Z;NEED_PICS;
5;3517360013757;2019-02-23T05:27:24.910Z;None;2019-02-23T05:27:24.910Z;DISPLAY;
6;3464660000768;2019-02-23T05:27:51.379Z;None;2019-02-23T05:27:51.379Z;DISPLAY;
7;30073357;2019-02-23T06:20:53.075Z;None;2019-02-23T06:20:53.075Z;NEED_PICS;
8;02992;2019-02-23T06:22:57.326Z;None;2019-02-23T06:22:57.326Z;NEED_PICS;
9;3605532558776;2019-02-23T06:23:45.010Z;None;2019-02-23T06:23:45.010Z;NEED_PICS;
10;3605532558776;2019-02-23T06:23:48.291Z;None;2019-02-23T06:23:48.291Z;NEED_PICS;
11;3605532558776;2019-02-23T06:23:52.579Z;None;2019-02-23T06:23:52.579Z;NEED_PICS;
"""
from io import StringIO
TESTDATA = StringIO(data)
# The sample is semicolon-separated.
df = pd.read_csv(TESTDATA, sep=";")
# errors='coerce' turns timestamps that cannot be parsed into NaT.
df["Created"] = pd.to_datetime(df["Created"],errors='coerce')
df["Barcode"] = df["Barcode"].astype(str)
# Promote the first column of the CSV to the index.
df.set_index(df.columns[0], inplace=True)
df2 = df #df[df.Hash != "None"]
df3 = df2
# df3: only the rows whose Tag is DISPLAY; df2 keeps every row.
df3 = df3.loc[df3.Tag == "DISPLAY"]
# Outer-join on the timestamp; the shared column names get _x/_y suffixes and
# cells without a match are filled with 0.
df = df2.merge(df3, on='Created', how='outer').fillna(0)
# NOTE(review): Barcode was cast to str above, so this `+` is not a count of
# occurrences per day -- confirm what 'sum' is meant to hold.
df['sum'] = df['Barcode_x']+df['Barcode_y']
# NOTE(review): passes two Series as positional arguments to DataFrame.plot.
df.plot(df['sum'], df['Created'])
In the end, I am trying to plot two line graphs on the same plot.
I would like the number of occurrences grouped by day for two dataframes: df2 with all tags, and df3 with just the DISPLAY tag.
I would then like to plot two lines on the same graph: one with all occurrences per day over the whole time range, and one with the occurrences of just the DISPLAY tag.
So far I have only managed to get this:

# Count rows per day -- all tags vs. DISPLAY only -- and draw both series as
# lines on one figure.
import io
from io import StringIO

import matplotlib.dates as mdates
import matplotlib.pyplot as plt  # required: every plt.* call below depends on it
import pandas as pd

data="""
;Barcode;Created;Hash;Modified;Tag;Tag2
0;9780735711020;2019-02-22T22:35:06.628Z;None;2019-02-22T22:35:06.628Z;NEED_PICS;
1;3178041328890;2019-02-22T22:37:44.546Z;None;2019-02-22T22:37:44.546Z;DISPLAY;
2;8718951129597;2019-02-23T04:53:17.925Z;None;2019-02-23T04:53:17.925Z;DISPLAY;
3;3770006053078;2019-02-23T05:25:56.454Z;None;2019-02-23T05:25:56.454Z;DISPLAY;
4;3468080404892;2019-02-23T05:26:39.923Z;None;2019-02-23T05:26:39.923Z;NEED_PICS;
5;3517360013757;2019-02-23T05:27:24.910Z;None;2019-02-23T05:27:24.910Z;DISPLAY;
6;3464660000768;2019-02-23T05:27:51.379Z;None;2019-02-23T05:27:51.379Z;DISPLAY;
7;30073357;2019-02-23T06:20:53.075Z;None;2019-02-23T06:20:53.075Z;NEED_PICS;
8;02992;2019-02-23T06:22:57.326Z;None;2019-02-23T06:22:57.326Z;NEED_PICS;
9;3605532558776;2019-02-23T06:23:45.010Z;None;2019-02-23T06:23:45.010Z;NEED_PICS;
10;3605532558776;2019-02-23T06:23:48.291Z;None;2019-02-23T06:23:48.291Z;NEED_PICS;
11;3605532558776;2019-02-23T06:23:52.579Z;None;2019-02-23T06:23:52.579Z;NEED_PICS;
"""

TESTDATA = StringIO(data)
# Read Barcode as text so that codes with a leading zero (e.g. 02992) survive;
# parsing the column as integers first would silently drop the zero.
df = pd.read_csv(TESTDATA, sep=";", dtype={"Barcode": str})
# Keep only the calendar date so that rows are bucketed per day.
df["Created"] = pd.to_datetime(df["Created"], errors='coerce').dt.date
df["Barcode"] = df["Barcode"].astype(str)

# Number of rows per day: all tags, and DISPLAY only.
df1 = df.groupby(["Created"])["Tag"].count().reset_index()
df2 = df[df["Tag"] == "DISPLAY"].groupby(["Created"])["Tag"].count().reset_index()

# Dates are plotted on the y axis, so the custom date format goes on the y axis.
fig, ax = plt.subplots()
myFmt = mdates.DateFormatter('%Y-%m-%d')
ax.yaxis.set_major_formatter(myFmt)
ax.plot(df1['Tag'], df1['Created'], label='ALL')
ax.plot(df2['Tag'], df2['Created'], label="DISPLAY")
ax.legend(loc='upper left')
plt.show()
Note that since there is not much data, I have chopped off the time part of the timestamps:
df["Created"] = pd.to_datetime(df["Created"],errors='coerce').dt.date
You can modify it to suit your needs, depending on whether you want to bucket the tags by date, by date and hour, by date, hour and minute, etc.

Related

Interpreting Multiindex datetime

I have the following code:
# Question snippet: download daily prices, append year/month/week/day levels to
# the datetime index, and plot the Close column.
import pandas as pd
from pandas import DataFrame as df
import matplotlib
from pandas_datareader import data as web
import matplotlib.pyplot as plt
import datetime
import warnings
warnings.filterwarnings("ignore")
start = datetime.date(2020,1,1)
end = datetime.date.today()
stock = 'fb'
data = web.DataReader(stock, 'yahoo', start, end)
data.index = pd.to_datetime(data.index, format ='%Y-%m-%d')
# Drop rows whose index value repeats, keeping the first occurrence.
data = data[~data.index.duplicated(keep='first')]
# Calendar components derived from the datetime index ...
data['year'] = data.index.year
data['month'] = data.index.month
data['week'] = data.index.week
data['day'] = data.index.day
# ... appended one by one as extra index levels (append=True keeps the
# existing datetime level), which turns the index into a MultiIndex.
data.set_index('year', append=True, inplace =True)
data.set_index('month',append=True,inplace=True)
data.set_index('week',append=True,inplace=True)
data.set_index('day',append=True,inplace=True)
fig, ax = plt.subplots(dpi=300, figsize =(30,4))
# The x axis here comes from the whole MultiIndex, which is what the question
# is about.
data.plot(y='Close', ax=ax, xlabel= 'Date')
plt.show()
How can I get the multi-index dates displayed on the x axis in a more readable year-and-month format, for example something like strftime('%y -%m')? A similar question was asked here: Renaming months from number to name in pandas
But I am unable to see how I can use this to rename the x axis. Any help would be appreciated.

You can use the dates module from matplotlib (matplotlib.dates). See the following link for more details:
https://matplotlib.org/stable/api/dates_api.html#matplotlib.dates.ConciseDateFormatter
Here is the modified code:
# Plot the Close price against the 'Date' level of the MultiIndex and label the
# x axis with matplotlib's concise date formatter.
import pandas as pd
from pandas import DataFrame as df
import matplotlib
from pandas_datareader import data as web
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import datetime
import warnings

warnings.filterwarnings("ignore")

start = datetime.date(2020,1,1)
end = datetime.date.today()
stock = 'fb'
data = web.DataReader(stock, 'yahoo', start, end)
data.index = pd.to_datetime(data.index, format ='%Y-%m-%d')
# Drop rows whose index value repeats, keeping the first occurrence.
data = data[~data.index.duplicated(keep='first')]

# Calendar components derived from the datetime index.
data['year'] = data.index.year
data['month'] = data.index.month
# DatetimeIndex.week is deprecated (and absent from recent pandas releases);
# isocalendar().week is the documented replacement.
data['week'] = data.index.isocalendar().week.astype("int64")
data['day'] = data.index.day
# Append the four components as extra index levels behind the datetime level.
data.set_index(['year', 'month', 'week', 'day'], append=True, inplace=True)

fig, ax = plt.subplots(dpi=300, figsize =(15,4))
# Only the original datetime level is used for the x axis, so matplotlib sees
# real dates instead of MultiIndex tuples.
ax.plot(data.index.get_level_values('Date'), data['Close'])
#--------------------------------------
#Feel free to try different options
#--------------------------------------
#locator = mdates.AutoDateLocator()
locator = mdates.MonthLocator()
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)
plt.show()
Here is the output.

Apply style to specific columns in dataframe

I want to format only specific columns of the dataframe. Right now the styling gets
applied to every column. According to the documentation, this can be restricted with the subset parameter
of df.style.background_gradient(). I tried subset = df.iloc[:, 3:] because I want to apply the styling from column 3
onwards, but I get a "too many indexers" error.
# Question snippet: apply a green background gradient to the frame and write
# the styled table to output.html.
import pandas as pd
import seaborn as sns
filename = r'c:\Users\91956\Desktop\time_50.csv'
df = pd.read_csv(filename)
cm = sns.light_palette("green", as_cmap=True)
# No subset argument is passed here.
s = df.style.background_gradient(cmap=cm, axis = 1)
html = s.render()
# NOTE(review): the body of the with-block has lost its indentation in this
# listing.
with open("output.html","w") as fp:
fp.write(html)
What is the correct way to write the subset parameter so that the styling applies only to the
columns from index 3 onwards?
After making the necessary changes:
# Apply a green background gradient only to the trailing columns and write the
# styled table to output.html.
import pandas as pd
import seaborn as sns

filename = r'c:\Users\91956\Desktop\time_50.csv'
# index_col=0 moves the first CSV column into the index, so every remaining
# column shifts one position to the left: position 2 here is CSV column 3.
df = pd.read_csv(filename, index_col=0)
cm = sns.light_palette("green", as_cmap=True)
select_col = df.columns[2:]
s = df.style.background_gradient(cmap=cm, axis=1, subset=select_col)
# Styler.render() is deprecated in favour of Styler.to_html(); use the new
# method when the installed pandas provides it and fall back otherwise.
html = s.to_html() if hasattr(s, "to_html") else s.render()
with open("output.html", "w") as fp:
    fp.write(html)

To remove the 'Unnamed: 0' column you can use:
# set_index returns a new frame rather than modifying df, so the result has to
# be assigned back; the stray 'Unnamed: 0' column is dropped in the same step.
df = df.drop(columns='Unnamed: 0').set_index('Ticker')
To use subset :
# Labels of the columns from position 3 onwards.
select_col = df.columns[3:]
# Pass those labels as subset to the gradient call.
s = df.style.background_gradient(cmap=cm, axis = 1,subset=select_col)

A quick and easy way to drop the unnamed column is:
# Keep every column whose name does not start with 'Unnamed'.
new_df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

python time series country data

I have these columns in a CSV file and want to plot UK and France oranges as a time series.
I can plot UK oranges as a time series, but I can't plot UK + France oranges as a time series.
Can someone help me?
Please look at the attached picture for the CSV contents.
Oranges_Apples_country.csv
# Question snippet: load the fruit CSV and attempt to set up the
# country/product/unit columns.
import logging
import datetime
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Import the oranges data from the CSV file.
df = pd.read_csv('Oranges_Apples_country.csv')
df.head()
df['date'] = pd.to_datetime(df['date'])
# NOTE(review): the result of set_index(...).head() is not assigned, so df
# itself is unchanged by this line.
df.set_index('date').head()
df = pd.DataFrame(df, columns = ['country', 'product', 'unit', 'date'])
# NOTE(review): these assign fixed-length lists (2, 2 and 1 items) as whole
# columns -- confirm that this matches the number of rows in df.
df['country'] = ['UK', 'FRANCE']
df['product'] = ['ORANGES', 'APPLE']
df['unit'] = ['KG']

Here is some code to subset on the axes you mentioned. I took the for loop from another SO post, but I don't remember the link. Apologies to that contributor.
# Draw one QUANTITY line per (COUNTRY, PRODUCT) group for UK and FRANCE oranges.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (10,8)
fig, ax = plt.subplots()

df = pd.read_csv('fruit.csv', parse_dates = ['DATE'])
df.set_index('DATE', inplace=True)

# Keep only the ORANGE rows of the two countries of interest.
df2 = df[df['COUNTRY'].isin(['UK', 'FRANCE']) & (df['PRODUCT'] == 'ORANGE')]

# Each group is drawn on the same axes; the group keys are collected so they
# can replace the default legend entries afterwards.
labels = []
for key, grp in df2.groupby(['COUNTRY', 'PRODUCT']):
    ax = grp.plot(ax = ax, y = 'QUANTITY')
    labels.append(key)
lines, _ = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='upper left')
plt.show()
You can sum using groupby.sum().
# Total per date across the selected rows. numeric_only=True keeps the text
# columns (COUNTRY, PRODUCT) out of the sum.
df3 = df2.groupby('DATE').sum(numeric_only=True)
df3.plot()
plt.show()

Pandas Line Graph by Month, Grouped by Industry from Timestamped SQL Export

Newbie question, thank you in advance!
I'm trying to group the data by both date and industry and display a chart that shows the different industry revenue numbers across the time series in monthly increments.
I am working from a SQL export that has timestamps, and I'm having a bear of a time getting this to work.
Posted sample csv data file here:
https://drive.google.com/open?id=0B4xdnV0LFZI1WGRMN3AyU2JERVU
Here's a small data example:
Industry Date Revenue
Fast Food 01-05-2016 12:18:02 100
Fine Dining 01-08-2016 09:17:48 110
Carnivals 01-18-2016 10:48:52 200
My failed attempt is here:
# Question snippet (failed attempt): monthly revenue per industry from a
# timestamped export.
import pandas as pd
import datetime
import matplotlib.pyplot as plt
df = pd.read_csv('2012_to_12_27_2016.csv')
df['Ship_Date'] = pd.to_datetime(df['Ship_Date'], errors = 'coerce')
df['Year'] = df.Ship_Date.dt.year
# normalize() resets the time-of-day of every timestamp to midnight.
df['Ship_Date'] = pd.DatetimeIndex(df.Ship_Date).normalize()
df.index = df['Ship_Date']
df_skinny = df[['Shipment_Piece_Revenue', 'Industry']]
groups = df_skinny[['Shipment_Piece_Revenue', 'Industry']].groupby('Industry')
# Monthly totals within each industry group.
groups = groups.resample('M').sum()
# NOTE(review): this overwrites the index of the resampled result with the
# original per-row Ship_Date values -- confirm that the lengths agree.
groups.index = df['Ship_Date']
fig, ax = plt.subplots()
groups.plot(ax=ax, legend=False)
# NOTE(review): `groups` was rebound above to the resample().sum() result, not
# the groupby object -- verify what iterating it yields here.
names = [item[0] for item in groups]
ax.legend(ax.lines, names, loc='best')
plt.show()

You could use Series.unique (here df.Industry.unique()) to get a list of all industries and then, using DataFrame.loc, define a new DataFrame object that only contains data from a single industry.
Then if we set the Ship Date column as the index of the new DataFrame, we can use DataFrame.resample, specify the frequency as months and call sum() to get the total revenue for that month.
# Plot total monthly revenue as one line per industry on a shared axes.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Graph_Sample_Data.csv')
df['Ship Date'] = pd.to_datetime(df['Ship Date'], errors='coerce')
# errors='coerce' turns unparseable dates into NaT; such rows cannot be placed
# in any month, so they are left out before resampling.
df = df.dropna(subset=['Ship Date'])

fig, ax = plt.subplots()
for industry in df.Industry.dropna().unique():
    industry_df = df.loc[df.Industry == industry]
    # Only the Revenue column is summed; the date becomes the index so that
    # resample can bucket the rows by month.
    monthly = industry_df.set_index('Ship Date')[['Revenue']].resample('M').sum()
    # With no x given, the (monthly) index is used for the x axis.
    monthly.plot(y='Revenue',
                 ax=ax,
                 label=industry)
plt.show()

Matplotlib plot.fill plots data incorrectly, fills negative and shifts

I am having issues plotting with matplotlib and using the fill plot type. I have attached a picture of what my plt.fill looks like. I created the plot from a dataframe containing datetimes in the first column and then calculated values in the next column. I would gladly post the sample data if required, but I wanted to show what is happening with my plot...it seems very strange that it fills below 0 (unless I am missing something here) and then at the end it shifts everything over along the diagonal line.
Any help is greatly appreciated!
# Question snippet: merge three chilled-water CSV exports onto a one-minute
# time grid, derive a 'tons' column, and try to plot it over two weeks.
import pandas as pd
import numpy as np
import os as os
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime as dt
# Set the working directory.
working_path = '/Users/Earth/desktop/mydata'
os.chdir(working_path)
# Names of the CSV files.
chilled_water_supply = 'chilled_water_supply1.csv'
chilled_water_return = 'chilled_water_return1.csv'
chilled_water_flow = 'CofK_CW_Flow.csv'
# Parser for the date-time format used in the CSV files (month/day/year hour:minute).
datetimeparse1 = lambda x: pd.datetime.strptime(x, '%m/%d/%Y %H:%M')
# Import the chilled water temperatures and flow.
df_chwsup = pd.read_csv(chilled_water_supply, parse_dates = ['Date_Time'], date_parser = datetimeparse1)
df_chwret = pd.read_csv(chilled_water_return, parse_dates = ['Date_Time'], date_parser = datetimeparse1)
df_chwflow = pd.read_csv(chilled_water_flow, parse_dates = ['Date_Time'], date_parser = datetimeparse1)
# Start date-time and length of the period: 14 days of one-minute intervals.
startdate = dt.datetime(2015,7,14,11,41,0)
numintervals = (14*24*60)
# Data frame with a default row-number index and one Date_Time row per minute.
df_datetime = pd.DataFrame(pd.date_range(startdate ,periods = numintervals, freq = "1min"), columns = ["Date_Time"])
# Build the master data frame by left-joining each source onto the time grid.
df_mstr1 = pd.merge(df_datetime, df_chwsup, how = 'left', left_on = 'Date_Time', right_on = 'Date_Time')
df_mstr2 = pd.merge(df_mstr1, df_chwret, how = 'left', left_on = 'Date_Time', right_on = 'Date_Time')
df_mstr3 = pd.merge(df_mstr2, df_chwflow, how= 'left', left_on = 'Date_Time', right_on = 'Date_Time')
# tons = 500 * flow * (return temp - supply temp) / 12000.
# NOTE(review): the three operands are taken from df_mstr3, df_mstr2 and
# df_mstr1 respectively rather than all from df_mstr3.
df_mstr3['tons'] = 500*(1/12000)*df_mstr3['flow_gpm']*(df_mstr2['chwr_temp_F'] - df_mstr1['chws_temp_F'])
# Plot cooling tons over the date-time period.
x1 = df_mstr3['Date_Time']
y1 = df_mstr3['tons']
# NOTE(review): this constructs a Line2D object but the snippet neither keeps
# it nor adds it to an axes.
plt.Line2D(x1,y1)
plt.xlim(dt.datetime(2015,7,14), dt.datetime(2015,7,28))
plt.ylim(-100,300)
plt.show()

Maybe try using the pandas API for matplotlib:
# Use the timestamps as the index so the area plot is drawn against them.
# NOTE(review): `df` and `date_time` look like placeholders -- the question's
# frame is df_mstr3 and its timestamp column is Date_Time; confirm before use.
df.index = df.date_time
df['tons'].plot(kind='area')

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Trying to plot and regroup 2 dataframes - python

Related

Interpreting Multiindex datetime

Apply style to specific columns in dataframe

python time series country data

Pandas Line Graph by Month, Grouped by Industry from Timestamped SQL Export

Matplotlib plot.fill plots data incorrectly, fills negative and shifts

Categories

Resources