Read from EXCEL File - Export to EXCEL File - Calculating Outliers - Using Lambda functions - Slice and dice data of Python Pandas module
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as np
%matplotlib inline
In [2]:
np.seed(111)
def create_data_set(Number=1):
output_array =[]
date_range = pd.date_range(start='1/1/2009', end='12/31/2013', freq='W-MON')
data = np.randint(low=25,high=1000,size=len(date_range))
status = [1, 2, 3]
random_status = [status[np.randint(low=0, high=len(status))] for i in range(len(date_range))]
states = ['GA','FL','fl','NY','NJ','TX']
random_states = [states[np.randint(low=0, high=len(states))] for i in range(len(date_range))]
output_array.extend(zip(random_states, random_status, data, date_range))
return output_array
In [3]:
created_dataset = create_data_set(4)
df = pd.DataFrame(data=created_dataset, columns=['States', 'Status', 'CustomerCount', 'StatusDate'])
df.info()
RangeIndex: 261 entries, 0 to 260
Data columns (total 4 columns):
States 261 non-null object
Status 261 non-null int64
CustomerCount 261 non-null int64
StatusDate 261 non-null datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 8.2+ KB
In [4]:
df.head()
Out[4]:
States | Status | CustomerCount | StatusDate | |
---|---|---|---|---|
0 | fl | 2 | 877 | 2009-01-05 |
1 | NY | 2 | 901 | 2009-01-12 |
2 | FL | 3 | 749 | 2009-01-19 |
3 | TX | 3 | 111 | 2009-01-26 |
4 | GA | 3 | 300 | 2009-02-02 |
In [5]:
location = 'Data/lession3.xlsx'
df.to_excel(location, index=False)
print('Done')
Done
In [6]:
df = pd.read_excel(location, 0, index_col='StatusDate')
df.dtypes
Out[6]:
States object
Status int64
CustomerCount int64
dtype: object
In [7]:
df.index
Out[7]:
DatetimeIndex(['2009-01-05', '2009-01-12', '2009-01-19', '2009-01-26',
'2009-02-02', '2009-02-09', '2009-02-16', '2009-02-23',
'2009-03-02', '2009-03-09',
...
'2013-10-28', '2013-11-04', '2013-11-11', '2013-11-18',
'2013-11-25', '2013-12-02', '2013-12-09', '2013-12-16',
'2013-12-23', '2013-12-30'],
dtype='datetime64[ns]', name='StatusDate', length=261, freq=None)
In [8]:
df.head()
Out[8]:
States | Status | CustomerCount | |
---|---|---|---|
StatusDate | |||
2009-01-05 | fl | 2 | 877 |
2009-01-12 | NY | 2 | 901 |
2009-01-19 | FL | 3 | 749 |
2009-01-26 | TX | 3 | 111 |
2009-02-02 | GA | 3 | 300 |
In [9]:
df['States'].unique()
Out[9]:
array(['fl', 'NY', 'FL', 'TX', 'GA', 'NJ'], dtype=object)
In [10]:
df['States'] = df.States.apply(lambda x: x.upper())
In [11]:
df['States'].unique()
Out[11]:
array(['FL', 'NY', 'TX', 'GA', 'NJ'], dtype=object)
In [12]:
mask = df['Status'] == 1
df = df[mask]
In [13]:
df.head()
Out[13]:
States | Status | CustomerCount | |
---|---|---|---|
StatusDate | |||
2009-02-16 | NY | 1 | 347 |
2009-03-09 | TX | 1 | 992 |
2009-03-30 | GA | 1 | 293 |
2009-04-13 | FL | 1 | 474 |
2009-05-25 | NJ | 1 | 46 |
In [14]:
mask = df.States == 'NJ'
df['States'][mask] = 'NY'
In [15]:
df['States'].unique()
Out[15]:
array(['NY', 'TX', 'GA', 'FL'], dtype=object)
In [16]:
df['CustomerCount'].plot(figsize=(15,5))
Out[16]:
In [17]:
# sortdf = df[df['States'] == 'NY'].sort_values('CustomerCount', ascending=False)
sortdf = df[df['States'] == 'NY'].sort_index(axis=0)
sortdf.head(10)
Out[17]:
States | Status | CustomerCount | |
---|---|---|---|
StatusDate | |||
2009-02-16 | NY | 1 | 347 |
2009-05-25 | NY | 1 | 46 |
2009-06-29 | NY | 1 | 767 |
2009-08-31 | NY | 1 | 550 |
2009-09-14 | NY | 1 | 772 |
2009-10-12 | NY | 1 | 694 |
2010-05-03 | NY | 1 | 225 |
2010-09-13 | NY | 1 | 615 |
2010-10-25 | NY | 1 | 321 |
2010-12-13 | NY | 1 | 453 |
In [18]:
Daily = df.reset_index().groupby(['States', 'StatusDate']).sum()
Daily.head()
Out[18]:
Status | CustomerCount | ||
---|---|---|---|
States | StatusDate | ||
FL | 2009-04-13 | 1 | 474 |
2009-09-21 | 1 | 495 | |
2009-11-16 | 1 | 634 | |
2009-12-14 | 1 | 510 | |
2009-12-21 | 1 | 285 |
In [19]:
del Daily['Status']
Daily.head()
Out[19]:
CustomerCount | ||
---|---|---|
States | StatusDate | |
FL | 2009-04-13 | 474 |
2009-09-21 | 495 | |
2009-11-16 | 634 | |
2009-12-14 | 510 | |
2009-12-21 | 285 |
In [20]:
Daily.index
Out[20]:
MultiIndex(levels=[['FL', 'GA', 'NY', 'TX'], [2009-02-16 00:00:00, 2009-03-09 00:00:00, 2009-03-30 00:00:00, 2009-04-13 00:00:00, 2009-05-25 00:00:00, 2009-06-29 00:00:00, 2009-07-06 00:00:00, 2009-08-31 00:00:00, 2009-09-07 00:00:00, 2009-09-14 00:00:00, 2009-09-21 00:00:00, 2009-10-12 00:00:00, 2009-10-19 00:00:00, 2009-11-16 00:00:00, 2009-12-14 00:00:00, 2009-12-21 00:00:00, 2010-02-01 00:00:00, 2010-03-22 00:00:00, 2010-03-29 00:00:00, 2010-04-05 00:00:00, 2010-04-19 00:00:00, 2010-04-26 00:00:00, 2010-05-03 00:00:00, 2010-05-10 00:00:00, 2010-05-31 00:00:00, 2010-06-14 00:00:00, 2010-07-05 00:00:00, 2010-07-19 00:00:00, 2010-08-09 00:00:00, 2010-09-13 00:00:00, 2010-09-27 00:00:00, 2010-10-11 00:00:00, 2010-10-25 00:00:00, 2010-11-15 00:00:00, 2010-12-13 00:00:00, 2010-12-20 00:00:00, 2011-01-17 00:00:00, 2011-01-24 00:00:00, 2011-01-31 00:00:00, 2011-02-14 00:00:00, 2011-02-21 00:00:00, 2011-03-21 00:00:00, 2011-03-28 00:00:00, 2011-04-11 00:00:00, 2011-04-18 00:00:00, 2011-05-02 00:00:00, 2011-05-16 00:00:00, 2011-05-23 00:00:00, 2011-06-06 00:00:00, 2011-06-20 00:00:00, 2011-08-15 00:00:00, 2011-08-29 00:00:00, 2011-09-12 00:00:00, 2011-10-17 00:00:00, 2011-10-24 00:00:00, 2011-11-07 00:00:00, 2011-12-05 00:00:00, 2012-01-09 00:00:00, 2012-01-23 00:00:00, 2012-02-20 00:00:00, 2012-03-26 00:00:00, 2012-06-04 00:00:00, 2012-06-25 00:00:00, 2012-07-02 00:00:00, 2012-07-09 00:00:00, 2012-07-30 00:00:00, 2012-08-20 00:00:00, 2012-08-27 00:00:00, 2012-09-03 00:00:00, 2012-10-01 00:00:00, 2012-10-15 00:00:00, 2012-11-19 00:00:00, 2012-12-03 00:00:00, 2013-01-14 00:00:00, 2013-01-21 00:00:00, 2013-02-04 00:00:00, 2013-03-04 00:00:00, 2013-03-25 00:00:00, 2013-04-15 00:00:00, 2013-04-29 00:00:00, 2013-05-20 00:00:00, 2013-06-03 00:00:00, 2013-06-17 00:00:00, 2013-07-01 00:00:00, 2013-07-15 00:00:00, 2013-08-05 00:00:00, 2013-08-26 00:00:00, 2013-09-16 00:00:00, 2013-10-14 00:00:00, 2013-10-21 00:00:00, 2013-10-28 00:00:00, 2013-12-02 00:00:00]],
labels=[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [3, 10, 13, 14, 15, 19, 20, 21, 24, 25, 26, 28, 33, 36, 39, 40, 43, 47, 51, 55, 56, 62, 67, 71, 73, 74, 75, 77, 83, 88, 2, 6, 8, 16, 17, 18, 31, 35, 59, 64, 68, 69, 78, 79, 85, 0, 4, 5, 7, 9, 11, 22, 29, 32, 34, 37, 38, 41, 44, 46, 48, 49, 57, 58, 61, 70, 72, 80, 81, 84, 86, 87, 89, 90, 91, 1, 12, 23, 27, 30, 42, 45, 50, 52, 53, 54, 60, 63, 65, 66, 76, 82]],
names=['States', 'StatusDate'])
In [21]:
Daily.index.levels[0]
Out[21]:
Index(['FL', 'GA', 'NY', 'TX'], dtype='object', name='States')
In [22]:
Daily.loc['FL'].plot()
Daily.loc['GA'].plot()
Daily.loc['NY'].plot()
Daily.loc['TX'].plot()
Out[22]:
In [23]:
Daily.loc['FL']['2012':].plot()
Daily.loc['GA']['2012':].plot()
Daily.loc['NY']['2012':].plot()
Daily.loc['TX']['2010':].plot()
Out[23]:
In [24]:
StateYearMonth = Daily.groupby([Daily.index.get_level_values(0), Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform(lambda x: x.quantile(q=.75) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)))
Daily['Upper'] = StateYearMonth['CustomerCount'].transform(lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)))
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper'])
Daily[Daily['Outlier'] == False]
Daily.head(10)
Out[24]:
CustomerCount | Lower | Upper | Outlier | ||
---|---|---|---|---|---|
States | StatusDate | ||||
FL | 2009-04-13 | 474 | 237.000 | 711.000 | False |
2009-09-21 | 495 | 247.500 | 742.500 | False | |
2009-11-16 | 634 | 317.000 | 951.000 | False | |
2009-12-14 | 510 | 114.375 | 793.125 | False | |
2009-12-21 | 285 | 114.375 | 793.125 | False | |
2010-04-05 | 212 | 97.500 | 940.500 | False | |
2010-04-19 | 502 | 97.500 | 940.500 | False | |
2010-04-26 | 536 | 97.500 | 940.500 | False | |
2010-05-31 | 162 | 81.000 | 243.000 | False | |
2010-06-14 | 668 | 334.000 | 1002.000 | False |
In [25]:
ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()
Out[25]:
CustomerCount | Max | |
---|---|---|
StatusDate | ||
2009-02-16 | 347 | 347 |
2009-03-09 | 992 | 992 |
2009-03-30 | 293 | 992 |
2009-04-13 | 474 | 474 |
2009-05-25 | 46 | 46 |
In [26]:
data = [1000,2000,3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG
Out[26]:
BHAG | |
---|---|
2011-12-31 | 1000 |
2012-12-31 | 2000 |
2013-12-31 | 3000 |
In [30]:
combined = pd.concat([ALL, BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()
Out[30]:
BHAG | CustomerCount | Max | |
---|---|---|---|
2013-10-14 | NaN | 673.0 | 673.0 |
2013-10-21 | NaN | 373.0 | 673.0 |
2013-10-28 | NaN | 397.0 | 673.0 |
2013-12-02 | NaN | 851.0 | 851.0 |
2013-12-31 | 3000.0 | NaN | NaN |
In [31]:
fig, axes = plt.subplots(figsize=(12, 7))
combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best');
In [33]:
ALL['Max'].plot(figsize=(10,5))
plt.title('All Markets')
# Last four Graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0)
Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1])
Daily.loc['TX']['CustomerCount']['2010':].fillna(method='pad').plot(ax=axes[1,0])
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1])
# Add titles
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East');