You can download the updated notebook from my GitHub page.
A time series is simply a representation of data points over time. Time series exist everywhere in nature and play a strong role in business. Domain examples are temperatures, heartbeats, births, internet traffic, stocks, sales and industrial production. Efficient processing and forecasting of time series data provide decisive advantages: they can help businesses implement their strategies ahead of time.
Time Series Analysis (TSA) uses methods for analyzing time series data in order to identify useful patterns and extract meaningful statistics from the data. There are two major goals of TSA: identifying the nature of the patterns underlying the sequence of observations, and forecasting (predicting future values of the series).
Cool stuff eh?
import numpy as np, pandas as pd
import itertools #provides various functions that work on iterators
import matplotlib
import matplotlib.pyplot as plt
import xlrd # in the case where raw data is in Excel/xls format
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['axes.titlesize'] = 15
matplotlib.rcParams['text.color'] = 'k'
matplotlib.rcParams['figure.figsize'] = (14, 5)
df = pd.read_excel("Superstore.xls")
print('Data downloaded and read into a dataframe!')
Data downloaded and read into a dataframe!
df.head(3) # A quick preview of 3 rows of the dataframe
| | Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | CA-2016-152156 | 2016-11-08 | 2016-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-BO-10001798 | Furniture | Bookcases | Bush Somerset Collection Bookcase | 261.96 | 2 | 0.0 | 41.9136 |
1 | 2 | CA-2016-152156 | 2016-11-08 | 2016-11-11 | Second Class | CG-12520 | Claire Gute | Consumer | United States | Henderson | ... | 42420 | South | FUR-CH-10000454 | Furniture | Chairs | Hon Deluxe Fabric Upholstered Stacking Chairs,... | 731.94 | 3 | 0.0 | 219.5820 |
2 | 3 | CA-2016-138688 | 2016-06-12 | 2016-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.62 | 2 | 0.0 | 6.8714 |
3 rows × 21 columns
The Category column is an interesting feature of the Superstore data: it lets us divide the data into several subsets according to the store's product categories, Office Supplies, Furniture and Technology.
# 1. List the column names
df.columns.values
array(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'], dtype=object)
# 2. Check each column for missing values
df.isnull().any()
Row ID           False
Order ID         False
Order Date       False
Ship Date        False
Ship Mode        False
Customer ID      False
Customer Name    False
Segment          False
Country          False
City             False
State            False
Postal Code      False
Region           False
Product ID       False
Category         False
Sub-Category     False
Product Name     False
Sales            False
Quantity         False
Discount         False
Profit           False
dtype: bool
# 3.
df.describe() # View basic statistics
| | Row ID | Postal Code | Sales | Quantity | Discount | Profit |
---|---|---|---|---|---|---|
count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
mean | 4997.500000 | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
std | 2885.163629 | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
min | 1.000000 | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
25% | 2499.250000 | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
50% | 4997.500000 | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
75% | 7495.750000 | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
max | 9994.000000 | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
# 4. Inspect the product categories and their order counts
df.Category.unique()
array(['Furniture', 'Office Supplies', 'Technology'], dtype=object)
df.Category.value_counts()
Office Supplies    6026
Furniture          2121
Technology         1847
Name: Category, dtype: int64
Office Supplies
For a basic time series analysis, we use the Office Supplies data segment as an example. We have a good four years of office supplies sales data, and we will try to find some meaningful patterns in it.
# Extract the Office Supplies subset of the data
office_supplies = df.loc[df['Category'] == 'Office Supplies']
office_supplies.head(3)
| | Row ID | Order ID | Order Date | Ship Date | Ship Mode | Customer ID | Customer Name | Segment | Country | City | ... | Postal Code | Region | Product ID | Category | Sub-Category | Product Name | Sales | Quantity | Discount | Profit |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 3 | CA-2016-138688 | 2016-06-12 | 2016-06-16 | Second Class | DV-13045 | Darrin Van Huff | Corporate | United States | Los Angeles | ... | 90036 | West | OFF-LA-10000240 | Office Supplies | Labels | Self-Adhesive Address Labels for Typewriters b... | 14.620 | 2 | 0.0 | 6.8714 |
4 | 5 | US-2015-108966 | 2015-10-11 | 2015-10-18 | Standard Class | SO-20335 | Sean O'Donnell | Consumer | United States | Fort Lauderdale | ... | 33311 | South | OFF-ST-10000760 | Office Supplies | Storage | Eldon Fold 'N Roll Cart System | 22.368 | 2 | 0.2 | 2.5164 |
6 | 7 | CA-2014-115812 | 2014-06-09 | 2014-06-14 | Standard Class | BH-11710 | Brosina Hoffman | Consumer | United States | Los Angeles | ... | 90032 | West | OFF-AR-10002833 | Office Supplies | Art | Newell 322 | 7.280 | 4 | 0.0 | 1.9656 |
3 rows × 21 columns
What are the minimum and maximum order dates?
# Answer:
office_supplies['Order Date'].min(), office_supplies['Order Date'].max()
(Timestamp('2014-01-03 00:00:00'), Timestamp('2017-12-30 00:00:00'))
# 1. Keep only the 'Order Date' and 'Sales' columns
office_supplies = office_supplies[['Order Date','Sales']]
office_supplies.head()
| | Order Date | Sales |
---|---|---|
2 | 2016-06-12 | 14.620 |
4 | 2015-10-11 | 22.368 |
6 | 2014-06-09 | 7.280 |
8 | 2014-06-09 | 18.504 |
9 | 2014-06-09 | 114.900 |
# 2. Sort by date and aggregate total sales per order date
office_supplies = office_supplies.sort_values('Order Date')
office_supplies = office_supplies.groupby('Order Date')['Sales'].sum().reset_index()
office_supplies.head(10)
| | Order Date | Sales |
---|---|---|
0 | 2014-01-03 | 16.448 |
1 | 2014-01-04 | 288.060 |
2 | 2014-01-05 | 19.536 |
3 | 2014-01-06 | 685.340 |
4 | 2014-01-07 | 10.430 |
5 | 2014-01-09 | 9.344 |
6 | 2014-01-10 | 2.890 |
7 | 2014-01-13 | 2027.116 |
8 | 2014-01-16 | 48.660 |
9 | 2014-01-18 | 64.864 |
Order Date (the Time Series data points)
Set 'Order Date' as the index and take a quick view of the resulting datetime index.
office_supplies = office_supplies.set_index('Order Date')
office_supplies.head()
Order Date | Sales |
---|---|
2014-01-03 | 16.448 |
2014-01-04 | 288.060 |
2014-01-05 | 19.536 |
2014-01-06 | 685.340 |
2014-01-07 | 10.430 |
office_supplies.index
DatetimeIndex(['2014-01-03', '2014-01-04', '2014-01-05', '2014-01-06', '2014-01-07', '2014-01-09', '2014-01-10', '2014-01-13', '2014-01-16', '2014-01-18', ... '2017-12-21', '2017-12-22', '2017-12-23', '2017-12-24', '2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29', '2017-12-30'], dtype='datetime64[ns]', name='Order Date', length=1148, freq=None)
Hmm! As we can see from the index, our dates are not continuous (there are days with no orders), which can be tricky to analyze. Hence, we can use the average daily sales value for each month instead, with the start of each month as the timestamp. This requires us to resample the data.
df_monthly = office_supplies['Sales'].resample('MS').mean()
df_monthly.head(10)
Order Date
2014-01-01     285.357647
2014-02-01      63.042588
2014-03-01     391.176318
2014-04-01     464.794750
2014-05-01     324.346545
2014-06-01     588.774409
2014-07-01     756.060400
2014-08-01     541.879143
2014-09-01    1015.677704
2014-10-01     267.078815
Freq: MS, Name: Sales, dtype: float64
df_monthly.plot()
plt.xlabel('Order Date')
plt.ylabel('Sales')
plt.title('Superstore Sales')
plt.show()
Some peculiar patterns appear in the plot. The time series has a seasonality pattern: sales follow a cycle that repeats every year, on top of a trend that gradually increases through the years. A quick visual check of this yearly cycle is sketched below.
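As a rough, optional check (not part of the decomposition that follows), the sketch below overlays each year's monthly values of the df_monthly series created above on a single set of axes; a similar shape repeating from line to line is the yearly cycle.
# Illustrative only: overlay the monthly average daily sales of each year
monthly = df_monthly.to_frame(name='Sales')
monthly['Year'] = monthly.index.year
monthly['Month'] = monthly.index.month

# One line per year; a repeating shape across the lines indicates yearly seasonality
monthly.pivot_table(index='Month', columns='Year', values='Sales').plot(marker='o')
plt.xlabel('Month')
plt.ylabel('Average daily sales')
plt.title('Office Supplies sales by month, one line per year')
plt.show()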
Finally, to round off these Time Series Analysis basics, we can also visualize our data using a method called time-series decomposition. This process allows us to decompose our time series into three distinct components: trend, seasonality and residual (noise).
decomposition = sm.tsa.seasonal_decompose(df_monthly, model = 'additive')
fig = decomposition.plot()
plt.show()
It is important to note the role of resampling in a time series analysis, and that our features are usually represented as a function of time. The decomposition plot above clearly shows the seasonality in our data, and the trend is gradually increasing through the years.
Moving forward in time series analysis, forecasting or predicting features like sales can positively influence business decisions. Autoregressive Integrated Moving Average, known as ARIMA, is a commonly used method for time-series prediction. Another interesting forecasting method is time series modeling with **Prophet**. Prophet is designed for analyzing time series that display patterns on different time scales, such as yearly, weekly and daily. It also has advanced capabilities for modeling the effects of holidays on a time series and implementing custom change-points.
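To give a flavour of what such a forecast looks like, here is a minimal seasonal ARIMA sketch built on the statsmodels SARIMAX class (statsmodels is already imported above as sm). The order (1, 1, 1) and seasonal order (1, 1, 0, 12) are assumed starting values for illustration, not a tuned model; in practice you would search over (p, d, q) combinations, for example with itertools.product, and compare AIC values.
# A minimal seasonal ARIMA sketch on the monthly Office Supplies series.
# The orders below are illustrative assumptions, not tuned values.
mod = sm.tsa.statespace.SARIMAX(df_monthly,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])

# Forecast the next 12 months and plot them with confidence intervals
pred = results.get_forecast(steps=12)
pred_ci = pred.conf_int()
ax = df_monthly.plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='forecast')
ax.fill_between(pred_ci.index, pred_ci.iloc[:, 0], pred_ci.iloc[:, 1], alpha=0.25)
ax.set_xlabel('Order Date')
ax.set_ylabel('Sales')
plt.legend()
plt.show()
And a similarly minimal Prophet sketch, assuming the prophet package is installed (older releases were published as fbprophet). Prophet expects a dataframe with the date column named 'ds' and the value column named 'y'.
from prophet import Prophet  # older versions: from fbprophet import Prophet

# Reshape the monthly series into the 'ds'/'y' layout Prophet expects
prophet_df = df_monthly.reset_index().rename(columns={'Order Date': 'ds', 'Sales': 'y'})
m = Prophet(yearly_seasonality=True)
m.fit(prophet_df)

# Forecast 12 months beyond the data and plot the result
future = m.make_future_dataframe(periods=12, freq='MS')
forecast = m.predict(future)
m.plot(forecast)
plt.show()
Both snippets are only sketches of where the analysis could go next; model selection, diagnostics and validation deserve their own discussion.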