In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calmap
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
In [20]:
df=pd.read_csv('supermarket_sales - Sheet1.csv')
df.columns
Out[20]:
Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender',
       'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date',
       'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income',
       'Rating'],
      dtype='object')
In [21]:
df.dtypes
Out[21]:
Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Date                        object
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object
In [22]:
df['Date']=pd.to_datetime(df['Date'])
df.set_index('Date',inplace=True)
df.head(10)
Out[22]:
Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total Time Payment cogs gross margin percentage gross income Rating
Date
2019-01-05 750-67-8428 A Yangon Member Female Health and beauty 74.69 7 26.1415 548.9715 13:08 Ewallet 522.83 4.761905 26.1415 9.1
2019-03-08 226-31-3081 C Naypyitaw Normal Female Electronic accessories 15.28 5 3.8200 80.2200 10:29 Cash 76.40 4.761905 3.8200 9.6
2019-03-03 631-41-3108 A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 340.5255 13:23 Credit card 324.31 4.761905 16.2155 7.4
2019-01-27 123-19-1176 A Yangon Member Male Health and beauty 58.22 8 23.2880 489.0480 20:33 Ewallet 465.76 4.761905 23.2880 8.4
2019-02-08 373-73-7910 A Yangon Normal Male Sports and travel 86.31 7 30.2085 634.3785 10:37 Ewallet 604.17 4.761905 30.2085 5.3
2019-03-25 699-14-3026 C Naypyitaw Normal Male Electronic accessories 85.39 7 29.8865 627.6165 18:30 Ewallet 597.73 4.761905 29.8865 4.1
2019-02-25 355-53-5943 A Yangon Member Female Electronic accessories 68.84 6 20.6520 433.6920 14:36 Ewallet 413.04 4.761905 20.6520 5.8
2019-02-24 315-22-5665 C Naypyitaw Normal Female Home and lifestyle 73.56 10 36.7800 772.3800 11:38 Ewallet 735.60 4.761905 36.7800 8.0
2019-01-10 665-32-9167 A Yangon Member Female Health and beauty 36.26 2 3.6260 76.1460 17:15 Credit card 72.52 4.761905 3.6260 7.2
2019-02-20 692-92-5582 B Mandalay Member Female Food and beverages 54.84 3 8.2260 172.7460 13:27 Credit card 164.52 4.761905 8.2260 5.9
In [23]:
df.dtypes
Out[23]:
Invoice ID                  object
Branch                      object
City                        object
Customer type               object
Gender                      object
Product line                object
Unit price                 float64
Quantity                     int64
Tax 5%                     float64
Total                      float64
Time                        object
Payment                     object
cogs                       float64
gross margin percentage    float64
gross income               float64
Rating                     float64
dtype: object
In [24]:
# Lets take a first glance at the distributions
sns.distplot(df['Rating'])
plt.axvline(x=np.mean(df['Rating']),c='red',ls='--', label='mean')
plt.axvline(x=np.percentile(df['Rating'],25),c='green',ls='--',label='25-75 th percentile')
plt.axvline(x=np.percentile(df['Rating'],75),c='green',ls='--')
plt.legend(loc='upper right')
Out[24]:
<matplotlib.legend.Legend at 0x1a36ebd9990>
In [25]:
df.hist(figsize=(10,10))
Out[25]:
array([[<AxesSubplot: title={'center': 'Unit price'}>,
        <AxesSubplot: title={'center': 'Quantity'}>,
        <AxesSubplot: title={'center': 'Tax 5%'}>],
       [<AxesSubplot: title={'center': 'Total'}>,
        <AxesSubplot: title={'center': 'cogs'}>,
        <AxesSubplot: title={'center': 'gross margin percentage'}>],
       [<AxesSubplot: title={'center': 'gross income'}>,
        <AxesSubplot: title={'center': 'Rating'}>, <AxesSubplot: >]],
      dtype=object)
In [26]:
# Let's take a look a the sale count diference between branches
branch_counts=df['Branch'].value_counts()
x=branch_counts.index
y=branch_counts.values
plt.bar(x,y)
for i in range(len(x)):
    plt.text(x = x[i], y = y[i], s = y[i], ha = 'center')
plt.show()
In [27]:
# Let's take a look a the sale count diference between payment types
ax=sns.countplot(x='Payment',data=df)
for p in ax.patches:
    ax.annotate('{}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()+3), ha='center')

plt.show()
In [28]:
# Relationship between gross income and customer ratings

sns.regplot(x='Rating',y='gross income',data=df)
Out[28]:
<AxesSubplot: xlabel='Rating', ylabel='gross income'>
In [29]:
# Relationship between Branch and gross income

sns.boxplot(x='Branch',y='gross income',data=df)
Out[29]:
<AxesSubplot: xlabel='Branch', ylabel='gross income'>
In [30]:
# Relationship between gender and gross income

sns.boxplot(x='Gender',y='gross income',data=df)
Out[30]:
<AxesSubplot: xlabel='Gender', ylabel='gross income'>
In [34]:
# Lets plot all the bivariate combinations posible
sns.pairplot(df)
Out[34]:
<seaborn.axisgrid.PairGrid at 0x1a374098350>
In [ ]:
# Lets look for duplicated rows
df.duplicated().sum()
df[df.duplicated()==True]
Out[ ]:
Invoice ID Branch City Customer type Gender Product line Unit price Quantity Tax 5% Total Time Payment cogs gross margin percentage gross income Rating
Date
2019-03-03 631-41-3108 A Yangon Normal Male Home and lifestyle 46.33 7 16.2155 340.5255 13:23 Credit card 324.31 4.761905 16.2155 7.4
2019-01-27 123-19-1176 A Yangon Member Male Health and beauty 58.22 8 23.2880 489.0480 20:33 Ewallet 465.76 4.761905 23.2880 8.4
2019-02-08 373-73-7910 A Yangon Normal Male Sports and travel 86.31 7 30.2085 634.3785 10:37 Ewallet 604.17 4.761905 30.2085 5.3
2019-03-25 699-14-3026 C Naypyitaw Normal Male Electronic accessories 85.39 7 29.8865 627.6165 18:30 Ewallet 597.73 4.761905 29.8865 4.1
2019-02-25 355-53-5943 A Yangon Member Female Electronic accessories 68.84 6 20.6520 433.6920 14:36 Ewallet 413.04 4.761905 20.6520 5.8
2019-02-24 315-22-5665 C Naypyitaw Normal Female Home and lifestyle 73.56 10 36.7800 772.3800 11:38 Ewallet 735.60 4.761905 36.7800 8.0
In [ ]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()
Out[ ]:
0
In [ ]:
# Look for NULL VALUES
df.isna().sum()
Out[ ]:
Invoice ID                  0
Branch                      0
City                        2
Customer type               0
Gender                      6
Product line                0
Unit price                  3
Quantity                    0
Tax 5%                      0
Total                       1
Time                        1
Payment                     0
cogs                        1
gross margin percentage     0
gross income                1
Rating                     16
dtype: int64
In [ ]:
# Missing values Map
sns.heatmap(df.isnull(),cbar=False)
Out[ ]:
<AxesSubplot: >
In [ ]:
# Lets replace those missing values with the mode for each value
df.fillna(df.mode().iloc[0], inplace=True)
sns.heatmap(df.isnull(),cbar=False)
Out[ ]:
<AxesSubplot: >
In [32]:
# Lets take a look at correlations
df=pd.read_csv('supermarket_sales - Sheet1.csv')
np.round(df.corr(),2)
Out[32]:
Unit price Quantity Tax 5% Total cogs gross margin percentage gross income Rating
Unit price 1.00 0.01 0.63 0.63 0.63 NaN 0.63 -0.00
Quantity 0.01 1.00 0.71 0.71 0.71 NaN 0.71 -0.02
Tax 5% 0.63 0.71 1.00 1.00 1.00 NaN 1.00 -0.04
Total 0.63 0.71 1.00 1.00 1.00 NaN 1.00 -0.04
cogs 0.63 0.71 1.00 1.00 1.00 NaN 1.00 -0.04
gross margin percentage NaN NaN NaN NaN NaN NaN NaN NaN
gross income 0.63 0.71 1.00 1.00 1.00 NaN 1.00 -0.04
Rating -0.00 -0.02 -0.04 -0.04 -0.04 NaN -0.04 1.00
In [33]:
sns.heatmap(np.round(df.corr(),2),annot=True)
Out[33]:
<AxesSubplot: >
In [ ]:
## SOME CONCLUSIONS

# 1) There is a strong positive correlation between the "Quantity," "Tax 5%," "Total," 
# "cogs," and "gross income" variables, which means that as the quantity of products 
# sold increases, the tax, total sales, cost of goods sold, and gross income also increase.

# 2) All of the above-mentioned variables have a perfect correlation with each other, indicating 
# that they are highly dependent on each other.

# 3) There is a moderate positive correlation between the "Unit price" and "Tax 5%," "Total," 
# and "cogs" variables, suggesting that as the unit price of the products sold increases, 
# so does the tax, total sales, and cost of goods sold.

# 4) There is a weak negative correlation between the "Rating" variable and all other variables, 
# indicating that there is no significant relationship between the rating and other sales-related variables in the supermarket company.

# 5) The "gross margin percentage" variable has no correlation with any other variable, indicating 
# that it is independent of other sales-related variables in the supermarket company.