import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calmap
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
df=pd.read_csv('supermarket_sales - Sheet1.csv')
df.columns
Index(['Invoice ID', 'Branch', 'City', 'Customer type', 'Gender', 'Product line', 'Unit price', 'Quantity', 'Tax 5%', 'Total', 'Date', 'Time', 'Payment', 'cogs', 'gross margin percentage', 'gross income', 'Rating'], dtype='object')
df.dtypes
Invoice ID object Branch object City object Customer type object Gender object Product line object Unit price float64 Quantity int64 Tax 5% float64 Total float64 Date object Time object Payment object cogs float64 gross margin percentage float64 gross income float64 Rating float64 dtype: object
df['Date']=pd.to_datetime(df['Date'])
df.set_index('Date',inplace=True)
df.head(10)
Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||||||
2019-01-05 | 750-67-8428 | A | Yangon | Member | Female | Health and beauty | 74.69 | 7 | 26.1415 | 548.9715 | 13:08 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 |
2019-03-08 | 226-31-3081 | C | Naypyitaw | Normal | Female | Electronic accessories | 15.28 | 5 | 3.8200 | 80.2200 | 10:29 | Cash | 76.40 | 4.761905 | 3.8200 | 9.6 |
2019-03-03 | 631-41-3108 | A | Yangon | Normal | Male | Home and lifestyle | 46.33 | 7 | 16.2155 | 340.5255 | 13:23 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 |
2019-01-27 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 20:33 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 |
2019-02-08 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 10:37 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 |
2019-03-25 | 699-14-3026 | C | Naypyitaw | Normal | Male | Electronic accessories | 85.39 | 7 | 29.8865 | 627.6165 | 18:30 | Ewallet | 597.73 | 4.761905 | 29.8865 | 4.1 |
2019-02-25 | 355-53-5943 | A | Yangon | Member | Female | Electronic accessories | 68.84 | 6 | 20.6520 | 433.6920 | 14:36 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 |
2019-02-24 | 315-22-5665 | C | Naypyitaw | Normal | Female | Home and lifestyle | 73.56 | 10 | 36.7800 | 772.3800 | 11:38 | Ewallet | 735.60 | 4.761905 | 36.7800 | 8.0 |
2019-01-10 | 665-32-9167 | A | Yangon | Member | Female | Health and beauty | 36.26 | 2 | 3.6260 | 76.1460 | 17:15 | Credit card | 72.52 | 4.761905 | 3.6260 | 7.2 |
2019-02-20 | 692-92-5582 | B | Mandalay | Member | Female | Food and beverages | 54.84 | 3 | 8.2260 | 172.7460 | 13:27 | Credit card | 164.52 | 4.761905 | 8.2260 | 5.9 |
df.dtypes
Invoice ID object Branch object City object Customer type object Gender object Product line object Unit price float64 Quantity int64 Tax 5% float64 Total float64 Time object Payment object cogs float64 gross margin percentage float64 gross income float64 Rating float64 dtype: object
# Lets take a first glance at the distributions
sns.distplot(df['Rating'])
plt.axvline(x=np.mean(df['Rating']),c='red',ls='--', label='mean')
plt.axvline(x=np.percentile(df['Rating'],25),c='green',ls='--',label='25-75 th percentile')
plt.axvline(x=np.percentile(df['Rating'],75),c='green',ls='--')
plt.legend(loc='upper right')
<matplotlib.legend.Legend at 0x1a36ebd9990>
df.hist(figsize=(10,10))
array([[<AxesSubplot: title={'center': 'Unit price'}>, <AxesSubplot: title={'center': 'Quantity'}>, <AxesSubplot: title={'center': 'Tax 5%'}>], [<AxesSubplot: title={'center': 'Total'}>, <AxesSubplot: title={'center': 'cogs'}>, <AxesSubplot: title={'center': 'gross margin percentage'}>], [<AxesSubplot: title={'center': 'gross income'}>, <AxesSubplot: title={'center': 'Rating'}>, <AxesSubplot: >]], dtype=object)
# Let's take a look a the sale count diference between branches
branch_counts=df['Branch'].value_counts()
x=branch_counts.index
y=branch_counts.values
plt.bar(x,y)
for i in range(len(x)):
plt.text(x = x[i], y = y[i], s = y[i], ha = 'center')
plt.show()
# Let's take a look a the sale count diference between payment types
ax=sns.countplot(x='Payment',data=df)
for p in ax.patches:
ax.annotate('{}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()+3), ha='center')
plt.show()
# Relationship between gross income and customer ratings
sns.regplot(x='Rating',y='gross income',data=df)
<AxesSubplot: xlabel='Rating', ylabel='gross income'>
# Relationship between Branch and gross income
sns.boxplot(x='Branch',y='gross income',data=df)
<AxesSubplot: xlabel='Branch', ylabel='gross income'>
# Relationship between gender and gross income
sns.boxplot(x='Gender',y='gross income',data=df)
<AxesSubplot: xlabel='Gender', ylabel='gross income'>
# Lets plot all the bivariate combinations posible
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1a374098350>
# Lets look for duplicated rows
df.duplicated().sum()
df[df.duplicated()==True]
Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Time | Payment | cogs | gross margin percentage | gross income | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | ||||||||||||||||
2019-03-03 | 631-41-3108 | A | Yangon | Normal | Male | Home and lifestyle | 46.33 | 7 | 16.2155 | 340.5255 | 13:23 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 |
2019-01-27 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 20:33 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 |
2019-02-08 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 10:37 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 |
2019-03-25 | 699-14-3026 | C | Naypyitaw | Normal | Male | Electronic accessories | 85.39 | 7 | 29.8865 | 627.6165 | 18:30 | Ewallet | 597.73 | 4.761905 | 29.8865 | 4.1 |
2019-02-25 | 355-53-5943 | A | Yangon | Member | Female | Electronic accessories | 68.84 | 6 | 20.6520 | 433.6920 | 14:36 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 |
2019-02-24 | 315-22-5665 | C | Naypyitaw | Normal | Female | Home and lifestyle | 73.56 | 10 | 36.7800 | 772.3800 | 11:38 | Ewallet | 735.60 | 4.761905 | 36.7800 | 8.0 |
df.drop_duplicates(inplace=True)
df.duplicated().sum()
0
# Look for NULL VALUES
df.isna().sum()
Invoice ID 0 Branch 0 City 2 Customer type 0 Gender 6 Product line 0 Unit price 3 Quantity 0 Tax 5% 0 Total 1 Time 1 Payment 0 cogs 1 gross margin percentage 0 gross income 1 Rating 16 dtype: int64
# Missing values Map
sns.heatmap(df.isnull(),cbar=False)
<AxesSubplot: >
# Lets replace those missing values with the mode for each value
df.fillna(df.mode().iloc[0], inplace=True)
sns.heatmap(df.isnull(),cbar=False)
<AxesSubplot: >
# Lets take a look at correlations
df=pd.read_csv('supermarket_sales - Sheet1.csv')
np.round(df.corr(),2)
Unit price | Quantity | Tax 5% | Total | cogs | gross margin percentage | gross income | Rating | |
---|---|---|---|---|---|---|---|---|
Unit price | 1.00 | 0.01 | 0.63 | 0.63 | 0.63 | NaN | 0.63 | -0.00 |
Quantity | 0.01 | 1.00 | 0.71 | 0.71 | 0.71 | NaN | 0.71 | -0.02 |
Tax 5% | 0.63 | 0.71 | 1.00 | 1.00 | 1.00 | NaN | 1.00 | -0.04 |
Total | 0.63 | 0.71 | 1.00 | 1.00 | 1.00 | NaN | 1.00 | -0.04 |
cogs | 0.63 | 0.71 | 1.00 | 1.00 | 1.00 | NaN | 1.00 | -0.04 |
gross margin percentage | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
gross income | 0.63 | 0.71 | 1.00 | 1.00 | 1.00 | NaN | 1.00 | -0.04 |
Rating | -0.00 | -0.02 | -0.04 | -0.04 | -0.04 | NaN | -0.04 | 1.00 |
sns.heatmap(np.round(df.corr(),2),annot=True)
<AxesSubplot: >
## SOME CONCLUSIONS
# 1) There is a strong positive correlation between the "Quantity," "Tax 5%," "Total,"
# "cogs," and "gross income" variables, which means that as the quantity of products
# sold increases, the tax, total sales, cost of goods sold, and gross income also increase.
# 2) All of the above-mentioned variables have a perfect correlation with each other, indicating
# that they are highly dependent on each other.
# 3) There is a moderate positive correlation between the "Unit price" and "Tax 5%," "Total,"
# and "cogs" variables, suggesting that as the unit price of the products sold increases,
# so does the tax, total sales, and cost of goods sold.
# 4) There is a weak negative correlation between the "Rating" variable and all other variables,
# indicating that there is no significant relationship between the rating and other sales-related variables in the supermarket company.
# 5) The "gross margin percentage" variable has no correlation with any other variable, indicating
# that it is independent of other sales-related variables in the supermarket company.