In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) #Adjust the configuration of the plots we will create

# Read in the data

df=pd.read_csv(r'C:\Users\Usuario\Desktop\Gonza\PortfolioProyects\Movies - Python\movies.csv')
In [2]:
# Let's look at the data

df.head()
Out[2]:
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
In [3]:
# We need to see if we have any missing data
# Let's loop through the data and see if there is anything missing

for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))
name - 0%
rating - 1%
genre - 0%
year - 0%
released - 0%
score - 0%
votes - 0%
director - 0%
writer - 0%
star - 0%
country - 0%
budget - 28%
gross - 2%
company - 0%
runtime - 0%
In [4]:
#Data types for our columns

df.dtypes
Out[4]:
name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object
In [5]:
#Changing data types of columns

df['budget'] = df['budget'].astype('Int64')
df['gross'] = df['gross'].astype('Int64')

df['yearcorrect']=df['released'].astype(str).str[:4]
In [6]:
df.sort_values(by=['gross'], inplace=False, ascending=False).head()
Out[6]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect
5445 Avatar PG-13 Action 2009 December 18, 2009 (United States) 7.8 1100000.0 James Cameron James Cameron Sam Worthington United States 237000000 2847246203 Twentieth Century Fox 162.0 Dece
7445 Avengers: Endgame PG-13 Action 2019 April 26, 2019 (United States) 8.4 903000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000 2797501328 Marvel Studios 181.0 Apri
3045 Titanic PG-13 Drama 1997 December 19, 1997 (United States) 7.8 1100000.0 James Cameron James Cameron Leonardo DiCaprio United States 200000000 2201647264 Twentieth Century Fox 194.0 Dece
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015 December 18, 2015 (United States) 7.8 876000.0 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000 2069521700 Lucasfilm 138.0 Dece
7244 Avengers: Infinity War PG-13 Action 2018 April 27, 2018 (United States) 8.4 897000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000 2048359754 Marvel Studios 149.0 Apri
In [7]:
pd.set_option('display.max_rows', None)
In [8]:
# Check for duplicates and drop them

df['company'].drop_duplicates().sort_values(ascending=False).head()
Out[8]:
7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object
In [9]:
# Scatter plot

x=df['budget']
y=df['gross']
x=x.fillna(0)
y=y.fillna(0)

plt.scatter(x, y)

plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for film')

plt.show()
In [15]:
# Plot budget vs gross using seabron

sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"})
---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\2629294837.py in <module>
      1 # Plot budget vs gross using seabron
      2 
----> 3 sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"})

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs)
     44             )
     45         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46         return f(**kwargs)
     47     return inner_f
     48 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    861     scatter_kws["marker"] = marker
    862     line_kws = {} if line_kws is None else copy.copy(line_kws)
--> 863     plotter.plot(ax, scatter_kws, line_kws)
    864     return ax
    865 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in plot(self, ax, scatter_kws, line_kws)
    368 
    369         if self.fit_reg:
--> 370             self.lineplot(ax, line_kws)
    371 
    372         # Label the axes

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in lineplot(self, ax, kws)
    411         """Draw the model."""
    412         # Fit the regression model
--> 413         grid, yhat, err_bands = self.fit_regression(ax)
    414         edges = grid[0], grid[-1]
    415 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_regression(self, ax, x_range, grid)
    219             yhat, yhat_boots = self.fit_logx(grid)
    220         else:
--> 221             yhat, yhat_boots = self.fit_fast(grid)
    222 
    223         # Compute the confidence interval at each grid point

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_fast(self, grid)
    236         X, y = np.c_[np.ones(len(self.x)), self.x], self.y
    237         grid = np.c_[np.ones(len(grid)), grid]
--> 238         yhat = grid.dot(reg_func(X, y))
    239         if self.ci is None:
    240             return yhat, None

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in reg_func(_x, _y)
    232         """Low-level regression and prediction using linear algebra."""
    233         def reg_func(_x, _y):
--> 234             return np.linalg.pinv(_x).dot(_y)
    235 
    236         X, y = np.c_[np.ones(len(self.x)), self.x], self.y

<__array_function__ internals> in pinv(*args, **kwargs)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in pinv(a, rcond, hermitian)
   2000         return wrap(res)
   2001     a = a.conjugate()
-> 2002     u, s, vt = svd(a, full_matrices=False, hermitian=hermitian)
   2003 
   2004     # discard small singular values

<__array_function__ internals> in svd(*args, **kwargs)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv, hermitian)
   1658 
   1659         signature = 'D->DdD' if isComplexType(t) else 'd->ddd'
-> 1660         u, s, vh = gufunc(a, signature=signature, extobj=extobj)
   1661         u = u.astype(result_t, copy=False)
   1662         s = s.astype(_realType(result_t), copy=False)

UFuncTypeError: Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'
In [16]:
# All the posible pearson correlations between numerical fields
# Another posibilitie to pearson is to use 'kendall' or 'spearman' method
# ( df.corr() by default is pearson )

correlation_matrix=df.corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)

plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
In [17]:
# Let's give a number to every object in order to make more correlations

df_numerized=df

for col_name in df_numerized.columns:
    if(df_numerized[col_name].dtype =='object'):
        df_numerized[col_name]=df_numerized[col_name].astype('category')
        df_numerized[col_name]=df_numerized[col_name].cat.codes
df_numerized.head()
Out[17]:
name rating genre year released score votes director writer star country budget gross company runtime yearcorrect
0 6587 6 6 1980 1705 8.4 927000.0 2589 4014 1047 54 19000000 46998772 2319 146.0 14
1 5573 6 1 1980 1492 5.8 65000.0 2269 1632 327 55 4500000 58853106 731 104.0 13
2 5142 4 0 1980 1771 8.7 1200000.0 1111 2567 1745 55 18000000 538375067 1540 124.0 14
3 286 4 4 1980 1492 7.7 221000.0 1301 2000 2246 55 3500000 83453539 1812 88.0 13
4 1027 6 4 1980 1543 7.3 108000.0 1054 521 410 55 6000000 39846344 1777 98.0 13
In [18]:
#Now I have the correlatios for all the elements in the table

correlation_matrix=df_numerized.corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)

plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
In [19]:
# Unstack them for a better view

correlation_matrix=df_numerized.corr()

correlation_pairs=correlation_matrix.unstack()

correlation_pairs.head()
Out[19]:
name  name        1.000000
      rating     -0.008069
      genre       0.016355
      year        0.011453
      released   -0.011311
dtype: float64
In [20]:
sorted_pairs=correlation_pairs.sort_values()

sorted_pairs.head()
Out[20]:
genre   budget   -0.356564
budget  genre    -0.356564
genre   gross    -0.235650
gross   genre    -0.235650
budget  rating   -0.176002
dtype: float64
In [21]:
high_corr=sorted_pairs[(sorted_pairs)>0.5]

high_corr
Out[21]:
votes        gross          0.630757
gross        votes          0.630757
             budget         0.740395
budget       gross          0.740395
yearcorrect  released       0.993694
released     yearcorrect    0.993694
name         name           1.000000
company      company        1.000000
gross        gross          1.000000
budget       budget         1.000000
country      country        1.000000
star         star           1.000000
writer       writer         1.000000
director     director       1.000000
votes        votes          1.000000
score        score          1.000000
released     released       1.000000
year         year           1.000000
genre        genre          1.000000
rating       rating         1.000000
runtime      runtime        1.000000
yearcorrect  yearcorrect    1.000000
dtype: float64
In [22]:
# Votes and budget have the highest correlation to gross earnings and company has a low correlation
In [ ]: