# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) #Adjust the configuration of the plots we will create
# Load the movies dataset from its CSV file into a DataFrame
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Usuario\Desktop\Gonza\PortfolioProyects\Movies - Python\movies.csv'
)
# Preview the first five rows
df.head(n=5)
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 |
1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 |
2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 |
3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 |
4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 |
# We need to see if we have any missing data.
# For every column, print the share of null entries as a whole-number percentage.
for col in df.columns:
    # The mean of a boolean mask is the fraction of True (i.e. missing) entries
    pct_missing = df[col].isnull().mean()
    print(f'{col} - {round(pct_missing * 100)}%')
name - 0% rating - 1% genre - 0% year - 0% released - 0% score - 0% votes - 0% director - 0% writer - 0% star - 0% country - 0% budget - 28% gross - 2% company - 0% runtime - 0%
# Show the data type pandas assigned to each column of df
df.dtypes
name object rating object genre object year int64 released object score float64 votes float64 director object writer object star object country object budget float64 gross float64 company object runtime float64 dtype: object
# Changing data types of columns
# Cast the money columns to pandas' nullable integer dtype ('Int64'), which can hold
# whole numbers alongside missing values.
df['budget'] = df['budget'].astype('Int64')
df['gross'] = df['gross'].astype('Int64')
# 'released' looks like "June 13, 1980 (United States)", so taking its first four
# characters yields the start of the month name ("June", "Dece"), not a year.
# Extract the first run of four digits instead; rows without one are left missing.
df['yearcorrect'] = df['released'].astype(str).str.extract(r'(\d{4})', expand=False)
# Preview the five highest-grossing films (inplace=False: df itself keeps its row order)
df.sort_values(by=['gross'], inplace=False, ascending=False).head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5445 | Avatar | PG-13 | Action | 2009 | December 18, 2009 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Sam Worthington | United States | 237000000 | 2847246203 | Twentieth Century Fox | 162.0 | Dece |
7445 | Avengers: Endgame | PG-13 | Action | 2019 | April 26, 2019 (United States) | 8.4 | 903000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 356000000 | 2797501328 | Marvel Studios | 181.0 | Apri |
3045 | Titanic | PG-13 | Drama | 1997 | December 19, 1997 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Leonardo DiCaprio | United States | 200000000 | 2201647264 | Twentieth Century Fox | 194.0 | Dece |
6663 | Star Wars: Episode VII - The Force Awakens | PG-13 | Action | 2015 | December 18, 2015 (United States) | 7.8 | 876000.0 | J.J. Abrams | Lawrence Kasdan | Daisy Ridley | United States | 245000000 | 2069521700 | Lucasfilm | 138.0 | Dece |
7244 | Avengers: Infinity War | PG-13 | Action | 2018 | April 27, 2018 (United States) | 8.4 | 897000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 321000000 | 2048359754 | Marvel Studios | 149.0 | Apri |
# Remove the cap on how many rows pandas prints
pd.options.display.max_rows = None
# List each distinct company name once, sorted in descending order, and show the first five.
# NOTE: the result is not assigned back, so no rows are removed from df here.
df['company'].drop_duplicates().sort_values(ascending=False).head(n=5)
7129 thefyzz 5664 micro_scope 6412 iDeal Partners Film Fund 4007 i5 Films 6793 i am OTHER Name: company, dtype: object
# Scatter plot of budget (x axis) against gross earnings (y axis)
# Missing values are replaced with 0 so every film still gets a point.
x = df['budget'].fillna(0)
y = df['gross'].fillna(0)
plt.scatter(x, y)
plt.title('Budget vs Gross Earnings')
# The labels must follow the data passed to scatter(): x is budget, y is gross
plt.xlabel('Budget for film')
plt.ylabel('Gross Earnings')
plt.show()
# Plot budget vs gross using seaborn
# The nullable 'Int64' budget/gross columns reach NumPy as object arrays, and the
# regression fit then fails with
# "UFuncTypeError: Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64')".
# Hand seaborn plain float64 copies of the two columns instead; missing values become NaN.
sns.regplot(
    x="gross",
    y="budget",
    data=df[['gross', 'budget']].astype('float64'),
    scatter_kws={"color": "red"},
    line_kws={"color": "blue"},
)
--------------------------------------------------------------------------- UFuncTypeError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_16872\2629294837.py in <module> 1 # Plot budget vs gross using seabron 2 ----> 3 sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"}) C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs) 44 ) 45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 46 return f(**kwargs) 47 return inner_f 48 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax) 861 scatter_kws["marker"] = marker 862 line_kws = {} if line_kws is None else copy.copy(line_kws) --> 863 plotter.plot(ax, scatter_kws, line_kws) 864 return ax 865 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in plot(self, ax, scatter_kws, line_kws) 368 369 if self.fit_reg: --> 370 self.lineplot(ax, line_kws) 371 372 # Label the axes C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in lineplot(self, ax, kws) 411 """Draw the model.""" 412 # Fit the regression model --> 413 grid, yhat, err_bands = self.fit_regression(ax) 414 edges = grid[0], grid[-1] 415 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_regression(self, ax, x_range, grid) 219 yhat, yhat_boots = self.fit_logx(grid) 220 else: --> 221 yhat, yhat_boots = self.fit_fast(grid) 222 223 # Compute the confidence interval at each grid point C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_fast(self, grid) 236 X, y = np.c_[np.ones(len(self.x)), self.x], self.y 237 grid = np.c_[np.ones(len(grid)), grid] --> 238 yhat = grid.dot(reg_func(X, y)) 239 if self.ci is None: 240 return yhat, None 
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in reg_func(_x, _y) 232 """Low-level regression and prediction using linear algebra.""" 233 def reg_func(_x, _y): --> 234 return np.linalg.pinv(_x).dot(_y) 235 236 X, y = np.c_[np.ones(len(self.x)), self.x], self.y <__array_function__ internals> in pinv(*args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in pinv(a, rcond, hermitian) 2000 return wrap(res) 2001 a = a.conjugate() -> 2002 u, s, vt = svd(a, full_matrices=False, hermitian=hermitian) 2003 2004 # discard small singular values <__array_function__ internals> in svd(*args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv, hermitian) 1658 1659 signature = 'D->DdD' if isComplexType(t) else 'd->ddd' -> 1660 u, s, vh = gufunc(a, signature=signature, extobj=extobj) 1661 u = u.astype(result_t, copy=False) 1662 s = s.astype(_realType(result_t), copy=False) UFuncTypeError: Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'
# All the possible Pearson correlations between numerical fields
# Alternatives to Pearson are the 'kendall' and 'spearman' methods
# (df.corr() uses Pearson by default)
# Select the numeric columns explicitly: df still holds text columns at this point, and
# pandas 2.0+ raises on them instead of silently skipping them.
correlation_matrix = df.select_dtypes(include='number').corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Let's give a number to every object (text) value in order to make more correlations.
# Work on a copy: a plain `df_numerized = df` only creates a second name for the same
# DataFrame, so the loop below would overwrite the original text columns of df.
df_numerized = df.copy()
for col_name in df_numerized.select_dtypes(include='object').columns:
    # Replace each distinct value with its integer category code
    df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes
df_numerized.head()
name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6587 | 6 | 6 | 1980 | 1705 | 8.4 | 927000.0 | 2589 | 4014 | 1047 | 54 | 19000000 | 46998772 | 2319 | 146.0 | 14 |
1 | 5573 | 6 | 1 | 1980 | 1492 | 5.8 | 65000.0 | 2269 | 1632 | 327 | 55 | 4500000 | 58853106 | 731 | 104.0 | 13 |
2 | 5142 | 4 | 0 | 1980 | 1771 | 8.7 | 1200000.0 | 1111 | 2567 | 1745 | 55 | 18000000 | 538375067 | 1540 | 124.0 | 14 |
3 | 286 | 4 | 4 | 1980 | 1492 | 7.7 | 221000.0 | 1301 | 2000 | 2246 | 55 | 3500000 | 83453539 | 1812 | 88.0 | 13 |
4 | 1027 | 6 | 4 | 1980 | 1543 | 7.3 | 108000.0 | 1054 | 521 | 410 | 55 | 6000000 | 39846344 | 1777 | 98.0 | 13 |
# Now I have the correlations for all the elements in the table
# (Pearson is the default method of DataFrame.corr)
correlation_matrix = df_numerized.corr()
sns.heatmap(data=correlation_matrix, annot=True)
# Title and axis labels are applied to the current axes in one call
plt.gca().set(
    title='Correlation Matrix',
    xlabel='Movie Features',
    ylabel='Movie Features',
)
plt.show()
# Flatten the correlation matrix into a Series indexed by (column, column) pairs,
# which is easier to sort and filter than the square table
correlation_matrix = df_numerized.corr(method='pearson')
correlation_pairs = correlation_matrix.unstack(level=-1)
correlation_pairs.head(n=5)
name name 1.000000 rating -0.008069 genre 0.016355 year 0.011453 released -0.011311 dtype: float64
# Order the pairs from the most negative correlation to the most positive
sorted_pairs = correlation_pairs.sort_values(ascending=True)
sorted_pairs.head(n=5)
genre budget -0.356564 budget genre -0.356564 genre gross -0.235650 gross genre -0.235650 budget rating -0.176002 dtype: float64
# Keep only the pairs whose correlation is above 0.5.
# Each column paired with itself (correlation 1.0) is included in the result.
high_corr = sorted_pairs.loc[sorted_pairs.gt(0.5)]
high_corr
votes gross 0.630757 gross votes 0.630757 budget 0.740395 budget gross 0.740395 yearcorrect released 0.993694 released yearcorrect 0.993694 name name 1.000000 company company 1.000000 gross gross 1.000000 budget budget 1.000000 country country 1.000000 star star 1.000000 writer writer 1.000000 director director 1.000000 votes votes 1.000000 score score 1.000000 released released 1.000000 year year 1.000000 genre genre 1.000000 rating rating 1.000000 runtime runtime 1.000000 yearcorrect yearcorrect 1.000000 dtype: float64
# Votes and budget have the highest correlation to gross earnings and company has a low correlation