# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) #Adjust the configuration of the plots we will create
# Read in the data
df=pd.read_csv(r'C:\Users\Usuario\Desktop\Gonza\PortfolioProyects\Movies - Python\movies.csv')
# Let's look at the data
df.head()
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 |
| 1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 |
| 2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 |
| 3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 |
| 4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 |
# We need to see if we have any missing data
# Let's loop through the data and see if there is anything missing
for col in df.columns:
pct_missing = np.mean(df[col].isnull())
print('{} - {}%'.format(col, round(pct_missing*100)))
name - 0% rating - 1% genre - 0% year - 0% released - 0% score - 0% votes - 0% director - 0% writer - 0% star - 0% country - 0% budget - 28% gross - 2% company - 0% runtime - 0%
#Data types for our columns
df.dtypes
name object rating object genre object year int64 released object score float64 votes float64 director object writer object star object country object budget float64 gross float64 company object runtime float64 dtype: object
#Changing data types of columns
df['budget'] = df['budget'].astype('Int64')
df['gross'] = df['gross'].astype('Int64')
df['yearcorrect']=df['released'].astype(str).str[:4]
df.sort_values(by=['gross'], inplace=False, ascending=False).head()
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5445 | Avatar | PG-13 | Action | 2009 | December 18, 2009 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Sam Worthington | United States | 237000000 | 2847246203 | Twentieth Century Fox | 162.0 | Dece |
| 7445 | Avengers: Endgame | PG-13 | Action | 2019 | April 26, 2019 (United States) | 8.4 | 903000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 356000000 | 2797501328 | Marvel Studios | 181.0 | Apri |
| 3045 | Titanic | PG-13 | Drama | 1997 | December 19, 1997 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Leonardo DiCaprio | United States | 200000000 | 2201647264 | Twentieth Century Fox | 194.0 | Dece |
| 6663 | Star Wars: Episode VII - The Force Awakens | PG-13 | Action | 2015 | December 18, 2015 (United States) | 7.8 | 876000.0 | J.J. Abrams | Lawrence Kasdan | Daisy Ridley | United States | 245000000 | 2069521700 | Lucasfilm | 138.0 | Dece |
| 7244 | Avengers: Infinity War | PG-13 | Action | 2018 | April 27, 2018 (United States) | 8.4 | 897000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 321000000 | 2048359754 | Marvel Studios | 149.0 | Apri |
pd.set_option('display.max_rows', None)
# Check for duplicates and drop them
df['company'].drop_duplicates().sort_values(ascending=False).head()
7129 thefyzz 5664 micro_scope 6412 iDeal Partners Film Fund 4007 i5 Films 6793 i am OTHER Name: company, dtype: object
# Scatter plot
x=df['budget']
y=df['gross']
x=x.fillna(0)
y=y.fillna(0)
plt.scatter(x, y)
plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for film')
plt.show()
# Plot budget vs gross using seabron
sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"})
--------------------------------------------------------------------------- UFuncTypeError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_16872\2629294837.py in <module> 1 # Plot budget vs gross using seabron 2 ----> 3 sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"}) C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs) 44 ) 45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 46 return f(**kwargs) 47 return inner_f 48 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax) 861 scatter_kws["marker"] = marker 862 line_kws = {} if line_kws is None else copy.copy(line_kws) --> 863 plotter.plot(ax, scatter_kws, line_kws) 864 return ax 865 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in plot(self, ax, scatter_kws, line_kws) 368 369 if self.fit_reg: --> 370 self.lineplot(ax, line_kws) 371 372 # Label the axes C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in lineplot(self, ax, kws) 411 """Draw the model.""" 412 # Fit the regression model --> 413 grid, yhat, err_bands = self.fit_regression(ax) 414 edges = grid[0], grid[-1] 415 C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_regression(self, ax, x_range, grid) 219 yhat, yhat_boots = self.fit_logx(grid) 220 else: --> 221 yhat, yhat_boots = self.fit_fast(grid) 222 223 # Compute the confidence interval at each grid point C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_fast(self, grid) 236 X, y = np.c_[np.ones(len(self.x)), self.x], self.y 237 grid = np.c_[np.ones(len(grid)), grid] --> 238 yhat = grid.dot(reg_func(X, y)) 239 if self.ci is None: 240 return yhat, None C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in reg_func(_x, _y) 232 """Low-level regression and prediction using linear algebra.""" 233 def reg_func(_x, _y): --> 234 return np.linalg.pinv(_x).dot(_y) 235 236 X, y = np.c_[np.ones(len(self.x)), self.x], self.y <__array_function__ internals> in pinv(*args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in pinv(a, rcond, hermitian) 2000 return wrap(res) 2001 a = a.conjugate() -> 2002 u, s, vt = svd(a, full_matrices=False, hermitian=hermitian) 2003 2004 # discard small singular values <__array_function__ internals> in svd(*args, **kwargs) C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv, hermitian) 1658 1659 signature = 'D->DdD' if isComplexType(t) else 'd->ddd' -> 1660 u, s, vh = gufunc(a, signature=signature, extobj=extobj) 1661 u = u.astype(result_t, copy=False) 1662 s = s.astype(_realType(result_t), copy=False) UFuncTypeError: Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'
# All the posible pearson correlations between numerical fields
# Another posibilitie to pearson is to use 'kendall' or 'spearman' method
# ( df.corr() by default is pearson )
correlation_matrix=df.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Let's give a number to every object in order to make more correlations
df_numerized=df
for col_name in df_numerized.columns:
if(df_numerized[col_name].dtype =='object'):
df_numerized[col_name]=df_numerized[col_name].astype('category')
df_numerized[col_name]=df_numerized[col_name].cat.codes
df_numerized.head()
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | yearcorrect | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6587 | 6 | 6 | 1980 | 1705 | 8.4 | 927000.0 | 2589 | 4014 | 1047 | 54 | 19000000 | 46998772 | 2319 | 146.0 | 14 |
| 1 | 5573 | 6 | 1 | 1980 | 1492 | 5.8 | 65000.0 | 2269 | 1632 | 327 | 55 | 4500000 | 58853106 | 731 | 104.0 | 13 |
| 2 | 5142 | 4 | 0 | 1980 | 1771 | 8.7 | 1200000.0 | 1111 | 2567 | 1745 | 55 | 18000000 | 538375067 | 1540 | 124.0 | 14 |
| 3 | 286 | 4 | 4 | 1980 | 1492 | 7.7 | 221000.0 | 1301 | 2000 | 2246 | 55 | 3500000 | 83453539 | 1812 | 88.0 | 13 |
| 4 | 1027 | 6 | 4 | 1980 | 1543 | 7.3 | 108000.0 | 1054 | 521 | 410 | 55 | 6000000 | 39846344 | 1777 | 98.0 | 13 |
#Now I have the correlatios for all the elements in the table
correlation_matrix=df_numerized.corr(method='pearson')
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()
# Unstack them for a better view
correlation_matrix=df_numerized.corr()
correlation_pairs=correlation_matrix.unstack()
correlation_pairs.head()
name name 1.000000
rating -0.008069
genre 0.016355
year 0.011453
released -0.011311
dtype: float64
sorted_pairs=correlation_pairs.sort_values()
sorted_pairs.head()
genre budget -0.356564 budget genre -0.356564 genre gross -0.235650 gross genre -0.235650 budget rating -0.176002 dtype: float64
high_corr=sorted_pairs[(sorted_pairs)>0.5]
high_corr
votes gross 0.630757
gross votes 0.630757
budget 0.740395
budget gross 0.740395
yearcorrect released 0.993694
released yearcorrect 0.993694
name name 1.000000
company company 1.000000
gross gross 1.000000
budget budget 1.000000
country country 1.000000
star star 1.000000
writer writer 1.000000
director director 1.000000
votes votes 1.000000
score score 1.000000
released released 1.000000
year year 1.000000
genre genre 1.000000
rating rating 1.000000
runtime runtime 1.000000
yearcorrect yearcorrect 1.000000
dtype: float64
# Votes and budget have the highest correlation to gross earnings and company has a low correlation