# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) #Adjust the configuration of the plots we will create

# Read in the data

df=pd.read_csv(r'C:\Users\Usuario\Desktop\Gonza\PortfolioProyects\Movies - Python\movies.csv')


# Let's look at the data

df.head()


# We need to see if we have any missing data
# Let's loop through the data and see if there is anything missing

for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

name - 0%
rating - 1%
genre - 0%
year - 0%
released - 0%
score - 0%
votes - 0%
director - 0%
writer - 0%
star - 0%
country - 0%
budget - 28%
gross - 2%
company - 0%
runtime - 0%


#Data types for our columns

df.dtypes

name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object


#Changing data types of columns

df['budget'] = df['budget'].astype('Int64')
df['gross'] = df['gross'].astype('Int64')

df['yearcorrect']=df['released'].astype(str).str[:4]


df.sort_values(by=['gross'], inplace=False, ascending=False).head()


pd.set_option('display.max_rows', None)


# Check for duplicates and drop them

df['company'].drop_duplicates().sort_values(ascending=False).head()

7129                     thefyzz
5664                 micro_scope
6412    iDeal Partners Film Fund
4007                    i5 Films
6793                  i am OTHER
Name: company, dtype: object


# Scatter plot

x=df['budget']
y=df['gross']
x=x.fillna(0)
y=y.fillna(0)

plt.scatter(x, y)

plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for film')

plt.show()


# Plot budget vs gross using seabron

sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"})

---------------------------------------------------------------------------
UFuncTypeError                            Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16872\2629294837.py in <module>
      1 # Plot budget vs gross using seabron
      2 
----> 3 sns.regplot(x="gross", y="budget", data=df, scatter_kws={"color":"red"}, line_kws={"color":"blue"})

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs)
     44             )
     45         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46         return f(**kwargs)
     47     return inner_f
     48 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in regplot(x, y, data, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, seed, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, dropna, x_jitter, y_jitter, label, color, marker, scatter_kws, line_kws, ax)
    861     scatter_kws["marker"] = marker
    862     line_kws = {} if line_kws is None else copy.copy(line_kws)
--> 863     plotter.plot(ax, scatter_kws, line_kws)
    864     return ax
    865 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in plot(self, ax, scatter_kws, line_kws)
    368 
    369         if self.fit_reg:
--> 370             self.lineplot(ax, line_kws)
    371 
    372         # Label the axes

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in lineplot(self, ax, kws)
    411         """Draw the model."""
    412         # Fit the regression model
--> 413         grid, yhat, err_bands = self.fit_regression(ax)
    414         edges = grid[0], grid[-1]
    415 

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_regression(self, ax, x_range, grid)
    219             yhat, yhat_boots = self.fit_logx(grid)
    220         else:
--> 221             yhat, yhat_boots = self.fit_fast(grid)
    222 
    223         # Compute the confidence interval at each grid point

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in fit_fast(self, grid)
    236         X, y = np.c_[np.ones(len(self.x)), self.x], self.y
    237         grid = np.c_[np.ones(len(grid)), grid]
--> 238         yhat = grid.dot(reg_func(X, y))
    239         if self.ci is None:
    240             return yhat, None

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\regression.py in reg_func(_x, _y)
    232         """Low-level regression and prediction using linear algebra."""
    233         def reg_func(_x, _y):
--> 234             return np.linalg.pinv(_x).dot(_y)
    235 
    236         X, y = np.c_[np.ones(len(self.x)), self.x], self.y

<__array_function__ internals> in pinv(*args, **kwargs)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in pinv(a, rcond, hermitian)
   2000         return wrap(res)
   2001     a = a.conjugate()
-> 2002     u, s, vt = svd(a, full_matrices=False, hermitian=hermitian)
   2003 
   2004     # discard small singular values

<__array_function__ internals> in svd(*args, **kwargs)

C:\ProgramData\Anaconda3\lib\site-packages\numpy\linalg\linalg.py in svd(a, full_matrices, compute_uv, hermitian)
   1658 
   1659         signature = 'D->DdD' if isComplexType(t) else 'd->ddd'
-> 1660         u, s, vh = gufunc(a, signature=signature, extobj=extobj)
   1661         u = u.astype(result_t, copy=False)
   1662         s = s.astype(_realType(result_t), copy=False)

UFuncTypeError: Cannot cast ufunc 'svd_n_s' input from dtype('O') to dtype('float64') with casting rule 'same_kind'


# All the posible pearson correlations between numerical fields
# Another posibilitie to pearson is to use 'kendall' or 'spearman' method
# ( df.corr() by default is pearson )

correlation_matrix=df.corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)

plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()


# Let's give a number to every object in order to make more correlations

df_numerized=df

for col_name in df_numerized.columns:
    if(df_numerized[col_name].dtype =='object'):
        df_numerized[col_name]=df_numerized[col_name].astype('category')
        df_numerized[col_name]=df_numerized[col_name].cat.codes
df_numerized.head()


#Now I have the correlatios for all the elements in the table

correlation_matrix=df_numerized.corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)

plt.title('Correlation Matrix')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()


# Unstack them for a better view

correlation_matrix=df_numerized.corr()

correlation_pairs=correlation_matrix.unstack()

correlation_pairs.head()

name  name        1.000000
      rating     -0.008069
      genre       0.016355
      year        0.011453
      released   -0.011311
dtype: float64


sorted_pairs=correlation_pairs.sort_values()

sorted_pairs.head()

genre   budget   -0.356564
budget  genre    -0.356564
genre   gross    -0.235650
gross   genre    -0.235650
budget  rating   -0.176002
dtype: float64


high_corr=sorted_pairs[(sorted_pairs)>0.5]

high_corr

votes        gross          0.630757
gross        votes          0.630757
             budget         0.740395
budget       gross          0.740395
yearcorrect  released       0.993694
released     yearcorrect    0.993694
name         name           1.000000
company      company        1.000000
gross        gross          1.000000
budget       budget         1.000000
country      country        1.000000
star         star           1.000000
writer       writer         1.000000
director     director       1.000000
votes        votes          1.000000
score        score          1.000000
released     released       1.000000
year         year           1.000000
genre        genre          1.000000
rating       rating         1.000000
runtime      runtime        1.000000
yearcorrect  yearcorrect    1.000000
dtype: float64


# Votes and budget have the highest correlation to gross earnings and company has a low correlation

	name	rating	genre	year	released	score	votes	director	writer	star	country	budget	gross	company	runtime
0	The Shining	R	Drama	1980	June 13, 1980 (United States)	8.4	927000.0	Stanley Kubrick	Stephen King	Jack Nicholson	United Kingdom	19000000.0	46998772.0	Warner Bros.	146.0
1	The Blue Lagoon	R	Adventure	1980	July 2, 1980 (United States)	5.8	65000.0	Randal Kleiser	Henry De Vere Stacpoole	Brooke Shields	United States	4500000.0	58853106.0	Columbia Pictures	104.0
2	Star Wars: Episode V - The Empire Strikes Back	PG	Action	1980	June 20, 1980 (United States)	8.7	1200000.0	Irvin Kershner	Leigh Brackett	Mark Hamill	United States	18000000.0	538375067.0	Lucasfilm	124.0
3	Airplane!	PG	Comedy	1980	July 2, 1980 (United States)	7.7	221000.0	Jim Abrahams	Jim Abrahams	Robert Hays	United States	3500000.0	83453539.0	Paramount Pictures	88.0
4	Caddyshack	R	Comedy	1980	July 25, 1980 (United States)	7.3	108000.0	Harold Ramis	Brian Doyle-Murray	Chevy Chase	United States	6000000.0	39846344.0	Orion Pictures	98.0

	name	rating	genre	year	released	score	votes	director	writer	star	country	budget	gross	company	runtime	yearcorrect
5445	Avatar	PG-13	Action	2009	December 18, 2009 (United States)	7.8	1100000.0	James Cameron	James Cameron	Sam Worthington	United States	237000000	2847246203	Twentieth Century Fox	162.0	Dece
7445	Avengers: Endgame	PG-13	Action	2019	April 26, 2019 (United States)	8.4	903000.0	Anthony Russo	Christopher Markus	Robert Downey Jr.	United States	356000000	2797501328	Marvel Studios	181.0	Apri
3045	Titanic	PG-13	Drama	1997	December 19, 1997 (United States)	7.8	1100000.0	James Cameron	James Cameron	Leonardo DiCaprio	United States	200000000	2201647264	Twentieth Century Fox	194.0	Dece
6663	Star Wars: Episode VII - The Force Awakens	PG-13	Action	2015	December 18, 2015 (United States)	7.8	876000.0	J.J. Abrams	Lawrence Kasdan	Daisy Ridley	United States	245000000	2069521700	Lucasfilm	138.0	Dece
7244	Avengers: Infinity War	PG-13	Action	2018	April 27, 2018 (United States)	8.4	897000.0	Anthony Russo	Christopher Markus	Robert Downey Jr.	United States	321000000	2048359754	Marvel Studios	149.0	Apri

	name	rating	genre	year	released	score	votes	director	writer	star	country	budget	gross	company	runtime	yearcorrect
0	6587	6	6	1980	1705	8.4	927000.0	2589	4014	1047	54	19000000	46998772	2319	146.0	14
1	5573	6	1	1980	1492	5.8	65000.0	2269	1632	327	55	4500000	58853106	731	104.0	13
2	5142	4	0	1980	1771	8.7	1200000.0	1111	2567	1745	55	18000000	538375067	1540	124.0	14
3	286	4	4	1980	1492	7.7	221000.0	1301	2000	2246	55	3500000	83453539	1812	88.0	13
4	1027	6	4	1980	1543	7.3	108000.0	1054	521	410	55	6000000	39846344	1777	98.0	13