Load packages and set working directory
library(ggplot2)
library(dplyr)
library(statsr)
library(BAS)
library(gridExtra)
library(cowplot)
library(xtable)
library(knitr)
setwd("~/Downloads/R projects")
Load data
load("movies.Rdata")  # loads the movies data frame into the workspace
write.table(movies, "movies_full")  # save a plain-text copy of the data
The outcome of interest is audience_score. Overall, the data and its variables are adequate; however, measurement errors may arise in the variables.
plot_grid(
  ggplot(movies, aes(x = thtr_rel_year)) +
    xlab("Release year") + ylab("Frequency") +
    geom_histogram(bins = 30),
  ggplot(movies, aes(x = thtr_rel_year, y = imdb_num_votes)) +
    xlab("Release year") + ylab("IMDB No. of Votes") +
    geom_point() + geom_smooth(method = lm),
  labels = c(1:2)
)
## `geom_smooth()` using formula 'y ~ x'
Create all required variables.
movies_main <- movies %>%
  mutate(
    oscar_season  = ifelse(thtr_rel_month %in% 10:12, "yes", "no"),
    summer_season = ifelse(thtr_rel_month %in% 5:8, "yes", "no"),
    feature_film  = ifelse(title_type == "Feature Film", "yes", "no"),
    drama         = ifelse(genre == "Drama", "yes", "no"),
    mpaa_rating_R = ifelse(mpaa_rating == "R", "yes", "no")
  )
Select the necessary variables for analysis.
movies_main <- movies_main %>%
select(feature_film, drama, runtime, mpaa_rating_R, thtr_rel_year, oscar_season,
summer_season, imdb_rating, imdb_num_votes, critics_score, best_pic_nom,
best_pic_win, best_actor_win, best_actress_win, best_dir_win, top200_box, audience_score)
Convert necessary variables to factors.
factor_vars <- c("oscar_season", "summer_season", "feature_film", "drama", "mpaa_rating_R")
movies_main[, factor_vars] <- lapply(movies_main[, factor_vars], factor)
# glimpse(movies_main)
movies_main_gg <- movies_main[, factor_vars]          # subset the new factor variables
movies_main_gg <- movies_main_gg[, c(3, 4, 1, 2, 5)]  # reorder columns for plotting
P = list()
for (i in names(movies_main_gg)){
  mydata = data.frame(covr1 = movies_main_gg[[i]], as = movies_main$audience_score)
  p = ggplot(mydata, aes(x = covr1, y = as)) +
    geom_boxplot(notch = F) + labs(x = paste0(i), y = "Audience Score") +
    stat_summary(fun = mean, geom = "point", shape = 23, size = 4) +  # mean as a diamond
    theme(plot.title = element_text(size = 9))
  P = c(P, list(p))
}
plot_grid(plotlist = P, labels = c(1:5), ncol = 3)
movies_summary <- movies_main %>%
  select(feature_film, drama, oscar_season, summer_season, mpaa_rating_R)
summary_data <- data.frame(
  "Variable" = c("Feature film", "Drama", "Oscar season", "Summer season", "R rating"),
  "Yes" = as.integer(colSums(movies_summary == "yes")),
  "No"  = as.integer(colSums(movies_summary == "no"))
)
summary_addon <- data.frame(
"Audience score" = "Audience score",
"Mean" = mean(movies_main$audience_score),
"Median" = median(movies_main$audience_score)
)
#xtable(summary_data)
knitr::kable(summary_data,"markdown")
| Variable      | Yes | No  |
|:--------------|----:|----:|
| Feature film  | 591 |  60 |
| Drama         | 305 | 346 |
| Oscar season  | 191 | 460 |
| Summer season | 208 | 443 |
| R rating      | 329 | 322 |
knitr::kable(summary_addon,"markdown")
| Audience.score | Mean     | Median |
|:---------------|---------:|-------:|
| Audience score | 62.36252 | 65     |
The above plot grid compares the outcome of interest (audience score) with the new explanatory variables using boxplots. The diamond-shaped point and the horizontal line indicate each category's mean and median audience score, respectively.
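As a numeric counterpart to the boxplots, the group means and medians can be tabulated directly; a minimal sketch for the feature_film variable (the other factors work the same way):
# Mean and median audience score by feature_film
movies_main %>%
  group_by(feature_film) %>%
  summarise(mean_score = mean(audience_score),
            median_score = median(audience_score))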
We begin by examining the importance of our explanatory variables under all relevant priors, thereby searching for the prior that will be used in the final model.
movies_main <- na.omit(movies_main)  # drop rows with missing values
# Unit information prior: g-prior with g = n
mod.fit.g <- bas.lm(audience_score ~ ., data = movies_main, prior = "g-prior",
                    alpha = nrow(movies_main), modelprior = uniform())
# Zellner-Siow prior with Jeffreys reference prior on sigma^2
mod.fit.ZS <- bas.lm(audience_score ~ ., data = movies_main, prior = "JZS",
                     modelprior = uniform())
# Hyper-g/n prior with hyperparameter alpha = 3
mod.fit.HG <- bas.lm(audience_score ~ ., data = movies_main, prior = "hyper-g-n",
                     alpha = 3, modelprior = uniform())
# Empirical Bayes estimation under maximum marginal likelihood
mod.fit.EB <- bas.lm(audience_score ~ ., data = movies_main, prior = "EB-local",
                     modelprior = uniform())
# BIC to approximate the reference prior
mod.fit.BIC <- bas.lm(audience_score ~ ., data = movies_main, prior = "BIC",
                      modelprior = uniform())
# AIC
mod.fit.AIC <- bas.lm(audience_score ~ ., data = movies_main, prior = "AIC",
                      modelprior = uniform())
To compare the posterior inclusion probability (pip) of each coefficient, we collect the estimates \(p(\beta_i\neq0)\) stored in the probne0 attribute of each model.
probne0 = cbind(mod.fit.g$probne0, mod.fit.BIC$probne0, mod.fit.ZS$probne0,
                mod.fit.EB$probne0, mod.fit.HG$probne0, mod.fit.AIC$probne0)
colnames(probne0) = c("g", "BIC", "ZS", "EB", "HG", "AIC")
rownames(probne0) = mod.fit.BIC$namesx
#include_list <- c("Intercept","runtime","imdb_rating","critics_score")
#probne0 <- probne0[include_list,]
# Generate plot for each variable and save in a list
P = list()
for (i in 1:nrow(probne0)){
mydata = data.frame(prior = colnames(probne0), posterior = probne0[i, ])
mydata$prior = factor(mydata$prior, levels = colnames(probne0))
p = ggplot(mydata, aes(x = prior, y = posterior)) +
geom_bar(stat = "identity") + xlab("") +
ylab("") + theme(text = element_text(size=9), axis.text.x = element_text(angle=90, hjust=1))+
ggtitle(rownames(probne0)[i])+
theme(plot.title = element_text(size = 9))
P = c(P, list(p))
}
plot_grid(plotlist = P)
We observe that the \(AIC\) prior is the least conservative and that imdb_rating and critics_score have high posterior inclusion probabilities (greater than 0.8) under all of the priors. We choose \(BIC\) as the prior since it is not as conservative as the \(g\)-prior, but more conservative than the rest.
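As a quick numeric check of this reading of the plots, the pips of the two strong predictors can be pulled straight from the probne0 matrix assembled above:
# Posterior inclusion probabilities of the two strong predictors under each prior
round(probne0[c("imdb_rating", "critics_score"), ], 3)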
mod.fit <- bas.lm(
  audience_score ~ .,
  data = movies_main,
  prior = "BIC",
  modelprior = uniform(),
  method = "MCMC"
)
coef1 <- coef(mod.fit)
ci <- confint(coef1)[, 1:2]  # keep only the 2.5% and 97.5% bounds
col_names <- c("posterior mean", "posterior std", "pip", colnames(ci))
out <- cbind(coef1$postmean, coef1$postsd, coef1$probne0, ci)
colnames(out) = col_names
# Rearrange posterior inclusion probabilities in descending order and print.
out <- data.frame(out)
out <- out[order(-out$pip), ]
knitr::kable(round(out, 4))
|                     | posterior.mean | posterior.std |    pip |   X2.5. |  X97.5. |
|:--------------------|---------------:|--------------:|-------:|--------:|--------:|
| Intercept           |        62.3477 |        0.3946 | 1.0000 | 61.5741 | 63.1311 |
| imdb_rating         |        14.9860 |        0.7363 | 1.0000 | 13.6270 | 16.5408 |
| critics_score       |         0.0628 |        0.0305 | 0.8858 |  0.0000 |  0.1064 |
| runtime             |        -0.0257 |        0.0312 | 0.4706 | -0.0823 |  0.0001 |
| mpaa_rating_Ryes    |        -0.3036 |        0.7032 | 0.1998 | -2.1621 |  0.0003 |
| best_actor_winyes   |        -0.2910 |        0.8361 | 0.1460 | -2.6434 |  0.0000 |
| best_actress_winyes |        -0.3115 |        0.9094 | 0.1423 | -2.8232 |  0.0000 |
| best_pic_nomyes     |         0.5159 |        1.5807 | 0.1333 |  0.0000 |  5.0611 |
| thtr_rel_year       |        -0.0045 |        0.0182 | 0.0905 | -0.0508 |  0.0000 |
| summer_seasonyes    |         0.0859 |        0.3807 | 0.0793 | -0.0199 |  0.9887 |
| oscar_seasonyes     |        -0.0798 |        0.3760 | 0.0747 | -0.9039 |  0.0000 |
| best_dir_winyes     |        -0.1201 |        0.6243 | 0.0673 | -1.2547 |  0.0463 |
| feature_filmyes     |        -0.1048 |        0.5648 | 0.0655 | -1.0211 |  0.0000 |
| imdb_num_votes      |         0.0000 |        0.0000 | 0.0572 |  0.0000 |  0.0000 |
| top200_boxyes       |         0.0847 |        0.6980 | 0.0468 | -0.0502 |  0.0104 |
| dramayes            |         0.0161 |        0.1941 | 0.0433 |  0.0000 |  0.0000 |
| best_pic_winyes     |        -0.0090 |        0.8515 | 0.0401 |  0.0000 |  0.0000 |
Coefficient graphs
par(mfrow=c(2,2))
plot(coef(mod.fit), subset = c(1, 4, 9, 11), ask = F)  # Intercept, runtime, imdb_rating, critics_score
The posterior inclusion probabilities are listed in descending order in the above table to highlight the important predictors in our model. The variable imdb_rating has a posterior mean of 14.99, while that of critics_score is 0.0628. We also report the 95% credible intervals of all the predictors. Given this data, we believe there is a 95% chance that audience_score increases by 13.63 to 16.54 when imdb_rating increases by one. The lower limit of critics_score's credible interval reaches 0, raising doubts about the importance of that variable. runtime is also included in the coefficient graphs grid, since its posterior inclusion probability is greater than 0.4.
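The interval quoted above can also be read off programmatically; a minimal sketch using the ci matrix built earlier, assuming confint carries the variable names as row names:
# 95% credible interval for the imdb_rating coefficient
round(ci["imdb_rating", ], 4)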
Model diagnostics
We use the "MCMC" sampler until the number of unique models sampled exceeds the size of the model space (\(2^{16}=65536\), where \(16\) is the number of candidate predictors) or until the number of iterations exceeds \(131072 (= 2\times 65536)\), whichever comes first.
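A one-line sanity check of the model-space size (the response column is excluded from the count):
p <- ncol(movies_main) - 1  # 16 candidate predictors
2^p                         # 65536 models in the full model space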
par(mfrow=c(1,2))
diag1 <- c("pip","model")
for(i in 1:2){
diagnostics(mod.fit, type=diag1[[i]], col = "blue", pch = 16, cex.lab = 0.7,
cex.main = 0.7)
}
The plot on the left verifies whether the MCMC exploration has run long enough for the posterior inclusion probabilities (pip) to converge. Since all the points lie on the 45-degree line, we conclude that the pip of each variable from MCMC has converged well enough to the theoretical pip. The plot on the right confirms that the model posterior probabilities have also converged.
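Had the points strayed from the 45-degree line, the usual remedy is to rerun the sampler with more iterations; a hedged sketch (the iteration count here is an arbitrary choice):
# Refit with a longer MCMC run if the diagnostics had shown poor convergence
mod.fit.long <- bas.lm(audience_score ~ ., data = movies_main, prior = "BIC",
                       modelprior = uniform(), method = "MCMC",
                       MCMC.iterations = 10^6)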
par(mfrow=c(2,2))
for(i in 1:4){
plot(mod.fit, which = i, ask = F, add.smooth = F,
pch = 16, cex.lab = 0.7)
}
The inclusion probability plot shows that the variables with posterior inclusion probability greater than 0.5 are imdb_rating and critics_score, besides the intercept, suggesting that these variables are important for prediction. The variables represented by grey lines have posterior inclusion probability less than 0.5.
image(mod.fit, rotate=F, cex.axis=0.8)
We can see that the best-ranked model includes runtime, imdb_rating and critics_score, besides the intercept.
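The same conclusion can be reached without the image; a minimal sketch reading the top model out of the fitted object (BAS stores 0-based variable indices, with 0 denoting the intercept):
# Variables included in the highest posterior probability model
best <- which.max(mod.fit$postprobs)
mod.fit$namesx[mod.fit$which[[best]] + 1]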
Prediction with full model:
# Covariates for the movie "The Accountant" (2016)
accountant <- data.frame(
  feature_film = "yes", drama = "no",
  runtime = 128, mpaa_rating_R = "yes",
  thtr_rel_year = 2016, oscar_season = "yes",
  summer_season = "no", imdb_rating = 7.3,
  imdb_num_votes = 249681, critics_score = 52,
  best_pic_nom = "no", best_pic_win = "no",
  best_actor_win = "no", best_actress_win = "no",
  best_dir_win = "no", top200_box = "no"
)
predict_1.1 <- predict(mod.fit, accountant, estimator = "HPM", interval = "predict", se.fit = F)  # highest probability model
predict_1.2 <- predict(mod.fit, accountant, estimator = "BPM", interval = "predict", se.fit = F)  # best predictive model
predict_1.3 <- predict(mod.fit, accountant, estimator = "BMA", interval = "predict", se.fit = F)  # Bayesian model averaging
predict_1.4 <- predict(mod.fit, accountant, estimator = "MPM", interval = "predict", se.fit = F)  # median probability model
predict_data <- data.frame(
"Movie (Accountant)" = c("Full model-HPM","Full model-BPM", "Full model-BMA", "Full model-MPM"),
"Estimated audience score" = c(predict_1.1$Ybma, predict_1.2$Ybma,
predict_1.3$Ybma, predict_1.4$Ybma),
"Real audience score" = 76
)
knitr::kable(predict_data,"markdown")
| Movie..Accountant. | Estimated.audience.score | Real.audience.score |
|:-------------------|-------------------------:|--------------------:|
| Full model-HPM     |                 72.87218 |                  76 |
| Full model-BPM     |                 73.30925 |                  76 |
| Full model-BMA     |                 73.30925 |                  76 |
| Full model-MPM     |                 73.78982 |                  76 |
The explanatory variables common in the best model are:
intersect(intersect(predict_1.1$best.vars, predict_1.2$best.vars),
intersect(predict_1.3$best.vars, predict_1.4$best.vars))
## [1] "Intercept" "imdb_rating" "critics_score"
# Parsimonious model with the predictors common to the best models
mod.fit.pars <- bas.lm(
  audience_score ~ imdb_rating + critics_score,
  data = movies_main,
  prior = "BIC",
  modelprior = uniform()
)
accountant.pars <- data.frame(
imdb_rating = 7.3, critics_score = 52
)
predict_2 <- predict(mod.fit.pars, accountant.pars, estimator = "HPM", interval = "predict", se.fit = T)
# Parsimonious model with runtime added
mod.fit.pars.runtime <- bas.lm(
audience_score ~ imdb_rating + critics_score + runtime,
data = movies_main,
prior = "BIC",
modelprior = uniform()
)
accountant.pars.runtime <- data.frame(
imdb_rating = 7.3, critics_score = 52, runtime = 128
)
predict_3 <- predict(mod.fit.pars.runtime, accountant.pars.runtime, estimator = "HPM",
interval = "predict", se.fit = T)
# Frequentist linear model with all predictors
mod.fit.lm <- lm(audience_score ~ ., data = movies_main)
predict_4 <- predict(mod.fit.lm, accountant, interval = "prediction")
predict_data <- data.frame(
"Movie (Accountant)" = c("Parsimonious model-BMA", "Parsimonious + runtime model-BMA", "Linear model - full"),
"Estimated audience score" = c(predict_2$Ybma, predict_3$Ybma, predict_4[,1]),
"Real audience score" = 76
)
knitr::kable(predict_data,"markdown")
| Movie..Accountant.               | Estimated.audience.score | Real.audience.score |
|:---------------------------------|-------------------------:|--------------------:|
| Parsimonious model-BMA           |                 73.78982 |                  76 |
| Parsimonious + runtime model-BMA |                 72.87218 |                  76 |
| Linear model - full              |                 70.86968 |                  76 |
The full-model predictions range from 72.87 (HPM) to 73.79 (MPM), close to the actual audience score (76), and the parsimonious models perform nearly as well. We also compare the Bayesian approach with the frequentist linear model (70.87) and find that the Bayesian models predict better.
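The comparison can be made explicit by computing absolute prediction errors; a minimal sketch using the predictions above:
# Absolute error of each estimator against the real audience score of 76
abs(76 - c(HPM = predict_1.1$Ybma, BPM = predict_1.2$Ybma,
           MPM = predict_1.4$Ybma, lm = unname(predict_4[, 1])))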
In this project we determine the influence of several movie attributes on the Rotten Tomatoes audience score. We find that imdb_rating, critics_score and runtime are the major covariates explaining audience_score.
Drawbacks
The strongest predictor, imdb_rating, suffers from the same problems of selection and measurement bias as audience_score itself.
Future work
A more exhaustive investigation would collect data evenly across all years (1970-2014) and recognize the endogeneity (simultaneity bias) of imdb_rating: just as imdb_rating influences audience_score, the opposite effect may also be true.