写在前面

最近在学习python，结合一个实际案例，写一下python和R在做数据分析上的差异。
本人还不是特别熟练Python，所以Python的代码来自于Kaggle的一个高vote回帖。
我这里只是转写一下R的版本，转写python代码之后感觉python做数据分析和可视化实在不如R给力。代码丢这了，有机会说说如何用tidyverse分析数据吧。
这里写了多数代码，剩下流程差不多的就放弃写了。还有机器学习的部分回头有心情了用tidymodels写一下基本的框架吧。

Netflix is an application that keeps growing in popularity, shows and content. This is an EDA,
or storytelling through its data, along with a content-based recommendation system and a wide range of different
graphs and visuals.

image.png

The python source code is from here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

library(tidyverse)
library(skimr)

# Loading the dataset
# tt_load() fetches the TidyTuesday release for the given date; its
# `netflix_titles` element is the table analysed in the rest of this post.
data <- tidytuesdayR::tt_load('2021-04-20')
netfix_dta <- data$netflix_titles
# Install a Python module from R if your Python environment lacks it:
# reticulate::py_install('seaborn',pip = TRUE)

Pass the data from R to Python in RStudio (through reticulate's `r` object)


# `r` exposes objects from the R session; bind the R table to a Python name.
netflix_overall = r.netfix_dta
# Preview the first five rows.
netflix_overall.head(n=5)

Also, you can do the same thing using R

# Preview the first rows, then print every column with its type.
netfix_dta %>% head()

netfix_dta %>% glimpse()

Therefore, it is clear that the dataset contains 12 columns for exploratory analysis.

# Number of non-missing values in each column.
netflix_overall.count()

Also, in R you can do it better.

# Per-column summary (missingness, distributions) from skimr.
netfix_dta %>% skim()


# Keep only the rows whose type is 'TV Show'.
netflix_shows = netflix_overall.loc[netflix_overall['type'] == 'TV Show']
netflix_shows.head()

In R, you can do the same with the pipe, which makes your script easy to read.

# Keep only the TV-show rows and preview them.
netflix_shows <- filter(netfix_dta, type == "TV Show")

netflix_shows %>% head()


# Keep only the rows whose type is 'Movie'.
netflix_movies = netflix_overall.loc[netflix_overall['type'] == 'Movie']

# Keep only the movie rows.
netflix_movies <- filter(netfix_dta, type == "Movie")

Analysis of Movies vs TV Shows.


# Bar chart of how many titles fall under each type.
sns.set(style="darkgrid")
ax = sns.countplot(data=netflix_overall, x="type", palette="Set2")
plt.show()

In R

# Bar chart of title counts per type; fct_rev() reverses the level order
# on the x axis.
ggplot(netfix_dta, aes(x = fct_rev(type), fill = type)) +
  geom_bar() +
  theme_bw()


It is evident that there are more Movies on Netflix than TV shows.

If a producer wants to release some content, in which month should they do so? (The month in which the least content is added.)

# Heatmap of how many TV shows were added per (month, year).
# Assumes date_added strings look like 'Month D, YYYY' -- the splits below
# take the text after the last ', ' as year and the first word as month.
netflix_date = netflix_shows[['date_added']].dropna()
added = netflix_date['date_added']
netflix_date['year'] = [d.split(', ')[-1] for d in added]
netflix_date['month'] = [d.lstrip().split(' ')[0] for d in added]

# Reversed calendar order so that January ends up at the top of the plot.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
][::-1]
# Rows = months, columns = years, cells = number of additions (0 if none).
df = (
    netflix_date.groupby('year')['month']
    .value_counts()
    .unstack()
    .fillna(0)[month_order]
    .T
)
plt.figure(figsize=(10, 7), dpi=200)
plt.pcolor(df, cmap='afmhot_r', edgecolors='white', linewidths=2)  # heatmap
# Put each tick label at the centre of its cell.
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns,
           fontsize=7, fontfamily='serif')
plt.yticks(np.arange(0.5, len(df.index), 1), df.index,
           fontsize=7, fontfamily='serif')

plt.title('Netflix Contents Update', fontsize=12, fontfamily='calibri',
          fontweight='bold', position=(0.20, 1.0+0.02))
cbar = plt.colorbar()

cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()
plt.show()

library(lubridate)
library(viridis)

# Tile plot of how many titles were added in each (year, month) cell.
netfix_dta %>%
  transmute(
    date_added = mdy(date_added),
    month = month(date_added, label = TRUE, abbr = FALSE),
    year = year(date_added)
  ) %>%
  # Rows whose date could not be parsed have no month; drop them.
  filter(!is.na(month)) %>%
  count(year, month, name = "contents") %>%
  ggplot(aes(x = year, y = fct_rev(month), fill = contents)) +
  geom_tile() +
  viridis::scale_fill_viridis(option = "A") +
  labs(title = 'Netflix Contents Update', x = '', y = '')

Movie ratings analysis

# Count of movies per rating, most frequent ratings first (at most 15 bars).
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
rating_order = netflix_movies['rating'].value_counts().index[0:15]
ax = sns.countplot(data=netflix_movies, x="rating", palette="Set2",
                   order=rating_order)
plt.show()

In R

# Count of movies per rating, bars ordered from most to least frequent.
# Uses netflix_movies (not the full table) so the chart matches the
# "Movie ratings" section and the Python countplot.
netflix_movies %>%
  filter(!is.na(rating)) %>%
  count(rating) %>%
  ggplot(aes(x = fct_reorder(rating, n, .desc = TRUE), y = n, fill = rating)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  # No gap below the bars, 10% headroom above them.
  scale_y_continuous(expand = expansion(c(0, .1))) +
  labs(
    x = 'Rating',
    y = 'Count'
  )

Analysing IMDB ratings to get top rated movies on Netflix

# Load only the needed IMDb columns from the two CSV files.
imdb_ratings = pd.read_csv('netflix/IMDb ratings.csv',
                           usecols=['weighted_average_vote'])

imdb_titles = pd.read_csv('netflix/IMDb movies.csv',
                          usecols=['title', 'year', 'genre'])

# The two frames are combined by row index, not by a key column.
ratings = pd.DataFrame({
    'Title': imdb_titles.title,
    'Release Year': imdb_titles.year,
    'Rating': imdb_ratings.weighted_average_vote,
    'Genre': imdb_titles.genre,
})
ratings = ratings.drop_duplicates(subset=['Title', 'Release Year', 'Rating'])
ratings.shape

ratings.head()

In R

# Load the IMDb tables, keeping columns by name rather than by position so a
# change in the CSV column order cannot silently pick the wrong fields.
imdb_ratings <- read_csv("netflix/IMDb ratings.csv") %>%
  select(imdb_title_id, weighted_average_vote)
imdb_titles <- read_csv("netflix/IMDb movies.csv") %>%
  select(imdb_title_id, title, year, genre)

# Attach each title's rating via the shared IMDb id, then drop rows that repeat
# the same title/year/rating (mirrors drop_duplicates() in the Python version).
ratings <- imdb_titles %>%
  left_join(imdb_ratings, by = "imdb_title_id") %>%
  select(title, year, genre, Rating = weighted_average_vote) %>%
  distinct(title, year, Rating, .keep_all = TRUE)
ratings
dim(ratings)

# dropna() returns a new frame; assign it back, otherwise the rows with
# missing values are still present in the merge below.
ratings = ratings.dropna()
# Keep only titles present in both tables, best-rated first.
joint_data = ratings.merge(netflix_overall, left_on='Title', right_on='title', how='inner')
joint_data = joint_data.sort_values(by='Rating', ascending=False)

joint_data.head()
joint_data.shape

# Drop incomplete rating rows, keep titles that also exist in the Netflix
# table, and sort best-rated first.
# drop_na() replaces filter(!is.na(.)): is.na() on a whole data frame yields a
# logical matrix, which filter() cannot use as a row condition.
joint_data <- ratings %>%
  drop_na() %>%
  inner_join(netfix_dta, by = "title") %>%
  arrange(desc(Rating))

dim(joint_data)

import plotly.express as px

# joint_data is sorted by Rating (descending), so the first ten rows are the
# ten best-rated titles.
top_rated = joint_data.iloc[0:10]
top_rated
# Inner ring = title, outer ring = country; size and colour both follow Rating.
fig = px.sunburst(top_rated, path=['title', 'country'],
                  values='Rating', color='Rating')
fig.show()

library(plotly)
# head() returns at most ten rows and never pads with NA rows the way
# joint_data[1:10, ] does when fewer than ten titles are available.
top_rated <- head(joint_data, 10)
fig <- plot_ly(
  # One node per title (inner ring) and one "title-country" node (outer ring).
  ids = c(top_rated$title, paste0(top_rated$title, "-", top_rated$country)),
  labels = c(top_rated$title, top_rated$country),
  # Titles are roots (empty parent); each country node hangs under its title.
  # The root count follows the data instead of a hard-coded 10.
  parents = c(rep("", nrow(top_rated)), top_rated$title),
  # NOTE(review): plot_ly's `colors` expects a palette, not data values;
  # kept as in the original -- confirm whether `values =` was intended.
  colors = c(top_rated$Rating, top_rated$Rating),
  type = "sunburst",
  branchvalues = "total"
)

fig

# Same sunburst, drawn from the `top_rated` table built on the R side.
fig = px.sunburst(r.top_rated, path=['title', 'country'],
                  values='Rating', color='Rating')
fig.show()

Countries with highest rated content.

# Number of joined titles per country, largest first; keep the first 11 rows.
country_count = (
    joint_data['country']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
topcountries = country_count.iloc[0:11]
topcountries

# Number of joined titles per country (missing countries excluded),
# largest first.
topcountries <- joint_data %>%
  filter(!is.na(country)) %>%
  count(country, sort = TRUE)

import plotly.express as px

# Hard-coded title counts for ten countries, drawn as a funnel chart.
data = dict(
    number=[1063, 619, 135, 60, 44, 41, 40, 40, 38, 35],
    country=[
        "United States", "India", "United Kingdom", "Canada", "Spain",
        'Turkey', 'Philippines', 'France', 'South Korea', 'Australia',
    ],
)
fig = px.funnel(data, x='number', y='country')
fig.show()

library(reticulate)
# Pull the Python dict `data` into R. It gets its own name so that the
# TidyTuesday object already stored in R's `data` is not overwritten.
funnel_data <- py$data %>%
  as.data.frame() %>%
  arrange(desc(number))

# Funnel chart with countries listed in the sorted order above.
# (The stray trailing comma inside plot_ly() has been removed.)
plot_ly(
  y = funnel_data$country,
  x = funnel_data$number,
  type = "funnel"
) %>%
  layout(yaxis = list(categoryarray = funnel_data$country))

Year wise analysis

# Horizontal bars: number of movies for the 15 most common release years.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
year_order = netflix_movies['release_year'].value_counts().index[0:15]
ax = sns.countplot(data=netflix_movies, y="release_year", palette="Set2",
                   order=year_order)
plt.show()

# Horizontal bars: number of movies for the 15 most common release years.
netflix_movies %>%
  count(release_year, sort = TRUE) %>%
  head(15) %>%
  # Freeze the sorted order as factor levels so ggplot keeps it.
  mutate(release_year = factor(release_year, levels = release_year)) %>%
  ggplot(aes(x = n, y = fct_rev(release_year), fill = release_year)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  ggsci::scale_fill_simpsons()

Analysis of duration of movies

# netflix_movies was produced by filtering netflix_overall; take an explicit
# copy so the column assignment below does not trigger SettingWithCopyWarning.
netflix_movies = netflix_movies.copy()
# Strip the literal ' min' suffix and convert the remaining digits to int.
netflix_movies['duration'] = (
    netflix_movies['duration']
    .str.replace(' min', '', regex=False)
    .astype(int)
)
netflix_movies['duration']
plt.figure(figsize=(8, 8))
sns.set(style="darkgrid")
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(data=netflix_movies['duration'], fill=True)
plt.show()

# Density of movie running time after removing the " min" suffix.
netflix_movies %>%
  mutate(duration = as.double(str_remove(duration, " min"))) %>%
  ggplot(aes(x = duration)) +
  geom_density(fill = "blue2", alpha = .4) +
  ggthemes::theme_solarized()

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

from collections import Counter

genres = list(netflix_movies['listed_in'])
# Split every comma-separated genre list and remove spaces inside each name.
gen = [name.replace(' ', "")
       for listing in genres
       for name in listing.split(',')]
# Genre -> number of movies; reused by the lollipop chart further down.
g = Counter(gen)

# The cloud is built from the distinct genre names only.
text = list(set(gen))
plt.rcParams['figure.figsize'] = (13, 13)

wordcloud = WordCloud(max_words=1000000,
                      background_color="white").generate(str(text))

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

library(wordcloud)
library(tidytext)
# Fix the seed so the cloud layout is reproducible.
set.seed(2021)
# One row per word of `listed_in`, counted and sorted by frequency.
word_counts <- netflix_movies %>%
  unnest_tokens(word, listed_in) %>%
  count(word, sort = TRUE)
wordcloud(words = word_counts$word, freq = word_counts$n, max.words = 100)

# `import matplotlib.pyplot as plt` binds only `plt`; the top-level package
# must be imported for matplotlib.use() to resolve.
import matplotlib

matplotlib.use('TkAgg')
# Sort genres by descending count; kept as a plain dict (read from R as py$g).
g = dict(sorted(g.items(), key=lambda item: item[1], reverse=True))
g
fig, ax = plt.subplots()

x = list(g.keys())
y = list(g.values())
# Lollipop chart: a stem per genre with a marker on top.
ax.vlines(x, ymin=0, ymax=y, color='green')
ax.plot(x, y, "o", color='maroon')
# Rotate the category labels already on the axis instead of re-setting label
# text without fixed tick positions.
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel("Count of movies")
# set a title
ax.set_title("Genres")
plt.show()

# Bring the Python genre counts into R: one row per genre (row name = genre),
# one column `n` with the count.
g <- data.frame(n = unlist(py$g))

# Lollipop chart with genres ordered from most to least frequent.
g %>%
  rownames_to_column("name") %>%
  mutate(name = fct_reorder(name, n, .desc = TRUE)) %>%
  ggplot(aes(x = name, y = n)) +
  geom_segment(aes(x = name, xend = name, y = 0, yend = n)) +
  geom_point(size = 5, color = 'orange') +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Lowest number of seasons.

import plotly.graph_objects as go

features = ['title', 'duration']
# .copy() makes an independent frame, so adding a column below is not an
# assignment on a slice of netflix_shows (avoids SettingWithCopyWarning).
durations = netflix_shows[features].copy()

# '1 Season' -> 1, '3 Seasons' -> 3: strip the unit in one pass.
durations['no_of_seasons'] = (
    durations['duration']
    .str.replace(r' Seasons?', '', regex=True)
    .astype(int)
)

top = durations[['title', 'no_of_seasons']]

top = top.sort_values(by='no_of_seasons', ascending=False)
bottom = top.sort_values(by='no_of_seasons')
# Show rows 20-49 of the ascending list.
bottom = bottom.iloc[20:50]

# The table is a plotly figure, so no matplotlib figure is created for it.
fig = go.Figure(data=[go.Table(
    header=dict(values=['Title', 'No of seasons']),
    cells=dict(values=[bottom['title'], bottom['no_of_seasons']],
               fill_color='lavender'),
)])
fig.show()

library(kableExtra)
# Table of TV shows with their season count.
netflix_shows %>%
  select(title, duration) %>%
  # "2 Seasons" -> duration = "2", season = "Seasons"; `sep` is named so it
  # cannot be mistaken for the positional `into` argument.
  separate(duration, into = c('duration', 'season'), sep = ' ') %>%
  mutate(duration = as.numeric(duration)) %>%
  # Ascending order: this section (and the Python table) lists the shows
  # with the fewest seasons first.
  arrange(duration) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

左手python右手R

左手python右手R

写在前面

Analysis of Movies vs TV Shows.

Movie ratings analysis

Analysing IMDB ratings to get top rated movies on Netflix

Analysis of duration of movies

Lowest number of seasons.