You can use the table of contents on the left to navigate. The first two sections are just code and can be skipped.
def cast_of_season(sid, include_featured=True):
    """Return subset of actors table corresponding to cast of the given season.

    Convenience wrapper around cast_of_seasons for a single season id.
    include_featured: when False, featured (non-repertory) players are excluded.
    """
    return cast_of_seasons([sid], include_featured)
def cast_of_seasons(sids, include_featured=True):
    """Return the rows of the actors table for everyone who was a cast member
    in any of the given seasons.

    sids: iterable of season ids.
    include_featured: when False, featured (non-repertory) players are excluded.
    """
    mask = casts['sid'].isin(sids)
    if not include_featured:
        mask = mask & ~casts['featured']
    member_aids = casts[mask]['aid'].unique()
    return actors.loc[member_aids]
def eps_in_range(start, end):
    """Count episodes whose epid falls in the inclusive range [start, end]."""
    # Series.between is inclusive on both ends by default, matching the
    # original (>= start) & (<= end) pair of comparisons.
    return episodes['epid'].between(start, end).sum()
# The metric names accepted by the airtime functions below (total_airtime,
# host_usage, etc.); see the explanatory string that follows.
usage_metrics = {'episode_share', 'cast_episode_share', 'n_titles'}
"""A value is calculated for each title and each of the above metrics in the following way:
- episode_share: a title's episode share is 1 / the total number of 'eligible' titles in its episode
- cast_episode_share: the above, but further divided by the number of performers in the title
- n_titles: 1
To associate a value for one of the above metrics with a performer and an episode, we sum
the values for each distinct title they appeared in (we don't double-count if they appeared
in multiple roles in a single sketch). A value of x has the following interpretations depending
on the metric in use:
- episode_share: The actor appeared in 100*x% of eligible sketches in the episode.
- cast_episode_share: The actor 'owned' 100*x% of the audience's attention during the episode.
If they only appeared in solo sketches, this number would be the same as episode_share. The
more other performers they share their sketches with, the lower this number becomes.
- n_titles: The actor appeared in x eligible sketches.
To get a value for these metrics over larger time spans, you should take an average over episodes.
'eligible' titles include basically all segments that cast members can normally appear in (including
monologue and Weekend Update). See performer_title_categories in 1.5 for an enumeration.
"""
def total_airtime(aid, apps_idx, metric):
    """Return the sum of the given airtime metric over some appearances for some actor.

    aid: actor id.
    apps_idx: boolean array that indexes into the appearances table; only
        appearances matching it (and the given aid) are counted.
    metric: one of usage_metrics.
    """
    assert metric in usage_metrics
    _apps = apps[
        (apps['aid']==aid) & apps_idx
    ]
    # Dedupe on tid so an actor with multiple roles in one sketch counts once.
    tids = _apps['tid'].unique()
    if metric == 'n_titles':
        return len(tids)
    # The other metrics have precomputed per-title values stored in the titles
    # table. titles is indexed by tid (set_index cell in section 1), so a .loc
    # lookup on the index is much cheaper than scanning every row with isin()
    # -- this was the slow part flagged in the original TODO comment.
    return titles.loc[tids, metric].sum()
def epid_idx_for_castyear(cast, table):
    """Return a boolean array that indexes the given table down to the episodes
    falling within the given cast-year (a row from the casts table).

    A cast member's first/last episode defaults to the season's first/last
    when the cast row doesn't specify one (i.e. they were present for the
    whole season).
    """
    season = seasons[seasons['sid']==cast.sid].iloc[0]
    first = season.first_epid if pd.isnull(cast.first_epid) else cast.first_epid
    last = season.last_epid if pd.isnull(cast.last_epid) else cast.last_epid
    return (table['epid'] >= first) & (table['epid'] <= last)
def airtime_one_season(cast, metric):
    """Return average airtime per episode (for the given metric) over one
    cast-year (a row from the casts table)."""
    span_idx = epid_idx_for_castyear(cast, apps)
    season_total = total_airtime(cast.aid, span_idx, metric)
    return season_total / cast.n_episodes
def season_airtimes(sid):
    """Return the casts rows for the given season with one column added per
    usage metric, holding each member's total airtime for that season."""
    df = casts[casts['sid']==sid].copy()
    for metric in usage_metrics:
        df[metric] = [
            total_airtime(cast.aid, epid_idx_for_castyear(cast, apps), metric)
            for cast in df.itertuples()
        ]
    return df
def overall_actor_airtime(aid, metric='episode_share'):
    """Sum the given airtime metric over an actor's entire run on the show
    (all of their cast-years)."""
    return sum(
        total_airtime(cast.aid, epid_idx_for_castyear(cast, apps), metric)
        for cast in casts[casts['aid']==aid].itertuples()
    )
def overall_avg_airtime(aid, metric='episode_share'):
    """Average per-episode airtime (for the given metric) over an actor's
    entire run on the show."""
    cast_years = list(casts[casts['aid']==aid].itertuples())
    total = sum(
        total_airtime(c.aid, epid_idx_for_castyear(c, apps), metric)
        for c in cast_years
    )
    eligible_eps = sum(c.n_episodes for c in cast_years)
    return total / eligible_eps
# NB: this file is a flat export of a Jupyter notebook. '%matplotlib inline'
# is an IPython cell magic (not valid plain Python), and the __future__ import
# is legal mid-file only because each notebook cell compiles separately.
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
from collections import defaultdict
# TODO: If this stuff starts getting slow, might want to set appropriate indices (aid, tid, epid, etc.)
# Also, might help a bit to use categorical column types for some string cols.
# Because we used the JsonLinesItemExporter, our files don't actually contain
# well-formed json (they're missing enclosing square brackets and commas between
# items)
def load_json(path):
    """Load one scraped table from a file of newline-delimited JSON records.

    Because we used the JsonLinesItemExporter, our files aren't a single
    well-formed JSON document (no enclosing brackets, no commas between
    items) -- but pandas parses the json-lines format directly with
    lines=True, so there's no need to read and rewrap the file by hand.
    """
    return pd.read_json(path, orient='records', lines=True)
# Load every scraped table from output/<name>.json into the `tables` dict.
json_dir = 'output'
table_names = ['actors', 'appearances', 'characters', 'episodes', 'hosts', 'impressions',
               'seasons', 'sketches', 'titles', 'casts',
               ]
tables = {}
for name in table_names:
    path = os.path.join(json_dir, name+'.json')
    df = load_json(path)
    tables[name] = df
# Quick sanity check on row counts (Python 2 print statement).
for name, table in tables.items():
    print "Loaded {:,} {}".format(len(table), name)
# Whoops, forgot to do stupid namespace thing for casts: cast-member aids
# elsewhere carry a 'c_' prefix, so add it here too.
casts = tables['casts']
casts['aid'] = 'c_' + casts['aid']
# Do some useful pre-emptive merge things
# Add actor_name to some tables that use aid
aid_tables = ['impressions', 'characters', 'appearances', 'hosts', 'casts']
for tablename in aid_tables:
    table = tables[tablename]
    # If the table already had a 'name' column, merge suffixes it to 'name_x'
    # and the actor's name to 'name_a'; otherwise the actor name arrives
    # unsuffixed as 'name'. The rename dict covers both cases (rename ignores
    # keys that aren't present).
    mg = table.merge(tables['actors'], on='aid', suffixes=('_x', '_a'))\
        .rename(columns=dict(name_a='actor_name', name_x='name', name='actor_name'))\
        .drop('type', axis=1)
    tables[tablename] = mg
    # Guard against a merge that silently matched nothing.
    assert len(mg)
# One extra (weird) row for season 5 we need to remove. Doesn't seem to be real ep.
episodes = tables['episodes']
episodes = episodes[pd.notnull(episodes['epno'])]
episodes['epid'] = episodes['epid'].astype(int)
tables['episodes'] = episodes
# Add epid, sid to appearances
apps = tables['appearances']
mg = apps.merge(tables['titles'], on='tid')\
    .drop(['category', 'order', 'skid', 'name'], axis=1)
mg = mg.merge(tables['episodes'], on='epid')\
    .drop(['aired', 'epno'], axis=1)
tables['appearances'] = mg
# Add sid to titles
titles = tables['titles']
mg = titles.merge(tables['episodes'], on='epid')\
    .drop(['aired', 'epno'], axis=1)
tables['titles'] = mg
# NOTE(review): _table_names appears unused in this chunk (table_names above
# is the live list) -- presumably a leftover; left in place.
_table_names = ['actors', 'appearances', 'characters', 'episodes', 'hosts', 'impressions',
                'seasons', 'sketches', 'titles', 'casts',
                ]
# Index the key tables by their id column, keeping the column itself
# (drop=False) so code can use either table['sid'] or table.loc[sid].
table_to_index = dict(episodes='epid', impressions='impid', seasons='sid', titles='tid', actors='aid')
for (tablename, idx_col) in table_to_index.items():
    tables[tablename].set_index(idx_col, drop=False, inplace=True)
def fix_aids():
    """There are a small number of instances where the same actor is represented by more than one
    aid, where one of them has irregular capitalization. Fix those instances.

    Canonical aids follow the pattern 'c_XxYy' (the two initials capitalized,
    the other letters lowercase), e.g. 'c_PhHa'.
    """
    bad_aids = {'c_Jifa', 'c_Biha', 'c_Taki', 'c_Phha', 'c_keTh', 'c_Keth', 'c_ChCH', 'c_PhHA', 'c_KeTH',
                'c_TaKI', 'c_BiHA', 'c_JiFA', 'c_ChCH', 'c_TaKI'
                }
    actors = tables['actors']
    actors = actors[~actors['aid'].isin(bad_aids)]
    tables['actors'] = actors
    fixed = {}
    for bad in bad_aids:
        # BUG FIX: the final character must be lowercased, not uppercased.
        # The old code mapped e.g. 'c_Phha' -> 'c_PhHA', which is itself in
        # bad_aids (and was just deleted from the actors table above), leaving
        # dangling aids that later cells had to special-case by hand.
        good = bad[:2] + bad[2].upper() + bad[3].lower() + bad[4].upper() + bad[5].lower()
        fixed[bad] = good
    fixfn = lambda aid: fixed.get(aid, aid)
    for tname, table in tables.items():
        if tname == 'actors':
            continue
        if 'aid' in table.columns:
            table['aid'] = table['aid'].map(fixfn)
fix_aids()
# I know, I know -- dump every table into the module namespace so later cells
# can say e.g. `episodes` instead of tables['episodes'].
for name, table in tables.items():
    globals()[name] = table
# shortcut alias
apps = appearances
imps = impressions
def enrich_seasons():
    """Add derived columns to the seasons table: first_epid, last_epid and
    n_episodes, each computed from the episodes table."""
    firsts, lasts, counts = [], [], []
    for season in seasons.itertuples():
        season_eps = episodes[episodes['sid']==season.sid]['epid']
        firsts.append(season_eps.min())
        lasts.append(season_eps.max())
        counts.append(len(season_eps))
    seasons['first_epid'] = firsts
    seasons['last_epid'] = lasts
    seasons['n_episodes'] = counts
def enrich_casts():
    """Add a column for each cast-year entry with the number of episodes the cast member
    was eligible to appear in in that season. (Normally this will be fixed per season across
    all cast members. The exception is cast members who start late in the season or end their
    run mid-season.)
    """
    n_eps = []
    fracs = []
    for cast in casts.itertuples():
        season = seasons.loc[cast.sid]
        # A missing first/last epid means the member was there for the whole
        # season, so fall back to the season's boundaries.
        first = season['first_epid'] if pd.isnull(cast.first_epid) else cast.first_epid
        last = season['last_epid'] if pd.isnull(cast.last_epid) else cast.last_epid
        count = eps_in_range(first, last)
        n_eps.append(count)
        fracs.append(count / season['n_episodes'])
    casts['n_episodes'] = n_eps
    casts['season_fraction'] = fracs
# Run the enrichment passes and eyeball the result.
enrich_seasons()
enrich_casts()
seasons.head(2)
# NB: slowish
weekend_update_categories = {'Weekend Update', 'Saturday Night News', 'SNL Newsbreak'}
live_sketch_categories = {'Sketch', 'Musical Sketch', 'Show', 'Game Show', 'Award Show'}
recorded_sketch_categories = {'Film', 'Commercial'}
# (See note in items.py re Miscellaneous category)
misc_performer_categories = {'Cold Opening', 'Monologue', 'Miscellaneous'}
# These are the categories of titles that count when computing airtime statistics.
# Main omissions are Goodnights and Musical Performance. Also some rarer categories
# like Guest Performance, In Memoriam, Talent Entrance, etc.
performer_title_categories = (
    misc_performer_categories
    | weekend_update_categories
    | live_sketch_categories
    | recorded_sketch_categories
)
def add_airtime_columns(force=False):
    """Add some derived columns to titles/episodes that are useful when calculating relative 'airtime'
    of cast members and guests.

    Adds to titles:
      - episode_share: 1/n for each of the n eligible titles in its episode
      - n_performers: number of distinct performers appearing in the title
      - cast_episode_share: episode_share / n_performers
    Pass force truthy to recompute even if the columns already exist.
    """
    if 'episode_share' in titles.columns and not force:
        return
    # If there are n eligible titles in an episode, each has an episode_share of 1/n
    titles['episode_share'] = 0.0
    titles['n_performers'] = 0
    # The same as above, but each title is further normalized by number of performers present.
    titles['cast_episode_share'] = 0.0
    for episode in episodes.itertuples():
        # XXX: Need to rescrape these eps (skipped because they have no host)
        if episode.epid in (19810411, 19841006, 19811003):
            continue
        # Mask of this episode's eligible titles.
        ep_titles_ix = ((titles['epid']==episode.epid)
                        & (titles['category'].isin(performer_title_categories)))
        n_titles = ep_titles_ix.sum()
        if n_titles == 0:
            print 'Warning: Found 0 titles for epid {}. Skipping.'.format(episode.epid)
            continue
        # True division (see the __future__ import), so this yields a float.
        titles.loc[ep_titles_ix, 'episode_share'] = 1/n_titles
        perfs = []
        for title in titles[ep_titles_ix].itertuples():
            # Distinct actors only: an actor with several roles in one title
            # counts once.
            performer_aids = apps[apps['tid']==title.tid]['aid'].unique()
            perfs.append(len(performer_aids))
        titles.loc[ep_titles_ix, 'n_performers'] = perfs
        titles.loc[ep_titles_ix, 'cast_episode_share'] = (
            titles.loc[ep_titles_ix, 'episode_share']
            /
            titles.loc[ep_titles_ix, 'n_performers']
        )
# force=1: recompute unconditionally.
add_airtime_columns(1)
def eps_present_in_casts(cs):
    """Count the episodes in which the given cast-years actually appeared.

    cs: a slice of the casts table (typically all of one actor's cast-years).
    For each cast-year, counts the distinct episodes within that span in which
    the member had at least one appearance, and sums over the rows.
    """
    total = 0
    for cast in cs.itertuples():
        season = seasons.loc[cast.sid]
        # Missing first/last epid means the member was there all season.
        first = season.first_epid if pd.isnull(cast.first_epid) else cast.first_epid
        last = season.last_epid if pd.isnull(cast.last_epid) else cast.last_epid
        in_span = (
            (apps['aid']==cast.aid)
            & (apps['epid'] >= first)
            & (apps['epid'] <= last)
        )
        total += apps.loc[in_span, 'epid'].nunique()
    return total
def build_tenure():
    """Return a DataFrame of tenure stats, one row per cast member.

    Columns:
      n_episodes: episodes aired between the person's start and finish
      eps_present: episodes they actually appeared in as a cast member
        (may be less than n_episodes for weeks where they weren't in the show)
      n_seasons: number of cast-years
    """
    # Haha, so I guess some performers actually have non-contiguous runs on the show (e.g. Al
    # Franken), so some previous assumptions won't work.
    cols = ['aid', 'actor_name', 'n_episodes', 'eps_present', 'n_seasons']
    rows = []
    cast = actors[actors['type']=='cast']
    for actor in cast.itertuples():
        aid, name = actor.aid, actor.name
        cast_years = casts[casts['aid'] == aid].sort_values(by='sid')
        if len(cast_years) == 0:
            # Data inconsistency between the actors and casts tables.
            print "Warning: {} ({}) was in actors table with type='cast', but they aren't in casts table"\
                .format(name, aid)
            continue
        n_seasons = len(cast_years)
        n_episodes = cast_years['n_episodes'].sum()
        eps_present = eps_present_in_casts(cast_years)
        row = [aid, name, n_episodes, eps_present, n_seasons]
        rows.append(row)
    return pd.DataFrame(rows, columns=cols)
tenure = build_tenure()
tenure.head(3)
import gender_guesser.detector as gender
detector = gender.Detector()
def names_from_file(fname):
    """Read a file with one name per line into a set of unicode strings.

    NOTE(review): calling .decode on the lines is Python 2 only (in py3,
    open() already yields str).
    """
    with open(fname) as f:
        return set([name.strip().decode('utf-8') for name in f])
# Manual override sets consulted by genderize() before falling back to the
# gender_guesser library.
# First names not recognized by gender_guesser
extra_malenames = {
    'Beck', 'Mikey', 'Chevy', 'Norm',
    'Nile', 'Lin-Manuel', 'Macaulay', 'Kiefer', 'Spike', 'Kanye', 'Rainn', 'Shia',
    'Sting', 'Hulk', 'Liberace', 'Yogi', 'Merv', 'Mr.', 'O.J.',
}
extra_femalenames = {
    'Aidy', 'Sasheer', 'Janeane', 'Danitra',
    'Lorde', 'Taraji', 'Uzo', 'Brie', 'Rihanna', 'January',
    'Anjelica', 'Oprah', 'Ann-Margret',
}
# Names misgendered by gender_guesser (or labelled as androgynous/unknown).
# Full-name overrides take precedence over first-name ones.
female_fullnames = {
    'Blake Lively', 'Terry Turner', 'Dakota Johnson', 'Cameron Diaz', 'Taylor Swift',
    'Robin Wright', 'Sydney Biddle Barrows', 'Whitney Houston', 'Morgan Fairchild',
    'Reese Witherspoon',
    # NOTE(review): .decode on a str literal is Python 2 only.
    'Casey Wilson', 'Nasim Pedrad', 'Noël Wells'.decode('utf-8'), 'Jan Hooks', 'Robin Duke',
}.union(names_from_file('female_names.txt'))
male_fullnames = {
    'Kyle Gass', 'The Rock', 'Jamie Foxx', 'Kelsey Grammer', 'Leslie Nielsen',
    'Kyle MacLachlan', 'Desi Arnaz Jr.', 'Desi Arnaz', 'Kyle Mooney', 'The Weeknd',
    'Bernie Sanders', 'Sacha Baron Cohen', 'A. Whitney Brown', 'Finesse Mitchell',
    'Dana Carvey', 'Tracy Morgan',
}.union(names_from_file('male_names.txt'))
# A few interesting cases: Dame Edna, RuPaul, Marilyn Manson, T.J. Jourian (transman).
# I labelled as ffmm, respectively.
def genderize(name, confident=True):
    """Best-effort gender guess for a name.

    Checks the manual override sets first (full name, then first name), then
    falls back to gender_guesser. With confident=True, its 'mostly_male' /
    'mostly_female' guesses are collapsed to 'male' / 'female'.
    """
    if name in female_fullnames:
        return 'female'
    if name in male_fullnames:
        return 'male'
    first = name.split()[0]
    if first in extra_malenames:
        return 'male'
    if first in extra_femalenames:
        return 'female'
    guess = detector.get_gender(first)
    if confident and guess in ('mostly_male', 'mostly_female'):
        return guess[len('mostly_'):]
    return guess
# Tag every actor with a guessed gender and eyeball the distribution.
actors['gender'] = actors['name'].apply(genderize)
actors['gender'].value_counts()
# Most-seen recurring characters: count appearances per charid, then attach
# the character metadata.
charapps = appearances[pd.notnull(appearances['charid'])]
char_counts = charapps.groupby('charid').size().to_frame(name='count').reset_index(level=0)
char_counts = pd.merge(char_counts, characters, on='charid')
char_counts.sort_values(by='count', ascending=False, inplace=True)
char_counts.head(20)
Wow, Sarducci. Really fascinating thing is apparently Don Novello hosted one ep and co-hosted another in character as Guido Sarducci, and his appearances in those episodes aren't even counted here. Man, the Ebersol years were weird.
A lot of these characters are from the 90's - consistent with an observation from a writer in LFNY about this being the era that relied most on recurring characters, and drove them into the ground.
Visualization concept: x axis is time (or just season). For each popular character, do a sort of KDE of their appearances over time. See which characters were flash-in-the-pan and which had more of a slow burn. Perhaps even more interesting: see which seasons used the most highly recurring characters. For characters that appear as part of the same recurring sketch (e.g. Hans+Franz, Wayne+Garth, MacGruber+Vicki), consider them as a gestalt and just count whichever one appeared most often (probably the other character appeared in a subset of their sketches).
# Most-performed impressions: same pattern as the character counts above,
# keyed on impid. NOTE(review): the charapps/char_counts names are reused
# from the previous cell.
charapps = appearances[pd.notnull(appearances['impid'])]
char_counts = charapps.groupby('impid').size().to_frame(name='count').reset_index(level=0)
char_counts = pd.merge(char_counts, impressions, on='impid')
char_counts.sort_values(by='count', ascending=False, inplace=True)
char_counts.head(20)
Darrell Hammond's Clinton impression wins by a huge margin. I think this is a combination of factors:
# Most impersonated people (lumping together different impersonators)
charapps = appearances[pd.notnull(appearances['impid'])]
# Group by role (the name of the person being impersonated) rather than impid,
# so the same subject played by several actors is counted together.
char_counts = charapps.groupby('role').size().to_frame(name='count').reset_index(level=0)
char_counts.sort_values(by='count', ascending=False, inplace=True)
char_counts.head(20)
Notes:
# Number of distinct impression rows per impersonated name.
impressions.groupby('name').size().sort_values(ascending=False).head(20)
For the most part, should expect these to be people who have had long-lasting cultural relevance (therefore many successive casts have had chances to do their own impersonations). Hillary Clinton and Madonna are good examples of this. Other factors:
I happen to know that Matthew McConaughey's high count is mostly because of a single sketch with a gag involving 6 Matthew McConaugheys (a throwaway gag about the surfeit of white male cast members). I think the others are legit.
# Cast members with the most distinct impressions.
df = impressions.groupby('actor_name').size().sort_values(ascending=False)
df.head(10)
# Cast members with the most distinct recurring characters.
df = characters.groupby('actor_name').size().sort_values(ascending=False)
df.head(10)
TODO: Need to rerun full scrape since fixing voice role bug.
# Actors with the most voice-only roles.
# NOTE(review): app_counts is computed here but not used in this cell.
app_counts = appearances.groupby('actor_name').size().to_frame(name='roles')
voice_counts = apps[apps['voice']].groupby('actor_name').size().to_frame(name='voice_roles')
voice_counts.sort_values(by='voice_roles', ascending=False).head(10)
# TODO: buuuug
from collections import defaultdict
def get_app_counts(cond):
    """Per-actor appearance counts, broken down by kind.

    cond: boolean mask over the appearances table.
    Returns {actor_name: {'character'|'impression'|'other': count}}.
    """
    counts = defaultdict(lambda: defaultdict(int))
    for app in appearances[cond].itertuples():
        if pd.notnull(app.charid):
            kind = 'character'
        elif pd.notnull(app.impid):
            kind = 'impression'
        else:
            kind = 'other'
        counts[app.actor_name][kind] += 1
    return counts
def app_bars(cond=None, normalize=False):
    """Stacked bar chart of appearance types for the top 13 performers.

    cond: boolean array masking the appearances table.
    normalize: if True, each actor's bar is scaled to sum to 1.
    Returns the matplotlib Axes.
    """
    counts = get_app_counts(cond)
    # Top 13 actors by total appearances, most first.
    ranked = sorted(counts.items(), key=lambda tup: sum(tup[1].values()), reverse=1)
    ranked = ranked[:13]
    if normalize:
        def _norm(pair):
            actor, appcounts = pair
            total = sum(appcounts.values())
            return (actor, {k: v/total for k, v in appcounts.items()})
        ranked = [_norm(p) for p in ranked]
    # Reverse so the most prolific actor ends up at the top of the chart.
    ranked = ranked[::-1]
    actors = [p[0] for p in ranked]
    fig, ax = plt.subplots(figsize=(10, 6))
    acc = np.zeros(len(actors))
    x = range(len(actors))
    horizontal = True
    for kind in ['impression', 'character', 'other']:
        ys = [p[1][kind] for p in ranked]
        if horizontal:
            ax.barh(x, ys, left=acc, label=kind)
        else:
            ax.bar(x, ys, bottom=acc, label=kind)
        acc += ys
    ax.legend()
    if horizontal:
        ax.set_yticks(x)
        ax.set_yticklabels(actors)
    else:
        ax.set_xticks(x)
        ax.set_xticklabels(actors)
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)
    return ax
#app_bars(apps['sid']==39);
# Appearance-type breakdown for cast members over seasons 39+ (the trailing
# positional 0 is normalize=False).
ax = app_bars(
    (apps['sid']>=39)
    & (apps['capacity']=='cast')
    , 0
)
ax.set_title('Dist. of performance types (last 4 seasons)');
This is kind of cool. Some observations:
# (Tina Fey's tenure - '00 - '05)
# Same chart restricted to seasons 26-31.
ax = app_bars(
    (apps['sid']>=26)
    & (apps['sid']<=31)
    & (apps['capacity']=='cast')
    , normalize=False,
)
ax.set_title("Dist. of performance types ('00-'05)");
For her most recent hosting gig, Tina Fey actually did a monologue about how she had no memorable recurring characters.
This really makes it clear how focused Darrell Hammond was on impressions.
rec_cats = {'Film', 'Commercial', 'Cartoon'}
from collections import defaultdict
def rec_get_app_counts(cond):
    """Per-actor appearance counts, split live vs. recorded.

    cond: boolean mask over the appearances table. A title whose category is
    in rec_cats counts as 'recorded'; anything else as 'live'.
    Returns {actor_name: {'recorded'|'live': count}}.
    """
    counts = defaultdict(lambda: defaultdict(int))
    merged = appearances[cond].merge(titles, on='tid')
    for app in merged.itertuples():
        bucket = 'recorded' if app.category in rec_cats else 'live'
        counts[app.actor_name][bucket] += 1
    return counts
def rec_app_bars(cond=None, normalize=False):
    """Stacked bar chart of live vs. recorded appearances for the top 13
    performers.

    cond: boolean array masking the appearances table.
    normalize: if True, each actor's bar is scaled to sum to 1.
    Returns the matplotlib Axes.
    """
    counts = rec_get_app_counts(cond)
    # Top 13 actors by total appearances, most first.
    ranked = sorted(counts.items(), key=lambda tup: sum(tup[1].values()), reverse=1)
    ranked = ranked[:13]
    if normalize:
        def _norm(pair):
            actor, appcounts = pair
            total = sum(appcounts.values())
            return (actor, {k: v/total for k, v in appcounts.items()})
        ranked = [_norm(p) for p in ranked]
    # Reverse so the most prolific actor ends up at the top of the chart.
    ranked = ranked[::-1]
    actors = [p[0] for p in ranked]
    fig, ax = plt.subplots(figsize=(8, 6))
    acc = np.zeros(len(actors))
    x = range(len(actors))
    horizontal = True
    for kind in ['recorded', 'live']:
        ys = [p[1][kind] for p in ranked]
        if horizontal:
            ax.barh(x, ys, left=acc, label=kind)
        else:
            ax.bar(x, ys, bottom=acc, label=kind)
        acc += ys
    ax.legend()
    if horizontal:
        ax.set_yticks(x)
        ax.set_yticklabels(actors)
    else:
        ax.set_xticks(x)
        ax.set_xticklabels(actors)
        for tick in ax.get_xticklabels():
            tick.set_rotation(45)
    return ax
# Live vs. recorded breakdown (normalized), seasons 39+.
ax = rec_app_bars(
    (apps['sid']>=39)
    & (apps['capacity']=='cast')
    , True
)
ax.set_title('Last 4 seasons');
# Same for seasons 31-37.
ax = rec_app_bars(
    (apps['sid']>=31)
    & (apps['sid']<=37)
    & (apps['capacity']=='cast')
    , normalize=1,
)
ax.set_title('Mid-to-late 2000\'s');
Samberg definitely appears in a disproportionately high # of recorded segments, but it's not as dramatic a difference as I expected.
# adjustText nudges overlapping matplotlib text labels apart.
from adjustText import adjust_text
# TODO: There's probably a more effective way to visualize this. Maybe do a stacked bar
# for each performer, and arbitrarily assign a color to each distinct impression (with
# its height being the # of times it was performed)
# TODO: maybe helpful to color dots by featured vs. repertory
def scatter_impressions(sids, ax=None, transpose=True, characters=False):
    """Scatter each cast member of the given season(s): number of distinct
    impressions (or characters, if characters=True) vs. total such appearances.

    transpose flips which quantity goes on which axis: the boolean is used as
    a 0/1 offset into the [distinct, total] rows built below, so
    transpose=True puts total appearances on the x axis.
    """
    if isinstance(sids, int):
        sids = [sids]
    if ax is None:
        FS = (13, 6)
        fig, ax = plt.subplots(figsize=FS)
    ppl = cast_of_seasons(sids)
    xy = []
    sought_col = 'charid' if characters else 'impid'
    for actor in ppl.itertuples():
        # This actor's impression (or character) appearances in these seasons.
        idx = (apps['aid']==actor.aid) & apps['sid'].isin(sids) & pd.notnull(apps[sought_col])
        imp_apps = apps[idx]
        distinct = len(imp_apps[sought_col].unique())
        row = [distinct, len(imp_apps)]
        xy.append(row)
    xy = np.array(xy)
    # 0+transpose / 1-transpose swap the two columns when transpose is True.
    ax.scatter(xy[:,0+transpose], xy[:,1-transpose])
    texts = []
    for (coords, actor) in zip(xy, ppl.itertuples()):
        #ax.annotate(actor.name, coords)
        txt = ax.text(coords[0+transpose], coords[1-transpose], actor.name)
        texts.append(txt)
    # Nudge the name labels apart so they don't overlap.
    adjust_text(texts, arrowprops=dict(arrowstyle='->', lw=.5))
    thing = 'character' if characters else 'impression'
    ax_labels = ['# distinct {}s'.format(thing), '{} appearances'.format(thing.capitalize())]
    ax.set_xlabel(ax_labels[0+transpose])
    ax.set_ylabel(ax_labels[1-transpose])
    # x = y baseline: points on the line have no repeat performances.
    minmax = min(xy[:,0].max(), xy[:,1].max())
    ax.plot([0, minmax], [0, minmax], linestyle='--', lw=.5, color='b')
    #ax.plot([0, minmax*2], [0, minmax], linestyle='--', lw=.5, color='b')
    #ax.plot([0, minmax*3], [0, minmax], linestyle='--', lw=.5, color='b')
    return ax
# Impressions scatter for season 42.
scatter_impressions(42);
Beck Bennett is kind of a surprise. He did Putin 7 times in s42, Mike Pence 6 times, and Jake Tapper 5 times. Really shows how much political stuff they put on air last year. Cecily Strong's biggest recurring impression last year was Melania Trump, but she mostly did lots of one-offs as various debate moderators and news personalities.
It's pretty remarkable how far ahead of the pack Kate McKinnon is. She had a couple of huge political impressions last season (Hillary and Kellyanne Conway), but even though she was doing those every week, she still had a greater variety of impersonations than anyone else.
# Wider views: recent seasons, early-2000s seasons, and character versions.
fig, ax = plt.subplots(figsize=(14, 10))
scatter_impressions(range(35, 43), ax);
fig, ax = plt.subplots(figsize=(14, 10))
scatter_impressions(range(26, 32), ax);
scatter_impressions(42, characters=True);
scatter_impressions(range(38, 43), characters=True);
M_COLOR = 'dodgerblue'
F_COLOR = 'magenta'
def scatter_sketches_and_episodes():
    """Scatter career totals for every cast member -- total sketches vs.
    total episodes -- colored by gender. Returns the Axes."""
    df = tenure.copy()
    df['gender'] = actors.loc[df['aid'], 'gender'].values
    df['n_titles'] = df['aid'].apply(lambda aid: overall_actor_airtime(aid, 'n_titles'))
    fig, ax = plt.subplots(figsize=(15, 8))
    xs = df['n_titles']
    ys = df['n_episodes']
    is_female = df['gender'] == 'female'
    ax.scatter(xs[is_female], ys[is_female], alpha=.4, color=F_COLOR)
    ax.scatter(xs[~is_female], ys[~is_female], alpha=.6, color=M_COLOR)
    ax.set_xlabel('# sketches')
    ax.set_ylabel('# episodes')
    ax.set_xlim(left=0)
    ax.set_ylim(bottom=0)
    # Only label the outliers, to keep the plot readable.
    for n_sketches, n_eps, name in df[['n_titles', 'n_episodes', 'actor_name']].itertuples(index=False):
        if n_sketches >= 700 or n_eps >= 175:
            ax.annotate(name, xy=(n_sketches+10, n_eps-2.5))
    return ax
ax = scatter_sketches_and_episodes();
# Per-episode averages over each cast member's whole run, for all three
# airtime metrics (avg_episode_share uses the function's default metric).
df = tenure.copy()
df['avg_sketches_per_ep'] = df['aid'].apply(lambda aid: overall_avg_airtime(aid, 'n_titles'))
df['avg_episode_share'] = df['aid'].apply(overall_avg_airtime)
df['avg_cast_episode_share'] = df['aid'].apply(lambda aid: overall_avg_airtime(aid, 'cast_episode_share'))
df.sort_values(by='avg_episode_share', ascending=False).head(15)
In many cases, these are the MVP players of their time.
Some are just highly competent workhorses who were written into a lot of sketches but never really achieved star status during their time on the show or did any hugely popular recurring characters or impressions:
Not sure about the story with Michael McKean. He played Bill Clinton after Phil Hartman left, so I guess that helped. Also not sure about Randy Quaid.
Janeane Garofalo is the most surprising example. She hated her time on the show, and I got the impression she didn't get along well with many of the writers or other cast members. So how was she in so many sketches? When she was on, there were only 2 other female cast members, Ellen Cleghorne and Laura Kightlinger. Laura was a featured player and not used in many sketches, so I guess Janeane was the default choice when they needed a white lady in a sketch or for an impression? Still surprising.
df.sort_values(by='avg_episode_share', ascending=True).head(15)
Cast members with the fewest appearances per episode. Mostly players who just didn't manage to make much of an impact or get in a lot of sketches during their (short) time on the show.
Some were much more active with the show's writing, e.g. Robert Smigel, Al Franken, A. Whitney Brown, Fred Wolf, Jim Downey. The latter 2 were head writers for a while.
(#TODO: I think Emily Prager was legit not even in the single episode she was credited for. Laurie was in one, but that was one of the episodes that failed to scrape. George Coe has a higher n_episodes than he should - same with Michael O'Donoghue. snlarchive just has incomplete cast metadata for s1: http://www.snlarchives.net/Seasons/?1975)
# Who missed the most episodes during their tenure? (episodes aired during
# their run minus episodes they actually appeared in)
df = tenure.copy()
df['missed'] = df['n_episodes'] - df['eps_present']
df.sort_values(by='missed', ascending=False).head(20)
I'm guessing Maya Rudolph missed a bunch of episodes during and after her pregnancy.
I think I read somewhere that Mike Myers was allowed to miss some shows when he was shooting films. Maybe similar situation for David Spade?
I think most of the rest are just examples of performers not having any of their sketches make it to air, rather than them not being able to perform due to some extenuating circumstances.
Again, several were doing more writing than performing (Al Franken, Fred Wolf, Jim Downey).
Apparently A. Whitney Brown was a featured player in all 6 of his seasons. I wonder if that's a record. Hm, no, I guess not. Al Franken was a FP on all 11 of his seasons.
# Categories for live sketches
sketch_cats = ['Sketch', 'Musical Sketch', 'Show', 'Game Show', 'Award Show']
rec_cats = ['Film', 'Commercial', 'Cartoon']
other_cats = ['Guest Performance', 'Miscellaneous', 'In Memoriam', 'Talent Entrance', 'Intro',
              'Encore Presentation',
              ]
collapse_other = 1 # Give all 'other' categories the same color
# Build an RGB color per category: sketch cats vary the green channel,
# recorded cats the blue channel, and 'other' cats either share one color or
# vary red (when not collapsed). Channels are in [0, 255] at this point.
cat_to_color = {}
for i, skc in enumerate(sketch_cats):
    g = 120 + i * ((254-120)/(len(sketch_cats)-1))
    cat_to_color[skc] = (254, g, 86)
for i, skc in enumerate(rec_cats):
    b = 13 + i * ((254-13)/(len(rec_cats)-1))
    # Nudge Commercial so it doesn't look too similar to its neighbors.
    if skc == 'Commercial':
        b += 30
    cat_to_color[skc] = (86, 254, b)
for i, skc in enumerate(other_cats):
    if collapse_other:
        cat_to_color[skc] = (86, 86, 254)
    else:
        z = 13
        r = z + i * ((254-z)/(len(other_cats)-1))
        cat_to_color[skc] = (r, 86, 254)
# Whoops, matplotlib wants channels in [0,1]
cat_to_color = {cat: map(lambda ch: ch/255, color_tup) for (cat, color_tup) in cat_to_color.items()}
def cat_avg_per_season(cat, norms=None):
    """Average number of titles per episode, per season, for a category (or a
    list/set of categories combined).

    norms: optional per-season divisors (e.g. the totals across all
    categories, to turn the averages into fractions of the show).
    Returns a list with one value per season, in seasons-table order.
    """
    # I know there are cleverer ways to do these things, but working with a small dataset,
    # I'm enjoying the simplicity of for loops over torturing myself reading inscrutable
    # pandas documentation.
    # The category mask doesn't depend on the season, so build it once.
    if isinstance(cat, (list, set)):
        cat_cond = titles['category'].isin(cat)
    else:
        cat_cond = titles['category'] == cat
    res = []
    for i, season in enumerate(seasons.itertuples()):
        eps_this_season = (episodes['sid'] == season.sid).sum()
        cats_this_season = (cat_cond & (titles['sid'] == season.sid)).sum()
        avg = cats_this_season / eps_this_season
        if norms:
            avg = avg / norms[i]
        res.append(avg)
    return res
# Whether to normalize all bars to sum to 1
normalize = 1
norms = None
if normalize:
    allcats = set(cat_to_color.keys())
    norms = cat_avg_per_season(allcats)
bar_width = .8
sids = seasons['sid'].values
# Three small per-category charts stacked vertically.
FS = (15, 10)
fig, axes = plt.subplots(3, 1, figsize=FS)
for cat, ax in zip(['Film', 'Commercial', ['Show', 'Game Show']], axes):
    ax.set_title('{} segments per season'.format(cat))
    # A list of categories borrows the first category's color.
    if isinstance(cat, list):
        color = cat_to_color[cat[0]]
    else:
        color = cat_to_color[cat]
    y = cat_avg_per_season(cat, norms)
    ax.bar(sids, y, bar_width, color=color)
    ax.set_xlim(left=0)
    ax.set_xticks(range(1, sids[-1]+1))
# One stacked chart covering every category.
FS = (15, 7)
fig, ax = plt.subplots(figsize=FS)
acc = np.zeros(len(sids))
#for cat in sketch_cats + rec_cats + other_cats:
for cat in rec_cats + sketch_cats + other_cats:
    color = cat_to_color[cat]
    y = cat_avg_per_season(cat, norms)
    label = cat
    # When the 'other' categories are collapsed to one color, label only the
    # first of them so the legend shows a single 'Other' entry.
    if cat in other_cats and collapse_other:
        label = 'Other' if cat == 'Guest Performance' else None
    ax.bar(sids, y, bar_width, bottom=acc, color=color, label=label)
    acc += y
ax.legend(bbox_to_anchor=(1.15,.75))
ax.set_xlim(left=0)
ax.set_xticks(range(1, sids[-1]+1));
Some trends:
# TODO: Kinda slow (~20 s?)
def host_usage(aid, epid, metric):
    """Return the given airtime metric for one host on one episode.

    This is exactly total_airtime restricted to a single episode, so delegate
    to it rather than duplicating the tid-dedupe / per-title-sum logic (keeps
    the two definitions of the metrics from drifting apart).
    """
    assert metric in usage_metrics
    return total_airtime(aid, apps['epid']==epid, metric)
# Compute each usage metric for every hosting gig.
hosts_per_epid = hosts.groupby('epid').size()
usage = {mt: [] for mt in usage_metrics}
for host in hosts.itertuples():
    for metric in usage_metrics:
        dest = usage[metric]
        # Don't count shows with multiple hosts.
        if hosts_per_epid[host.epid] > 1:
            dest.append(None)
            continue
        x = host_usage(host.aid, host.epid, metric)
        dest.append(x)
df = hosts.copy()
for metric in usage_metrics:
    df[metric] = usage[metric]
df['n_titles'].plot.hist(title='Dist. of sketches appeared in by SNL hosts', figsize=(8,5));
# Hosts who appeared in the most sketches on their episode (raw counts).
df.sort_values(by='n_titles', ascending=False).head(10)
Wow, Betty White is a legend. She was in literally every single segment of the show except for Jay-Z's musical performances (she didn't want to upstage him I guess).
# Same as above, but normalized by number of sketches in the show
# (episode_share, so 1.0 means every eligible sketch).
df.sort_values(by='episode_share', ascending=False).head(10)
Props to Mulder and Will Ferrell. Also 100% attendance, their shows just had fewer sketches.
5 of the top 10 are former cast members, which isn't that surprising. They know how SNL works, and they're good at it.
# Same as the above but additionally weight each sketch by the inverse of
# the number of performers in it (cast_episode_share).
df.sort_values(by='cast_episode_share', ascending=False).head(10)
So, in some sense, Bob Newhart and Eddie Murphy personally 'owned' almost 40% of the shows they hosted, which is pretty impressive.
Rationale for this one is that carrying a sketch solo or with just one other performer is a lot of responsibility. A less adept host might get put in a lot of sketches, but with lots of cast members around them to do the heavy lifting.
Interesting that this metric diverges pretty significantly from the other two. I guess it's pretty volatile compared to the others. A sketch with 7 or 8 other performers (which is not that uncommon) is worth almost nothing under this metric compared to a solo sketch. Also of course fails to account for the size of roles. Your score might get dragged down just because of a few background roles with few or no lines.
Interesting that all these episodes are pre-y2k.
# Hosts who appeared in the fewest sketches
df.sort_values(by='n_titles', ascending=True).head(10)
# As above, but normalized by # sketches in show
df.sort_values(by='episode_share', ascending=True).head(30).reset_index(drop=True)
#df.sort_values(by='cast_episode_share', ascending=True).head(10)
"""Shelley Duvall is an interesting example. She was in 60% of sketches, but her 'cast_episode_share' is low because so many of them had huge numbers of performers. Like, look at all the people in this sketch: http://www.snlarchives.net/Episodes/?197705143
While I think it's true that a weak performer will probably never get a segment to themselves, or a duo sketch, I don't think the penalty should necessarily be linear with number of performers. You can hide a bad host in a sketch with 5 cast members just as easily as you can in one with 10.
(Another interesting feature of the Duvall episode is that there's seemingly no monologue?)""";
Some general comments on least-used hosts (over all metrics):
def g_lookup(aid):
    """Return the gender recorded in `actors` for actor id *aid*, or None
    when the id is not present in the actors table."""
    if aid in actors.index:
        return actors.loc[aid, 'gender']
    return None
# Gender of the performer, looked up from the actors table by actor id
impressions['actor_gender'] = impressions['aid'].apply(g_lookup)
# Gender of the impersonated character, guessed from the character's name
impressions['gender'] = impressions['name'].apply(lambda n: genderize(n, confident=True))
# Drag performances: 'queen' = man playing a woman, 'king' = woman playing a man
# NOTE(review): assumes `imps` aliases or derives from `impressions` — confirm upstream.
imps['queen'] = (imps['gender']=='female') & (imps['actor_gender']=='male')
imps['king'] = (imps['gender']=='male') & (imps['actor_gender']=='female')
queens = imps[imps['queen']]
kings = imps[imps['king']]
print "Found {} drag performances by male actors, and {} by female actors".format(
    len(queens), len(kings)
)
queens.head()
# Actors whose gender the name-based classifier couldn't pin down confidently
unk = actors[actors['gender']=='unknown']
mm = actors[actors['gender']=='mostly_male']
mf = actors[actors['gender']=='mostly_female']
def add_titles_gender_ratio():
    """Annotate the global `titles` frame with per-title gender counts.

    Adds three columns:
    - n_female / n_male: number of distinct female/male performers in the title
    - female_ratio: n_female / (n_female + n_male); NaN when both counts are 0
    """
    title_to_gendercounts = defaultdict(lambda: defaultdict(int))
    # Don't double-count actors appearing in multiple roles in one sketch
    tids_aids = apps.drop_duplicates(subset=['tid', 'aid'])
    for app in tids_aids.itertuples():
        # XXX: hack. Remove me after rerunning (fixed above in 2.1)
        if app.aid in {'c_PhHA', 'c_KeTH', 'c_TaKI', 'c_BiHA', 'c_JiFA', 'c_ChCH',}:
            gender = 'male'
        else:
            gender = actors.loc[app.aid, 'gender']
        title_to_gendercounts[app.tid][gender] += 1
    df = pd.DataFrame.from_dict(title_to_gendercounts, orient='index')
    # A title with e.g. no female performers gets NaN in that column; other
    # gender labels ('unknown', 'mostly_*') may also appear as extra columns,
    # but only the two confident labels feed the ratio.
    for col in ('female', 'male'):
        if col not in df:
            # Guard: the column is entirely absent if no title had that gender
            df[col] = 0.0
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which operates on a view and is deprecated in modern pandas.
        df[col] = df[col].fillna(0)
    titles['n_female'] = df['female']
    titles['n_male'] = df['male']
    titles['female_ratio'] = titles['n_female'] / (titles['n_female'] + titles['n_male'])
# Annotate `titles` with gender counts, then peek at the result
add_titles_gender_ratio()
titles.head(2)
# TODO: Maybe don't include voice roles? Don Pardo/Steve Higgins as announcer could
# really skew numbers.
# OTOH, those guys actually seem to account for a surprisingly small minority of 'announcer'
# roles. They usually go to cast members.
# There's also the question of whether such roles should count toward our accounting of
# male/female representation on the show (if everyone on screen in a sketch is male but
# a woman does the voiceover, do we want to call it a mixed gender sketch rather than all-male?)
#apps['role'].value_counts().sort_values(ascending=False).head(20)
#apps[apps['role']=='announcer'].sample(20)
# Title categories that count as Weekend Update (the segment's name varied over the years)
weekend_update_categories = {'Weekend Update', 'Saturday Night News', 'SNL Newsbreak'}
live_sketch_categories = {'Sketch', 'Musical Sketch', 'Show', 'Game Show', 'Award Show'}
# Pre-taped segments
recorded_sketch_categories = {'Film', 'Commercial'}
# (See note in items.py re Miscellaneous category)
misc_performer_categories = {'Cold Opening', 'Monologue', 'Miscellaneous'}
# These are the categories of titles that count when computing airtime statistics.
# Main omissions are Goodnights and Musical Performance. Also some rarer categories
# like Guest Performance, In Memoriam, Talent Entrance, etc.
performer_title_categories = set.union(
    misc_performer_categories, weekend_update_categories, live_sketch_categories, recorded_sketch_categories
)
# I'm not going to include weekend update. Because the anchors are fixed during a season (or more),
# I think it'll have a disproportionately large impact on a season's makeup.
# But idk, worth experimenting with.
gender_ratio_categories = set.union(live_sketch_categories, recorded_sketch_categories, misc_performer_categories)
# Also, don't include Monologues by default. The fact that they always include the host restricts whether
# it's possible to be all-male or all-female. And unlike all the other categories we're using, Monologues
# aren't very 'sketch-like' most of the time.
gender_ratio_categories -= {'Monologue'}
# Plot colors used for gender breakdowns throughout
M_COLOR = 'dodgerblue'
F_COLOR = 'magenta'
MIXED_COLOR = 'blanchedalmond'
def plot_gender_ratio(ax=None, normalized=True, monologues=False, min_performers=2):
    """Plot proportion of sketches per season which are all-male vs. all-female vs. mixed.

    ax: matplotlib axes to draw on (a new figure is created if None).
    normalized: if True, plot per-season fractions of eligible titles
        rather than raw counts.
    monologues: if True, also include the Monologue category (excluded by
        default; the host's guaranteed presence constrains its gender makeup).
    min_performers: only count titles with at least this many performers.
    Returns (ax, ratios) where ratios is an (n_seasons, 3) array with
    columns [all-female, mixed, all-male].
    """
    # Weekend Update is deliberately absent from gender_ratio_categories:
    # its anchors are fixed for a season (or more), which would have a
    # disproportionately large impact on a season's makeup.
    cats = gender_ratio_categories
    if monologues:
        cats = cats.union({'Monologue'})
    # Also worth playing around with limit on # performers.
    title_idx = (titles['category'].isin(cats)) & (titles['n_performers'] >= min_performers)
    # Boolean masks per gender bucket (removed the unused all_m/all_f/mixed lists)
    m_idx = title_idx & (titles['female_ratio'] == 0)
    f_idx = title_idx & (titles['female_ratio'] == 1)
    mixed_idx = title_idx & (titles['female_ratio'] > 0) & (titles['female_ratio'] < 1)
    ratios = np.zeros((len(seasons), 3))
    for i, sid in enumerate(seasons.index):
        sidx = titles['sid'] == sid
        row = [(sidx & gender_idx).sum() for gender_idx in [f_idx, mixed_idx, m_idx]]
        if normalized:
            denom = (sidx & title_idx).sum()
            row = [x / denom for x in row]
        ratios[i] = row
    if ax is None:
        fig, ax = plt.subplots(figsize=(15, 9))
    ax.stackplot(seasons.index, ratios.T, labels=['all female', 'mixed', 'all male'],
        colors=[F_COLOR, MIXED_COLOR, M_COLOR]
    )
    ax.set_xlabel('Season')
    ax.legend(bbox_to_anchor=(1.15, .5))
    ax.set_xlim(left=1, right=seasons.index.max())
    if normalized:
        ax.set_ylim(top=1)
    return ax, ratios
def plot_cast_gender(ax=None, include_featured=True, fractional=True):
    """Plot the number of male vs. female cast members per season.

    ax: matplotlib axes to draw on (a new figure is created if None).
    include_featured: if False, count only repertory players.
    fractional: if True, weight each cast member by 'season_fraction'
        (the fraction of the season they were present for) instead of
        counting heads.
    Returns the axes drawn on.
    """
    fm = []
    df = casts.merge(actors, on='aid')
    for sid in seasons.index:
        idx = (df['sid'] == sid)
        if not include_featured:
            idx = idx & ~df['featured']
        if fractional:
            m = df.loc[
                idx & (df['gender']=='male'),
                'season_fraction'
            ].sum()
            f = df.loc[
                idx & (df['gender']=='female'),
                'season_fraction'
            ].sum()
        else:
            acts = df[idx]
            m = (acts['gender'] == 'male').sum()
            f = (acts['gender'] == 'female').sum()
            # Sanity check: everyone should resolve to male or female here
            if (m+f) != len(acts):
                print("Season {} has m = {}, f = {}, cast size = {}".format(sid, m, f, len(acts)))
        fm.append([f, m])
    if ax is None:
        fig, ax = plt.subplots(figsize=(15, 5))
    fm = np.array(fm)
    x = seasons.index
    # (Removed the dead `bar = 0` grouped-bar branch; it could never run.)
    ax.plot(x, fm[:,1], label='male', color=M_COLOR)
    ax.plot(x, fm[:,0], label='female', color=F_COLOR)
    ax.legend()
    ax.set_title('Cast gender composition')
    ax.set_xlabel('Season')
    ax.set_xlim(left=1)
    ax.set_xticks(seasons.index, minor=True)
    # Tick every integer count; previously hard-coded to range(14), which
    # would silently drop ticks for casts larger than 13 members.
    ax.set_yticks(range(max(14, int(np.ceil(fm.max())) + 1)))
    ax.set_ylabel('# Cast members')
    return ax
# Two stacked views of cast gender: full cast, then repertory players only
FS = (15, 10)
fig, (ax1, ax2) = plt.subplots(2, figsize=FS)
plot_cast_gender(ax1)
plot_cast_gender(ax2, include_featured=False)
ax2.set_title('Cast gender composition (repertory players only)');
# Note: "# cast members" can actually be non-integral to account for cast members who
# were around for only part of a season. e.g. Ben Stiller was only in the last 6 out of 20 episodes
# of season 14, so he's about 1/3 of a dude in that season for accounting purposes.
# Note: This does not include host Monologues or Weekend Update, since they're pretty constrained
# in terms of cast. It just counts 'sketches' (including commercials, digital shorts, etc.)
# Also, does not include solo performances.
ax, dat = plot_gender_ratio(monologues=False)
ax.set_title('Gender composition of SNL sketches over time');
This is interesting! Some observations:
# Disabled: sweep the min_performers threshold and plot each variant
if 0:
    perfs_range = range(1, 7)
    FS = (14, 5 * len(perfs_range))
    fig, axes = plt.subplots(len(perfs_range), figsize=FS)
    for min_perfs, ax in zip(perfs_range, axes):
        plot_gender_ratio(ax, monologues=False, min_performers=min_perfs)
        title = 'Gender composition of sketches (min performers = {})'.format(min_perfs)
        ax.set_title(title)
def sample_monogender_sketches(gender, n=10, sid_range=None, min_performers=2):
    """Return a random sample of n titles whose performers are all *gender*.

    sid_range: optional (first, last) season ids, inclusive on both ends.
    min_performers: minimum number of performers for a title to qualify.
    """
    assert gender in ('male', 'female')
    # All-female titles have female_ratio 1; all-male have ratio 0.
    want_ratio = 1 if gender == 'female' else 0
    mask = titles['category'].isin(gender_ratio_categories)
    mask &= titles['n_performers'] >= min_performers
    mask &= titles['female_ratio'] == want_ratio
    if sid_range:
        first, last = sid_range
        mask &= (titles['sid'] >= first) & (titles['sid'] <= last)
    cols = ['category', 'name', 'epid', 'n_performers', 'n_female', 'n_male']
    # Fixed seed keeps the sampled rows stable across notebook runs
    return titles.loc[mask, cols].sample(n, random_state=1337)
# Random samples of all-female and all-male sketches for qualitative review
sample_monogender_sketches('female', 20)
sample_monogender_sketches('male', 20)
All-female sketches seem to be more explicitly about gender or 'girl stuff' (about half of the 20 above fit that description).
All-male sketches seem to be more wide ranging. Of the random 20 sampled above, there are only 2 or 3 that are about intrinsically or stereotypically male topics. ('Sigma', 'The Cardinals', and 'Father And Son').
(Er, the sampled sketches have changed since I wrote the above, but the broad strokes remain true.)
def plot_drag_over_time(ax=None):
    """Bar-plot per-season counts of drag performances.

    'queens' are male actors playing female characters; 'kings' are
    female actors playing male characters. Returns the axes drawn on.
    """
    merged = imps.merge(apps, on='impid')
    per_season = merged.groupby('sid')[['queen', 'king']].sum()
    if ax is None:
        fig, ax = plt.subplots(figsize=(15, 7))
    half = .4
    # Offset the two bar series so they sit side by side per season
    ax.bar(per_season.index - half/2, per_season['queen'].values, half, label='queens')
    ax.bar(per_season.index + half/2, per_season['king'].values, half, label='kings')
    ax.legend()
    return ax
# Drag performances per season, queens vs. kings
ax = plot_drag_over_time()
ax.set_title('Drag performances per season');
Men performing in drag has definitely gone in and out of vogue on SNL. Most of it happened in the 90's and 00's, with a pretty big peak around the mid-90's. (Which might charitably be described as its most 'frattish' period. Janeane Garofalo, who was a cast member in '95, has been outspoken about how misogynistic she found the culture and humor at the time. The Nora Dunn/Andrew Dice Clay scandal also happened earlier that decade.)
Something really remarkable happened starting in season 39. Performances by male actors in drag vanished entirely, and women in male parts skyrocketed. There are two significant factors about the timing of this:
Incidentally, there's a great passage from Tina Fey's book that's relevant to this topic:
The only other thing I remember about the Sylvester Stallone show was that they did a Rocky-themed monologue and they needed someone to play Rocky's wife, Adrian. Cheri really wanted the part – she was little, she was from Philly, she could do a good imitation of Talia Shire – but instead, somebody thought it would be funnier to put her co-star Chris Kattan in a dress. I remember thinking that was kind of bullshit.
[...] But I tell this specific tale of Cheri being passed over for Kattan-in-drag because it illustrates how things were the first week I was there. By the time I left nine years later, that would never have happened. The women in the cast took over the show in that decade, and I had the pleasure of being there to witness it.
from matplotlib.ticker import FormatStrFormatter
def plot_top_drag_performers(royal_col, n=10):
    """Horizontal bar chart of the top n actors by number of drag performances.

    royal_col: 'queen' (men playing women) or 'king' (women playing men).
    Returns the axes drawn on.
    """
    df = imps.merge(apps, on='impid')
    # Take the top n, then re-sort ascending so barh draws the largest bar on top
    drag_per_actor = (
        df.groupby('actor_name_x')[royal_col]
        .sum()
        .sort_values(ascending=False)
        .iloc[:n]
        .sort_values()
    )
    fig, ax = plt.subplots(figsize=(10, 7))
    nbars = min(n, len(drag_per_actor))
    y = range(nbars)
    # drag_per_actor is already truncated to n rows; the old [:n] slices were redundant
    ax.barh(y, drag_per_actor.values)
    ax.set_yticks(y)
    ax.set_yticklabels(drag_per_actor.index)
    # range() is end-exclusive: go one past the max so the top count keeps its tick
    ax.set_xticks(range(0, int(drag_per_actor.max()) + 1, 2))
    #ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    return ax
# Top female cast members by male roles ('kings')
# (There are 8 female cast members who have performed in drag more than once.
# Another 10 or so have done it exactly once, plus a couple hosts)
plot_top_drag_performers('king', n=8);
As expected, Kate McKinnon rules the roost. Her dominance is especially impressive given that she presumably has a few years left of her SNL career (she's done 5 seasons - Amy Poehler and Maya Rudolph each did 8).
We can also see that the peak that came before her corresponds to the Fey era of early-mid-ish 2000's with performers like Amy Poehler, Maya Rudolph, and Rachel Dratch.
Amusingly, Melissa McCarthy places 4th on the strength of her Sean Spicer impression alone.
# Top male cast members by female roles ('queens')
plot_top_drag_performers('queen', n=14);
Some notes:
# Disabled scratch cell: Tracy Morgan's impressions, and how many were female roles
if 0:
    tm = imps['actor_name']=='Tracy Morgan'
    print "Tracy morgan did {} impressions, {} female".format(tm.sum(), (tm & (imps['queen'])).sum())
    imps[tm]
# Disabled scratch cell: Terry Sweeney's impressions
if 0:
    ts = imps['actor_name']=='Terry Sweeney'
    imps[ts]
# Disabled scratch cell: drag roles by a few prolific impressionists
if 0:
    x = {'Darrell Hammond', 'Taran Killam', 'Bill Hader'}
    imps[
        imps['actor_name'].isin(x) & imps['queen']
    ]
Some threads that seem like they might be worth tugging on more...
Last season gave SNL its biggest ratings in decades because the election and early stages of Trump's presidency gave them tons of material, and they spun it into gold and pulled off a lot of successful impressions.
I think in the data on season 42 there's some compelling evidence for what a singularly political season it was.
But the data also suggests that this is, to some degree, a continuation of an ongoing trend toward more political content. Among Gerald Ford, Jimmy Carter, Ronald Reagan, and George Bush, none of them was impersonated as many times as any president that came after (Clinton, GWB, Obama, and yes, even Trump).
The data often makes it clear who's really good at doing impressions and who can't do them at all. Or who's getting written into a lot of sketches but never in starring roles.
Not yet explored but could be interesting:
I already wrote a lot about this above, but, as much as it's sort of a trite/expected topic, I do think the data offers a lot to unpack on this topic.
Explorations that are still in a messy stage / never went anywhere.
# Recurring sketches: rows of `titles` linked to a sketch id (skid)
rec = titles[pd.notnull(titles['skid'])]
# Number of installments per recurring sketch, joined back to sketch metadata
skid_counts = rec.groupby('skid').size().to_frame(name='count').reset_index(level=0)
skid_counts = pd.merge(skid_counts, sketches, on='skid')
skid_counts.sort_values(by='count', ascending=False, inplace=True)
skid_counts.head(20)
# Well this is kind of a mess!
def airtime_over_time(aid, metric='episode_share'):
    """Return (sids, values) for actor *aid*: the seasons they were on the
    cast, and their per-season average airtime under *metric*."""
    cast_rows = casts[casts['aid'] == aid]
    sids = cast_rows['sid'].values
    values = [airtime_one_season(row, metric) for row in cast_rows.itertuples()]
    return sids, values
def plot_airtime_over_time(aids, metric='episode_share'):
    """Plot average per-season airtime (by *metric*) for each actor id in
    *aids*: x axis is season, one line per actor. Returns the axes.
    """
    fig, ax = plt.subplots(figsize=(11, 5))
    for actor_id in aids:
        season_x, airtime_y = airtime_over_time(actor_id, metric)
        ax.plot(season_x, airtime_y, label=actors.loc[actor_id, 'name'])
    ax.set_xlabel('Season')
    ax.set_ylabel(metric)
    ax.legend()
    return ax
# NOTE(review): the id suffix casing here ('c_TaKi', 'c_KaMc', ...) differs from
# the all-caps suffixes used earlier (e.g. 'c_TaKI' in 2.1's hack list) — verify
# these match actors.index or the lookups will KeyError.
ax = plot_airtime_over_time(['c_TaKi', 'c_KaMc', 'c_CeSt', 'c_BoMo', 'c_VaBa']);
# Women who started around 2000-2001-ish
y2k = ['Tina Fey', 'Rachel Dratch', 'Maya Rudolph', 'Amy Poehler']
yaids = actors.loc[actors['name'].isin(y2k), 'aid']
ax = plot_airtime_over_time(yaids);
# Airtime table for one season, sorted by sketch count
ats = season_airtimes(39)
ats.head()
ats.sort_values(by='n_titles', ascending=False)
TODO: #s for Sasheer and Nasim seem pretty low. Are we denominating correctly for incomplete seasons? Actually, looks like Nasim was there for the whole season. How can she have been in fewer sketches than Kyle Mooney?
An average episode has around 15 segments. After subtracting goodnights and musical performances, that leaves around 12 for cast members to star in. So you only need an 'episode share' of a little under .1 to appear on average once a week.
# Solo performances: titles with exactly one performer, excluding
# monologues/Weekend Update, from season 25 onward
sts = titles[
    (titles['n_performers']==1)
    & ~(titles['category'].isin({'Monologue', 'Weekend Update'}))
    & (titles['sid'] >= 25)
]
solo_apps = sts.merge(apps, on='tid')
# Count solo segments per performer
solos_per_actor = solo_apps.groupby('actor_name').size()
solos_per_actor.sort_values(ascending=False).head(15)