import pandas as pd
medals = pd.read_csv('Summer Olympic medallists 1896 to 2008 - ALL MEDALISTS.csv',skiprows=4)
print medals.shape
medals.head()
medals.info()
# Select the 'NOC' column of medals: country_names
country_names = medals['NOC']
print type(country_names)
country_names.head()
# Count the number of medals won by each country: medal_counts
medal_counts = country_names.value_counts()
print type(medal_counts)
medal_counts.head()
medals.head(3)
# Construct the pivot table: counted
counted = medals.pivot_table(index='NOC', values='Athlete', columns='Medal', aggfunc='count')
counted.head(7)
# Create the new column: counted['totals']
counted['totals'] = counted.sum(axis='columns')
counted.head(7)
# Sort counted by the 'totals' column
counted = counted.sort_values('totals', ascending=False)
counted.head(7)
# Select columns: ev_gen
ev_gen = medals[['Event_gender', 'Gender']]
print ev_gen.shape
ev_gen.head()
# Drop duplicate pairs: ev_gen_uniques
ev_gen_uniques = ev_gen.drop_duplicates()
ev_gen_uniques
# Group medals by the two columns: medals_by_gender
medals_by_gender = medals.groupby(['Event_gender', 'Gender'])
# Create a DataFrame with a group count: medal_count_by_gender
medal_count_by_gender = medals_by_gender.count()
medal_count_by_gender
# Create the Boolean Series: sus
sus = (medals.Event_gender == 'W') & (medals.Gender == 'Men')
sus.head()
# Create a DataFrame with the suspicious row: suspect
suspect = medals[sus]
suspect
print len(medals['Sport'].unique())
medals['Sport'].unique()
# Group medals by 'NOC': country_grouped
country_grouped = medals.groupby('NOC')
# Compute the number of distinct sports in which each country won medals: Nsports
Nsports = country_grouped['Sport'].nunique()
Nsports.head()
country_grouped['Sport'].count().head()
# Sort the values of Nsports in descending order
Nsports = Nsports.sort_values(ascending=False)
Nsports.head()
country_grouped['Sport'].count().sort_values(ascending = False).head()
count = medals.groupby('NOC')['Medal'].count()
count.head()
count.index
count.idxmax()
count.idxmin()
import numpy as np
count2 = medals.groupby('NOC')['Medal','Event'].count()
count2['Event']=count2['Event'].apply(lambda x: x+np.random.randint(-10,5))
count2.head()
count2.head()
count2.idxmax(1).value_counts()
count2.idxmax(axis='columns').head()
# Extract all rows for which the 'Edition' is between 1952 & 1988: during_cold_war
during_cold_war = (medals.Edition>=1952) & (medals.Edition<=1988)
print during_cold_war.value_counts()
during_cold_war.head()
# Extract rows for which 'NOC' is either 'USA' or 'URS': is_usa_urs
is_usa_urs = medals.NOC.isin(['USA', 'URS'])
print is_usa_urs.value_counts()
is_usa_urs.head()
# Use during_cold_war and is_usa_urs to create the DataFrame: cold_war_medals
cold_war_medals = medals.loc[during_cold_war & is_usa_urs]
print cold_war_medals.shape
cold_war_medals.head()
# Group cold_war_medals by 'NOC'
country_grouped = cold_war_medals.groupby('NOC')
# Create Nsports
Nsports = country_grouped['Sport'].nunique().sort_values(ascending=False)
print Nsports.shape
Nsports
country_grouped['Sport'].nunique().head()
cold_war_medals.groupby('NOC').count()
# Create the pivot table: medals_won_by_country
medals_won_by_country = medals.pivot_table(index='Edition', columns='NOC', values='Athlete', aggfunc='count')
medals_won_by_country.head()
# Slice medals_won_by_country: cold_war_usa_usr_medals
cold_war_usa_usr_medals = medals_won_by_country.loc[1952:1988, ['USA','URS']]
cold_war_usa_usr_medals
# Create most_medals
most_medals = cold_war_usa_usr_medals.idxmax(axis='columns')
most_medals
import matplotlib.pyplot as plt
%matplotlib inline
# Create the DataFrame: usa
usa = medals[medals.NOC == 'USA']
# Group usa by ['Edition', 'Medal'] and aggregate over 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
usa_medals_by_year.head(10)
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
usa_medals_by_year.head(10)
# Plot the DataFrame usa_medals_by_year
usa_medals_by_year.plot()
plt.show()
# Create the DataFrame: usa
usa = medals[medals.NOC == 'USA']
# Group usa by 'Edition', 'Medal', and 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Create an area plot of usa_medals_by_year
usa_medals_by_year.plot.area(alpha=.5)
plt.show()
# Redefine 'Medal' as an ordered categorical
medals.Medal = pd.Categorical(values=medals.Medal, categories=['Bronze', 'Silver', 'Gold'], ordered=True)
medals.Medal.head()
# Create the DataFrame: usa
usa = medals[medals.NOC == 'USA']
# Group usa by 'Edition', 'Medal', and 'Athlete'
usa_medals_by_year = usa.groupby(['Edition', 'Medal'])['Athlete'].count()
# Reshape usa_medals_by_year by unstacking
usa_medals_by_year = usa_medals_by_year.unstack(level='Medal')
# Create an area plot of usa_medals_by_year
usa_medals_by_year.plot.area(alpha=.6)
plt.show()