Pandas Playground -- Series
import pandas as pd
if True:
    # A Series may hold values of mixed types; pandas then stores them
    # with the generic ``object`` dtype.
    series = pd.Series(['Dave', 'Cheng-Han', 'Udacity', 42, -1789710578])
    # print() with a single argument behaves the same on Python 2 and 3.
    print(series)
>>>>>>>>>>>
0 Dave
1 Cheng-Han
2 Udacity
3 42
4 -1789710578
dtype: object
'''
You can also manually assign indices to the items in the Series when
creating the series
'''
if True:
    # The ``index`` argument replaces the default 0..n-1 labels with
    # custom labels, one per value.
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    # print() with a single argument behaves the same on Python 2 and 3.
    print(series)
>>>>>>>>>>>>>
Instructor Dave
Curriculum Manager Cheng-Han
Course Number 359
Power Level 9001
dtype: object
'''
You can use index to select specific items from the Series
'''
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    # A single label returns the stored value itself.
    print(series['Instructor'])
    print("")
    # A list of labels returns a new Series holding just those items.
    print(series[['Instructor', 'Curriculum Manager', 'Course Number']])
>>>>>>>>>>>
Dave
Instructor Dave
Curriculum Manager Cheng-Han
Course Number 359
dtype: object
'''
You can also use boolean operators to select specific items from the Series
'''
if True:
    cuteness = pd.Series([1, 2, 3, 4, 5],
                         index=['Cockroach', 'Fish', 'Mini Pig',
                                'Puppy', 'Kitten'])
    # Comparing a Series with a scalar yields a boolean Series (a mask).
    print(cuteness > 3)
    print("")
    # Indexing with the mask keeps only the items where it is True.
    print(cuteness[cuteness > 3])
>>>>>>>>>>
Cockroach False
Fish False
Mini Pig False
Puppy True
Kitten True
dtype: bool
Puppy 4
Kitten 5
dtype: int64
Pandas Playground -- Dataframe
import numpy as np
import pandas as pd
'''
To create a dataframe, you can pass a dictionary of lists to the Dataframe
constructor:
1) The key of the dictionary will be the column name
2) The associated list will be the values within that column.
'''
if True:
    # Each dictionary key becomes a column name and each list becomes
    # that column's values; all lists must have the same length.
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    # print() with a single argument behaves the same on Python 2 and 3.
    print(football)
>>>>>>>>>>>>>>>>>>
losses team wins year
0 5 Bears 11 2010
1 8 Bears 8 2011
2 6 Bears 10 2012
3 1 Packers 15 2011
4 5 Packers 11 2012
5 10 Lions 6 2010
6 6 Lions 10 2011
7 12 Lions 4 2012
'''
Pandas also has various functions that will help you understand some basic
information about your data frame. Some of these functions are:
1) dtypes: to get the datatype for each column
2) describe: useful for seeing basic statistics of the dataframe's numerical
columns
3) head: displays the first five rows of the dataset
4) tail: displays the last five rows of the dataset
'''
if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    # Datatype of every column.
    print(football.dtypes)
    print("")
    # Summary statistics; only the numerical columns are summarised.
    print(football.describe())
    print("")
    # First five rows.
    print(football.head())
    print("")
    # Last five rows.
    print(football.tail())
>>>>>>>>>>>>>>>>>
losses int64
team object
wins int64
year int64
dtype: object
>>>>>>>>>>>>>>>>>
losses wins year
count 8.000000 8.000000 8.000000
mean 6.625000 9.375000 2011.125000
std 3.377975 3.377975 0.834523
min 1.000000 4.000000 2010.000000
25% 5.000000 7.500000 2010.750000
50% 6.000000 10.000000 2011.000000
75% 8.500000 11.000000 2012.000000
max 12.000000 15.000000 2012.000000
>>>>>>>>>>>>>>>>
losses team wins year
0 5 Bears 11 2010
1 8 Bears 8 2011
2 6 Bears 10 2012
3 1 Packers 15 2011
4 5 Packers 11 2012
>>>>>>>>>>>>>>>>>
losses team wins year
3 1 Packers 15 2011
4 5 Packers 11 2012
5 10 Lions 6 2010
6 6 Lions 10 2011
7 12 Lions 4 2012
Create a DataFrame
from pandas import DataFrame, Series
# Build a two-column pandas DataFrame named df whose columns are
# labeled 'name' and 'age'. Each column is supplied as its own Series.
people = ['Sarah', 'Mike', 'Chrisna']
ages = [28, 32, 25]
columns = {'name': Series(people), 'age': Series(ages)}
df = DataFrame(columns)
QUIZ
def create_dataframe():
    '''
    Create a pandas dataframe called 'olympic_medal_counts_df' containing
    the data from the table of 2014 Sochi winter olympics medal counts.

    The columns for this dataframe should be called
    'country_name', 'gold', 'silver', and 'bronze'.

    There is no need to specify row indexes for this dataframe
    (in this case, the rows will automatically be assigned numbered indexes).

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.

    Returns the DataFrame, with one row per country.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # The original template returned a name that was never assigned; build
    # the frame here so the function actually returns the requested table.
    olympic_medal_counts_df = DataFrame({'country_name': Series(countries),
                                         'gold': Series(gold),
                                         'silver': Series(silver),
                                         'bronze': Series(bronze)})
    return olympic_medal_counts_df
def creat_dateframe():
    '''
    Worked solution to the create_dataframe quiz: build the 2014 Sochi
    winter olympics medal counts as a pandas DataFrame.

    Two equivalent ways of constructing the frame are shown; the second
    one overwrites the result of the first.

    Returns a DataFrame with the columns 'country_name', 'gold', 'silver'
    and 'bronze', one row per country, with the default numbered index.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # Solution 1: pass the dictionary of Series straight to DataFrame.
    olympic_medal_counts_df = DataFrame({'country_name': Series(countries),
                                         'gold': Series(gold),
                                         'silver': Series(silver),
                                         'bronze': Series(bronze)})

    #######################################
    #######################################
    # Solution 2: name the dictionary first, then build the DataFrame from it.
    olympic_medal_counts = {'country_name': Series(countries), 'gold': Series(gold),
                            'silver': Series(silver), 'bronze': Series(bronze)}
    olympic_medal_counts_df = DataFrame(olympic_medal_counts)

    return olympic_medal_counts_df
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
bronze country_name gold silver
0 9 Russian Fed. 13 11
1 10 Norway 11 5
2 5 Canada 10 10
3 12 United States 9 7
4 9 Netherlands 8 7
5 5 Germany 8 6
6 2 Switzerland 6 3
7 1 Belarus 5 0
8 5 Austria 4 8
9 7 France 4 4
10 1 Poland 4 1
11 2 China 3 4
12 2 Korea 3 3
13 6 Sweden 2 7
14 2 Czech Republic 2 4
15 4 Slovenia 2 2
16 3 Japan 1 4
17 1 Finland 1 3
18 2 Great Britain 1 1
19 1 Ukraine 1 0
20 0 Slovakia 1 0
21 6 Italy 0 2
22 2 Latvia 0 2
23 1 Australia 0 2
24 0 Croatia 0 1
25 1 Kazakhstan 0 0
Indexing DataFrame
import pandas as pd
'''
You can think of a DataFrame as a group of Series that share an index.
This makes it easy to select specific columns that you want from the
DataFrame.
Also, a couple of pointers:
1) Selecting a single column from the DataFrame will return a Series
2) Selecting multiple columns from the DataFrame will return a DataFrame
'''
if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    # A single column label returns that column as a Series.
    print(football['year'])
    print(football.year)  # shorthand for football['year']
    # A list of column labels returns a DataFrame, in the order requested.
    print(football[['year', 'wins', 'losses']])
>>>>>>>>>>>>>>>>>>
0 2010
1 2011
2 2012
3 2011
4 2012
5 2010
6 2011
7 2012
Name: year, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>
0 2010
1 2011
2 2012
3 2011
4 2012
5 2010
6 2011
7 2012
Name: year, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>
year wins losses
0 2010 11 5
1 2011 8 8
2 2012 10 6
3 2011 15 1
4 2012 11 5
5 2010 6 10
6 2011 10 6
7 2012 4 12
'''
Row selection can be done through multiple ways.
Some of the basic and common methods are:
1) Slicing
2) An individual index (through the functions iloc or loc)
3) Boolean indexing
You can also combine multiple selection requirements through boolean
operators like & (and) or | (or)
'''
if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    # Individual row by integer position; the list keeps the result a DataFrame.
    print(football.iloc[[0]])
    # Individual row by index label.
    print(football.loc[[0]])
    # Slicing: rows at positions 3 and 4 (the end of the slice is excluded).
    print(football[3:5])
    # Boolean indexing: keep the rows where the condition holds.
    print(football[football.wins > 10])
    # Conditions are combined with & (and) / | (or); each one needs its own
    # parentheses because & and | bind tighter than the comparisons.
    print(football[(football.wins > 10) & (football.team == "Packers")])
>>>>>>>>>>>>>
losses team wins year
0 5 Bears 11 2010
>>>>>>>>>>>>>
losses team wins year
0 5 Bears 11 2010
>>>>>>>>>>>>>
losses team wins year
3 1 Packers 15 2011
4 5 Packers 11 2012
>>>>>>>>>>>>>
losses team wins year
0 5 Bears 11 2010
3 1 Packers 15 2011
4 5 Packers 11 2012
>>>>>>>>>>>>>
losses team wins year
3 1 Packers 15 2011
4 5 Packers 11 2012
Pandas Vectorized Methods
As a refresher on lambda, lambda functions are small inline functions that are defined on-the-fly in Python. lambda x: x >= 1
will take an input x and return x >= 1, or a boolean that equals True or False.
In this example, map() and applymap() create a new Series or DataFrame by applying the lambda function to each element.
Note that map() can only be used on a Series to return a new Series and applymap()
can only be used on a DataFrame to return a new DataFrame.
QUIZ
import numpy
from pandas import DataFrame, Series
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at
    least one medal of any kind at the 2014 Sochi olympics. Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.

    Returns a Series indexed by 'gold', 'silver' and 'bronze' holding the
    mean of each medal column.
    '''
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea',
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name': countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)

    # Solution: select only the numeric medal columns (country_name is text
    # and cannot be averaged), then apply numpy.mean to each column.
    # The local name intentionally matches the Series name the quiz asks for.
    avg_medal_count = df[['gold', 'silver', 'bronze']].apply(numpy.mean)
    return avg_medal_count
>>>>>>>>>>>>>>>>>>>>>>>
gold 3.807692
silver 3.730769
bronze 3.807692