Powered by GitBook

Pandas Playground -- Series

import pandas as pd

if True:
    series = pd.Series(['Dave', 'Cheng-Han', 'Udacity', 42, -1789710578])
    print series
>>>>>>>>>>>
0           Dave
1      Cheng-Han
2        Udacity
3             42
4    -1789710578
dtype: object

'''
You can also manually assign indices to the items in the Series when
creating the series
'''
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    print series
>>>>>>>>>>>>>
Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
Power Level                9001
dtype: object

'''
You can use index to select specific items from the Series
'''
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    print series['Instructor']
    print ""
    print series[['Instructor', 'Curriculum Manager', 'Course Number']]
>>>>>>>>>>>
Dave

Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
dtype: object

'''
You can also use boolean operators to select specific items from the Series
'''
if True:
    cuteness = pd.Series([1, 2, 3, 4, 5], index=['Cockroach', 'Fish', 'Mini Pig',
                                                 'Puppy', 'Kitten'])
    print cuteness > 3
    print ""
    print cuteness[cuteness > 3]
>>>>>>>>>>
Cockroach    False
Fish         False
Mini Pig     False
Puppy         True
Kitten        True
dtype: bool

Puppy     4
Kitten    5
dtype: int64

Pandas Playground -- Dataframe

import numpy as np
import pandas as pd

'''
To create a dataframe, you can pass a dictionary of lists to the Dataframe
constructor:
1) The key of the dictionary will be the column name
2) The associating list will be the values within that column.
'''

if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    print football
>>>>>>>>>>>>>>>>>>
   losses     team  wins  year
0       5    Bears    11  2010
1       8    Bears     8  2011
2       6    Bears    10  2012
3       1  Packers    15  2011
4       5  Packers    11  2012
5      10    Lions     6  2010
6       6    Lions    10  2011
7      12    Lions     4  2012

'''
Pandas also has various functions that will help you understand some basic
information about your data frame. Some of these functions are:
1) dtypes: to get the datatype for each column
2) describe: useful for seeing basic statistics of the dataframe's numerical
   columns
3) head: displays the first five rows of the dataset
4) tail: displays the last five rows of the dataset
'''

if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    print football.dtypes
    print ""
    print football.describe()
    print ""
    print football.head()
    print ""
    print football.tail()
>>>>>>>>>>>>>>>>>
losses     int64
team      object
wins       int64
year       int64
dtype: object
>>>>>>>>>>>>>>>>>
          losses       wins         year
count   8.000000   8.000000     8.000000
mean    6.625000   9.375000  2011.125000
std     3.377975   3.377975     0.834523
min     1.000000   4.000000  2010.000000
25%     5.000000   7.500000  2010.750000
50%     6.000000  10.000000  2011.000000
75%     8.500000  11.000000  2012.000000
max    12.000000  15.000000  2012.000000
>>>>>>>>>>>>>>>>
   losses     team  wins  year
0       5    Bears    11  2010
1       8    Bears     8  2011
2       6    Bears    10  2012
3       1  Packers    15  2011
4       5  Packers    11  2012
>>>>>>>>>>>>>>>>>
   losses     team  wins  year
3       1  Packers    15  2011
4       5  Packers    11  2012
5      10    Lions     6  2010
6       6    Lions    10  2011
7      12    Lions     4  2012

Creat a DataFrame

from pandas import DataFrame, Series

# The following code would create a two-column pandas DataFrame
# named df with columns labeled 'name' and 'age':

people = ['Sarah', 'Mike', 'Chrisna']
 ages  =  [28, 32, 25]
 df = DataFrame({'name':Series(people),
                 'age' :Series(ages) })

QUIZ

def create_dataframe():
    '''
    Create a pandas dataframe called 'olympic_medal_counts_df' containing
    the data from the table of 2014 Sochi winter olympics medal counts.  

    The columns for this dataframe should be called 
    'country_name', 'gold', 'silver', and 'bronze'.  

    There is no need to  specify row indexes for this dataframe 
    (in this case, the rows will automatically be assigned numbered indexes).

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # your code here

    return olympic_medal_counts_df

def creat_dateframe():
    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    olympic_medal_counts_df = DataFrame({'country_name':Series(countries),
                                         'gold'        :Series(gold),                   
                                         'silver'      :Series(silver),
                                         'bronze'      :Series(bronze)})
    #######################################
    #######################################
    solution 2
    olympic_medal_counts = {'country_name':Series(countries),'gold':Series(gold),  
                            'silver':Series(silver), 'bronze':Series(bronze)}
    olympic_medal_counts_df = DataFrame(olympic_medal_counts )


    return olympic_medal_counts_d

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
    bronze    country_name  gold  silver
0        9    Russian Fed.    13      11
1       10          Norway    11       5
2        5          Canada    10      10
3       12   United States     9       7
4        9     Netherlands     8       7
5        5         Germany     8       6
6        2     Switzerland     6       3
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1       Australia     0       2
24       0         Croatia     0       1
25       1      Kazakhstan     0       0

Indexing DataFrame

import pandas as pd

'''
You can think of a DataFrame as a group of Series that share an index.
This makes it easy to select specific columns that you want from the 
DataFrame. 

Also a couple pointers:
1) Selecting a single column from the DataFrame will return a Series
2) Selecting multiple columns from the DataFrame will return a DataFrame

if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    print football['year']
    print football.year  # shorthand for football['year']
    print football[['year', 'wins', 'losses']]
>>>>>>>>>>>>>>>>>>
0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>
0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64
>>>>>>>>>>>>>>>>>>>>>>
   year  wins  losses
0  2010    11       5
1  2011     8       8
2  2012    10       6
3  2011    15       1
4  2012    11       5
5  2010     6      10
6  2011    10       6
7  2012     4      12

'''
Row selection can be done through multiple ways.

Some of the basic and common methods are:
   1) Slicing
   2) An individual index (through the functions iloc or loc)
   3) Boolean indexing

You can also combine multiple selection requirements through boolean
operators like & (and) or | (or)
'''


if True:
    data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
    football = pd.DataFrame(data)
    print football.iloc[[0]]
    print football.loc[[0]]
    print football[3:5]
    print football[football.wins > 10]
    print football[(football.wins > 10) & (football.team == "Packers")]
>>>>>>>>>>>>>
   losses   team  wins  year
0       5  Bears    11  2010
>>>>>>>>>>>>>
   losses   team  wins  year
0       5  Bears    11  2010
>>>>>>>>>>>>>
   losses     team  wins  year
3       1  Packers    15  2011
4       5  Packers    11  2012
>>>>>>>>>>>>>
   losses     team  wins  year
0       5    Bears    11  2010
3       1  Packers    15  2011
4       5  Packers    11  2012
>>>>>>>>>>>>>
   losses     team  wins  year
3       1  Packers    15  2011
4       5  Packers    11  2012

Pandas Vectorized Methods

As a refresher on lambda, lambda functions are small inline functions that are defined on-the-fly in Python.lambda x: x>= 1will take an input x and return x>=1, or a boolean that equalsTrueorFalse.

In this example,map()andapplymap()create a new Series or DataFrame by applying the lambda function to each element. Note thatmap()can only be used on a Series to return a new Series andapplymap()can only be used on a DataFrame to return a new DataFrame.

QUIZ

import numpy
from pandas import DataFrame, Series


def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.

    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}    
    df = DataFrame(olympic_medal_counts)

    # YOUR CODE HERE

    return avg_medal_count

def avg_medal_count():
    avg_medal_count = df[['gold','silver','bronze']].apply(numpy.mean)
    return avg_medal_count
>>>>>>>>>>>>>>>>>>>>>>>
gold      3.807692
silver    3.730769
bronze    3.807692

results matching ""

No results matching ""