{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Pandas : Examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reading CSV" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameCity
0Donald DuckCopenhagen
1GoofyAarhus
2Mickey MouseAarhus
\n", "
" ], "text/plain": [ " Name City\n", "0 Donald Duck Copenhagen\n", "1 Goofy Aarhus\n", "2 Mickey Mouse Aarhus" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "students = pd.read_csv('students.csv')\n", "students" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading Pandas data frames from sqlite3" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import sqlite3\n", "\n", "connection = sqlite3.connect('example.sqlite')\n", "\n", "countries = pd.read_sql_query('SELECT * FROM country', connection)\n", "cities = pd.read_sql_query('SELECT * FROM city', connection)\n", "\n", "students.to_sql('students', connection, if_exists='replace')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(pandas.core.frame.DataFrame,\n", " pandas.core.frame.DataFrame,\n", " pandas.core.frame.DataFrame)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(countries), type(cities), type(students) # Pandas data frame" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepopulationareacapital
0Denmark574876942931Copenhagen
1Germany82800000357168Berlin
2USA3257191789833520Washington, D.C.
3Iceland334252102775Reykjavik
\n", "
" ], "text/plain": [ " name population area capital\n", "0 Denmark 5748769 42931 Copenhagen\n", "1 Germany 82800000 357168 Berlin\n", "2 USA 325719178 9833520 Washington, D.C.\n", "3 Iceland 334252 102775 Reykjavik" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries # looks nice, because of Jupyter notebook integration" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " name population area capital\n", "0 Denmark 5748769 42931 Copenhagen\n", "1 Germany 82800000 357168 Berlin\n", "2 USA 325719178 9833520 Washington, D.C.\n", "3 Iceland 334252 102775 Reykjavik\n" ] } ], "source": [ "print(countries) # does not exploit Jupyter notebook integration" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountrypopulationestablished
0CopenhagenDenmark775033800
1AarhusDenmark273077750
2BerlinGermany37119301237
3MunichGermany14643011158
4ReykjavikIceland126100874
5Washington, D.C.USA6939721790
6New OrleansUSA3438291718
7San FranciscoUSA8843631776
\n", "
" ], "text/plain": [ " name country population established\n", "0 Copenhagen Denmark 775033 800\n", "1 Aarhus Denmark 273077 750\n", "2 Berlin Germany 3711930 1237\n", "3 Munich Germany 1464301 1158\n", "4 Reykjavik Iceland 126100 874\n", "5 Washington, D.C. USA 693972 1790\n", "6 New Orleans USA 343829 1718\n", "7 San Francisco USA 884363 1776" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indexing data frames" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Denmark\n", "1 Germany\n", "2 USA\n", "3 Iceland\n", "Name: name, dtype: object" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries['name'] # single column" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 Denmark\n", "1 Germany\n", "2 USA\n", "3 Iceland\n", "Name: name, dtype: object" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.name # can also access columns by .name" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecapital
0DenmarkCopenhagen
1GermanyBerlin
2USAWashington, D.C.
3IcelandReykjavik
\n", "
" ], "text/plain": [ " name capital\n", "0 Denmark Copenhagen\n", "1 Germany Berlin\n", "2 USA Washington, D.C.\n", "3 Iceland Reykjavik" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries[['name', 'capital']] # select multiple columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepopulationareacapital
0Denmark574876942931Copenhagen
1Germany82800000357168Berlin
\n", "
" ], "text/plain": [ " name population area capital\n", "0 Denmark 5748769 42931 Copenhagen\n", "1 Germany 82800000 357168 Berlin" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.head(2) # first two rows" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepopulationareacapital
3Iceland334252102775Reykjavik
\n", "
" ], "text/plain": [ " name population area capital\n", "3 Iceland 334252 102775 Reykjavik" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.tail(1) # not a row but a data frame with one row\n", " # notice row label is unchanged" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepopulationareacapital
1Germany82800000357168Berlin
2USA3257191789833520Washington, D.C.
\n", "
" ], "text/plain": [ " name population area capital\n", "1 Germany 82800000 357168 Berlin\n", "2 USA 325719178 9833520 Washington, D.C." ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries[1:3] # row slicing by row lables" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namepopulationareacapital
0Denmark574876942931Copenhagen
2USA3257191789833520Washington, D.C.
\n", "
" ], "text/plain": [ " name population area capital\n", "0 Denmark 5748769 42931 Copenhagen\n", "2 USA 325719178 9833520 Washington, D.C." ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries[::2] # every 2nd row" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Washington, D.C.'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.at[2, 'capital'] # use .at to lookup single cell" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "name Aarhus\n", "country Denmark\n", "population 273077\n", "established 750\n", "Name: 1, dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities.loc[1] # single row is accessed using .loc[row lable]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Aarhus'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities.loc[1]['name'] # another way to get a single value" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Aarhus'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities.loc[1, 'name'] # and yet another way" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countryname
1DenmarkAarhus
3GermanyMunich
\n", "
" ], "text/plain": [ " country name\n", "1 Denmark Aarhus\n", "3 Germany Munich" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities.loc[[1, 3], ['country', 'name']] # extract sub data frame" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Masking rows" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 True\n", "1 True\n", "2 False\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "7 False\n", "Name: country, dtype: bool" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mask = cities['country'] == 'Denmark'\n", "mask" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountrypopulationestablished
0CopenhagenDenmark775033800
1AarhusDenmark273077750
\n", "
" ], "text/plain": [ " name country population established\n", "0 Copenhagen Denmark 775033 800\n", "1 Aarhus Denmark 273077 750" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities[mask] # use a boolean data frame as a mask" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountrypopulationestablished
0CopenhagenDenmark775033800
1AarhusDenmark273077750
\n", "
" ], "text/plain": [ " name country population established\n", "0 Copenhagen Denmark 775033 800\n", "1 Aarhus Denmark 273077 750" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities[cities.country == 'Denmark'] # or shorter" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
namecountrypopulationestablished
1AarhusDenmark273077750
\n", "
" ], "text/plain": [ " name country population established\n", "1 Aarhus Denmark 273077 750" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# can also do operations on columns +, -, &, |, ...\n", "cities[(cities.country == 'Denmark') & (cities.established < 800)]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 775833\n", "1 273827\n", "2 3713167\n", "3 1465459\n", "4 126974\n", "5 695762\n", "6 345547\n", "7 886139\n", "dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cities.population + cities.established # not very meaningfull but you can do it.." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating data frames from data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
010
111
212
313
\n", "
" ], "text/plain": [ " 0\n", "0 10\n", "1 11\n", "2 12\n", "3 13" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([10, 11, 12, 13]) # one dimensional data" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
01one
12two
23three
\n", "
" ], "text/plain": [ " A B\n", "0 1 one\n", "1 2 two\n", "2 3 three" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame({'A': [1,2,3], 'B': ['one', 'two', 'three']}) # data frame from dictionary" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01
01011
11213
\n", "
" ], "text/plain": [ " 0 1\n", "0 10 11\n", "1 12 13" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([[10, 11], [12, 13]]) # two dimensional list" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
01011
11213
\n", "
" ], "text/plain": [ " A B\n", "0 10 11\n", "1 12 13" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([[10, 11], [12, 13]], columns=['A', 'B']) # name columns" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
x1011
y1213
\n", "
" ], "text/plain": [ " A B\n", "x 10 11\n", "y 12 13" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame([[10, 11], [12, 13]], \n", " columns=['A', 'B'],\n", " index=['x', 'y']) # can also assign row labels" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0.57252247, 0.60743944, 0.05473956, 0.79662469],\n", " [0.04944432, 0.15741632, 0.13890316, 0.81421765],\n", " [0.98212556, 0.77183968, 0.06285572, 0.31457113]])" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "A = np.random.random((3, 4))\n", "A" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
00.5725220.6074390.0547400.796625
10.0494440.1574160.1389030.814218
20.9821260.7718400.0628560.314571
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 0.572522 0.607439 0.054740 0.796625\n", "1 0.049444 0.157416 0.138903 0.814218\n", "2 0.982126 0.771840 0.062856 0.314571" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(A)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
y0.5725220.6074390.0547400.796625
x0.0494440.1574160.1389030.814218
z0.9821260.7718400.0628560.314571
\n", "
" ], "text/plain": [ " A B C D\n", "y 0.572522 0.607439 0.054740 0.796625\n", "x 0.049444 0.157416 0.138903 0.814218\n", "z 0.982126 0.771840 0.062856 0.314571" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "R = pd.DataFrame(A, columns=list('ABCD'), index=list('yxz'))\n", "R" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Labelled rows .loc and .iloc" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BC
x0.1574160.138903
z0.7718400.062856
\n", "
" ], "text/plain": [ " B C\n", "x 0.157416 0.138903\n", "z 0.771840 0.062856" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "R.loc['x':'z', ['B', 'C']] # row slicing is now by labels (inclusive)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.13890315739933812" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "R.iloc[1, 2] # .iloc can be used to index with integer numbers" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AB
y0.5725220.607439
x0.0494440.157416
\n", "
" ], "text/plain": [ " A B\n", "y 0.572522 0.607439\n", "x 0.049444 0.157416" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "R.iloc[:2, :2] # two first first rows and columns\n", "# R[:2, :2] is invalid !!!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mering two data frames" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
name_xpopulation_xareacapitalname_ycountrypopulation_yestablished
0Denmark574876942931CopenhagenCopenhagenDenmark775033800
1Germany82800000357168BerlinBerlinGermany37119301237
2USA3257191789833520Washington, D.C.Washington, D.C.USA6939721790
3Iceland334252102775ReykjavikReykjavikIceland126100874
\n", "
" ], "text/plain": [ " name_x population_x area capital name_y \\\n", "0 Denmark 5748769 42931 Copenhagen Copenhagen \n", "1 Germany 82800000 357168 Berlin Berlin \n", "2 USA 325719178 9833520 Washington, D.C. Washington, D.C. \n", "3 Iceland 334252 102775 Reykjavik Reykjavik \n", "\n", " country population_y established \n", "0 Denmark 775033 800 \n", "1 Germany 3711930 1237 \n", "2 USA 693972 1790 \n", "3 Iceland 126100 874 " ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# data frames can be merged\n", "# note that 'name' and 'population' are in both original data frames\n", "M = pd.merge(countries, cities, left_on='capital', right_on='name')\n", "M " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Renaming, dropping and adding columns" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_populationareacapitalcountrycapital_populationestablished
0574876942931CopenhagenDenmark775033800
182800000357168BerlinGermany37119301237
23257191789833520Washington, D.C.USA6939721790
3334252102775ReykjavikIceland126100874
\n", "
" ], "text/plain": [ " country_population area capital country capital_population \\\n", "0 5748769 42931 Copenhagen Denmark 775033 \n", "1 82800000 357168 Berlin Germany 3711930 \n", "2 325719178 9833520 Washington, D.C. USA 693972 \n", "3 334252 102775 Reykjavik Iceland 126100 \n", "\n", " established \n", "0 800 \n", "1 1237 \n", "2 1790 \n", "3 874 " ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "M1 = M.rename(columns={\n", " 'population_x': 'country_population',\n", " 'population_y': 'capital_population'\n", "})\n", "M2 = M1.drop(columns=['name_x', 'name_y'])\n", "M2" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_populationareacapitalcountrycapital_populationestablishedempty column
0574876942931CopenhagenDenmark775033800None
182800000357168BerlinGermany37119301237None
23257191789833520Washington, D.C.USA6939721790None
3334252102775ReykjavikIceland126100874None
\n", "
" ], "text/plain": [ " country_population area capital country capital_population \\\n", "0 5748769 42931 Copenhagen Denmark 775033 \n", "1 82800000 357168 Berlin Germany 3711930 \n", "2 325719178 9833520 Washington, D.C. USA 693972 \n", "3 334252 102775 Reykjavik Iceland 126100 \n", "\n", " established empty column \n", "0 800 None \n", "1 1237 None \n", "2 1790 None \n", "3 874 None " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "M2['empty column'] = None # add new column to existing data frame\n", "M2" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_populationareacapitalcountrycapital_populationestablishedempty column%pop in capital
0574876942931CopenhagenDenmark775033800None0.134817
182800000357168BerlinGermany37119301237None0.044830
23257191789833520Washington, D.C.USA6939721790None0.002131
3334252102775ReykjavikIceland126100874None0.377260
\n", "
" ], "text/plain": [ " country_population area capital country capital_population \\\n", "0 5748769 42931 Copenhagen Denmark 775033 \n", "1 82800000 357168 Berlin Germany 3711930 \n", "2 325719178 9833520 Washington, D.C. USA 693972 \n", "3 334252 102775 Reykjavik Iceland 126100 \n", "\n", " established empty column %pop in capital \n", "0 800 None 0.134817 \n", "1 1237 None 0.044830 \n", "2 1790 None 0.002131 \n", "3 874 None 0.377260 " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# add new column based on column computation\n", "M2['%pop in capital'] = M2.capital_population / M2.country_population\n", "M2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sorting rows" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
country_populationareacapitalcountrycapital_populationestablishedempty column%pop in capital
3334252102775ReykjavikIceland126100874None0.377260
0574876942931CopenhagenDenmark775033800None0.134817
182800000357168BerlinGermany37119301237None0.044830
23257191789833520Washington, D.C.USA6939721790None0.002131
\n", "
" ], "text/plain": [ " country_population area capital country capital_population \\\n", "3 334252 102775 Reykjavik Iceland 126100 \n", "0 5748769 42931 Copenhagen Denmark 775033 \n", "1 82800000 357168 Berlin Germany 3711930 \n", "2 325719178 9833520 Washington, D.C. USA 693972 \n", "\n", " established empty column %pop in capital \n", "3 874 None 0.377260 \n", "0 800 None 0.134817 \n", "1 1237 None 0.044830 \n", "2 1790 None 0.002131 " ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 'inplace' changes existing data frame without creating new data frame,\n", "# otherwise a new data frame is created and returned\n", "M2.sort_values('%pop in capital', ascending=False, inplace=True)\n", "M2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pandas and Matplotlib integration" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "application/javascript": [ "/* Put everything inside the global mpl namespace */\n", "window.mpl = {};\n", "\n", "\n", "mpl.get_websocket_type = function() {\n", " if (typeof(WebSocket) !== 'undefined') {\n", " return WebSocket;\n", " } else if (typeof(MozWebSocket) !== 'undefined') {\n", " return MozWebSocket;\n", " } else {\n", " alert('Your browser does not have WebSocket support. ' +\n", " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", " 'Firefox 4 and 5 are also supported but you ' +\n", " 'have to enable WebSockets in about:config.');\n", " };\n", "}\n", "\n", "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", " this.id = figure_id;\n", "\n", " this.ws = websocket;\n", "\n", " this.supports_binary = (this.ws.binaryType != undefined);\n", "\n", " if (!this.supports_binary) {\n", " var warnings = document.getElementById(\"mpl-warnings\");\n", " if (warnings) {\n", " warnings.style.display = 'block';\n", " warnings.textContent = (\n", " \"This browser does not support binary websocket messages. \" +\n", " \"Performance may be slow.\");\n", " }\n", " }\n", "\n", " this.imageObj = new Image();\n", "\n", " this.context = undefined;\n", " this.message = undefined;\n", " this.canvas = undefined;\n", " this.rubberband_canvas = undefined;\n", " this.rubberband_context = undefined;\n", " this.format_dropdown = undefined;\n", "\n", " this.image_mode = 'full';\n", "\n", " this.root = $('
');\n", " this._root_extra_style(this.root)\n", " this.root.attr('style', 'display: inline-block');\n", "\n", " $(parent_element).append(this.root);\n", "\n", " this._init_header(this);\n", " this._init_canvas(this);\n", " this._init_toolbar(this);\n", "\n", " var fig = this;\n", "\n", " this.waiting = false;\n", "\n", " this.ws.onopen = function () {\n", " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", " fig.send_message(\"send_image_mode\", {});\n", " if (mpl.ratio != 1) {\n", " fig.send_message(\"set_dpi_ratio\", {'dpi_ratio': mpl.ratio});\n", " }\n", " fig.send_message(\"refresh\", {});\n", " }\n", "\n", " this.imageObj.onload = function() {\n", " if (fig.image_mode == 'full') {\n", " // Full images could contain transparency (where diff images\n", " // almost always do), so we need to clear the canvas so that\n", " // there is no ghosting.\n", " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", " }\n", " fig.context.drawImage(fig.imageObj, 0, 0);\n", " };\n", "\n", " this.imageObj.onunload = function() {\n", " fig.ws.close();\n", " }\n", "\n", " this.ws.onmessage = this._make_on_message_function(this);\n", "\n", " this.ondownload = ondownload;\n", "}\n", "\n", "mpl.figure.prototype._init_header = function() {\n", " var titlebar = $(\n", " '
');\n", " var titletext = $(\n", " '
');\n", " titlebar.append(titletext)\n", " this.root.append(titlebar);\n", " this.header = titletext[0];\n", "}\n", "\n", "\n", "\n", "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", "\n", "}\n", "\n", "\n", "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", "\n", "}\n", "\n", "mpl.figure.prototype._init_canvas = function() {\n", " var fig = this;\n", "\n", " var canvas_div = $('
');\n", "\n", " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", "\n", " function canvas_keyboard_event(event) {\n", " return fig.key_event(event, event['data']);\n", " }\n", "\n", " canvas_div.keydown('key_press', canvas_keyboard_event);\n", " canvas_div.keyup('key_release', canvas_keyboard_event);\n", " this.canvas_div = canvas_div\n", " this._canvas_extra_style(canvas_div)\n", " this.root.append(canvas_div);\n", "\n", " var canvas = $('');\n", " canvas.addClass('mpl-canvas');\n", " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", "\n", " this.canvas = canvas[0];\n", " this.context = canvas[0].getContext(\"2d\");\n", "\n", " var backingStore = this.context.backingStorePixelRatio ||\n", "\tthis.context.webkitBackingStorePixelRatio ||\n", "\tthis.context.mozBackingStorePixelRatio ||\n", "\tthis.context.msBackingStorePixelRatio ||\n", "\tthis.context.oBackingStorePixelRatio ||\n", "\tthis.context.backingStorePixelRatio || 1;\n", "\n", " mpl.ratio = (window.devicePixelRatio || 1) / backingStore;\n", "\n", " var rubberband = $('');\n", " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", "\n", " var pass_mouse_events = true;\n", "\n", " canvas_div.resizable({\n", " start: function(event, ui) {\n", " pass_mouse_events = false;\n", " },\n", " resize: function(event, ui) {\n", " fig.request_resize(ui.size.width, ui.size.height);\n", " },\n", " stop: function(event, ui) {\n", " pass_mouse_events = true;\n", " fig.request_resize(ui.size.width, ui.size.height);\n", " },\n", " });\n", "\n", " function mouse_event_fn(event) {\n", " if (pass_mouse_events)\n", " return fig.mouse_event(event, event['data']);\n", " }\n", "\n", " rubberband.mousedown('button_press', mouse_event_fn);\n", " rubberband.mouseup('button_release', mouse_event_fn);\n", " // Throttle sequential mouse events to 1 every 20ms.\n", " rubberband.mousemove('motion_notify', mouse_event_fn);\n", "\n", " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", "\n", " canvas_div.on(\"wheel\", function (event) {\n", " event = event.originalEvent;\n", " event['data'] = 'scroll'\n", " if (event.deltaY < 0) {\n", " event.step = 1;\n", " } else {\n", " event.step = -1;\n", " }\n", " mouse_event_fn(event);\n", " });\n", "\n", " canvas_div.append(canvas);\n", " canvas_div.append(rubberband);\n", "\n", " this.rubberband = rubberband;\n", " this.rubberband_canvas = rubberband[0];\n", " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", " this.rubberband_context.strokeStyle = \"#000000\";\n", "\n", " this._resize_canvas = function(width, height) {\n", " // Keep the size of the canvas, canvas container, and rubber band\n", " // canvas in synch.\n", " canvas_div.css('width', width)\n", " canvas_div.css('height', height)\n", "\n", " canvas.attr('width', width * mpl.ratio);\n", " canvas.attr('height', height * mpl.ratio);\n", " canvas.attr('style', 'width: ' + width + 'px; height: ' + height + 'px;');\n", "\n", " rubberband.attr('width', width);\n", " rubberband.attr('height', height);\n", " }\n", "\n", " // Set the figure to an initial 600x600px, this will subsequently be updated\n", " // upon first draw.\n", " this._resize_canvas(600, 600);\n", "\n", " // Disable right mouse context menu.\n", " $(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", " return false;\n", " });\n", "\n", " function set_focus () {\n", " canvas.focus();\n", " canvas_div.focus();\n", " }\n", "\n", " window.setTimeout(set_focus, 100);\n", "}\n", "\n", "mpl.figure.prototype._init_toolbar = function() {\n", " var fig = this;\n", "\n", " var nav_element = $('
');\n", " nav_element.attr('style', 'width: 100%');\n", " this.root.append(nav_element);\n", "\n", " // Define a callback function for later on.\n", " function toolbar_event(event) {\n", " return fig.toolbar_button_onclick(event['data']);\n", " }\n", " function toolbar_mouse_event(event) {\n", " return fig.toolbar_button_onmouseover(event['data']);\n", " }\n", "\n", " for(var toolbar_ind in mpl.toolbar_items) {\n", " var name = mpl.toolbar_items[toolbar_ind][0];\n", " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", " var image = mpl.toolbar_items[toolbar_ind][2];\n", " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", "\n", " if (!name) {\n", " // put a spacer in here.\n", " continue;\n", " }\n", " var button = $('');\n", " button.click(method_name, toolbar_event);\n", " button.mouseover(tooltip, toolbar_mouse_event);\n", " nav_element.append(button);\n", " }\n", "\n", " // Add the status bar.\n", " var status_bar = $('');\n", " nav_element.append(status_bar);\n", " this.message = status_bar[0];\n", "\n", " // Add the close button to the window.\n", " var buttongrp = $('
');\n", " var button = $('');\n", " button.click(function (evt) { fig.handle_close(fig, {}); } );\n", " button.mouseover('Stop Interaction', toolbar_mouse_event);\n", " buttongrp.append(button);\n", " var titlebar = this.root.find($('.ui-dialog-titlebar'));\n", " titlebar.prepend(buttongrp);\n", "}\n", "\n", "mpl.figure.prototype._root_extra_style = function(el){\n", " var fig = this\n", " el.on(\"remove\", function(){\n", "\tfig.close_ws(fig, {});\n", " });\n", "}\n", "\n", "mpl.figure.prototype._canvas_extra_style = function(el){\n", " // this is important to make the div 'focusable\n", " el.attr('tabindex', 0)\n", " // reach out to IPython and tell the keyboard manager to turn it's self\n", " // off when our div gets focus\n", "\n", " // location in version 3\n", " if (IPython.notebook.keyboard_manager) {\n", " IPython.notebook.keyboard_manager.register_events(el);\n", " }\n", " else {\n", " // location in version 2\n", " IPython.keyboard_manager.register_events(el);\n", " }\n", "\n", "}\n", "\n", "mpl.figure.prototype._key_event_extra = function(event, name) {\n", " var manager = IPython.notebook.keyboard_manager;\n", " if (!manager)\n", " manager = IPython.keyboard_manager;\n", "\n", " // Check for shift+enter\n", " if (event.shiftKey && event.which == 13) {\n", " this.canvas_div.blur();\n", " // select the cell after this one\n", " var index = IPython.notebook.find_cell_index(this.cell_info[0]);\n", " IPython.notebook.select(index + 1);\n", " }\n", "}\n", "\n", "mpl.figure.prototype.handle_save = function(fig, msg) {\n", " fig.ondownload(fig, null);\n", "}\n", "\n", "\n", "mpl.find_output_cell = function(html_output) {\n", " // Return the cell and output element which can be found *uniquely* in the notebook.\n", " // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n", " // IPython event is triggered only after the cells have been serialised, which for\n", " // our purposes (turning an active figure into a static one), is too late.\n", " var cells = IPython.notebook.get_cells();\n", " var ncells = cells.length;\n", " for (var i=0; i= 3 moved mimebundle to data attribute of output\n", " data = data.data;\n", " }\n", " if (data['text/html'] == html_output) {\n", " return [cell, data, j];\n", " }\n", " }\n", " }\n", " }\n", "}\n", "\n", "// Register the function which deals with the matplotlib target/channel.\n", "// The kernel may be null if the page has been refreshed.\n", "if (IPython.notebook.kernel != null) {\n", " IPython.notebook.kernel.comm_manager.register_target('matplotlib', mpl.mpl_figure_comm);\n", "}\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "import pandas_datareader # module giving access to a lot of standard data sources, eg. World Bank\n", "\n", "#df = pandas_datareader.data.DataReader(['AAPL', 'GOOGL', 'MSFT', 'ZM'], 'stooq') # ignores start=...\n", "\n", "stocks = pandas_datareader.stooq.StooqDailyReader(['AAPL', 'GOOGL', 'MSFT', 'ZM'], start='2000-01-01').read()\n", "stocks['Close'].plot()\n", "plt.legend()\n", "None # avoid print result of plt.legend() to terminal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Hierarchical / Multi-level indexing (MultiIndex)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Index(['name', 'population', 'area', 'capital'], dtype='object')" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.columns # a standard index" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RangeIndex(start=0, stop=4, step=1)" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "countries.index # standard row labels" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AttributesCloseHighLowOpenVolume
SymbolsAAPLGOOGLMSFTZMAAPLGOOGLMSFTZMAAPLGOOGLMSFTZMAAPLGOOGLMSFTZMAAPLGOOGLMSFTZM
Date
2000-01-033.4698NaN42.148NaN3.4869NaN42.895NaN3.1522NaN40.495NaN3.2516NaN42.446NaN154312168.0NaN73600921.0NaN
2000-01-043.1771NaN40.730NaN3.4300NaN42.350NaN3.1374NaN40.592NaN3.3554NaN41.066NaN147567311.0NaN74832390.0NaN
2000-01-053.2242NaN41.157NaN3.4276NaN42.085NaN3.1932NaN39.549NaN3.2168NaN40.180NaN224160517.0NaN88577634.0NaN
2000-01-062.9452NaN39.779NaN3.3171NaN41.181NaN2.9452NaN39.190NaN3.2898NaN40.568NaN221180021.0NaN76018229.0NaN
2000-01-073.0853NaN40.296NaN3.1311NaN40.592NaN2.9614NaN38.802NaN2.9913NaN39.278NaN132693820.0NaN85748546.0NaN
...............................................................
2020-04-27283.17001270.86174.050164.60284.54001294.10176.900167.85279.95001265.06173.300155.0000281.80001292.00176.590156.5929271893.02209333.033194384.019783693.0
2020-04-28278.58001232.59169.810156.72285.83001284.76175.670166.00278.20001230.38169.390155.2500285.08001283.20175.590165.0028001187.04035007.034392694.013477842.0
2020-04-29287.73001342.18177.430146.48289.67001360.15177.680151.00283.89001326.73171.880143.3800284.73001345.00173.220147.9834320204.05417888.051286559.022033320.0
2020-04-30293.80001346.70179.210135.17294.53001350.00180.400143.80288.35001321.50176.230133.6801289.96001331.36180.000139.9945765968.02792124.053875857.016682256.0
2020-05-01289.07001317.32174.570138.56299.00001351.43178.640141.63285.85001309.66174.010132.6700286.25001324.09175.800136.0060154175.02443554.039370474.013806992.0
\n", "

5115 rows × 20 columns

\n", "
" ], "text/plain": [ "Attributes Close High \\\n", "Symbols AAPL GOOGL MSFT ZM AAPL GOOGL MSFT \n", "Date \n", "2000-01-03 3.4698 NaN 42.148 NaN 3.4869 NaN 42.895 \n", "2000-01-04 3.1771 NaN 40.730 NaN 3.4300 NaN 42.350 \n", "2000-01-05 3.2242 NaN 41.157 NaN 3.4276 NaN 42.085 \n", "2000-01-06 2.9452 NaN 39.779 NaN 3.3171 NaN 41.181 \n", "2000-01-07 3.0853 NaN 40.296 NaN 3.1311 NaN 40.592 \n", "... ... ... ... ... ... ... ... \n", "2020-04-27 283.1700 1270.86 174.050 164.60 284.5400 1294.10 176.900 \n", "2020-04-28 278.5800 1232.59 169.810 156.72 285.8300 1284.76 175.670 \n", "2020-04-29 287.7300 1342.18 177.430 146.48 289.6700 1360.15 177.680 \n", "2020-04-30 293.8000 1346.70 179.210 135.17 294.5300 1350.00 180.400 \n", "2020-05-01 289.0700 1317.32 174.570 138.56 299.0000 1351.43 178.640 \n", "\n", "Attributes Low Open \\\n", "Symbols ZM AAPL GOOGL MSFT ZM AAPL GOOGL \n", "Date \n", "2000-01-03 NaN 3.1522 NaN 40.495 NaN 3.2516 NaN \n", "2000-01-04 NaN 3.1374 NaN 40.592 NaN 3.3554 NaN \n", "2000-01-05 NaN 3.1932 NaN 39.549 NaN 3.2168 NaN \n", "2000-01-06 NaN 2.9452 NaN 39.190 NaN 3.2898 NaN \n", "2000-01-07 NaN 2.9614 NaN 38.802 NaN 2.9913 NaN \n", "... ... ... ... ... ... ... ... \n", "2020-04-27 167.85 279.9500 1265.06 173.300 155.0000 281.8000 1292.00 \n", "2020-04-28 166.00 278.2000 1230.38 169.390 155.2500 285.0800 1283.20 \n", "2020-04-29 151.00 283.8900 1326.73 171.880 143.3800 284.7300 1345.00 \n", "2020-04-30 143.80 288.3500 1321.50 176.230 133.6801 289.9600 1331.36 \n", "2020-05-01 141.63 285.8500 1309.66 174.010 132.6700 286.2500 1324.09 \n", "\n", "Attributes Volume \n", "Symbols MSFT ZM AAPL GOOGL MSFT ZM \n", "Date \n", "2000-01-03 42.446 NaN 154312168.0 NaN 73600921.0 NaN \n", "2000-01-04 41.066 NaN 147567311.0 NaN 74832390.0 NaN \n", "2000-01-05 40.180 NaN 224160517.0 NaN 88577634.0 NaN \n", "2000-01-06 40.568 NaN 221180021.0 NaN 76018229.0 NaN \n", "2000-01-07 39.278 NaN 132693820.0 NaN 85748546.0 NaN \n", "... ... ... ... ... ... ... \n", "2020-04-27 176.590 156.59 29271893.0 2209333.0 33194384.0 19783693.0 \n", "2020-04-28 175.590 165.00 28001187.0 4035007.0 34392694.0 13477842.0 \n", "2020-04-29 173.220 147.98 34320204.0 5417888.0 51286559.0 22033320.0 \n", "2020-04-30 180.000 139.99 45765968.0 2792124.0 53875857.0 16682256.0 \n", "2020-05-01 175.800 136.00 60154175.0 2443554.0 39370474.0 13806992.0 \n", "\n", "[5115 rows x 20 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks # notice hierarchical headings" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SymbolsAAPLGOOGLMSFTZM
Date
2000-01-033.4698NaN42.148NaN
2000-01-043.1771NaN40.730NaN
2000-01-053.2242NaN41.157NaN
2000-01-062.9452NaN39.779NaN
2000-01-073.0853NaN40.296NaN
...............
2020-04-27283.17001270.86174.050164.60
2020-04-28278.58001232.59169.810156.72
2020-04-29287.73001342.18177.430146.48
2020-04-30293.80001346.70179.210135.17
2020-05-01289.07001317.32174.570138.56
\n", "

5115 rows × 4 columns

\n", "
" ], "text/plain": [ "Symbols AAPL GOOGL MSFT ZM\n", "Date \n", "2000-01-03 3.4698 NaN 42.148 NaN\n", "2000-01-04 3.1771 NaN 40.730 NaN\n", "2000-01-05 3.2242 NaN 41.157 NaN\n", "2000-01-06 2.9452 NaN 39.779 NaN\n", "2000-01-07 3.0853 NaN 40.296 NaN\n", "... ... ... ... ...\n", "2020-04-27 283.1700 1270.86 174.050 164.60\n", "2020-04-28 278.5800 1232.59 169.810 156.72\n", "2020-04-29 287.7300 1342.18 177.430 146.48\n", "2020-04-30 293.8000 1346.70 179.210 135.17\n", "2020-05-01 289.0700 1317.32 174.570 138.56\n", "\n", "[5115 rows x 4 columns]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.Close # one top level group of the columns" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Date\n", "2000-01-03 NaN\n", "2000-01-04 NaN\n", "2000-01-05 NaN\n", "2000-01-06 NaN\n", "2000-01-07 NaN\n", " ... \n", "2020-04-27 1270.86\n", "2020-04-28 1232.59\n", "2020-04-29 1342.18\n", "2020-04-30 1346.70\n", "2020-05-01 1317.32\n", "Name: GOOGL, Length: 5115, dtype: float64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.Close.GOOGL # selecting a single column" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Date\n", "2000-01-03 NaN\n", "2000-01-04 NaN\n", "2000-01-05 NaN\n", "2000-01-06 NaN\n", "2000-01-07 NaN\n", " ... \n", "2020-04-27 1270.86\n", "2020-04-28 1232.59\n", "2020-04-29 1342.18\n", "2020-04-30 1346.70\n", "2020-05-01 1317.32\n", "Name: GOOGL, Length: 5115, dtype: float64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks['Close']['GOOGL'] # same as above" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([( 'Close', 'AAPL'),\n", " ( 'Close', 'GOOGL'),\n", " ( 'Close', 'MSFT'),\n", " ( 'Close', 'ZM'),\n", " ( 'High', 'AAPL'),\n", " ( 'High', 'GOOGL'),\n", " ( 'High', 'MSFT'),\n", " ( 'High', 'ZM'),\n", " ( 'Low', 'AAPL'),\n", " ( 'Low', 'GOOGL'),\n", " ( 'Low', 'MSFT'),\n", " ( 'Low', 'ZM'),\n", " ( 'Open', 'AAPL'),\n", " ( 'Open', 'GOOGL'),\n", " ( 'Open', 'MSFT'),\n", " ( 'Open', 'ZM'),\n", " ('Volume', 'AAPL'),\n", " ('Volume', 'GOOGL'),\n", " ('Volume', 'MSFT'),\n", " ('Volume', 'ZM')],\n", " names=['Attributes', 'Symbols'])" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.columns # has a MultiIndex" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2000-01-03', '2000-01-04', '2000-01-05', '2000-01-06',\n", " '2000-01-07', '2000-01-10', '2000-01-11', '2000-01-12',\n", " '2000-01-13', '2000-01-14',\n", " ...\n", " '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23',\n", " '2020-04-24', '2020-04-27', '2020-04-28', '2020-04-29',\n", " '2020-04-30', '2020-05-01'],\n", " dtype='datetime64[ns]', name='Date', length=5115, freq=None)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stocks.index" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AttributesCloseHighLowOpenVolume
SymbolsGOOGLGOOGLGOOGLGOOGLGOOGL
Date
2000-01-03NaNNaNNaNNaNNaN
2000-01-04NaNNaNNaNNaNNaN
2000-01-05NaNNaNNaNNaNNaN
2000-01-06NaNNaNNaNNaNNaN
2000-01-07NaNNaNNaNNaNNaN
..................
2020-04-271270.861294.101265.061292.002209333.0
2020-04-281232.591284.761230.381283.204035007.0
2020-04-291342.181360.151326.731345.005417888.0
2020-04-301346.701350.001321.501331.362792124.0
2020-05-011317.321351.431309.661324.092443554.0
\n", "

5115 rows × 5 columns

\n", "
" ], "text/plain": [ "Attributes Close High Low Open Volume\n", "Symbols GOOGL GOOGL GOOGL GOOGL GOOGL\n", "Date \n", "2000-01-03 NaN NaN NaN NaN NaN\n", "2000-01-04 NaN NaN NaN NaN NaN\n", "2000-01-05 NaN NaN NaN NaN NaN\n", "2000-01-06 NaN NaN NaN NaN NaN\n", "2000-01-07 NaN NaN NaN NaN NaN\n", "... ... ... ... ... ...\n", "2020-04-27 1270.86 1294.10 1265.06 1292.00 2209333.0\n", "2020-04-28 1232.59 1284.76 1230.38 1283.20 4035007.0\n", "2020-04-29 1342.18 1360.15 1326.73 1345.00 5417888.0\n", "2020-04-30 1346.70 1350.00 1321.50 1331.36 2792124.0\n", "2020-05-01 1317.32 1351.43 1309.66 1324.09 2443554.0\n", "\n", "[5115 rows x 5 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select all rows and columns with 1st level = all, 2nd level = 'GOOGLE'\n", "stocks.loc[:, pd.IndexSlice[:,'GOOGL']] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating your own multi index" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
ababc
Ax01234
y56789
Bx1011121314
y1516171819
Cx2021222324
y2526272829
\n", "
" ], "text/plain": [ " one two \n", " a b a b c\n", "A x 0 1 2 3 4\n", " y 5 6 7 8 9\n", "B x 10 11 12 13 14\n", " y 15 16 17 18 19\n", "C x 20 21 22 23 24\n", " y 25 26 27 28 29" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "column_index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),\n", " ('two', 'a'), ('two', 'b'), ('two', 'c')])\n", "\n", "row_index = pd.MultiIndex.from_tuples((x, y) for x in 'ABC' for y in 'xy')\n", "\n", "H = pd.DataFrame(np.arange(30).reshape(6, 5), columns=column_index, index=row_index)\n", "H" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Reorganizing hierarcical labels - stack and unstack" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
ababc
Ax01234
y56789
Bx1011121314
y1516171819
Cx2021222324
y2526272829
\n", "
" ], "text/plain": [ " one two \n", " a b a b c\n", "A x 0 1 2 3 4\n", " y 5 6 7 8 9\n", "B x 10 11 12 13 14\n", " y 15 16 17 18 19\n", "C x 20 21 22 23 24\n", " y 25 26 27 28 29" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "H" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
Axone01NaN
two234.0
yone56NaN
two789.0
Bxone1011NaN
two121314.0
yone1516NaN
two171819.0
Cxone2021NaN
two222324.0
yone2526NaN
two272829.0
\n", "
" ], "text/plain": [ " a b c\n", "A x one 0 1 NaN\n", " two 2 3 4.0\n", " y one 5 6 NaN\n", " two 7 8 9.0\n", "B x one 10 11 NaN\n", " two 12 13 14.0\n", " y one 15 16 NaN\n", " two 17 18 19.0\n", "C x one 20 21 NaN\n", " two 22 23 24.0\n", " y one 25 26 NaN\n", " two 27 28 29.0" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "H.stack(level=0) # move one level of indexing from columns to last rows (missing columns filled with NaN)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
Axa0.02
b1.03
cNaN4
ya5.07
b6.08
cNaN9
Bxa10.012
b11.013
cNaN14
ya15.017
b16.018
cNaN19
Cxa20.022
b21.023
cNaN24
ya25.027
b26.028
cNaN29
\n", "
" ], "text/plain": [ " one two\n", "A x a 0.0 2\n", " b 1.0 3\n", " c NaN 4\n", " y a 5.0 7\n", " b 6.0 8\n", " c NaN 9\n", "B x a 10.0 12\n", " b 11.0 13\n", " c NaN 14\n", " y a 15.0 17\n", " b 16.0 18\n", " c NaN 19\n", "C x a 20.0 22\n", " b 21.0 23\n", " c NaN 24\n", " y a 25.0 27\n", " b 26.0 28\n", " c NaN 29" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "H.stack(level=1)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwo
ababc
ABCABCABCABCABC
x0102011121212223132341424
y5152561626717278182891929
\n", "
" ], "text/plain": [ " one two \n", " a b a b c \n", " A B C A B C A B C A B C A B C\n", "x 0 10 20 1 11 21 2 12 22 3 13 23 4 14 24\n", "y 5 15 25 6 16 26 7 17 27 8 18 28 9 19 29" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "H.unstack(level=0) # and unstack moves row level to column last level" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## And there are much more ... see the Pandas documentation" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.1" } }, "nbformat": 4, "nbformat_minor": 4 }