diff --git a/pandas.ipynb b/pandas.ipynb
index f0a8a33..295d912 100644
--- a/pandas.ipynb
+++ b/pandas.ipynb
@@ -45,7 +45,11 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "slideshow": {
+ "slide_type": "-"
+ }
+ },
"source": [
"We can access the elements in an array using multi-index notation, familiar in small variations in many computing environments and languages - with the usual Pythonic conventions, e.g., counting starts from 0, slicing a:b is inclusive:exclusive, negative indices, etc\n",
"\n",
@@ -336,7 +340,8 @@
"\n",
"Series4 = Series1 > Series2 \n",
"\n",
- "# Take a look at the different Series objects!"
+ "# Take a look at the different Series objects!\n",
+ "Series4"
]
},
{
@@ -374,7 +379,8 @@
"source": [
"Series1 = pd.Series([1,10],index=[\"om\",\"iros\"])\n",
"Series2 = pd.Series([4,-1],index=[\"pap\",\"as\"])\n",
- "Series3 = Series1 + Series2"
+ "Series3 = Series1 + Series2\n",
+ "Series3"
]
},
{
@@ -421,7 +427,8 @@
"# accesing by list of index labels\n",
"\n",
"my_series.index = [\"om\",\"ir\",\"os\",\"pap\",\"pas\",\"pil\",\"io\",\"po\",\"ulos\",\"is\",\"best\"]\n",
- "x = my_series[[\"om\",\"pap\"]]"
+ "x = my_series[[\"om\",\"pap\"]]\n",
+ "x"
]
},
{
@@ -445,7 +452,8 @@
"# Notice the index of x is a SUBSET of the index of \"my_series\"\n",
"# This can be useful when needing to relate values back to the original \"my_series\"!\n",
"\n",
- "x = my_series[choose]"
+ "x = my_series[choose]\n",
+ "x"
]
},
{
@@ -477,7 +485,9 @@
"# Filter \"my_series\" to be all the elements that are NOT\n",
"# equal to 0, using the \"choose\" boolean mask below: \n",
"\n",
- "choose = my_series == 0.0\n"
+ "choose = my_series == 0.0\n",
+ "my_series[my_series != 0]\n",
+ "my_series[~choose]"
]
},
{
@@ -554,7 +564,8 @@
"# 1. Create a boolean mask by using the .notna() method.\n",
"# 2. Use the mask to subset the Series.\n",
"\n",
- "names = pd.Series(['foo','bar',None,'baz','qux',None])\n"
+ "names = pd.Series(['foo','bar',None,'baz','qux',None])\n",
+ "names[names.notna()]"
]
},
{
@@ -598,12 +609,22 @@
" # Your code here\n",
" # HINT: delete the \"pass\" when your done\n",
" # HINT2: handle None values!\n",
- " pass\n",
+ " try:\n",
+ " return s.lower()\n",
+ " except AttributeError:\n",
+ " return None\n",
"\n",
- "\n",
- "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])"
+ "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])\n",
+ "names_lower = names.map(lower)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "code",
"execution_count": null,
@@ -614,7 +635,8 @@
"\n",
"# Using the series from above, now lowercased, count the occurences of each name\n",
"# Hint: It's simple, just use .value_counts()!\n",
- "\n"
+ "\n",
+ "names_lower.value_counts()"
]
},
{
@@ -652,6 +674,7 @@
"source": [
"# the other important attribute: name of rows and columns\n",
"tips.index\n",
+ "\n",
"tips.columns"
]
},
@@ -765,7 +788,9 @@
"\n",
"# Using the tips dataframe, create a new one that contains the \n",
"# information contained in all rows between the 20th (inclusive) \n",
- "# and the 45th (exclusive) and only the columns: tip, sex, day"
+ "# and the 45th (exclusive) and only the columns: tip, sex, day\n",
+ "\n",
+ "tips.iloc[20:45][[\"tip\",\"sex\",\"day\"]]  # positional slice: stop row 45 is excluded, unlike label-based .loc[20:45]"
]
},
{
@@ -844,7 +869,8 @@
"# tip and size for only Male clients during Dinner. \n",
"\n",
"# HINT: Remember that \"size\" cannot be accessed via dot notation, as it's an \n",
- "# attribute of the series!"
+ "# attribute of the series!\n",
+ "tips.loc[(tips.sex==\"Male\")&(tips.time==\"Dinner\"),[\"size\",\"tip\"]].corr().loc[\"size\",\"tip\"]  # select the two numeric columns first so .corr() is not given non-numeric columns"
]
},
{
@@ -942,7 +968,10 @@
"# Challenge: \n",
"\n",
"# Get the second largest bill by gender!\n",
- "# HINT: use sort_values and iloc!"
+ "# HINT: use sort_values and iloc!\n",
+ "def sec_max_bill(df):\n",
+ " return df.total_bill.sort_values(ascending=False).iloc[1]\n",
+ "tips.groupby(\"sex\").apply(sec_max_bill)"
]
},
{
@@ -972,7 +1001,7 @@
"def day_mean(df):\n",
" # Hint: you will need to group by \"day\"\n",
" # in this function, then get the mean tip. \n",
- " pass\n",
+ " return df.groupby(\"day\").tip.mean()\n",
"\n",
"\n",
"tips.groupby(\"sex\").apply(day_mean)"
@@ -1218,6 +1247,634 @@
"\n",
"*Needless to say that eyeballing is OK for making sure your code makes sense, but will not result in full credits for the project. We want a fully automated code. To carry out the project successfully you need to use most the attributes and methods described earlier. The last one is a little tricky*"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Buyer | \n",
+ " Product | \n",
+ " Quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " John | \n",
+ " orange | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " John | \n",
+ " potato | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Tom | \n",
+ " tomato | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Buyer Product Quantity\n",
+ "0 Jackson apple 4\n",
+ "1 Jackson apple 9\n",
+ "2 John orange 9\n",
+ "3 John potato 10\n",
+ "4 Tom tomato 4"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "supermarket_prices = pd.read_csv(\"supermarket_prices.csv\")\n",
+ "supermarket_transactions = pd.read_csv(\"supermarket_transactions.csv\")\n",
+ "supermarket_transactions.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Quantity | \n",
+ "
\n",
+ " \n",
+ " Buyer | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Emma | \n",
+ " 81 | \n",
+ "
\n",
+ " \n",
+ " Jackson | \n",
+ " 70 | \n",
+ "
\n",
+ " \n",
+ " John | \n",
+ " 122 | \n",
+ "
\n",
+ " \n",
+ " Liam | \n",
+ " 81 | \n",
+ "
\n",
+ " \n",
+ " Lucas | \n",
+ " 62 | \n",
+ "
\n",
+ " \n",
+ " Sandra | \n",
+ " 78 | \n",
+ "
\n",
+ " \n",
+ " Sophia | \n",
+ " 61 | \n",
+ "
\n",
+ " \n",
+ " Tom | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Quantity\n",
+ "Buyer \n",
+ "Emma 81\n",
+ "Jackson 70\n",
+ "John 122\n",
+ "Liam 81\n",
+ "Lucas 62\n",
+ "Sandra 78\n",
+ "Sophia 61\n",
+ "Tom 49"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#How many items each client has purchased\n",
+ "supermarket_transactions.groupby('Buyer')[['Quantity']].sum()  # sum only Quantity; Product holds text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ " Quantity | \n",
+ "
\n",
+ " \n",
+ " Buyer | \n",
+ " Product | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Emma | \n",
+ " apple | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " Jackson | \n",
+ " apple | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " John | \n",
+ " apple | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 46 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " Liam | \n",
+ " apple | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " Lucas | \n",
+ " apple | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " Sandra | \n",
+ " banana | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " Sophia | \n",
+ " apple | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " orange | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " Tom | \n",
+ " apple | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " banana | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " potato | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " tomato | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Quantity\n",
+ "Buyer Product \n",
+ "Emma apple 25\n",
+ " banana 26\n",
+ " potato 14\n",
+ " tomato 16\n",
+ "Jackson apple 18\n",
+ " orange 28\n",
+ " potato 8\n",
+ " tomato 16\n",
+ "John apple 7\n",
+ " banana 28\n",
+ " orange 46\n",
+ " potato 18\n",
+ " tomato 23\n",
+ "Liam apple 21\n",
+ " banana 16\n",
+ " orange 16\n",
+ " potato 21\n",
+ " tomato 7\n",
+ "Lucas apple 14\n",
+ " banana 3\n",
+ " orange 17\n",
+ " potato 9\n",
+ " tomato 19\n",
+ "Sandra banana 2\n",
+ " orange 37\n",
+ " potato 38\n",
+ " tomato 1\n",
+ "Sophia apple 14\n",
+ " banana 13\n",
+ " orange 7\n",
+ " potato 14\n",
+ " tomato 13\n",
+ "Tom apple 18\n",
+ " banana 6\n",
+ " potato 16\n",
+ " tomato 9"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#How many items of each type each client has purchased\n",
+ "by_client_type = supermarket_transactions.groupby(['Buyer','Product']).sum()\n",
+ "by_client_type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " total_spent | \n",
+ "
\n",
+ " \n",
+ " Buyer | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Emma | \n",
+ " 246.4 | \n",
+ "
\n",
+ " \n",
+ " Jackson | \n",
+ " 202.8 | \n",
+ "
\n",
+ " \n",
+ " John | \n",
+ " 461.3 | \n",
+ "
\n",
+ " \n",
+ " Liam | \n",
+ " 263.3 | \n",
+ "
\n",
+ " \n",
+ " Lucas | \n",
+ " 176.0 | \n",
+ "
\n",
+ " \n",
+ " Sandra | \n",
+ " 300.8 | \n",
+ "
\n",
+ " \n",
+ " Sophia | \n",
+ " 189.4 | \n",
+ "
\n",
+ " \n",
+ " Tom | \n",
+ " 126.1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " total_spent\n",
+ "Buyer \n",
+ "Emma 246.4\n",
+ "Jackson 202.8\n",
+ "John 461.3\n",
+ "Liam 263.3\n",
+ "Lucas 176.0\n",
+ "Sandra 300.8\n",
+ "Sophia 189.4\n",
+ "Tom 126.1"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#Calculate the total amount spent by each client\n",
+ "merged_tables = pd.merge(supermarket_transactions, supermarket_prices, on = 'Product', how = 'outer')\n",
+ "merged_tables['total_spent'] = merged_tables.Quantity*merged_tables.Price\n",
+ "client_spent = merged_tables[['Buyer','total_spent']].groupby(['Buyer']).sum()\n",
+ "client_spent"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Emma'"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#The company that provides the supermarket with bananas wishes to give a prize to the client that \n",
+ "#has spent the largest proportion of their spending on bananas. Who should win the prize?\n",
+ "\n",
+ "merged_banana = merged_tables[merged_tables.Product=='banana']\n",
+ "spent_banana = merged_banana[['Buyer','total_spent']].groupby('Buyer').sum()\n",
+ "spent_banana_total = pd.merge(spent_banana,client_spent, on = 'Buyer', how = 'inner')\n",
+ "spent_banana_total['prop_banana'] = spent_banana_total['total_spent_x']/spent_banana_total['total_spent_y']\n",
+ "spent_banana_total.prop_banana.idxmax()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Buyer | \n",
+ " Product | \n",
+ " prop | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " John | \n",
+ " orange | \n",
+ " 0.428788 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Liam | \n",
+ " potato | \n",
+ " 0.271174 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " Sophia | \n",
+ " potato | \n",
+ " 0.251320 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Buyer Product prop\n",
+ "2 John orange 0.428788\n",
+ "8 Liam potato 0.271174\n",
+ "13 Sophia potato 0.251320"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#A marketing company that works with the supermarket is interested to understand better the \n",
+ "#characteristics of the three people that have spent most of their spending on bananas. \n",
+ "#For each one of them report the other product that they have spent most of their remaining income on.\n",
+ "# nlargest(3) keeps the three highest banana shares, including the top buyer\n",
+ "top_three = spent_banana_total.prop_banana.nlargest(3).index.values\n",
+ "spending = merged_tables[merged_tables.Buyer.isin(top_three)]\n",
+ "spent_buyer_prod = spending.groupby(['Buyer','Product'])['total_spent'].sum().reset_index()\n",
+ "total_buyer_prod = pd.merge(spent_buyer_prod,client_spent,on='Buyer',how='left')\n",
+ "total_buyer_prod['prop'] = total_buyer_prod.total_spent_x/total_buyer_prod.total_spent_y\n",
+ "prop_no_banana = total_buyer_prod.loc[total_buyer_prod.Product!='banana',['Buyer','Product','prop']]\n",
+ "# pick each buyer's largest non-banana share by the 'prop' column explicitly\n",
+ "prop_no_banana.loc[prop_no_banana.groupby('Buyer')['prop'].idxmax()]"
+ ]
}
],
"metadata": {
@@ -1237,7 +1894,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.7.3"
}
},
"nbformat": 4,