diff --git a/pandas.ipynb b/pandas.ipynb index f0a8a33..295d912 100644 --- a/pandas.ipynb +++ b/pandas.ipynb @@ -45,7 +45,11 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "-" + } + }, "source": [ "We can access the elements in an array using multi-index notation, familiar in small variations in many computing environments and languages - with the usual Pythonic conventions, e.g., counting starts from 0, slicing a:b is inclusive:exclusive, negative indices, etc\n", "\n", @@ -336,7 +340,8 @@ "\n", "Series4 = Series1 > Series2 \n", "\n", - "# Take a look at the different Series objects!" + "# Take a look at the different Series objects!\n", + "Series4" ] }, { @@ -374,7 +379,8 @@ "source": [ "Series1 = pd.Series([1,10],index=[\"om\",\"iros\"])\n", "Series2 = pd.Series([4,-1],index=[\"pap\",\"as\"])\n", - "Series3 = Series1 + Series2" + "Series3 = Series1 + Series2\n", + "Series3" ] }, { @@ -421,7 +427,8 @@ "# accesing by list of index labels\n", "\n", "my_series.index = [\"om\",\"ir\",\"os\",\"pap\",\"pas\",\"pil\",\"io\",\"po\",\"ulos\",\"is\",\"best\"]\n", - "x = my_series[[\"om\",\"pap\"]]" + "x = my_series[[\"om\",\"pap\"]]\n", + "x" ] }, { @@ -445,7 +452,8 @@ "# Notice the index of x is a SUBSET of the index of \"my_series\"\n", "# This can be useful when needing to relate values back to the original \"my_series\"!\n", "\n", - "x = my_series[choose]" + "x = my_series[choose]\n", + "x" ] }, { @@ -477,7 +485,9 @@ "# Filter \"my_series\" to be all the elements that are NOT\n", "# equal to 0, using the \"choose\" boolean mask below: \n", "\n", - "choose = my_series == 0.0\n" + "choose = my_series == 0.0\n", + "my_series[my_series != 0]\n", + "my_series[~choose]" ] }, { @@ -554,7 +564,8 @@ "# 1. Create a boolean mask by using the .notna() method.\n", "# 2. 
Use the mask to subset the Series.\n", "\n", - "names = pd.Series(['foo','bar',None,'baz','qux',None])\n" + "names = pd.Series(['foo','bar',None,'baz','qux',None])\n", + "names[names.notna()]" ] }, { @@ -598,12 +609,22 @@ " # Your code here\n", " # HINT: delete the \"pass\" when your done\n", " # HINT2: handle None values!\n", - " pass\n", + " try:\n", + " return s.lower()\n", + " except AttributeError:\n", + " return None\n", "\n", - "\n", - "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])" + "names = pd.Series(['Foo', 'BAR', None, 'foo', None, 'bar', 'bAR', 'foo', None])\n", + "names_lower = names.map(lower)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -614,7 +635,8 @@ "\n", "# Using the series from above, now lowercased, count the occurences of each name\n", "# Hint: It's simple, just use .value_counts()!\n", - "\n" + "\n", + "names_lower.value_counts()" ] }, { @@ -652,6 +674,7 @@ "source": [ "# the other important attribute: name of rows and columns\n", "tips.index\n", + "\n", "tips.columns" ] }, @@ -765,7 +788,9 @@ "\n", "# Using the tips dataframe, create a new one that contains the \n", "# information contained in all rows between the 20th (inclusive) \n", - "# and the 45th (exclusive) and only the columns: tip, sex, day" + "# and the 45th (exclusive) and only the columns: tip, sex, day\n", + "\n", + "tips.iloc[20:45][[\"tip\",\"sex\",\"day\"]] # positional slice stops before row 45; .loc[20:45] is label-based and would include it" ] }, { @@ -844,7 +869,8 @@ "# tip and size for only Male clients during Dinner. \n", "\n", "# HINT: Remember that \"size\" cannot be accessed via dot notation, as it's an \n", - "# attribute of the series!" + "# attribute of the series!\n", + "tips.loc[(tips.sex==\"Male\")&(tips.time==\"Dinner\"),[\"size\",\"tip\"]].corr().loc[\"size\",\"tip\"] # select the two numeric columns first so .corr() never sees the text columns" ] }, { @@ -942,7 +968,10 @@ "# Challenge: \n", "\n", "# Get the second largest bill by gender!\n", - "# HINT: use sort_values and iloc!" 
+ "# HINT: use sort_values and iloc!\n", + "def sec_max_bill(df):\n", + " return df.total_bill.sort_values(ascending=False).iloc[1]\n", + "tips.groupby(\"sex\").apply(sec_max_bill)" ] }, { @@ -972,7 +1001,7 @@ "def day_mean(df):\n", " # Hint: you will need to group by \"day\"\n", " # in this function, then get the mean tip. \n", - " pass\n", + " return df.groupby(\"day\").tip.mean()\n", "\n", "\n", "tips.groupby(\"sex\").apply(day_mean)" @@ -1218,6 +1247,634 @@ "\n", "*Needless to say that eyeballing is OK for making sure your code makes sense, but will not result in full credits for the project. We want a fully automated code. To carry out the project successfully you need to use most the attributes and methods described earlier. The last one is a little tricky*" ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductQuantity
0Jacksonapple4
1Jacksonapple9
2Johnorange9
3Johnpotato10
4Tomtomato4
\n", + "
" + ], + "text/plain": [ + " Buyer Product Quantity\n", + "0 Jackson apple 4\n", + "1 Jackson apple 9\n", + "2 John orange 9\n", + "3 John potato 10\n", + "4 Tom tomato 4" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "supermarket_prices = pd.read_csv(\"supermarket_prices.csv\")\n", + "supermarket_transactions = pd.read_csv(\"supermarket_transactions.csv\")\n", + "supermarket_transactions.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Quantity
Buyer
Emma81
Jackson70
John122
Liam81
Lucas62
Sandra78
Sophia61
Tom49
\n", + "
" + ], + "text/plain": [ + " Quantity\n", + "Buyer \n", + "Emma 81\n", + "Jackson 70\n", + "John 122\n", + "Liam 81\n", + "Lucas 62\n", + "Sandra 78\n", + "Sophia 61\n", + "Tom 49" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#How many items each client has purchased\n", + "supermarket_transactions.groupby('Buyer')[['Quantity']].sum() # sum only Quantity; do not rely on the text Product column being dropped" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Quantity
BuyerProduct
Emmaapple25
banana26
potato14
tomato16
Jacksonapple18
orange28
potato8
tomato16
Johnapple7
banana28
orange46
potato18
tomato23
Liamapple21
banana16
orange16
potato21
tomato7
Lucasapple14
banana3
orange17
potato9
tomato19
Sandrabanana2
orange37
potato38
tomato1
Sophiaapple14
banana13
orange7
potato14
tomato13
Tomapple18
banana6
potato16
tomato9
\n", + "
" + ], + "text/plain": [ + " Quantity\n", + "Buyer Product \n", + "Emma apple 25\n", + " banana 26\n", + " potato 14\n", + " tomato 16\n", + "Jackson apple 18\n", + " orange 28\n", + " potato 8\n", + " tomato 16\n", + "John apple 7\n", + " banana 28\n", + " orange 46\n", + " potato 18\n", + " tomato 23\n", + "Liam apple 21\n", + " banana 16\n", + " orange 16\n", + " potato 21\n", + " tomato 7\n", + "Lucas apple 14\n", + " banana 3\n", + " orange 17\n", + " potato 9\n", + " tomato 19\n", + "Sandra banana 2\n", + " orange 37\n", + " potato 38\n", + " tomato 1\n", + "Sophia apple 14\n", + " banana 13\n", + " orange 7\n", + " potato 14\n", + " tomato 13\n", + "Tom apple 18\n", + " banana 6\n", + " potato 16\n", + " tomato 9" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#How many items of each type each client has purchased\n", + "by_client_type = supermarket_transactions.groupby(['Buyer','Product']).sum()\n", + "by_client_type" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_spent
Buyer
Emma246.4
Jackson202.8
John461.3
Liam263.3
Lucas176.0
Sandra300.8
Sophia189.4
Tom126.1
\n", + "
" + ], + "text/plain": [ + " total_spent\n", + "Buyer \n", + "Emma 246.4\n", + "Jackson 202.8\n", + "John 461.3\n", + "Liam 263.3\n", + "Lucas 176.0\n", + "Sandra 300.8\n", + "Sophia 189.4\n", + "Tom 126.1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Calculate the total amount spent by each client\n", + "merged_tables = pd.merge(supermarket_transactions, supermarket_prices, on = 'Product', how = 'outer')\n", + "merged_tables['total_spent'] = merged_tables.Quantity*merged_tables.Price\n", + "client_spent = merged_tables[['Buyer','total_spent']].groupby(['Buyer']).sum()\n", + "client_spent" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Emma'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#The company that provides the supermarket with bananas wishes to give a prize to the client that \n", + "#has spent the largest proportion of their spending on bananas. Who should win the prize?\n", + "\n", + "merged_banana = merged_tables[merged_tables.Product=='banana']\n", + "spent_banana = merged_banana[['Buyer','total_spent']].groupby('Buyer').sum()\n", + "spent_banana_total = pd.merge(spent_banana,client_spent, on = 'Buyer', how = 'inner')\n", + "spent_banana_total['prop_banana'] = spent_banana_total['total_spent_x']/spent_banana_total['total_spent_y']\n", + "spent_banana_total.prop_banana.idxmax()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
BuyerProductprop
2Johnorange0.428788
8Liampotato0.271174
13Sophiapotato0.251320
\n", + "
" + ], + "text/plain": [ + " Buyer Product prop\n", + "2 John orange 0.428788\n", + "8 Liam potato 0.271174\n", + "13 Sophia potato 0.251320" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#A marketing company that works with the supermarket is interested to understand better the \n", + "#characteristics of the three people that have spent most of their spending on bananas. \n", + "#For each one of them report the other product that they have spent most of their remaining income on.\n", + "top_three = spent_banana_total.prop_banana.nlargest(3).index # was sort_values()[-4:-1], which skipped the top banana spender\n", + "# NOTE: the saved output of this cell predates this fix; re-run the cell to refresh it\n", + "# select all rows of the top three buyers with one mask instead of growing a DataFrame via pd.concat in a loop\n", + "spending = merged_tables[merged_tables.Buyer.isin(top_three)]\n", + "spent_buyer_prod = spending.groupby(['Buyer','Product'])['total_spent'].sum().reset_index()\n", + "total_buyer_prod = pd.merge(spent_buyer_prod,client_spent,on='Buyer',how='left')\n", + "total_buyer_prod['prop'] = total_buyer_prod.total_spent_x/total_buyer_prod.total_spent_y\n", + "prop_no_banana = total_buyer_prod.loc[total_buyer_prod.Product!='banana',['Buyer','Product','prop']]\n", + "prop_no_banana.loc[prop_no_banana.groupby('Buyer')['prop'].idxmax()]" + ] } ], "metadata": { @@ -1237,7 +1894,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.3" + "version": "3.7.3" } }, "nbformat": 4,