# # Load the data

from pyspark.sql import SparkSession
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

# Build (or reuse) a Spark session with S3 filesystem access for the wine demo data.
spark = SparkSession.builder\
    .appName("Import Wine Table")\
    .config("spark.yarn.access.hadoopFileSystems", "s3a://ml-field/demo/wine/")\
    .getOrCreate()

# Pull the full wine table registered in the Hive metastore.
wine_data_raw = spark.sql("SELECT * FROM `default`.`wine`")
wine_data_raw.show(3)

# ### Basic DataFrame operations
# Dataframes essentially allow you to express sql-like statements.
# We can filter, count, and so on.
# Documentation - (http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframe-operations)

"number of lines in dataset : {:d}".format(wine_data_raw.count())

# ### Spark SQL - manipulate data as if it was a table
wine_data_raw.createOrReplaceTempView("wine")
spark.sql("select distinct(Quality), count(*) from wine GROUP BY Quality").show()


# ### Remove invalid data
# Rows whose Quality is the literal string "1" are treated as invalid and dropped.
wine_data = wine_data_raw.filter(wine_data_raw.Quality != "1")
total_wines = wine_data.count()
good_wines = wine_data.filter(wine_data.Quality == 'Excellent').count()
# BUG FIX: this previously reassigned `good_wines`, clobbering the 'Excellent'
# count and reporting the 'Poor' count twice in the summary below.
poor_wines = wine_data.filter(wine_data.Quality == 'Poor').count()

"Wines total: {}, Good : {}, Poor : {}".format(total_wines, good_wines, poor_wines)


# # 2. Data visualisation (using matplotlib and Seaborn)
# ## Feature Visualization
#
# The data visualization workflow for large data sets is usually:
#
# * Sample data so it fits in memory on a single machine.
# * Examine single variable distributions.
# * Examine joint distributions and correlations.
# * Look for other types of relationships.
#
# [DataFrame#sample() documentation](http://people.apache.org/~pwendell/spark-releases/spark-1.5.0-rc1-docs/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sample)

# ### Note: toPandas() => brings data locally !!!
# sample(withReplacement=False, fraction=0.5, seed=83) keeps the sample
# reproducible and small enough for single-machine plotting.
sample_data = wine_data.sample(False, 0.5, 83).toPandas()
sample_data.transpose().head(21)

# Spark is no longer needed: everything below operates on the local
# pandas DataFrame `sample_data`, so release the cluster resources now.
spark.stop()


# ## Feature Distributions
#
# We want to examine the distribution of our features, so start with them one at a time.
#
# Seaborn has a standard function called [dist()](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.distplot.html#seaborn.distplot) that allows us to easily examine the distribution of a column of a pandas dataframe or a numpy array.

# `get_ipython().magic()` is deprecated; `run_line_magic` is the supported API.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sb

# NOTE(review): distplot is removed in seaborn >= 0.14; switch to
# sb.histplot(...) if the environment's seaborn is upgraded.
sb.distplot(sample_data['Alcohol'], kde=False)

# We can examine feature differences in the distribution of our features when we condition (split) our data.
#
# [BoxPlot docs](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.boxplot.html)


sb.boxplot(x="Quality", y="Alcohol", data=sample_data)

# ## Joint Distributions
#
# Looking at joint distributions of data can also tell us a lot, particularly about redundant features. [Seaborn's PairPlot](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.pairplot.html#seaborn.pairplot) let's us look at joint distributions for many variables at once.


example_numeric_data = sample_data[["fixedAcidity", "volatileAcidity",
                                    "citricAcid", "residualSugar", "Quality"]]
sb.pairplot(example_numeric_data, hue="Quality")
0 commit comments