Skip to content

Commit bdbc99b

Browse files
author
Michael Gregory
committed
initial
0 parents  commit bdbc99b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

54 files changed

+12567
-0
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
R
node_modules
*.pyc
__pycache__
.*
!.gitignore

0_DE_data_prep.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#Fetch raw data and create an external Hive table in CDP/CML.
2+
3+
#Data taken from http://...WineNewGBTDataSet.csv
4+
#wget http://.../WineNewGBTDataSet.csv; aws s3 cp WineNewGBTDataSet.csv s3://ml-field/demo/wine/; rm WineNewGBTDataSet.csv
5+
6+
7+
from pyspark.sql import SparkSession
8+
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
9+
10+
spark = SparkSession.builder\
11+
.appName("Setup Wine Table")\
12+
.config("spark.yarn.access.hadoopFileSystems","s3a://ml-field/demo/wine/")\
13+
.getOrCreate()
14+
15+
spark.sql("SHOW databases").show()
16+
spark.sql("USE default")
17+
spark.sql("SHOW tables").show()
18+
19+
20+
#spark.sql("DROP TABLE IF EXISTS wine").show()
21+
statement = '''
22+
CREATE EXTERNAL TABLE IF NOT EXISTS `default`.`wine` (
23+
`fixedAcidity` double ,
24+
`volatileAcidity` double ,
25+
`citricAcid` double ,
26+
`residualSugar` double ,
27+
`chlorides` double ,
28+
`freeSulfurDioxide` double ,
29+
`totalSulfurDioxide` double ,
30+
`density` double ,
31+
`pH` double ,
32+
`sulphates` double ,
33+
`Alcohol` double ,
34+
`Quality` string )
35+
ROW FORMAT DELIMITED FIELDS TERMINATED BY ';'
36+
STORED AS TextFile
37+
LOCATION 's3a://ml-field/demo/wine/'
38+
'''
39+
spark.sql(statement)
40+
41+
spark.sql("DESCRIBE TABLE EXTENDED `default`.`wine`").show()
42+
spark.sql("SELECT * FROM `default`.`wine` LIMIT 5").take(5)
43+
44+
spark.stop()

1_DS_analysis.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# # Load the data
2+
3+
from pyspark.sql import SparkSession
4+
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
5+
6+
spark = SparkSession.builder\
7+
.appName("Import Wine Table")\
8+
.config("spark.yarn.access.hadoopFileSystems","s3a://ml-field/demo/wine/")\
9+
.getOrCreate()
10+
11+
wine_data_raw = spark.sql("SELECT * FROM `default`.`wine`")
12+
wine_data_raw.show(3)
13+
14+
# ### Basic DataFrame operations
15+
# Dataframes essentially allow you to express sql-like statements.
16+
# We can filter, count, and so on.
17+
# Documentation - (http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframe-operations)
18+
19+
"number of lines in dataset : {:d}".format(wine_data_raw.count())
20+
21+
# ### Spark SQL - manipulate data as if it was a table
22+
wine_data_raw.createOrReplaceTempView("wine")
23+
spark.sql("select distinct(Quality), count(*) from wine GROUP BY Quality").show()
24+
25+
26+
# ### Remove invalid data
27+
wine_data = wine_data_raw.filter(wine_data_raw.Quality != "1")
28+
total_wines = wine_data.count()
29+
good_wines = wine_data.filter(wine_data.Quality == 'Excellent').count()
30+
good_wines = wine_data.filter(wine_data.Quality == 'Poor').count()
31+
32+
"Wines total: {}, Good : {}, Poor : {}".format(total_wines,good_wines,good_wines )
33+
34+
35+
# # 2. Data visualisation ( using mathplotlib and Seaborn)
36+
# ## Feature Visualization
37+
#
38+
# The data vizualization workflow for large data sets is usually:
39+
#
40+
# * Sample data so it fits in memory on a single machine.
41+
# * Examine single variable distributions.
42+
# * Examine joint distributions and correlations.
43+
# * Look for other types of relationships.
44+
#
45+
# [DataFrame#sample() documentation](http://people.apache.org/~pwendell/spark-releases/spark-1.5.0-rc1-docs/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sample)
46+
47+
# ### Note: toPandas() => brings data localy !!!
48+
sample_data = wine_data.sample(False, 0.5, 83).toPandas()
49+
sample_data.transpose().head(21)
50+
51+
spark.stop()
52+
53+
54+
# ## Feature Distributions
55+
#
56+
# We want to examine the distribution of our features, so start with them one at a time.
57+
#
58+
# Seaborn has a standard function called [dist()](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.distplot.html#seaborn.distplot) that allows us to easily examine the distribution of a column of a pandas dataframe or a numpy array.
59+
60+
get_ipython().magic(u'matplotlib inline')
61+
import matplotlib.pyplot as plt
62+
import seaborn as sb
63+
64+
sb.distplot(sample_data['Alcohol'], kde=False)
65+
66+
# We can examine feature differences in the distribution of our features when we condition (split) our data.
67+
#
68+
# [BoxPlot docs](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.boxplot.html)
69+
70+
71+
sb.boxplot(x="Quality", y="Alcohol", data=sample_data)
72+
73+
# ## Joint Distributions
74+
#
75+
# Looking at joint distributions of data can also tell us a lot, particularly about redundant features. [Seaborn's PairPlot](http://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.pairplot.html#seaborn.pairplot) let's us look at joint distributions for many variables at once.
76+
77+
78+
example_numeric_data = sample_data[["fixedAcidity", "volatileAcidity",
79+
"citricAcid", "residualSugar", "Quality"]]
80+
sb.pairplot(example_numeric_data, hue="Quality")

0 commit comments

Comments
 (0)