|
| 1 | +package com.highperformancespark.examples.dataframe; |
| 2 | + |
| 3 | +import org.apache.spark.api.java.JavaRDD; |
| 4 | +import org.apache.spark.api.java.JavaSparkContext; |
| 5 | +import org.apache.spark.sql.Column; |
| 6 | +import org.apache.spark.sql.DataFrame; |
| 7 | +import org.apache.spark.sql.SQLContext; |
| 8 | +import org.apache.spark.sql.expressions.Window; |
| 9 | +import org.apache.spark.sql.expressions.WindowSpec; |
| 10 | +import org.apache.spark.sql.hive.HiveContext; |
| 11 | + |
| 12 | +import java.util.HashMap; |
| 13 | +import java.util.Map; |
| 14 | + |
| 15 | +import static org.apache.spark.sql.functions.*; |
| 16 | + |
| 17 | +public class JavaHappyPandas { |
| 18 | + |
| 19 | + /** |
| 20 | + * Creates SQLContext with an existing SparkContext. |
| 21 | + */ |
| 22 | + public static SQLContext sqlContext(JavaSparkContext jsc) { |
| 23 | + SQLContext sqlContext = new SQLContext(jsc); |
| 24 | + return sqlContext; |
| 25 | + } |
| 26 | + |
| 27 | + /** |
| 28 | + * Creates HiveContext with an existing SparkContext. |
| 29 | + */ |
| 30 | + public static HiveContext hiveContext(JavaSparkContext jsc) { |
| 31 | + HiveContext hiveContext = new HiveContext(jsc); |
| 32 | + return hiveContext; |
| 33 | + } |
| 34 | + |
| 35 | + /** |
| 36 | + * Illustrate loading some JSON data. |
| 37 | + */ |
| 38 | + public static DataFrame loadDataSimple(JavaSparkContext jsc, SQLContext sqlContext, String path) { |
| 39 | + DataFrame df1 = sqlContext.read().json(path); |
| 40 | + |
| 41 | + DataFrame df2 = sqlContext.read().format("json").option("samplingRatio", "1.0").load(path); |
| 42 | + |
| 43 | + JavaRDD<String> jsonRDD = jsc.textFile(path); |
| 44 | + DataFrame df3 = sqlContext.read().json(jsonRDD); |
| 45 | + |
| 46 | + return df1; |
| 47 | + } |
| 48 | + |
| 49 | + public static DataFrame jsonLoadFromRDD(SQLContext sqlContext, JavaRDD<String> input) { |
| 50 | + JavaRDD<String> rdd = input.filter(e -> e.contains("panda")); |
| 51 | + DataFrame df = sqlContext.read().json(rdd); |
| 52 | + return df; |
| 53 | + } |
| 54 | + |
| 55 | + // Here will be some examples on PandaInfo DataFrame |
| 56 | + |
| 57 | + /** |
| 58 | + * Gets the percentage of happy pandas per place. |
| 59 | + * |
| 60 | + * @param pandaInfo the input DataFrame |
| 61 | + * @return Returns DataFrame of (place, percentage of happy pandas) |
| 62 | + */ |
| 63 | + public static DataFrame happyPandasPercentage(DataFrame pandaInfo) { |
| 64 | + DataFrame happyPercentage = pandaInfo.select(pandaInfo.col("place"), |
| 65 | + (pandaInfo.col("happyPandas").divide(pandaInfo.col("totalPandas"))).as("percentHappy")); |
| 66 | + return happyPercentage; |
| 67 | + } |
| 68 | + |
| 69 | + /** |
| 70 | + * Encodes pandaType to Integer values instead of String values. |
| 71 | + * |
| 72 | + * @param pandaInfo the input DataFrame |
| 73 | + * @return Returns a DataFrame of pandaId and integer value for pandaType. |
| 74 | + */ |
| 75 | + public static DataFrame encodePandaType(DataFrame pandaInfo) { |
| 76 | + DataFrame encodedDF = pandaInfo.select(pandaInfo.col("id"), |
| 77 | + when(pandaInfo.col("pt").equalTo("giant"), 0). |
| 78 | + when(pandaInfo.col("pt").equalTo("red"), 1). |
| 79 | + otherwise(2).as("encodedType")); |
| 80 | + |
| 81 | + return encodedDF; |
| 82 | + } |
| 83 | + |
| 84 | + /** |
| 85 | + * Gets places with happy pandas more than minHappinessBound. |
| 86 | + */ |
| 87 | + public static DataFrame minHappyPandas(DataFrame pandaInfo, int minHappyPandas) { |
| 88 | + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(minHappyPandas)); |
| 89 | + } |
| 90 | + |
| 91 | + /** |
| 92 | + * Find pandas that are sad. |
| 93 | + */ |
| 94 | + public static DataFrame sadPandas(DataFrame pandaInfo) { |
| 95 | + return pandaInfo.filter(pandaInfo.col("happy").notEqual(true)); |
| 96 | + } |
| 97 | + |
| 98 | + /** |
| 99 | + * Find pandas that are happy and fuzzier than squishy. |
| 100 | + */ |
| 101 | + public static DataFrame happyFuzzyPandas(DataFrame pandaInfo) { |
| 102 | + DataFrame df = pandaInfo.filter( |
| 103 | + pandaInfo.col("happy").and(pandaInfo.col("attributes").apply(0)).gt(pandaInfo.col("attributes").apply(1)) |
| 104 | + ); |
| 105 | + |
| 106 | + return df; |
| 107 | + } |
| 108 | + |
| 109 | + /** |
| 110 | + * Gets places that contains happy pandas more than unhappy pandas. |
| 111 | + */ |
| 112 | + public static DataFrame happyPandasPlaces(DataFrame pandaInfo) { |
| 113 | + return pandaInfo.filter(pandaInfo.col("happyPandas").geq(pandaInfo.col("totalPandas").divide(2))); |
| 114 | + } |
| 115 | + |
| 116 | + /** |
| 117 | + * Remove duplicate pandas by id. |
| 118 | + */ |
| 119 | + public static DataFrame removeDuplicates(DataFrame pandas) { |
| 120 | + DataFrame df = pandas.dropDuplicates(new String[]{"id"}); |
| 121 | + return df; |
| 122 | + } |
| 123 | + |
| 124 | + public static DataFrame describePandas(DataFrame pandas) { |
| 125 | + return pandas.describe(); |
| 126 | + } |
| 127 | + |
| 128 | + public static DataFrame maxPandaSizePerZip(DataFrame pandas) { |
| 129 | + return pandas.groupBy(pandas.col("zip")).max("pandaSize"); |
| 130 | + } |
| 131 | + |
| 132 | + public static DataFrame minMaxPandaSizePerZip(DataFrame pandas) { |
| 133 | + return pandas.groupBy(pandas.col("zip")).agg(min("pandaSize"), max("pandaSize")); |
| 134 | + } |
| 135 | + |
| 136 | + public static DataFrame minPandaSizeMaxAgePerZip(DataFrame pandas) { |
| 137 | + Map<String, String> map = new HashMap<>(); |
| 138 | + map.put("pandaSize", "min"); |
| 139 | + map.put("age", "max"); |
| 140 | + |
| 141 | + DataFrame df = pandas.groupBy(pandas.col("zip")).agg(map); |
| 142 | + return df; |
| 143 | + } |
| 144 | + |
| 145 | + public static DataFrame minMeanSizePerZip(DataFrame pandas) { |
| 146 | + return pandas.groupBy(pandas.col("zip")).agg(min(pandas.col("pandaSize")), mean(pandas.col("pandaSize"))); |
| 147 | + } |
| 148 | + |
| 149 | + public static DataFrame simpleSqlExample(DataFrame pandas) { |
| 150 | + SQLContext sqlContext = pandas.sqlContext(); |
| 151 | + pandas.registerTempTable("pandas"); |
| 152 | + |
| 153 | + DataFrame miniPandas = sqlContext.sql("SELECT * FROM pandas WHERE pandaSize < 12"); |
| 154 | + return miniPandas; |
| 155 | + } |
| 156 | + |
| 157 | + /** |
| 158 | + * Orders pandas by size ascending and by age descending. |
| 159 | + * Pandas will be sorted by "size" first and if two pandas |
| 160 | + * have the same "size" will be sorted by "age". |
| 161 | + */ |
| 162 | + public static DataFrame orderPandas(DataFrame pandas) { |
| 163 | + return pandas.orderBy(pandas.col("pandaSize").asc(), pandas.col("age").desc()); |
| 164 | + } |
| 165 | + |
| 166 | + public static DataFrame computeRelativePandaSizes(DataFrame pandas) { |
| 167 | + //tag::relativePandaSizesWindow[] |
| 168 | + WindowSpec windowSpec = Window |
| 169 | + .orderBy(pandas.col("age")) |
| 170 | + .partitionBy(pandas.col("zip")) |
| 171 | + .rowsBetween(-10, 10); // can use rangeBetween for range instead |
| 172 | + //end::relativePandaSizesWindow[] |
| 173 | + |
| 174 | + //tag::relativePandaSizesQuery[] |
| 175 | + Column pandaRelativeSizeCol = pandas.col("pandaSize").minus(avg(pandas.col("pandaSize")).over(windowSpec)); |
| 176 | + |
| 177 | + return pandas.select(pandas.col("name"), pandas.col("zip"), pandas.col("pandaSize"), |
| 178 | + pandas.col("age"), pandaRelativeSizeCol.as("panda_relative_size")); |
| 179 | + //end::relativePandaSizesQuery[] |
| 180 | + } |
| 181 | + |
| 182 | + public static void joins(DataFrame df1, DataFrame df2) { |
| 183 | + //tag::innerJoin[] |
| 184 | + // Inner join implicit |
| 185 | + df1.join(df2, df1.col("name").equalTo(df2.col("name"))); |
| 186 | + // Inner join explicit |
| 187 | + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "inner"); |
| 188 | + //end::innerJoin[] |
| 189 | + |
| 190 | + //tag::leftouterJoin[] |
| 191 | + // Left outer join explicit |
| 192 | + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "left_outer"); |
| 193 | + //end::leftouterJoin[] |
| 194 | + |
| 195 | + //tag::rightouterJoin[] |
| 196 | + // Right outer join explicit |
| 197 | + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "right_outer"); |
| 198 | + //end::rightouterJoin[] |
| 199 | + |
| 200 | + //tag::leftsemiJoin[] |
| 201 | + // Left semi join explicit |
| 202 | + df1.join(df2, df1.col("name").equalTo(df2.col("name")), "leftsemi"); |
| 203 | + //end::leftsemiJoin[] |
| 204 | + } |
| 205 | + |
| 206 | + public static DataFrame selfJoin(DataFrame df) { |
| 207 | + return (df.as("a")).join(df.as("b")).where("a.name = b.name"); |
| 208 | + } |
| 209 | + |
| 210 | +} |
0 commit comments