Skip to content
This repository has been archived by the owner on May 12, 2021. It is now read-only.

[WIP][PIO-209] Upgrade Spark to 2.4 for pre-built binary distribution #521

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
7 changes: 2 additions & 5 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,7 @@ env:
METADATA_REP=PGSQL EVENTDATA_REP=PGSQL MODELDATA_REP=PGSQL
- BUILD_TYPE=Integration
METADATA_REP=ELASTICSEARCH EVENTDATA_REP=ELASTICSEARCH MODELDATA_REP=S3
PIO_ELASTICSEARCH_VERSION=5.6.9
- BUILD_TYPE=Integration
METADATA_REP=ELASTICSEARCH EVENTDATA_REP=ELASTICSEARCH MODELDATA_REP=S3
PIO_ELASTICSEARCH_VERSION=6.4.2
PIO_ELASTICSEARCH_VERSION=6.8.1
- BUILD_TYPE=Integration
METADATA_REP=ELASTICSEARCH EVENTDATA_REP=HBASE MODELDATA_REP=LOCALFS
PIO_HBASE_VERSION=1.2.6
Expand Down Expand Up @@ -101,7 +98,7 @@ env:
- BUILD_TYPE=Integration
METADATA_REP=PGSQL EVENTDATA_REP=PGSQL MODELDATA_REP=HDFS
PIO_SCALA_VERSION=2.11.12
PIO_SPARK_VERSION=2.4.0
PIO_SPARK_VERSION=2.4.3
PIO_HADOOP_VERSION=2.7.7

- BUILD_TYPE=LicenseCheck
Expand Down
7 changes: 2 additions & 5 deletions LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1530,7 +1530,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Binary distribution bundles

com.thoughtworks.paranamer # paranamer # 2.3 (https://github.com/paul-hammant/paranamer)
com.thoughtworks.paranamer # paranamer # 2.6 (https://github.com/paul-hammant/paranamer)
com.thoughtworks.paranamer # paranamer # 2.8 (https://github.com/paul-hammant/paranamer)

which is available under the BSD license (http://www.opensource.org/licenses/bsd-license.php)

Expand Down Expand Up @@ -1701,15 +1701,12 @@ EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Binary distribution bundles

org.scala-lang # scala-library # 2.11.12 (http://scala-lang.org/)
org.scala-lang # scala-compiler # 2.11.12 (http://scala-lang.org/)
org.scala-lang # scala-reflect # 2.11.12 (http://scala-lang.org/)
org.scala-lang # scalap # 2.11.12 (http://scala-lang.org/)
org.scala-lang.modules # scala-java8-compat_2.11 # 0.7.0 (http://scala-lang.org/)
org.scala-lang.modules # scala-parser-combinators_2.11 # 1.0.6 (http://scala-lang.org/)
org.scala-lang.modules # scala-parser-combinators_2.11 # 1.1.0 (http://scala-lang.org/)
org.scala-lang.modules # scala-xml_2.11 # 1.0.5 (http://scala-lang.org/)
org.scala-lang.modules # scala-xml_2.11 # 1.0.6 (http://scala-lang.org/)

which is available under the BSD license (http://www.scala-lang.org/downloads/license.html)

Copyright (c) 2002-2017 EPFL
Expand Down
7 changes: 3 additions & 4 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ scalaVersion in ThisBuild := sys.props.getOrElse("scala.version", "2.11.12")

scalaBinaryVersion in ThisBuild := binaryVersion(scalaVersion.value)

crossScalaVersions in ThisBuild := Seq("2.11.12")
crossScalaVersions in ThisBuild := Seq(scalaVersion.value)

scalacOptions in ThisBuild ++= Seq("-deprecation", "-unchecked", "-feature")

Expand All @@ -36,16 +36,15 @@ fork in (ThisBuild, run) := true
javacOptions in (ThisBuild, compile) ++= Seq("-source", "1.8", "-target", "1.8",
"-Xlint:deprecation", "-Xlint:unchecked")

// Ignore differentiation of Spark patch levels
sparkVersion in ThisBuild := sys.props.getOrElse("spark.version", "2.1.3")
sparkVersion in ThisBuild := sys.props.getOrElse("spark.version", "2.4.3")

sparkBinaryVersion in ThisBuild := binaryVersion(sparkVersion.value)

hadoopVersion in ThisBuild := sys.props.getOrElse("hadoop.version", "2.7.7")

akkaVersion in ThisBuild := sys.props.getOrElse("akka.version", "2.5.17")

elasticsearchVersion in ThisBuild := sys.props.getOrElse("elasticsearch.version", "5.6.9")
elasticsearchVersion in ThisBuild := sys.props.getOrElse("elasticsearch.version", "6.8.1")

hbaseVersion in ThisBuild := sys.props.getOrElse("hbase.version", "1.2.6")

Expand Down
9 changes: 4 additions & 5 deletions conf/pio-env.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@
# you need to change these to fit your site.

# SPARK_HOME: Apache Spark is a hard dependency and must be configured.
# SPARK_HOME=$PIO_HOME/vendors/spark-2.0.2-bin-hadoop2.7
SPARK_HOME=$PIO_HOME/vendors/spark-2.1.1-bin-hadoop2.6
SPARK_HOME=$PIO_HOME/vendors/spark-2.4.3-bin-hadoop2.7

POSTGRES_JDBC_DRIVER=$PIO_HOME/lib/postgresql-42.0.0.jar
MYSQL_JDBC_DRIVER=$PIO_HOME/lib/mysql-connector-java-5.1.41.jar
Expand All @@ -40,7 +39,7 @@ MYSQL_JDBC_DRIVER=$PIO_HOME/lib/mysql-connector-java-5.1.41.jar

# HBASE_CONF_DIR: You must configure this if you intend to run PredictionIO
# with HBase on a remote cluster.
# HBASE_CONF_DIR=$PIO_HOME/vendors/hbase-1.0.0/conf
# HBASE_CONF_DIR=$PIO_HOME/vendors/hbase-1.2.6/conf

# Filesystem paths where PredictionIO uses as block storage.
PIO_FS_BASEDIR=$HOME/.pio_store
Expand Down Expand Up @@ -89,7 +88,7 @@ PIO_STORAGE_SOURCES_PGSQL_PASSWORD=pio
# PIO_STORAGE_SOURCES_ELASTICSEARCH_HOSTS=localhost
# PIO_STORAGE_SOURCES_ELASTICSEARCH_PORTS=9200
# PIO_STORAGE_SOURCES_ELASTICSEARCH_SCHEMES=http
# PIO_STORAGE_SOURCES_ELASTICSEARCH_HOME=$PIO_HOME/vendors/elasticsearch-5.6.9
# PIO_STORAGE_SOURCES_ELASTICSEARCH_HOME=$PIO_HOME/vendors/elasticsearch-6.8.1
# Optional basic HTTP auth
# PIO_STORAGE_SOURCES_ELASTICSEARCH_USERNAME=my-name
# PIO_STORAGE_SOURCES_ELASTICSEARCH_PASSWORD=my-secret
Expand All @@ -100,7 +99,7 @@ PIO_STORAGE_SOURCES_PGSQL_PASSWORD=pio

# HBase Example
# PIO_STORAGE_SOURCES_HBASE_TYPE=hbase
# PIO_STORAGE_SOURCES_HBASE_HOME=$PIO_HOME/vendors/hbase-1.0.0
# PIO_STORAGE_SOURCES_HBASE_HOME=$PIO_HOME/vendors/hbase-1.2.6

# AWS S3 Example
# PIO_STORAGE_SOURCES_S3_TYPE=s3
Expand Down
4 changes: 2 additions & 2 deletions conf/pio-vendors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ if [ -z "$PIO_SCALA_VERSION" ]; then
fi

if [ -z "$PIO_SPARK_VERSION" ]; then
PIO_SPARK_VERSION="2.1.3"
PIO_SPARK_VERSION="2.4.3"
fi

if [ -z "$PIO_HADOOP_VERSION" ]; then
PIO_HADOOP_VERSION="2.7.7"
fi

if [ -z "$PIO_ELASTICSEARCH_VERSION" ]; then
PIO_ELASTICSEARCH_VERSION="5.6.9"
PIO_ELASTICSEARCH_VERSION="6.8.1"
fi

if [ -z "$PIO_HBASE_VERSION" ]; then
Expand Down
6 changes: 3 additions & 3 deletions docs/manual/data/versions.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
pio: 0.14.0
spark: 2.4.0
spark_download_filename: spark-2.4.0-bin-hadoop2.7
elasticsearch_download_filename: elasticsearch-5.6.9
spark: 2.4.3
spark_download_filename: spark-2.4.3-bin-hadoop2.7
elasticsearch_download_filename: elasticsearch-6.8.1
hbase_version: 1.2.6
hbase_basename: hbase-1.2.6
hbase_variant: bin
52 changes: 16 additions & 36 deletions docs/manual/source/install/index.html.md.erb
Original file line number Diff line number Diff line change
Expand Up @@ -21,54 +21,34 @@ limitations under the License.

## Prerequisites

It is **very important** to meet the minimum version of the following
It is **very important** to meet the version of the following
technologies that power Apache PredictionIO®.

* Apache Hadoop 2.6.5 (optional, required only if YARN and HDFS are needed)
* Apache Spark 2.0.2 for Hadoop 2.6
* Java SE Development Kit 8
* Apache Spark 2.0+
* Apache Hadoop 2.6, 2.7

and one of the following sets:

* PostgreSQL 9.1

or

* MySQL 5.1

or

* Apache HBase 0.98.5
* Elasticsearch 5.6.9

WARNING: **Note that support for Scala 2.10 and Spark 1.6 were removed as of PredictionIO 0.14.0.**

If you are running on a single machine, we recommend a minimum of 2GB memory.

INFO: If you are using Linux, Apache Spark local mode, which is the default
operation mode without further configuration, may not work. In that case,
configure your Apache Spark to run in [standalone cluster
mode](http://spark.apache.org/docs/latest/spark-standalone.html).
* PostgreSQL 9.6 or MySQL 5.1
* Apache HBase 1.2
* Elasticsearch 6.x, 5.6(deprecated)

## Installation

* [Installing Apache PredictionIO](install-sourcecode.html)
Pre-built for the following versions

You may also use Docker to install Apache PredictionIO®
* Scala 2.11
* Apache Spark 2.4
* Apache Hadoop 2.7
* Elasticsearch 6.8

* [Installing Apache PredictionIO with Docker](install-docker.html)


[//]: # (* *(coming soon)* Installing Apache PredictionIO with Homebrew)
* [Downloading Binary Distribution](install-sourcecode.html#downloading-binary-distribution)

Building Apache PredictionIO

* [Downloading Source Code](install-sourcecode.html#downloading-source-code)

WARNING: **0.8.2 contains schema changes from the previous versions, if you have
installed the previous versions, you may need to clear both HBase and
Elasticsearch. See more [here](/resources/upgrade/).**
Docker


[//]: # (## Production Deployment)

[//]: # (For production environment setup, please refer to [Production)
[//]: # (Deployment](/production/deploy.html) guide.)
* [Installing Apache PredictionIO with Docker](install-docker.html)
19 changes: 0 additions & 19 deletions docs/manual/source/install/install-sourcecode.html.md.erb
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,6 @@ replace `/home/abc` with your own home directory wherever you see it.

## Downloading Binary Distribution

You can use pre-built binary distribution for Apache PredictionIO® if you are
building against

* Scala 2.11.12
* Spark 2.1.3
* Hadoop 2.7.7
* Elasticsearch 5.6.9

Download [binary release from an Apache
mirror](https://www.apache.org/dyn/closer.lua/predictionio/<%= data.versions.pio
%>/apache-predictionio-<%= data.versions.pio %>-bin.tar.gz).
Expand Down Expand Up @@ -127,17 +119,6 @@ Extract the binary distribution you have just built.
$ tar zxvf PredictionIO-<%= data.versions.pio %>.tar.gz
```

### Building against Different Versions of Dependencies

Starting from version 0.11.0, PredictionIO can be built against different
versions of dependencies. As of writing, one could build PredictionIO against
these different dependencies:

* Scala 2.11.x
* Spark 2.0.x, 2.1.x, 2.2.x, 2.3.x, 2.4.x
* Hadoop 2.6.x, 2.7.x
* Elasticsearch 5.6.x, 6.x

## Installing Dependencies

Let us install dependencies inside a subdirectory of the Apache PredictionIO
Expand Down
5 changes: 2 additions & 3 deletions docs/manual/source/partials/shared/install/_postgres.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,5 @@ $ psql -c "create user pio with password 'pio'"
Starting from 0.11.0, PredictionIO no longer bundles JDBC drivers. Download the
PostgreSQL JDBC driver from the [official web
site](https://jdbc.postgresql.org/), and put the JAR file in the `lib`
subdirectory. By default, `conf/pio-env.sh` assumes version 42.0.0 JDBC 4.2. If
you use a different version, modify `POSTGRES_JDBC_DRIVER` to point to the
correct JAR.
subdirectory. Afterwords, you need to edit `conf/pio-env.sh` and change the
`POSTGRES_JDBC_DRIVER` variable to point to the correct JAR.
2 changes: 1 addition & 1 deletion project/assembly.sbt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")
2 changes: 1 addition & 1 deletion project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version=1.2.3
sbt.version=1.2.8
6 changes: 3 additions & 3 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")

addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.2")

addSbtPlugin("com.typesafe.sbt" % "sbt-twirl" % "1.3.15")
addSbtPlugin("com.typesafe.sbt" % "sbt-twirl" % "1.4.1")

addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.5")

addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")

resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")

addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.6")
addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.22")

addSbtPlugin("com.typesafe.sbt" % "sbt-license-report" % "1.2.0")
17 changes: 4 additions & 13 deletions storage/elasticsearch/build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,20 @@ import PIOBuild._

name := "apache-predictionio-data-elasticsearch"

elasticsearchVersion := (if (majorVersion(elasticsearchVersion.value) < 5) "5.6.9" else elasticsearchVersion.value)

libraryDependencies ++= Seq(
"org.apache.predictionio" %% "apache-predictionio-core" % version.value % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion.value % "provided",
"org.apache.predictionio" %% "apache-predictionio-core" % version.value % "provided",
"org.apache.spark" %% "spark-core" % sparkVersion.value % "provided",
"org.elasticsearch.client" % "elasticsearch-rest-client" % elasticsearchVersion.value,
"org.elasticsearch" %% "elasticsearch-spark-20" % elasticsearchVersion.value
"org.elasticsearch" %% "elasticsearch-spark-20" % elasticsearchVersion.value
exclude("org.apache.spark", "*"),
"org.elasticsearch" % "elasticsearch-hadoop-mr" % elasticsearchVersion.value,
"org.specs2" %% "specs2" % "2.3.13" % "test")
"org.specs2" %% "specs2" % "2.3.13" % "test")

parallelExecution in Test := false

pomExtra := childrenPomExtra.value

assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)

assemblyShadeRules in assembly := Seq(
ShadeRule.rename("org.apache.http.**" ->
"org.apache.predictionio.shaded.org.apache.http.@1").inAll,
ShadeRule.rename("org.elasticsearch.client.**" ->
"org.apache.predictionio.shaded.org.elasticsearch.client.@1").inAll)

// skip test in assembly
test in assembly := {}

Expand Down
Loading