Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions kite-data/kite-data-gcs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Kite Data GCS Module
--------

To run the tests, first configure the bucket to use:

export GCS_BUCKET=test-bucket

To run the tests outside a GCE VM, also point to a credentials file:

export GOOGLE_APPLICATION_CREDENTIALS=/path/to/credentials.json
200 changes: 200 additions & 0 deletions kite-data/kite-data-gcs/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2013 Cloudera Inc.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

<modelVersion>4.0.0</modelVersion>
<artifactId>kite-data-gcs</artifactId>

<parent>
<groupId>org.kitesdk</groupId>
<artifactId>kite-data</artifactId>
<version>1.1.1-SNAPSHOT</version>
</parent>

<name>Kite Data GCS Module</name>
<description>
The Kite Data GCS module provides tools for storing Kite datasets in Google Cloud Storage.
</description>

<!--
  Build plugins for the GCS module. No plugin version is declared in this
  section, so versions (and any shared configuration) must be supplied from
  outside this file, presumably by the parent POM's pluginManagement.
-->
<build>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-compiler-plugin</artifactId>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-shade-plugin</artifactId>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-source-plugin</artifactId>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-javadoc-plugin</artifactId>
    </plugin>
    <plugin>
      <groupId>org.apache.rat</groupId>
      <artifactId>apache-rat-plugin</artifactId>
    </plugin>
    <!--
      Declared exactly once: Maven requires each groupId:artifactId pair to
      be unique within build/plugins and warns about duplicate declarations.
    -->
    <plugin>
      <groupId>org.codehaus.mojo</groupId>
      <artifactId>findbugs-maven-plugin</artifactId>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-surefire-plugin</artifactId>
      <configuration>
        <!--
          Forward the test.google.gcs.bucket build property to the test JVM
          as a system property of the same name.
        -->
        <systemPropertyVariables>
          <test.google.gcs.bucket>${test.google.gcs.bucket}</test.google.gcs.bucket>
        </systemPropertyVariables>
      </configuration>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-antrun-plugin</artifactId>
      <executions>
        <execution>
          <!--
            Runs during the compile phase. The echo text mentions an empty
            javadoc JAR, but the only action taken here is creating the
            target/apidocs directory.
          -->
          <phase>compile</phase>
          <goals>
            <goal>run</goal>
          </goals>
          <configuration>
            <tasks>
              <echo message="Create empty javadoc JAR to satisfy Maven central" />
              <mkdir dir="target/apidocs" />
            </tasks>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>

<!--
  Reporting plugins are listed explicitly in this module because they do not
  appear to be picked up from the parent POM's pluginManagement, even though
  the Maven documentation suggests that inheritance should work.
-->
<reporting>
  <plugins>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-project-info-reports-plugin</artifactId>
      <reportSets>
        <reportSet>
          <!-- This report set is not passed on to child modules. -->
          <inherited>false</inherited>
          <reports>
            <report>index</report>
            <report>summary</report>
            <report>dependency-info</report>
            <report>dependencies</report>
          </reports>
        </reportSet>
      </reportSets>
    </plugin>
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-javadoc-plugin</artifactId>
    </plugin>
  </plugins>
</reporting>

<!--
  Module dependencies. Entries without a version element rely on a version
  being managed outside this file (presumably the parent POM).
-->
<dependencies>
  <!-- Kite core data API -->
  <dependency>
    <groupId>org.kitesdk</groupId>
    <artifactId>kite-data-core</artifactId>
    <version>${project.parent.version}</version>
  </dependency>

  <!-- Hadoop: the artifact id is resolved from the artifact.hadoop-deps property -->
  <dependency>
    <groupId>org.kitesdk</groupId>
    <artifactId>${artifact.hadoop-deps}</artifactId>
    <type>pom</type>
    <scope>provided</scope>
  </dependency>
  <dependency>
    <groupId>org.kitesdk</groupId>
    <artifactId>kite-hadoop-compatibility</artifactId>
  </dependency>

  <!-- Data formats -->
  <dependency>
    <groupId>org.apache.avro</groupId>
    <artifactId>avro</artifactId>
  </dependency>

  <!-- Google Cloud Storage connector for Hadoop; provided scope, so it is not bundled -->
  <dependency>
    <groupId>com.google.cloud.bigdataoss</groupId>
    <artifactId>gcs-connector</artifactId>
    <version>${vers.hadoop-gcs}</version>
    <scope>provided</scope>
  </dependency>

  <!-- Miscellaneous -->
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
  </dependency>
  <!-- Guava version is taken from the GCS-specific vers.guava.gcs property -->
  <dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>${vers.guava.gcs}</version>
  </dependency>
  <dependency>
    <groupId>com.google.code.findbugs</groupId>
    <artifactId>jsr305</artifactId>
    <scope>provided</scope>
    <optional>true</optional>
  </dependency>
  <dependency>
    <groupId>com.google.code.findbugs</groupId>
    <artifactId>annotations</artifactId>
    <scope>provided</scope>
  </dependency>

  <!-- Test-only dependencies -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <scope>test</scope>
  </dependency>

  <!-- Test classes published by kite-data-core (test-jar) -->
  <dependency>
    <groupId>org.kitesdk</groupId>
    <artifactId>kite-data-core</artifactId>
    <version>${project.parent.version}</version>
    <type>test-jar</type>
    <scope>test</scope>
  </dependency>

  <dependency>
    <groupId>org.kitesdk</groupId>
    <artifactId>${artifact.hadoop-test-deps}</artifactId>
    <type>pom</type>
    <scope>test</scope>
  </dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* Copyright 2015 Cloudera.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.kitesdk.data.spi.gcs;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.kitesdk.data.DatasetIOException;
import org.kitesdk.data.DatasetOperationException;
import org.kitesdk.data.spi.DatasetRepository;
import org.kitesdk.data.spi.DefaultConfiguration;
import org.kitesdk.data.spi.Loadable;
import org.kitesdk.data.spi.OptionBuilder;
import org.kitesdk.data.spi.Registration;
import org.kitesdk.data.spi.URIPattern;
import org.kitesdk.data.spi.filesystem.FileSystemDatasetRepository;

/**
 * A {@link Loadable} implementation that registers {@code gs:} repository and
 * dataset URIs so they are served by a {@code FileSystemDatasetRepository}.
 */
public class Loader implements Loadable {

  private static final int UNSPECIFIED_PORT = -1;

  /** Hadoop {@code FileSystem} implementation class for the {@code gs} scheme. */
  private static final String GOOGLE_HADOOP_FILE_SYSTEM =
      "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem";

  /**
   * Hadoop {@code AbstractFileSystem} implementation class for the {@code gs}
   * scheme. This is a different class from the {@code FileSystem}
   * implementation above; the GCS connector documents {@code GoogleHadoopFS}
   * as the value for {@code fs.AbstractFileSystem.gs.impl}.
   */
  private static final String GOOGLE_HADOOP_ABSTRACT_FILE_SYSTEM =
      "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS";

  /**
   * This class builds configured instances of
   * {@code FileSystemDatasetRepository} from a Map of options. This is for the
   * URI system.
   */
  private static class URIBuilder implements OptionBuilder<DatasetRepository> {

    /**
     * Builds a repository rooted at the matched {@code path} option (or at
     * {@code /} when the path is missing or empty) on the file system
     * identified by the matched URI components.
     *
     * @param match options extracted from the matched URI
     * @return a repository whose root directory is qualified against the
     *         resolved file system
     * @throws DatasetIOException if the file system cannot be obtained
     */
    @Override
    public DatasetRepository getFromOptions(Map<String, String> match) {
      String path = match.get("path");
      final Path root = (path == null || path.isEmpty()) ?
          new Path("/") : new Path("/", path);

      Configuration conf = DefaultConfiguration.get();
      FileSystem fs;
      try {
        fs = FileSystem.get(fileSystemURI(match), conf);
      } catch (IOException e) {
        // An "Incomplete ..." message gets a more helpful suggestion. The
        // message may be null, so guard before inspecting it; otherwise a
        // NullPointerException would hide the original failure.
        String message = e.getMessage();
        if (message != null && message.startsWith("Incomplete")) {
          throw new DatasetIOException("Could not get a FileSystem: " +
              "make sure the credentials for " + match.get(URIPattern.SCHEME) +
              " URIs are configured.", e);
        }
        throw new DatasetIOException("Could not get a FileSystem", e);
      }
      return new FileSystemDatasetRepository.Builder()
          .configuration(new Configuration(conf)) // make a modifiable copy
          .rootDirectory(fs.makeQualified(root))
          .build();
    }
  }

  /**
   * Sets the GCS connector properties on the default configuration and
   * registers the {@code gs:} repository and dataset URI patterns.
   *
   * @throws DatasetIOException if the local file system cannot be loaded
   */
  @Override
  public void load() {
    try {
      // Map the gs scheme to the GCS connector classes and enable
      // service-account authentication on the default configuration.
      Configuration config = DefaultConfiguration.get();
      config.set("fs.gs.impl", GOOGLE_HADOOP_FILE_SYSTEM);
      config.set("fs.AbstractFileSystem.gs.impl",
          GOOGLE_HADOOP_ABSTRACT_FILE_SYSTEM);
      config.set("google.cloud.auth.service.account.enable", "true");

      // NOTE(review): presumably called to force Hadoop's FileSystem
      // initialization with this configuration; the result is unused.
      FileSystem.getLocal(config);
    } catch (IOException e) {
      throw new DatasetIOException("Cannot load default config", e);
    }

    OptionBuilder<DatasetRepository> builder = new URIBuilder();

    // first pattern: repository URIs; second pattern: dataset URIs
    Registration.register(
        new URIPattern("gs:/*path"),
        new URIPattern("gs:/*path/:namespace/:dataset"),
        builder);
  }

  /**
   * Builds the file system URI (scheme, optional user info, host, optional
   * port, and the path {@code /}) from the matched URI components.
   *
   * @param match options extracted from the matched URI
   * @return the URI identifying the file system
   * @throws DatasetOperationException if the components do not form a valid URI
   */
  private static URI fileSystemURI(Map<String, String> match) {
    final String userInfo;
    if (match.containsKey(URIPattern.USERNAME)) {
      if (match.containsKey(URIPattern.PASSWORD)) {
        userInfo = match.get(URIPattern.USERNAME) + ":" +
            match.get(URIPattern.PASSWORD);
      } else {
        userInfo = match.get(URIPattern.USERNAME);
      }
    } else {
      userInfo = null;
    }
    try {
      int port = UNSPECIFIED_PORT;
      if (match.containsKey(URIPattern.PORT)) {
        try {
          port = Integer.parseInt(match.get(URIPattern.PORT));
        } catch (NumberFormatException e) {
          // best effort: a non-numeric port is treated as unspecified
          port = UNSPECIFIED_PORT;
        }
      }
      return new URI(match.get(URIPattern.SCHEME), userInfo,
          match.get(URIPattern.HOST), port, "/", null, null);
    } catch (URISyntaxException ex) {
      throw new DatasetOperationException("[BUG] Could not build FS URI", ex);
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#
# Copyright 2015 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
org.kitesdk.data.spi.gcs.Loader
Loading