Algorithms built around BioJava that run on Apache Spark.
https://github.com/sbl-sdsc/mmtf-spark
# Download the full archive, then unpack it into the current directory.
wget http://mmtf.rcsb.org/v1.0/hadoopfiles/full.tar
tar -xvf full.tar
Alternatively, you can get a reduced version that contains only C-alpha, phosphate, and ligand data (~800 MB download):
# Download the reduced archive, then unpack it into the current directory.
wget http://mmtf.rcsb.org/v1.0/hadoopfiles/reduced.tar
tar -xvf reduced.tar
<!-- Maven dependency declaration for biojava-spark 0.2.1; place it inside the <dependencies> section of your pom.xml. -->
<dependency>
<groupId>org.biojava</groupId>
<artifactId>biojava-spark</artifactId>
<version>0.2.1</version>
</dependency>
// Threshold values passed to the resolution and R-free filters below.
// NOTE(review): presumably structures with values above these limits are discarded — confirm against the filter documentation.
float maxResolution = 3.0f;
float maxRfree = 0.3f;
// Build the structure data set from the given path and apply both filters.
StructureDataRDD structureData = new StructureDataRDD("/path/to/file")
.filterResolution(maxResolution)
.filterRfree(maxRfree);
// Find the atoms in the filtered structure data and count them per element.
Map<String, Long> elementCountMap = BiojavaSparkUtils.findAtoms(structureData).countByElement();
// Distance cutoff handed to findContacts. 5.0f is only an example value; adjust it to your analysis.
// NOTE(review): the unit is presumably Angstroms — confirm against the findContacts documentation.
float cutoff = 5.0f;
// Find contacts among atoms matching the selection (atom name CA, element C, in PRO and LYS groups),
// then take the mean of the CA-CA distance distribution.
Double mean = BiojavaSparkUtils.findContacts(structureData,
        new AtomSelectObject()
                .groupNameList(new String[] {"PRO", "LYS"})
                .elementNameList(new String[] {"C"})
                .atomNameList(new String[] {"CA"}),
        cutoff)
        .getDistanceDistOfAtomInts("CA", "CA")
        .mean();
System.out.println("\nMean PRO-LYS CA-CA distance: " + mean);