Class RDDConverterUtils
- java.lang.Object
- 
- org.apache.sysds.runtime.instructions.spark.utils.RDDConverterUtils
 
- 
 public class RDDConverterUtils extends Object 
- 
- 
Nested Class Summary - Nested Classes (Modifier and Type | Class | Description): static class RDDConverterUtils.BinaryCellToBinaryBlockFunction; static class RDDConverterUtils.DataFrameExtractIDFunction
 - 
Field Summary - Fields (Modifier and Type | Field | Description): static String DF_ID_COLUMN
 - 
Constructor Summary - Constructors (Constructor | Description): RDDConverterUtils()
 - 
Method Summary - All Methods | Static Methods | Concrete Methods | Deprecated Methods (Modifier and Type | Method | Description):
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryBlockToBinaryBlock(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, DataCharacteristics mcOut)
- static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
- static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc, boolean toVector)
- static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc, boolean toVector) - Deprecated.
- static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in) - Converter from binary block rdd to rdd of labeled points.
- static org.apache.spark.api.java.JavaRDD<String> binaryBlockToLibsvm(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesLIBSVM props, boolean strict)
- static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc)
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input, DataCharacteristics mcOut, boolean outputEmptyBlocks)
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
- static void libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, String pathIn, String pathX, String pathY, DataCharacteristics mcOutX) - Converts a libsvm text input file into two binary block matrices for features and labels, and saves these to the specified output files.
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, String delim, String indexDelim)
- static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> stringToSerializableText(org.apache.spark.api.java.JavaPairRDD<Long,String> in)
- static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mcOut, boolean outputEmptyBlocks, FileFormatPropertiesMM mmProps)
 
- 
- 
- 
Field Detail- 
DF_ID_COLUMN
public static final String DF_ID_COLUMN
- See Also:
- Constant Field Values
 
 
- 
 - 
Method Detail- 
textCellToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mcOut, boolean outputEmptyBlocks, FileFormatPropertiesMM mmProps)
 - 
binaryCellToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input, DataCharacteristics mcOut, boolean outputEmptyBlocks)
 - 
binaryBlockToLabeledPoints
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in)
Converts a binary block RDD to an RDD of labeled points. Note that the input needs to be reblocked to satisfy the 'clen <= blen' constraint.
- Parameters:
- in - matrix as JavaPairRDD<MatrixIndexes, MatrixBlock>
- Returns:
- JavaRDD of labeled points
 
 - 
binaryBlockToTextCell
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc)
 - 
binaryBlockToCsv
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
 - 
binaryBlockToLibsvm
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToLibsvm(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, FileFormatPropertiesLIBSVM props, boolean strict)
 - 
binaryBlockToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryBlockToBinaryBlock(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mcIn, DataCharacteristics mcOut)
 - 
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
 - 
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
 - 
dataFrameToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, boolean isVector)
 - 
binaryBlockToDataFrame
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc, boolean toVector)
 - 
binaryBlockToDataFrame
@Deprecated public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, DataCharacteristics mc, boolean toVector)
Deprecated.
 - 
libsvmToBinaryBlock
public static void libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, String pathIn, String pathX, String pathY, DataCharacteristics mcOutX)
Converts a libsvm text input file into two binary block matrices for features and labels, and saves these to the specified output files. This call also deletes existing files at the specified output locations, and determines and writes the metadata files of both output matrices.
Note: We use org.apache.spark.mllib.util.MLUtils.loadLibSVMFile for parsing the libsvm input files in order to ensure consistency with Spark.
- Parameters:
- sc - java spark context
- pathIn - path to libsvm input file
- pathX - path to binary block output file of features
- pathY - path to binary block output file of labels
- mcOutX - matrix characteristics of output matrix X
 
 - 
stringToSerializableText
public static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> stringToSerializableText(org.apache.spark.api.java.JavaPairRDD<Long,String> in)
 - 
libsvmToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, String delim, String indexDelim)
 
- 
 
-