Class FrameRDDConverterUtils
- java.lang.Object
-
- org.apache.sysds.runtime.instructions.spark.utils.FrameRDDConverterUtils
-
public class FrameRDDConverterUtils extends Object
-
-
Nested Class Summary
Nested Classes (Modifier and Type — Class):
static class FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction
static class FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction
static class FrameRDDConverterUtils.LongWritableTextToLongTextFunction
static class FrameRDDConverterUtils.LongWritableToSerFunction
-
Constructor Summary
Constructors:
FrameRDDConverterUtils()
-
Method Summary
All Methods | Static Methods | Concrete Methods | Deprecated Methods

static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema) — Deprecated.
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryBlockToMatrixBlock(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn, DataCharacteristics mcOut)
static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn)
static int convertDFSchemaToFrameSchema(org.apache.spark.sql.types.StructType dfschema, String[] colnames, Types.ValueType[] fschema, boolean containsID) — NOTE: regarding the support of vector columns, we make the following schema restriction: single vector column, which allows inference of the vector length without data access and covers the common case.
static org.apache.spark.sql.types.StructType convertFrameSchemaToDFSchema(Types.ValueType[] fschema, boolean containsID) — This function will convert Frame schema into DataFrame schema.
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, String fnameIn, String delim, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> dataRdd, String delim, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, Pair<String[],Types.ValueType[]> out)
static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,FrameBlock> matrixBlockToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics mcIn)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> matrixBlockToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics dcIn)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> in, DataCharacteristics mcOut, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<Long,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema)
-
-
-
Method Detail
-
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
-
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
-
binaryBlockToCsv
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
-
textCellToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> in, DataCharacteristics mcOut, Types.ValueType[] schema)
-
textCellToBinaryBlockLongIndex
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<Long,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema)
-
binaryBlockToTextCell
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn)
-
matrixBlockToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,FrameBlock> matrixBlockToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics mcIn)
-
matrixBlockToBinaryBlockLongIndex
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> matrixBlockToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics dcIn)
-
binaryBlockToMatrixBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryBlockToMatrixBlock(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn, DataCharacteristics mcOut)
-
dataFrameToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID)
-
dataFrameToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, Pair<String[],Types.ValueType[]> out)
-
binaryBlockToDataFrame
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
-
binaryBlockToDataFrame
@Deprecated public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
Deprecated.
-
convertFrameSchemaToDFSchema
public static org.apache.spark.sql.types.StructType convertFrameSchemaToDFSchema(Types.ValueType[] fschema, boolean containsID)
This function will convert Frame schema into DataFrame schema.
Parameters:
fschema - frame schema
containsID - true if contains ID column
Returns:
Spark StructType of StructFields representing schema
-
convertDFSchemaToFrameSchema
public static int convertDFSchemaToFrameSchema(org.apache.spark.sql.types.StructType dfschema, String[] colnames, Types.ValueType[] fschema, boolean containsID)
NOTE: regarding the support of vector columns, we make the following schema restriction: single vector column, which allows inference of the vector length without data access and covers the common case.
Parameters:
dfschema - schema as StructType
colnames - column names
fschema - array of SystemDS ValueTypes
containsID - if true, contains ID column
Returns:
0-based column index of vector column, -1 if no vector.
-
csvToRowRDD
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, String fnameIn, String delim, Types.ValueType[] schema)
-
csvToRowRDD
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> dataRdd, String delim, Types.ValueType[] schema)
-
-