abstract class MultiFileCloudPartitionReaderBase extends FilePartitionReaderBase
The abstract multi-file cloud reading framework.
The data flow: next() -> if (first time) initAndStartReaders -> submit tasks (getBatchRunner) -> wait for the tasks to finish sequentially -> decode on the GPU (readBatches)
- Alphabetic
- By Inheritance
- MultiFileCloudPartitionReaderBase
- FilePartitionReaderBase
- ScanWithMetrics
- Logging
- PartitionReader
- Closeable
- AutoCloseable
- AnyRef
- Any
- Hide All
- Show All
- Public
- All
Instance Constructors
-
new
MultiFileCloudPartitionReaderBase(conf: Configuration, inputFiles: Array[PartitionedFile], numThreads: Int, maxNumFileProcessed: Int, filters: Array[Filter], execMetrics: Map[String, GpuMetric], maxReadBatchSizeRows: Int, maxReadBatchSizeBytes: Long, ignoreCorruptFiles: Boolean = false, alluxioPathReplacementMap: Map[String, String] = Map.empty, alluxioReplacementTaskTime: Boolean = false, keepReadsInOrder: Boolean = true, combineConf: CombineConf = CombineConf(-1, -1))
- conf
Configuration parameters
- inputFiles
PartitionedFiles to be read
- numThreads
the number of threads to read files in parallel.
- maxNumFileProcessed
threshold controlling the maximum number of files to be submitted to the thread pool
- filters
push down filters
- execMetrics
the metrics
- ignoreCorruptFiles
Whether to ignore corrupt files when the GPU fails to decode them
- alluxioPathReplacementMap
Map containing mapping of DFS scheme to Alluxio scheme
- alluxioReplacementTaskTime
Whether the Alluxio replacement algorithm is set to task time
- keepReadsInOrder
Whether to require the files to be read in the same order as Spark. Defaults to true for formats that don't explicitly handle this.
- combineConf
configs relevant to combination
Abstract Value Members
-
abstract
def
getBatchRunner(tc: TaskContext, file: PartitionedFile, origFile: Option[PartitionedFile], conf: Configuration, filters: Array[Filter]): Callable[HostMemoryBuffersWithMetaDataBase]
The sub-class must implement the real file reading logic in a Callable which will be running in a thread pool
The sub-class must implement the real file reading logic in a Callable which will be running in a thread pool
- tc
task context to use
- file
file to be read
- origFile
optional original unmodified file if replaced with Alluxio
- conf
the Configuration parameters
- filters
push down filters
- returns
Callable[HostMemoryBuffersWithMetaDataBase]
-
abstract
def
getFileFormatShortName: String
File format short name used for logging and other things to uniquely identify which file format is being used.
File format short name used for logging and other things to uniquely identify which file format is being used.
- returns
the file format short name
-
abstract
def
readBatches(fileBufsAndMeta: HostMemoryBuffersWithMetaDataBase): Iterator[ColumnarBatch]
Decode HostMemoryBuffers on the GPU
Decode HostMemoryBuffers on the GPU
- fileBufsAndMeta
the file HostMemoryBuffer read from a PartitionedFile
- returns
an iterator of batches that were decoded
Concrete Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
var
batchIter: Iterator[ColumnarBatch]
- Attributes
- protected
- Definition Classes
- FilePartitionReaderBase
- def canUseCombine: Boolean
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
def
close(): Unit
- Definition Classes
- MultiFileCloudPartitionReaderBase → FilePartitionReaderBase → Closeable → AutoCloseable
- def combineHMBs(results: Array[HostMemoryBuffersWithMetaDataBase]): HostMemoryBuffersWithMetaDataBase
-
var
combineLeftOverFiles: Option[Array[HostMemoryBuffersWithMetaDataBase]]
- Attributes
- protected
-
var
currentFileHostBuffers: Option[HostMemoryBuffersWithMetaDataBase]
- Attributes
- protected
-
def
currentMetricsValues(): Array[CustomTaskMetric]
- Definition Classes
- PartitionReader
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] )
-
def
get(): ColumnarBatch
- Definition Classes
- FilePartitionReaderBase → PartitionReader
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native()
-
def
initializeLogIfNecessary(isInterpreter: Boolean, silent: Boolean): Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
initializeLogIfNecessary(isInterpreter: Boolean): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
var
isDone: Boolean
- Attributes
- protected
- Definition Classes
- FilePartitionReaderBase
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
def
isTraceEnabled(): Boolean
- Attributes
- protected
- Definition Classes
- Logging
-
def
log: Logger
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logDebug(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logError(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logInfo(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logName: String
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logTrace(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String, throwable: Throwable): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
def
logWarning(msg: ⇒ String): Unit
- Attributes
- protected
- Definition Classes
- Logging
-
val
metrics: Map[String, GpuMetric]
- Definition Classes
- ScanWithMetrics
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
next(): Boolean
- Definition Classes
- MultiFileCloudPartitionReaderBase → PartitionReader
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native()
-
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()