SplitSampleRDDFunctions

Enhances RDDs with methods for split-sampling

T

The row type of the RDD

// import conversions to enhance RDDs with split sampling
import io.radanalytics.silex.sample.split.implicits._
// obtain a sequence of 5 RDDs randomly split from RDD 'data', where each element
// has probability 1/5 of being assigned to each output.
val splits = data.splitSample(5)
// randomly split data so that the second output has twice the probability of receiving
// a data element as the first, and the third output has three times the probability.
val splitsW = data.weightedSplitSample(Seq(1.0, 2.0, 3.0))

Linear Supertypes

Serializable, Serializable, AnyRef, Any

Instance Constructors

new SplitSampleRDDFunctions(self: RDD[T])(implicit arg0: ClassTag[T])

Value Members

final def !=(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def ##(): Int

Definition Classes
AnyRef → Any
final def ==(arg0: Any): Boolean

Definition Classes
AnyRef → Any
final def asInstanceOf[T0]: T0

Definition Classes
Any
def clone(): AnyRef

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( ... )
final def eq(arg0: AnyRef): Boolean

Definition Classes
AnyRef
def equals(arg0: Any): Boolean

Definition Classes
AnyRef → Any
def finalize(): Unit

Attributes
protected[java.lang]
Definition Classes
AnyRef
Annotations
@throws( classOf[java.lang.Throwable] )
final def getClass(): Class[_]

Definition Classes
AnyRef → Any
def hashCode(): Int

Definition Classes
AnyRef → Any
final def isInstanceOf[T0]: Boolean

Definition Classes
Any
final def ne(arg0: AnyRef): Boolean

Definition Classes
AnyRef
final def notify(): Unit

Definition Classes
AnyRef
final def notifyAll(): Unit

Definition Classes
AnyRef
def splitSample(n: Int, persist: StorageLevel = defaultSL, seed: Long = scala.util.Random.nextLong): Seq[RDD[T]]

Split an RDD into n random subsets, where each row is assigned to an output with equal probability 1/n.
n
The number of output RDDs to split into
persist
The storage level to use for persisting the intermediate result.
seed
A random seed to use for sampling. Will be modified, deterministically, by partition id.
final def synchronized[T0](arg0: ⇒ T0): T0

Definition Classes
AnyRef
def toString(): String

Definition Classes
AnyRef → Any
final def wait(): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long, arg1: Int): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
final def wait(arg0: Long): Unit

Definition Classes
AnyRef
Annotations
@throws( ... )
def weightedSplitSample(weights: Seq[Double], persist: StorageLevel = defaultSL, seed: Long = scala.util.Random.nextLong): Seq[RDD[T]]

Split an RDD into weighted random subsets, where each row is assigned to an output (j) with probability proportional to the corresponding jth weight.
weights
A sequence of weights that determine the relative probabilities of sampling into the corresponding RDD outputs. Weights will be normalized so that they sum to 1. Individual weights must be strictly > 0.
persist
The storage level to use for persisting the intermediate result.
seed
A random seed to use for sampling. Will be modified, deterministically, by partition id.

Related Docs: object SplitSampleRDDFunctions | package split

class SplitSampleRDDFunctions[T] extends Serializable

Instance Constructors

new SplitSampleRDDFunctions(self: RDD[T])(implicit arg0: ClassTag[T])

Value Members

final def !=(arg0: Any): Boolean

final def ##(): Int

final def ==(arg0: Any): Boolean

final def asInstanceOf[T0]: T0

def clone(): AnyRef

final def eq(arg0: AnyRef): Boolean

def equals(arg0: Any): Boolean

def finalize(): Unit

final def getClass(): Class[_]

def hashCode(): Int

final def isInstanceOf[T0]: Boolean

final def ne(arg0: AnyRef): Boolean

final def notify(): Unit

final def notifyAll(): Unit

def splitSample(n: Int, persist: StorageLevel = defaultSL, seed: Long = scala.util.Random.nextLong): Seq[RDD[T]]

final def synchronized[T0](arg0: ⇒ T0): T0

def toString(): String

final def wait(): Unit

final def wait(arg0: Long, arg1: Int): Unit

final def wait(arg0: Long): Unit

def weightedSplitSample(weights: Seq[Double], persist: StorageLevel = defaultSL, seed: Long = scala.util.Random.nextLong): Seq[RDD[T]]

Inherited from Serializable

Inherited from Serializable

Inherited from AnyRef

Inherited from Any

Ungrouped