Packages

implicit class SplitOperations extends AnyRef

Linear Supertypes
AnyRef, Any
Ordering
  1. Alphabetic
  2. By Inheritance
Inherited
  1. SplitOperations
  2. AnyRef
  3. Any
  1. Hide All
  2. Show All
Visibility
  1. Public
  2. All

Instance Constructors

  1. new SplitOperations(df: DataFrame)

Value Members

  1. final def !=(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  2. final def ##(): Int
    Definition Classes
    AnyRef → Any
  3. final def ==(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  4. final def asInstanceOf[T0]: T0
    Definition Classes
    Any
  5. def clone(): AnyRef
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native() @HotSpotIntrinsicCandidate()
  6. final def eq(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  7. def equals(arg0: Any): Boolean
    Definition Classes
    AnyRef → Any
  8. final def getClass(): Class[_]
    Definition Classes
    AnyRef → Any
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  9. def hashCode(): Int
    Definition Classes
    AnyRef → Any
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  10. final def isInstanceOf[T0]: Boolean
    Definition Classes
    Any
  11. final def ne(arg0: AnyRef): Boolean
    Definition Classes
    AnyRef
  12. final def notify(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  13. final def notifyAll(): Unit
    Definition Classes
    AnyRef
    Annotations
    @native() @HotSpotIntrinsicCandidate()
  14. def split(participantId: Column = col("participant_id"), affectedStatus: Column = col("affected_status"), splits: Seq[OccurrenceSplit]): DataFrame

    Calculate frequencies and simple splits on variants in DataFrame df.

    Calculate frequencies and simple splits on variants in DataFrame df.

    participantId

    This column is used to determine the number of distinct participants (pn) that have been sequenced.

    affectedStatus

    This column is used to calculate frequencies by affected status.

    splits

    List of splits to calculate; it can contain both simple splits and frequency splits.

    returns

    A dataframe with one row per locus and the split columns specified by the splits parameter.

    Example:
    1. Using this dataframe as input :

      +----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+
      |chromosome|start|end  |reference|alternate|calls |affected_status|genes_symbol|hgvsg          |name|variant_class|variant_type|zygosity|study_id|ethnicity|participant_id|transmission_mode|study_code  |
      +----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+
      |1         |69897|69898|T        |C        |[1, 1]|false          |[OR4F5]     |chr1:g.69897T>C|null|SNV          |germline    |HOM     |S1      |null     |P1            |AR               |STUDY_CODE_1|
      |1         |69897|69898|T        |C        |[0, 1]|true           |[OR4F5]     |chr1:g.69897T>C|null|SNV          |germline    |HET     |S1      |null     |P2            |AD               |STUDY_CODE_1|
      |1         |69897|69898|T        |C        |[1, 1]|false          |[OR4F5]     |chr1:g.69897T>C|null|SNV          |germline    |HOM     |S2      |null     |P3            |AR               |STUDY_CODE_2|
      |2         |69897|69898|T        |C        |[1, 1]|false          |[OR4F5]     |chr1:g.69897T>C|null|SNV          |germline    |HOM     |S2      |null     |P4            |AR               |STUDY_CODE_2|
      +----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+

      And then calculate frequencies with these parameters :

       val result = input.split(splits = Seq(
         FrequencySplit("frequency_by_study_id", extraSplitBy = Some(col("study_id")), byAffected = true, extraAggregations = Seq(
             AtLeastNElements(name = "participant_ids", c = col("participant_id"), n = 2),
             SimpleAggregation(name = "transmissions", c = col("transmission_mode")),
             FirstElement(name = "study_code", col("study_code"))
           )
         ),
         FrequencySplit("frequency_kf", byAffected = true, extraAggregations = Seq(SimpleAggregation(name = "zygosities", c = col("zygosity"))))
      ))

      The resulting dataframe will contain all locus columns plus 2 frequency columns, frequency_by_study_id and frequency_kf:

      +----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
      |chromosome|start|reference|alternate|frequency_by_study_id                                                                                                                                                                                                                      |frequency_kf                                                                                                                  |
      +----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
      |2         |69897|T        |C        |[{S2, {2, 1, 1, 2, 4, 0.5, 0.5}, {0, 0, 0, 0, 0, 0.0, 0.0}, {2, 1, 1, 2, 4, 0.5, 0.5}, null, [AR], STUDY_CODE_2}]                                                                                                                          |{{2, 1, 1, 4, 8, 0.25, 0.25}, {0, 0, 0, 1, 2, 0.0, 0.0}, {2, 1, 1, 3, 6, 0.3333333333333333, 0.3333333333333333}, [HOM]}      |
      |1         |69897|T        |C        |[{S2, {2, 1, 1, 2, 4, 0.5, 0.5}, {0, 0, 0, 0, 0, 0.0, 0.0}, {2, 1, 1, 2, 4, 0.5, 0.5}, null, [AR], STUDY_CODE_2}, {S1, {3, 2, 1, 2, 4, 0.75, 1.0}, {1, 1, 0, 1, 2, 0.5, 1.0}, {2, 1, 1, 1, 2, 1.0, 1.0}, [P1, P2], [AR, AD], STUDY_CODE_1}]|{{5, 3, 2, 4, 8, 0.625, 0.75}, {1, 1, 0, 1, 2, 0.5, 1.0}, {4, 2, 2, 3, 6, 0.6666666666666666, 0.6666666666666666}, [HOM, HET]}|
      +----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+
      The two frequency columns are structured as follows:
      • frequency_by_study_id :
        • induced by split parameter FrequencySplit("frequency_by_study_id", extraSplitBy = Some(col("study_id"))). See FrequencySplit.
        • is an array of structs; each struct represents the frequencies for one study_id. The fields of this struct are :
          • study_id (split column)
          • total : the total frequency, which is itself a struct that contains these fields, in this order : ac (allele count), pc (participant count), hom (number of homozygotes), pn (participant number), an (allele number), af (allele frequency), pf (participant frequency)
          • if the byAffected parameter is set to true, it also contains two other frequencies (affected and not_affected), which have the same fields as total
          • participant_ids : extra aggregation obtained by AtLeastNElements("participant_ids", col("participant_id"), 2). See AtLeastNElements.
          • transmissions : extra aggregation obtained by SimpleAggregation("transmissions", col("transmission_mode")). See SimpleAggregation.
          • study_code : extra aggregation obtained by FirstElement("study_code", col("study_code")).
      • frequency_kf :
        • induced by this split parameter FrequencySplit("frequency_kf")
        • is a struct of frequency. Fields of this struct are :
          • no split column : extraSplitBy is not set for this split, so the struct does not contain a study_id field
          • total : the total frequency, which is itself a struct that contains these fields, in this order : ac (allele count), pc (participant count), hom (number of homozygotes), pn (participant number), an (allele number), af (allele frequency), pf (participant frequency)
          • if the byAffected parameter is set to true, it also contains two other frequencies (affected and not_affected), which have the same fields as total
          • zygosities : extra aggregation obtained by SimpleAggregation(name = "zygosities", c = col("zygosity")). See SimpleAggregation.

      Here is the schema of the output dataframe :
      root
      |-- chromosome: string (nullable = true)
      |-- start: long (nullable = false)
      |-- reference: string (nullable = true)
      |-- alternate: string (nullable = true)
      |-- frequency_by_study_id: array (nullable = false)
      |    |-- element: struct (containsNull = false)
      |    |    |-- study_id: string (nullable = true)
      |    |    |-- total: struct (nullable = false)
      |    |    |    |-- ac: long (nullable = true)
      |    |    |    |-- pc: long (nullable = true)
      |    |    |    |-- hom: long (nullable = true)
      |    |    |    |-- pn: long (nullable = true)
      |    |    |    |-- an: long (nullable = true)
      |    |    |    |-- af: double (nullable = true)
      |    |    |    |-- pf: double (nullable = true)
      |    |    |-- affected: struct (nullable = false)
      |    |    |    |-- ac: long (nullable = true)
      |    |    |    |-- pc: long (nullable = true)
      |    |    |    |-- hom: long (nullable = true)
      |    |    |    |-- pn: long (nullable = true)
      |    |    |    |-- an: long (nullable = true)
      |    |    |    |-- af: double (nullable = true)
      |    |    |    |-- pf: double (nullable = true)
      |    |    |-- not_affected: struct (nullable = false)
      |    |    |    |-- ac: long (nullable = true)
      |    |    |    |-- pc: long (nullable = true)
      |    |    |    |-- hom: long (nullable = true)
      |    |    |    |-- pn: long (nullable = true)
      |    |    |    |-- an: long (nullable = true)
      |    |    |    |-- af: double (nullable = true)
      |    |    |    |-- pf: double (nullable = true)
      |    |    |-- participant_ids: array (nullable = true)
      |    |    |    |-- element: string (containsNull = false)
      |    |    |-- transmissions: array (nullable = false)
      |    |    |    |-- element: string (containsNull = false)
      |    |    |-- study_code: string (nullable = true)
      |-- frequency_kf: struct (nullable = false)
      |    |-- total: struct (nullable = false)
      |    |    |-- ac: long (nullable = true)
      |    |    |-- pc: long (nullable = true)
      |    |    |-- hom: long (nullable = true)
      |    |    |-- pn: long (nullable = false)
      |    |    |-- an: long (nullable = false)
      |    |    |-- af: double (nullable = true)
      |    |    |-- pf: double (nullable = true)
      |    |-- affected: struct (nullable = false)
      |    |    |-- ac: long (nullable = true)
      |    |    |-- pc: long (nullable = true)
      |    |    |-- hom: long (nullable = true)
      |    |    |-- pn: long (nullable = false)
      |    |    |-- an: long (nullable = false)
      |    |    |-- af: double (nullable = true)
      |    |    |-- pf: double (nullable = true)
      |    |-- not_affected: struct (nullable = false)
      |    |    |-- ac: long (nullable = true)
      |    |    |-- pc: long (nullable = true)
      |    |    |-- hom: long (nullable = true)
      |    |    |-- pn: long (nullable = false)
      |    |    |-- an: long (nullable = false)
      |    |    |-- af: double (nullable = true)
      |    |    |-- pf: double (nullable = true)
      |    |-- zygosities: array (nullable = false)
      |    |    |-- element: string (containsNull = false)
  15. final def synchronized[T0](arg0: ⇒ T0): T0
    Definition Classes
    AnyRef
  16. def toString(): String
    Definition Classes
    AnyRef → Any
  17. final def wait(arg0: Long, arg1: Int): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )
  18. final def wait(arg0: Long): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... ) @native()
  19. final def wait(): Unit
    Definition Classes
    AnyRef
    Annotations
    @throws( ... )

Deprecated Value Members

  1. def finalize(): Unit
    Attributes
    protected[lang]
    Definition Classes
    AnyRef
    Annotations
    @throws( classOf[java.lang.Throwable] ) @Deprecated
    Deprecated

Inherited from AnyRef

Inherited from Any

Ungrouped