implicit class SplitOperations extends AnyRef
Linear Supertypes
Ordering
- Alphabetic
- By Inheritance
Inherited
- SplitOperations
- AnyRef
- Any
- Hide All
- Show All
Visibility
- Public
- All
Instance Constructors
- new SplitOperations(df: DataFrame)
Value Members
-
final
def
!=(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
##(): Int
- Definition Classes
- AnyRef → Any
-
final
def
==(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
asInstanceOf[T0]: T0
- Definition Classes
- Any
-
def
clone(): AnyRef
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native() @HotSpotIntrinsicCandidate()
-
final
def
eq(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
def
equals(arg0: Any): Boolean
- Definition Classes
- AnyRef → Any
-
final
def
getClass(): Class[_]
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
hashCode(): Int
- Definition Classes
- AnyRef → Any
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
isInstanceOf[T0]: Boolean
- Definition Classes
- Any
-
final
def
ne(arg0: AnyRef): Boolean
- Definition Classes
- AnyRef
-
final
def
notify(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
final
def
notifyAll(): Unit
- Definition Classes
- AnyRef
- Annotations
- @native() @HotSpotIntrinsicCandidate()
-
def
split(participantId: Column = col("participant_id"), affectedStatus: Column = col("affected_status"), splits: Seq[OccurrenceSplit]): DataFrame
Calculate frequencies and simple splits on variants in DataFrame df.
Calculate frequencies and simple splits on variants in DataFrame df.
- participantId
This column is used to determine the number of distinct participants (pn) that have been sequenced.
- affectedStatus
This column is used to calculate frequencies by affected status.
- splits
List of splits to calculate; it can contain both simple splits and frequency splits.
- returns
A dataframe with one row per locus and the split columns specified by the splits parameter.
Using this dataframe as input :
+----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+ |chromosome|start|end |reference|alternate|calls |affected_status|genes_symbol|hgvsg |name|variant_class|variant_type|zygosity|study_id|ethnicity|participant_id|transmission_mode|study_code | +----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+ |1 |69897|69898|T |C |[1, 1]|false |[OR4F5] |chr1:g.69897T>C|null|SNV |germline |HOM |S1 |null |P1 |AR |STUDY_CODE_1| |1 |69897|69898|T |C |[0, 1]|true |[OR4F5] |chr1:g.69897T>C|null|SNV |germline |HET |S1 |null |P2 |AD |STUDY_CODE_1| |1 |69897|69898|T |C |[1, 1]|false |[OR4F5] |chr1:g.69897T>C|null|SNV |germline |HOM |S2 |null |P3 |AR |STUDY_CODE_2| |2 |69897|69898|T |C |[1, 1]|false |[OR4F5] |chr1:g.69897T>C|null|SNV |germline |HOM |S2 |null |P4 |AR |STUDY_CODE_2| +----------+-----+-----+---------+---------+------+---------------+------------+---------------+----+-------------+------------+--------+--------+---------+--------------+-----------------+------------+
And then calculate frequencies with these parameters :
val result = input.split( FrequencySplit("frequency_by_study_id", extraSplitBy = Some(col("study_id")), byAffected = true, extraAggregations = Seq( AtLeastNElements(name = "participant_ids", c = col("participant_id"), n = 2), SimpleAggregation(name = "transmissions", c = col("transmission_mode")), FirstElement(name = "study_code", col("study_code")) ) ), FrequencySplit("frequency_kf", byAffected = true, extraAggregations = Seq(SimpleAggregation(name = "zygosities", c = col("zygosity")))) )
The resulting dataframe will contain all locus columns plus 2 frequency columns, frequency_by_study_id and frequency_kf:
+----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+ |chromosome|start|reference|alternate|frequency_by_study_id |frequency_kf | +----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+ |2 |69897|T |C |[{S2, {2, 1, 1, 2, 4, 0.5, 0.5}, {0, 0, 0, 0, 0, 0.0, 0.0}, {2, 1, 1, 2, 4, 0.5, 0.5}, null, [AR], STUDY_CODE_2}] |{{2, 1, 1, 4, 8, 0.25, 0.25}, {0, 0, 0, 1, 2, 0.0, 0.0}, {2, 1, 1, 3, 6, 0.3333333333333333, 0.3333333333333333}, [HOM]} | |1 |69897|T |C |[{S2, {2, 1, 1, 2, 4, 0.5, 0.5}, {0, 0, 0, 0, 0, 0.0, 0.0}, {2, 1, 1, 2, 4, 0.5, 0.5}, null, [AR], STUDY_CODE_2}, {S1, {3, 2, 1, 2, 4, 0.75, 1.0}, {1, 1, 0, 1, 2, 0.5, 1.0}, {2, 1, 1, 1, 2, 1.0, 1.0}, [P1, P2], [AR, AD], STUDY_CODE_1}]|{{5, 3, 2, 4, 8, 0.625, 0.75}, {1, 1, 0, 1, 2, 0.5, 1.0}, {4, 2, 2, 3, 6, 0.6666666666666666, 0.6666666666666666}, [HOM, HET]}| +----------+-----+---------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------+ *
- frequency_by_study_id :
- induced by split parameter FrequencySplit("frequency_by_study_id", extraSplitBy = Some(col("study_id"))). See FrequencySplit.
- is an array of structs; each struct represents the frequency for one study_id. Fields of this struct are:
- study_id (split column)
- total frequency, which is also a struct that contains these fields (listed in schema order, which is the order of the values shown in the example above): ac (allele count), pc (participant count), hom (number of homozygotes), pn (participant number), an (allele number), af (allele frequency), pf (participant frequency)
- if the byAffected parameter is set to true, it also contains two other frequencies (affected and not_affected), which have the same fields as total
- participant_ids : extra aggregation obtained by AtLeastNElements("participant_ids", col("participant_id"), 2). See AtLeastNElements.
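- transmissions : extra aggregation obtained by SimpleAggregation("transmissions", col("transmission_mode")). See SimpleAggregation.
- study_code : extra aggregation obtained by FirstElement("study_code", col("study_code")). See FirstElement.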
- frequency_kf :
- induced by this split parameter FrequencySplit("frequency_kf")
- is a single frequency struct. Fields of this struct are:
- no split column: this split does not set extraSplitBy, so unlike frequency_by_study_id there is no study_id field
- total frequency, which is also a struct that contains these fields (listed in schema order, which is the order of the values shown in the example above): ac (allele count), pc (participant count), hom (number of homozygotes), pn (participant number), an (allele number), af (allele frequency), pf (participant frequency)
- if the byAffected parameter is set to true, it also contains two other frequencies (affected and not_affected), which have the same fields as total
- zygosities : extra aggregation obtained by SimpleAggregation(name = "zygosities", c = col("zygosity")). See SimpleAggregation.
Here is the schema of the output dataframe:
root |-- chromosome: string (nullable = true) |-- start: long (nullable = false) |-- reference: string (nullable = true) |-- alternate: string (nullable = true) |-- frequency_by_study_id: array (nullable = false) | |-- element: struct (containsNull = false) | | |-- study_id: string (nullable = true) | | |-- total: struct (nullable = false) | | | |-- ac: long (nullable = true) | | | |-- pc: long (nullable = true) | | | |-- hom: long (nullable = true) | | | |-- pn: long (nullable = true) | | | |-- an: long (nullable = true) | | | |-- af: double (nullable = true) | | | |-- pf: double (nullable = true) | | |-- affected: struct (nullable = false) | | | |-- ac: long (nullable = true) | | | |-- pc: long (nullable = true) | | | |-- hom: long (nullable = true) | | | |-- pn: long (nullable = true) | | | |-- an: long (nullable = true) | | | |-- af: double (nullable = true) | | | |-- pf: double (nullable = true) | | |-- not_affected: struct (nullable = false) | | | |-- ac: long (nullable = true) | | | |-- pc: long (nullable = true) | | | |-- hom: long (nullable = true) | | | |-- pn: long (nullable = true) | | | |-- an: long (nullable = true) | | | |-- af: double (nullable = true) | | | |-- pf: double (nullable = true) | | |-- participant_ids: array (nullable = true) | | | |-- element: string (containsNull = false) | | |-- transmissions: array (nullable = false) | | | |-- element: string (containsNull = false) | | |-- study_code: string (nullable = true) |-- frequency_kf: struct (nullable = false) | |-- total: struct (nullable = false) | | |-- ac: long (nullable = true) | | |-- pc: long (nullable = true) | | |-- hom: long (nullable = true) | | |-- pn: long (nullable = false) | | |-- an: long (nullable = false) | | |-- af: double (nullable = true) | | |-- pf: double (nullable = true) | |-- affected: struct (nullable = false) | | |-- ac: long (nullable = true) | | |-- pc: long (nullable = true) | | |-- hom: long (nullable = true) | | |-- pn: long (nullable = false) | | |-- an: 
long (nullable = false) | | |-- af: double (nullable = true) | | |-- pf: double (nullable = true) | |-- not_affected: struct (nullable = false) | | |-- ac: long (nullable = true) | | |-- pc: long (nullable = true) | | |-- hom: long (nullable = true) | | |-- pn: long (nullable = false) | | |-- an: long (nullable = false) | | |-- af: double (nullable = true) | | |-- pf: double (nullable = true) | |-- zygosities: array (nullable = false) | | |-- element: string (containsNull = false)
- frequency_by_study_id :
Example: -
final
def
synchronized[T0](arg0: ⇒ T0): T0
- Definition Classes
- AnyRef
-
def
toString(): String
- Definition Classes
- AnyRef → Any
-
final
def
wait(arg0: Long, arg1: Int): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
-
final
def
wait(arg0: Long): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... ) @native()
-
final
def
wait(): Unit
- Definition Classes
- AnyRef
- Annotations
- @throws( ... )
Deprecated Value Members
-
def
finalize(): Unit
- Attributes
- protected[lang]
- Definition Classes
- AnyRef
- Annotations
- @throws( classOf[java.lang.Throwable] ) @Deprecated
- Deprecated