# isarn-sketches-spark

Routines and data structures for using isarn-sketches idiomatically in Apache Spark

## API documentation
https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketches.udaf.package
## How to use in your project

#### sbt
``` scala
resolvers += "isarn project" at "https://dl.bintray.com/isarn/maven/"

libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.1.0"
```

#### maven
``` xml
<dependency>
  <groupId>org.isarnproject</groupId>
  <artifactId>isarn-sketches-spark_2.10</artifactId>
  <version>0.1.0</version>
</dependency>
```

## Examples

### Sketch a numeric column
```scala
scala> import org.isarnproject.sketches._, org.isarnproject.sketches.udaf._, org.apache.spark.isarnproject.sketches.udt._
import org.isarnproject.sketches._
import org.isarnproject.sketches.udaf._
import org.apache.spark.isarnproject.sketches.udt._

scala> import scala.util.Random.nextGaussian
import scala.util.Random.nextGaussian

scala> val data = sc.parallelize(Vector.fill(1000){(nextGaussian, nextGaussian)}).toDF.as[(Double, Double)]
data: org.apache.spark.sql.Dataset[(Double, Double)] = [_1: double, _2: double]

scala> val udaf = tdigestUDAF[Double].delta(0.2).maxDiscrete(25)
udaf: org.isarnproject.sketches.udaf.TDigestUDAF[Double] = TDigestUDAF(0.2,25)

scala> val agg = data.agg(udaf($"_1"), udaf($"_2"))
agg: org.apache.spark.sql.DataFrame = [tdigestudaf(_1): tdigest, tdigestudaf(_2): tdigest]

scala> val (td1, td2) = (agg.first.getAs[TDigestSQL](0).tdigest, agg.first.getAs[TDigestSQL](1).tdigest)
td1: org.isarnproject.sketches.TDigest = TDigest(0.2,25,151,TDigestMap(-3.1241237514093707 -> (1.0, 1.0), ...

scala> td1.cdf(0)
res1: Double = 0.5159531867457404

scala> td2.cdf(0)
res2: Double = 0.504233763693618
```
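
Beyond `cdf`, a `TDigest` also supports inverse-CDF (quantile) queries and random sampling, which the later examples use. A minimal sketch, continuing the session above (results will vary from run to run):

```scala
// cdfInverse(q) estimates the q-th quantile of the sketched data;
// for standard Gaussian input the median should land near 0.
val median1 = td1.cdfInverse(0.5)
val p90 = td1.cdfInverse(0.9)

// sample draws a random value distributed approximately like the sketched column
val draw = td1.sample
```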

### Sketch a numeric array column
```scala
scala> import org.isarnproject.sketches._, org.isarnproject.sketches.udaf._, org.apache.spark.isarnproject.sketches.udt._
import org.isarnproject.sketches._
import org.isarnproject.sketches.udaf._
import org.apache.spark.isarnproject.sketches.udt._

scala> import scala.util.Random._
import scala.util.Random._

scala> val data = spark.createDataFrame(Vector.fill(1000){(nextInt(10), Vector.fill(5){nextGaussian})})
data: org.apache.spark.sql.DataFrame = [_1: int, _2: array<double>]

scala> val udaf1 = tdigestUDAF[Int].maxDiscrete(20)
udaf1: org.isarnproject.sketches.udaf.TDigestUDAF[Int] = TDigestUDAF(0.5,20)

scala> val udafA = tdigestArrayUDAF[Double]
udafA: org.isarnproject.sketches.udaf.TDigestArrayUDAF[Double] = TDigestArrayUDAF(0.5,0)

scala> val (first1, firstA) = (data.agg(udaf1($"_1")).first, data.agg(udafA($"_2")).first)
first1: org.apache.spark.sql.Row = [TDigestSQL(TDigest(0.5,20,19,TDigestMap(-9.0 -> (51.0, 51.0),...
firstA: org.apache.spark.sql.Row = [TDigestArraySQL([Lorg.isarnproject.sketches.TDigest;@782b0d37)]

scala> val sample1 = Vector.fill(10) { first1.getAs[TDigestSQL](0).tdigest.sample }
sample1: scala.collection.immutable.Vector[Double] = Vector(0.0, 7.0, 9.0, 6.0, 1.0, 3.0, 4.0, 0.0, 9.0, 0.0)

scala> val sampleA = firstA.getAs[TDigestArraySQL](0).tdigests.map(_.sample)
sampleA: Array[Double] = Array(0.5079398036724695, 0.7518583956493221, -0.054376728126603546, 0.7141623682043323, 0.4788564991204228)
```
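
The element-wise sketches answer the same queries as a single sketch; for example, per-element medians via the `cdfInverse` method shown in the next section (a small sketch, continuing the session above):

```scala
// Per-element medians across the sketched array column
val medianA = firstA.getAs[TDigestArraySQL](0).tdigests.map(_.cdfInverse(0.5))

// Per-element CDF queries: estimated fraction of values <= 0 in each array slot
val massBelowZero = firstA.getAs[TDigestArraySQL](0).tdigests.map(_.cdf(0.0))
```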

### Sketch a column of ML Vector
```scala
scala> import org.isarnproject.sketches._, org.isarnproject.sketches.udaf._, org.apache.spark.isarnproject.sketches.udt._
import org.isarnproject.sketches._
import org.isarnproject.sketches.udaf._
import org.apache.spark.isarnproject.sketches.udt._

scala> import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.Vectors

scala> import scala.util.Random._
import scala.util.Random._

scala> val data = spark.createDataFrame(Vector.fill(1000){(nextInt(10), Vectors.dense(nextGaussian,nextGaussian,nextGaussian))})
data: org.apache.spark.sql.DataFrame = [_1: int, _2: vector]

scala> val udafV = tdigestMLVecUDAF
udafV: org.isarnproject.sketches.udaf.TDigestMLVecUDAF = TDigestMLVecUDAF(0.5,0)

scala> val firstV = data.agg(udafV($"_2")).first
firstV: org.apache.spark.sql.Row = [TDigestArraySQL([Lorg.isarnproject.sketches.TDigest;@42b579cd)]

scala> val sampleV = firstV.getAs[TDigestArraySQL](0).tdigests.map(_.sample)
sampleV: Array[Double] = Array(1.815862652134914, 0.24668895676164276, 0.09236479932949887)

scala> val medianV = firstV.getAs[TDigestArraySQL](0).tdigests.map(_.cdfInverse(0.5))
medianV: Array[Double] = Array(-0.049806905959001196, -0.08528817932077674, -0.05291800642695017)
```
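
These UDAFs also compose with ordinary Spark grouping. The following sketch (illustrative, using standard `groupBy`/`agg`) builds one array of t-digests per value of the key column `_1`:

```scala
// One row per key: each row carries the per-dimension sketches for that group
val grouped = data.groupBy($"_1").agg(udafV($"_2").alias("td"))

// Query the per-group sketches, e.g. per-dimension medians for each key
val groupMedians = grouped.collect.map { row =>
  (row.getInt(0), row.getAs[TDigestArraySQL](1).tdigests.map(_.cdfInverse(0.5)))
}
```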

### Sketch a column of MLLib Vector
```scala
scala> import org.isarnproject.sketches._, org.isarnproject.sketches.udaf._, org.apache.spark.isarnproject.sketches.udt._
import org.isarnproject.sketches._
import org.isarnproject.sketches.udaf._
import org.apache.spark.isarnproject.sketches.udt._

scala> import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vectors

scala> import scala.util.Random._
import scala.util.Random._

scala> val data = spark.createDataFrame(Vector.fill(1000){(nextInt(10), Vectors.dense(nextGaussian,nextGaussian,nextGaussian))})
data: org.apache.spark.sql.DataFrame = [_1: int, _2: vector]

scala> val udafV = tdigestMLLibVecUDAF
udafV: org.isarnproject.sketches.udaf.TDigestMLLibVecUDAF = TDigestMLLibVecUDAF(0.5,0)

scala> val firstV = data.agg(udafV($"_2")).first
firstV: org.apache.spark.sql.Row = [TDigestArraySQL([Lorg.isarnproject.sketches.TDigest;@6bffea90)]

scala> val sampleV = firstV.getAs[TDigestArraySQL](0).tdigests.map(_.sample)
sampleV: Array[Double] = Array(0.10298190759496548, -0.1968752746464183, -1.0139250851274562)

scala> val medianV = firstV.getAs[TDigestArraySQL](0).tdigests.map(_.cdfInverse(0.5))
medianV: Array[Double] = Array(0.025820266848484798, 0.01951778217339037, 0.09701138847692858)
```
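
The per-dimension statistics can be reassembled into a vector when downstream code expects one; a minimal sketch using the `Vectors.dense` factory imported above:

```scala
// Rebuild an MLLib Vector of per-dimension medians from the array of sketches
val tds = firstV.getAs[TDigestArraySQL](0).tdigests
val medianVector = Vectors.dense(tds.map(_.cdfInverse(0.5)))
```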