-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhousing_model.html
43 lines (41 loc) · 144 KB
/
housing_model.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
<!DOCTYPE html>
<html>
<head>
<meta name="databricks-html-version" content="1">
<title>housing_model - Databricks</title>
<meta charset="utf-8">
<meta name="google" content="notranslate">
<meta name="robots" content="nofollow">
<meta http-equiv="Content-Language" content="en">
<meta http-equiv="Content-Type" content="text/html; charset=UTF8">
<link rel="stylesheet"
href="https://fonts.googleapis.com/css?family=Source+Code+Pro:400,700">
<link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/lib/css/bootstrap.min.css">
<link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/lib/jquery-ui-bundle/jquery-ui.min.css">
<link rel="stylesheet" type="text/css" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/css/main.css">
<link rel="stylesheet" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/css/print.css" media="print">
<link rel="icon" type="image/png" href="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/img/favicon.ico"/>
<script>window.settings = {"enableSshKeyUI":false,"defaultInteractivePricePerDBU":0.4,"enableOnDemandClusterType":true,"enableAutoCompleteAsYouType":[],"devTierName":"Community Edition","enableJobsPrefetching":true,"workspaceFeaturedLinks":[{"linkURI":"https://docs.databricks.com/index.html","displayName":"Documentation","icon":"question"},{"linkURI":"https://docs.databricks.com/release-notes/product/index.html","displayName":"Release Notes","icon":"code"},{"linkURI":"https://docs.databricks.com/spark/latest/training/index.html","displayName":"Training & Tutorials","icon":"graduation-cap"}],"enableClearStateFeature":false,"dbcForumURL":"http://forums.databricks.com/","enableProtoClusterInfoDeltaPublisher":true,"maxCustomTags":45,"enableInstanceProfilesUIInJobs":true,"nodeInfo":{"node_types":[{"support_ssh":false,"spark_heap_memory":4800,"instance_type_id":"r3.2xlarge","spark_core_oversubscription_factor":8.0,"node_type_id":"dev-tier-node","description":"Community Optimized","support_cluster_tags":false,"container_memory_mb":6000,"node_instance_type":{"instance_type_id":"r3.2xlarge","provider":"AWS","compute_units":26.0,"number_of_ips":15,"local_disks":1,"reserved_compute_units":3.64,"memory_mb":62464,"num_cores":8,"reserved_memory_mb":4800},"memory_mb":6144,"is_hidden":false,"category":"Community 
Edition","num_cores":0.88,"support_ebs_volumes":false,"is_deprecated":false}],"default_node_type_id":"dev-tier-node"},"enableClusterAcls":true,"notebookRevisionVisibilityHorizon":999999,"enableTableHandler":true,"maxEbsVolumesPerInstance":10,"isAdmin":true,"deltaProcessingBatchSize":1000,"enableLargeResultDownload":true,"zoneInfos":[{"id":"us-west-2c","isDefault":true},{"id":"us-west-2b","isDefault":false},{"id":"us-west-2a","isDefault":false}],"enableCustomSpotPricingUIByTier":false,"enableEBSVolumesUIForJobs":true,"enablePublishNotebooks":true,"enableMaxConcurrentRuns":true,"enableJobAclsConfig":false,"enableFullTextSearch":false,"enableElasticSparkUI":false,"enableNewClustersCreate":true,"clusters":true,"allowRunOnPendingClusters":true,"fileStoreBase":"FileStore","enableSshKeyUIInJobs":true,"enableDetachAndAttachSubMenu":false,"configurableSparkOptionsSpec":[{"keyPattern":"spark\\.kryo(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.kryo.*","valuePatternDisplay":"*","description":"Configuration options for Kryo serialization"},{"keyPattern":"spark\\.io\\.compression\\.codec","valuePattern":"(lzf|snappy|org\\.apache\\.spark\\.io\\.LZFCompressionCodec|org\\.apache\\.spark\\.io\\.SnappyCompressionCodec)","keyPatternDisplay":"spark.io.compression.codec","valuePatternDisplay":"snappy|lzf","description":"The codec used to compress internal data such as RDD partitions, broadcast variables and shuffle outputs."},{"keyPattern":"spark\\.serializer","valuePattern":"(org\\.apache\\.spark\\.serializer\\.JavaSerializer|org\\.apache\\.spark\\.serializer\\.KryoSerializer)","keyPatternDisplay":"spark.serializer","valuePatternDisplay":"org.apache.spark.serializer.JavaSerializer|org.apache.spark.serializer.KryoSerializer","description":"Class to use for serializing objects that will be sent over the network or need to be cached in serialized 
form."},{"keyPattern":"spark\\.rdd\\.compress","valuePattern":"(true|false)","keyPatternDisplay":"spark.rdd.compress","valuePatternDisplay":"true|false","description":"Whether to compress serialized RDD partitions (e.g. for StorageLevel.MEMORY_ONLY_SER). Can save substantial space at the cost of some extra CPU time."},{"keyPattern":"spark\\.speculation","valuePattern":"(true|false)","keyPatternDisplay":"spark.speculation","valuePatternDisplay":"true|false","description":"Whether to use speculation (recommended off for streaming)"},{"keyPattern":"spark\\.es(\\.[^\\.]+)+","valuePattern":".*","keyPatternDisplay":"spark.es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"es(\\.([^\\.]+))+","valuePattern":".*","keyPatternDisplay":"es.*","valuePatternDisplay":"*","description":"Configuration options for ElasticSearch"},{"keyPattern":"spark\\.(storage|shuffle)\\.memoryFraction","valuePattern":"0?\\.0*([1-9])([0-9])*","keyPatternDisplay":"spark.(storage|shuffle).memoryFraction","valuePatternDisplay":"(0.0,1.0)","description":"Fraction of Java heap to use for Spark's shuffle or storage"},{"keyPattern":"spark\\.streaming\\.backpressure\\.enabled","valuePattern":"(true|false)","keyPatternDisplay":"spark.streaming.backpressure.enabled","valuePatternDisplay":"true|false","description":"Enables or disables Spark Streaming's internal backpressure mechanism (since 1.5). This enables the Spark Streaming to control the receiving rate based on the current batch scheduling delays and processing times so that the system receives only as fast as the system can process. Internally, this dynamically sets the maximum receiving rate of receivers. 
This rate is upper bounded by the values `spark.streaming.receiver.maxRate` and `spark.streaming.kafka.maxRatePerPartition` if they are set."},{"keyPattern":"spark\\.streaming\\.receiver\\.maxRate","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.receiver.maxRate","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which each receiver will receive data. Effectively, each stream will consume at most this number of records per second. Setting this configuration to 0 or a negative number will put no limit on the rate. See the deployment guide in the Spark Streaming programing guide for mode details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRatePerPartition","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRatePerPartition","valuePatternDisplay":"numeric","description":"Maximum rate (number of records per second) at which data will be read from each Kafka partition when using the Kafka direct stream API introduced in Spark 1.3. See the Kafka Integration guide for more details."},{"keyPattern":"spark\\.streaming\\.kafka\\.maxRetries","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.kafka.maxRetries","valuePatternDisplay":"numeric","description":"Maximum number of consecutive retries the driver will make in order to find the latest offsets on the leader of each partition (a default value of 1 means that the driver will make a maximum of 2 attempts). 
Only applies to the Kafka direct stream API introduced in Spark 1.3."},{"keyPattern":"spark\\.streaming\\.ui\\.retainedBatches","valuePattern":"^([0-9]{1,})$","keyPatternDisplay":"spark.streaming.ui.retainedBatches","valuePatternDisplay":"numeric","description":"How many batches the Spark Streaming UI and status APIs remember before garbage collecting."}],"enableReactNotebookComments":true,"enableAdminPasswordReset":false,"enableResetPassword":true,"maxClusterTagValueLength":255,"enableJobsSparkUpgrade":true,"enableNotebookCommandNumbers":true,"sparkVersions":[{"key":"1.6.3-db2-hadoop2-scala2.10","displayName":"Spark 1.6.3-db2 (Hadoop 2, Scala 2.10)","packageLabel":"spark-image-aba860a0ffce4f3471fb14aefdcb1d768ac66a53a5ad884c48745ef98aeb9d67","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.x-ubuntu15.10","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.4.x-ubuntu15.10","displayName":"Spark 1.4.1 (Hadoop 1)","packageLabel":"spark-image-f710650fb8aaade8e4e812368ea87c45cd8cd0b5e6894ca6c94f3354e8daa6dc","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.2.x-scala2.11","displayName":"Spark 2.2 RC1 (Experimental, Scala 2.11)","packageLabel":"spark-image-a6234552b950aaef67e5aaef6815883e5bd9fc537f52bf1705894e1a5364be8d","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.0-db2-scala2.11","displayName":"Spark 2.1.0-db2 (Scala 2.11)","packageLabel":"spark-image-267c4490a3ab8a39acdbbd9f1d36f6decdecebf013e30dd677faff50f1d9cf8b","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"2.0.0-ubuntu15.10-scala2.10","displayName":"Spark 2.0.0 (Scala 
2.10)","packageLabel":"spark-image-073c1b52ace74f251fae2680624a0d8d184a8b57096d1c21c5ce56c29be6a37a","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.2-db3-scala2.10","displayName":"Spark 2.0.2-db3 (Scala 2.10)","packageLabel":"spark-image-584091dedb690de20e8cf22d9e02fdcce1281edda99eedb441a418d50e28088f","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.0-db1-scala2.11","displayName":"Spark 2.1.0-db1 (Scala 2.11)","packageLabel":"spark-image-e8ad5b72cf0f899dcf2b4720c1f572ab0e87a311d6113b943b4e1d4a7edb77eb","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.1.1-db4-scala2.11","displayName":"Spark 2.1.1-db4 (Scala 2.11)","packageLabel":"spark-image-f6e9552fae99c2dd7d4cb60afb006f23c6a44c9a9ca6951d91b9d920f2f93e66","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.0-db2-scala2.10","displayName":"Spark 2.1.0-db2 (Scala 2.10)","packageLabel":"spark-image-a2ca4f6b58c95f78dca91b1340305ab3fe32673bd894da2fa8e1dc8a9f8d0478","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"1.6.x-ubuntu15.10-hadoop1","displayName":"Spark 1.6.x (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"2.0.2-db4-scala2.11","displayName":"Spark 2.0.2-db4 (Scala 2.11)","packageLabel":"spark-image-7dbc7583e8271765b8a1508cb9e832768e35489bbde2c4c790bc6766aee2fd7f","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.1-ubuntu15.10-hadoop1","displayName":"Spark 1.6.1 (Hadoop 1)","packageLabel":"spark-image-21d1cac181b7b8856dd1b4214a3a734f95b5289089349db9d9c926cb87d843db","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.x-gpu-scala2.11","displayName":"Spark 2.0 (Auto-updating, GPU, Scala 2.11 
experimental)","packageLabel":"spark-image-968b89f1d0ec32e1ee4dacd04838cae25ef44370a441224177a37980d539d83a","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.2-ubuntu15.10-hadoop1","displayName":"Spark 1.6.2 (Hadoop 1)","packageLabel":"spark-image-8cea23fb9094e174bf5815d79009f4a8e383eb86cf2909cf6c6434ed8da2a16a","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"1.6.3-db1-hadoop2-scala2.10","displayName":"Spark 1.6.3-db1 (Hadoop 2, Scala 2.10)","packageLabel":"spark-image-eaa8d9b990015a14e032fb2e2e15be0b8d5af9627cd01d855df728b67969d5d9","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.3-db2-hadoop1-scala2.10","displayName":"Spark 1.6.3-db2 (Hadoop 1, Scala 2.10)","packageLabel":"spark-image-14112ea0645bea94333a571a150819ce85573cf5541167d905b7e6588645cf3b","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.2-ubuntu15.10-hadoop2","displayName":"Spark 1.6.2 (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"1.6.1-ubuntu15.10-hadoop2","displayName":"Spark 1.6.1 (Hadoop 2)","packageLabel":"spark-image-4cafdf8bc6cba8edad12f441e3b3f0a8ea27da35c896bc8290e16b41fd15496a","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.2-db2-scala2.10","displayName":"Spark 2.0.2-db2 (Scala 2.10)","packageLabel":"spark-image-36d48f22cca7a907538e07df71847dd22aaf84a852c2eeea2dcefe24c681602f","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.x-ubuntu15.10-scala2.11","displayName":"Spark 2.0 (Ubuntu 15.10, Scala 2.11, deprecated)","packageLabel":"spark-image-8e1c50d626a52eac5a6c8129e09ae206ba9890f4523775f77af4ad6d99a64c44","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.x-scala2.10","displayName":"Spark 2.0 (Auto-updating, Scala 
2.10)","packageLabel":"spark-image-859e88079f97f58d50e25163b39a1943d1eeac0b6939c5a65faba986477e311a","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.1-db4-scala2.10","displayName":"Spark 2.1.1-db4 (Scala 2.10)","packageLabel":"spark-image-86c0873023816ae29d99f81118dc825d6380a9968a9f7a3d0a055f5c3a8964a0","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.2-db1-scala2.11","displayName":"Spark 2.0.2-db1 (Scala 2.11)","packageLabel":"spark-image-c2d623f03dd44097493c01aa54a941fc31978ebe6d759b36c75b716b2ff6ab9c","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.2-db4-scala2.10","displayName":"Spark 2.0.2-db4 (Scala 2.10)","packageLabel":"spark-image-859e88079f97f58d50e25163b39a1943d1eeac0b6939c5a65faba986477e311a","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.5.x-ubuntu15.10","displayName":"Spark 1.5.2 (Hadoop 1)","packageLabel":"spark-image-c9d2a8abf41f157a4acc6d52bc721090346f6fea2de356f3a66e388f54481698","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.2.x-scala2.10","displayName":"Spark 2.2 RC1 (Experimental, Scala 2.10)","packageLabel":"spark-image-d87399ba349d7ac22c7a245147ab9da5052d9a92f9f130f3072b0a6f862c5d9f","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.x-scala2.11","displayName":"Spark 2.0 (Auto-updating, Scala 2.11)","packageLabel":"spark-image-7dbc7583e8271765b8a1508cb9e832768e35489bbde2c4c790bc6766aee2fd7f","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.x-scala2.10","displayName":"Spark 2.1 (Auto-updating, Scala 2.10)","packageLabel":"spark-image-86c0873023816ae29d99f81118dc825d6380a9968a9f7a3d0a055f5c3a8964a0","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.0-db3-scala2.10","displayName":"Spark 2.1.0-db3 (Scala 
2.10)","packageLabel":"spark-image-25a17d070af155f10c4232dcc6248e36a2eb48c24f8d4fc00f34041b86bd1626","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.2-db2-scala2.11","displayName":"Spark 2.0.2-db2 (Scala 2.11)","packageLabel":"spark-image-4fa852ba378e97815083b96c9cada7b962a513ec23554a5fc849f7f1dd8c065a","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"1.3.x-ubuntu15.10","displayName":"Spark 1.3.0 (Hadoop 1)","packageLabel":"spark-image-40d2842670bc3dc178b14042501847d76171437ccf70613fa397a7a24c48b912","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.1-db1-scala2.11","displayName":"Spark 2.0.1-db1 (Scala 2.11)","packageLabel":"spark-image-10ab19f634bbfdb860446c326a9f76dc25bfa87de6403b980566279142a289ea","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.2-db3-scala2.11","displayName":"Spark 2.0.2-db3 (Scala 2.11)","packageLabel":"spark-image-7fd7aaa89d55692e429115ae7eac3b1a1dc4de705d50510995f34306b39c2397","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"1.6.3-db1-hadoop1-scala2.10","displayName":"Spark 1.6.3-db1 (Hadoop 1, Scala 2.10)","packageLabel":"spark-image-d50af1032799546b8ccbeeb76889a20c819ebc2a0e68ea20920cb30d3895d3ae","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.0.2-db1-scala2.10","displayName":"Spark 2.0.2-db1 (Scala 2.10)","packageLabel":"spark-image-654bdd6e9bad70079491987d853b4b7abf3b736fff099701501acaabe0e75c41","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.x-ubuntu15.10","displayName":"Spark 2.0 (Ubuntu 15.10, Scala 2.10, deprecated)","packageLabel":"spark-image-a659f3909d51b38d297b20532fc807ecf708cfb7440ce9b090c406ab0c1e4b7e","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.0.1-db1-scala2.10","displayName":"Spark 2.0.1-db1 (Scala 
2.10)","packageLabel":"spark-image-5a13c2db3091986a4e7363006cc185c5b1108c7761ef5d0218506cf2e6643840","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.1.x-scala2.11","displayName":"Spark 2.1 (Auto-updating, Scala 2.11)","packageLabel":"spark-image-f6e9552fae99c2dd7d4cb60afb006f23c6a44c9a9ca6951d91b9d920f2f93e66","upgradable":true,"deprecated":false,"customerVisible":true},{"key":"2.1.0-db1-scala2.10","displayName":"Spark 2.1.0-db1 (Scala 2.10)","packageLabel":"spark-image-f0ab82a5deb7908e0d159e9af066ba05fb56e1edb35bdad41b7ad2fd62a9b546","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"1.6.0-ubuntu15.10","displayName":"Spark 1.6.0 (Hadoop 1)","packageLabel":"spark-image-10ef758029b8c7e19cd7f4fb52fff9180d75db92ca071bd94c47f3c1171a7cb5","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"1.6.x-ubuntu15.10-hadoop2","displayName":"Spark 1.6.x (Hadoop 2)","packageLabel":"spark-image-161245e66d887cd775e23286a54bab0b146143e1289f25bd1732beac454a1561","upgradable":true,"deprecated":false,"customerVisible":false},{"key":"2.0.0-ubuntu15.10-scala2.11","displayName":"Spark 2.0.0 (Scala 2.11)","packageLabel":"spark-image-b4ec141e751f201399f8358a82efee202560f7ed05e1a04a2ae8778f6324b909","upgradable":true,"deprecated":true,"customerVisible":false},{"key":"2.1.0-db3-scala2.11","displayName":"Spark 2.1.0-db3 (Scala 
2.11)","packageLabel":"spark-image-ccbc6b73f158e2001fc1fb8c827bfdde425d8bd6d65cb7b3269784c28bb72c16","upgradable":true,"deprecated":false,"customerVisible":true}],"enableRestrictedClusterCreation":true,"enableFeedback":true,"enableClusterAutoScaling":false,"enableUserVisibleDefaultTags":true,"defaultNumWorkers":0,"serverContinuationTimeoutMillis":10000,"autoTerminateClustersByDefault":false,"driverStderrFilePrefix":"stderr","enableNotebookRefresh":false,"accountsOwnerUrl":"https://accounts.cloud.databricks.com/registration.html#login","driverStdoutFilePrefix":"stdout","defaultNodeTypeToPricingUnitsMap":{"r3.2xlarge":2,"class-node":1,"m4.2xlarge":0.5,"r4.xlarge":1,"m4.4xlarge":0.5,"r4.16xlarge":8,"p2.8xlarge":16,"m4.10xlarge":0.5,"r3.8xlarge":8,"r4.4xlarge":4,"dev-tier-node":1,"c3.8xlarge":4,"r3.4xlarge":4,"i2.4xlarge":6,"m4.xlarge":0.5,"r4.8xlarge":8,"r4.large":0.5,"development-node":1,"i2.2xlarge":3,"g2.8xlarge":6,"memory-optimized":1,"m4.large":0.5,"p2.16xlarge":24,"c3.2xlarge":1,"c4.2xlarge":1,"i2.xlarge":1.5,"compute-optimized":1,"c4.4xlarge":2,"c3.4xlarge":2,"g2.2xlarge":1.5,"p2.xlarge":2,"m4.16xlarge":0.5,"c4.8xlarge":4,"r3.xlarge":1,"r4.2xlarge":2,"i2.8xlarge":12},"enableSparkDocsSearch":true,"sparkHistoryServerEnabled":true,"enableEBSVolumesUI":false,"sanitizeMarkdownHtml":true,"enableIPythonImportExport":true,"enableClusterTagsUIForJobs":true,"enableClusterTagsUI":false,"enableNotebookHistoryDiffing":true,"branch":"2.44","accountsLimit":3,"enableSparkEnvironmentVariables":true,"enableX509Authentication":false,"enableStructuredStreamingNbOptimizations":false,"enableNotebookGitBranching":true,"local":false,"enableClusterAutoScalingForJobs":false,"enableStrongPassword":false,"displayDefaultContainerMemoryGB":6,"enableNotebookCommandMode":true,"disableS3TableImport":false,"deploymentMode":"production","useSpotForWorkers":true,"enableUserInviteWorkflow":true,"enableStaticNotebooks":true,"perClusterAutoTerminationEnabled":false,"enableCssTransitions":true,"defaul
tAutoTerminationInactivityMin":180,"minClusterTagKeyLength":1,"showHomepageFeaturedLinks":true,"pricingURL":"https://databricks.com/product/pricing","enableClusterAclsConfig":false,"useTempS3UrlForTableUpload":false,"notifyLastLogin":false,"enableSshKeyUIByTier":false,"defaultAutomatedPricePerDBU":0.2,"enableNotebookGitVersioning":true,"files":"files/","feedbackEmail":"feedback@databricks.com","enableDriverLogsUI":true,"enableWorkspaceAclsConfig":false,"dropzoneMaxFileSize":2047,"enableNewClustersList":false,"enableNewDashboardViews":true,"driverLog4jFilePrefix":"log4j","enableSingleSignOn":true,"enableMavenLibraries":true,"displayRowLimit":1000,"deltaProcessingAsyncEnabled":true,"enableSparkEnvironmentVariablesUI":false,"defaultSparkVersion":{"key":"2.1.x-scala2.10","displayName":"Spark 2.1 (Auto-updating, Scala 2.10)","packageLabel":"spark-image-86c0873023816ae29d99f81118dc825d6380a9968a9f7a3d0a055f5c3a8964a0","upgradable":true,"deprecated":false,"customerVisible":true},"enableCustomSpotPricing":false,"enableMountAclsConfig":false,"useDevTierHomePage":true,"enableClusterClone":true,"enableNotebookLineNumbers":true,"enablePublishHub":false,"minAutoTerminationInactivityMin":10,"notebookHubUrl":"http://hub.dev.databricks.com/","showSqlEndpoints":false,"enableClusterAclsByTier":false,"databricksDocsBaseUrl":"https://docs.databricks.com/","cloud":"AWS","disallowAddingAdmins":true,"enableSparkConfUI":true,"featureTier":"DEVELOPER_BASIC_TIER","mavenCentralSearchEndpoint":"http://search.maven.org/solrsearch/select","enableOrgSwitcherUI":true,"clustersLimit":1,"enableJdbcImport":true,"logfiles":"logfiles/","enableWebappSharding":true,"enableClusterDeltaUpdates":true,"enableSingleSignOnLogin":false,"ebsVolumeSizeLimitGB":{"GENERAL_PURPOSE_SSD":[100,4096],"THROUGHPUT_OPTIMIZED_HDD":[500,4096]},"enableMountAcls":false,"requireEmailUserName":true,"dbcFeedbackURL":"mailto:feedback@databricks.com","enableMountAclService":true,"enableWorkspaceAcls":false,"maxClusterTagKeyLength":
127,"gitHash":"16bec05fa08fe1dd3350ef898eaf4457d7c34e6a","showWorkspaceFeaturedLinks":true,"signupUrl":"https://databricks.com/try-databricks","allowFeedbackForumAccess":true,"enableImportFromUrl":true,"enableMiniClusters":true,"enableNewTableUI":true,"enableDebugUI":false,"enableStreamingMetricsDashboard":true,"allowNonAdminUsers":true,"enableSingleSignOnByTier":false,"enableJobsRetryOnTimeout":true,"useStandardTierUpgradeTooltips":true,"staticNotebookResourceUrl":"https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/","enableSpotClusterType":true,"enableSparkPackages":true,"dynamicSparkVersions":true,"enableClusterTagsUIByTier":false,"enableNotebookHistoryUI":true,"enableClusterLoggingUI":true,"maxAutoTerminationInactivityMin":10000,"enableDatabaseDropdownInTableUI":false,"showDebugCounters":false,"enableInstanceProfilesUI":false,"enableFolderHtmlExport":true,"homepageFeaturedLinks":[{"linkURI":"https://docs.databricks.com/_static/notebooks/gentle-introduction-to-apache-spark.html","displayName":"Introduction to Apache Spark on Databricks","icon":"img/home/Python_icon.svg"},{"linkURI":"https://docs.databricks.com/_static/notebooks/databricks-for-data-scientists.html","displayName":"Databricks for Data Scientists","icon":"img/home/Scala_icon.svg"},{"linkURI":"https://docs.databricks.com/_static/notebooks/structured-streaming-python.html","displayName":"Introduction to Structured 
Streaming","icon":"img/home/Python_icon.svg"}],"enableClusterStart":true,"enableEBSVolumesUIByTier":false,"upgradeURL":"https://accounts.cloud.databricks.com/registration.html#login","notebookLoadingBackground":"#fff","sshContainerForwardedPort":2200,"enableServerAutoComplete":true,"enableStaticHtmlImport":true,"enableInstanceProfilesByTier":false,"defaultMemoryPerContainerMB":6000,"enablePresenceUI":true,"accounts":true,"useOnDemandClustersByDefault":true,"useFramedStaticNotebooks":false,"enableNewProgressReportUI":true,"defaultCoresPerContainer":4,"showTerminationReason":true,"enableNewClustersGet":true,"showPricePerDBU":false,"showSqlProxyUI":true};</script>
<script>var __DATABRICKS_NOTEBOOK_MODEL = {"version":"NotebookV1","origId":1271183488258032,"name":"housing_model","language":"python","commands":[{"version":"CommandV1","origId":1271183488258034,"guid":"9a527442-c727-4903-ae94-2d9ab5cbf68e","subtype":"command","commandType":"auto","position":0.25,"command":"%md\n### Formatting a CSV into a DataFrame¶\nIn the previous version of this notebook, some gymnastics were required to read my small csv into Spark. That would be much easier today using Spark 2.X than it was with 1.4, there's also the fact that Databricks makes it very simple to use their data importer to access data from Amazon's S3. I've simply used their [web GUI to create a table](https://docs.databricks.com/user-guide/tables.html) named \"realestate\" that can be read with Spark SQL, and from there, all the goodness of Spark dataframes is available to me. ","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493778580531,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"c50f5ffd-c294-4d37-bc37-b84c9b857818"},{"version":"CommandV1","origId":1271183488258035,"guid":"fb4635a9-95ae-43ad-944d-8ba514e7a7e5","subtype":"command","commandType":"auto","position":1.0,"command":"df = sqlContext.sql(\"SELECT * FROM realestate\")","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AnalysisException</span>: u'Path 
does not exist: dbfs:/FileStore/tables/2esy8tnj1455052720017;'","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AnalysisException</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-3-98c1e65c294a></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>sparkDF <span class=\"ansiyellow\">=</span> sqlContext<span class=\"ansiyellow\">.</span>read<span class=\"ansiyellow\">.</span>format<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">"csv"</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">.</span>load<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">"/FileStore/tables/2esy8tnj1455052720017/"</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/readwriter.py</span> in <span class=\"ansicyan\">load</span><span class=\"ansiblue\">(self, path, format, schema, **options)</span>\n<span class=\"ansigreen\"> 147</span> self<span class=\"ansiyellow\">.</span>options<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">**</span>options<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 148</span> <span class=\"ansigreen\">if</span> isinstance<span class=\"ansiyellow\">(</span>path<span class=\"ansiyellow\">,</span> basestring<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 149</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>_df<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">.</span>_jreader<span class=\"ansiyellow\">.</span>load<span class=\"ansiyellow\">(</span>path<span 
class=\"ansiyellow\">)</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 150</span> <span class=\"ansigreen\">elif</span> path <span class=\"ansigreen\">is</span> <span class=\"ansigreen\">not</span> None<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 151</span> <span class=\"ansigreen\">if</span> type<span class=\"ansiyellow\">(</span>path<span class=\"ansiyellow\">)</span> <span class=\"ansiyellow\">!=</span> list<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py</span> in <span class=\"ansicyan\">__call__</span><span class=\"ansiblue\">(self, *args)</span>\n<span class=\"ansigreen\"> 1131</span> answer <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>gateway_client<span class=\"ansiyellow\">.</span>send_command<span class=\"ansiyellow\">(</span>command<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1132</span> return_value = get_return_value(\n<span class=\"ansigreen\">-> 1133</span><span class=\"ansiyellow\"> answer, self.gateway_client, self.target_id, self.name)\n</span><span class=\"ansigreen\"> 1134</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1135</span> <span class=\"ansigreen\">for</span> temp_arg <span class=\"ansigreen\">in</span> temp_args<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/utils.py</span> in <span class=\"ansicyan\">deco</span><span class=\"ansiblue\">(*a, **kw)</span>\n<span class=\"ansigreen\"> 67</span> e.java_exception.getStackTrace()))\n<span class=\"ansigreen\"> 68</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span 
class=\"ansiblue\">'org.apache.spark.sql.AnalysisException: '</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">---> 69</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 70</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'org.apache.spark.sql.catalyst.analysis'</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 71</span> <span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AnalysisException</span>: u'Path does not exist: 
dbfs:/FileStore/tables/2esy8tnj1455052720017;'</div>","workflows":[],"startTime":1493781516226,"submitTime":1493781516228,"finishTime":1493781517336,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"473c1faf-0b3c-48b8-8a93-9a96de798eeb"},{"version":"CommandV1","origId":1271183488258036,"guid":"dd2ff2e3-1ff2-478a-8fe0-85eea347220a","subtype":"command","commandType":"auto","position":1.25,"command":"%md\nSpark leverages \"lazy\" execution, so nothing has been executed yet. We can force Spark to do some work and take a peek at the data by calling \"take\" on our dataframe. A take of 5 shows the first 5 rows from our original csv. 
Again, if you read the older version of this tutorial, you will see that this is skipping through a lot of logic for converting between RDD's and dataframes that was a real inconvenience back in the day.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516292,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"647a4755-824c-408d-8f73-b3151aaccf0a"},{"version":"CommandV1","origId":1271183488258037,"guid":"7d79cb4b-996b-4bff-b808-38f3ee52e84f","subtype":"command","commandType":"auto","position":1.5,"command":"df.take(5)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">4</span><span class=\"ansired\">]: </span>\n[Row(street=u'3526 HIGH ST', city=u'SACRAMENTO', zip=u'95838', state=u'CA', beds=2, baths=1, sq__ft=836, type=u'Residential', sale_date=u'Wed May 21 00:00:00 EDT 2008', price=59222, latitude=38.63191223144531, longitude=-121.43487548828125),\n Row(street=u'51 OMAHA CT', city=u'SACRAMENTO', zip=u'95823', state=u'CA', beds=3, baths=1, sq__ft=1167, type=u'Residential', sale_date=u'Wed May 21 00:00:00 EDT 2008', price=68212, latitude=38.47890090942383, longitude=-121.4310302734375),\n Row(street=u'2796 BRANCH ST', city=u'SACRAMENTO', zip=u'95815', state=u'CA', beds=2, baths=1, sq__ft=796, type=u'Residential', sale_date=u'Wed May 21 00:00:00 EDT 2008', price=68880, latitude=38.61830520629883, longitude=-121.44384002685547),\n 
Row(street=u'2805 JANETTE WAY', city=u'SACRAMENTO', zip=u'95815', state=u'CA', beds=2, baths=1, sq__ft=852, type=u'Residential', sale_date=u'Wed May 21 00:00:00 EDT 2008', price=69307, latitude=38.61683654785156, longitude=-121.43914794921875),\n Row(street=u'6001 MCMAHON DR', city=u'SACRAMENTO', zip=u'95824', state=u'CA', beds=2, baths=1, sq__ft=797, type=u'Residential', sale_date=u'Wed May 21 00:00:00 EDT 2008', price=81900, latitude=38.51947021484375, longitude=-121.4357681274414)]\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">TypeError</span>: take() takes exactly 2 arguments (1 given)","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">TypeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-14-972b49413de5></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>df<span class=\"ansiyellow\">.</span>take<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">TypeError</span>: take() takes exactly 2 arguments (1 given)</div>","workflows":[],"startTime":1493781517341,"submitTime":1493781516343,"finishTime":1493781517864,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"015e3531-c2e7-4532-afc3-09d0777a966a"},{"version":"CommandV1","origId":1271183488258038,"guid":"1ce1a1e7-c48f-4cea-be4f-b651bf52bd1c","subtype":"command","commandType":"auto","position":1.75,"command":"%md\n\nAlthough the dataframe responds to some RDD syntax such as 'take', the formatting is not especially readable. For a more readable view, try 'show'. Having tried take and show on some much larger data sets too, they seem to be executed differently. Show is sometimes executed in a tiny fraction of the time of take.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516435,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"6267d357-6092-400c-a04b-413d3da7e2d6"},{"version":"CommandV1","origId":1271183488258039,"guid":"eba4a345-902d-45c6-993a-a3f3a9880c5a","subtype":"command","commandType":"auto","position":2.0,"command":"df.show(5)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+-----+---------+-----------+\n| street| city| zip|state|beds|baths|sq__ft| type| sale_date|price| latitude| longitude|\n+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+-----+---------+-----------+\n| 3526 HIGH ST|SACRAMENTO|95838| CA| 2| 1| 
836|Residential|Wed May 21 00:00:...|59222|38.631912|-121.434875|\n| 51 OMAHA CT|SACRAMENTO|95823| CA| 3| 1| 1167|Residential|Wed May 21 00:00:...|68212| 38.4789| -121.43103|\n| 2796 BRANCH ST|SACRAMENTO|95815| CA| 2| 1| 796|Residential|Wed May 21 00:00:...|68880|38.618305| -121.44384|\n|2805 JANETTE WAY|SACRAMENTO|95815| CA| 2| 1| 852|Residential|Wed May 21 00:00:...|69307|38.616837| -121.43915|\n| 6001 MCMAHON DR|SACRAMENTO|95824| CA| 2| 1| 797|Residential|Wed May 21 00:00:...|81900| 38.51947| -121.43577|\n+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+-----+---------+-----------+\nonly showing top 5 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781517868,"submitTime":1493781516493,"finishTime":1493781518642,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"ae079802-ba3f-4e99-8887-b48fec951e95"},{"version":"CommandV1","origId":1271183488258040,"guid":"390ab57c-89f6-48ac-a708-1be3112bed17","subtype":"command","commandType":"auto","position":2.5,"command":"%md\n## Pandas\nWhen working in Spark with Python, there is also an easy option to convert to Pandas on the fly, assuming your dataframe is small enough that it could be held in memory. 
Pandas dataframes can also be converted to Spark dataframes.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516569,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"02ea61f6-95c5-442c-9bdd-f4bf1241ebe1"},{"version":"CommandV1","origId":1271183488258041,"guid":"e774ddaf-6aab-4b5b-8bc4-9fbaec724621","subtype":"command","commandType":"auto","position":3.0,"command":"df.toPandas().head()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">6</span><span class=\"ansired\">]: </span>\n street city zip state beds baths sq__ft \\\n0 3526 HIGH ST SACRAMENTO 95838 CA 2 1 836 \n1 51 OMAHA CT SACRAMENTO 95823 CA 3 1 1167 \n2 2796 BRANCH ST SACRAMENTO 95815 CA 2 1 796 \n3 2805 JANETTE WAY SACRAMENTO 95815 CA 2 1 852 \n4 6001 MCMAHON DR SACRAMENTO 95824 CA 2 1 797 \n\n type sale_date price latitude longitude \n0 Residential Wed May 21 00:00:00 EDT 2008 59222 38.631912 -121.434875 \n1 Residential Wed May 21 00:00:00 EDT 2008 68212 38.478901 -121.431030 \n2 Residential Wed May 21 00:00:00 EDT 2008 68880 38.618305 -121.443840 \n3 Residential Wed May 21 00:00:00 EDT 2008 69307 38.616837 -121.439148 \n4 Residential Wed May 21 00:00:00 EDT 2008 81900 38.519470 -121.435768 
\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781518646,"submitTime":1493781516611,"finishTime":1493781519955,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"323354fa-f33d-4b4c-949d-0947aac8c674"},{"version":"CommandV1","origId":1271183488258042,"guid":"4cede34e-9d15-468c-98a4-3bb9eaf65342","subtype":"command","commandType":"auto","position":3.5,"command":"%md\n## Dataframe operations\nOnce you have converted to a dataframe, many operations like selecting rows/columns and doing counts become much easier. Keep in mind that Spark dataframes are not mutable. 
For example, if we'd like a dataframe with only the homes in zip code 95815, the syntax should be very comfortable for Pandas and R users.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516681,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"926f4d69-6ab8-44c6-84c3-546c4750b0d8"},{"version":"CommandV1","origId":1271183488258043,"guid":"63a597da-4b2a-44d2-b58b-ff496e263c93","subtype":"command","commandType":"auto","position":3.75,"command":"favorite_zip = df[df.zip == 95815]\nfavorite_zip.show(5)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+------+---------+----------+\n| street| city| zip|state|beds|baths|sq__ft| type| sale_date| price| latitude| longitude|\n+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+------+---------+----------+\n| 2796 BRANCH ST|SACRAMENTO|95815| CA| 2| 1| 796|Residential|Wed May 21 00:00:...| 68880|38.618305|-121.44384|\n|2805 JANETTE WAY|SACRAMENTO|95815| CA| 2| 1| 852|Residential|Wed May 21 00:00:...| 69307|38.616837|-121.43915|\n| 2930 LA ROSA RD|SACRAMENTO|95815| CA| 1| 1| 871|Residential|Wed May 21 00:00:...|106852| 38.6187|-121.43584|\n| 3132 CLAY ST|SACRAMENTO|95815| CA| 2| 1| 800|Residential|Tue May 20 00:00:...| 78000| 38.62468| -121.4392|\n| 483 ARCADE BLVD|SACRAMENTO|95815| CA| 4| 2| 1316|Residential|Tue May 20 00:00:...| 
89000| 38.62357|-121.45489|\n+----------------+----------+-----+-----+----+-----+------+-----------+--------------------+------+---------+----------+\nonly showing top 5 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781519960,"submitTime":1493781516724,"finishTime":1493781520384,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"febc498a-3b51-4ae2-a4e2-37dc237dfe62"},{"version":"CommandV1","origId":1271183488258044,"guid":"9a7220dc-6a15-4f6e-b2af-42912435c867","subtype":"command","commandType":"auto","position":3.875,"command":"%md\n\nIt's also straightforward to choose a subset of columns from your dataframe by calling '.select' on that dataframe.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516795,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"5227c198-bee3-4612-a198-743c799b5fd4"},{"version":"CommandV1","origId":1271183488258045,"guid":"8438961e-e80e-4295-8c11-efab71517880","subtype":"command","commandType":"auto","position":4.0,"command":"df.select('city','beds').show(10)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+--------------+----+\n| city|beds|\n+--------------+----+\n| SACRAMENTO| 2|\n| SACRAMENTO| 3|\n| SACRAMENTO| 2|\n| SACRAMENTO| 2|\n| SACRAMENTO| 2|\n| SACRAMENTO| 3|\n| SACRAMENTO| 3|\n| SACRAMENTO| 3|\n|RANCHO CORDOVA| 2|\n| RIO LINDA| 3|\n+--------------+----+\nonly showing top 10 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781520389,"submitTime":1493781516831,"finishTime":1493781521213,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"60540b65-40c4-48c7-a3ad-d56295fdb17a"},{"version":"CommandV1","origId":1271183488258046,"guid":"e699e4b9-1a24-40fc-8666-5675f2ee6c81","subtype":"command","commandType":"auto","position":4.5,"command":"%md\nHere's a count of how many houses have different numbers of bedrooms. Note that some houses seem to have 0 beds, which should be impossible. 
We'll keep that in mind for later.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781516918,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"ce1ed6b0-6fe4-414b-9d09-2a4e53a97a78"},{"version":"CommandV1","origId":1271183488258047,"guid":"d5f4dfa2-778a-4fe3-a13c-3b42b3b77983","subtype":"command","commandType":"auto","position":5.0,"command":"df.groupBy(\"beds\").count().show()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+----+-----+\n|beds|count|\n+----+-----+\n| 1| 10|\n| 6| 3|\n| 3| 413|\n| 5| 59|\n| 4| 258|\n| 8| 1|\n| 2| 133|\n| 0| 108|\n+----+-----+\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781521219,"submitTime":1493781516965,"finishTime":1493781522730,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"a3e4ab13-6f07-47e6-a6b7-eb4626617a9c"},{"version":"CommandV1","origId":1271183488258048,"guid":"4af7d09f-93d0-4ac8-a10b-b74f52570ef6","subtype":"command","commandType":"auto","position":5.5,"command":"%md\nDescribe allows us to retrieve summary statistics on one or more columns. Results are returned in a dataframe, so .show() is necessary to see them. Not only are some of the bed values 0, but baths and square feet each have mins of zero, which are bad data points.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517042,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"3af11fb2-b227-4bb7-b775-79243507f628"},{"version":"CommandV1","origId":1271183488258049,"guid":"78e4dbe4-d5ce-48f3-be46-db72032def2f","subtype":"command","commandType":"auto","position":6.0,"command":"df.describe(['baths', 'beds','price','sq__ft']).show()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+-------+------------------+------------------+------------------+------------------+\n|summary| baths| beds| price| sq__ft|\n+-------+------------------+------------------+------------------+------------------+\n| count| 985| 985| 985| 985|\n| mean|1.7766497461928934|2.9116751269035532|234144.26395939087|1314.9167512690356|\n| 
stddev|0.8953714223186463|1.3079322320435807|138365.83908492787| 853.0482425034448|\n| min| 0| 0| 1551| 0|\n| max| 5| 8| 884790| 5822|\n+-------+------------------+------------------+------------------+------------------+\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AnalysisException</span>: u"cannot resolve '`sqft`' given input columns: [type, sale_date, latitude, state, longitude, sq__ft, beds, city, street, zip, price, baths];;\\n'Aggregate [cast(count(baths#304) as string) AS baths#1086, cast(count(beds#303) as string) AS beds#1088, cast(count(price#308) as string) AS price#1090, cast(count('sqft) as string) AS sqft#1092, cast(avg(cast(baths#304 as bigint)) as string) AS baths#1094, cast(avg(cast(beds#303 as bigint)) as string) AS beds#1096, cast(avg(cast(price#308 as bigint)) as string) AS price#1098, cast(avg('sqft) as string) AS sqft#1100, cast(stddev_samp(cast(baths#304 as double)) as string) AS baths#1110, cast(stddev_samp(cast(beds#303 as double)) as string) AS beds#1120, cast(stddev_samp(cast(price#308 as double)) as string) AS price#1130, cast(stddev_samp('sqft) as string) AS sqft#1140, cast(min(baths#304) as string) AS baths#1142, cast(min(beds#303) as string) AS beds#1144, cast(min(price#308) as string) AS price#1146, cast(min('sqft) as string) AS sqft#1148, cast(max(baths#304) as string) AS baths#1150, cast(max(beds#303) as string) AS beds#1152, cast(max(price#308) as string) AS price#1154, cast(max('sqft) as string) AS sqft#1156]\\n+- Project [street#299, city#300, zip#301, state#302, beds#303, baths#304, sq__ft#305, type#306, sale_date#307, price#308, latitude#309, longitude#310]\\n +- SubqueryAlias realestate\\n +- Relation[street#299,city#300,zip#301,state#302,beds#303,baths#304,sq__ft#305,type#306,sale_date#307,price#308,latitude#309,longitude#310] csv\\n"","error":"<div class=\"ansiout\"><span 
class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AnalysisException</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-12-871a261ea2ba></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>df<span class=\"ansiyellow\">.</span>describe<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">[</span><span class=\"ansiblue\">'baths'</span><span class=\"ansiyellow\">,</span> <span class=\"ansiblue\">'beds'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'price'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'sqft'</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">.</span>show<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/dataframe.py</span> in <span class=\"ansicyan\">describe</span><span class=\"ansiblue\">(self, *cols)</span>\n<span class=\"ansigreen\"> 900</span> <span class=\"ansigreen\">if</span> len<span class=\"ansiyellow\">(</span>cols<span class=\"ansiyellow\">)</span> <span class=\"ansiyellow\">==</span> <span class=\"ansicyan\">1</span> <span class=\"ansigreen\">and</span> isinstance<span class=\"ansiyellow\">(</span>cols<span class=\"ansiyellow\">[</span><span class=\"ansicyan\">0</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> list<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 901</span> cols <span class=\"ansiyellow\">=</span> cols<span class=\"ansiyellow\">[</span><span class=\"ansicyan\">0</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 902</span><span 
class=\"ansiyellow\"> </span>jdf <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>_jdf<span class=\"ansiyellow\">.</span>describe<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">.</span>_jseq<span class=\"ansiyellow\">(</span>cols<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 903</span> <span class=\"ansigreen\">return</span> DataFrame<span class=\"ansiyellow\">(</span>jdf<span class=\"ansiyellow\">,</span> self<span class=\"ansiyellow\">.</span>sql_ctx<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 904</span> <span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py</span> in <span class=\"ansicyan\">__call__</span><span class=\"ansiblue\">(self, *args)</span>\n<span class=\"ansigreen\"> 1131</span> answer <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>gateway_client<span class=\"ansiyellow\">.</span>send_command<span class=\"ansiyellow\">(</span>command<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1132</span> return_value = get_return_value(\n<span class=\"ansigreen\">-> 1133</span><span class=\"ansiyellow\"> answer, self.gateway_client, self.target_id, self.name)\n</span><span class=\"ansigreen\"> 1134</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1135</span> <span class=\"ansigreen\">for</span> temp_arg <span class=\"ansigreen\">in</span> temp_args<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/utils.py</span> in <span class=\"ansicyan\">deco</span><span class=\"ansiblue\">(*a, **kw)</span>\n<span class=\"ansigreen\"> 67</span> e.java_exception.getStackTrace()))\n<span class=\"ansigreen\"> 68</span> <span 
class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'org.apache.spark.sql.AnalysisException: '</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">---> 69</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 70</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'org.apache.spark.sql.catalyst.analysis'</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 71</span> <span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AnalysisException</span>: u"cannot resolve '`sqft`' given input columns: [type, sale_date, latitude, state, longitude, sq__ft, beds, city, street, zip, price, baths];;\\n'Aggregate 
[cast(count(baths#304) as string) AS baths#1086, cast(count(beds#303) as string) AS beds#1088, cast(count(price#308) as string) AS price#1090, cast(count('sqft) as string) AS sqft#1092, cast(avg(cast(baths#304 as bigint)) as string) AS baths#1094, cast(avg(cast(beds#303 as bigint)) as string) AS beds#1096, cast(avg(cast(price#308 as bigint)) as string) AS price#1098, cast(avg('sqft) as string) AS sqft#1100, cast(stddev_samp(cast(baths#304 as double)) as string) AS baths#1110, cast(stddev_samp(cast(beds#303 as double)) as string) AS beds#1120, cast(stddev_samp(cast(price#308 as double)) as string) AS price#1130, cast(stddev_samp('sqft) as string) AS sqft#1140, cast(min(baths#304) as string) AS baths#1142, cast(min(beds#303) as string) AS beds#1144, cast(min(price#308) as string) AS price#1146, cast(min('sqft) as string) AS sqft#1148, cast(max(baths#304) as string) AS baths#1150, cast(max(beds#303) as string) AS beds#1152, cast(max(price#308) as string) AS price#1154, cast(max('sqft) as string) AS sqft#1156]\\n+- Project [street#299, city#300, zip#301, state#302, beds#303, baths#304, sq__ft#305, type#306, sale_date#307, price#308, latitude#309, longitude#310]\\n +- SubqueryAlias realestate\\n +- Relation[street#299,city#300,zip#301,state#302,beds#303,baths#304,sq__ft#305,type#306,sale_date#307,price#308,latitude#309,longitude#310] csv\\n"</div>","workflows":[],"startTime":1493781522735,"submitTime":1493781517078,"finishTime":1493781523711,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"83a9fdd8-44d0-4db2-9d07-aacb90d4614f"},{"version":"CommandV1","origId":1271183488258050,"guid":"d47f95a2-486e-4e2b-ae3b-ab4f7eef4c8f","subtype":"command","commandType":"auto","position":6.03125,"command":"%md\nI'm curious to take a look at the distribution of house prices too. I see that we've got a house that sold for $1551 on the low end and one for $884,790 on the high end. Databricks notebooks do allow you to use Matplotlib. The limitation seems to be that you have to bring whatever you are plotting into memory with something like Pandas, so this wouldn't work for big data without sampling or aggregating first (if you know of a way to plot directly from a Spark dataframe, please post in the comments).","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517153,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"07b48eb8-e76c-4dc6-ab1b-d25ec4e8a873"},{"version":"CommandV1","origId":1271183488258051,"guid":"22fa8eee-4b66-4940-b482-0b783714985b","subtype":"command","commandType":"auto","position":6.0625,"command":"import matplotlib.pyplot as plt\nfig, ax = plt.subplots()\nplt.hist(df.toPandas()['price'], bins = 25)\nplt.xlabel('US Dollars')\nplt.title('Distribution of Sacramento Home Prices 
')\ndisplay(fig)","commandVersion":0,"state":"finished","results":{"type":"image","data":"","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781523715,"submitTime":1493781517192,"finishTime":1493781527463,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"5ead9069-eb00-4b0e-869c-ec4dcc9ffc45"},{"version":"CommandV1","origId":1271183488258052,"guid":"a8be573b-9669-4719-a527-b3ea54504fc4","subtype":"command","commandType":"auto","position":6.125,"command":"%md\nLooks like we've got a right skewed distribution here with some outliers on the left as well. 
As a simple response to this for our exploratory model, later we'll remove prices below $50,000 or above $400,000.","commandVersion":0,"state":"finished","results":null,"errorSummary":"<span class=\"ansired\">AttributeError</span>: 'DataFrame' object has no attribute 'toPandas'","error":null,"workflows":[],"startTime":0,"submitTime":1493781517258,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"0e9b7e8a-9a0c-485d-8f94-84ecd14bd53b"},{"version":"CommandV1","origId":1271183488258053,"guid":"e870123a-d009-429a-b385-c7080cdc8c40","subtype":"command","commandType":"auto","position":6.25,"command":"%md\n\n### Regression with SparkML\nAs opposed to the previous iteration of this notebook where I used the RDD API for MLlib, this time I'll be using dataframes. Let's do a very simple linear regression. 
","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517293,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"02d41be2-973a-402a-add1-054ca1bd49a6"},{"version":"CommandV1","origId":1271183488258054,"guid":"7f72804a-11ba-4b6a-b715-edb4c47249e4","subtype":"command","commandType":"auto","position":6.375,"command":"from pyspark.ml.regression import LinearRegression\nfrom pyspark.ml.feature import VectorAssembler","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"Cancelled","error":null,"workflows":[],"startTime":1493781527468,"submitTime":1493781517339,"finishTime":1493781527541,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"4efc3bf5-c71d-4021-af0c-ee0ef2b56799"},{"version":"CommandV1","origId":1271183488258055,"guid":"61add7dd-4406-460c-aa7c-f77c5df3999e","subtype":"command","commandType":"auto","position":6.5,"command":"%md\n\nI'll start by creating a dataframe df that has only the subset of features I'm 
interested in. I'm going to predict home price from the number of baths, beds, and square feet.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517402,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"8f3d47c6-8b9a-4d78-8ec0-ca663673d171"},{"version":"CommandV1","origId":1271183488258056,"guid":"aac954a8-b049-40ea-bd26-7b1150a3dbc3","subtype":"command","commandType":"auto","position":7.0,"command":"df = df.select('price','baths','beds','sq__ft')","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">TypeError</span>: select() takes at most 3 arguments (5 given)","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">TypeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-238-602984fc41af></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">.</span>select<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'price'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'baths'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'beds'</span><span 
class=\"ansiyellow\">,</span><span class=\"ansiblue\">'sq__ft'</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">TypeError</span>: select() takes at most 3 arguments (5 given)</div>","workflows":[],"startTime":1493781527545,"submitTime":1493781517469,"finishTime":1493781527615,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"9d565bba-3f39-4deb-8a88-a764dbec0d30"},{"version":"CommandV1","origId":1271183488258057,"guid":"e58b3759-4493-408a-8641-49f8ec8a79b2","subtype":"command","commandType":"auto","position":7.0625,"command":"%md\nLet's remove those rows that have suspicious 0 values for any of the features we want to use for prediction and some of the more extreme house prices to get something closer to a normal distribution","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517546,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"3e54d998-685c-4be5-95d1-ce50a04ebd93"},{"version":"CommandV1","origId":1271183488258058,"guid":"0369eb28-363d-4164-a3f3-f607112304a7","subtype":"command","commandType":"auto","position":7.125,"command":"df = df[df.baths > 0]\ndf = df[df.beds > 0]\ndf = df[df.sq__ft > 0]\ndf = df[df.price > 50000]\ndf = df[df.price < 400000]\ndf.describe(['baths','beds','price','sq__ft']).show()","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+-------+------------------+------------------+------------------+-----------------+\n|summary| baths| beds| price| sq__ft|\n+-------+------------------+------------------+------------------+-----------------+\n| count| 741| 741| 741| 741|\n| mean|1.8960863697705803|3.1875843454790824|204151.37112010797|1486.379217273954|\n| stddev|0.6096062646787571| 0.821111455036673| 79585.60148803357|517.9177859727873|\n| min| 1| 1| 55422| 539|\n| max| 4| 8| 399000| 3612|\n+-------+------------------+------------------+------------------+-----------------+\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AttributeError</span>: 'DataFrame' object has no attribute 'baths'","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AttributeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-239-c1559468f89a></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">[</span>df<span class=\"ansiyellow\">.</span>baths <span class=\"ansiyellow\">></span> <span class=\"ansicyan\">0</span><span 
class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2</span> df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">[</span>df<span class=\"ansiyellow\">.</span>beds <span class=\"ansiyellow\">></span> <span class=\"ansicyan\">0</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 3</span> df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">[</span>df<span class=\"ansiyellow\">.</span>sq__ft <span class=\"ansiyellow\">></span> <span class=\"ansicyan\">0</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 4</span> df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">[</span>df<span class=\"ansiyellow\">.</span>price <span class=\"ansiyellow\">></span> <span class=\"ansicyan\">50000</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 5</span> df <span class=\"ansiyellow\">=</span> df<span class=\"ansiyellow\">[</span>df<span class=\"ansiyellow\">.</span>price <span class=\"ansiyellow\">></span> <span class=\"ansicyan\">400000</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/python/local/lib/python2.7/site-packages/pandas/core/generic.pyc</span> in <span class=\"ansicyan\">__getattr__</span><span class=\"ansiblue\">(self, name)</span>\n<span class=\"ansigreen\"> 2670</span> <span class=\"ansigreen\">if</span> name <span class=\"ansigreen\">in</span> self<span class=\"ansiyellow\">.</span>_info_axis<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2671</span> <span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">[</span>name<span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">-> 2672</span><span class=\"ansiyellow\"> </span><span 
class=\"ansigreen\">return</span> object<span class=\"ansiyellow\">.</span>__getattribute__<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> name<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2673</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2674</span> <span class=\"ansigreen\">def</span> __setattr__<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> name<span class=\"ansiyellow\">,</span> value<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AttributeError</span>: 'DataFrame' object has no attribute 'baths'</div>","workflows":[],"startTime":1493781527620,"submitTime":1493781517612,"finishTime":1493781528445,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"23f32698-3b3f-48d4-9f8d-cd2263f172e8"},{"version":"CommandV1","origId":1271183488258059,"guid":"8f88431b-ec88-4353-b9d0-d5ab30019214","subtype":"command","commandType":"auto","position":7.5,"command":"%md\nInstantiate a vector assembler object that will vectorize all of the feature columns that we are interested in using in our model.","commandVersion":0,"state":"finished","results":null,"errorSummary":"<span class=\"ansired\">ValueError</span>: Params must be either a param map or a list/tuple of param maps, but got <type 
'str'>.","error":null,"workflows":[],"startTime":0,"submitTime":1493781517696,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"fab0e061-2f5b-4de8-ae99-aed205e96830"},{"version":"CommandV1","origId":1271183488258060,"guid":"2dc82068-0bc2-4cee-9690-e1a67c57be3b","subtype":"command","commandType":"auto","position":7.75,"command":"features = [\"baths\",\"beds\",\"sq__ft\"]\nassembler = VectorAssembler(\n inputCols=features,\n outputCol='features')","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">SyntaxError</span><span class=\"ansired\">:</span> invalid syntax","error":"<div class=\"ansiout\"><span class=\"ansicyan\"> File </span><span class=\"ansigreen\">"<ipython-input-68-93ea5d8bd77a>"</span><span class=\"ansicyan\">, line </span><span class=\"ansigreen\">5</span>\n<span class=\"ansiyellow\"> outputCol='features').</span>\n<span class=\"ansigrey\"> ^</span>\n<span class=\"ansired\">SyntaxError</span><span class=\"ansired\">:</span> invalid syntax\n</div>","workflows":[],"startTime":1493781528450,"submitTime":1493781517733,"finishTime":1493781528490,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"d767b345-2264-42c0-b46c-ed46dd8a7ab0"},{"version":"CommandV1","origId":1271183488258061,"guid":"6ddeda12-b942-45a8-b9a3-de632f6f9641","subtype":"command","commandType":"auto","position":7.875,"command":"%md\nNow create a new dataframe with our vector assembled. Spark wants a column containing feature vectors in order to train a model.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517835,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"9a49ab82-f31e-4ded-ae3b-347782353243"},{"version":"CommandV1","origId":1271183488258062,"guid":"29ab3f3e-73bd-4161-ba91-dd00844591e8","subtype":"command","commandType":"auto","position":8.0,"command":"assembled_df = assembler.transform(df)\nassembled_df.show(5)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+-----+-----+----+------+----------------+\n|price|baths|beds|sq__ft| features|\n+-----+-----+----+------+----------------+\n|59222| 1| 2| 836| [1.0,2.0,836.0]|\n|68212| 1| 3| 1167|[1.0,3.0,1167.0]|\n|68880| 1| 2| 796| [1.0,2.0,796.0]|\n|69307| 1| 2| 852| [1.0,2.0,852.0]|\n|81900| 1| 2| 797| [1.0,2.0,797.0]|\n+-----+-----+----+------+----------------+\nonly showing top 5 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AttributeError</span>: 'DataFrame' 
object has no attribute '_jdf'","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AttributeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-241-8c18faaaa893></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>assembled_df <span class=\"ansiyellow\">=</span> assembler<span class=\"ansiyellow\">.</span>transform<span class=\"ansiyellow\">(</span>df<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2</span> assembled_df<span class=\"ansiyellow\">.</span>show<span class=\"ansiyellow\">(</span><span class=\"ansicyan\">5</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/ml/base.py</span> in <span class=\"ansicyan\">transform</span><span class=\"ansiblue\">(self, dataset, params)</span>\n<span class=\"ansigreen\"> 103</span> <span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>copy<span class=\"ansiyellow\">(</span>params<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">.</span>_transform<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 104</span> <span class=\"ansigreen\">else</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 105</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>_transform<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 106</span> <span class=\"ansigreen\">else</span><span class=\"ansiyellow\">:</span><span 
class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 107</span> <span class=\"ansigreen\">raise</span> ValueError<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">"Params must be a param map but got %s."</span> <span class=\"ansiyellow\">%</span> type<span class=\"ansiyellow\">(</span>params<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/ml/wrapper.py</span> in <span class=\"ansicyan\">_transform</span><span class=\"ansiblue\">(self, dataset)</span>\n<span class=\"ansigreen\"> 250</span> <span class=\"ansigreen\">def</span> _transform<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 251</span> self<span class=\"ansiyellow\">.</span>_transfer_params_to_java<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 252</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> DataFrame<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">.</span>_java_obj<span class=\"ansiyellow\">.</span>transform<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">.</span>_jdf<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">,</span> dataset<span class=\"ansiyellow\">.</span>sql_ctx<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 253</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 254</span> <span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/python/local/lib/python2.7/site-packages/pandas/core/generic.pyc</span> in <span class=\"ansicyan\">__getattr__</span><span class=\"ansiblue\">(self, name)</span>\n<span class=\"ansigreen\"> 2670</span> <span 
class=\"ansigreen\">if</span> name <span class=\"ansigreen\">in</span> self<span class=\"ansiyellow\">.</span>_info_axis<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2671</span> <span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">[</span>name<span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">-> 2672</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> object<span class=\"ansiyellow\">.</span>__getattribute__<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> name<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2673</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 2674</span> <span class=\"ansigreen\">def</span> __setattr__<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> name<span class=\"ansiyellow\">,</span> value<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AttributeError</span>: 'DataFrame' object has no attribute '_jdf'</div>","workflows":[],"startTime":1493781528494,"submitTime":1493781517890,"finishTime":1493781529702,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"9831e490-c7a4-4102-a7fe-a9e16cc7bf47"},{"version":"CommandV1","origId":1271183488258063,"guid":"c7a72314-6f5c-49e3-bae9-372c203029c6","subtype":"command","commandType":"auto","position":8.125,"command":"%md\nPerform a train/test 
split of our data.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781517965,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"630cae50-c8c4-42a1-a023-f8d026535f8b"},{"version":"CommandV1","origId":1271183488258064,"guid":"2c12367b-04a4-435a-bce8-b6bbdf5f1442","subtype":"command","commandType":"auto","position":8.25,"command":"train, test = assembled_df.randomSplit([0.6, 0.4], seed=0)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":null,"error":null,"workflows":[],"startTime":1493781529706,"submitTime":1493781518006,"finishTime":1493781529744,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"485e3a0a-2a66-483f-8d14-0910ba19036a"},{"version":"CommandV1","origId":1271183488258065,"guid":"7dd1b05c-def0-46ca-90c1-999d8164c32d","subtype":"command","commandType":"auto","position":8.5,"command":"%md\nNow we can instantiate a linear regression model and train it. 
We'll need to designate which columns contain the features and the target.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781518076,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"a48553d8-741c-4c8f-b1a0-5a0faf6136b6"},{"version":"CommandV1","origId":1271183488258066,"guid":"631646d4-9efb-4c39-89be-e3c20e94ce62","subtype":"command","commandType":"auto","position":9.0,"command":"lr = LinearRegression(maxIter=10).setLabelCol(\"price\").setFeaturesCol(\"features\")\nmodel = lr.fit(train)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">IllegalArgumentException</span>: u'requirement failed: Column baths must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually ShortType.'","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">IllegalArgumentException</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-93-2740d2e6b916></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>lrModel <span class=\"ansiyellow\">=</span> lr<span class=\"ansiyellow\">.</span>fit<span class=\"ansiyellow\">(</span>assembled_df<span 
class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/ml/base.py</span> in <span class=\"ansicyan\">fit</span><span class=\"ansiblue\">(self, dataset, params)</span>\n<span class=\"ansigreen\"> 62</span> <span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>copy<span class=\"ansiyellow\">(</span>params<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">.</span>_fit<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 63</span> <span class=\"ansigreen\">else</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">---> 64</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>_fit<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 65</span> <span class=\"ansigreen\">else</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 66</span> raise ValueError("Params must be either a param map or a list/tuple of param maps, "\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/ml/wrapper.py</span> in <span class=\"ansicyan\">_fit</span><span class=\"ansiblue\">(self, dataset)</span>\n<span class=\"ansigreen\"> 234</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 235</span> <span class=\"ansigreen\">def</span> _fit<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 236</span><span class=\"ansiyellow\"> </span>java_model <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>_fit_java<span class=\"ansiyellow\">(</span>dataset<span 
class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 237</span> <span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>_create_model<span class=\"ansiyellow\">(</span>java_model<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 238</span> <span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/ml/wrapper.py</span> in <span class=\"ansicyan\">_fit_java</span><span class=\"ansiblue\">(self, dataset)</span>\n<span class=\"ansigreen\"> 231</span> """\n<span class=\"ansigreen\"> 232</span> self<span class=\"ansiyellow\">.</span>_transfer_params_to_java<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">--> 233</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">return</span> self<span class=\"ansiyellow\">.</span>_java_obj<span class=\"ansiyellow\">.</span>fit<span class=\"ansiyellow\">(</span>dataset<span class=\"ansiyellow\">.</span>_jdf<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 234</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 235</span> <span class=\"ansigreen\">def</span> _fit<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">,</span> dataset<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py</span> in <span class=\"ansicyan\">__call__</span><span class=\"ansiblue\">(self, *args)</span>\n<span class=\"ansigreen\"> 1131</span> answer <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>gateway_client<span class=\"ansiyellow\">.</span>send_command<span class=\"ansiyellow\">(</span>command<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span 
class=\"ansigreen\"> 1132</span> return_value = get_return_value(\n<span class=\"ansigreen\">-> 1133</span><span class=\"ansiyellow\"> answer, self.gateway_client, self.target_id, self.name)\n</span><span class=\"ansigreen\"> 1134</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1135</span> <span class=\"ansigreen\">for</span> temp_arg <span class=\"ansigreen\">in</span> temp_args<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/utils.py</span> in <span class=\"ansicyan\">deco</span><span class=\"ansiblue\">(*a, **kw)</span>\n<span class=\"ansigreen\"> 77</span> <span class=\"ansigreen\">raise</span> QueryExecutionException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 78</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'java.lang.IllegalArgumentException: '</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">---> 79</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">raise</span> IllegalArgumentException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span 
class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 80</span> <span class=\"ansigreen\">raise</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 81</span> <span class=\"ansigreen\">return</span> deco<span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">IllegalArgumentException</span>: u'requirement failed: Column baths must be of type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 but was actually ShortType.'</div>","workflows":[],"startTime":1493781529748,"submitTime":1493781518115,"finishTime":1493781533669,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"265f6565-a8ad-4d73-8508-743137e35cde"},{"version":"CommandV1","origId":1271183488258067,"guid":"fa75f274-8923-4e7c-8008-7027cadddd55","subtype":"command","commandType":"auto","position":9.5,"command":"%md\nNow that our model is trained, lets evaluate on the test data.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781518194,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"32a50dbd-40ca-4a38-b1a0-35adcfdf3670"},{"version":"CommandV1","origId":1271183488258068,"guid":"1627e7dd-8ada-46c8-9f57-7ff222552ed5","subtype":"command","commandType":"auto","position":9.625,"command":"testing_summary = model.evaluate(test)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"></div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AttributeError</span>: 'LinearRegressionTrainingSummary' object has no attribute 'show'","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AttributeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-98-08ccd6d0af5c></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>lrModel<span class=\"ansiyellow\">.</span>summary<span class=\"ansiyellow\">.</span>show<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AttributeError</span>: 'LinearRegressionTrainingSummary' object has no attribute 'show'</div>","workflows":[],"startTime":1493781533674,"submitTime":1493781518250,"finishTime":1493781536189,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"a9654b83-ff5d-4bae-96a7-5d1b4a3767fb"},{"version":"CommandV1","origId":1271183488258069,"guid":"907d0351-5991-4f7c-a973-333742008d36","subtype":"command","commandType":"auto","position":9.6875,"command":"%md\nHere's a look at some predictions made by the model.","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781518321,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"c1b11b92-cd83-4e9d-9bfc-2b1f31257f37"},{"version":"CommandV1","origId":1271183488258070,"guid":"d3853a9e-0366-4630-b822-04e8d55d4373","subtype":"command","commandType":"auto","position":9.75,"command":"testing_summary.predictions.select('price','baths','beds','sq__ft','prediction').show(10)","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\">+-----+-----+----+------+------------------+\n|price|baths|beds|sq__ft| prediction|\n+-----+-----+----+------+------------------+\n|55422| 1| 2| 838|147583.46179541512|\n|60000| 1| 1| 611|133690.41622679384|\n|61000| 1| 2| 876|152040.47190950165|\n|61500| 1| 3| 970|150333.98207882006|\n|62050| 1| 2| 623|122366.16772887288|\n|68566| 1| 2| 864|150632.99503136906|\n|70000| 1| 2| 1011|167874.58678849327|\n|70000| 2| 4| 1099| 149588.0987783487|\n|71000| 1| 2| 900|154855.42566576682|\n|75000| 1| 2| 
861|150281.12581183593|\n+-----+-----+----+------+------------------+\nonly showing top 10 rows\n\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">AnalysisException</span>: u"cannot resolve '`predictions`' given input columns: [beds, prediction, price, baths, sq__ft, features];;\\n'Project [price#14304, baths#14300, beds#14299, sq__ft#14301, 'predictions]\\n+- Project [price#14304, baths#14300, beds#14299, sq__ft#14301, features#14917, UDF(features#14917) AS prediction#15002]\\n +- Sample 0.6, 1.0, false, 0\\n +- Sort [price#14304 ASC NULLS FIRST, baths#14300 ASC NULLS FIRST, beds#14299 ASC NULLS FIRST, sq__ft#14301 ASC NULLS FIRST, features#14917 ASC NULLS FIRST], false\\n +- Project [price#14304, baths#14300, beds#14299, sq__ft#14301, UDF(named_struct(baths_double_VectorAssembler_435f896d53ad9df8043c, cast(baths#14300 as double), beds_double_VectorAssembler_435f896d53ad9df8043c, cast(beds#14299 as double), sq__ft_double_VectorAssembler_435f896d53ad9df8043c, cast(sq__ft#14301 as double))) AS features#14917]\\n +- Filter (price#14304 < 400000)\\n +- Filter (price#14304 > 50000)\\n +- Filter (sq__ft#14301 > 0)\\n +- Filter (cast(beds#14299 as int) > 0)\\n +- Filter (cast(baths#14300 as int) > 0)\\n +- Project [price#14304, baths#14300, beds#14299, sq__ft#14301]\\n +- Project [street#14295, city#14296, zip#14297, state#14298, beds#14299, baths#14300, sq__ft#14301, type#14302, sale_date#14303, price#14304, latitude#14305, longitude#14306]\\n +- SubqueryAlias realestate\\n +- Relation[street#14295,city#14296,zip#14297,state#14298,beds#14299,baths#14300,sq__ft#14301,type#14302,sale_date#14303,price#14304,latitude#14305,longitude#14306] csv\\n"","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">AnalysisException</span> Traceback (most recent call last)\n<span 
class=\"ansigreen\"><ipython-input-23-c426213d244a></span> in <span class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>testing_summary<span class=\"ansiyellow\">.</span>predictions<span class=\"ansiyellow\">.</span>select<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'price'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'baths'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'beds'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'sq__ft'</span><span class=\"ansiyellow\">,</span><span class=\"ansiblue\">'predictions'</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">.</span>show<span class=\"ansiyellow\">(</span><span class=\"ansicyan\">10</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/dataframe.pyc</span> in <span class=\"ansicyan\">select</span><span class=\"ansiblue\">(self, *cols)</span>\n<span class=\"ansigreen\"> 991</span> <span class=\"ansiyellow\">[</span>Row<span class=\"ansiyellow\">(</span>name<span class=\"ansiyellow\">=</span><span class=\"ansiblue\">u'Alice'</span><span class=\"ansiyellow\">,</span> age<span class=\"ansiyellow\">=</span><span class=\"ansicyan\">12</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">,</span> Row<span class=\"ansiyellow\">(</span>name<span class=\"ansiyellow\">=</span><span class=\"ansiblue\">u'Bob'</span><span class=\"ansiyellow\">,</span> age<span class=\"ansiyellow\">=</span><span class=\"ansicyan\">15</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 992</span> """\n<span class=\"ansigreen\">--> 993</span><span class=\"ansiyellow\"> </span>jdf <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>_jdf<span 
class=\"ansiyellow\">.</span>select<span class=\"ansiyellow\">(</span>self<span class=\"ansiyellow\">.</span>_jcols<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">*</span>cols<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 994</span> <span class=\"ansigreen\">return</span> DataFrame<span class=\"ansiyellow\">(</span>jdf<span class=\"ansiyellow\">,</span> self<span class=\"ansiyellow\">.</span>sql_ctx<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 995</span> <span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py</span> in <span class=\"ansicyan\">__call__</span><span class=\"ansiblue\">(self, *args)</span>\n<span class=\"ansigreen\"> 1131</span> answer <span class=\"ansiyellow\">=</span> self<span class=\"ansiyellow\">.</span>gateway_client<span class=\"ansiyellow\">.</span>send_command<span class=\"ansiyellow\">(</span>command<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1132</span> return_value = get_return_value(\n<span class=\"ansigreen\">-> 1133</span><span class=\"ansiyellow\"> answer, self.gateway_client, self.target_id, self.name)\n</span><span class=\"ansigreen\"> 1134</span> <span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 1135</span> <span class=\"ansigreen\">for</span> temp_arg <span class=\"ansigreen\">in</span> temp_args<span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansigreen\">/databricks/spark/python/pyspark/sql/utils.pyc</span> in <span class=\"ansicyan\">deco</span><span class=\"ansiblue\">(*a, **kw)</span>\n<span class=\"ansigreen\"> 67</span> e.java_exception.getStackTrace()))\n<span class=\"ansigreen\"> 68</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span 
class=\"ansiyellow\">(</span><span class=\"ansiblue\">'org.apache.spark.sql.AnalysisException: '</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\">---> 69</span><span class=\"ansiyellow\"> </span><span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 70</span> <span class=\"ansigreen\">if</span> s<span class=\"ansiyellow\">.</span>startswith<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">'org.apache.spark.sql.catalyst.analysis'</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">:</span><span class=\"ansiyellow\"></span>\n<span class=\"ansigreen\"> 71</span> <span class=\"ansigreen\">raise</span> AnalysisException<span class=\"ansiyellow\">(</span>s<span class=\"ansiyellow\">.</span>split<span class=\"ansiyellow\">(</span><span class=\"ansiblue\">': '</span><span class=\"ansiyellow\">,</span> <span class=\"ansicyan\">1</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\">[</span><span class=\"ansicyan\">1</span><span class=\"ansiyellow\">]</span><span class=\"ansiyellow\">,</span> stackTrace<span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">AnalysisException</span>: u"cannot resolve '`predictions`' given input columns: [beds, prediction, price, baths, sq__ft, features];;\\n'Project [price#14304, baths#14300, beds#14299, sq__ft#14301, 'predictions]\\n+- Project [price#14304, baths#14300, beds#14299, 
sq__ft#14301, features#14917, UDF(features#14917) AS prediction#15002]\\n +- Sample 0.6, 1.0, false, 0\\n +- Sort [price#14304 ASC NULLS FIRST, baths#14300 ASC NULLS FIRST, beds#14299 ASC NULLS FIRST, sq__ft#14301 ASC NULLS FIRST, features#14917 ASC NULLS FIRST], false\\n +- Project [price#14304, baths#14300, beds#14299, sq__ft#14301, UDF(named_struct(baths_double_VectorAssembler_435f896d53ad9df8043c, cast(baths#14300 as double), beds_double_VectorAssembler_435f896d53ad9df8043c, cast(beds#14299 as double), sq__ft_double_VectorAssembler_435f896d53ad9df8043c, cast(sq__ft#14301 as double))) AS features#14917]\\n +- Filter (price#14304 < 400000)\\n +- Filter (price#14304 > 50000)\\n +- Filter (sq__ft#14301 > 0)\\n +- Filter (cast(beds#14299 as int) > 0)\\n +- Filter (cast(baths#14300 as int) > 0)\\n +- Project [price#14304, baths#14300, beds#14299, sq__ft#14301]\\n +- Project [street#14295, city#14296, zip#14297, state#14298, beds#14299, baths#14300, sq__ft#14301, type#14302, sale_date#14303, price#14304, latitude#14305, longitude#14306]\\n +- SubqueryAlias realestate\\n +- Relation[street#14295,city#14296,zip#14297,state#14298,beds#14299,baths#14300,sq__ft#14301,type#14302,sale_date#14303,price#14304,latitude#14305,longitude#14306] csv\\n"</div>","workflows":[],"startTime":1493781601138,"submitTime":1493781601140,"finishTime":1493781602247,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a 
user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"151055f8-0996-404f-974d-0e2c93fc45d8"},{"version":"CommandV1","origId":1271183488258077,"guid":"ab416058-788c-422f-984d-97d28b5c5f31","subtype":"command","commandType":"auto","position":10.0625,"command":"%md\nLet's also take a look at the RMSE to get an idea of how good our average prediction was.","commandVersion":0,"state":"error","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":0,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"70ecba0b-d07b-4102-bc03-9373eec60c73"},{"version":"CommandV1","origId":1271183488258076,"guid":"b62c1cda-9e0c-4455-87a2-50a1b8182a44","subtype":"command","commandType":"auto","position":10.375,"command":"testing_summary.rootMeanSquaredError","commandVersion":0,"state":"finished","results":{"type":"html","data":"<div class=\"ansiout\"><span class=\"ansired\">Out[</span><span class=\"ansired\">21</span><span class=\"ansired\">]: </span>58134.33374877389\n</div>","arguments":{},"addedWidgets":{},"removedWidgets":[]},"errorSummary":"<span class=\"ansired\">TypeError</span>: 'float' object is not callable","error":"<div class=\"ansiout\"><span class=\"ansired\">---------------------------------------------------------------------------</span>\n<span class=\"ansired\">TypeError</span> Traceback (most recent call last)\n<span class=\"ansigreen\"><ipython-input-123-a8423f0f72d1></span> in <span 
class=\"ansicyan\"><module></span><span class=\"ansiblue\">()</span>\n<span class=\"ansigreen\">----> 1</span><span class=\"ansiyellow\"> </span>testing_summary<span class=\"ansiyellow\">.</span>rootMeanSquaredError<span class=\"ansiyellow\">(</span><span class=\"ansiyellow\">)</span><span class=\"ansiyellow\"></span>\n\n<span class=\"ansired\">TypeError</span>: 'float' object is not callable</div>","workflows":[],"startTime":1493781536223,"submitTime":1493781518434,"finishTime":1493781536238,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"b40ae323-d130-4db7-b69f-9aabc52fe194"},{"version":"CommandV1","origId":1271183488258075,"guid":"f43d154a-49f4-4ca9-80e9-09b20b95a301","subtype":"command","commandType":"auto","position":11.0,"command":"%md\n\nThat's it for this basic introduction to Spark dataframes and MLlib. Looking back at the tutorial I wrote 2 years ago with this same content, it's amazing to see how much Spark has matured and improved. This notebook required a fraction of the previous code, and far less frustration. If you're interested in next steps, I suggest checking out the [Spark ML page on tuning models](https://spark.apache.org/docs/2.1.0/ml-tuning.html) to leverage concepts such as grid searching hyperparameters with cross validation. Also, if you are serious about running Spark against large data sets, I'd advise beginners to start thinking about when it is (and is not) to [leverage caching](http://spark.apache.org/docs/latest/quick-start.html#caching) for their use cases. 
","commandVersion":0,"state":"finished","results":null,"errorSummary":null,"error":null,"workflows":[],"startTime":0,"submitTime":1493781518501,"finishTime":0,"collapsed":false,"bindings":{},"inputWidgets":{},"displayType":"table","width":"auto","height":"auto","xColumns":null,"yColumns":null,"pivotColumns":null,"pivotAggregation":null,"customPlotOptions":{},"commentThread":[],"commentsVisible":false,"parentHierarchy":[],"diffInserts":[],"diffDeletes":[],"globalVars":{},"latestUser":"a user","commandTitle":"","showCommandTitle":false,"hideCommandCode":false,"hideCommandResult":false,"iPythonMetadata":null,"streamStates":{},"nuid":"f58c1c28-7d8d-4857-8ca8-7c2c39bc32fb"}],"dashboards":[],"guid":"855bd341-71ca-4840-81d7-1510583f01df","globalVars":{},"iPythonMetadata":null,"inputWidgets":{}};</script>
<!-- Loads notebook-main.js from the static CDN path. If the request fails, the
     onerror handler sets window.mainJsLoadError to true; the inline script in
     <body> reads that flag to decide whether to show a "Network Error" panel.
     NOTE(review): presumably this must stay a plain blocking script (no
     async/defer) so the flag is set before that inline check runs; confirm
     before changing how this script is loaded. -->
<script
src="https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/js/notebook-main.js"
onerror="window.mainJsLoadError = true;"></script>
</head>
<body>
<script>
  // Fallback UI for a failed bundle load. The notebook-main.js <script> tag in
  // <head> sets window.mainJsLoadError from its onerror handler; when that flag
  // is set, append a "Network Error" panel to the page body.
  //
  // Wrapped in an IIFE so the helper variables do not become properties of
  // window (top-level `var` declarations are hoisted into the global scope
  // even when the `if` branch is not taken).
  (function () {
    if (!window.mainJsLoadError) {
      return;
    }

    var scriptUrl = 'https://databricks-prod-cloudfront.cloud.databricks.com/static/e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855/js/notebook-main.js';

    // Creates <tagName> containing `text` as a plain text node, so the text is
    // never parsed as HTML.
    function textElement(tagName, text) {
      var el = document.createElement(tagName);
      el.appendChild(document.createTextNode(text));
      return el;
    }

    // Same structure the original innerHTML string produced:
    //   <div><h1>…</h1><p><b>…</b></p><p>…url…</p></div>
    var panel = document.createElement('div');
    panel.appendChild(textElement('h1', 'Network Error'));

    var advice = document.createElement('p');
    advice.appendChild(textElement('b', 'Please check your network connection and try again.'));
    panel.appendChild(advice);

    panel.appendChild(textElement('p', 'Could not load a required resource: ' + scriptUrl));

    panel.style.margin = '30px';
    panel.style.padding = '20px 50px';
    panel.style.backgroundColor = '#f5f5f5';
    panel.style.borderRadius = '5px';

    // This script runs inside <body>, so document.body is already available.
    document.body.appendChild(panel);
  })();
</script>
</body>
</html>