
Hello, I am trying to run the following code on Databricks, on a 3-node Spark cluster. I read data from a Kinesis stream into a Spark DataFrame and transform it to extract the payload, which is a JSON file name.

In the code below, I try to download the data for each JSON file name from AWS S3; my payload is a string of the form "filename.json".

import scala.io.Source
import com.amazonaws.services.s3._
import java.io.InputStream

val ds = interim_df_v1.select("payload").mapPartitions { iterator =>
  // one S3 client per partition; stream each object's lines
  val s3Client: AmazonS3 = new AmazonS3Client()
  iterator.flatMap { row =>
    val bucket = "bucket_name"
    Source.fromInputStream(s3Client.getObject(bucket, row.getString(0)).getObjectContent: InputStream).getLines
  }
}

I also tried creating a singleton object to access the S3 client inside mapPartitions, but nothing seems to work; I get the following error:

    at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withAction(Dataset.scala:3496)
    at org.apache.spark.sql.Dataset.collect(Dataset.scala:2872)
    at org.apache.spark.sql.execution.streaming.MemorySink.addBatch(memory.scala:276)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5$$anonfun$apply$19.apply(MicroBatchExecution.scala:580)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1$$anonfun$apply$1.apply(SQLExecution.scala:112)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:233)
    at org.apache.spark.sql.execution.SQLExecution$$anonfun$withCustomExecutionEnv$1.apply(SQLExecution.scala:98)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:835)
    at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:74)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:185)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch$5.apply(MicroBatchExecution.scala:578)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:263)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:61)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution.org$apache$spark$sql$execution$streaming$MicroBatchExecution$$runBatch(MicroBatchExecution.scala:577)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply$mcV$sp(MicroBatchExecution.scala:208)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:176)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1$$anonfun$apply$mcZ$sp$1.apply(MicroBatchExecution.scala:176)
    at org.apache.spark.sql.execution.streaming.ProgressReporter$class.reportTimeTaken(ProgressReporter.scala:263)
    at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:61)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$runActivatedStream$1.apply$mcZ$sp(MicroBatchExecution.scala:176)
    at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:56)
    at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:170)
    at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:296)
    at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:208)
Caused by: java.io.NotSerializableException: com.amazonaws.services.s3.AmazonS3Client
Serialization stack:
    - object not serializable (class: com.amazonaws.services.s3.AmazonS3Client, value: com.amazonaws.services.s3.AmazonS3Client@73d83b2c)
    - field (class: line019c6b4503ad49929c23c8abc36c49b4166.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw, name: s3Client, type: interface com.amazonaws.services.s3.AmazonS3)
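
For context, the singleton attempt was roughly along these lines (a simplified sketch, not the exact notebook cell; S3ClientHolder and the bucket name are illustrative). The idea was to hold the client in a @transient lazy val on an object so it is constructed on each executor JVM instead of being serialized from the driver:

import scala.io.Source
import com.amazonaws.services.s3.{AmazonS3, AmazonS3Client}

// Holder object so the client is built lazily on the executors
object S3ClientHolder extends Serializable {
  @transient lazy val client: AmazonS3 = new AmazonS3Client()
}

val ds2 = interim_df_v1.select("payload").mapPartitions { iterator =>
  val bucket = "bucket_name"
  iterator.flatMap { row =>
    Source.fromInputStream(S3ClientHolder.client.getObject(bucket, row.getString(0)).getObjectContent).getLines
  }
}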

Please let me know of any other workarounds.
