
Commit 916a779

[SPARK-53449][SQL] Simplify options for builtin Datasource Scan related classes
### What changes were proposed in this pull request?

Simplify interoperations between SQLConf and builtin Datasource Scan related classes, following [SPARK-52704](https://issues.apache.org/jira/browse/SPARK-52704) and [SPARK-53415](https://issues.apache.org/jira/browse/SPARK-53415).

### Why are the changes needed?

- Reduce code duplication
- Restore type annotations for IDEs

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #52192 from yaooqinn/SPARK-53449.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
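For orientation, the recurring change in this commit is to mix in `SessionStateHelper` and read session-level settings through its typed accessors instead of chaining `sparkSession.sessionState.conf`. Below is a minimal sketch of that pattern, assuming only the `getSqlConf`/`getHadoopConf` signatures visible in the diffs; the `MyScanNode` class is purely illustrative and not part of this patch.

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.{SessionStateHelper, SQLConf}

// Illustrative only: a class that needs session settings mixes in the helper
// instead of reaching through sparkSession.sessionState at every call site.
class MyScanNode(sparkSession: SparkSession) extends SessionStateHelper {

  // Typed SQLConf handle; the explicit annotation is what the description
  // means by restoring type annotations for IDEs.
  private val conf: SQLConf = getSqlConf(sparkSession)

  def bucketingEnabled: Boolean = conf.bucketingEnabled

  // Session Hadoop configuration with per-source options applied on top.
  def hadoopConf(options: Map[String, String]): Configuration =
    getHadoopConf(sparkSession, options)
}
```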
1 parent 5a30d68 commit 916a779

File tree

5 files changed (+42, -39 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala

Lines changed: 10 additions & 10 deletions
@@ -37,7 +37,7 @@ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat =>
 import org.apache.spark.sql.execution.datasources.v2.{PushedDownOperators, TableSampleInfo}
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
 import org.apache.spark.sql.execution.vectorized.ConstantColumnVector
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SessionStateHelper, SQLConf}
 import org.apache.spark.sql.sources.{BaseRelation, Filter}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -295,7 +295,7 @@ case class RowDataSourceScanExec(
 /**
  * A base trait for file scans containing file listing and metrics code.
  */
-trait FileSourceScanLike extends DataSourceScanExec {
+trait FileSourceScanLike extends DataSourceScanExec with SessionStateHelper {

   // Filters on non-partition columns.
   def dataFilters: Seq[Expression]
@@ -327,7 +327,7 @@ trait FileSourceScanLike extends DataSourceScanExec {
     relation.fileFormat.vectorTypes(
       requiredSchema = requiredSchema,
       partitionSchema = relation.partitionSchema,
-      relation.sparkSession.sessionState.conf).map { vectorTypes =>
+      getSqlConf(relation.sparkSession)).map { vectorTypes =>
       vectorTypes ++
         // for column-based file format, append metadata column's vector type classes if any
         fileConstantMetadataColumns.map { _ => classOf[ConstantColumnVector].getName }
@@ -414,7 +414,7 @@ trait FileSourceScanLike extends DataSourceScanExec {

   // exposed for testing
   lazy val bucketedScan: Boolean = {
-    if (relation.sparkSession.sessionState.conf.bucketingEnabled && relation.bucketSpec.isDefined
+    if (getSqlConf(relation.sparkSession).bucketingEnabled && relation.bucketSpec.isDefined
       && !disableBucketedScan) {
       val spec = relation.bucketSpec.get
       val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n))
@@ -535,7 +535,7 @@ trait FileSourceScanLike extends DataSourceScanExec {
         bucketedKey -> "true",
         "SelectedBucketsCount" -> (s"$numSelectedBuckets out of ${spec.numBuckets}" +
           optionalNumCoalescedBuckets.map { b => s" (Coalesced to $b)"}.getOrElse("")))
-    } else if (!relation.sparkSession.sessionState.conf.bucketingEnabled) {
+    } else if (!getSqlConf(relation.sparkSession).bucketingEnabled) {
       metadata + (bucketedKey -> "false (disabled by configuration)")
     } else if (disableBucketedScan) {
       metadata + (bucketedKey -> "false (disabled by query planner)")
@@ -646,7 +646,7 @@ trait FileSourceScanLike extends DataSourceScanExec {
   }

   override def calculateTotalPartitionBytes: Long = {
-    val openCostInBytes = relation.sparkSession.sessionState.conf.filesOpenCostInBytes
+    val openCostInBytes = getSqlConf(relation.sparkSession).filesOpenCostInBytes
     partitionDirectories.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
   }

@@ -698,7 +698,7 @@ case class FileSourceScanExec(
   // Note that some vals referring the file-based relation are lazy intentionally
   // so that this plan can be canonicalized on executor side too. See SPARK-23731.
   override lazy val supportsColumnar: Boolean = {
-    val conf = relation.sparkSession.sessionState.conf
+    val conf = getSqlConf(relation.sparkSession)
     // Only output columnar if there is WSCG to read it.
     val requiredWholeStageCodegenSettings =
       conf.wholeStageEnabled && !WholeStageCodegenExec.isTooManyFields(conf, schema)
@@ -725,7 +725,7 @@ case class FileSourceScanExec(
       requiredSchema = requiredSchema,
       filters = pushedDownFilters,
       options = options,
-      hadoopConf = relation.sparkSession.sessionState.newHadoopConfWithOptions(relation.options))
+      hadoopConf = getHadoopConf(relation.sparkSession, relation.options))

     val readRDD = if (bucketedScan) {
       createBucketedReadRDD(relation.bucketSpec.get, readFile, dynamicallySelectedPartitions)
@@ -849,15 +849,15 @@ case class FileSourceScanExec(
   private def createReadRDD(
       readFile: PartitionedFile => Iterator[InternalRow],
       selectedPartitions: ScanFileListing): RDD[InternalRow] = {
-    val openCostInBytes = relation.sparkSession.sessionState.conf.filesOpenCostInBytes
+    val openCostInBytes = getSqlConf(relation.sparkSession).filesOpenCostInBytes
     val maxSplitBytes =
       FilePartition.maxSplitBytes(relation.sparkSession, selectedPartitions)
     logInfo(log"Planning scan with bin packing, max size: ${MDC(MAX_SPLIT_BYTES, maxSplitBytes)} " +
       log"bytes, open cost is considered as scanning ${MDC(OPEN_COST_IN_BYTES, openCostInBytes)} " +
       log"bytes.")

     // Filter files with bucket pruning if possible
-    val bucketingEnabled = relation.sparkSession.sessionState.conf.bucketingEnabled
+    val bucketingEnabled = getSqlConf(relation.sparkSession).bucketingEnabled
     val shouldProcess: Path => Boolean = optionalBucketSet match {
       case Some(bucketSet) if bucketingEnabled =>
         // Do not prune the file if bucket file name is invalid

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala

Lines changed: 15 additions & 15 deletions
@@ -53,7 +53,7 @@ import org.apache.spark.sql.execution.streaming.{Sink, Source}
 import org.apache.spark.sql.execution.streaming.runtime._
 import org.apache.spark.sql.execution.streaming.sinks.FileStreamSink
 import org.apache.spark.sql.execution.streaming.sources.{RateStreamProvider, TextSocketSourceProvider}
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SessionStateHelper, SQLConf}
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.{DataType, StructField, StructType}
@@ -100,12 +100,14 @@ case class DataSource(
     partitionColumns: Seq[String] = Seq.empty,
     bucketSpec: Option[BucketSpec] = None,
     options: Map[String, String] = Map.empty,
-    catalogTable: Option[CatalogTable] = None) extends Logging {
+    catalogTable: Option[CatalogTable] = None) extends SessionStateHelper with Logging {

   case class SourceInfo(name: String, schema: StructType, partitionColumns: Seq[String])

+  private val conf: SQLConf = getSqlConf(sparkSession)
+
   lazy val providingClass: Class[_] = {
-    val cls = DataSource.lookupDataSource(className, sparkSession.sessionState.conf)
+    val cls = DataSource.lookupDataSource(className, conf)
     // `providingClass` is used for resolving data source relation for catalog tables.
     // As now catalog for data source V2 is under development, here we fall back all the
     // [[FileDataSourceV2]] to [[FileFormat]] to guarantee the current catalog works.
@@ -120,8 +122,7 @@ case class DataSource(

   private[sql] def providingInstance(): Any = providingClass.getConstructor().newInstance()

-  private def newHadoopConfiguration(): Configuration =
-    sparkSession.sessionState.newHadoopConfWithOptions(options)
+  private def newHadoopConfiguration(): Configuration = getHadoopConf(sparkSession, options)

   private def makeQualified(path: Path): Path = {
     val fs = path.getFileSystem(newHadoopConfiguration())
@@ -130,7 +131,7 @@ case class DataSource(

   lazy val sourceInfo: SourceInfo = sourceSchema()
   private val caseInsensitiveOptions = CaseInsensitiveMap(options)
-  private val equality = sparkSession.sessionState.conf.resolver
+  private val equality = conf.resolver

   /**
    * Whether or not paths should be globbed before being used to access files.
@@ -262,7 +263,7 @@ case class DataSource(
       }
     }

-    val isSchemaInferenceEnabled = sparkSession.sessionState.conf.streamingSchemaInference
+    val isSchemaInferenceEnabled = conf.streamingSchemaInference
     val isTextSource = providingClass == classOf[text.TextFileFormat]
     val isSingleVariantColumn = (providingClass == classOf[json.JsonFileFormat] ||
       providingClass == classOf[csv.CSVFileFormat]) &&
@@ -281,8 +282,7 @@ case class DataSource(
           checkAndGlobPathIfNecessary(checkEmptyGlobPath = false, checkFilesExist = false)
           createInMemoryFileIndex(globbedPaths)
         })
-        val forceNullable = sparkSession.sessionState.conf
-          .getConf(SQLConf.FILE_SOURCE_SCHEMA_FORCE_NULLABLE)
+        val forceNullable = conf.getConf(SQLConf.FILE_SOURCE_SCHEMA_FORCE_NULLABLE)
         val sourceDataSchema = if (forceNullable) dataSchema.asNullable else dataSchema
         SourceInfo(
           s"FileSource[$path]",
@@ -381,7 +381,7 @@ case class DataSource(
           if FileStreamSink.hasMetadata(
             caseInsensitiveOptions.get("path").toSeq ++ paths,
             newHadoopConfiguration(),
-            sparkSession.sessionState.conf) =>
+            conf) =>
         val basePath = new Path((caseInsensitiveOptions.get("path").toSeq ++ paths).head)
         val fileCatalog = new MetadataLogFileIndex(sparkSession, basePath,
           caseInsensitiveOptions, userSpecifiedSchema)
@@ -407,11 +407,11 @@ case class DataSource(

       // This is a non-streaming file based datasource.
       case (format: FileFormat, _) =>
-        val useCatalogFileIndex = sparkSession.sessionState.conf.manageFilesourcePartitions &&
+        val useCatalogFileIndex = conf.manageFilesourcePartitions &&
           catalogTable.isDefined && catalogTable.get.tracksPartitionsInCatalog &&
           catalogTable.get.partitionColumnNames.nonEmpty
         val (fileCatalog, dataSchema, partitionSchema) = if (useCatalogFileIndex) {
-          val defaultTableSize = sparkSession.sessionState.conf.defaultSizeInBytes
+          val defaultTableSize = conf.defaultSizeInBytes
           val index = new CatalogFileIndex(
             sparkSession,
             catalogTable.get,
@@ -475,7 +475,7 @@ case class DataSource(
       throw QueryExecutionErrors.dataPathNotSpecifiedError()
     }

-    val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
+    val caseSensitive = conf.caseSensitiveAnalysis
     PartitioningUtils.validatePartitionColumn(data.schema, partitionColumns, caseSensitive)

     val fileIndex = catalogTable.map(_.identifier).map { tableIdent =>
@@ -531,7 +531,7 @@ case class DataSource(
         disallowWritingIntervals(
           outputColumns.toStructType.asNullable, format.toString, forbidAnsiIntervals = false)
         val cmd = planForWritingFileFormat(format, mode, data)
-        val qe = sparkSession.sessionState.executePlan(cmd)
+        val qe = sessionState(sparkSession).executePlan(cmd)
         qe.assertCommandExecuted()
         // Replace the schema with that of the DataFrame we just wrote out to avoid re-inferring
         copy(userSpecifiedSchema = Some(outputColumns.toStructType.asNullable)).resolveRelation()
@@ -555,7 +555,7 @@ case class DataSource(
         SaveIntoDataSourceCommand(data, dataSource, caseInsensitiveOptions, mode)
       case format: FileFormat =>
         disallowWritingIntervals(data.schema, format.toString, forbidAnsiIntervals = false)
-        DataSource.validateSchema(format.toString, data.schema, sparkSession.sessionState.conf)
+        DataSource.validateSchema(format.toString, data.schema, conf)
         planForWritingFileFormat(format, mode, data)
       case _ => throw SparkException.internalError(
         s"${providingClass.getCanonicalName} does not allow create table as select.")

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/HadoopFsRelation.scala

Lines changed: 4 additions & 3 deletions
@@ -20,6 +20,7 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.spark.sql.{SparkSession, SQLContext}
 import org.apache.spark.sql.catalyst.catalog.BucketSpec
 import org.apache.spark.sql.execution.FileRelation
+import org.apache.spark.sql.internal.SessionStateHelper
 import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
 import org.apache.spark.sql.types.{StructField, StructType}

@@ -46,7 +47,7 @@ case class HadoopFsRelation(
     bucketSpec: Option[BucketSpec],
     fileFormat: FileFormat,
     options: Map[String, String])(val sparkSession: SparkSession)
-  extends BaseRelation with FileRelation {
+  extends BaseRelation with FileRelation with SessionStateHelper{

   override def sqlContext: SQLContext = sparkSession.sqlContext

@@ -55,7 +56,7 @@ case class HadoopFsRelation(
   // respects the data types of the partition schema.
   val (schema: StructType, overlappedPartCols: Map[String, StructField]) =
     PartitioningUtils.mergeDataAndPartitionSchema(dataSchema,
-      partitionSchema, sparkSession.sessionState.conf.caseSensitiveAnalysis)
+      partitionSchema, getSqlConf(sparkSession).caseSensitiveAnalysis)

   override def toString: String = {
     fileFormat match {
@@ -65,7 +66,7 @@ case class HadoopFsRelation(
   }

   override def sizeInBytes: Long = {
-    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
+    val compressionFactor = getSqlConf(sparkSession).fileCompressionFactor
     (location.sizeInBytes * compressionFactor).toLong
   }


sql/core/src/main/scala/org/apache/spark/sql/internal/SessionStateHelper.scala

Lines changed: 5 additions & 1 deletion
@@ -27,7 +27,7 @@ import org.apache.spark.sql.SparkSession
  * It also provides type annotations for IDEs to build indexes.
  */
 trait SessionStateHelper {
-  private def sessionState(sparkSession: SparkSession): SessionState = {
+  protected def sessionState(sparkSession: SparkSession): SessionState = {
     sparkSession.sessionState
   }

@@ -48,6 +48,10 @@ trait SessionStateHelper {
       options: Map[String, String]): Configuration = {
     sessionState(sparkSession).newHadoopConfWithOptions(options)
   }
+
+  def getHadoopConf(sparkSession: SparkSession): Configuration = {
+    sessionState(sparkSession).newHadoopConf()
+  }
 }

 object SessionStateHelper extends SessionStateHelper
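This hunk does two things: `sessionState` becomes `protected`, so classes mixing in the trait (as in the other files of this commit) can call it directly, and a no-options `getHadoopConf(sparkSession)` overload is added alongside the existing options-taking one. Because the file also defines `object SessionStateHelper extends SessionStateHelper`, the public helpers can be reached without mixing in the trait. A hedged usage sketch follows; the app name and the `fs.defaultFS` option are arbitrary example values, and the trait is in any case Spark-internal API shown here only to illustrate the call shape.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SessionStateHelper

object SessionStateHelperUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SessionStateHelperUsage")
      .master("local[1]")
      .getOrCreate()

    // New overload in this commit: plain session Hadoop conf, no extra options.
    val plain = SessionStateHelper.getHadoopConf(spark)

    // Existing overload: session Hadoop conf with per-source options layered on top.
    val layered = SessionStateHelper.getHadoopConf(spark, Map("fs.defaultFS" -> "file:///"))

    println(plain.get("fs.defaultFS"))
    println(layered.get("fs.defaultFS"))
    spark.stop()
  }
}
```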

sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveTableScanExec.scala

Lines changed: 8 additions & 10 deletions
@@ -40,7 +40,7 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.metric.SQLMetrics
 import org.apache.spark.sql.hive._
 import org.apache.spark.sql.hive.client.HiveClientImpl
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{SessionStateHelper, SQLConf}
 import org.apache.spark.sql.types.{BooleanType, DataType}
 import org.apache.spark.util.Utils

@@ -57,12 +57,12 @@ case class HiveTableScanExec(
     relation: HiveTableRelation,
     partitionPruningPred: Seq[Expression])(
     @transient private val sparkSession: SparkSession)
-  extends LeafExecNode with CastSupport {
+  extends LeafExecNode with CastSupport with SessionStateHelper {

   require(partitionPruningPred.isEmpty || relation.isPartitioned,
     "Partition pruning predicates only supported for partitioned tables.")

-  override def conf: SQLConf = sparkSession.sessionState.conf
+  override def conf: SQLConf = getSqlConf(sparkSession)

   override def nodeName: String = s"Scan hive ${relation.tableMeta.qualifiedName}"

@@ -98,7 +98,7 @@ case class HiveTableScanExec(
   // Create a local copy of hadoopConf,so that scan specific modifications should not impact
   // other queries
   @transient private lazy val hadoopConf = {
-    val c = sparkSession.sessionState.newHadoopConf()
+    val c = getHadoopConf(sparkSession)
     // append columns ids and names before broadcast
     addColumnMetadataToConf(c)
     c
@@ -175,8 +175,7 @@ case class HiveTableScanExec(
         prunePartitions(hivePartitions)
       }
     } else {
-      if (sparkSession.sessionState.conf.metastorePartitionPruning &&
-        partitionPruningPred.nonEmpty) {
+      if (conf.metastorePartitionPruning && partitionPruningPred.nonEmpty) {
        rawPartitions
      } else {
        prunePartitions(rawPartitions)
@@ -187,16 +186,15 @@ case class HiveTableScanExec(
   // exposed for tests
   @transient lazy val rawPartitions: Seq[HivePartition] = {
     val prunedPartitions =
-      if (sparkSession.sessionState.conf.metastorePartitionPruning &&
-        partitionPruningPred.nonEmpty) {
+      if (conf.metastorePartitionPruning && partitionPruningPred.nonEmpty) {
        // Retrieve the original attributes based on expression ID so that capitalization matches.
        val normalizedFilters = partitionPruningPred.map(_.transform {
          case a: AttributeReference => originalAttributes(a)
        })
-        sparkSession.sessionState.catalog
+        sessionState(sparkSession).catalog
          .listPartitionsByFilter(relation.tableMeta.identifier, normalizedFilters)
      } else {
-        sparkSession.sessionState.catalog.listPartitions(relation.tableMeta.identifier)
+        sessionState(sparkSession).catalog.listPartitions(relation.tableMeta.identifier)
      }
    prunedPartitions.map(HiveClientImpl.toHivePartition(_, hiveQlTable))
  }
