2 changes: 2 additions & 0 deletions rules/S7472/metadata.json
@@ -0,0 +1,2 @@
{
}
25 changes: 25 additions & 0 deletions rules/S7472/python/metadata.json
@@ -0,0 +1,25 @@
{
"title": "Avoid the usage of \"unionAll()\" in favor of \"union()\"",
"type": "CODE_SMELL",
"status": "ready",
"remediation": {
"func": "Constant\/Issue",
"constantCost": "5min"
},
"tags": [
"data-science",
"pyspark"
],
"defaultSeverity": "Major",
"ruleSpecification": "RSPEC-7472",
"sqKey": "S7472",
"scope": "All",
"defaultQualityProfiles": ["Sonar way"],
"quickfix": "unknown",
Contributor:
I guess I can add a quick fix but I don't know how to do it 😢 Do you know what I should write here?

Contributor:
Depending on how we can implement it, you can use one of the keywords listed here.

"code": {
"impacts": {
"MAINTAINABILITY": "MEDIUM",
Contributor:
I think you have one extra , here...

},
"attribute": "CONVENTIONAL"
}
}
51 changes: 51 additions & 0 deletions rules/S7472/python/rule.adoc
@@ -0,0 +1,51 @@
This rule raises an issue when `pyspark.sql.DataFrame.unionAll` is used instead of `pyspark.sql.DataFrame.union`.


== Why is this an issue?

When using PySpark, it is recommended to avoid the `unionAll()` method and instead use `union()` when combining two DataFrames. The `unionAll()` method is deprecated and has been replaced by `union()`, which provides the same functionality. Using `union()` ensures compatibility with future versions of PySpark and aligns with current best practices.
Contributor:
For the sake of consistency, I would maybe put all occurrences of `unionAll()` and `union()` between backticks. WDYT?


=== Code examples

==== Noncompliant code example

[source,python,diff-id=1,diff-type=noncompliant]
----
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

data1 = [(1, "Alice"), (2, "Bob")]
data2 = [(3, "Cathy"), (4, "David")]

df1 = spark.createDataFrame(data1, ["id", "name"])
df2 = spark.createDataFrame(data2, ["id", "name"])

# Noncompliant: unionAll() is deprecated
Contributor:
Here I think it would be best to put it on the same line as the issue raised. This way the diff view contains less changes on SQS.

combined_df = df1.unionAll(df2)
----

==== Compliant solution

[source,python,diff-id=1,diff-type=compliant]
----
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

data1 = [(1, "Alice"), (2, "Bob")]
data2 = [(3, "Cathy"), (4, "David")]

df1 = spark.createDataFrame(data1, ["id", "name"])
df2 = spark.createDataFrame(data2, ["id", "name"])

combined_df = df1.union(df2)
----
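One subtlety that may be worth calling out in the rule description (this note is an editorial addition, not part of the committed text): despite its name, `union()` keeps duplicate rows, exactly as the deprecated `unionAll()` did; it matches SQL `UNION ALL`, not SQL `UNION`. A minimal sketch, reusing the session setup from the examples above:

[source,python]
----
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

df = spark.createDataFrame([(1, "Alice")], ["id", "name"])

# union() preserves duplicates, just like the deprecated unionAll()
combined = df.union(df)
assert combined.count() == 2

# apply distinct() afterwards to get SQL UNION (deduplicated) semantics
assert combined.distinct().count() == 1
----

So switching from `unionAll()` to `union()` never changes the result: the rename is purely cosmetic, which is why the quick fix is a safe drop-in replacement.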

== Resources
=== Documentation

* Mungingdata post - https://www.mungingdata.com/pyspark/union-unionbyname-merge-dataframes/[Combining PySpark DataFrames]
* Spark by examples - https://sparkbyexamples.com/pyspark/pyspark-union-and-unionall/[union and unionAll]
* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.unionAll.html[pyspark.sql.DataFrame.unionAll]
