2 changes: 2 additions & 0 deletions rules/S7472/metadata.json
@@ -0,0 +1,2 @@
{
}
25 changes: 25 additions & 0 deletions rules/S7472/python/metadata.json
@@ -0,0 +1,25 @@
{
"title": "Avoid the usage of \"unionAll()\" in favor of \"union()\"",
"type": "CODE_SMELL",
"status": "ready",
"remediation": {
"func": "Constant\/Issue",
"constantCost": "5min"
},
"tags": [
"data-science",
"pyspark"
],
"defaultSeverity": "Major",
"ruleSpecification": "RSPEC-7472",
"sqKey": "S7472",
"scope": "All",
"defaultQualityProfiles": ["Sonar way"],
"quickfix": "unknown",
Contributor:
I guess I can add a quick fix, but I don't know how to do it 😢 Do you know what I should write here?

Contributor:
So, depending on how we implement it, you can put one of the keywords listed here.
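For illustration only: assuming the schema accepts a keyword such as `covered` (the actual allowed values should be taken from the list linked above), a resolved field could look like this hypothetical fragment:

```json
"quickfix": "covered"
```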

"code": {
"impacts": {
"MAINTAINABILITY": "MEDIUM",
Contributor:
I think you have one extra `,` here...

},
"attribute": "CONVENTIONAL"
}
}
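Assuming the extra comma flagged above is the one after `"MEDIUM"`, the corrected `code` block would read as follows (JSON does not allow trailing commas):

```json
"code": {
  "impacts": {
    "MAINTAINABILITY": "MEDIUM"
  },
  "attribute": "CONVENTIONAL"
}
```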
52 changes: 52 additions & 0 deletions rules/S7472/python/rule.adoc
@@ -0,0 +1,52 @@
This rule raises an issue when `pyspark.sql.DataFrame.unionAll` is used instead of `pyspark.sql.DataFrame.union`.


== Why is this an issue?

When using PySpark, avoid the `unionAll()` method in favor of the `union()` method when combining two DataFrames.
The `unionAll()` method is an alias for `union()` and provides exactly the same functionality.
Using `union()` ensures compatibility with future versions of PySpark and aligns with current best practices.

=== Code examples

==== Noncompliant code example

[source,python,diff-id=1,diff-type=noncompliant]
----
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

data1 = [(1, "Alice"), (2, "Bob")]
data2 = [(3, "Cathy"), (4, "David")]

df1 = spark.createDataFrame(data1, ["id", "name"])
df2 = spark.createDataFrame(data2, ["id", "name"])

combined_df = df1.unionAll(df2) # Noncompliant: unionAll() should be replaced by union()
----

==== Compliant solution

[source,python,diff-id=1,diff-type=compliant]
----
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Example").getOrCreate()

data1 = [(1, "Alice"), (2, "Bob")]
data2 = [(3, "Cathy"), (4, "David")]

df1 = spark.createDataFrame(data1, ["id", "name"])
df2 = spark.createDataFrame(data2, ["id", "name"])

combined_df = df1.union(df2)
----
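
Note that `union()` has the same semantics as `unionAll()`: like SQL `UNION ALL`, it does not remove duplicate rows. The sketch below (illustrative only, not part of the rule examples) shows how to deduplicate explicitly when that is the intent:

[source,python]
----
# union() concatenates rows without deduplication (SQL UNION ALL semantics).
duplicated_df = df1.union(df1)              # 4 rows: every row of df1 appears twice
deduplicated_df = duplicated_df.distinct()  # back to the 2 distinct rows
----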

== Resources
=== Documentation

* MungingData post - https://www.mungingdata.com/pyspark/union-unionbyname-merge-dataframes/[Combining PySpark DataFrames]
* Spark By Examples - https://sparkbyexamples.com/pyspark/pyspark-union-and-unionall/[union and unionAll]
* PySpark Documentation - https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.unionAll.html[pyspark.sql.DataFrame.unionAll]
