[SPARK-50672][PYTHON][TESTS] Make openpyxl optional in PySpark Tests

### What changes were proposed in this pull request?
Make `openpyxl` optional in PySpark tests.

### Why are the changes needed?
`openpyxl` is an optional dependency of pandas and is therefore also optional for PySpark, so the tests should not fail without it.

### Does this PR introduce _any_ user-facing change?
No, test only.

### How was this patch tested?
Manually checked.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #49294 from zhengruifeng/optional_openpyxl.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
apache · Dec 26, 2024 · d3022e9 · d3022e9
1 parent 92948e7
commit d3022e9
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 1 deletion.
diff --git a/python/pyspark/pandas/tests/io/test_dataframe_conversion.py b/python/pyspark/pandas/tests/io/test_dataframe_conversion.py
@@ -26,7 +26,12 @@
 from pyspark import pandas as ps
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 from pyspark.testing.sqlutils import SQLTestUtils
-from pyspark.testing.utils import have_jinja2, jinja2_requirement_message
+from pyspark.testing.utils import (
+    have_openpyxl,
+    openpyxl_requirement_message,
+    have_jinja2,
+    jinja2_requirement_message,
+)
 
 
 class DataFrameConversionMixin:
@@ -87,6 +92,7 @@ def get_excel_dfs(pandas_on_spark_location, pandas_location):
             "expected": pd.read_excel(pandas_location, index_col=0),
         }
 
+    @unittest.skipIf(not have_openpyxl, openpyxl_requirement_message)
     def test_to_excel(self):
         with self.temp_dir() as dirpath:
             pandas_location = dirpath + "/" + "output1.xlsx"

diff --git a/python/pyspark/testing/utils.py b/python/pyspark/testing/utils.py
@@ -97,6 +97,9 @@ def have_package(name: str) -> bool:
 have_jinja2 = have_package("jinja2")
 jinja2_requirement_message = None if have_jinja2 else "No module named 'jinja2'"
 
+have_openpyxl = have_package("openpyxl")
+openpyxl_requirement_message = None if have_openpyxl else "No module named 'openpyxl'"
+
 pandas_requirement_message = None
 try:
     from pyspark.sql.pandas.utils import require_minimum_pandas_version