From 1f7e282e70ab95fb02b1dcdaf3e708fb41f2cbee Mon Sep 17 00:00:00 2001 From: Hussein Awala Date: Thu, 5 Dec 2024 01:08:07 +0100 Subject: [PATCH] chore: add sqlfluff to lint spark sql queries (#113) --- .pre-commit-config.yaml | 6 ++ examples/airflow/iceberg_query.sql | 15 +++- examples/airflow/query.sql | 6 +- poetry.lock | 113 ++++++++++++++++++++++++++++- pyproject.toml | 15 ++++ 5 files changed, 147 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 867a546..675f50e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,3 +5,9 @@ repos: - id: ruff args: [ --fix, --exit-non-zero-on-fix ] - id: ruff-format + - repo: https://github.com/sqlfluff/sqlfluff + rev: 3.2.5 + hooks: + - id: sqlfluff-lint + - id: sqlfluff-fix + args: [--config, "./pyproject.toml"] diff --git a/examples/airflow/iceberg_query.sql b/examples/airflow/iceberg_query.sql index 1698303..13dabcc 100644 --- a/examples/airflow/iceberg_query.sql +++ b/examples/airflow/iceberg_query.sql @@ -3,12 +3,19 @@ CREATE TABLE IF NOT EXISTS prod.db.sample ( data string, category string ) -USING iceberg +USING ICEBERG PARTITIONED BY (category); MERGE INTO prod.db.sample t -USING (SELECT * FROM prod.db.another_sample WHERE category = 'foo') s -ON t.id = s.id +USING ( + SELECT + id, + data, + category + FROM prod.db.another_sample + WHERE category = 'foo' +) s + ON t.id = s.id WHEN MATCHED THEN UPDATE SET * WHEN NOT MATCHED THEN INSERT * -WHEN NOT MATCHED BY SOURCE THEN DELETE; +WHEN NOT MATCHED BY SOURCE THEN DELETE; -- noqa: PRS diff --git a/examples/airflow/query.sql b/examples/airflow/query.sql index 15026e2..c368597 100644 --- a/examples/airflow/query.sql +++ b/examples/airflow/query.sql @@ -7,8 +7,8 @@ USING PARQUET PARTITIONED BY (student_id INT); INSERT INTO students VALUES - ('Amy Smith', '123 Park Ave, San Jose', {{ ts }}, 111111); +('Amy Smith', '123 Park Ave, San Jose', '{{ ts }}', 111111); INSERT INTO students VALUES - ('Bob Brown', '456 Taylor St, Cupertino', {{ ts }}, 222222);, - ('Cathy Johnson', '789 Race Ave, Palo Alto', {{ ts}}, 333333); +('Bob Brown', '456 Taylor St, Cupertino', '{{ ts }}', 222222), +('Cathy Johnson', '789 Race Ave, Palo Alto', '{{ ts }}', 333333); diff --git a/poetry.lock b/poetry.lock index 0bff720..049b2a9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -615,6 +615,17 @@ marshmallow = ["marshmallow (>=3.18.0)"] tests = ["apispec[marshmallow,yaml]", "openapi-spec-validator (==0.7.1)", "pytest"] yaml = ["PyYAML (>=3.10)"] +[[package]] +name = "appdirs" +version = "1.4.4" +description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +optional = false +python-versions = "*" +files = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] + [[package]] name = "argcomplete" version = "3.5.1" @@ -824,6 +835,17 @@ files = [ {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, ] +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + [[package]] name = "charset-normalizer" version = "3.4.0" @@ -1210,6 +1232,26 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "jinja2 (>=3.0.3,<3.1.0)", "setuptools", "sphinx (<2)", "tox"] +[[package]] +name = "diff-cover" +version = "9.2.0" +description = "Run coverage and linting reports on diffs" +optional = false +python-versions = "<4.0.0,>=3.8.10" +files = [ + {file = "diff_cover-9.2.0-py3-none-any.whl", hash = "sha256:1e24edc51c39e810c47dd9986e76c333ed95859655c091f572e590c39cabbdbe"}, + {file = "diff_cover-9.2.0.tar.gz", hash = "sha256:85a0b353ebbb678f9e87ea303f75b545bd0baca38f563219bb72f2ae862bba36"}, +] + +[package.dependencies] +chardet = ">=3.0.0" +Jinja2 = ">=2.7.1" +pluggy = ">=0.13.1,<2" +Pygments = ">=2.9.0,<3.0.0" + +[package.extras] +toml = ["tomli (>=1.2.1)"] + [[package]] name = "dill" version = "0.3.9" @@ -4484,6 +4526,32 @@ test-all = ["Babel (>=1.3)", "Jinja2 (>=2.3)", "Pygments (>=1.2)", "arrow (>=0.3 timezone = ["python-dateutil"] url = ["furl (>=0.4.1)"] +[[package]] +name = "sqlfluff" +version = "3.2.5" +description = "The SQL Linter for Humans" +optional = false +python-versions = ">=3.8" +files = [ + {file = "sqlfluff-3.2.5-py3-none-any.whl", hash = "sha256:ae9ff821986b5b0dd1ea858392db7f0eb80343c2cdeee7900fa031f581e04643"}, + {file = "sqlfluff-3.2.5.tar.gz", hash = "sha256:39822db2c6ad7dac9f6e43d36a3d086c503c051b09665d14a5bdf644770f6ef6"}, +] + +[package.dependencies] +appdirs = "*" +chardet = "*" +click = "*" +colorama = ">=0.3" +diff-cover = ">=2.5.0" +Jinja2 = "*" +pathspec = "*" +pytest = "*" +pyyaml = ">=5.1" +regex = "*" +tblib = "*" +toml = {version = "*", markers = "python_version < \"3.11\""} +tqdm = "*" + [[package]] name = "sqlparse" version = "0.5.2" @@ -4547,6 +4615,17 @@ files = [ [package.extras] widechars = ["wcwidth"] +[[package]] +name = "tblib" +version = "3.0.0" +description = "Traceback serialization library." +optional = false +python-versions = ">=3.8" +files = [ + {file = "tblib-3.0.0-py3-none-any.whl", hash = "sha256:80a6c77e59b55e83911e1e607c649836a69c103963c5f28a46cbeef44acf8129"}, + {file = "tblib-3.0.0.tar.gz", hash = "sha256:93622790a0a29e04f0346458face1e144dc4d32f493714c6c3dff82a4adb77e6"}, +] + [[package]] name = "tenacity" version = "9.0.0" @@ -4587,6 +4666,17 @@ files = [ {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, ] +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, + {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, +] + [[package]] name = "tomli" version = "2.1.0" @@ -4598,6 +4688,27 @@ files = [ {file = "tomli-2.1.0.tar.gz", hash = "sha256:3f646cae2aec94e17d04973e4249548320197cfabdf130015d023de4b74d8ab8"}, ] +[[package]] +name = "tqdm" +version = "4.67.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2"}, + {file = "tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["nbval", "pytest (>=6)", "pytest-asyncio (>=0.24)", "pytest-cov", "pytest-timeout"] +discord = ["requests"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -5095,4 +5206,4 @@ api = ["aiohttp", "fastapi", "httpx", "jinja2", "kubernetes-asyncio", "uvicorn", [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "cfd5b64d934a38891d912931c8bf34b3e355afb62e01180ce84972006f4036bf" +content-hash = "ff56ec36b1c8541a17edd5c0d1abb771805035943cfc7ad1c8e5bbd11d1e7828" diff --git a/pyproject.toml b/pyproject.toml index 4e2af53..ecd5f3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ mkdocstrings = {version = "^0.25.1", extras = ["python"]} mkdocs-gen-files = "^0.5.0" mkdocs-literate-nav = "^0.6.1" helm-mkdocs = "^0.0.5" +sqlfluff = "^3.2.5" [tool.poetry.extras] api = ["fastapi", "kubernetes-asyncio", "uvicorn", "httpx", "jinja2", "aiohttp", "websockets"] @@ -90,3 +91,17 @@ docstring-code-format = true [tool.ruff.lint.pydocstyle] convention = "google" + +[tool.sqlfluff.core] +dialect = "sparksql" +sql_file_exts = ".sql" +ignore = "templating" +exclude_rules="AL01" + +[tool.sqlfluff.indentation] +indented_joins = false +indented_using_on = true +template_blocks_indent = false + +[tool.sqlfluff.rules.capitalisation.keywords] +capitalisation_policy = "upper"