Skip to content

Commit

Permalink
Add support for regex based CGPatterns
Browse files Browse the repository at this point in the history
ghstack-source-id: 78529c182f9c258ff3cf09f10a72856d1857e6f6
Pull Request resolved: #529
  • Loading branch information
ketkarameya committed Jul 11, 2023
1 parent 6fed671 commit eaf000c
Show file tree
Hide file tree
Showing 12 changed files with 387 additions and 17 deletions.
19 changes: 15 additions & 4 deletions src/models/capture_group_patterns.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Copyright (c) 2023 Uber Technologies, Inc.
use crate::{
models::Validator,
utilities::{
regex_utilities::get_all_matches_for_regex,
tree_sitter_utilities::{get_all_matches_for_query, get_ts_query_parser, number_of_errors},
Instantiate,
},
Expand All @@ -24,7 +25,7 @@ use serde_derive::Deserialize;
use std::collections::HashMap;
use tree_sitter::{Node, Query};

use super::matches::Match;
use super::{default_configs::REGEX_QUERY_PREFIX, matches::Match};

#[pyclass]
#[derive(Deserialize, Debug, Clone, Default, PartialEq, Hash, Eq)]
Expand All @@ -38,12 +39,20 @@ impl CGPattern {
pub(crate) fn pattern(&self) -> String {
self.0.to_string()
}

pub(crate) fn extract_regex(&self) -> Result<Regex, regex::Error> {
let mut _val = &self.pattern()[REGEX_QUERY_PREFIX.len()..];
Regex::new(_val)
}
}

impl Validator for CGPattern {
fn validate(&self) -> Result<(), String> {
if self.pattern().starts_with("rgx ") {
panic!("Regex not supported")
return self
.extract_regex()
.map(|_| Ok(()))
.unwrap_or(Err(format!("Cannot parse the regex - {}", self.pattern())));
}
let mut parser = get_ts_query_parser();
parser
Expand All @@ -70,7 +79,7 @@ impl Instantiate for CGPattern {
#[derive(Debug)]
pub(crate) enum CompiledCGPattern {
Q(Query),
R(Regex), // Regex is not yet supported
R(Regex),
}

impl CompiledCGPattern {
Expand Down Expand Up @@ -103,7 +112,9 @@ impl CompiledCGPattern {
replace_node,
replace_node_idx,
),
CompiledCGPattern::R(_) => panic!("Regex is not yet supported!!!"),
CompiledCGPattern::R(regex) => {
get_all_matches_for_regex(node, source_code, regex, recursive, replace_node)
}
}
}
}
2 changes: 2 additions & 0 deletions src/models/default_configs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ pub const THRIFT: &str = "thrift";
pub const STRINGS: &str = "strings";
pub const TS_SCHEME: &str = "scm"; // We support scheme files that contain tree-sitter query

pub const REGEX_QUERY_PREFIX: &str = "rgx ";

#[cfg(test)]
//FIXME: Remove this hack by not passing PiranhaArguments to SourceCodeUnit
pub(crate) const UNUSED_CODE_PATH: &str = "/dev/null";
Expand Down
40 changes: 39 additions & 1 deletion src/models/matches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,18 @@ pub(crate) struct Match {
gen_py_str_methods!(Match);

impl Match {
pub(crate) fn from_regex(
mtch: &regex::Match, matches: HashMap<String, String>, source_code: &str,
) -> Self {
Match {
matched_string: mtch.as_str().to_string(),
range: Range::from_regex_match(mtch, source_code),
matches,
associated_comma: None,
associated_comments: Vec::new(),
}
}

pub(crate) fn new(
matched_string: String, range: tree_sitter::Range, matches: HashMap<String, String>,
) -> Self {
Expand Down Expand Up @@ -231,7 +243,7 @@ impl Match {
serde_derive::Serialize, Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Deserialize,
)]
#[pyclass]
struct Range {
pub(crate) struct Range {
#[pyo3(get)]
start_byte: usize,
#[pyo3(get)]
Expand Down Expand Up @@ -260,6 +272,32 @@ impl From<tree_sitter::Range> for Range {
}
gen_py_str_methods!(Range);

impl Range {
pub(crate) fn from_regex_match(mtch: &regex::Match, source_code: &str) -> Self {
Self {
start_byte: mtch.start(),
end_byte: mtch.end(),
start_point: position_for_offset(source_code.as_bytes(), mtch.start()),
end_point: position_for_offset(source_code.as_bytes(), mtch.end()),
}
}
}

// Finds the position (col and row number) for a given offset.
// Copied from tree-sitter tests [https://github.com/tree-sitter/tree-sitter/blob/d0029a15273e526925a764033e9b7f18f96a7ce5/cli/src/parse.rs#L364]
fn position_for_offset(input: &[u8], offset: usize) -> Point {
let mut result = Point { row: 0, column: 0 };
for c in &input[0..offset] {
if *c as char == '\n' {
result.row += 1;
result.column = 0;
} else {
result.column += 1;
}
}
result
}

/// A range of positions in a multi-line text document, both in terms of bytes and of
/// rows and columns.
#[derive(
Expand Down
5 changes: 4 additions & 1 deletion src/models/rule_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,10 @@ impl RuleStore {
pub(crate) fn query(&mut self, cg_pattern: &CGPattern) -> &CompiledCGPattern {
let pattern = cg_pattern.pattern();
if pattern.starts_with("rgx ") {
panic!("Regex not supported.")
return &*self
.rule_query_cache
.entry(pattern)
.or_insert_with(|| CompiledCGPattern::R(cg_pattern.extract_regex().unwrap()));
}

&*self
Expand Down
10 changes: 0 additions & 10 deletions src/models/unit_tests/rule_graph_validation_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,3 @@ fn test_filter_bad_arg_contains_n_sibling() {
.sibling_count(2)
.build();
}

#[test]
#[should_panic(expected = "Regex not supported")]
fn test_unsupported_regex() {
RuleGraphBuilder::default()
.rules(vec![
piranha_rule! {name = "Test rule", query = "rgx (\\w+) (\\w)+"},
])
.build();
}
3 changes: 2 additions & 1 deletion src/tests/test_piranha_java.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,11 @@ create_rewrite_tests! {
test_non_seed_user_rule: "non_seed_user_rule", 1, substitutions = substitutions! {"input_type_name" => "ArrayList"};
test_insert_field_and_initializer: "insert_field_and_initializer", 1;
test_user_option_delete_if_empty: "user_option_delete_if_empty", 1;
test_user_option_do_not_delete_if_empty : "user_option_do_not_delete_if_empty", 1, delete_file_if_empty =false;
test_user_option_do_not_delete_if_empty : "user_option_do_not_delete_if_empty", 1, delete_file_if_empty = false;
test_new_line_character_used_in_string_literal: "new_line_character_used_in_string_literal", 1;
test_java_delete_method_invocation_argument: "delete_method_invocation_argument", 1;
test_java_delete_method_invocation_argument_no_op: "delete_method_invocation_argument_no_op", 0;
test_regex_based_matcher: "regex_based_matcher", 1, cleanup_comments = true;
}

create_match_tests! {
Expand Down
1 change: 1 addition & 0 deletions src/utilities/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Copyright (c) 2023 Uber Technologies, Inc.
limitations under the License.
*/

pub(crate) mod regex_utilities;
pub(crate) mod tree_sitter_utilities;
use std::collections::HashMap;
use std::error::Error;
Expand Down
73 changes: 73 additions & 0 deletions src/utilities/regex_utilities.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/*
Copyright (c) 2023 Uber Technologies, Inc.
<p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
except in compliance with the License. You may obtain a copy of the License at
<p>http://www.apache.org/licenses/LICENSE-2.0
<p>Unless required by applicable law or agreed to in writing, software distributed under the
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied. See the License for the specific language governing permissions and
limitations under the License.
*/

use crate::models::matches::Match;
use itertools::Itertools;
use regex::Regex;
use std::collections::HashMap;
use tree_sitter::Node;

/// Applies the query upon the given `node`, and gets all the matches
/// # Arguments
/// * `node` - the root node to apply the query upon
/// * `source_code` - the corresponding source code string for the node.
/// * `recursive` - if `true` it matches the query to `self` and `self`'s sub-ASTs, else it matches the `query` only to `self`.
/// * `replace_node` - node to replace
///
/// # Returns
/// List containing all the matches against `node`
pub(crate) fn get_all_matches_for_regex(
node: &Node, source_code: String, regex: &Regex, recursive: bool, replace_node: Option<String>,
) -> Vec<Match> {
let all_captures = regex.captures_iter(&source_code).collect_vec();
let names = regex.capture_names().collect_vec();
let mut all_matches = vec![];
for captures in all_captures {
// Check if the range of the self (node), and the range of outermost node captured by the query are equal.
let range_matches_node = node.start_byte() == captures.get(0).unwrap().start()
&& node.end_byte() == captures.get(0).unwrap().end();
let range_matches_inside_node = node.start_byte() <= captures.get(0).unwrap().start()
&& node.end_byte() >= captures.get(0).unwrap().end();
if (recursive && range_matches_inside_node) || range_matches_node {
let replace_node_match = if let Some(ref rn) = replace_node {
captures
.name(rn)
.unwrap_or_else(|| panic!("The tag {rn} provided in the replace node is not present"))
} else {
captures.get(0).unwrap()
};
let matches = extract_captures(&captures, &names);
all_matches.push(Match::from_regex(
&replace_node_match,
matches,
&source_code,
));
}
}
all_matches
}

// Creates a hashmap from the capture group(name) to the corresponding code snippet.
fn extract_captures(
captures: &regex::Captures<'_>, names: &Vec<Option<&str>>,
) -> HashMap<String, String> {
names
.iter()
.flatten()
.flat_map(|x| {
captures
.name(x)
.map(|v| (x.to_string(), v.as_str().to_string()))
})
.collect()
}
31 changes: 31 additions & 0 deletions test-resources/java/regex_based_matcher/configurations/edges.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2023 Uber Technologies, Inc.
#
# <p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of the License at
# <p>http://www.apache.org/licenses/LICENSE-2.0
#
# <p>Unless required by applicable law or agreed to in writing, software distributed under the
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing permissions and
# limitations under the License.

[[edges]]
scope = "File"
from = "update_import"
to = ["update_list_int"]

[[edges]]
scope = "Method"
from = "update_list_int"
to = ["update_add"]


[[edges]]
scope = "File"
from = "delete_import_our_map_of"
to = ["change_type_from_OurHashMap", "add_import_For_hashmap"]

[[edges]]
scope = "Method"
from = "change_type_from_OurHashMap"
to = ["update_push"]
105 changes: 105 additions & 0 deletions test-resources/java/regex_based_matcher/configurations/rules.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Copyright (c) 2023 Uber Technologies, Inc.
#
# <p>Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of the License at
# <p>http://www.apache.org/licenses/LICENSE-2.0
#
# <p>Unless required by applicable law or agreed to in writing, software distributed under the
# License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# Replace foo().bar().baz() with `true` inside methods not nnotated as @DoNotCleanup
[[rules]]
name = "replace_call"
query = """rgx (?P<n1>foo\\(\\)\\.bar\\(\\)\\.baz\\(\\))"""
replace_node = "n1"
replace = "true"
groups = ["replace_expression_with_boolean_literal"]
[[rules.filters]]
enclosing_node = """(method_declaration) @md"""
not_contains = ["""rgx @DoNotCleanup"""]

# Before:
# abc().def().ghi()
# abc().fed().ghi()
[[rules]]
name = "replace_call_def_fed"
query = """rgx (?P<n>abc\\(\\)\\.(?P<m_def>def)\\(\\)\\.ghi\\(\\))"""
replace_node = "m_def"
replace = "fed"


# The below three rules do a dummy type migration from OurListOfInteger to List<Integer>

# Updates the import statement from `out.list.OurListOfInteger` to `java.util.List`
[[rules]]
name = "update_import"
query = """rgx (?P<n>our\\.list\\.OurListOfInteger)"""
replace_node = "n"
replace = "java.util.List"

# Updates the type of local variables from `OurListOfInteger` to `List<Integer>`
[[rules]]
name = "update_list_int"
query = """rgx (?P<var_decl>(?P<type>OurListOf(?P<param>Integer))\\s*(?P<name>\\w+)\\s*=.*;)"""
replace_node = "type"
replace = "List<@param>"
is_seed_rule = false
[[rules.filter]]
enclosing_node = "(method_declaration) @cmd"

# Updates the relevant callsite from `addToOurList` to `add`
[[rules]]
name = "update_add"
query = """rgx (?P<call>@name\\.(?P<m_name>addToOurList)\\(\\w+\\))"""
replace_node = "m_name"
replace = "add"
holes = ["name"]
is_seed_rule = false


# The below three rules do a dummy type migration OurMapOf{T1}{T2} to HashMap<T1, T2>. For example, from OurMapOfStringInteger to HashMap<String, Integer>.
# This is to exercise non-constant regex matches for replace_node.

# Deletes the import statement `our.map.OurMapOf...`
[[rules]]
name = "delete_import_our_map_of"
query = """rgx (?P<n>import our\\.map\\.OurMapOf\\w+;)"""
replace_node = "n"
replace = ""

# Adds Import to java.util.Hashmap if absent
[[rules]]
name = "add_import_For_hashmap"
query = """(package_declaration) @pkg"""
replace_node = "pkg"
replace = "@pkg\nimport java.util.HashMap;"
is_seed_rule = false
[[rules.filters]]
enclosing_node = "(program) @prgrm"
not_contains = [
"((import_declaration) @im (#match? @im \"java.util.HashMap\"))",
]

# Before:
# OurMapOfStringInteger
# After:
# HashMap<String, Integer>
[[rules]]
name = "change_type_from_OurHashMap"
query = """rgx (?P<var_decl>(?P<type>\\bOurMapOf(?P<param1>[A-Z]\\w+)(?P<param2>[A-Z]\\w+))\\s*(?P<name>\\w+)\\s*=.*;)"""
replace_node = "type"
replace = "HashMap<@param1, @param2>"
is_seed_rule = false
[[rules.filter]]
enclosing_node = "(method_declaration) @cmd"

# Updates the relevant callsite from `pushIntoOurMap` to `push`
[[rules]]
name = "update_push"
query = """rgx (?P<call>@name\\.(?P<m_name>pushIntoOurMap)\\(.*\\))"""
replace_node = "m_name"
replace = "push"
holes = ["name"]
is_seed_rule = false
Loading

0 comments on commit eaf000c

Please sign in to comment.