tree-sitter安装与基本使用

初始化tree-sitter

安装tree-sitter

1
# Pin below 0.22: the snippets that follow rely on Language.build_library and
# Language(path, name), which newer py-tree-sitter releases no longer provide.
pip install "tree-sitter<0.22"

语言支持

1
2
3
4
5
# Create a folder for the languages to be parsed and download their grammar
# support from the tree-sitter organisation on GitHub
mkdir vendor
cd vendor
git clone https://github.com/tree-sitter/tree-sitter-cpp
git clone https://github.com/tree-sitter/tree-sitter-c

创建build文件夹

1
2
# build must be a sibling directory of vendor, so run this from vendor's
# parent directory (not from inside vendor)
mkdir build

创建language_build.py,用它生成.so文件。该.so文件是由各语言的语法定义编译得到的解析器动态库(并不是编译器),tree-sitter加载它之后即可把代码解析成语法树

1
2
3
4
5
6
7
8
9
10
11
from tree_sitter import Language, Parser
# Grammar checkouts to compile together; the paths are resolved relative to
# the current working directory.
grammar_dirs = [
    '../vendor/tree-sitter-c',
    '../vendor/tree-sitter-cpp',
]

# Compile every grammar above into one shared library. The output path is
# also relative to the current working directory.
Language.build_library('my-languages.so', grammar_dirs)

运行该文件

1
# The script uses relative paths: run it from the directory where
# my-languages.so should be written and from which ../vendor resolves.
python language_build.py

使用初探

基本过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 导入依赖
from tree_sitter import Language, Parser
# Path of the compiled shared library and the grammar name to load from it
CPP_LANGUAGE = Language('../build/my-languages.so', 'cpp')
C_LANGUAGE = Language('../build/my-languages.so', 'c')

# Example: parse a source file with the C++ grammar
cpp_parser = Parser()
cpp_parser.set_language(CPP_LANGUAGE)

file_path = "dot/177755_CVE-2015-7540_CWE-399_vul.c"
# Decode explicitly as UTF-8 so that it mirrors the UTF-8 encode below;
# relying on the locale default would corrupt non-ASCII sources on systems
# whose default encoding is not UTF-8.
with open(file_path, "r", encoding="utf-8") as file:
    code = file.read()
# The parser consumes bytes, not str.
tree = cpp_parser.parse(code.encode("utf-8"))
# NOTE(review): leftover alternative kept from the original; it refers to
# `parser` and `source`, which are not defined in this snippet.
# tree = parser.parse(source.encode('utf-8').decode('unicode_escape').encode())
print(type(tree))

遍历tree

1
2
3
4
5
6
7
def print_tree(node, indent=0):
    """Print a syntax-tree node and all of its descendants, one per line.

    Each line shows the node's type followed by the source text the node
    covers. Children are printed two columns deeper than their parent.

    Args:
        node: Node to print; it must expose ``type``, ``text`` and
            ``children``.
        indent: Number of spaces emitted before this node's line.
    """
    raw = node.text
    # The text may be unavailable (None), and a debug dump should not abort
    # halfway on bytes that are not valid UTF-8.
    code = raw.decode('utf-8', errors='replace') if raw is not None else ''
    print(' ' * indent, node.type, code)
    for child in node.children:
        print_tree(child, indent + 2)

# Dump the whole syntax tree, starting at the root node of the parse result
print_tree(tree.root_node)

tree节点属性

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Child nodes: their count and the list of children
root_node.child_count: int
root_node.children: list[Node]| None
# Offsets of this node within the source code (half-open range: start
# inclusive, end exclusive); going by the attribute names these are bytes
root_node.start_byte: int
root_node.end_byte: int
# (row, column) position tuples of this node within the source code
root_node.start_point: tuple[int, int]
root_node.end_point: tuple[int, int]
# The string literal below is kept as written; it notes that the rows,
# columns and offsets above all start at 0.
'''
以上的行、列以及字符串位置都是以0开始
'''
# Whether the node is named, the node type, and the source text it covers.
# A concrete syntax tree keeps every token of the code, so some symbols may
# have no type of their own.
# Author's guess (unverified): is_named can be used to tell such symbol
# nodes apart when building an abstract syntax tree.
root_node.is_named: bool
root_node.type: str # when there is no type, this shows the raw token from the code
root_node.text: bytes

# Parent node
root_node.parent: Node| None

# Previous sibling and previous named sibling
root_node.prev_sibling: Node| None
root_node.prev_named_sibling: Node| None
# Next sibling and next named sibling
root_node.next_sibling: Node| None
root_node.next_named_sibling: Node| None

附:属性和方法

1
2
3
4
5
6
7
<class 'tree_sitter.Tree'>
# print(dir(tree))

['__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'changed_ranges', 'edit', 'included_ranges', 'root_node', 'root_node_with_offset', 'text', 'walk']
<class 'tree_sitter.Node'>
# print(dir(tree.root_node))
['__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'byte_range', 'child', 'child_by_field_id', 'child_by_field_name', 'child_count', 'children', 'children_by_field_id', 'children_by_field_name', 'descendant_count', 'descendant_for_byte_range', 'descendant_for_point_range', 'edit', 'end_byte', 'end_point', 'field_name_for_child', 'grammar_id', 'grammar_name', 'has_changes', 'has_error', 'id', 'is_error', 'is_extra', 'is_missing', 'is_named', 'kind_id', 'named_child', 'named_child_count', 'named_children', 'named_descendant_for_byte_range', 'named_descendant_for_point_range', 'next_named_sibling', 'next_parse_state', 'next_sibling', 'parent', 'parse_state', 'prev_named_sibling', 'prev_sibling', 'range', 'sexp', 'start_byte', 'start_point', 'text', 'type', 'walk']
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
byte_range (0, 283)
child <built-in method child of tree_sitter.Node object at 0x7f9f1c167d70>
child_by_field_id <built-in method child_by_field_id of tree_sitter.Node object at 0x7f9f1c167d70>
child_by_field_name <built-in method child_by_field_name of tree_sitter.Node object at 0x7f9f1c167d70>
child_count 1
children [<Node type=function_definition, start_point=(0, 0), end_point=(12, 2)>]
children_by_field_id <built-in method children_by_field_id of tree_sitter.Node object at 0x7f9f1c167d70>
children_by_field_name <built-in method children_by_field_name of tree_sitter.Node object at 0x7f9f1c167d70>
descendant_count 103
descendant_for_byte_range <built-in method descendant_for_byte_range of tree_sitter.Node object at 0x7f9f1c167d70>
descendant_for_point_range <built-in method descendant_for_point_range of tree_sitter.Node object at 0x7f9f1c167d70>
edit <built-in method edit of tree_sitter.Node object at 0x7f9f1c167d70>
end_byte 283
end_point (12, 2)
field_name_for_child <built-in method field_name_for_child of tree_sitter.Node object at 0x7f9f1c167d70>
grammar_id 214
grammar_name translation_unit
has_changes False
has_error False
id 24860624
is_error False
is_extra False
is_missing False
is_named True
kind_id 214
named_child <built-in method named_child of tree_sitter.Node object at 0x7f9f1c167d70>
named_child_count 1
named_children [<Node type=function_definition, start_point=(0, 0), end_point=(12, 2)>]
named_descendant_for_byte_range <built-in method named_descendant_for_byte_range of tree_sitter.Node object at 0x7f9f1c167d70>
named_descendant_for_point_range <built-in method named_descendant_for_point_range of tree_sitter.Node object at 0x7f9f1c167d70>
next_named_sibling None
next_parse_state 0
next_sibling None
parent None
parse_state 0
prev_named_sibling None
prev_sibling None
range <Range start_point=(0, 0), start_byte=0, end_point=(12, 2), end_byte=283>
sexp <built-in method sexp of tree_sitter.Node object at 0x7f9f1c167d70>
start_byte 0
start_point (0, 0)
text b'bool fun1(struct var1 *var2, bool *var3)\n {\n uint8_t var4 = 0;\n fun2(var2, var5);\n fun3(var2, &var4);\n if (var4 == 0xFF) {\n *var3 = true;\n } else {\n *var3 = false;\n }\n fun4(var2);\n return !var2->var6;\n }'
type translation_unit
walk <built-in method walk of tree_sitter.Node object at 0x7f9f1c167d70>