C grammar for the R treesitter package.
# Install the development version of treesitter.c from r-universe
install.packages('treesitter.c', repos = c('https://sounkou-bioinfo.r-universe.dev', 'https://cloud.r-project.org'))
# or the CRAN release
install.packages('treesitter.c', repos = 'https://cloud.r-project.org')
library(treesitter)
#>
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#>
#> range
library(treesitter.c)
c_language <- language()
parser <- parser(c_language)
code <- "
struct Point {
int x[MAX_SIZE];
int y;
};
"
tree <- parser_parse(parser, code)
tree
#> <tree_sitter_tree>
#>
#> ── Text ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> struct Point {
#> int x[MAX_SIZE];
#> int y;
#> };
#>
#>
#> ── S-Expression ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
#> (translation_unit [(1, 0), (5, 0)]
#> (struct_specifier [(1, 0), (4, 1)]
#> "struct" [(1, 0), (1, 6)]
#> name: (type_identifier [(1, 7), (1, 12)])
#> body: (field_declaration_list [(1, 13), (4, 1)]
#> "{" [(1, 13), (1, 14)]
#> (field_declaration [(2, 2), (2, 18)]
#> type: (primitive_type [(2, 2), (2, 5)])
#> declarator: (array_declarator [(2, 6), (2, 17)]
#> declarator: (field_identifier [(2, 6), (2, 7)])
#> "[" [(2, 7), (2, 8)]
#> size: (identifier [(2, 8), (2, 16)])
#> "]" [(2, 16), (2, 17)]
#> )
#> ";" [(2, 17), (2, 18)]
#> )
#> (field_declaration [(3, 2), (3, 8)]
#> type: (primitive_type [(3, 2), (3, 5)])
#> declarator: (field_identifier [(3, 6), (3, 7)])
#> ";" [(3, 7), (3, 8)]
#> )
#> "}" [(4, 0), (4, 1)]
#> )
#> )
#> ";" [(4, 1), (4, 2)]
#> <truncated>
If you have a C compiler available and want to expand macros before parsing
(recommended for headers that rely on macros), set
preprocess = TRUE. Use the helper
r_cc() to detect the compiler automatically:
# Check for a compiler and use include_dirs so the preprocessor can find nested headers
cc <- treesitter.c::r_cc()
hdr_df_pp <- parse_r_include_headers(
dir = R.home("include"),
preprocess = TRUE,
include_dirs = R.home("include")
)
hdr_df_pp[grepl("Rf", x = hdr_df_pp$name), ] |> head(10)
#> name file line kind
#> 1633 Rf_error /usr/share/R/include/R_ext/Callbacks.h 2708 declaration
#> 1636 Rf_warning /usr/share/R/include/R_ext/Callbacks.h 2714 declaration
#> 1645 Rf_revsort /usr/share/R/include/R_ext/Callbacks.h 2753 declaration
#> 1646 Rf_iPsort /usr/share/R/include/R_ext/Callbacks.h 2754 declaration
#> 1647 Rf_rPsort /usr/share/R/include/R_ext/Callbacks.h 2755 declaration
#> 1648 Rf_cPsort /usr/share/R/include/R_ext/Callbacks.h 2756 declaration
#> 1653 Rf_StringFalse /usr/share/R/include/R_ext/Callbacks.h 2772 declaration
#> 1654 Rf_StringTrue /usr/share/R/include/R_ext/Callbacks.h 2773 declaration
#> 1655 Rf_isBlankString /usr/share/R/include/R_ext/Callbacks.h 2774 declaration
#> 1707 Rf_asChar /usr/share/R/include/R_ext/Callbacks.h 3108 declaration
You can use the preprocess_header function with extra
compiler options to avoid system includes and use the bundled fake libc
headers instead. This keeps system headers from bloating the preprocessed output.
# Path to a header file to preprocess
header_file <- file.path(R.home("include"), "Rinternals.h")
# Get the path to the fake libc headers
fake_libc <- fake_libc_path()
# Preprocess with -nostdinc and -I pointing to fake_libc
preprocessed <- preprocess_header(
file = header_file,
cc = r_cc(),
ccflags = paste0("-I", fake_libc),
"-nostdinc"
)
cat(substr(preprocessed, 1, 500))
#> # 0 "/usr/share/R/include/Rinternals.h"
#> # 0 "<built-in>"
#> # 0 "<command-line>"
#> # 1 "/usr/share/R/include/Rinternals.h"
#> # 38 "/usr/share/R/include/Rinternals.h"
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/stdio.h" 1
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/_fake_defines.h" 1
#> # 2 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/stdio.h" 2
#> # 1 "/usr/local/lib/R/site-library/treesitter.c/fake_libc/_fake_typedefs.h" 1
#>
#>
#>
#> typedef int size_t;
#> typedef int __builtin_va_
This approach ensures only the fake libc headers are used, making preprocessing more predictable and portable.
The following concise examples demonstrate extracting specific information (functions, parameters, structs, macros) using the package’s simple helpers.
Simple parse and extract functions: parse a small header string and extract functions with parameter types.
txt <- "int foo(int a, const char* s);
static inline int bar(void) { return 1; }"
# extract params and return type
root <- parse_header_text(txt)
get_function_nodes(root, extract_params = TRUE, extract_return = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... int
#> 2 def_name bar 2 19 void int
get_function_nodes(root, extract_params = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... <NA>
#> 2 def_name bar 2 19 void <NA>
Extract function parameter and return types while parsing:
txt <- "int foo(int a, const char* s);"
root <- parse_header_text(txt)
get_function_nodes(root, extract_params = TRUE, extract_return = TRUE)
#> capture_name text start_line start_col params return_type
#> 1 decl_name foo 1 5 int, con.... int
Get structs and members:
txt <- "struct T { unsigned int x:1; int y; };"
root <- parse_header_text(txt)
get_struct_nodes(root)
#> capture_name text start_line
#> 1 struct_name T 1
get_struct_members(root)
#> struct_name member_name member_type bitfield nested_members
#> 1 T x unsigned int int 1 <NA>
#> 2 T y int <NA> <NA>
Collect declarations of all kinds from a directory using
parse_headers_collect:
res <- parse_headers_collect(dir = R.home("include"), preprocess = FALSE, extract_params = TRUE)
names(res)
#> [1] "functions" "structs" "struct_members" "enums"
#> [5] "unions" "globals" "defines"
head(res$functions)
#> file capture_name start_line start_col
#> 1 /usr/share/R/include/R_ext/Altrep.h decl_name 47 1
#> 2 /usr/share/R/include/R_ext/Altrep.h decl_name 50 1
#> 3 /usr/share/R/include/R_ext/Altrep.h decl_name 52 1
#> 4 /usr/share/R/include/R_ext/Altrep.h decl_name 54 1
#> 5 /usr/share/R/include/R_ext/Altrep.h decl_name 56 1
#> 6 /usr/share/R/include/R_ext/Altrep.h decl_name 58 1
#> params return_type name
#> 1 R_altrep.... <NA> R_new_altrep
#> 2 const ch.... <NA> R_make_altstring_class
#> 3 const ch.... <NA> R_make_altinteger_class
#> 4 const ch.... <NA> R_make_altreal_class
#> 5 const ch.... <NA> R_make_altlogical_class
#> 6 const ch.... <NA> R_make_altraw_class
# Optional: inspect macros from a single header
path <- file.path(R.home("include"), "Rembedded.h")
defs <- get_defines_from_file(path, use_cpp = TRUE, ccflags = paste("-I", dirname(path)))
head(defs)
#> [1] "__DBL_MIN_EXP__" "__LDBL_MANT_DIG__"
#> [3] "__UINT_LEAST16_MAX__" "__FLT16_HAS_QUIET_NAN__"
#> [5] "__ATOMIC_ACQUIRE" "__WCHAR_MAX__"
tree-sitter ABI version 14, compatible with treesitter package version
0.3.0. The C grammar source used for bootstrapping was downloaded from
https://github.com/tree-sitter/tree-sitter-c. The
pre-generated parser.c from upstream is ~3.7 MB and
contains pragma directives that trigger CRAN check warnings.
During bootstrap (bootstrap.R), all #pragma
directives are automatically removed from parser.c to
ensure CRAN compliance. This includes pragmas for diagnostic control and
optimization settings that are not portable across compilers.
GPL-3