11#include "tree_sitter/alloc.h"
2+ #include "tree_sitter/array.h"
23#include "tree_sitter/parser.h"
34#include <ctype.h>
45#include <wctype.h>
@@ -13,10 +14,14 @@ enum TokenType {
1314 END_OF_STATEMENT ,
1415 PREPROC_UNARY_OPERATOR ,
1516 HOLLERITH_CONSTANT ,
17+ MACRO_IDENTIFIER ,
1618};
1719
// Growable list of C strings (tree-sitter Array of char pointers).
20+ typedef Array (char * ) StringArray ;
21+
// State carried by the external scanner between scan calls.
//  - in_line_continuation: flag that is saved and restored by the
//    serialize/deserialize hooks (presumably true while a line continuation
//    is open; the code that sets it is outside this view, so confirm there).
//  - MacroIdentifiers: identifiers that should be lexed as MACRO_IDENTIFIER;
//    filled once when the scanner is created and freed when it is destroyed.
1822typedef struct {
1923 bool in_line_continuation ;
24+ StringArray MacroIdentifiers ;
2025} Scanner ;
2126
2227typedef enum {
@@ -301,31 +306,46 @@ static bool scan_end_line_continuation(Scanner *scanner, TSLexer *lexer) {
301306 return true;
302307}
303308
304- static bool scan_string_literal_kind (TSLexer * lexer ) {
305- // Strictly, it's allowed for the kind to be an integer literal, in
306- // practice I've not seen it
309+ typedef Array (char ) String ;
310+
311+ // Returns NULL on error, otherwise an allocated char array for an identifier
312+ static String * scan_identifier (TSLexer * lexer ) {
307313 if (!iswalpha (lexer -> lookahead )) {
308- return false ;
314+ return NULL ;
309315 }
310-
311- lexer -> result_symbol = STRING_LITERAL_KIND ;
312-
313- // We need two characters of lookahead to see `_"`
314- char current_char = '\0' ;
315-
316+ String * possible_identifier = ts_calloc (1 , sizeof (String ));
316317 while (is_identifier_char (lexer -> lookahead ) && !lexer -> eof (lexer )) {
317- current_char = lexer -> lookahead ;
318- // Don't capture the trailing underscore as part of the kind identifier
319- if (lexer -> lookahead == '_' ) {
320- lexer -> mark_end (lexer );
321- }
322- advance (lexer );
318+ array_push (possible_identifier , lexer -> lookahead );
319+ // Don't capture the trailing underscore as part of the kind identifier
320+ // If another user of this function wants to mark the end again after
321+ // the identifier they're free to do so
322+ if (lexer -> lookahead == '_' ) {
323+ lexer -> mark_end (lexer );
324+ }
325+ advance (lexer );
326+ }
327+ if (possible_identifier -> size == 0 ) {
328+ // First deallocate the array content itself and then the heap-allocated
329+ // array struct
330+ array_delete (possible_identifier );
331+ ts_free (possible_identifier );
332+ return NULL ;
333+ }
334+ return possible_identifier ;
335+ }
336+
337+ static bool scan_string_literal_kind (TSLexer * lexer , String * identifier ) {
338+ if (identifier -> size == 0 ) {
339+ return false;
323340 }
324341
325- if ((current_char != '_' ) || (lexer -> lookahead != '"' && lexer -> lookahead != '\'' )) {
342+ char last_char = identifier -> contents [identifier -> size - 1 ];
343+ if ((last_char != '_' ) ||
344+ (lexer -> lookahead != '"' && lexer -> lookahead != '\'' )) {
326345 return false;
327346 }
328347
348+ lexer -> result_symbol = STRING_LITERAL_KIND ;
329349 return true;
330350}
331351
@@ -393,6 +413,33 @@ static bool scan_string_literal(TSLexer *lexer) {
393413 return false;
394414}
395415
416+ // Scans, using the MacroIdentifiers list from the scanner state, an identifier
417+ // that is contained in that list
418+ static bool scan_macro_identifier (Scanner * scanner , TSLexer * lexer ,
419+ String * identifier ) {
420+ unsigned num_macro_ids = scanner -> MacroIdentifiers .size ;
421+ // Nothing to compare against
422+ if (num_macro_ids == 0 ) {
423+ return false;
424+ }
425+
426+ // Find an equal macro identifier
427+ for (size_t i = 0 , end = scanner -> MacroIdentifiers .size ; i < end ; ++ i ) {
428+ char * macro_id = * array_get (& scanner -> MacroIdentifiers , i );
429+ unsigned macro_id_len = strlen (macro_id );
430+ // This will never be equal
431+ if (identifier -> size != macro_id_len ) {
432+ continue ;
433+ }
434+ if (strncmp (macro_id , identifier -> contents , identifier -> size ) == 0 ) {
435+ lexer -> mark_end (lexer );
436+ lexer -> result_symbol = MACRO_IDENTIFIER ;
437+ return true;
438+ }
439+ }
440+ return false;
441+ }
442+
396443/// Need an external scanner to catch '!' before its parsed as a comment
397444static bool scan_preproc_unary_operator (TSLexer * lexer ) {
398445 const char next_char = lexer -> lookahead ;
@@ -467,19 +514,57 @@ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
467514 return true;
468515 }
469516
470- if (valid_symbols [STRING_LITERAL_KIND ]) {
517+ // These symbols both scan for an identifier, we need to combine the logic
518+ // and they always need to be the last to look for since we can't backtrack
519+ if (valid_symbols [STRING_LITERAL_KIND ] || valid_symbols [MACRO_IDENTIFIER ]) {
520+ String * identifier = scan_identifier (lexer );
521+ bool identifier_result = false;
471522 // This may need a lot of lookahead, so should (probably) always
472523 // be the last token to look for
473- if (scan_string_literal_kind (lexer )) {
524+ if (identifier && valid_symbols [STRING_LITERAL_KIND ]) {
525+ if (scan_string_literal_kind (lexer , identifier )) {
526+ identifier_result = true;
527+ }
528+ }
529+ if (!identifier_result && identifier && valid_symbols [MACRO_IDENTIFIER ]) {
530+ if (scan_macro_identifier (scanner , lexer , identifier )) {
531+ identifier_result = true;
532+ }
533+ }
534+ if (identifier ) {
535+ // First deallocate the array content itself and then the heap-allocated
536+ // array struct
537+ array_delete (identifier );
538+ ts_free (identifier );
539+ }
540+ if (identifier_result ) {
474541 return true;
475542 }
476543 }
477-
478544 return false;
479545}
480546
481547void * tree_sitter_fortran_external_scanner_create () {
482- return ts_calloc (1 , sizeof (bool ));
548+ Scanner * result = (Scanner * )ts_calloc (1 , sizeof (Scanner ));
549+ // First get the colon separated list of macro IDs from the environment
550+ char * macro_ids = getenv ("CODEE_TS_MACRO_IDS" );
551+ if (!macro_ids ) {
552+ return result ;
553+ }
554+ // Now separate them while we copy them to a list in the scanner state
555+ StringArray * macroIdsResult = & result -> MacroIdentifiers ;
556+ char * macro_id = strtok (macro_ids , ":" );
557+ while (macro_id ) {
558+ // strlen is safe with strtok's result
559+ int length = strlen (macro_id );
560+ // length + 1 for the null termination
561+ char * new_str = (char * )ts_calloc (1 , (length + 1 ) * sizeof (char ));
562+ strncpy (new_str , macro_id , length );
563+ array_push (macroIdsResult , new_str );
564+ // Keep splitting
565+ macro_id = strtok (NULL , ":" );
566+ }
567+ return result ;
483568}
484569
485570bool tree_sitter_fortran_external_scanner_scan (void * payload , TSLexer * lexer ,
@@ -491,20 +576,28 @@ bool tree_sitter_fortran_external_scanner_scan(void *payload, TSLexer *lexer,
491576unsigned tree_sitter_fortran_external_scanner_serialize (void * payload ,
492577 char * buffer ) {
493578 Scanner * scanner = (Scanner * )payload ;
494- buffer [0 ] = (char )scanner -> in_line_continuation ;
495- return 1 ;
579+ unsigned size = sizeof (* scanner );
580+ memcpy (buffer , scanner , size );
581+ return size ;
496582}
497583
498584void tree_sitter_fortran_external_scanner_deserialize (void * payload ,
499585 const char * buffer ,
500586 unsigned length ) {
501587 Scanner * scanner = (Scanner * )payload ;
502588 if (length > 0 ) {
503- scanner -> in_line_continuation = buffer [0 ];
589+ unsigned size = sizeof (* scanner );
590+ memcpy (scanner , buffer , size );
504591 }
505592}
506593
507594void tree_sitter_fortran_external_scanner_destroy (void * payload ) {
508595 Scanner * scanner = (Scanner * )payload ;
596+ // Destroy the strings allocated in each array element
597+ for (size_t i = 0 , end = scanner -> MacroIdentifiers .size ; i < end ; ++ i ) {
598+ char * str = * array_get (& scanner -> MacroIdentifiers , i );
599+ ts_free (str );
600+ }
601+ array_delete (& scanner -> MacroIdentifiers );
509602 ts_free (scanner );
510603}
0 commit comments