22use JSON::PP qw( decode_json encode_json) ;
33use Encode qw( decode encode is_utf8) ;
44use Unicode::Normalize qw( NFC) ;
5+ use utf8;
# Compile-time setup: copy the two directory settings out of %ENV before the
# rest of the file is compiled. An unset environment variable leaves the
# corresponding variable undefined.
# NOTE(review): no `my`/`our` declaration for these two variables is visible in
# this excerpt, so they are package globals; `use strict` would require `our`.
# NOTE(review): the space inside the quoted key names is presumably an artifact
# of how this diff was rendered — confirm against the real file.
6+ BEGIN
7+ {
8+ $bibDir = $ENV {' BIBLIOGRAPHY_DIR' };
9+ $bibItemsDir = $ENV {' BIBITEMS_DIR' };
10+ }
11+
12+ # Handy when using the perl debugger...
13+ # sub is8 {
14+ # my ($s) = @_;
15+ # return is_utf8($s) ? "is UTF8" : "is not UTF8";
16+ # }
517
618# Cleanup text fields
# sanitize_text($s) returns a cleaned copy of $s. An undefined argument returns
# early with the literal on the `return ... unless defined` line.
# Steps visible in this excerpt: decode when the UTF8 flag is off, NFC-normalize,
# delete control characters, replace NBSP, and rewrite carriage-return line ends.
# The hunk header in the middle of the body hides a few lines (the mojibake
# repair mentioned below), so this description is not exhaustive.
719sub sanitize_text {
820 my ($s ) = @_ ;
921 return ' ' unless defined $s ;
1022
1123 # Ensure decoded characters
24+ # NOTE: It SEEMS is_utf8() checking may not correctly indicate the correct state of the content!!
# NOTE(review): is_utf8() reports only Perl's internal UTF8 flag, not whether the
# text has already been decoded. A decoded string whose characters all fit in
# Latin-1 may have the flag off and would then be decoded a second time here.
# The callers shown pass fields of the structure returned by decode_json, which
# are already character strings; consider decoding once at the input boundary
# instead of guessing per string.
1225 $s = decode(' UTF-8' , $s , Encode::WARN) unless is_utf8($s );
1326
1427 # Repair mojibake (e.g., "â" -> "’") if available
@@ -19,32 +32,28 @@ sub sanitize_text {
1932
2033 # Normalize and clean
2134 $s = NFC($s ); # normalize accents/combining marks
# The character class below skips \x09 (tab), \x0A (LF) and \x0D (CR), so those
# three survive; every other C0 control, DEL, and the C1 range are removed.
# The added lines differ from the removed ones only by the /u modifier, which
# requests Unicode matching rules.
22- $s =~ s / [\x00 -\x08\x0B\x0C\x0E -\x1F\x7F\x{80} -\x{9F} ]// g ; # drop C0/C1 controls
23- $s =~ s /\x{00A0} / / g ; # NBSP -> space
24- $s =~ s /\r\n ?/ / g ; # normalize newlines
35+ $s =~ s / [\x00 -\x08\x0B\x0C\x0E -\x1F\x7F\x{80} -\x{9F} ]// gu ; # drop C0/C1 controls
36+ $s =~ s /\x{00A0} / / gu ; # NBSP -> space
37+ $s =~ s /\r\n ?/ / gu ; # normalize newlines
2538
2639 return $s ;
2740}
2841
29- BEGIN
30- {
31- $bibDir = $ENV {' BIBLIOGRAPHY_DIR' };
32- $bibItemsDir = $ENV {' BIBITEMS_DIR' };
33- }
# Per-record prologue: take the current input line from $_, parse it as JSON,
# and pull out the `key` and `target` fields used by the top-level check below.
# NOTE(review): reading $_ and calling `next` at file scope implies an enclosing
# line loop that is not visible in this excerpt (e.g. perl -n) — confirm.
3442my $item = $_ ;
# decode_json dies on malformed input; the eval / `or do` turns that into a
# warning and skips to the next record. The commit renames $json to $obj so the
# same parsed structure can be reused further down.
35- my $json = eval { decode_json($item ) } or do { warn " Bad JSON line: $_ \n " ; next ; };
43+ my $obj = eval { decode_json($item ) } or do { warn " Bad JSON line: $_ \n " ; next ; };
3644
37- my $key = $json -> {key };
38- my $target = $json -> {target } || print STDERR " Cannot find target for key \" $key \" in line: $_ \n " ;
45+ my $key = $obj -> {key };
# NOTE(review): when {target} is missing or false, `||` evaluates the print and
# assigns ITS return value (true, normally 1) to $target, so the later
# `$key eq $target` compares the key against "1". Separating the two would avoid
# that: assign `$obj->{target}` first, then emit the message `unless defined`.
46+ my $target = $obj -> {target } || print STDERR " Cannot find target for key \" $key \" in line: $_ \n " ;
3947
4048if ($key eq $target ) { # only top level entries
4149 # my $handle = undef;
4250 # my $itemjson = "$bibItemsDir/$key.json";
4351 # open($handle, ">:encoding(UTF-8)", $itemjson) || die "$0: cannot open $itemjson in write-open mode: $!";
44- # print $handle $item;
52+ # print $handle (is_utf8( $item) ? $item : decode('UTF-8', $item, Encode::WARN)) ;
4553 # close $handle || die "$0: close of file $itemjson failed: $!";
4654
47- my $obj = eval { decode_json($item ) } or do { warn " Bad JSON for $key \n " ; next ; };
55+ # No need to decode the $item a second time. Just changed the $json above to be $obj.
56+ # my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; };
4857 delete $obj -> {children };
4958
5059 my $type = $obj -> {type } // ' ' ;
@@ -59,34 +68,42 @@ BEGIN
5968 my $abs = sanitize_text($obj -> {abstract } // ' ' );
6069 # a hack for bulleted lists in the abstracts (use markdown there)
6170 # won't work for nested lists.
62- $abs =~ s /\n ?\n\N{U+2022} / \n */ g ;
71+ $abs =~ s /\n ?\n\N{U+2022} / \n */ gu ;
6372 my $indented = join (' ' , map { " $_ \n " } split (/ \n / , $abs ));
6473 my $abstract = $indented eq ' ' ? " abstract: ''" : " abstract: |\n $indented " ;
6574
6675 my $itemDate = defined $obj -> {isoDateString } ? $obj -> {isoDateString } : ' ' ;
76+ my $itemReadableDate = defined $obj -> {readableDateString } ? $obj -> {readableDateString } : ' ' ;
6777
6878 my $itemAuthors = ' ' ;
# Build the value interpolated after `authors:` in the output template: a newline
# followed by one `- "name"` list line per entry of {authorsFormatted}. The
# variable keeps its initial value when the field is absent, not an array
# reference, or empty.
6979 if (ref ($obj -> {authorsFormatted }) eq ' ARRAY' && @{$obj -> {authorsFormatted }}) {
7080 $itemAuthors = " \n " ;
# NOTE(review): a lexical `my $a` hides the package variable $a that sort blocks
# use; a different loop variable name would be safer.
7181 for my $a (@{$obj -> {authorsFormatted }}) {
72- my $quoted = encode_json($a // ' ' );
73- $itemAuthors .= " - $quoted \n " ;
82+ # The encode_json is where extended unicode chars get corrupted, e.g., "Emanuelson, Pär"
83+ # There may be other things that now don't work!!
84+ # my $quoted = encode_json($a // '');
85+ # sanitize_text seems to handle them correctly
# NOTE(review): encode_json returns UTF-8 encoded bytes; if the output handle
# also applies an encoding layer (not visible here), those bytes are encoded a
# second time, which would explain the corruption — confirm.
# NOTE(review): the replacement wraps the name in literal double quotes without
# escaping, so a name containing `"` or `\` now produces a malformed quoted
# value. An encoder object without the utf8 option, e.g.
# JSON::PP->new->allow_nonref->encode($san), keeps the escaping and returns
# characters rather than bytes.
86+ my $san = sanitize_text($a // ' ' );
87+ $itemAuthors .= " - \" $san \"\n " ;
7488 }
75- $itemAuthors =~ s /\n $// ; # strip trailing newline
89+ $itemAuthors =~ s /\n $// u ; # strip trailing newline
7690 }
7791
7892 my $itemEditors = ' ' ;
# Build the editors list text: a newline followed by one `- "name"` line per
# entry of {editorsFormatted}. The variable keeps its initial value when the
# field is absent, not an array reference, or empty.
7993 if (ref ($obj -> {editorsFormatted }) eq ' ARRAY' && @{$obj -> {editorsFormatted }}) {
8094 $itemEditors = " \n " ;
# NOTE(review): a lexical `my $a` hides the package variable $a that sort blocks
# use; a different loop variable name would be safer.
8195 for my $a (@{$obj -> {editorsFormatted }}) {
82- my $quoted = encode_json($a // ' ' );
83- $itemEditors .= " - $quoted \n " ;
96+ # as above...
97+ # my $quoted = encode_json($a // '');
# NOTE(review): the name is wrapped in literal double quotes with no escaping,
# so an editor name containing `"` or `\` yields a malformed quoted value; the
# removed encode_json call did escape those characters (but returned UTF-8
# bytes). JSON::PP->new->allow_nonref->encode($san) would keep the escaping and
# return characters.
98+ my $san = sanitize_text($a // ' ' );
99+ $itemEditors .= " - \" $san \"\n " ;
84100 }
85- $itemEditors =~ s /\n $// ; # strip trailing newline
101+ $itemEditors =~ s /\n $// u ; # strip trailing newline
86102 }
87103
88104 my $urlSource = defined $obj -> {url } ? $obj -> {url } : ' ' ;
89105
106+ # Some/most/all of these *may* need sanitize_text
90107 # optional fields - ones used vary by value of type
91108 my $applicationNumber = defined $obj -> {applicationNumber } ? qq{ "$obj ->{applicationNumber}"} : ' ""' ;
# NOTE(review): the test reads {assignee} but the interpolation reads
# {assignees}. If only the singular key exists, this emits an empty quoted value
# (with an uninitialized-value warning when warnings are enabled). One of the
# two key names is presumably a typo — check the input schema.
# NOTE(review): values are interpolated between double quotes without escaping
# or sanitizing, so an embedded `"` would break the quoted value.
92109 my $assignee = defined $obj -> {assignee } ? qq{ "$obj ->{assignees}"} : ' ""' ;
@@ -157,6 +174,7 @@ BEGIN
157174---
158175$title
159176date: $itemDate
177+ readabledate: $itemReadableDate
160178type: bibliography
161179item_type: $type
162180authors: $itemAuthors
0 commit comments