Skip to content

Commit d1f9335

Browse files
authored
Merge pull request #301 from Interlisp/mth6--Accented-chars--HTMLish--partial-dates
Fix a few bibliography display issues:
1. Accented characters were mis-encoded -- resolves Interlisp/medley#2339
2. HTMLish "tags" were improperly removed -- resolves Interlisp/medley#2340
3. Dates are now displayed as originally set in Zotero -- resolves Interlisp/medley#2342

**Note:** there **_may_** be additional bibliography item metadata fields that _could_ contain issues 1 and 2 above. We could wait until we discover them, or we could also apply these changes now to the fields we _suspect_ are likely to be affected.
2 parents d13b602 + 1af5b50 commit d1f9335

File tree

5 files changed

+70
-39
lines changed

5 files changed

+70
-39
lines changed

layouts/bibliography/list.html

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,7 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
6262
{{- $rendered = $rendered | htmlEscape -}}
6363
<a href="{{ .RelPermalink }}"><span><strong>{{- $rendered | safeHTML -}}</strong></span></a><br>
6464

65-
{{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}}
66-
{{- $authors := .Params.authors -}}
65+
{{- $authors := .Params.authors -}}
6766
{{- $editors := .Params.editors -}}
6867
{{- $isPatent := eq .Params.item_type "patent" -}}
6968
{{- /* warnf "authors: %v" $authors */ -}}
@@ -91,17 +90,19 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
9190
<br>
9291
{{- end -}}
9392

93+
{{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}}
9494
{{- $d := .Date -}}
95+
{{- $fmtDate := .Params.readabledate -}}
9596
{{- with .Params.date -}}
96-
{{- $datestr := trim . " " -}}
97-
{{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}}
98-
{{- $d = time $datestr -}}
97+
{{- $isoDate := trim . " " -}}
98+
{{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}}
99+
{{- $d = time $isoDate -}}
99100
{{- end -}}
100-
{{- end -}}
101+
{{- end }}
101102
{{- $d = $d.Format "2006-01-02" -}}
102103
{{- /* Don't display bogus date */ -}}
103104
{{- if (ne $d "0001-01-01") }}
104-
<time datetime="{{ $d }}">{{ $d }}</time><br>
105+
<time datetime="{{ $d }}">{{ $fmtDate }}</time>
105106
{{ end -}}
106107

107108
{{ with .Params.abstract -}}
@@ -114,7 +115,7 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
114115
{{- $previewWords := split $plain " " | first $previewWordLimit -}}
115116
{{- $preview = delimit $previewWords " " -}}
116117
{{- else -}}
117-
{{- $preview = replace . "\n" "<br>" -}}
118+
{{- $preview = replace (. | htmlEscape) "\n" "<br>" -}}
118119
{{- end -}}
119120
<br>
120121
<div>{{ $preview | safeHTML }}{{ if $previewing }}…{{ end }}</div>

layouts/bibliography/single.html

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,16 +32,17 @@ <h1>{{ .Title }}</h1>
3232

3333
<p>
3434
{{- $d := .Date -}}
35+
{{- $fmtDate := .Params.readabledate -}}
3536
{{- with .Params.date -}}
36-
{{- $datestr := trim . " " -}}
37-
{{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}}
38-
{{- $d = time $datestr -}}
37+
{{- $isoDate := trim . " " -}}
38+
{{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}}
39+
{{- $d = time $isoDate -}}
3940
{{- end -}}
4041
{{- end }}
4142
{{- $d = $d.Format "2006-01-02" -}}
4243
{{- /* Don't display bogus date */ -}}
4344
{{- if (ne $d "0001-01-01") }}
44-
<time datetime="{{ $d }}">{{ $d }}</time>
45+
<time datetime="{{ $d }}">{{ $fmtDate }}</time>
4546
{{ end -}}
4647
</p>
4748
<p>
@@ -56,7 +57,7 @@ <h1>{{ .Title }}</h1>
5657
<strong>Abstract</strong>
5758
<br>
5859
{{ with .Params.abstract }}
59-
{{ . | markdownify }}
60+
{{ . | htmlEscape | markdownify }}
6061
{{ else }}
6162
No abstract available.
6263
{{ end }}

scripts/bib-fns.jq

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ def issued_iso_string:
3030
.
3131
end;
3232

33+
def issued_date_readable:
34+
if nonBlankKey("issued") and (.issued | nonBlankKey("date-parts")) then
35+
setpath(["readableDateString"];
36+
(.issued["date-parts"][0]) as $p | $p | map(pad2) | join("-"))
37+
else
38+
.
39+
end;
40+
3341
def format_person_name:
3442
if (has("family") and .family != null and (.family|tostring|length)>0) then
3543
.family

scripts/bibSplit.pl

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,26 @@
22
use JSON::PP qw(decode_json encode_json);
33
use Encode qw(decode encode is_utf8);
44
use Unicode::Normalize qw(NFC);
5+
use utf8;
6+
BEGIN
7+
{
8+
$bibDir = $ENV{'BIBLIOGRAPHY_DIR'};
9+
$bibItemsDir = $ENV{'BIBITEMS_DIR'};
10+
}
11+
12+
# Handy when using the perl debugger...
13+
# sub is8 {
14+
# my ($s) = @_;
15+
# return is_utf8($s) ? "is UTF8" : "is not UTF8";
16+
# }
517

618
# Cleanup text fields
719
sub sanitize_text {
820
my ($s) = @_;
921
return '' unless defined $s;
1022

1123
# Ensure decoded characters
24+
# NOTE: It SEEMS is_utf8() checking may not correctly indicate the correct state of the content!!
1225
$s = decode('UTF-8', $s, Encode::WARN) unless is_utf8($s);
1326

1427
# Repair mojibake (e.g., "â€™" -> "’") if available
@@ -19,32 +32,28 @@ sub sanitize_text {
1932

2033
# Normalize and clean
2134
$s = NFC($s); # normalize accents/combining marks
22-
$s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//g; # drop C0/C1 controls
23-
$s =~ s/\x{00A0}/ /g; # NBSP -> space
24-
$s =~ s/\r\n?/ /g; # normalize newlines
35+
$s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//gu; # drop C0/C1 controls
36+
$s =~ s/\x{00A0}/ /gu; # NBSP -> space
37+
$s =~ s/\r\n?/ /gu; # normalize newlines
2538

2639
return $s;
2740
}
2841

29-
BEGIN
30-
{
31-
$bibDir = $ENV{'BIBLIOGRAPHY_DIR'};
32-
$bibItemsDir = $ENV{'BIBITEMS_DIR'};
33-
}
3442
my $item = $_;
35-
my $json = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; };
43+
my $obj = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; };
3644

37-
my $key = $json->{key};
38-
my $target = $json->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n";
45+
my $key = $obj->{key};
46+
my $target = $obj->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n";
3947

4048
if ($key eq $target) { # only top level entries
4149
#my $handle = undef;
4250
#my $itemjson = "$bibItemsDir/$key.json";
4351
#open($handle, ">:encoding(UTF-8)", $itemjson) || die "$0: cannot open $itemjson in write-open mode: $!";
44-
#print $handle $item;
52+
#print $handle (is_utf8($item) ? $item : decode('UTF-8', $item, Encode::WARN));
4553
#close $handle || die "$0: close of file $itemjson failed: $!";
4654

47-
my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; };
55+
# No need to decode the $item a second time. Just changed the $json above to be $obj.
56+
#my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; };
4857
delete $obj->{children};
4958

5059
my $type = $obj->{type} // '';
@@ -59,34 +68,42 @@ BEGIN
5968
my $abs = sanitize_text($obj->{abstract} // '');
6069
# a hack for bulleted lists in the abstracts (use markdown there)
6170
# won't work for nested lists.
62-
$abs =~ s/\n?\n\N{U+2022}/\n*/g;
71+
$abs =~ s/\n?\n\N{U+2022}/\n*/gu;
6372
my $indented = join('', map { " $_\n" } split(/\n/, $abs));
6473
my $abstract = $indented eq '' ? "abstract: ''" : "abstract: |\n$indented";
6574

6675
my $itemDate = defined $obj->{isoDateString} ? $obj->{isoDateString} : '';
76+
my $itemReadableDate = defined $obj->{readableDateString} ? $obj->{readableDateString} : '';
6777

6878
my $itemAuthors = '';
6979
if (ref($obj->{authorsFormatted}) eq 'ARRAY' && @{$obj->{authorsFormatted}}) {
7080
$itemAuthors = "\n";
7181
for my $a (@{$obj->{authorsFormatted}}) {
72-
my $quoted = encode_json($a // '');
73-
$itemAuthors .= " - $quoted\n";
82+
# The encode_json is where extended unicode chars get corrupted, e.g., "Emanuelson, Pär"
83+
# There may be other things that now don't work!!
84+
# my $quoted = encode_json($a // '');
85+
# sanitize_text seems to handle them correctly
86+
my $san = sanitize_text($a // '');
87+
$itemAuthors .= " - \"$san\"\n";
7488
}
75-
$itemAuthors =~ s/\n$//; # strip trailing newline
89+
$itemAuthors =~ s/\n$//u; # strip trailing newline
7690
}
7791

7892
my $itemEditors = '';
7993
if (ref($obj->{editorsFormatted}) eq 'ARRAY' && @{$obj->{editorsFormatted}}) {
8094
$itemEditors = "\n";
8195
for my $a (@{$obj->{editorsFormatted}}) {
82-
my $quoted = encode_json($a // '');
83-
$itemEditors .= " - $quoted\n";
96+
# as above...
97+
# my $quoted = encode_json($a // '');
98+
my $san = sanitize_text($a // '');
99+
$itemEditors .= " - \"$san\"\n";
84100
}
85-
$itemEditors =~ s/\n$//; # strip trailing newline
101+
$itemEditors =~ s/\n$//u; # strip trailing newline
86102
}
87103

88104
my $urlSource = defined $obj->{url} ? $obj->{url} : '';
89105

106+
# Some/most/all of these *may* need sanitize_text
90107
# optional fields - ones used vary by value of type
91108
my $applicationNumber = defined $obj->{applicationNumber} ? qq{"$obj->{applicationNumber}"} : '""';
92109
my $assignee = defined $obj->{assignee} ? qq{"$obj->{assignees}"} : '""';
@@ -157,6 +174,7 @@ BEGIN
157174
---
158175
$title
159176
date: $itemDate
177+
readabledate: $itemReadableDate
160178
type: bibliography
161179
item_type: $type
162180
authors: $itemAuthors

scripts/update_bibliography.sh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@ function usage () {
77
echo -e "Usage: $0 [ options ] [ input_items_file ]\n"
88
echo -e "Options:"
99
echo -e " -h | --help \tDisplay this message and exit."
10-
echo -e " -r | --rawitems \tSave the complete downloaded JSON as '00-rawItems.json' (See 'input_items_file' below.)."
10+
echo -e " -r | --rawitems \tSave the complete downloaded JSON as '00-rawItems.json'"
11+
echo -e " \t(See 'input_items_file' below.)."
1112
echo -e " -g | --tagsfile \tGenerate 'tags.json' containing all tags on the 'cleaned-up' set of entries."
1213
echo -e " -y | --typefiles \tGenerate item type information JSON files. (See below.)"
1314
echo -e " -c | --collectionsfiles \tGenerate two JSON files containing info about each of the Zotero collections."
14-
echo -e " -u | --curlfiles \tGenerate files in the 'curl/' directory with the output of each call to curl. Very low level debugging."
15+
echo -e " -u | --curlfiles \tGenerate files in the 'curl/' directory with the output of each call to curl."
16+
echo -e " \t(Very low level debugging.)"
1517
echo -e " -d | --debugfiles \tGenerate numbered files with the intermediate processing step output JASON."
16-
echo -e " -i N | --infolevel N \tSet the level of display of informational messages. N is 0-10 (Default = 2.). (See below.)"
18+
echo -e " -i N | --infolevel N \tSet the level of display of informational messages. N is 0-10 (Default = 2.)."
19+
echo -e " \t(See below.)"
1720
echo -e "\n typefiles: These 3 files contain the type and itemType information for the entries with different level of details."
1821
echo -e "\n infolevel: The infolevel controls how much detail is presented during processing."
1922
echo -e " 0: NO info messages."
@@ -374,11 +377,11 @@ if $debugFiles ; then
374377
fi
375378
finalCount=$(jq '. | length' <<< "$items")
376379

377-
items=$(jq 'include "./bib-fns";map(issued_iso_string)' <<< "$items")
380+
items=$(jq 'include "./bib-fns";map(issued_iso_string | issued_date_readable | add_author_string | add_editor_string)' <<< "$items")
378381

379-
items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items")
382+
#items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items")
380383

381-
items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items")
384+
#items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items")
382385
# if $removeChildrenFromFinalFile; then
383386
# # Remove .children arrays, if any. Save space.
384387
# items=$(jq 'map(del(.children))' <<< "$items")

0 commit comments

Comments
 (0)