nani/website/modules/nani/radk.scm

radk.scm

1
;;; Nani Project website
2
;;; Copyright © 2020 Julien Lepiller <julien@lepiller.eu>
3
;;;
4
;;; This file is part of the Nani Project website.
5
;;;
6
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7
;;; under the terms of the GNU Affero General Public License as published by
8
;;; the Free Software Foundation; either version 3 of the License, or (at
9
;;; your option) any later version.
10
;;;
11
;;; The Nani Project website is distributed in the hope that it will be useful, but
12
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
;;; GNU Affero General Public License for more details.
15
;;;
16
;;; You should have received a copy of the GNU Affero General Public License
17
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18
19
(define-module (nani radk)
20
  #:use-module (ice-9 binary-ports)
21
  #:use-module (ice-9 match)
22
  #:use-module (ice-9 peg)
23
  #:use-module (ice-9 rdelim)
24
  #:use-module (nani parse-result)
25
  #:use-module (nani serialize)
26
  #:use-module (rnrs bytevectors)
27
  #:use-module (sxml simple)
28
  #:export (parse-radk
29
            get-kanji-stroke
30
            get-rad-kanji
31
            get-rad-stroke
32
            serialize-radk
33
            kanji-count))
34
35
(define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n"))
36
(define-peg-pattern space none " ")
37
(define-peg-pattern return none "\n")
38
(define-peg-pattern entry all
39
  (and (ignore "$") space char space num (? (and space name)) (ignore "\n")
40
       (+ (or char (ignore "\n")))))
41
(define-peg-pattern num all (+ (or (range #\0 #\9))))
42
(define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z))))
43
(define-peg-pattern char all (and (range #\xff #\x10ffff)))
44
(define-peg-pattern radk-doc body (* (or return comment entry)))
45
46
(define (parse-radk file)
47
  (peg:tree (match-pattern radk-doc (call-with-input-file file read-string))))
48
49
(define (get-rad-kanji content)
50
  (let loop ((result '()) (content content))
51
    (match content
52
      (() result)
53
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
54
       (loop (cons (cons radical kanji) result)
55
             content)))))
56
57
(define (get-rad-stroke content)
58
  (let loop ((result '()) (content content))
59
    (match content
60
      (() result)
61
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
62
       (loop (cons (cons radical (string->number stroke)) result)
63
             content)))))
64
65
(define (get-kanji-stroke file)
66
  (define strokes (xml->sxml (call-with-input-file file read-string)))
67
68
  (match strokes
69
    (('*TOP* _ ('kanjidic2 content ...))
70
     (map
71
       (lambda (entry)
72
         (let* ((literal (car (assoc-ref entry 'literal)))
73
                (misc (assoc-ref entry 'misc))
74
                (misc (filter list? misc))
75
                (stroke (car (assoc-ref misc 'stroke_count))))
76
           (cons literal (string->number stroke))))
77
       (filter
78
         (lambda (entry)
79
           (and
80
             (list? entry)
81
             (equal? (car entry) 'character)))
82
         content)))))
83
84
(define (serialize-radk rad-kanji rad-stroke kanji-stroke)
85
  (define (serialize-rad-kanji-element element pos bv)
86
    (match element
87
      ((radical kanji ...)
88
       (let* ((pos (serialize-string (radical-character radical) pos bv)))
89
         (serialize-string (string-join kanji "") pos bv)))))
90
  (define (rad-kanji-element-size element)
91
    (match element
92
      ((radical kanji ...)
93
       (+ (string-size (radical-character radical))
94
          (string-size (string-join kanji ""))))))
95
96
  (define (serialize-rad-kanji rad-kanji pos bv)
97
    (serialize-list rad-kanji serialize-rad-kanji-element pos bv))
98
  (define (rad-kanji-size rad-kanji)
99
    (list-size rad-kanji rad-kanji-element-size))
100
101
  (define (serialize-rad-stroke-element element pos bv)
102
    (match element
103
      ((radical . stroke)
104
       (let ((pos (serialize-string (radical-character radical) pos bv)))
105
         (serialize-char stroke pos bv)))))
106
  (define (rad-stroke-element-size element)
107
    (match element
108
      ((radical . stroke)
109
       (+ (string-size (radical-character radical)) (char-size stroke)))))
110
111
  (define (serialize-rad-stroke rad-stroke pos bv)
112
    (serialize-list rad-stroke serialize-rad-stroke-element pos bv))
113
  (define (rad-stroke-size rad-stroke)
114
    (list-size rad-stroke rad-stroke-element-size))
115
116
  (define (serialize-kanji-stroke-element element pos bv)
117
    (match element
118
      ((kanji . stroke)
119
       (let ((pos (serialize-string kanji pos bv)))
120
         (serialize-char stroke pos bv)))))
121
  (define (kanji-stroke-element-size element)
122
    (match element
123
      ((kanji . stroke)
124
       (+ (string-size kanji) (char-size stroke)))))
125
126
  (define (serialize-kanji-stroke kanji-stroke pos bv)
127
    (serialize-list kanji-stroke serialize-kanji-stroke-element pos bv))
128
  (define (kanji-stroke-size kanji-stroke)
129
    (list-size kanji-stroke kanji-stroke-element-size))
130
131
  (let* ((header (string->utf8 "NANI_RADK001"))
132
         (header-size (bytevector-length header))
133
         (bv (make-bytevector (+ header-size 12
134
                                 (rad-kanji-size rad-kanji)
135
                                 (rad-stroke-size rad-stroke)
136
                                 (kanji-stroke-size kanji-stroke)))))
137
    (bytevector-copy! header 0 bv 0 header-size)
138
    (let* ((pos header-size)
139
           (pos (serialize-rad-kanji rad-kanji pos bv))
140
           (pos (serialize-rad-stroke rad-stroke pos bv))
141
           (pos (serialize-kanji-stroke kanji-stroke pos bv)))
142
      bv)))
143
144
(define (radical-character kanji)
145
  (match kanji
146
    ("化" "⺅")
147
    ("个" "𠆢")
148
    ("并" "丷")
149
    ("刈" "⺉")
150
    ("乞" "𠂉")
151
    ("込" "⻌")
152
    ("尚" "⺌")
153
    ("忙" "⺖")
154
    ("扎" "⺘")
155
    ("汁" "⺡")
156
    ("犯" "⺨")
157
    ("艾" "⺾")
158
    ("邦" "⻏")
159
    ("阡" "⻖")
160
    ("老" "⺹")
161
    ("杰" "⺣")
162
    ("礼" "⺭")
163
    ("疔" "疒")
164
    ("禹" "禸")
165
    ("初" "⻂")
166
    ("買" "⺲")
167
    ("滴" "啇")
168
    (_ kanji)))
169
170
(define (parse-result file)
171
  (define (parse-result-rad-kanji-element port)
172
    (let ((radical (parse-result-string port))
173
          (kanji-list (parse-result-string port)))
174
       (cons radical (string->list kanji-list))))
175
  (define (parse-result-rad-kanji port)
176
    (parse-result-list port parse-result-rad-kanji-element))
177
178
  (define (parse-result-rad-stroke-element port)
179
    (let ((radical (parse-result-string port))
180
          (stroke (parse-result-char port)))
181
       (cons radical stroke)))
182
  (define (parse-result-rad-stroke port)
183
    (parse-result-list port parse-result-rad-stroke-element))
184
185
  (define (parse-result-kanji-stroke-element port)
186
    (let ((kanji (parse-result-string port))
187
          (stroke (parse-result-char port)))
188
      (cons kanji stroke)))
189
  (define (parse-result-kanji-stroke port)
190
    (parse-result-list port parse-result-kanji-stroke-element))
191
192
  (call-with-input-file file
193
    (lambda (port)
194
      (let* ((header (utf8->string (get-bytevector-n port 12)))
195
             (rad-kanji (parse-result-rad-kanji port))
196
             (rad-stroke (parse-result-rad-stroke port))
197
             (kanji-stroke (parse-result-kanji-stroke port)))
198
        (list rad-kanji rad-stroke kanji-stroke)))))
199
200
(define (get-kanji-list content)
201
  (let loop ((result '()) (content content))
202
    (match content
203
      (() result)
204
      (((_ kanji ...) content ...)
205
       (loop (append result (filter (lambda (k) (not (member k result))) kanji))
206
             content)))))
207
208
(define (kanji-count file)
209
  (match (parse-result file)
210
    ((rad-kanji _ _)
211
     (length (get-kanji-list rad-kanji)))))
212