Add kanjidic

Julien LepillerWed Jul 21 13:59:22+0200 2021

163e9a1

Add kanjidic

Makefile

99
DICOS=
1010
DOWNLOADS=
1111
12+
include jibiki.mk
13+
include jmdict.mk
14+
include kanjidic.mk
1215
include radicals.mk
1316
include wadoku.mk
14-
include jmdict.mk
15-
include jibiki.mk
1617
1718
# Files that constitute the website
1819
PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm

7374
	sha256sum $< | cut -f1 -d' ' > $@
7475
7576
dicos/list: $(DICOS) tools/list.scm $(MOFILES)
77+
	rm -f $@
7678
	guile -L modules -L . tools/list.scm $@ $(DICOS)

kanjidic.mk unknown status 1

1+
KANJIDIC_LANGS=en es fr pt
2+
KANJIDIC_MODULES=tools/kanjidic.scm
3+
DICOS+=$(addprefix dicos/kanjidic_, $(addsuffix .nani, $(KANJIDIC_LANGS)))
4+
DOWNLOADS+=dictionaries/kanjidic2.xml
5+
6+
dictionaries/kanjidic2.xml:
7+
	wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz
8+
	gunzip $@.gz
9+
10+
dicos/kanjidic_%.nani: dictionaries/kanjidic2.xml tools/kanjidic.scm $(RADK_MODULES)
11+
	guile -L modules tools/kanjidic.scm build $< $(shell basename $@ .nani | sed 's|^kanjidic_||g') $@

modules/nani/kanji/kanjidic.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani kanji kanjidic)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (ice-9 match)
22+
  #:use-module (ice-9 rdelim)
23+
  #:use-module (nani encoding huffman)
24+
  #:use-module (nani encoding parse)
25+
  #:use-module (nani encoding serialize)
26+
  #:use-module (nani encoding trie)
27+
  #:use-module (rnrs bytevectors)
28+
  #:use-module (srfi srfi-1)
29+
  #:use-module (srfi srfi-9)
30+
  #:use-module (sxml simple)
31+
  #:export (make-kanji
32+
            kanji?
33+
            kanji-position
34+
            kanji-kanji
35+
            kanji-strokes
36+
            kanji-senses
37+
            kanji-kun
38+
            kanji-on
39+
            kanji-nanori
40+
41+
            get-kanji-info
42+
            serialize-kanji kanji-size
43+
            serialize-kanjidic
44+
            kanjidic-entry-count))
45+
46+
(define-record-type <kanji>
47+
  (make-kanji position kanji strokes senses kun on nanori)
48+
  kanji?
49+
  (position kanji-position kanji-position-set!)
50+
  (kanji    kanji-kanji)
51+
  (strokes  kanji-strokes)
52+
  (senses   kanji-senses)
53+
  (kun      kanji-kun)
54+
  (on       kanji-on)
55+
  (nanori   kanji-nanori))
56+
57+
(define (sxml->on sxml)
58+
  (match sxml
59+
    (('reading ('@ ('r_type "ja_on")) reading) reading)
60+
    (_ #f)))
61+
62+
(define (sxml->kun sxml)
63+
  (match sxml
64+
    (('reading ('@ ('r_type "ja_kun")) reading) reading)
65+
    (_ #f)))
66+
67+
(define (sxml->nanori sxml)
68+
  (match sxml
69+
    (('nanori nanori) nanori)
70+
    (_ #f)))
71+
72+
(define (sxml->sense lang)
73+
  (lambda (sxml)
74+
    (match sxml
75+
      (('meaning meaning) (if (equal? lang "en") meaning #f))
76+
      (('meaning ('@ ('m_lang m_lang)) meaning)
77+
       (if (equal? lang m_lang) meaning #f))
78+
      (_ #f))))
79+
80+
(define (sxml->kanji lang)
81+
  (lambda (entry)
82+
    (let* ((literal (car (assoc-ref entry 'literal)))
83+
           (misc (assoc-ref entry 'misc))
84+
           (misc (filter list? misc))
85+
           (strokes (string->number (car (assoc-ref misc 'stroke_count))))
86+
           (rm (assoc-ref entry 'reading_meaning)))
87+
      (if rm
88+
          (let* ((rm (filter list? rm))
89+
                 (rmgroup (assoc-ref rm 'rmgroup))
90+
                 (rmgroup (filter list? rmgroup))
91+
                 (on (filter-map sxml->on rmgroup))
92+
                 (kun (filter-map sxml->kun rmgroup))
93+
                 (senses (filter-map (sxml->sense lang) rmgroup))
94+
                 (nanori (filter-map sxml->nanori rm)))
95+
            (make-kanji 0 literal strokes senses kun on nanori))
96+
          #f))))
97+
98+
(define (get-kanji-info file lang)
99+
  (let* ((dic (xml->sxml (call-with-input-file file read-string)))
100+
         (kanjis (match dic (('*TOP* _ ('kanjidic2 content ...)) content))))
101+
    (filter-map (sxml->kanji lang)
102+
                (filter
103+
                  (match-lambda
104+
                    (('character _ ...) #t)
105+
                    (_ #f))
106+
                  kanjis))))
107+
108+
(define (serialize-kanji sense-huffman reading-huffman)
109+
  (lambda (kanji pos bv)
110+
    (kanji-position-set! kanji pos)
111+
    (let* ((pos (serialize-char (kanji-strokes kanji) pos bv))
112+
           (pos ((serialize-list (serialize-huffman-string sense-huffman))
113+
                 (kanji-senses kanji) pos bv))
114+
           (pos ((serialize-list (serialize-huffman-string reading-huffman))
115+
                 (kanji-kun kanji) pos bv))
116+
           (pos ((serialize-list (serialize-huffman-string reading-huffman))
117+
                 (kanji-on kanji) pos bv))
118+
           (pos ((serialize-list (serialize-huffman-string reading-huffman))
119+
                 (kanji-nanori kanji) pos bv)))
120+
      pos)))
121+
(define (kanji-size sense-huffman reading-huffman)
122+
  (lambda (kanji)
123+
    (+ (char-size (kanji-strokes kanji))
124+
       ((list-size (huffman-string-size sense-huffman))
125+
        (kanji-senses kanji))
126+
       ((list-size (huffman-string-size reading-huffman))
127+
        (kanji-kun kanji))
128+
       ((list-size (huffman-string-size reading-huffman))
129+
        (kanji-on kanji))
130+
       ((list-size (huffman-string-size reading-huffman))
131+
        (kanji-nanori kanji)))))
132+
133+
(define (update-trie-pos! trie kanji)
134+
  (let* ((vals (trie-vals trie))
135+
         (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals)))
136+
    (trie-vals-set! trie vals))
137+
  (for-each
138+
    (match-lambda
139+
      ((char . child)
140+
       (update-trie-pos! child kanji)))
141+
    (trie-transitions trie)))
142+
143+
(define (make-key key)
144+
  (apply append
145+
    (map (lambda (c)
146+
           (list (quotient c 16) (modulo c 16)))
147+
         (bytevector->u8-list (string->utf8 key)))))
148+
149+
(define (make-kanji-trie kanji)
150+
  (let ((trie (make-empty-trie)))
151+
    (let loop ((kanji kanji) (i 0))
152+
      (if (null? kanji)
153+
          (compress-trie trie)
154+
          (begin
155+
            (add-to-trie! trie (make-key (kanji-kanji (car kanji))) i)
156+
            (loop (cdr kanji) (+ i 1)))))))
157+
158+
(define (serialize-kanjidic kanji)
159+
  (define sense-huffman
160+
    (let* ((senses (apply append (map kanji-senses kanji))))
161+
      (create-huffman senses)))
162+
  (define sense-huffman-code (huffman->code sense-huffman))
163+
  (define reading-huffman
164+
    (let* ((kun (apply append (map kanji-kun kanji)))
165+
           (on (apply append (map kanji-on kanji)))
166+
           (nanori (apply append (map kanji-nanori kanji))))
167+
      (create-huffman (append kun on nanori))))
168+
  (define reading-huffman-code (huffman->code reading-huffman))
169+
170+
  (let* ((header (string->utf8 "NANI_KANJIDIC001"))
171+
         (header-size (bytevector-length header))
172+
         (sense-huffman-bv (serialize-huffman sense-huffman))
173+
         (sense-huffman-size (bytevector-length sense-huffman-bv))
174+
         (reading-huffman-bv (serialize-huffman reading-huffman))
175+
         (reading-huffman-size (bytevector-length reading-huffman-bv))
176+
         (serialize-trie (serialize-trie serialize-int int-size))
177+
         (trie-size (trie-size int-size))
178+
         (kanji-trie (make-kanji-trie kanji))
179+
         (kanji-trie-size (trie-size kanji-trie))
180+
         (results-size
181+
           ((list-size (kanji-size sense-huffman-code reading-huffman-code)
182+
                       #:size? #f)
183+
            kanji))
184+
         (huffman-size (+ sense-huffman-size reading-huffman-size))
185+
         (bv (make-bytevector (+ header-size 4 huffman-size kanji-trie-size
186+
                                 results-size))))
187+
    (format #t "Number of kanji: ~a~%" (length kanji))
188+
    ((serialize-list (serialize-kanji sense-huffman-code reading-huffman-code)
189+
                     #:size? #f)
190+
     kanji (+ header-size 4 huffman-size kanji-trie-size) bv)
191+
    (let ((kanji (list->array 1 kanji)))
192+
      (update-trie-pos! kanji-trie kanji))
193+
    (bytevector-copy! header 0 bv 0 header-size)
194+
    (serialize-int (length kanji) (+ header-size) bv)
195+
    (bytevector-copy! sense-huffman-bv 0 bv (+ header-size 4)
196+
                      sense-huffman-size)
197+
    (bytevector-copy! reading-huffman-bv 0 bv (+ header-size 4 sense-huffman-size)
198+
                      reading-huffman-size)
199+
    (serialize-trie kanji-trie (+ header-size 4 huffman-size) bv)
200+
    bv))
201+
202+
(define (kanjidic-entry-count file)
203+
  (call-with-input-file file
204+
    (lambda (port)
205+
      (let* ((header (get-bytevector-n port 16))
206+
             (size (get-bytevector-n port 4)))
207+
        (bytevector-u32-ref size 0 (endianness big))))))

radicals.mk

1-
RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm
1+
RADK_MODULES=tools/radk.scm
22
DICOS+=dicos/radicals.nani
3-
DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml
3+
DOWNLOADS+=dictionaries/radkfilex.utf8
44
55
dictionaries/radkfilex.utf8:
66
	#wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip

99
	iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@
1010
	rm dictionaries/radkfilex
1111
12-
dictionaries/kanjidic2.xml:
13-
	wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz
14-
	gunzip $@.gz
15-
1612
dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES)
1713
	guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@

tools/kanjidic.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(use-modules (nani kanji kanjidic))
20+
(use-modules (ice-9 match))
21+
(use-modules (ice-9 binary-ports))
22+
23+
(match (command-line)
24+
  ((_ cmd kanjidic-file lang output)
25+
   (cond
26+
    ((equal? cmd "build")
27+
     (let* ((kanji (get-kanji-info kanjidic-file lang)))
28+
       (call-with-output-file output
29+
         (lambda (port)
30+
           (put-bytevector port
31+
             (serialize-kanjidic kanji))))))
32+
    (else (format #t "Unknown cmd ~a.~%" cmd)))))

tools/list.scm

1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
1919
(use-modules (tools i18n))
20+
(use-modules (nani kanji kanjidic))
2021
(use-modules (nani kanji radk))
2122
(use-modules (nani result result))
2223
(use-modules (nani pitch pitch))

7475
        as you can't search for anything.  This dictionary can be searched for by
7576
        kanji, reading (kana) and by meaning in the languages you selected."))
7677
78+
  (define (kanjidic-synopsis lang)
79+
    (match lang
80+
      ("en" `(_ "Kanji dictionary with English meanings."))
81+
      ("es" `(_ "Kanji dictionary with Spanish meanings."))
82+
      ("fr" `(_ "Kanji dictionary with French meanings."))
83+
      ("pt" `(_ "Kanji dictionary with Portuguese meanings."))))
84+
  (define (kanjidic-description lang)
85+
    `(_ "This dicitonary allows you to search for kanji and view kanji information
86+
such as number of strokes, pronunciations and meanings."))
87+
7788
  (let* ((english
7889
          (cond
7990
            ((equal? (dico-type dico) "radk")
8091
             (if long?
8192
                 radk-description
8293
                 radk-synopsis))
94+
            ((equal? (dico-type dico) "kanjidic")
95+
             (let ((dico-lang (substring dico 9)))
96+
               (if long?
97+
                   (kanjidic-description dico-lang)
98+
                   (kanjidic-synopsis dico-lang))))
8399
            ((equal? (dico-type dico) "wadoku")
84100
             (if long?
85101
                 wadoku-description

117133
(define (dico-type file)
118134
  (cond
119135
    ((equal? file "radicals") "radk")
136+
    ((equal? file "kanjidic") "kanjidic")
120137
    ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict"))
121138
     "jmdict")
122139
    ((equal? file "jibiki_fre") "jibiki")

127144
  (cond
128145
    ((equal? (dico-type (dico-name file)) "radk")
129146
     (kanji-count file))
147+
    ((equal? (dico-type (dico-name file)) "kanjidic")
148+
     (kanjidic-entry-count file))
130149
    ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki"))
131150
     (dictionary-entry-count file))
132151
    ((equal? (dico-type (dico-name file)) "wadoku_pitch")

135154
(define (dico-name file)
136155
  (basename file ".nani"))
137156
157+
(define (dico-lang name)
158+
  (cond
159+
    ((equal? name "radicals") "")
160+
    ((equal? name "kanjidic") "")
161+
    ((equal? name "wadoku_pitch") "")
162+
    ((and (> (string-length name) 6) (equal? (substring name 0 6) "JMdict"))
163+
     (let ((lang (substring name 7)))
164+
       (match lang
165+
         ("e" "en")
166+
         ("dut" "nl")
167+
         ("fre" "fr")
168+
         ("ger" "de")
169+
         ("hun" "hu")
170+
         ("rus" "ru")
171+
         ("slv" "sl")
172+
         ("spa" "es")
173+
         ("swe" "sv"))))
174+
    ((equal? name "jibiki_fre") "fr")
175+
    ((equal? name "wadoku_ger") "de")))
176+
138177
(match (command-line)
139178
  ((_ output dicos ...)
140179
   (with-output-to-file output

144183
           (let* ((sha256 (sha256 dico))
145184
                  (size (filesize dico))
146185
                  (name (dico-name dico))
186+
                  (lang (dico-lang name))
147187
                  (type (dico-type name))
148188
                  (entry-count (entries dico)))
149189
             (format #t "[~a]~%" name)

156196
                   (when description
157197
                     (format #t "description=~a=~a~%" lang description))))
158198
               (filter (lambda (lang) (not (equal? lang ""))) languages))
199+
             (format #t "lang=~a~%" lang)
159200
             (format #t "sha256=~a~%" sha256)
160201
             (format #t "size=~a~%" size)
161202
             (format #t "type=~a~%" type)