Add kanjidic
Makefile
9 | 9 | DICOS= | |
10 | 10 | DOWNLOADS= | |
11 | 11 | ||
12 | + | include jibiki.mk | |
13 | + | include jmdict.mk | |
14 | + | include kanjidic.mk | |
12 | 15 | include radicals.mk | |
13 | 16 | include wadoku.mk | |
14 | - | include jmdict.mk | |
15 | - | include jibiki.mk | |
16 | 17 | ||
17 | 18 | # Files that constitute the website | |
18 | 19 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
… | |||
73 | 74 | sha256sum $< | cut -f1 -d' ' > $@ | |
74 | 75 | ||
75 | 76 | dicos/list: $(DICOS) tools/list.scm $(MOFILES) | |
77 | + | rm -f $@ | |
76 | 78 | guile -L modules -L . tools/list.scm $@ $(DICOS) |
kanjidic.mk unknown status 1
1 | + | KANJIDIC_LANGS=en es fr pt | |
2 | + | KANJIDIC_MODULES=tools/kanjidic.scm | |
3 | + | DICOS+=$(addprefix dicos/kanjidic_, $(addsuffix .nani, $(KANJIDIC_LANGS))) | |
4 | + | DOWNLOADS+=dictionaries/kanjidic2.xml | |
5 | + | ||
6 | + | dictionaries/kanjidic2.xml: | |
7 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
8 | + | gunzip $@.gz | |
9 | + | ||
10 | + | dicos/kanjidic_%.nani: dictionaries/kanjidic2.xml tools/kanjidic.scm $(RADK_MODULES) | |
11 | + | guile -L modules tools/kanjidic.scm build $< $(shell basename $@ .nani | sed 's|^kanjidic_||g') $@ |
modules/nani/kanji/kanjidic.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani kanji kanjidic) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (ice-9 match) | |
22 | + | #:use-module (ice-9 rdelim) | |
23 | + | #:use-module (nani encoding huffman) | |
24 | + | #:use-module (nani encoding parse) | |
25 | + | #:use-module (nani encoding serialize) | |
26 | + | #:use-module (nani encoding trie) | |
27 | + | #:use-module (rnrs bytevectors) | |
28 | + | #:use-module (srfi srfi-1) | |
29 | + | #:use-module (srfi srfi-9) | |
30 | + | #:use-module (sxml simple) | |
31 | + | #:export (make-kanji | |
32 | + | kanji? | |
33 | + | kanji-position | |
34 | + | kanji-kanji | |
35 | + | kanji-strokes | |
36 | + | kanji-senses | |
37 | + | kanji-kun | |
38 | + | kanji-on | |
39 | + | kanji-nanori | |
40 | + | ||
41 | + | get-kanji-info | |
42 | + | serialize-kanji kanji-size | |
43 | + | serialize-kanjidic | |
44 | + | kanjidic-entry-count)) | |
45 | + | ||
46 | + | (define-record-type <kanji> | |
47 | + | (make-kanji position kanji strokes senses kun on nanori) | |
48 | + | kanji? | |
49 | + | (position kanji-position kanji-position-set!) | |
50 | + | (kanji kanji-kanji) | |
51 | + | (strokes kanji-strokes) | |
52 | + | (senses kanji-senses) | |
53 | + | (kun kanji-kun) | |
54 | + | (on kanji-on) | |
55 | + | (nanori kanji-nanori)) | |
56 | + | ||
57 | + | (define (sxml->on sxml) | |
58 | + | (match sxml | |
59 | + | (('reading ('@ ('r_type "ja_on")) reading) reading) | |
60 | + | (_ #f))) | |
61 | + | ||
62 | + | (define (sxml->kun sxml) | |
63 | + | (match sxml | |
64 | + | (('reading ('@ ('r_type "ja_kun")) reading) reading) | |
65 | + | (_ #f))) | |
66 | + | ||
67 | + | (define (sxml->nanori sxml) | |
68 | + | (match sxml | |
69 | + | (('nanori nanori) nanori) | |
70 | + | (_ #f))) | |
71 | + | ||
72 | + | (define (sxml->sense lang) | |
73 | + | (lambda (sxml) | |
74 | + | (match sxml | |
75 | + | (('meaning meaning) (if (equal? lang "en") meaning #f)) | |
76 | + | (('meaning ('@ ('m_lang m_lang)) meaning) | |
77 | + | (if (equal? lang m_lang) meaning #f)) | |
78 | + | (_ #f)))) | |
79 | + | ||
80 | + | (define (sxml->kanji lang) | |
81 | + | (lambda (entry) | |
82 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
83 | + | (misc (assoc-ref entry 'misc)) | |
84 | + | (misc (filter list? misc)) | |
85 | + | (strokes (string->number (car (assoc-ref misc 'stroke_count)))) | |
86 | + | (rm (assoc-ref entry 'reading_meaning))) | |
87 | + | (if rm | |
88 | + | (let* ((rm (filter list? rm)) | |
89 | + | (rmgroup (assoc-ref rm 'rmgroup)) | |
90 | + | (rmgroup (filter list? rmgroup)) | |
91 | + | (on (filter-map sxml->on rmgroup)) | |
92 | + | (kun (filter-map sxml->kun rmgroup)) | |
93 | + | (senses (filter-map (sxml->sense lang) rmgroup)) | |
94 | + | (nanori (filter-map sxml->nanori rm))) | |
95 | + | (make-kanji 0 literal strokes senses kun on nanori)) | |
96 | + | #f)))) | |
97 | + | ||
98 | + | (define (get-kanji-info file lang) | |
99 | + | (let* ((dic (xml->sxml (call-with-input-file file read-string))) | |
100 | + | (kanjis (match dic (('*TOP* _ ('kanjidic2 content ...)) content)))) | |
101 | + | (filter-map (sxml->kanji lang) | |
102 | + | (filter | |
103 | + | (match-lambda | |
104 | + | (('character _ ...) #t) | |
105 | + | (_ #f)) | |
106 | + | kanjis)))) | |
107 | + | ||
108 | + | (define (serialize-kanji sense-huffman reading-huffman) | |
109 | + | (lambda (kanji pos bv) | |
110 | + | (kanji-position-set! kanji pos) | |
111 | + | (let* ((pos (serialize-char (kanji-strokes kanji) pos bv)) | |
112 | + | (pos ((serialize-list (serialize-huffman-string sense-huffman)) | |
113 | + | (kanji-senses kanji) pos bv)) | |
114 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
115 | + | (kanji-kun kanji) pos bv)) | |
116 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
117 | + | (kanji-on kanji) pos bv)) | |
118 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
119 | + | (kanji-nanori kanji) pos bv))) | |
120 | + | pos))) | |
121 | + | (define (kanji-size sense-huffman reading-huffman) | |
122 | + | (lambda (kanji) | |
123 | + | (+ (char-size (kanji-strokes kanji)) | |
124 | + | ((list-size (huffman-string-size sense-huffman)) | |
125 | + | (kanji-senses kanji)) | |
126 | + | ((list-size (huffman-string-size reading-huffman)) | |
127 | + | (kanji-kun kanji)) | |
128 | + | ((list-size (huffman-string-size reading-huffman)) | |
129 | + | (kanji-on kanji)) | |
130 | + | ((list-size (huffman-string-size reading-huffman)) | |
131 | + | (kanji-nanori kanji))))) | |
132 | + | ||
133 | + | (define (update-trie-pos! trie kanji) | |
134 | + | (let* ((vals (trie-vals trie)) | |
135 | + | (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals))) | |
136 | + | (trie-vals-set! trie vals)) | |
137 | + | (for-each | |
138 | + | (match-lambda | |
139 | + | ((char . child) | |
140 | + | (update-trie-pos! child kanji))) | |
141 | + | (trie-transitions trie))) | |
142 | + | ||
143 | + | (define (make-key key) | |
144 | + | (apply append | |
145 | + | (map (lambda (c) | |
146 | + | (list (quotient c 16) (modulo c 16))) | |
147 | + | (bytevector->u8-list (string->utf8 key))))) | |
148 | + | ||
149 | + | (define (make-kanji-trie kanji) | |
150 | + | (let ((trie (make-empty-trie))) | |
151 | + | (let loop ((kanji kanji) (i 0)) | |
152 | + | (if (null? kanji) | |
153 | + | (compress-trie trie) | |
154 | + | (begin | |
155 | + | (add-to-trie! trie (make-key (kanji-kanji (car kanji))) i) | |
156 | + | (loop (cdr kanji) (+ i 1))))))) | |
157 | + | ||
158 | + | (define (serialize-kanjidic kanji) | |
159 | + | (define sense-huffman | |
160 | + | (let* ((senses (apply append (map kanji-senses kanji)))) | |
161 | + | (create-huffman senses))) | |
162 | + | (define sense-huffman-code (huffman->code sense-huffman)) | |
163 | + | (define reading-huffman | |
164 | + | (let* ((kun (apply append (map kanji-kun kanji))) | |
165 | + | (on (apply append (map kanji-on kanji))) | |
166 | + | (nanori (apply append (map kanji-nanori kanji)))) | |
167 | + | (create-huffman (append kun on nanori)))) | |
168 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
169 | + | ||
170 | + | (let* ((header (string->utf8 "NANI_KANJIDIC001")) | |
171 | + | (header-size (bytevector-length header)) | |
172 | + | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
173 | + | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
174 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
175 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
176 | + | (serialize-trie (serialize-trie serialize-int int-size)) | |
177 | + | (trie-size (trie-size int-size)) | |
178 | + | (kanji-trie (make-kanji-trie kanji)) | |
179 | + | (kanji-trie-size (trie-size kanji-trie)) | |
180 | + | (results-size | |
181 | + | ((list-size (kanji-size sense-huffman-code reading-huffman-code) | |
182 | + | #:size? #f) | |
183 | + | kanji)) | |
184 | + | (huffman-size (+ sense-huffman-size reading-huffman-size)) | |
185 | + | (bv (make-bytevector (+ header-size 4 huffman-size kanji-trie-size | |
186 | + | results-size)))) | |
187 | + | (format #t "Number of kanji: ~a~%" (length kanji)) | |
188 | + | ((serialize-list (serialize-kanji sense-huffman-code reading-huffman-code) | |
189 | + | #:size? #f) | |
190 | + | kanji (+ header-size 4 huffman-size kanji-trie-size) bv) | |
191 | + | (let ((kanji (list->array 1 kanji))) | |
192 | + | (update-trie-pos! kanji-trie kanji)) | |
193 | + | (bytevector-copy! header 0 bv 0 header-size) | |
194 | + | (serialize-int (length kanji) (+ header-size) bv) | |
195 | + | (bytevector-copy! sense-huffman-bv 0 bv (+ header-size 4) | |
196 | + | sense-huffman-size) | |
197 | + | (bytevector-copy! reading-huffman-bv 0 bv (+ header-size 4 sense-huffman-size) | |
198 | + | reading-huffman-size) | |
199 | + | (serialize-trie kanji-trie (+ header-size 4 huffman-size) bv) | |
200 | + | bv)) | |
201 | + | ||
202 | + | (define (kanjidic-entry-count file) | |
203 | + | (call-with-input-file file | |
204 | + | (lambda (port) | |
205 | + | (let* ((header (get-bytevector-n port 16)) | |
206 | + | (size (get-bytevector-n port 4))) | |
207 | + | (bytevector-u32-ref size 0 (endianness big)))))) |
radicals.mk
1 | - | RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm | |
1 | + | RADK_MODULES=tools/radk.scm | |
2 | 2 | DICOS+=dicos/radicals.nani | |
3 | - | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
3 | + | DOWNLOADS+=dictionaries/radkfilex.utf8 | |
4 | 4 | ||
5 | 5 | dictionaries/radkfilex.utf8: | |
6 | 6 | #wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
… | |||
9 | 9 | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
10 | 10 | rm dictionaries/radkfilex | |
11 | 11 | ||
12 | - | dictionaries/kanjidic2.xml: | |
13 | - | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
14 | - | gunzip $@.gz | |
15 | - | ||
16 | 12 | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
17 | 13 | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ |
tools/kanjidic.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (use-modules (nani kanji kanjidic)) | |
20 | + | (use-modules (ice-9 match)) | |
21 | + | (use-modules (ice-9 binary-ports)) | |
22 | + | ||
23 | + | (match (command-line) | |
24 | + | ((_ cmd kanjidic-file lang output) | |
25 | + | (cond | |
26 | + | ((equal? cmd "build") | |
27 | + | (let* ((kanji (get-kanji-info kanjidic-file lang))) | |
28 | + | (call-with-output-file output | |
29 | + | (lambda (port) | |
30 | + | (put-bytevector port | |
31 | + | (serialize-kanjidic kanji)))))) | |
32 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |
tools/list.scm
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | 19 | (use-modules (tools i18n)) | |
20 | + | (use-modules (nani kanji kanjidic)) | |
20 | 21 | (use-modules (nani kanji radk)) | |
21 | 22 | (use-modules (nani result result)) | |
22 | 23 | (use-modules (nani pitch pitch)) | |
… | |||
74 | 75 | as you can't search for anything. This dictionary can be searched for by | |
75 | 76 | kanji, reading (kana) and by meaning in the languages you selected.")) | |
76 | 77 | ||
78 | + | (define (kanjidic-synopsis lang) | |
79 | + | (match lang | |
80 | + | ("en" `(_ "Kanji dictionary with English meanings.")) | |
81 | + | ("es" `(_ "Kanji dictionary with Spanish meanings.")) | |
82 | + | ("fr" `(_ "Kanji dictionary with French meanings.")) | |
83 | + | ("pt" `(_ "Kanji dictionary with Portuguese meanings.")))) | |
84 | + | (define (kanjidic-description lang) | |
85 | + | `(_ "This dicitonary allows you to search for kanji and view kanji information | |
86 | + | such as number of strokes, pronunciations and meanings.")) | |
87 | + | ||
77 | 88 | (let* ((english | |
78 | 89 | (cond | |
79 | 90 | ((equal? (dico-type dico) "radk") | |
80 | 91 | (if long? | |
81 | 92 | radk-description | |
82 | 93 | radk-synopsis)) | |
94 | + | ((equal? (dico-type dico) "kanjidic") | |
95 | + | (let ((dico-lang (substring dico 9))) | |
96 | + | (if long? | |
97 | + | (kanjidic-description dico-lang) | |
98 | + | (kanjidic-synopsis dico-lang)))) | |
83 | 99 | ((equal? (dico-type dico) "wadoku") | |
84 | 100 | (if long? | |
85 | 101 | wadoku-description | |
… | |||
117 | 133 | (define (dico-type file) | |
118 | 134 | (cond | |
119 | 135 | ((equal? file "radicals") "radk") | |
136 | + | ((equal? file "kanjidic") "kanjidic") | |
120 | 137 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
121 | 138 | "jmdict") | |
122 | 139 | ((equal? file "jibiki_fre") "jibiki") | |
… | |||
127 | 144 | (cond | |
128 | 145 | ((equal? (dico-type (dico-name file)) "radk") | |
129 | 146 | (kanji-count file)) | |
147 | + | ((equal? (dico-type (dico-name file)) "kanjidic") | |
148 | + | (kanjidic-entry-count file)) | |
130 | 149 | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
131 | 150 | (dictionary-entry-count file)) | |
132 | 151 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
… | |||
135 | 154 | (define (dico-name file) | |
136 | 155 | (basename file ".nani")) | |
137 | 156 | ||
157 | + | (define (dico-lang name) | |
158 | + | (cond | |
159 | + | ((equal? name "radicals") "") | |
160 | + | ((equal? name "kanjidic") "") | |
161 | + | ((equal? name "wadoku_pitch") "") | |
162 | + | ((and (> (string-length name) 6) (equal? (substring name 0 6) "JMdict")) | |
163 | + | (let ((lang (substring name 7))) | |
164 | + | (match lang | |
165 | + | ("e" "en") | |
166 | + | ("dut" "nl") | |
167 | + | ("fre" "fr") | |
168 | + | ("ger" "de") | |
169 | + | ("hun" "hu") | |
170 | + | ("rus" "ru") | |
171 | + | ("slv" "sl") | |
172 | + | ("spa" "es") | |
173 | + | ("swe" "sv")))) | |
174 | + | ((equal? name "jibiki_fre") "fr") | |
175 | + | ((equal? name "wadoku_ger") "de"))) | |
176 | + | ||
138 | 177 | (match (command-line) | |
139 | 178 | ((_ output dicos ...) | |
140 | 179 | (with-output-to-file output | |
… | |||
144 | 183 | (let* ((sha256 (sha256 dico)) | |
145 | 184 | (size (filesize dico)) | |
146 | 185 | (name (dico-name dico)) | |
186 | + | (lang (dico-lang name)) | |
147 | 187 | (type (dico-type name)) | |
148 | 188 | (entry-count (entries dico))) | |
149 | 189 | (format #t "[~a]~%" name) | |
… | |||
156 | 196 | (when description | |
157 | 197 | (format #t "description=~a=~a~%" lang description)))) | |
158 | 198 | (filter (lambda (lang) (not (equal? lang ""))) languages)) | |
199 | + | (format #t "lang=~a~%" lang) | |
159 | 200 | (format #t "sha256=~a~%" sha256) | |
160 | 201 | (format #t "size=~a~%" size) | |
161 | 202 | (format #t "type=~a~%" type) |