Add kanjidic
Makefile
| 9 | 9 | DICOS= | |
| 10 | 10 | DOWNLOADS= | |
| 11 | 11 | ||
| 12 | + | include jibiki.mk | |
| 13 | + | include jmdict.mk | |
| 14 | + | include kanjidic.mk | |
| 12 | 15 | include radicals.mk | |
| 13 | 16 | include wadoku.mk | |
| 14 | - | include jmdict.mk | |
| 15 | - | include jibiki.mk | |
| 16 | 17 | ||
| 17 | 18 | # Files that constitute the website | |
| 18 | 19 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
… | |||
| 73 | 74 | sha256sum $< | cut -f1 -d' ' > $@ | |
| 74 | 75 | ||
| 75 | 76 | dicos/list: $(DICOS) tools/list.scm $(MOFILES) | |
| 77 | + | rm -f $@ | |
| 76 | 78 | guile -L modules -L . tools/list.scm $@ $(DICOS) | |
kanjidic.mk unknown status 1
| 1 | + | KANJIDIC_LANGS=en es fr pt | |
| 2 | + | KANJIDIC_MODULES=tools/kanjidic.scm | |
| 3 | + | DICOS+=$(addprefix dicos/kanjidic_, $(addsuffix .nani, $(KANJIDIC_LANGS))) | |
| 4 | + | DOWNLOADS+=dictionaries/kanjidic2.xml | |
| 5 | + | ||
| 6 | + | dictionaries/kanjidic2.xml: | |
| 7 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
| 8 | + | gunzip $@.gz | |
| 9 | + | ||
| 10 | + | dicos/kanjidic_%.nani: dictionaries/kanjidic2.xml tools/kanjidic.scm $(RADK_MODULES) | |
| 11 | + | guile -L modules tools/kanjidic.scm build $< $(shell basename $@ .nani | sed 's|^kanjidic_||g') $@ |
modules/nani/kanji/kanjidic.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani kanji kanjidic) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (ice-9 match) | |
| 22 | + | #:use-module (ice-9 rdelim) | |
| 23 | + | #:use-module (nani encoding huffman) | |
| 24 | + | #:use-module (nani encoding parse) | |
| 25 | + | #:use-module (nani encoding serialize) | |
| 26 | + | #:use-module (nani encoding trie) | |
| 27 | + | #:use-module (rnrs bytevectors) | |
| 28 | + | #:use-module (srfi srfi-1) | |
| 29 | + | #:use-module (srfi srfi-9) | |
| 30 | + | #:use-module (sxml simple) | |
| 31 | + | #:export (make-kanji | |
| 32 | + | kanji? | |
| 33 | + | kanji-position | |
| 34 | + | kanji-kanji | |
| 35 | + | kanji-strokes | |
| 36 | + | kanji-senses | |
| 37 | + | kanji-kun | |
| 38 | + | kanji-on | |
| 39 | + | kanji-nanori | |
| 40 | + | ||
| 41 | + | get-kanji-info | |
| 42 | + | serialize-kanji kanji-size | |
| 43 | + | serialize-kanjidic | |
| 44 | + | kanjidic-entry-count)) | |
| 45 | + | ||
| 46 | + | (define-record-type <kanji> | |
| 47 | + | (make-kanji position kanji strokes senses kun on nanori) | |
| 48 | + | kanji? | |
| 49 | + | (position kanji-position kanji-position-set!) | |
| 50 | + | (kanji kanji-kanji) | |
| 51 | + | (strokes kanji-strokes) | |
| 52 | + | (senses kanji-senses) | |
| 53 | + | (kun kanji-kun) | |
| 54 | + | (on kanji-on) | |
| 55 | + | (nanori kanji-nanori)) | |
| 56 | + | ||
| 57 | + | (define (sxml->on sxml) | |
| 58 | + | (match sxml | |
| 59 | + | (('reading ('@ ('r_type "ja_on")) reading) reading) | |
| 60 | + | (_ #f))) | |
| 61 | + | ||
| 62 | + | (define (sxml->kun sxml) | |
| 63 | + | (match sxml | |
| 64 | + | (('reading ('@ ('r_type "ja_kun")) reading) reading) | |
| 65 | + | (_ #f))) | |
| 66 | + | ||
| 67 | + | (define (sxml->nanori sxml) | |
| 68 | + | (match sxml | |
| 69 | + | (('nanori nanori) nanori) | |
| 70 | + | (_ #f))) | |
| 71 | + | ||
| 72 | + | (define (sxml->sense lang) | |
| 73 | + | (lambda (sxml) | |
| 74 | + | (match sxml | |
| 75 | + | (('meaning meaning) (if (equal? lang "en") meaning #f)) | |
| 76 | + | (('meaning ('@ ('m_lang m_lang)) meaning) | |
| 77 | + | (if (equal? lang m_lang) meaning #f)) | |
| 78 | + | (_ #f)))) | |
| 79 | + | ||
| 80 | + | (define (sxml->kanji lang) | |
| 81 | + | (lambda (entry) | |
| 82 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
| 83 | + | (misc (assoc-ref entry 'misc)) | |
| 84 | + | (misc (filter list? misc)) | |
| 85 | + | (strokes (string->number (car (assoc-ref misc 'stroke_count)))) | |
| 86 | + | (rm (assoc-ref entry 'reading_meaning))) | |
| 87 | + | (if rm | |
| 88 | + | (let* ((rm (filter list? rm)) | |
| 89 | + | (rmgroup (assoc-ref rm 'rmgroup)) | |
| 90 | + | (rmgroup (filter list? rmgroup)) | |
| 91 | + | (on (filter-map sxml->on rmgroup)) | |
| 92 | + | (kun (filter-map sxml->kun rmgroup)) | |
| 93 | + | (senses (filter-map (sxml->sense lang) rmgroup)) | |
| 94 | + | (nanori (filter-map sxml->nanori rm))) | |
| 95 | + | (make-kanji 0 literal strokes senses kun on nanori)) | |
| 96 | + | #f)))) | |
| 97 | + | ||
| 98 | + | (define (get-kanji-info file lang) | |
| 99 | + | (let* ((dic (xml->sxml (call-with-input-file file read-string))) | |
| 100 | + | (kanjis (match dic (('*TOP* _ ('kanjidic2 content ...)) content)))) | |
| 101 | + | (filter-map (sxml->kanji lang) | |
| 102 | + | (filter | |
| 103 | + | (match-lambda | |
| 104 | + | (('character _ ...) #t) | |
| 105 | + | (_ #f)) | |
| 106 | + | kanjis)))) | |
| 107 | + | ||
| 108 | + | (define (serialize-kanji sense-huffman reading-huffman) | |
| 109 | + | (lambda (kanji pos bv) | |
| 110 | + | (kanji-position-set! kanji pos) | |
| 111 | + | (let* ((pos (serialize-char (kanji-strokes kanji) pos bv)) | |
| 112 | + | (pos ((serialize-list (serialize-huffman-string sense-huffman)) | |
| 113 | + | (kanji-senses kanji) pos bv)) | |
| 114 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
| 115 | + | (kanji-kun kanji) pos bv)) | |
| 116 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
| 117 | + | (kanji-on kanji) pos bv)) | |
| 118 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman)) | |
| 119 | + | (kanji-nanori kanji) pos bv))) | |
| 120 | + | pos))) | |
| 121 | + | (define (kanji-size sense-huffman reading-huffman) | |
| 122 | + | (lambda (kanji) | |
| 123 | + | (+ (char-size (kanji-strokes kanji)) | |
| 124 | + | ((list-size (huffman-string-size sense-huffman)) | |
| 125 | + | (kanji-senses kanji)) | |
| 126 | + | ((list-size (huffman-string-size reading-huffman)) | |
| 127 | + | (kanji-kun kanji)) | |
| 128 | + | ((list-size (huffman-string-size reading-huffman)) | |
| 129 | + | (kanji-on kanji)) | |
| 130 | + | ((list-size (huffman-string-size reading-huffman)) | |
| 131 | + | (kanji-nanori kanji))))) | |
| 132 | + | ||
| 133 | + | (define (update-trie-pos! trie kanji) | |
| 134 | + | (let* ((vals (trie-vals trie)) | |
| 135 | + | (vals (map (lambda (i) (kanji-position (array-ref kanji i))) vals))) | |
| 136 | + | (trie-vals-set! trie vals)) | |
| 137 | + | (for-each | |
| 138 | + | (match-lambda | |
| 139 | + | ((char . child) | |
| 140 | + | (update-trie-pos! child kanji))) | |
| 141 | + | (trie-transitions trie))) | |
| 142 | + | ||
| 143 | + | (define (make-key key) | |
| 144 | + | (apply append | |
| 145 | + | (map (lambda (c) | |
| 146 | + | (list (quotient c 16) (modulo c 16))) | |
| 147 | + | (bytevector->u8-list (string->utf8 key))))) | |
| 148 | + | ||
| 149 | + | (define (make-kanji-trie kanji) | |
| 150 | + | (let ((trie (make-empty-trie))) | |
| 151 | + | (let loop ((kanji kanji) (i 0)) | |
| 152 | + | (if (null? kanji) | |
| 153 | + | (compress-trie trie) | |
| 154 | + | (begin | |
| 155 | + | (add-to-trie! trie (make-key (kanji-kanji (car kanji))) i) | |
| 156 | + | (loop (cdr kanji) (+ i 1))))))) | |
| 157 | + | ||
| 158 | + | (define (serialize-kanjidic kanji) | |
| 159 | + | (define sense-huffman | |
| 160 | + | (let* ((senses (apply append (map kanji-senses kanji)))) | |
| 161 | + | (create-huffman senses))) | |
| 162 | + | (define sense-huffman-code (huffman->code sense-huffman)) | |
| 163 | + | (define reading-huffman | |
| 164 | + | (let* ((kun (apply append (map kanji-kun kanji))) | |
| 165 | + | (on (apply append (map kanji-on kanji))) | |
| 166 | + | (nanori (apply append (map kanji-nanori kanji)))) | |
| 167 | + | (create-huffman (append kun on nanori)))) | |
| 168 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
| 169 | + | ||
| 170 | + | (let* ((header (string->utf8 "NANI_KANJIDIC001")) | |
| 171 | + | (header-size (bytevector-length header)) | |
| 172 | + | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
| 173 | + | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
| 174 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
| 175 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
| 176 | + | (serialize-trie (serialize-trie serialize-int int-size)) | |
| 177 | + | (trie-size (trie-size int-size)) | |
| 178 | + | (kanji-trie (make-kanji-trie kanji)) | |
| 179 | + | (kanji-trie-size (trie-size kanji-trie)) | |
| 180 | + | (results-size | |
| 181 | + | ((list-size (kanji-size sense-huffman-code reading-huffman-code) | |
| 182 | + | #:size? #f) | |
| 183 | + | kanji)) | |
| 184 | + | (huffman-size (+ sense-huffman-size reading-huffman-size)) | |
| 185 | + | (bv (make-bytevector (+ header-size 4 huffman-size kanji-trie-size | |
| 186 | + | results-size)))) | |
| 187 | + | (format #t "Number of kanji: ~a~%" (length kanji)) | |
| 188 | + | ((serialize-list (serialize-kanji sense-huffman-code reading-huffman-code) | |
| 189 | + | #:size? #f) | |
| 190 | + | kanji (+ header-size 4 huffman-size kanji-trie-size) bv) | |
| 191 | + | (let ((kanji (list->array 1 kanji))) | |
| 192 | + | (update-trie-pos! kanji-trie kanji)) | |
| 193 | + | (bytevector-copy! header 0 bv 0 header-size) | |
| 194 | + | (serialize-int (length kanji) (+ header-size) bv) | |
| 195 | + | (bytevector-copy! sense-huffman-bv 0 bv (+ header-size 4) | |
| 196 | + | sense-huffman-size) | |
| 197 | + | (bytevector-copy! reading-huffman-bv 0 bv (+ header-size 4 sense-huffman-size) | |
| 198 | + | reading-huffman-size) | |
| 199 | + | (serialize-trie kanji-trie (+ header-size 4 huffman-size) bv) | |
| 200 | + | bv)) | |
| 201 | + | ||
| 202 | + | (define (kanjidic-entry-count file) | |
| 203 | + | (call-with-input-file file | |
| 204 | + | (lambda (port) | |
| 205 | + | (let* ((header (get-bytevector-n port 16)) | |
| 206 | + | (size (get-bytevector-n port 4))) | |
| 207 | + | (bytevector-u32-ref size 0 (endianness big)))))) |
radicals.mk
| 1 | - | RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm | |
| 1 | + | RADK_MODULES=tools/radk.scm | |
| 2 | 2 | DICOS+=dicos/radicals.nani | |
| 3 | - | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
| 3 | + | DOWNLOADS+=dictionaries/radkfilex.utf8 | |
| 4 | 4 | ||
| 5 | 5 | dictionaries/radkfilex.utf8: | |
| 6 | 6 | #wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
… | |||
| 9 | 9 | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
| 10 | 10 | rm dictionaries/radkfilex | |
| 11 | 11 | ||
| 12 | - | dictionaries/kanjidic2.xml: | |
| 13 | - | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@.gz | |
| 14 | - | gunzip $@.gz | |
| 15 | - | ||
| 16 | 12 | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
| 17 | 13 | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ | |
tools/kanjidic.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (use-modules (nani kanji kanjidic)) | |
| 20 | + | (use-modules (ice-9 match)) | |
| 21 | + | (use-modules (ice-9 binary-ports)) | |
| 22 | + | ||
| 23 | + | (match (command-line) | |
| 24 | + | ((_ cmd kanjidic-file lang output) | |
| 25 | + | (cond | |
| 26 | + | ((equal? cmd "build") | |
| 27 | + | (let* ((kanji (get-kanji-info kanjidic-file lang))) | |
| 28 | + | (call-with-output-file output | |
| 29 | + | (lambda (port) | |
| 30 | + | (put-bytevector port | |
| 31 | + | (serialize-kanjidic kanji)))))) | |
| 32 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |
tools/list.scm
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | 19 | (use-modules (tools i18n)) | |
| 20 | + | (use-modules (nani kanji kanjidic)) | |
| 20 | 21 | (use-modules (nani kanji radk)) | |
| 21 | 22 | (use-modules (nani result result)) | |
| 22 | 23 | (use-modules (nani pitch pitch)) | |
… | |||
| 74 | 75 | as you can't search for anything. This dictionary can be searched for by | |
| 75 | 76 | kanji, reading (kana) and by meaning in the languages you selected.")) | |
| 76 | 77 | ||
| 78 | + | (define (kanjidic-synopsis lang) | |
| 79 | + | (match lang | |
| 80 | + | ("en" `(_ "Kanji dictionary with English meanings.")) | |
| 81 | + | ("es" `(_ "Kanji dictionary with Spanish meanings.")) | |
| 82 | + | ("fr" `(_ "Kanji dictionary with French meanings.")) | |
| 83 | + | ("pt" `(_ "Kanji dictionary with Portuguese meanings.")))) | |
| 84 | + | (define (kanjidic-description lang) | |
| 85 | + | `(_ "This dicitonary allows you to search for kanji and view kanji information | |
| 86 | + | such as number of strokes, pronunciations and meanings.")) | |
| 87 | + | ||
| 77 | 88 | (let* ((english | |
| 78 | 89 | (cond | |
| 79 | 90 | ((equal? (dico-type dico) "radk") | |
| 80 | 91 | (if long? | |
| 81 | 92 | radk-description | |
| 82 | 93 | radk-synopsis)) | |
| 94 | + | ((equal? (dico-type dico) "kanjidic") | |
| 95 | + | (let ((dico-lang (substring dico 9))) | |
| 96 | + | (if long? | |
| 97 | + | (kanjidic-description dico-lang) | |
| 98 | + | (kanjidic-synopsis dico-lang)))) | |
| 83 | 99 | ((equal? (dico-type dico) "wadoku") | |
| 84 | 100 | (if long? | |
| 85 | 101 | wadoku-description | |
… | |||
| 117 | 133 | (define (dico-type file) | |
| 118 | 134 | (cond | |
| 119 | 135 | ((equal? file "radicals") "radk") | |
| 136 | + | ((equal? file "kanjidic") "kanjidic") | |
| 120 | 137 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
| 121 | 138 | "jmdict") | |
| 122 | 139 | ((equal? file "jibiki_fre") "jibiki") | |
… | |||
| 127 | 144 | (cond | |
| 128 | 145 | ((equal? (dico-type (dico-name file)) "radk") | |
| 129 | 146 | (kanji-count file)) | |
| 147 | + | ((equal? (dico-type (dico-name file)) "kanjidic") | |
| 148 | + | (kanjidic-entry-count file)) | |
| 130 | 149 | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
| 131 | 150 | (dictionary-entry-count file)) | |
| 132 | 151 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
… | |||
| 135 | 154 | (define (dico-name file) | |
| 136 | 155 | (basename file ".nani")) | |
| 137 | 156 | ||
| 157 | + | (define (dico-lang name) | |
| 158 | + | (cond | |
| 159 | + | ((equal? name "radicals") "") | |
| 160 | + | ((equal? name "kanjidic") "") | |
| 161 | + | ((equal? name "wadoku_pitch") "") | |
| 162 | + | ((and (> (string-length name) 6) (equal? (substring name 0 6) "JMdict")) | |
| 163 | + | (let ((lang (substring name 7))) | |
| 164 | + | (match lang | |
| 165 | + | ("e" "en") | |
| 166 | + | ("dut" "nl") | |
| 167 | + | ("fre" "fr") | |
| 168 | + | ("ger" "de") | |
| 169 | + | ("hun" "hu") | |
| 170 | + | ("rus" "ru") | |
| 171 | + | ("slv" "sl") | |
| 172 | + | ("spa" "es") | |
| 173 | + | ("swe" "sv")))) | |
| 174 | + | ((equal? name "jibiki_fre") "fr") | |
| 175 | + | ((equal? name "wadoku_ger") "de"))) | |
| 176 | + | ||
| 138 | 177 | (match (command-line) | |
| 139 | 178 | ((_ output dicos ...) | |
| 140 | 179 | (with-output-to-file output | |
… | |||
| 144 | 183 | (let* ((sha256 (sha256 dico)) | |
| 145 | 184 | (size (filesize dico)) | |
| 146 | 185 | (name (dico-name dico)) | |
| 186 | + | (lang (dico-lang name)) | |
| 147 | 187 | (type (dico-type name)) | |
| 148 | 188 | (entry-count (entries dico))) | |
| 149 | 189 | (format #t "[~a]~%" name) | |
… | |||
| 156 | 196 | (when description | |
| 157 | 197 | (format #t "description=~a=~a~%" lang description)))) | |
| 158 | 198 | (filter (lambda (lang) (not (equal? lang ""))) languages)) | |
| 199 | + | (format #t "lang=~a~%" lang) | |
| 159 | 200 | (format #t "sha256=~a~%" sha256) | |
| 160 | 201 | (format #t "size=~a~%" size) | |
| 161 | 202 | (format #t "type=~a~%" type) | |