Add radk generation
Makefile
| 14 | 14 | ||
| 15 | 15 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
| 16 | 16 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
| 17 | - | modules/nani/jmdict/entities.scm modules/nani/frequency.scm | |
| 17 | + | modules/nani/jmdict/entities.scm modules/nani/frequency.scm \ | |
| 18 | + | modules/nani/serialize.scm | |
| 18 | 19 | ||
| 19 | 20 | LANGS=fr | |
| 20 | 21 | ||
… | |||
| 70 | 71 | ||
| 71 | 72 | po/nani.pot: $(HAUNT_FILES) | |
| 72 | 73 | xgettext --keyword=_ --language=scheme --add-comments --sort-output --from-code UTF-8 -o $@ $^ | |
| 74 | + | ||
| 75 | + | include radicals.mk | |
modules/nani/jmdict/serialize.scm
| 19 | 19 | (define-module (nani jmdict serialize) | |
| 20 | 20 | #:use-module (nani huffman) | |
| 21 | 21 | #:use-module (nani result) | |
| 22 | + | #:use-module (nani serialize) | |
| 22 | 23 | #:use-module (nani trie) | |
| 23 | 24 | #:use-module (rnrs bytevectors) | |
| 24 | 25 | #:export (serialize-jmdict)) | |
| 25 | 26 | ||
| 26 | - | (define (merge-bvs bvs) | |
| 27 | - | (let* ((size (apply + (map bytevector-length bvs))) | |
| 28 | - | (bv (make-bytevector size 0))) | |
| 29 | - | (let loop ((bvs bvs) (pos 0)) | |
| 30 | - | (unless (null? bvs) | |
| 31 | - | (let ((sz (bytevector-length (car bvs)))) | |
| 32 | - | (bytevector-copy! (car bvs) 0 bv pos sz) | |
| 33 | - | (loop (cdr bvs) (+ pos sz))))) | |
| 34 | - | bv)) | |
| 35 | - | ||
| 36 | 27 | (define (serialize-trie trie results pos) | |
| 37 | 28 | (define (serialize-trie-aux transitions pos) | |
| 38 | 29 | (let loop ((pos pos) | |
… | |||
| 63 | 54 | (+ vals-sz 1) bv #:size? #f) | |
| 64 | 55 | (cons next-pos (cons bv bvs))))) | |
| 65 | 56 | ||
| 66 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
| 67 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 68 | - | (when size? | |
| 69 | - | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
| 70 | - | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
| 71 | - | (if (null? lst) | |
| 72 | - | pos | |
| 73 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
| 74 | - | (define* (list-size lst size #:key (size? #t)) | |
| 75 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 76 | - | (apply + (if size? 2 0) (map size lst))) | |
| 77 | - | ||
| 78 | - | (define (serialize-pointer ptr pos bv) | |
| 79 | - | (bytevector-u8-set! bv pos (car ptr)) | |
| 80 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
| 81 | - | (+ pos 5)) | |
| 82 | - | ||
| 83 | - | (define (serialize-char int pos bv) | |
| 84 | - | (bytevector-u8-set! bv pos int) | |
| 85 | - | (+ pos 1)) | |
| 86 | - | (define char-size (const 1)) | |
| 87 | - | ||
| 88 | - | (define (serialize-int int pos bv) | |
| 89 | - | (bytevector-u32-set! bv pos int (endianness big)) | |
| 90 | - | (+ pos 4)) | |
| 91 | - | (define int-size (const 4)) | |
| 92 | - | ||
| 93 | - | (define (serialize-boolean bool pos bv) | |
| 94 | - | (bytevector-u8-set! bv pos (if bool 1 0)) | |
| 95 | - | (+ pos 1)) | |
| 96 | - | (define boolean-size (const 1)) | |
| 97 | - | ||
| 98 | 57 | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
| 99 | 58 | (define kanji-huffman | |
| 100 | 59 | (let ((kanjis (apply append (map result-kanjis results)))) | |
… | |||
| 111 | 70 | (create-huffman glosses))) | |
| 112 | 71 | (define sense-huffman-code (huffman->code sense-huffman)) | |
| 113 | 72 | ||
| 114 | - | (define (serialize-string str pos bv) | |
| 115 | - | (let ((sbv (string->utf8 str))) | |
| 116 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 117 | - | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
| 118 | - | (+ pos 1 (bytevector-length sbv)))) | |
| 119 | - | (define (string-size str) | |
| 120 | - | (let ((sbv (string->utf8 str))) | |
| 121 | - | (+ 1 (bytevector-length sbv)))) | |
| 122 | - | ||
| 123 | 73 | (define (serialize-huffman-string huffman-code) | |
| 124 | 74 | (lambda (str pos bv) | |
| 125 | 75 | (let ((sbv (huffman-encode huffman-code str))) | |
modules/nani/radk.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani radk) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (ice-9 peg) | |
| 22 | + | #:use-module (ice-9 rdelim) | |
| 23 | + | #:use-module (nani serialize) | |
| 24 | + | #:use-module (rnrs bytevectors) | |
| 25 | + | #:use-module (sxml simple) | |
| 26 | + | #:export (parse-radk | |
| 27 | + | get-kanji-stroke | |
| 28 | + | get-rad-kanji | |
| 29 | + | get-rad-stroke | |
| 30 | + | serialize-radk)) | |
| 31 | + | ||
| 32 | + | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
| 33 | + | (define-peg-pattern space none " ") | |
| 34 | + | (define-peg-pattern return none "\n") | |
| 35 | + | (define-peg-pattern entry all | |
| 36 | + | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
| 37 | + | (+ (or char (ignore "\n"))))) | |
| 38 | + | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
| 39 | + | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
| 40 | + | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
| 41 | + | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
| 42 | + | ||
| 43 | + | (define (parse-radk file) | |
| 44 | + | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
| 45 | + | ||
| 46 | + | (define (get-rad-kanji content) | |
| 47 | + | (let loop ((result '()) (content content)) | |
| 48 | + | (match content | |
| 49 | + | (() result) | |
| 50 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 51 | + | (loop (cons (cons radical kanji) result) | |
| 52 | + | content))))) | |
| 53 | + | ||
| 54 | + | (define (get-rad-stroke content) | |
| 55 | + | (let loop ((result '()) (content content)) | |
| 56 | + | (match content | |
| 57 | + | (() result) | |
| 58 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 59 | + | (loop (cons (cons radical (string->number stroke)) result) | |
| 60 | + | content))))) | |
| 61 | + | ||
| 62 | + | (define (get-kanji-stroke file) | |
| 63 | + | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
| 64 | + | ||
| 65 | + | (match strokes | |
| 66 | + | (('*TOP* _ ('kanjidic2 content ...)) | |
| 67 | + | (map | |
| 68 | + | (lambda (entry) | |
| 69 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
| 70 | + | (misc (assoc-ref entry 'misc)) | |
| 71 | + | (misc (filter list? misc)) | |
| 72 | + | (stroke (car (assoc-ref misc 'stroke_count)))) | |
| 73 | + | (cons literal (string->number stroke)))) | |
| 74 | + | (filter | |
| 75 | + | (lambda (entry) | |
| 76 | + | (and | |
| 77 | + | (list? entry) | |
| 78 | + | (equal? (car entry) 'character))) | |
| 79 | + | content))))) | |
| 80 | + | ||
| 81 | + | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
| 82 | + | (define (serialize-rad-kanji-element element pos bv) | |
| 83 | + | (match element | |
| 84 | + | ((radical kanji ...) | |
| 85 | + | (let* ((pos (serialize-string radical pos bv))) | |
| 86 | + | (serialize-list kanji serialize-string pos bv))))) | |
| 87 | + | (define (rad-kanji-element-size element) | |
| 88 | + | (match element | |
| 89 | + | ((radical kanji ...) | |
| 90 | + | (+ (string-size radical) (list-size kanji string-size))))) | |
| 91 | + | ||
| 92 | + | (define (serialize-rad-kanji rad-kanji pos bv) | |
| 93 | + | (serialize-list rad-kanji serialize-rad-kanji-element pos bv)) | |
| 94 | + | (define (rad-kanji-size rad-kanji) | |
| 95 | + | (list-size rad-kanji rad-kanji-element-size)) | |
| 96 | + | ||
| 97 | + | (define (serialize-rad-stroke-element element pos bv) | |
| 98 | + | (match element | |
| 99 | + | ((radical . stroke) | |
| 100 | + | (let ((pos (serialize-string radical pos bv))) | |
| 101 | + | (serialize-char stroke pos bv))))) | |
| 102 | + | (define (rad-stroke-element-size element) | |
| 103 | + | (match element | |
| 104 | + | ((radical . stroke) | |
| 105 | + | (+ (string-size radical) (char-size stroke))))) | |
| 106 | + | ||
| 107 | + | (define (serialize-rad-stroke rad-stroke pos bv) | |
| 108 | + | (serialize-list rad-stroke serialize-rad-stroke-element pos bv)) | |
| 109 | + | (define (rad-stroke-size rad-stroke) | |
| 110 | + | (list-size rad-stroke rad-stroke-element-size)) | |
| 111 | + | ||
| 112 | + | (define serialize-kanji-stroke serialize-rad-stroke) | |
| 113 | + | (define kanji-stroke-size rad-stroke-size) | |
| 114 | + | ||
| 115 | + | (let* ((header (string->utf8 "NANI_RADK001")) | |
| 116 | + | (header-size (bytevector-length header)) | |
| 117 | + | (bv (make-bytevector (+ header-size 12 | |
| 118 | + | (rad-kanji-size rad-kanji) | |
| 119 | + | (rad-stroke-size rad-stroke) | |
| 120 | + | (kanji-stroke-size kanji-stroke))))) | |
| 121 | + | (bytevector-copy! header 0 bv 0 header-size) | |
| 122 | + | (let* ((pos header-size) | |
| 123 | + | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
| 124 | + | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
| 125 | + | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
| 126 | + | bv))) | |
| 127 | + | ||
| 128 | + | ||
| 129 | + | ||
| 130 | + | ||
| 131 | + |
modules/nani/serialize.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani serialize) | |
| 20 | + | #:use-module (rnrs bytevectors) | |
| 21 | + | #:export (merge-bvs | |
| 22 | + | serialize-list list-size | |
| 23 | + | serialize-pointer | |
| 24 | + | serialize-char char-size | |
| 25 | + | serialize-int int-size | |
| 26 | + | serialize-boolean boolean-size | |
| 27 | + | serialize-string string-size)) | |
| 28 | + | ||
| 29 | + | (define (merge-bvs bvs) | |
| 30 | + | (let* ((size (apply + (map bytevector-length bvs))) | |
| 31 | + | (bv (make-bytevector size 0))) | |
| 32 | + | (let loop ((bvs bvs) (pos 0)) | |
| 33 | + | (unless (null? bvs) | |
| 34 | + | (let ((sz (bytevector-length (car bvs)))) | |
| 35 | + | (bytevector-copy! (car bvs) 0 bv pos sz) | |
| 36 | + | (loop (cdr bvs) (+ pos sz))))) | |
| 37 | + | bv)) | |
| 38 | + | ||
| 39 | + | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
| 40 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 41 | + | (when size? | |
| 42 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
| 43 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
| 44 | + | (if (null? lst) | |
| 45 | + | pos | |
| 46 | + | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
| 47 | + | (define* (list-size lst size #:key (size? #t)) | |
| 48 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 49 | + | (apply + (if size? 2 0) (map size lst))) | |
| 50 | + | ||
| 51 | + | (define (serialize-pointer ptr pos bv) | |
| 52 | + | (bytevector-u8-set! bv pos (car ptr)) | |
| 53 | + | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
| 54 | + | (+ pos 5)) | |
| 55 | + | ||
| 56 | + | (define (serialize-char int pos bv) | |
| 57 | + | (bytevector-u8-set! bv pos int) | |
| 58 | + | (+ pos 1)) | |
| 59 | + | (define char-size (const 1)) | |
| 60 | + | ||
| 61 | + | (define (serialize-int int pos bv) | |
| 62 | + | (bytevector-u32-set! bv pos int (endianness big)) | |
| 63 | + | (+ pos 4)) | |
| 64 | + | (define int-size (const 4)) | |
| 65 | + | ||
| 66 | + | (define (serialize-boolean bool pos bv) | |
| 67 | + | (bytevector-u8-set! bv pos (if bool 1 0)) | |
| 68 | + | (+ pos 1)) | |
| 69 | + | (define boolean-size (const 1)) | |
| 70 | + | ||
| 71 | + | (define (serialize-string str pos bv) | |
| 72 | + | (let ((sbv (string->utf8 str))) | |
| 73 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 74 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
| 75 | + | (+ pos 1 (bytevector-length sbv)))) | |
| 76 | + | (define (string-size str) | |
| 77 | + | (let ((sbv (string->utf8 str))) | |
| 78 | + | (+ 1 (bytevector-length sbv)))) |
radicals.mk unknown status 1
| 1 | + | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
| 2 | + | ||
| 3 | + | dictionaries/kradzip.zip: | |
| 4 | + | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@ | |
| 5 | + | ||
| 6 | + | dictionaries/radkfilex: dictionaries/kradzip.zip | |
| 7 | + | unzip $^ $$(basename $@) -d $$(dirname $@) | |
| 8 | + | touch $@ | |
| 9 | + | ||
| 10 | + | dictionaries/radkfilex.utf8: dictionaries/radkfilex | |
| 11 | + | iconv -f euc-jp -t utf-8 $^ > $@ | |
| 12 | + | ||
| 13 | + | dictionaries/kanjidic2.xml.gz: | |
| 14 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@ | |
| 15 | + | ||
| 16 | + | dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz | |
| 17 | + | gunzip $^ | |
| 18 | + | ||
| 19 | + | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
| 20 | + | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ |
tools/jmdict.scm
| 63 | 63 | (put-bytevector port | |
| 64 | 64 | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
| 65 | 65 | ||
| 66 | + | (define (print word dict) | |
| 67 | + | ()) | |
| 68 | + | ||
| 66 | 69 | (match (command-line) | |
| 67 | 70 | ((_ cmd input lang output) | |
| 68 | 71 | (cond | |
… | |||
| 72 | 75 | (compile input (lambda (sense) (equal? (sense-language sense) lang)) output))) | |
| 73 | 76 | ((equal? cmd "convert") | |
| 74 | 77 | (convert input output)) | |
| 75 | - | (else (format #t "Unknown cmd ~a.~%" cmd))))) | |
| 78 | + | (else (format #t "Unknown cmd ~a.~%" cmd)))) | |
| 79 | + | ((_ "print" word input) | |
| 80 | + | (print word input))) | |
tools/radk.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (use-modules (nani radk)) | |
| 20 | + | (use-modules (ice-9 match)) | |
| 21 | + | (use-modules (ice-9 binary-ports)) | |
| 22 | + | ||
| 23 | + | (match (command-line) | |
| 24 | + | ((_ cmd radk-file kanjidic-file output) | |
| 25 | + | (cond | |
| 26 | + | ((equal? cmd "build") | |
| 27 | + | (let* ((radk (parse-radk radk-file)) | |
| 28 | + | (rad-stroke (get-rad-stroke radk)) | |
| 29 | + | (rad-kanji (get-rad-kanji radk)) | |
| 30 | + | (kanji-stroke (get-kanji-stroke kanjidic-file))) | |
| 31 | + | (call-with-output-file output | |
| 32 | + | (lambda (port) | |
| 33 | + | (put-bytevector port | |
| 34 | + | (serialize-radk rad-kanji rad-stroke kanji-stroke)))))) | |
| 35 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |