Add radk generation
Makefile
14 | 14 | ||
15 | 15 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
16 | 16 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
17 | - | modules/nani/jmdict/entities.scm modules/nani/frequency.scm | |
17 | + | modules/nani/jmdict/entities.scm modules/nani/frequency.scm \ | |
18 | + | modules/nani/serialize.scm | |
18 | 19 | ||
19 | 20 | LANGS=fr | |
20 | 21 | ||
… | |||
70 | 71 | ||
71 | 72 | po/nani.pot: $(HAUNT_FILES) | |
72 | 73 | xgettext --keyword=_ --language=scheme --add-comments --sort-output --from-code UTF-8 -o $@ $^ | |
74 | + | ||
75 | + | include radicals.mk |
modules/nani/jmdict/serialize.scm
19 | 19 | (define-module (nani jmdict serialize) | |
20 | 20 | #:use-module (nani huffman) | |
21 | 21 | #:use-module (nani result) | |
22 | + | #:use-module (nani serialize) | |
22 | 23 | #:use-module (nani trie) | |
23 | 24 | #:use-module (rnrs bytevectors) | |
24 | 25 | #:export (serialize-jmdict)) | |
25 | 26 | ||
26 | - | (define (merge-bvs bvs) | |
27 | - | (let* ((size (apply + (map bytevector-length bvs))) | |
28 | - | (bv (make-bytevector size 0))) | |
29 | - | (let loop ((bvs bvs) (pos 0)) | |
30 | - | (unless (null? bvs) | |
31 | - | (let ((sz (bytevector-length (car bvs)))) | |
32 | - | (bytevector-copy! (car bvs) 0 bv pos sz) | |
33 | - | (loop (cdr bvs) (+ pos sz))))) | |
34 | - | bv)) | |
35 | - | ||
36 | 27 | (define (serialize-trie trie results pos) | |
37 | 28 | (define (serialize-trie-aux transitions pos) | |
38 | 29 | (let loop ((pos pos) | |
… | |||
63 | 54 | (+ vals-sz 1) bv #:size? #f) | |
64 | 55 | (cons next-pos (cons bv bvs))))) | |
65 | 56 | ||
66 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
67 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
68 | - | (when size? | |
69 | - | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
70 | - | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
71 | - | (if (null? lst) | |
72 | - | pos | |
73 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
74 | - | (define* (list-size lst size #:key (size? #t)) | |
75 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
76 | - | (apply + (if size? 2 0) (map size lst))) | |
77 | - | ||
78 | - | (define (serialize-pointer ptr pos bv) | |
79 | - | (bytevector-u8-set! bv pos (car ptr)) | |
80 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
81 | - | (+ pos 5)) | |
82 | - | ||
83 | - | (define (serialize-char int pos bv) | |
84 | - | (bytevector-u8-set! bv pos int) | |
85 | - | (+ pos 1)) | |
86 | - | (define char-size (const 1)) | |
87 | - | ||
88 | - | (define (serialize-int int pos bv) | |
89 | - | (bytevector-u32-set! bv pos int (endianness big)) | |
90 | - | (+ pos 4)) | |
91 | - | (define int-size (const 4)) | |
92 | - | ||
93 | - | (define (serialize-boolean bool pos bv) | |
94 | - | (bytevector-u8-set! bv pos (if bool 1 0)) | |
95 | - | (+ pos 1)) | |
96 | - | (define boolean-size (const 1)) | |
97 | - | ||
98 | 57 | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
99 | 58 | (define kanji-huffman | |
100 | 59 | (let ((kanjis (apply append (map result-kanjis results)))) | |
… | |||
111 | 70 | (create-huffman glosses))) | |
112 | 71 | (define sense-huffman-code (huffman->code sense-huffman)) | |
113 | 72 | ||
114 | - | (define (serialize-string str pos bv) | |
115 | - | (let ((sbv (string->utf8 str))) | |
116 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
117 | - | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
118 | - | (+ pos 1 (bytevector-length sbv)))) | |
119 | - | (define (string-size str) | |
120 | - | (let ((sbv (string->utf8 str))) | |
121 | - | (+ 1 (bytevector-length sbv)))) | |
122 | - | ||
123 | 73 | (define (serialize-huffman-string huffman-code) | |
124 | 74 | (lambda (str pos bv) | |
125 | 75 | (let ((sbv (huffman-encode huffman-code str))) |
modules/nani/radk.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani radk) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (ice-9 peg) | |
22 | + | #:use-module (ice-9 rdelim) | |
23 | + | #:use-module (nani serialize) | |
24 | + | #:use-module (rnrs bytevectors) | |
25 | + | #:use-module (sxml simple) | |
26 | + | #:export (parse-radk | |
27 | + | get-kanji-stroke | |
28 | + | get-rad-kanji | |
29 | + | get-rad-stroke | |
30 | + | serialize-radk)) | |
31 | + | ||
32 | + | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
33 | + | (define-peg-pattern space none " ") | |
34 | + | (define-peg-pattern return none "\n") | |
35 | + | (define-peg-pattern entry all | |
36 | + | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
37 | + | (+ (or char (ignore "\n"))))) | |
38 | + | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
39 | + | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
40 | + | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
41 | + | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
42 | + | ||
43 | + | (define (parse-radk file) | |
44 | + | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
45 | + | ||
46 | + | (define (get-rad-kanji content) | |
47 | + | (let loop ((result '()) (content content)) | |
48 | + | (match content | |
49 | + | (() result) | |
50 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
51 | + | (loop (cons (cons radical kanji) result) | |
52 | + | content))))) | |
53 | + | ||
54 | + | (define (get-rad-stroke content) | |
55 | + | (let loop ((result '()) (content content)) | |
56 | + | (match content | |
57 | + | (() result) | |
58 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
59 | + | (loop (cons (cons radical (string->number stroke)) result) | |
60 | + | content))))) | |
61 | + | ||
62 | + | (define (get-kanji-stroke file) | |
63 | + | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
64 | + | ||
65 | + | (match strokes | |
66 | + | (('*TOP* _ ('kanjidic2 content ...)) | |
67 | + | (map | |
68 | + | (lambda (entry) | |
69 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
70 | + | (misc (assoc-ref entry 'misc)) | |
71 | + | (misc (filter list? misc)) | |
72 | + | (stroke (car (assoc-ref misc 'stroke_count)))) | |
73 | + | (cons literal (string->number stroke)))) | |
74 | + | (filter | |
75 | + | (lambda (entry) | |
76 | + | (and | |
77 | + | (list? entry) | |
78 | + | (equal? (car entry) 'character))) | |
79 | + | content))))) | |
80 | + | ||
81 | + | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
82 | + | (define (serialize-rad-kanji-element element pos bv) | |
83 | + | (match element | |
84 | + | ((radical kanji ...) | |
85 | + | (let* ((pos (serialize-string radical pos bv))) | |
86 | + | (serialize-list kanji serialize-string pos bv))))) | |
87 | + | (define (rad-kanji-element-size element) | |
88 | + | (match element | |
89 | + | ((radical kanji ...) | |
90 | + | (+ (string-size radical) (list-size kanji string-size))))) | |
91 | + | ||
92 | + | (define (serialize-rad-kanji rad-kanji pos bv) | |
93 | + | (serialize-list rad-kanji serialize-rad-kanji-element pos bv)) | |
94 | + | (define (rad-kanji-size rad-kanji) | |
95 | + | (list-size rad-kanji rad-kanji-element-size)) | |
96 | + | ||
97 | + | (define (serialize-rad-stroke-element element pos bv) | |
98 | + | (match element | |
99 | + | ((radical . stroke) | |
100 | + | (let ((pos (serialize-string radical pos bv))) | |
101 | + | (serialize-char stroke pos bv))))) | |
102 | + | (define (rad-stroke-element-size element) | |
103 | + | (match element | |
104 | + | ((radical . stroke) | |
105 | + | (+ (string-size radical) (char-size stroke))))) | |
106 | + | ||
107 | + | (define (serialize-rad-stroke rad-stroke pos bv) | |
108 | + | (serialize-list rad-stroke serialize-rad-stroke-element pos bv)) | |
109 | + | (define (rad-stroke-size rad-stroke) | |
110 | + | (list-size rad-stroke rad-stroke-element-size)) | |
111 | + | ||
112 | + | (define serialize-kanji-stroke serialize-rad-stroke) | |
113 | + | (define kanji-stroke-size rad-stroke-size) | |
114 | + | ||
115 | + | (let* ((header (string->utf8 "NANI_RADK001")) | |
116 | + | (header-size (bytevector-length header)) | |
117 | + | (bv (make-bytevector (+ header-size 12 | |
118 | + | (rad-kanji-size rad-kanji) | |
119 | + | (rad-stroke-size rad-stroke) | |
120 | + | (kanji-stroke-size kanji-stroke))))) | |
121 | + | (bytevector-copy! header 0 bv 0 header-size) | |
122 | + | (let* ((pos header-size) | |
123 | + | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
124 | + | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
125 | + | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
126 | + | bv))) | |
127 | + | ||
128 | + | ||
129 | + | ||
130 | + | ||
131 | + |
modules/nani/serialize.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani serialize) | |
20 | + | #:use-module (rnrs bytevectors) | |
21 | + | #:export (merge-bvs | |
22 | + | serialize-list list-size | |
23 | + | serialize-pointer | |
24 | + | serialize-char char-size | |
25 | + | serialize-int int-size | |
26 | + | serialize-boolean boolean-size | |
27 | + | serialize-string string-size)) | |
28 | + | ||
29 | + | (define (merge-bvs bvs) | |
30 | + | (let* ((size (apply + (map bytevector-length bvs))) | |
31 | + | (bv (make-bytevector size 0))) | |
32 | + | (let loop ((bvs bvs) (pos 0)) | |
33 | + | (unless (null? bvs) | |
34 | + | (let ((sz (bytevector-length (car bvs)))) | |
35 | + | (bytevector-copy! (car bvs) 0 bv pos sz) | |
36 | + | (loop (cdr bvs) (+ pos sz))))) | |
37 | + | bv)) | |
38 | + | ||
39 | + | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
40 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
41 | + | (when size? | |
42 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
43 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
44 | + | (if (null? lst) | |
45 | + | pos | |
46 | + | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
47 | + | (define* (list-size lst size #:key (size? #t)) | |
48 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
49 | + | (apply + (if size? 2 0) (map size lst))) | |
50 | + | ||
51 | + | (define (serialize-pointer ptr pos bv) | |
52 | + | (bytevector-u8-set! bv pos (car ptr)) | |
53 | + | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
54 | + | (+ pos 5)) | |
55 | + | ||
56 | + | (define (serialize-char int pos bv) | |
57 | + | (bytevector-u8-set! bv pos int) | |
58 | + | (+ pos 1)) | |
59 | + | (define char-size (const 1)) | |
60 | + | ||
61 | + | (define (serialize-int int pos bv) | |
62 | + | (bytevector-u32-set! bv pos int (endianness big)) | |
63 | + | (+ pos 4)) | |
64 | + | (define int-size (const 4)) | |
65 | + | ||
66 | + | (define (serialize-boolean bool pos bv) | |
67 | + | (bytevector-u8-set! bv pos (if bool 1 0)) | |
68 | + | (+ pos 1)) | |
69 | + | (define boolean-size (const 1)) | |
70 | + | ||
71 | + | (define (serialize-string str pos bv) | |
72 | + | (let ((sbv (string->utf8 str))) | |
73 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
74 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
75 | + | (+ pos 1 (bytevector-length sbv)))) | |
76 | + | (define (string-size str) | |
77 | + | (let ((sbv (string->utf8 str))) | |
78 | + | (+ 1 (bytevector-length sbv)))) |
radicals.mk unknown status 1
1 | + | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
2 | + | ||
3 | + | dictionaries/kradzip.zip: | |
4 | + | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O $@ | |
5 | + | ||
6 | + | dictionaries/radkfilex: dictionaries/kradzip.zip | |
7 | + | unzip $^ $$(basename $@) -d $$(dirname $@) | |
8 | + | touch $@ | |
9 | + | ||
10 | + | dictionaries/radkfilex.utf8: dictionaries/radkfilex | |
11 | + | iconv -f euc-jp -t utf-8 $^ > $@ | |
12 | + | ||
13 | + | dictionaries/kanjidic2.xml.gz: | |
14 | + | wget http://www.edrdg.org/kanjidic/kanjidic2.xml.gz -O $@ | |
15 | + | ||
16 | + | dictionaries/kanjidic2.xml: dictionaries/kanjidic2.xml.gz | |
17 | + | gunzip $^ | |
18 | + | ||
19 | + | dicos/radicals.nani: tools/radk.scm dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $(RADK_MODULES) | |
20 | + | guile -L modules tools/radk.scm build dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml $@ |
tools/jmdict.scm
63 | 63 | (put-bytevector port | |
64 | 64 | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
65 | 65 | ||
66 | + | (define (print word dict) | |
67 | + | ()) | |
68 | + | ||
66 | 69 | (match (command-line) | |
67 | 70 | ((_ cmd input lang output) | |
68 | 71 | (cond | |
… | |||
72 | 75 | (compile input (lambda (sense) (equal? (sense-language sense) lang)) output))) | |
73 | 76 | ((equal? cmd "convert") | |
74 | 77 | (convert input output)) | |
75 | - | (else (format #t "Unknown cmd ~a.~%" cmd))))) | |
78 | + | (else (format #t "Unknown cmd ~a.~%" cmd)))) | |
79 | + | ((_ "print" word input) | |
80 | + | (print word input))) |
tools/radk.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (use-modules (nani radk)) | |
20 | + | (use-modules (ice-9 match)) | |
21 | + | (use-modules (ice-9 binary-ports)) | |
22 | + | ||
23 | + | (match (command-line) | |
24 | + | ((_ cmd radk-file kanjidic-file output) | |
25 | + | (cond | |
26 | + | ((equal? cmd "build") | |
27 | + | (let* ((radk (parse-radk radk-file)) | |
28 | + | (rad-stroke (get-rad-stroke radk)) | |
29 | + | (rad-kanji (get-rad-kanji radk)) | |
30 | + | (kanji-stroke (get-kanji-stroke kanjidic-file))) | |
31 | + | (call-with-output-file output | |
32 | + | (lambda (port) | |
33 | + | (put-bytevector port | |
34 | + | (serialize-radk rad-kanji rad-stroke kanji-stroke)))))) | |
35 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |