Rework dictionary generation
Makefile
1 | + | # keep po files, even if they are sometimes generated, and keep downloaded dictionaries around | |
1 | 2 | .PRECIOUS: po/%.po dictionaries/% | |
2 | 3 | ||
3 | 4 | all: site | |
4 | 5 | ||
6 | + | # To be filled by included files | |
7 | + | # DICOS is the list of generated dictionaries | |
8 | + | # DOWNLOADS is the list of downloaded files | |
5 | 9 | DICOS= | |
6 | 10 | DOWNLOADS= | |
7 | 11 | ||
8 | 12 | include radicals.mk | |
9 | 13 | include wadoku.mk | |
10 | 14 | include jmdict.mk | |
15 | + | include jibiki.mk | |
11 | 16 | ||
17 | + | # Files that constitute the website | |
12 | 18 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
13 | - | ||
14 | 19 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
15 | 20 | tools/i18n.scm tools/theme.scm | |
16 | - | ||
21 | + | SHA_DICOS=$(addsuffix .sha256, $(DICOS)) | |
17 | 22 | WEB_FILES= $(HAUNT_FILES) \ | |
18 | - | $(shell find css) $(shell find images) $(DICOS) $(addsuffix .sha256, $(DICOS)) \ | |
23 | + | $(shell find css) $(shell find images) $(DICOS) $(SHA_DICOS) \ | |
19 | 24 | dicos/list | |
20 | 25 | ||
26 | + | # Guile modules used to build dictionaries | |
21 | 27 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
22 | 28 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
23 | 29 | modules/nani/jmdict/entities.scm modules/nani/frequency.scm \ | |
24 | 30 | modules/nani/serialize.scm | |
25 | 31 | ||
32 | + | # Available languages | |
26 | 33 | LANGS=fr uk | |
27 | 34 | MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) | |
28 | 35 | ||
36 | + | dicos: $(DICOS) $(SHA_DICOS) dicos/list | |
37 | + | ||
29 | 38 | site: $(MOFILES) $(WEB_FILES) | |
30 | 39 | haunt build | |
31 | 40 | rm -rf public.bak |
jibiki.mk unknown status 1
1 | + | DICOS+=dicos/jibiki_fre.nani | |
2 | + | DOWNLOADS+=dictionaries/jibiki.xml | |
3 | + | ||
4 | + | dictionaries/jibiki.xml: | |
5 | + | wget https://jibiki.fr/data/Jibiki.fr/jibiki.fr_jpn_fra.xml.gz -O $@.gz | |
6 | + | gunzip $@.gz | |
7 | + | ||
8 | + | dicos/jibiki_fre.nani: dictionaries/jibiki.xml tools/jibiki.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
9 | + | guile -L modules tools/jibiki.scm build $< $@ |
jmdict.mk
22 | 22 | ||
23 | 23 | dicos/JMdict_%.nani: dictionaries/JMdict.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
24 | 24 | guile -L modules tools/jmdict.scm build \ | |
25 | - | $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@ | |
25 | + | $< $(shell basename $@ .nani | sed 's|^JMdict_||g') $@ | |
26 | 26 | ||
27 | 27 | dicos/JMdict_e.nani: dictionaries/JMdict_e.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
28 | 28 | guile -L modules tools/jmdict.scm build $< e $@ |
modules/nani/encoding/huffman.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani encoding huffman) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (rnrs bytevectors) | |
22 | + | #:export (create-huffman | |
23 | + | huffman->code | |
24 | + | huffman-encode | |
25 | + | huffman-decode | |
26 | + | serialize-huffman | |
27 | + | serialize-huffman-string huffman-string-size)) | |
28 | + | ||
29 | + | (define (add-occurence occ char) | |
30 | + | (let* ((o (assoc-ref occ char)) | |
31 | + | (o (+ (if o o 0) 1))) | |
32 | + | (assoc-set! occ char o))) | |
33 | + | ||
34 | + | (define (add-occurences occ str) | |
35 | + | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
36 | + | (if (null? lst) | |
37 | + | occ | |
38 | + | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
39 | + | ||
40 | + | (define (get-occurences string-list) | |
41 | + | (let loop ((lst string-list) (occ '())) | |
42 | + | (if (null? lst) | |
43 | + | occ | |
44 | + | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
45 | + | ||
46 | + | (define (order tree) | |
47 | + | (if (null? tree) | |
48 | + | tree | |
49 | + | (add-in-order (car tree) (order (cdr tree))))) | |
50 | + | ||
51 | + | (define (add-in-order elem tree) | |
52 | + | (if (null? tree) | |
53 | + | (list elem) | |
54 | + | (let ((elem-pos (cdr elem)) | |
55 | + | (tree-pos (cdr (car tree)))) | |
56 | + | (if (> elem-pos tree-pos) | |
57 | + | (cons (car tree) (add-in-order elem (cdr tree))) | |
58 | + | (cons elem tree))))) | |
59 | + | ||
60 | + | (define (create-huffman string-list) | |
61 | + | (let ((total (apply + (map string-length string-list))) | |
62 | + | (occurences (get-occurences string-list))) | |
63 | + | (let loop ((tree (order occurences))) | |
64 | + | (if (equal? (length tree) 1) | |
65 | + | tree | |
66 | + | (let* ((e1 (car tree)) | |
67 | + | (tree (cdr tree)) | |
68 | + | (e2 (car tree)) | |
69 | + | (tree (cdr tree))) | |
70 | + | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
71 | + | ||
72 | + | (define (huffman->code huffman) | |
73 | + | (match huffman | |
74 | + | ('() '()) | |
75 | + | ((((h1 h2) . weight)) | |
76 | + | (append | |
77 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
78 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
79 | + | (((h1 h2) . weight) | |
80 | + | (append | |
81 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
82 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
83 | + | ((((? char? char) . weight)) | |
84 | + | (list (cons char '()))) | |
85 | + | (((? char? char) . weight) | |
86 | + | (list (cons char '()))))) | |
87 | + | ||
88 | + | (define (bits->bytes b) | |
89 | + | (match b | |
90 | + | ('() '()) | |
91 | + | ((a b c d e f g h bs ...) | |
92 | + | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
93 | + | (bits->bytes bs))) | |
94 | + | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
95 | + | ||
96 | + | (define (huffman-encode code str) | |
97 | + | (u8-list->bytevector | |
98 | + | (bits->bytes | |
99 | + | (apply append (map (lambda (c) (or (assoc-ref code c) (throw "not in huffman code" c))) | |
100 | + | (append (string->list str) (list #\nul))))))) | |
101 | + | ||
102 | + | (define (byte->bits b) | |
103 | + | (let loop ((i 2) (b b) (by '())) | |
104 | + | (if (eq? i 512) | |
105 | + | by | |
106 | + | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
107 | + | ||
108 | + | (define (huffman-decode huffman bv) | |
109 | + | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
110 | + | (list->string | |
111 | + | (reverse | |
112 | + | (let loop ((seq seq) (result '()) (huff huffman)) | |
113 | + | (if (null? seq) | |
114 | + | result | |
115 | + | (match huff | |
116 | + | ((((h1 h2) . weight)) | |
117 | + | (if (= (car seq) 0) | |
118 | + | (loop (cdr seq) result h1) | |
119 | + | (loop (cdr seq) result h2))) | |
120 | + | (((h1 h2) . weight) | |
121 | + | (if (= (car seq) 0) | |
122 | + | (loop (cdr seq) result h1) | |
123 | + | (loop (cdr seq) result h2))) | |
124 | + | ((((? char? char) . weight)) | |
125 | + | (if (equal? char #\nul) | |
126 | + | result | |
127 | + | (loop seq (cons char result) huffman))) | |
128 | + | (((? char? char) . weight) | |
129 | + | (if (equal? char #\nul) | |
130 | + | result | |
131 | + | (loop seq (cons char result) huffman)))))))))) | |
132 | + | ||
133 | + | (define (serialize-huffman huffman) | |
134 | + | (define (serialize huffman) | |
135 | + | (match huffman | |
136 | + | ((((h1 h2) . weight)) | |
137 | + | (append '(1) (serialize h1) (serialize h2))) | |
138 | + | (((h1 h2) . weight) | |
139 | + | (append '(1) (serialize h1) (serialize h2))) | |
140 | + | ((((? char? char) . weight)) | |
141 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
142 | + | (((? char? char) . weight) | |
143 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
144 | + | (u8-list->bytevector (serialize huffman))) | |
145 | + | ||
146 | + | (define (serialize-huffman-string huffman-code) | |
147 | + | (lambda (str pos bv) | |
148 | + | (let ((sbv (huffman-encode huffman-code str))) | |
149 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
150 | + | (+ pos (bytevector-length sbv))))) | |
151 | + | ||
152 | + | (define (huffman-string-size huffman-code) | |
153 | + | (lambda (str) | |
154 | + | (let ((sbv (huffman-encode huffman-code str))) | |
155 | + | (+ (bytevector-length sbv))))) |
modules/nani/encoding/parse.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani encoding parse) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (rnrs bytevectors) | |
22 | + | #:export (parse-list | |
23 | + | parse-char | |
24 | + | parse-int | |
25 | + | parse-boolean | |
26 | + | parse-string)) | |
27 | + | ||
28 | + | (define* (parse-list parse-element #:key (size #f)) | |
29 | + | (lambda (port) | |
30 | + | (let ((size (or size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big))))) | |
31 | + | (let loop ((result '()) (remaining size)) | |
32 | + | (if (= remaining 0) | |
33 | + | (reverse result) | |
34 | + | (loop (cons (parse-element port) result) (- remaining 1))))))) | |
35 | + | ||
36 | + | (define (parse-char port) | |
37 | + | (get-u8 port)) | |
38 | + | ||
39 | + | (define (parse-int port) | |
40 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))) | |
41 | + | ||
42 | + | (define (parse-boolean port) | |
43 | + | (= (get-u8 port) 1)) | |
44 | + | ||
45 | + | (define (parse-string port) | |
46 | + | (define (get-result-string port) | |
47 | + | (let loop ((lu8 '()) (char (get-u8 port))) | |
48 | + | (if (= char 0) | |
49 | + | lu8 | |
50 | + | (loop (cons char lu8) (get-u8 port))))) | |
51 | + | (utf8->string (u8-list->bytevector (reverse (get-result-string port))))) |
modules/nani/encoding/serialize.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani encoding serialize) | |
20 | + | #:use-module (rnrs bytevectors) | |
21 | + | #:export (merge-bvs | |
22 | + | serialize-list list-size | |
23 | + | serialize-char char-size | |
24 | + | serialize-u16 u16-size | |
25 | + | serialize-int int-size | |
26 | + | serialize-boolean boolean-size | |
27 | + | serialize-string string-size)) | |
28 | + | ||
29 | + | ;;; merge-bvs bvs | |
30 | + | ;;; Returns a bytevector that contains the data from `bvs`, a list of bytevectors. For instance, | |
31 | + | ;;; (merge-bvs '(#u8(1 2) #u8(3 4))) gives #u8(1 2 3 4). | |
32 | + | ;;; | |
33 | + | ;;; Serializers and Sizers | |
34 | + | ;;; ---------------------- | |
35 | + | ;;; | |
36 | + | ;;; A serializer is a procedure that takes a value to serialize, a position at which to serialize | |
37 | + | ;;; the value, and a bytevector in which to serialize: `serialize val pos bv`. | |
38 | + | ;;; | |
39 | + | ;;; A sizer is a procedure that takes a value to serialize, and returns the expected size it will | |
40 | + | ;;; take in the serialization. | |
41 | + | ;;; | |
42 | + | ;;; Common Serialization Functions | |
43 | + | ;;; ------------------------------ | |
44 | + | ;;; | |
45 | + | ;;; serialize-list serialize-element [#:size? #t] | |
46 | + | ;;; Returns a serializer for a list of elements. `serialize-element` is the serializer function for | |
47 | + | ;;; an element. When `#:size` is `#t`, add the size of the list to the serialization, as a 16-bits | |
48 | + | ;;; unsigned integer. | |
49 | + | ;;; | |
50 | + | ;;; list-size element-size [#:size? #t] | |
51 | + | ;;; Returns a sizer for lists, where elements are sized by `element-size`. | |
52 | + | ||
53 | + | (define (merge-bvs bvs) | |
54 | + | (let* ((size (apply + (map bytevector-length bvs))) | |
55 | + | (bv (make-bytevector size 0))) | |
56 | + | (let loop ((bvs bvs) (pos 0)) | |
57 | + | (unless (null? bvs) | |
58 | + | (let ((sz (bytevector-length (car bvs)))) | |
59 | + | (bytevector-copy! (car bvs) 0 bv pos sz) | |
60 | + | (loop (cdr bvs) (+ pos sz))))) | |
61 | + | bv)) | |
62 | + | ||
63 | + | (define* (serialize-list serialize-element #:key (size? #t)) | |
64 | + | (lambda (lst pos bv) | |
65 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
66 | + | (when size? | |
67 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
68 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
69 | + | (if (null? lst) | |
70 | + | pos | |
71 | + | (loop (cdr lst) (serialize-element (car lst) pos bv)))))) | |
72 | + | (define* (list-size element-size #:key (size? #t)) | |
73 | + | (lambda (lst) | |
74 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
75 | + | (apply + (if size? 2 0) (map element-size lst)))) | |
76 | + | ||
77 | + | (define (serialize-char int pos bv) | |
78 | + | (bytevector-u8-set! bv pos int) | |
79 | + | (+ pos 1)) | |
80 | + | (define char-size (const 1)) | |
81 | + | ||
82 | + | (define (serialize-u16 int pos bv) | |
83 | + | (bytevector-u16-set! bv pos int (endianness big)) | |
84 | + | (+ pos 2)) | |
85 | + | (define u16-size (const 2)) | |
86 | + | ||
87 | + | (define (serialize-int int pos bv) | |
88 | + | (bytevector-u32-set! bv pos int (endianness big)) | |
89 | + | (+ pos 4)) | |
90 | + | (define int-size (const 4)) | |
91 | + | ||
92 | + | (define (serialize-boolean bool pos bv) | |
93 | + | (bytevector-u8-set! bv pos (if bool 1 0)) | |
94 | + | (+ pos 1)) | |
95 | + | (define boolean-size (const 1)) | |
96 | + | ||
97 | + | (define (serialize-string str pos bv) | |
98 | + | (let ((sbv (string->utf8 str))) | |
99 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
100 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
101 | + | (+ pos 1 (bytevector-length sbv)))) | |
102 | + | (define (string-size str) | |
103 | + | (let ((sbv (string->utf8 str))) | |
104 | + | (+ 1 (bytevector-length sbv)))) |
modules/nani/encoding/trie.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani encoding trie) | |
20 | + | #:use-module (nani encoding serialize) | |
21 | + | #:use-module (rnrs bytevectors) | |
22 | + | #:use-module (srfi srfi-9) | |
23 | + | #:export (make-trie | |
24 | + | trie? | |
25 | + | trie-position | |
26 | + | trie-position-set! | |
27 | + | trie-vals | |
28 | + | trie-vals-set! | |
29 | + | trie-transitions | |
30 | + | trie-transitions-set! | |
31 | + | ||
32 | + | make-empty-trie | |
33 | + | add-to-trie! | |
34 | + | compress-trie | |
35 | + | ||
36 | + | serialize-trie | |
37 | + | serialize-trie-single | |
38 | + | trie-size | |
39 | + | trie-size-single)) | |
40 | + | ||
41 | + | (define-record-type trie | |
42 | + | (make-trie position vals transitions) | |
43 | + | trie? | |
44 | + | (position trie-position trie-position-set!) ; integer | |
45 | + | (vals trie-vals trie-vals-set!) ; list | |
46 | + | (transitions trie-transitions trie-transitions-set!)) ; array or alist | |
47 | + | ||
48 | + | (define (make-empty-trie) | |
49 | + | (make-trie 0 '() (make-array #f 16))) | |
50 | + | ||
51 | + | (define (add-to-trie! trie key value) | |
52 | + | (if (null? key) | |
53 | + | (trie-vals-set! trie (cons value (trie-vals trie))) | |
54 | + | (let ((next-trie (array-ref (trie-transitions trie) (car key)))) | |
55 | + | (if next-trie | |
56 | + | (add-to-trie! next-trie (cdr key) value) | |
57 | + | (let ((next-trie (make-empty-trie))) | |
58 | + | (array-set! (trie-transitions trie) next-trie (car key)) | |
59 | + | (add-to-trie! next-trie (cdr key) value)))))) | |
60 | + | ||
61 | + | (define (convert-trie-transitions! trie) | |
62 | + | (define (get-new-transitions transitions) | |
63 | + | (let loop ((i 0) (tr '())) | |
64 | + | (if (= i 16) | |
65 | + | tr | |
66 | + | (let ((elem (array-ref transitions i))) | |
67 | + | (if elem | |
68 | + | (begin | |
69 | + | (convert-trie-transitions! elem) | |
70 | + | (loop (+ i 1) (cons (cons i elem) tr))) | |
71 | + | (loop (+ i 1) tr)))))) | |
72 | + | (let* ((transitions (trie-transitions trie)) | |
73 | + | (transitions (get-new-transitions transitions))) | |
74 | + | (trie-transitions-set! trie transitions))) | |
75 | + | ||
76 | + | (define (compress-trie trie) | |
77 | + | (define (compress-aux trie) | |
78 | + | (make-trie | |
79 | + | (trie-position trie) | |
80 | + | (trie-vals trie) | |
81 | + | (apply append | |
82 | + | (map | |
83 | + | (lambda (tr) | |
84 | + | (let ((trie (cdr tr))) | |
85 | + | (map | |
86 | + | (lambda (tr2) | |
87 | + | (cons (+ (car tr2) (* 16 (car tr))) | |
88 | + | (compress-aux (cdr tr2)))) | |
89 | + | (trie-transitions trie)))) | |
90 | + | (trie-transitions trie))))) | |
91 | + | (convert-trie-transitions! trie) | |
92 | + | (compress-aux trie)) | |
93 | + | ||
94 | + | (define (pointer-size ptr) | |
95 | + | 5) | |
96 | + | ||
97 | + | (define (serialize-pointer ptr pos bv) | |
98 | + | (bytevector-u8-set! bv pos (car ptr)) | |
99 | + | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
100 | + | (+ pos 5)) | |
101 | + | ||
102 | + | (define (serialize-trie-single serialize-value value-size) | |
103 | + | (define (serialize-trie-aux transitions pos) | |
104 | + | (let loop ((pos pos) | |
105 | + | (trs transitions) | |
106 | + | (bvs '())) | |
107 | + | (if (null? trs) | |
108 | + | (cons pos bvs) | |
109 | + | (let* ((next-trie (cdr (car trs))) | |
110 | + | (bv (get-trie-bv next-trie pos)) | |
111 | + | (pos (car bv)) | |
112 | + | (bv (cdr bv))) | |
113 | + | (loop pos (cdr trs) (append bvs bv)))))) | |
114 | + | ||
115 | + | (define (get-trie-bv trie pos) | |
116 | + | (trie-position-set! trie pos) | |
117 | + | (let* ((vals-sz (value-size (trie-vals trie))) | |
118 | + | (trs-sz ((list-size (const 5) #:size? #f) (trie-transitions trie))) | |
119 | + | (sz (+ vals-sz 1 trs-sz)) | |
120 | + | (bv (make-bytevector sz 0))) | |
121 | + | (serialize-value (trie-vals trie) 0 bv) | |
122 | + | (let* ((bvs (serialize-trie-aux | |
123 | + | (trie-transitions trie) | |
124 | + | (+ pos sz))) | |
125 | + | (next-pos (car bvs)) | |
126 | + | (bvs (cdr bvs))) | |
127 | + | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
128 | + | ((serialize-list serialize-pointer #:size? #f) | |
129 | + | (trie-transitions trie) (+ vals-sz 1) bv) | |
130 | + | (cons next-pos (cons bv bvs))))) | |
131 | + | ||
132 | + | (lambda (trie pos bv) | |
133 | + | (let* ((trie-bv (get-trie-bv trie pos)) | |
134 | + | (new-pos (car trie-bv)) | |
135 | + | (trie-bv (merge-bvs (cdr trie-bv)))) | |
136 | + | (bytevector-copy! trie-bv 0 bv pos (bytevector-length trie-bv)) | |
137 | + | new-pos))) | |
138 | + | ||
139 | + | (define (serialize-trie serialize-value value-size) | |
140 | + | (serialize-trie-single (serialize-list serialize-value) (list-size value-size))) | |
141 | + | ||
142 | + | (define (trie-size-single value-size) | |
143 | + | (lambda (trie) | |
144 | + | (apply + | |
145 | + | (value-size (trie-vals trie)) | |
146 | + | 1 | |
147 | + | ((list-size pointer-size #:size? #f) (trie-transitions trie)) | |
148 | + | (map (lambda (trie) ((trie-size-single value-size) trie)) | |
149 | + | (map cdr (trie-transitions trie)))))) | |
150 | + | ||
151 | + | (define (trie-size value-size) | |
152 | + | (trie-size-single (list-size value-size))) |
modules/nani/frequency.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani frequency) | |
20 | - | #:use-module (ice-9 rdelim) | |
21 | - | #:export (load-frequency | |
22 | - | frequency-entity)) | |
23 | - | ||
24 | - | (define (load-frequency file) | |
25 | - | (call-with-input-file file | |
26 | - | (lambda (port) | |
27 | - | (let loop ((frq '()) (i 1)) | |
28 | - | (let* ((line (%read-line port)) | |
29 | - | (line (car line))) | |
30 | - | (if (eof-object? line) | |
31 | - | frq | |
32 | - | (let ((content (string-split line #\tab))) | |
33 | - | (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1))))))))) | |
34 | - | ||
35 | - | (define (frequency-entity frq word) | |
36 | - | (let ((freq (assoc-ref frq word))) | |
37 | - | (cond | |
38 | - | ((not freq) | |
39 | - | #f) | |
40 | - | ((< freq 501) | |
41 | - | "frq500") | |
42 | - | ((< freq 1001) | |
43 | - | "frq1000") | |
44 | - | ((< freq 2001) | |
45 | - | "frq2000") | |
46 | - | ((< freq 5001) | |
47 | - | "frq1000") | |
48 | - | ((< freq 10001) | |
49 | - | "frq10000") | |
50 | - | ((< freq 20001) | |
51 | - | "frq20000") | |
52 | - | (else #f)))) |
modules/nani/huffman.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani huffman) | |
20 | - | #:use-module (ice-9 match) | |
21 | - | #:use-module (rnrs bytevectors) | |
22 | - | #:export (create-huffman | |
23 | - | huffman->code | |
24 | - | huffman-encode | |
25 | - | huffman-decode | |
26 | - | serialize-huffman | |
27 | - | serialize-huffman-string huffman-string-size)) | |
28 | - | ||
29 | - | (define (add-occurence occ char) | |
30 | - | (let* ((o (assoc-ref occ char)) | |
31 | - | (o (+ (if o o 0) 1))) | |
32 | - | (assoc-set! occ char o))) | |
33 | - | ||
34 | - | (define (add-occurences occ str) | |
35 | - | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
36 | - | (if (null? lst) | |
37 | - | occ | |
38 | - | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
39 | - | ||
40 | - | (define (get-occurences string-list) | |
41 | - | (let loop ((lst string-list) (occ '())) | |
42 | - | (if (null? lst) | |
43 | - | occ | |
44 | - | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
45 | - | ||
46 | - | (define (order tree) | |
47 | - | (if (null? tree) | |
48 | - | tree | |
49 | - | (add-in-order (car tree) (order (cdr tree))))) | |
50 | - | ||
51 | - | (define (add-in-order elem tree) | |
52 | - | (if (null? tree) | |
53 | - | (list elem) | |
54 | - | (let ((elem-pos (cdr elem)) | |
55 | - | (tree-pos (cdr (car tree)))) | |
56 | - | (if (> elem-pos tree-pos) | |
57 | - | (cons (car tree) (add-in-order elem (cdr tree))) | |
58 | - | (cons elem tree))))) | |
59 | - | ||
60 | - | (define (create-huffman string-list) | |
61 | - | (let ((total (apply + (map string-length string-list))) | |
62 | - | (occurences (get-occurences string-list))) | |
63 | - | (let loop ((tree (order occurences))) | |
64 | - | (if (equal? (length tree) 1) | |
65 | - | tree | |
66 | - | (let* ((e1 (car tree)) | |
67 | - | (tree (cdr tree)) | |
68 | - | (e2 (car tree)) | |
69 | - | (tree (cdr tree))) | |
70 | - | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
71 | - | ||
72 | - | (define (huffman->code huffman) | |
73 | - | (match huffman | |
74 | - | ('() '()) | |
75 | - | ((((h1 h2) . weight)) | |
76 | - | (append | |
77 | - | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
78 | - | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
79 | - | (((h1 h2) . weight) | |
80 | - | (append | |
81 | - | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
82 | - | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
83 | - | ((((? char? char) . weight)) | |
84 | - | (list (cons char '()))) | |
85 | - | (((? char? char) . weight) | |
86 | - | (list (cons char '()))))) | |
87 | - | ||
88 | - | (define (bits->bytes b) | |
89 | - | (match b | |
90 | - | ('() '()) | |
91 | - | ((a b c d e f g h bs ...) | |
92 | - | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
93 | - | (bits->bytes bs))) | |
94 | - | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
95 | - | ||
96 | - | (define (huffman-encode code str) | |
97 | - | (u8-list->bytevector | |
98 | - | (bits->bytes | |
99 | - | (apply append (map (lambda (c) (assoc-ref code c)) | |
100 | - | (append (string->list str) (list #\nul))))))) | |
101 | - | ||
102 | - | (define (byte->bits b) | |
103 | - | (let loop ((i 2) (b b) (by '())) | |
104 | - | (if (eq? i 512) | |
105 | - | by | |
106 | - | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
107 | - | ||
108 | - | (define (huffman-decode huffman bv) | |
109 | - | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
110 | - | (list->string | |
111 | - | (reverse | |
112 | - | (let loop ((seq seq) (result '()) (huff huffman)) | |
113 | - | (if (null? seq) | |
114 | - | result | |
115 | - | (match huff | |
116 | - | ((((h1 h2) . weight)) | |
117 | - | (if (= (car seq) 0) | |
118 | - | (loop (cdr seq) result h1) | |
119 | - | (loop (cdr seq) result h2))) | |
120 | - | (((h1 h2) . weight) | |
121 | - | (if (= (car seq) 0) | |
122 | - | (loop (cdr seq) result h1) | |
123 | - | (loop (cdr seq) result h2))) | |
124 | - | ((((? char? char) . weight)) | |
125 | - | (if (equal? char #\nul) | |
126 | - | result | |
127 | - | (loop seq (cons char result) huffman))) | |
128 | - | (((? char? char) . weight) | |
129 | - | (if (equal? char #\nul) | |
130 | - | result | |
131 | - | (loop seq (cons char result) huffman)))))))))) | |
132 | - | ||
133 | - | (define (serialize-huffman huffman) | |
134 | - | (define (serialize huffman) | |
135 | - | (match huffman | |
136 | - | ((((h1 h2) . weight)) | |
137 | - | (append '(1) (serialize h1) (serialize h2))) | |
138 | - | (((h1 h2) . weight) | |
139 | - | (append '(1) (serialize h1) (serialize h2))) | |
140 | - | ((((? char? char) . weight)) | |
141 | - | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
142 | - | (((? char? char) . weight) | |
143 | - | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
144 | - | (u8-list->bytevector (serialize huffman))) | |
145 | - | ||
146 | - | (define (serialize-huffman-string huffman-code) | |
147 | - | (lambda (str pos bv) | |
148 | - | (let ((sbv (huffman-encode huffman-code str))) | |
149 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
150 | - | (+ pos (bytevector-length sbv))))) | |
151 | - | ||
152 | - | (define (huffman-string-size huffman-code) | |
153 | - | (lambda (str) | |
154 | - | (let ((sbv (huffman-encode huffman-code str))) | |
155 | - | (+ (bytevector-length sbv))))) |
modules/nani/jmdict/entities.scm unknown status 2
1 | - | (define-module (nani jmdict entities) | |
2 | - | #:use-module (nani result) | |
3 | - | #:use-module (nani tags) | |
4 | - | #:re-export (get-tag get-points)) |
modules/nani/jmdict/serialize.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani jmdict serialize) | |
20 | - | #:use-module (nani huffman) | |
21 | - | #:use-module (nani result) | |
22 | - | #:use-module (nani serialize) | |
23 | - | #:use-module (nani trie) | |
24 | - | #:use-module (ice-9 binary-ports) | |
25 | - | #:use-module (rnrs bytevectors) | |
26 | - | #:export (serialize-jmdict | |
27 | - | jmdict-entry-count)) | |
28 | - | ||
29 | - | (define (serialize-pointer ptr pos bv) | |
30 | - | (bytevector-u8-set! bv pos (car ptr)) | |
31 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
32 | - | (+ pos 5)) | |
33 | - | ||
34 | - | (define (serialize-trie trie results pos) | |
35 | - | (define (serialize-trie-aux transitions pos) | |
36 | - | (let loop ((pos pos) | |
37 | - | (trs transitions) | |
38 | - | (bvs '())) | |
39 | - | (if (null? trs) | |
40 | - | (cons pos bvs) | |
41 | - | (let* ((next-trie (cdr (car trs))) | |
42 | - | (bv (serialize-trie next-trie results pos)) | |
43 | - | (pos (car bv)) | |
44 | - | (bv (cdr bv))) | |
45 | - | (loop pos (cdr trs) (append bvs bv)))))) | |
46 | - | (trie-position-set! trie pos) | |
47 | - | (let* ((vals-sz (list-size (trie-vals trie) int-size)) | |
48 | - | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
49 | - | (sz (+ vals-sz 1 trs-sz)) | |
50 | - | (bv (make-bytevector sz 0))) | |
51 | - | (serialize-list (map (lambda (pos) (result-position (array-ref results pos))) | |
52 | - | (trie-vals trie)) | |
53 | - | serialize-int 0 bv) | |
54 | - | (let* ((bvs (serialize-trie-aux | |
55 | - | (trie-transitions trie) | |
56 | - | (+ pos (bytevector-length bv)))) | |
57 | - | (next-pos (car bvs)) | |
58 | - | (bvs (cdr bvs))) | |
59 | - | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
60 | - | (serialize-list (trie-transitions trie) serialize-pointer | |
61 | - | (+ vals-sz 1) bv #:size? #f) | |
62 | - | (cons next-pos (cons bv bvs))))) | |
63 | - | ||
64 | - | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
65 | - | (define kanji-huffman | |
66 | - | (let ((kanjis (apply append (map result-kanjis results)))) | |
67 | - | (create-huffman kanjis))) | |
68 | - | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
69 | - | (define reading-huffman | |
70 | - | (let* ((readings (apply append (map result-readings results))) | |
71 | - | (readings (apply append (map reading-readings readings)))) | |
72 | - | (create-huffman readings))) | |
73 | - | (define reading-huffman-code (huffman->code reading-huffman)) | |
74 | - | (define sense-huffman | |
75 | - | (let* ((senses (apply append (map result-senses results))) | |
76 | - | (glosses (apply append (map sense-glosses senses)))) | |
77 | - | (create-huffman glosses))) | |
78 | - | (define sense-huffman-code (huffman->code sense-huffman)) | |
79 | - | ||
80 | - | (define (serialize-source source pos bv) | |
81 | - | (when (not (source? source)) (throw 'not-source source)) | |
82 | - | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
83 | - | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
84 | - | (pos (serialize-string (source-type source) pos bv)) | |
85 | - | (pos (serialize-string (source-lang source) pos bv))) | |
86 | - | pos)) | |
87 | - | (define (source-size source) | |
88 | - | (when (not (source? source)) (throw 'not-source source)) | |
89 | - | (+ (list-size (source-content source) string-size) | |
90 | - | (boolean-size (source-wasei? source)) | |
91 | - | (string-size (source-type source)) | |
92 | - | (string-size (source-lang source)))) | |
93 | - | ||
94 | - | (define (serialize-reading reading pos bv) | |
95 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
96 | - | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
97 | - | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
98 | - | (pos (serialize-list (reading-readings reading) | |
99 | - | (serialize-huffman-string reading-huffman-code) pos bv))) | |
100 | - | pos)) | |
101 | - | (define (reading-size reading) | |
102 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
103 | - | (+ (list-size (reading-kanjis reading) string-size) | |
104 | - | (list-size (reading-info reading) string-size) | |
105 | - | (list-size (reading-readings reading) (huffman-string-size reading-huffman-code)))) | |
106 | - | ||
107 | - | (define (serialize-sense sense pos bv) | |
108 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
109 | - | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
110 | - | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
111 | - | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
112 | - | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
113 | - | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
114 | - | (pos (serialize-list (sense-glosses sense) | |
115 | - | (serialize-huffman-string sense-huffman-code) pos bv)) | |
116 | - | (pos (serialize-string (sense-language sense) pos bv))) | |
117 | - | pos)) | |
118 | - | (define (sense-size sense) | |
119 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
120 | - | (+ (list-size (sense-references sense) string-size) | |
121 | - | (list-size (sense-limits sense) string-size) | |
122 | - | (list-size (sense-infos sense) string-size) | |
123 | - | (list-size (sense-sources sense) source-size) | |
124 | - | (list-size (sense-tags sense) char-size) | |
125 | - | (list-size (sense-glosses sense) | |
126 | - | (huffman-string-size sense-huffman-code)) | |
127 | - | (string-size (sense-language sense)))) | |
128 | - | ||
129 | - | (define (serialize-result result pos bv) | |
130 | - | (when (not (result? result)) (throw 'not-result result)) | |
131 | - | (result-position-set! result pos) | |
132 | - | (let* ((pos (serialize-list (result-kanjis result) | |
133 | - | (serialize-huffman-string kanji-huffman-code) pos bv)) | |
134 | - | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
135 | - | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
136 | - | pos)) | |
137 | - | (define (result-size result) | |
138 | - | (when (not (result? result)) (throw 'not-result result)) | |
139 | - | (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code)) | |
140 | - | (list-size (result-readings result) reading-size) | |
141 | - | (list-size (result-senses result) sense-size))) | |
142 | - | ||
143 | - | (define (trie-node-size trie) | |
144 | - | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
145 | - | ||
146 | - | (let* ((header (string->utf8 "NANI_JMDICT001")) | |
147 | - | (header-size (bytevector-length header)) | |
148 | - | (pointers (make-bytevector 12 0)) | |
149 | - | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
150 | - | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
151 | - | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
152 | - | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
153 | - | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
154 | - | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
155 | - | (results-size (list-size results result-size #:size? #f)) | |
156 | - | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
157 | - | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
158 | - | reading-huffman-size sense-huffman-size | |
159 | - | results-size 4)))) | |
160 | - | (format #t "Number of nodes in kanjis: ~a~%" | |
161 | - | (trie-node-size kanji-trie)) | |
162 | - | (format #t "Number of nodes in readings: ~a~%" | |
163 | - | (trie-node-size reading-trie)) | |
164 | - | (format #t "Number of nodes in senses: ~a~%" | |
165 | - | (trie-node-size sense-trie)) | |
166 | - | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
167 | - | results-bv #:size? #f) | |
168 | - | ;; number of entries | |
169 | - | (serialize-int (length results) (+ header-size 12 huffman-size results-size) | |
170 | - | results-bv) | |
171 | - | (let* ((results (list->array 1 results)) | |
172 | - | (pos (bytevector-length results-bv)) | |
173 | - | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
174 | - | (pos (car kanji-bvs)) | |
175 | - | (reading-bvs (serialize-trie reading-trie results pos)) | |
176 | - | (pos (car reading-bvs)) | |
177 | - | (meaning-bvs (serialize-trie sense-trie results pos))) | |
178 | - | (bytevector-u32-set! pointers 0 (bytevector-length results-bv) (endianness big)) | |
179 | - | (bytevector-u32-set! pointers 4 (car kanji-bvs) (endianness big)) | |
180 | - | (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big)) | |
181 | - | (bytevector-copy! header 0 results-bv 0 header-size) | |
182 | - | (bytevector-copy! pointers 0 results-bv header-size 12) | |
183 | - | (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size) | |
184 | - | (bytevector-copy! reading-huffman-bv 0 results-bv | |
185 | - | (+ header-size 12 kanji-huffman-size) | |
186 | - | reading-huffman-size) | |
187 | - | (bytevector-copy! sense-huffman-bv 0 results-bv | |
188 | - | (+ header-size 12 kanji-huffman-size reading-huffman-size) | |
189 | - | sense-huffman-size) | |
190 | - | ;; give some feedback on the size of file's structures | |
191 | - | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
192 | - | (format #t "results is ~a bytes long~%" (bytevector-length results-bv)) | |
193 | - | (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs)))) | |
194 | - | (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs)))) | |
195 | - | (format #t "senses is ~a bytes long~%" (apply + (map bytevector-length (cdr meaning-bvs)))) | |
196 | - | (merge-bvs (append (list results-bv) (cdr kanji-bvs) (cdr reading-bvs) | |
197 | - | (cdr meaning-bvs)))))) | |
198 | - | ||
199 | - | (define (jmdict-entry-count file) | |
200 | - | (call-with-input-file file | |
201 | - | (lambda (port) | |
202 | - | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
203 | - | (pointers (get-bytevector-n port 12)) | |
204 | - | (kanji-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
205 | - | (seek port (- kanji-pos 4) SEEK_SET) | |
206 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) |
modules/nani/jmdict/trie.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani jmdict trie) | |
20 | - | #:use-module (nani trie) | |
21 | - | #:use-module (nani result) | |
22 | - | #:use-module (rnrs bytevectors) | |
23 | - | #:export (make-kanji-trie | |
24 | - | make-reading-trie | |
25 | - | make-meaning-trie)) | |
26 | - | ||
27 | - | (define (make-key key) | |
28 | - | (apply append | |
29 | - | (map | |
30 | - | (lambda (c) | |
31 | - | (list (quotient c 16) (modulo c 16))) | |
32 | - | (bytevector->u8-list (string->utf8 key))))) | |
33 | - | ||
34 | - | (define (make-kanji-trie results) | |
35 | - | (let ((trie (make-empty-trie))) | |
36 | - | (let loop ((results results) (i 0)) | |
37 | - | (if (null? results) | |
38 | - | trie | |
39 | - | (begin | |
40 | - | (for-each | |
41 | - | (lambda (key) | |
42 | - | (add-to-trie! trie (make-key key) i)) | |
43 | - | (result-kanjis (car results))) | |
44 | - | (loop (cdr results) (+ i 1))))))) | |
45 | - | ||
46 | - | (define (make-reading-trie results) | |
47 | - | (let ((trie (make-empty-trie))) | |
48 | - | (let loop ((results results) (i 0)) | |
49 | - | (if (null? results) | |
50 | - | trie | |
51 | - | (begin | |
52 | - | (for-each | |
53 | - | (lambda (reading) | |
54 | - | (for-each | |
55 | - | (lambda (key) | |
56 | - | (add-to-trie! trie (make-key key) i)) | |
57 | - | (reading-readings reading))) | |
58 | - | (result-readings (car results))) | |
59 | - | (loop (cdr results) (+ i 1))))))) | |
60 | - | ||
61 | - | (define (make-meaning-trie results) | |
62 | - | (let ((trie (make-empty-trie))) | |
63 | - | (let loop ((results results) (i 0)) | |
64 | - | (if (null? results) | |
65 | - | trie | |
66 | - | (begin | |
67 | - | (for-each | |
68 | - | (lambda (meaning) | |
69 | - | (for-each | |
70 | - | (lambda (key) | |
71 | - | (add-to-trie! trie (make-key key) i)) | |
72 | - | (sense-glosses meaning))) | |
73 | - | (result-senses (car results))) | |
74 | - | (loop (cdr results) (+ i 1))))))) |
modules/nani/jmdict/xml.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani jmdict xml) | |
20 | - | #:use-module (ice-9 match) | |
21 | - | #:use-module (ice-9 rdelim) | |
22 | - | #:use-module (sxml ssax) | |
23 | - | #:use-module (nani frequency) | |
24 | - | #:use-module (sxml simple) | |
25 | - | #:use-module (nani result) | |
26 | - | #:use-module (nani jmdict entities) | |
27 | - | #:export (load-dic xml->results)) | |
28 | - | ||
29 | - | (define (load-dic file) | |
30 | - | (xml->sxml (call-with-input-file file read-string))) | |
31 | - | ||
32 | - | (define (sxml->reading lst) | |
33 | - | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
34 | - | (if (null? lst) | |
35 | - | reading | |
36 | - | (loop | |
37 | - | (match (car lst) | |
38 | - | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
39 | - | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
40 | - | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
41 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
42 | - | ((? string? _) reading)) | |
43 | - | (cdr lst))))) | |
44 | - | ||
45 | - | (define (sxml->source lst) | |
46 | - | (let loop ((source (make-source '() #f "" "")) (lst lst)) | |
47 | - | (if (null? lst) | |
48 | - | source | |
49 | - | (loop | |
50 | - | (match (car lst) | |
51 | - | (('content c) (update-source source #:content (cons c (source-content source)))) | |
52 | - | (('ls_wasei _) (update-source source #:wasei? #t)) | |
53 | - | (('ls_type t) (update-source source #:type t)) | |
54 | - | ((('xml . 'lang) l) (update-source source #:lang l)) | |
55 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
56 | - | ((? list? l) (loop source l)) | |
57 | - | ((? string? c) (update-source source #:content | |
58 | - | (cons c (source-content source))))) | |
59 | - | (cdr lst))))) | |
60 | - | ||
61 | - | (define (sxml->sense lst) | |
62 | - | (let loop ((sense (make-sense '() '() '() '() '() '() "eng")) (lst lst)) | |
63 | - | (if (null? lst) | |
64 | - | sense | |
65 | - | (loop | |
66 | - | (match (car lst) | |
67 | - | (('ref (? string? r)) (update-sense sense #:references (cons r (sense-references sense)))) | |
68 | - | (('limit (? string? r)) (update-sense sense #:limits (cons r (sense-limits sense)))) | |
69 | - | (('info (? string? r)) (update-sense sense #:infos (cons r (sense-infos sense)))) | |
70 | - | ((? source? s) (update-sense sense #:sources (cons s (sense-sources sense)))) | |
71 | - | (('tag (? integer? r)) (update-sense sense #:tags (cons r (sense-tags sense)))) | |
72 | - | (('gloss (? string? r)) (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
73 | - | ((('xml . 'lang) (? string? l)) (update-sense sense #:language l)) | |
74 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
75 | - | ((? list? l) (loop sense l)) | |
76 | - | ((? string? _) sense)) | |
77 | - | (cdr lst))))) | |
78 | - | ||
79 | - | (define (sxml->result lst frq) | |
80 | - | (let ((result | |
81 | - | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
82 | - | (if (null? lst) | |
83 | - | result | |
84 | - | (loop | |
85 | - | (match (car lst) | |
86 | - | (('kanji kanji) | |
87 | - | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
88 | - | ((? reading? r) | |
89 | - | (update-result result #:readings (cons r (result-readings result)))) | |
90 | - | ((? sense? s) | |
91 | - | (update-result result #:senses (cons s (result-senses result)))) | |
92 | - | ((? string? _) result)) | |
93 | - | (cdr lst)))))) | |
94 | - | (let* ((word (if (null? (result-kanjis result)) | |
95 | - | (car (reading-readings (car (result-readings result)))) | |
96 | - | (car (result-kanjis result)))) | |
97 | - | (entity (frequency-entity frq word)) | |
98 | - | (sense1 (car (result-senses result))) | |
99 | - | (sense1 (if entity | |
100 | - | (update-sense sense1 | |
101 | - | #:tags (cons (get-tag entity) (sense-tags sense1))) | |
102 | - | sense1)) | |
103 | - | (senses (cons sense1 (cdr (result-senses result))))) | |
104 | - | (update-result result | |
105 | - | #:points (get-points (update-result result #:senses senses)) | |
106 | - | #:senses senses)))) | |
107 | - | ||
108 | - | (define (sxml->element lst elem frq) | |
109 | - | (match elem | |
110 | - | ('ent_seq "") | |
111 | - | ('ke_pri "") | |
112 | - | ('re_nokanji "") | |
113 | - | ('re_pri "") | |
114 | - | ('ke_inf "") | |
115 | - | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
116 | - | `(tag ,(get-tag (car lst))) | |
117 | - | (throw 'invalid-misc lst))) | |
118 | - | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
119 | - | `(limit ,(car lst)) | |
120 | - | (throw 'invalid-re_restr lst))) | |
121 | - | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
122 | - | `(kanji ,(car lst)) | |
123 | - | (throw 'invalid-keb lst))) | |
124 | - | ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst)) | |
125 | - | `(info ,(apply string-append lst)) | |
126 | - | (throw 'invalid-s_inf lst))) | |
127 | - | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
128 | - | `(tag ,(get-tag (car lst))) | |
129 | - | (throw 'invalid-dial lst))) | |
130 | - | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
131 | - | `(info ,(car lst)) | |
132 | - | (throw 'invalid-re_inf lst))) | |
133 | - | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
134 | - | `(limit ,(car lst)) | |
135 | - | (throw 'invalid-stagk lst))) | |
136 | - | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
137 | - | `(limit ,(car lst)) | |
138 | - | (throw 'invalid-stagr lst))) | |
139 | - | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
140 | - | `(tag ,(get-tag (car lst))) | |
141 | - | (throw 'invalid-field lst))) | |
142 | - | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
143 | - | `(ref ,(car lst)) | |
144 | - | (throw 'invalid-ant lst))) | |
145 | - | ('reb (if (and (= (length lst) 1) (string? (car lst))) | |
146 | - | `(reading ,(car lst)) | |
147 | - | (throw 'invalid-reb lst))) | |
148 | - | ('r_ele (sxml->reading lst)) | |
149 | - | ('k_ele (car (filter list? lst))) | |
150 | - | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
151 | - | `(tag ,(get-tag (car lst))) | |
152 | - | (throw 'invalid-pos lst))) | |
153 | - | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
154 | - | `(ref ,(car lst)) | |
155 | - | (throw 'invalid-xref lst))) | |
156 | - | ('gloss (cons | |
157 | - | `(gloss ,(apply string-append (filter string? lst))) | |
158 | - | (filter list? lst))) | |
159 | - | ('lsource (sxml->source lst)) | |
160 | - | ('sense (sxml->sense lst)) | |
161 | - | ('entry (sxml->result lst frq)))) | |
162 | - | ||
163 | - | (define (create-parser frq) | |
164 | - | (ssax:make-parser | |
165 | - | NEW-LEVEL-SEED | |
166 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
167 | - | (map | |
168 | - | (match-lambda | |
169 | - | ((k . v) (list k v))) | |
170 | - | (filter | |
171 | - | (match-lambda | |
172 | - | ((k . v) (not (member k '(g_type))))) | |
173 | - | attributes))) | |
174 | - | ||
175 | - | FINISH-ELEMENT | |
176 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
177 | - | (if (equal? elem-gi 'JMdict) | |
178 | - | seed | |
179 | - | (let* ((seed (reverse seed)) | |
180 | - | (element (sxml->element seed elem-gi frq))) | |
181 | - | (cons element parent-seed)))) | |
182 | - | ||
183 | - | CHAR-DATA-HANDLER | |
184 | - | (lambda (string1 string2 seed) | |
185 | - | (cons (string-append string1 string2) seed)))) | |
186 | - | ||
187 | - | (define (xml->results port frq) | |
188 | - | (let ((results (filter result? ((create-parser frq) port '())))) | |
189 | - | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) |
modules/nani/kanji/radk.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani kanji radk) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (ice-9 match) | |
22 | + | #:use-module (ice-9 peg) | |
23 | + | #:use-module (ice-9 rdelim) | |
24 | + | #:use-module (nani encoding parse) | |
25 | + | #:use-module (nani encoding serialize) | |
26 | + | #:use-module (rnrs bytevectors) | |
27 | + | #:use-module (sxml simple) | |
28 | + | #:export (parse-radk | |
29 | + | get-kanji-stroke | |
30 | + | get-rad-kanji | |
31 | + | get-rad-stroke | |
32 | + | serialize-radk | |
33 | + | kanji-count)) | |
34 | + | ||
35 | + | ;; PEG parser for the radk file | |
36 | + | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
37 | + | (define-peg-pattern space none " ") | |
38 | + | (define-peg-pattern return none "\n") | |
39 | + | (define-peg-pattern entry all | |
40 | + | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
41 | + | (+ (or char (ignore "\n"))))) | |
42 | + | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
43 | + | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
44 | + | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
45 | + | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
46 | + | ||
47 | + | ;; parse the file with the peg parser, to low-level alist structure | |
48 | + | (define (parse-radk file) | |
49 | + | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
50 | + | ||
51 | + | (define (get-rad-kanji content) | |
52 | + | (let loop ((result '()) (content content)) | |
53 | + | (match content | |
54 | + | (() result) | |
55 | + | ((('entry ('char radical) ('num stroke) ('char kanji)) content ...) | |
56 | + | (loop (cons (list radical kanji) result) | |
57 | + | content)) | |
58 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
59 | + | (loop (cons (cons radical kanji) result) | |
60 | + | content))))) | |
61 | + | ||
62 | + | (define (get-rad-stroke content) | |
63 | + | (let loop ((result '()) (content content)) | |
64 | + | (match content | |
65 | + | (() result) | |
66 | + | ((('entry ('char radical) ('num stroke) ('char kanji)) content ...) | |
67 | + | (loop (cons (cons radical (string->number stroke)) result) | |
68 | + | content)) | |
69 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
70 | + | (loop (cons (cons radical (string->number stroke)) result) | |
71 | + | content))))) | |
72 | + | ||
73 | + | ;; return an alist where keys are kanjis, and values the associated stroke count. | |
74 | + | (define (get-kanji-stroke file) | |
75 | + | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
76 | + | ||
77 | + | (match strokes | |
78 | + | (('*TOP* _ ('kanjidic2 content ...)) | |
79 | + | (map | |
80 | + | (lambda (entry) | |
81 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
82 | + | (misc (assoc-ref entry 'misc)) | |
83 | + | (misc (filter list? misc)) | |
84 | + | (stroke (car (assoc-ref misc 'stroke_count)))) | |
85 | + | (cons literal (string->number stroke)))) | |
86 | + | (filter | |
87 | + | (lambda (entry) | |
88 | + | (and | |
89 | + | (list? entry) | |
90 | + | (equal? (car entry) 'character))) | |
91 | + | content))))) | |
92 | + | ||
93 | + | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
94 | + | (define (serialize-rad-kanji-element element pos bv) | |
95 | + | (match element | |
96 | + | ((radical kanji ...) | |
97 | + | (let* ((pos (serialize-string (radical-character radical) pos bv))) | |
98 | + | (serialize-string (string-join kanji "") pos bv))))) | |
99 | + | (define (rad-kanji-element-size element) | |
100 | + | (match element | |
101 | + | ((radical kanji ...) | |
102 | + | (+ (string-size (radical-character radical)) | |
103 | + | (string-size (string-join kanji "")))))) | |
104 | + | ||
105 | + | (define (serialize-rad-kanji rad-kanji pos bv) | |
106 | + | ((serialize-list serialize-rad-kanji-element) rad-kanji pos bv)) | |
107 | + | (define (rad-kanji-size rad-kanji) | |
108 | + | ((list-size rad-kanji-element-size) rad-kanji)) | |
109 | + | ||
110 | + | (define (serialize-rad-stroke-element element pos bv) | |
111 | + | (match element | |
112 | + | ((radical . stroke) | |
113 | + | (let ((pos (serialize-string (radical-character radical) pos bv))) | |
114 | + | (serialize-char stroke pos bv))))) | |
115 | + | (define (rad-stroke-element-size element) | |
116 | + | (match element | |
117 | + | ((radical . stroke) | |
118 | + | (+ (string-size (radical-character radical)) (char-size stroke))))) | |
119 | + | ||
120 | + | (define (serialize-rad-stroke rad-stroke pos bv) | |
121 | + | ((serialize-list serialize-rad-stroke-element) rad-stroke pos bv)) | |
122 | + | (define (rad-stroke-size rad-stroke) | |
123 | + | ((list-size rad-stroke-element-size) rad-stroke)) | |
124 | + | ||
125 | + | (define (serialize-kanji-stroke-element element pos bv) | |
126 | + | (match element | |
127 | + | ((kanji . stroke) | |
128 | + | (let ((pos (serialize-string kanji pos bv))) | |
129 | + | (serialize-char stroke pos bv))))) | |
130 | + | (define (kanji-stroke-element-size element) | |
131 | + | (match element | |
132 | + | ((kanji . stroke) | |
133 | + | (+ (string-size kanji) (char-size stroke))))) | |
134 | + | ||
135 | + | (define (serialize-kanji-stroke kanji-stroke pos bv) | |
136 | + | ((serialize-list serialize-kanji-stroke-element) kanji-stroke pos bv)) | |
137 | + | (define (kanji-stroke-size kanji-stroke) | |
138 | + | ((list-size kanji-stroke-element-size) kanji-stroke)) | |
139 | + | ||
140 | + | (let* ((header (string->utf8 "NANI_RADK001")) | |
141 | + | (header-size (bytevector-length header)) | |
142 | + | (bv (make-bytevector (+ header-size 12 | |
143 | + | (rad-kanji-size rad-kanji) | |
144 | + | (rad-stroke-size rad-stroke) | |
145 | + | (kanji-stroke-size kanji-stroke))))) | |
146 | + | (bytevector-copy! header 0 bv 0 header-size) | |
147 | + | (let* ((pos header-size) | |
148 | + | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
149 | + | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
150 | + | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
151 | + | bv))) | |
152 | + | ||
153 | + | (define (radical-character kanji) | |
154 | + | (match kanji | |
155 | + | ("???" "???") | |
156 | + | ("???" "????") | |
157 | + | ("???" "???") | |
158 | + | ("???" "???") | |
159 | + | ("???" "????") | |
160 | + | ("???" "???") | |
161 | + | ("???" "???") | |
162 | + | ("???" "???") | |
163 | + | ("???" "???") | |
164 | + | ("???" "???") | |
165 | + | ("???" "???") | |
166 | + | ("???" "???") | |
167 | + | ("???" "???") | |
168 | + | ("???" "???") | |
169 | + | ("???" "???") | |
170 | + | ("???" "???") | |
171 | + | ("???" "???") | |
172 | + | ("???" "???") | |
173 | + | ("???" "???") | |
174 | + | ("???" "???") | |
175 | + | ("???" "???") | |
176 | + | ("???" "???") | |
177 | + | (_ kanji))) | |
178 | + | ||
179 | + | ;; parse a serialized file back to the guile structure | |
180 | + | (define (parse file) | |
181 | + | (define (parse-rad-kanji-element port) | |
182 | + | (let ((radical (parse-string port)) | |
183 | + | (kanji-list (parse-string port))) | |
184 | + | (cons radical (string->list kanji-list)))) | |
185 | + | (define (parse-rad-kanji port) | |
186 | + | ((parse-list parse-rad-kanji-element) port)) | |
187 | + | ||
188 | + | (define (parse-rad-stroke-element port) | |
189 | + | (let ((radical (parse-string port)) | |
190 | + | (stroke (parse-char port))) | |
191 | + | (cons radical stroke))) | |
192 | + | (define (parse-rad-stroke port) | |
193 | + | ((parse-list parse-rad-stroke-element) port)) | |
194 | + | ||
195 | + | (define (parse-kanji-stroke-element port) | |
196 | + | (let ((kanji (parse-string port)) | |
197 | + | (stroke (parse-char port))) | |
198 | + | (cons kanji stroke))) | |
199 | + | (define (parse-kanji-stroke port) | |
200 | + | ((parse-list parse-kanji-stroke-element) port)) | |
201 | + | ||
202 | + | (call-with-input-file file | |
203 | + | (lambda (port) | |
204 | + | (let* ((header (utf8->string (get-bytevector-n port 12))) | |
205 | + | (rad-kanji (parse-rad-kanji port)) | |
206 | + | (rad-stroke (parse-rad-stroke port)) | |
207 | + | (kanji-stroke (parse-kanji-stroke port))) | |
208 | + | (list rad-kanji rad-stroke kanji-stroke))))) | |
209 | + | ||
210 | + | (define (get-kanji-list content) | |
211 | + | (let loop ((result '()) (content content)) | |
212 | + | (match content | |
213 | + | (() result) | |
214 | + | (((_ kanji ...) content ...) | |
215 | + | (loop (append result (filter (lambda (k) (not (member k result))) kanji)) | |
216 | + | content))))) | |
217 | + | ||
218 | + | (define (kanji-count file) | |
219 | + | (match (parse file) | |
220 | + | ((rad-kanji _ _) | |
221 | + | (length (get-kanji-list rad-kanji))))) |
modules/nani/parse-result.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani parse-result) | |
20 | - | #:use-module (ice-9 binary-ports) | |
21 | - | #:use-module (rnrs bytevectors) | |
22 | - | #:export (parse-result-list | |
23 | - | parse-result-char | |
24 | - | parse-result-int | |
25 | - | parse-result-boolean | |
26 | - | parse-result-string)) | |
27 | - | ||
28 | - | (define* (parse-result-list port parse-result-element) | |
29 | - | (let ((size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big)))) | |
30 | - | (let loop ((result '()) (remaining size)) | |
31 | - | (if (= remaining 0) | |
32 | - | (reverse result) | |
33 | - | (loop (cons (parse-result-element port) result) (- remaining 1)))))) | |
34 | - | ||
35 | - | (define (parse-result-char port) | |
36 | - | (get-u8 port)) | |
37 | - | ||
38 | - | (define (parse-result-int port) | |
39 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))) | |
40 | - | ||
41 | - | (define (parse-result-boolean port) | |
42 | - | (= (get-u8 port) 1)) | |
43 | - | ||
44 | - | (define (parse-result-string port) | |
45 | - | (define (get-result-string port) | |
46 | - | (let loop ((lu8 '()) (char (get-u8 port))) | |
47 | - | (if (= char 0) | |
48 | - | lu8 | |
49 | - | (loop (cons char lu8) (get-u8 port))))) | |
50 | - | (utf8->string (u8-list->bytevector (reverse (get-result-string port))))) |
modules/nani/pitch/pitch.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani pitch pitch) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (ice-9 match) | |
22 | + | #:use-module (nani encoding huffman) | |
23 | + | #:use-module (nani encoding trie) | |
24 | + | #:use-module (rnrs bytevectors) | |
25 | + | #:use-module (srfi srfi-1) | |
26 | + | #:use-module (srfi srfi-9) | |
27 | + | #:export (make-pitch | |
28 | + | pitch? | |
29 | + | pitch-kanji | |
30 | + | pitch-reading | |
31 | + | pitch-accents | |
32 | + | ||
33 | + | serialize-pitch | |
34 | + | pitch-entry-count)) | |
35 | + | ||
36 | + | (define-record-type pitch | |
37 | + | (make-pitch kanji reading accents) | |
38 | + | pitch? | |
39 | + | (kanji pitch-kanji) | |
40 | + | (reading pitch-reading) | |
41 | + | (accents pitch-accents)) | |
42 | + | ||
43 | + | (define (make-trie-key key) | |
44 | + | (append-map | |
45 | + | (lambda (c) | |
46 | + | (list (quotient c 16) (modulo c 16))) | |
47 | + | (bytevector->u8-list (string->utf8 key)))) | |
48 | + | ||
49 | + | (define (get-pitch-trie pitches) | |
50 | + | (let ((trie (make-empty-trie))) | |
51 | + | (for-each | |
52 | + | (lambda (pitch) | |
53 | + | (let ((key (string-append (pitch-kanji pitch) (pitch-reading pitch)))) | |
54 | + | (for-each | |
55 | + | (lambda (accent) | |
56 | + | (add-to-trie! trie (make-trie-key key) accent)) | |
57 | + | (pitch-accents pitch)))) | |
58 | + | pitches) | |
59 | + | (compress-trie trie))) | |
60 | + | ||
61 | + | (define (collapse-vals! trie) | |
62 | + | (let ((transitions (trie-transitions trie)) | |
63 | + | (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i))) | |
64 | + | (trie-vals trie)))) | |
65 | + | (trie-vals-set! trie (string-join vals ", ")) | |
66 | + | (for-each collapse-vals! (map cdr transitions)))) | |
67 | + | ||
68 | + | (define (collect-vals trie) | |
69 | + | (let ((transitions (trie-transitions trie)) | |
70 | + | (vals (trie-vals trie))) | |
71 | + | (cons vals (append-map collect-vals (map cdr transitions))))) | |
72 | + | ||
73 | + | (define (serialize-pitch pitches) | |
74 | + | (let ((trie (get-pitch-trie pitches))) | |
75 | + | (collapse-vals! trie) | |
76 | + | (let* ((huffman (create-huffman (collect-vals trie))) | |
77 | + | (code (huffman->code huffman))) | |
78 | + | (let* ((header (string->utf8 "NANI_PITCH001")) | |
79 | + | (header-size (bytevector-length header)) | |
80 | + | (huffman-bv (serialize-huffman huffman)) | |
81 | + | (huffman-size (bytevector-length huffman-bv)) | |
82 | + | (trie-size ((trie-size-single (huffman-string-size code)) trie)) | |
83 | + | (result (make-bytevector (+ header-size 4 huffman-size trie-size)))) | |
84 | + | (bytevector-copy! header 0 result 0 header-size) | |
85 | + | (bytevector-u32-set! result header-size (length pitches) (endianness big)) | |
86 | + | (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size) | |
87 | + | ((serialize-trie-single (serialize-huffman-string code) | |
88 | + | (huffman-string-size code)) | |
89 | + | trie (+ header-size 4 huffman-size) result) | |
90 | + | result)))) | |
91 | + | ||
92 | + | (define (pitch-entry-count file) | |
93 | + | (call-with-input-file file | |
94 | + | (lambda (port) | |
95 | + | ;; header | |
96 | + | (get-bytevector-n port 13) | |
97 | + | ;; size | |
98 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))) |
modules/nani/pitch/wadoku.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani pitch wadoku) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (nani pitch pitch) | |
22 | + | #:use-module (srfi srfi-1) | |
23 | + | #:use-module (sxml ssax) | |
24 | + | #:export (xml->pitch)) | |
25 | + | ||
26 | + | (define (sxml->element lst elem) | |
27 | + | (let ((elem (match elem | |
28 | + | ((_ . elem) elem) | |
29 | + | (_ elem)))) | |
30 | + | (match elem | |
31 | + | ('accent `(pitch . ,(car lst))) | |
32 | + | ('hira `(reading . ,(car lst))) | |
33 | + | ('orth (let ((kanji (filter string? lst))) | |
34 | + | (if (null? kanji) #f `(kanji . ,(car kanji))))) | |
35 | + | ('form | |
36 | + | `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst))) | |
37 | + | ('reading (filter pair? lst)) | |
38 | + | ('entry | |
39 | + | (let loop ((lst lst) (kanjis '()) (readings '()) (accents '())) | |
40 | + | (if (null? lst) | |
41 | + | (if (or (and (null? kanjis) (null? readings)) (null? accents)) | |
42 | + | #f | |
43 | + | (append-map | |
44 | + | (lambda (reading) | |
45 | + | (map | |
46 | + | (lambda (kanji) | |
47 | + | (make-pitch kanji reading accents)) | |
48 | + | kanjis)) | |
49 | + | readings)) | |
50 | + | (match (car lst) | |
51 | + | (('form . f) | |
52 | + | (loop (append f (cdr lst)) kanjis readings accents)) | |
53 | + | (('pitch . pitch) | |
54 | + | (loop (cdr lst) kanjis readings (cons pitch accents))) | |
55 | + | (('reading . reading) | |
56 | + | (loop (cdr lst) kanjis (cons reading readings) accents)) | |
57 | + | (('kanji . kanji) | |
58 | + | (loop (cdr lst) (cons kanji kanjis) readings accents)) | |
59 | + | (_ (loop (cdr lst) kanjis readings accents)))))) | |
60 | + | (_ #f)))) | |
61 | + | ||
62 | + | (define parser | |
63 | + | (ssax:make-parser | |
64 | + | NEW-LEVEL-SEED | |
65 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
66 | + | attributes) | |
67 | + | ||
68 | + | FINISH-ELEMENT | |
69 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
70 | + | (if (equal? elem-gi 'entries) | |
71 | + | seed | |
72 | + | (let* ((seed (reverse seed)) | |
73 | + | (element (sxml->element seed elem-gi))) | |
74 | + | (cons element parent-seed)))) | |
75 | + | ||
76 | + | CHAR-DATA-HANDLER | |
77 | + | (lambda (string1 string2 seed) | |
78 | + | (cons (string-append string1 string2) seed)))) | |
79 | + | ||
80 | + | (define (xml->pitch port) | |
81 | + | (filter pitch? (apply append (filter list? (parser port '()))))) |
modules/nani/radk.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani radk) | |
20 | - | #:use-module (ice-9 binary-ports) | |
21 | - | #:use-module (ice-9 match) | |
22 | - | #:use-module (ice-9 peg) | |
23 | - | #:use-module (ice-9 rdelim) | |
24 | - | #:use-module (nani parse-result) | |
25 | - | #:use-module (nani serialize) | |
26 | - | #:use-module (rnrs bytevectors) | |
27 | - | #:use-module (sxml simple) | |
28 | - | #:export (parse-radk | |
29 | - | get-kanji-stroke | |
30 | - | get-rad-kanji | |
31 | - | get-rad-stroke | |
32 | - | serialize-radk | |
33 | - | kanji-count)) | |
34 | - | ||
35 | - | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
36 | - | (define-peg-pattern space none " ") | |
37 | - | (define-peg-pattern return none "\n") | |
38 | - | (define-peg-pattern entry all | |
39 | - | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
40 | - | (+ (or char (ignore "\n"))))) | |
41 | - | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
42 | - | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
43 | - | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
44 | - | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
45 | - | ||
46 | - | (define (parse-radk file) | |
47 | - | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
48 | - | ||
49 | - | (define (get-rad-kanji content) | |
50 | - | (let loop ((result '()) (content content)) | |
51 | - | (match content | |
52 | - | (() result) | |
53 | - | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
54 | - | (loop (cons (cons radical kanji) result) | |
55 | - | content))))) | |
56 | - | ||
57 | - | (define (get-rad-stroke content) | |
58 | - | (let loop ((result '()) (content content)) | |
59 | - | (match content | |
60 | - | (() result) | |
61 | - | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
62 | - | (loop (cons (cons radical (string->number stroke)) result) | |
63 | - | content))))) | |
64 | - | ||
65 | - | (define (get-kanji-stroke file) | |
66 | - | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
67 | - | ||
68 | - | (match strokes | |
69 | - | (('*TOP* _ ('kanjidic2 content ...)) | |
70 | - | (map | |
71 | - | (lambda (entry) | |
72 | - | (let* ((literal (car (assoc-ref entry 'literal))) | |
73 | - | (misc (assoc-ref entry 'misc)) | |
74 | - | (misc (filter list? misc)) | |
75 | - | (stroke (car (assoc-ref misc 'stroke_count)))) | |
76 | - | (cons literal (string->number stroke)))) | |
77 | - | (filter | |
78 | - | (lambda (entry) | |
79 | - | (and | |
80 | - | (list? entry) | |
81 | - | (equal? (car entry) 'character))) | |
82 | - | content))))) | |
83 | - | ||
84 | - | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
85 | - | (define (serialize-rad-kanji-element element pos bv) | |
86 | - | (match element | |
87 | - | ((radical kanji ...) | |
88 | - | (let* ((pos (serialize-string (radical-character radical) pos bv))) | |
89 | - | (serialize-string (string-join kanji "") pos bv))))) | |
90 | - | (define (rad-kanji-element-size element) | |
91 | - | (match element | |
92 | - | ((radical kanji ...) | |
93 | - | (+ (string-size (radical-character radical)) | |
94 | - | (string-size (string-join kanji "")))))) | |
95 | - | ||
96 | - | (define (serialize-rad-kanji rad-kanji pos bv) | |
97 | - | (serialize-list rad-kanji serialize-rad-kanji-element pos bv)) | |
98 | - | (define (rad-kanji-size rad-kanji) | |
99 | - | (list-size rad-kanji rad-kanji-element-size)) | |
100 | - | ||
101 | - | (define (serialize-rad-stroke-element element pos bv) | |
102 | - | (match element | |
103 | - | ((radical . stroke) | |
104 | - | (let ((pos (serialize-string (radical-character radical) pos bv))) | |
105 | - | (serialize-char stroke pos bv))))) | |
106 | - | (define (rad-stroke-element-size element) | |
107 | - | (match element | |
108 | - | ((radical . stroke) | |
109 | - | (+ (string-size (radical-character radical)) (char-size stroke))))) | |
110 | - | ||
111 | - | (define (serialize-rad-stroke rad-stroke pos bv) | |
112 | - | (serialize-list rad-stroke serialize-rad-stroke-element pos bv)) | |
113 | - | (define (rad-stroke-size rad-stroke) | |
114 | - | (list-size rad-stroke rad-stroke-element-size)) | |
115 | - | ||
116 | - | (define (serialize-kanji-stroke-element element pos bv) | |
117 | - | (match element | |
118 | - | ((kanji . stroke) | |
119 | - | (let ((pos (serialize-string kanji pos bv))) | |
120 | - | (serialize-char stroke pos bv))))) | |
121 | - | (define (kanji-stroke-element-size element) | |
122 | - | (match element | |
123 | - | ((kanji . stroke) | |
124 | - | (+ (string-size kanji) (char-size stroke))))) | |
125 | - | ||
126 | - | (define (serialize-kanji-stroke kanji-stroke pos bv) | |
127 | - | (serialize-list kanji-stroke serialize-kanji-stroke-element pos bv)) | |
128 | - | (define (kanji-stroke-size kanji-stroke) | |
129 | - | (list-size kanji-stroke kanji-stroke-element-size)) | |
130 | - | ||
131 | - | (let* ((header (string->utf8 "NANI_RADK001")) | |
132 | - | (header-size (bytevector-length header)) | |
133 | - | (bv (make-bytevector (+ header-size 12 | |
134 | - | (rad-kanji-size rad-kanji) | |
135 | - | (rad-stroke-size rad-stroke) | |
136 | - | (kanji-stroke-size kanji-stroke))))) | |
137 | - | (bytevector-copy! header 0 bv 0 header-size) | |
138 | - | (let* ((pos header-size) | |
139 | - | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
140 | - | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
141 | - | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
142 | - | bv))) | |
143 | - | ||
144 | - | (define (radical-character kanji) | |
145 | - | (match kanji | |
146 | - | ("???" "???") | |
147 | - | ("???" "????") | |
148 | - | ("???" "???") | |
149 | - | ("???" "???") | |
150 | - | ("???" "????") | |
151 | - | ("???" "???") | |
152 | - | ("???" "???") | |
153 | - | ("???" "???") | |
154 | - | ("???" "???") | |
155 | - | ("???" "???") | |
156 | - | ("???" "???") | |
157 | - | ("???" "???") | |
158 | - | ("???" "???") | |
159 | - | ("???" "???") | |
160 | - | ("???" "???") | |
161 | - | ("???" "???") | |
162 | - | ("???" "???") | |
163 | - | ("???" "???") | |
164 | - | ("???" "???") | |
165 | - | ("???" "???") | |
166 | - | ("???" "???") | |
167 | - | ("???" "???") | |
168 | - | (_ kanji))) | |
169 | - | ||
170 | - | (define (parse-result file) | |
171 | - | (define (parse-result-rad-kanji-element port) | |
172 | - | (let ((radical (parse-result-string port)) | |
173 | - | (kanji-list (parse-result-string port))) | |
174 | - | (cons radical (string->list kanji-list)))) | |
175 | - | (define (parse-result-rad-kanji port) | |
176 | - | (parse-result-list port parse-result-rad-kanji-element)) | |
177 | - | ||
178 | - | (define (parse-result-rad-stroke-element port) | |
179 | - | (let ((radical (parse-result-string port)) | |
180 | - | (stroke (parse-result-char port))) | |
181 | - | (cons radical stroke))) | |
182 | - | (define (parse-result-rad-stroke port) | |
183 | - | (parse-result-list port parse-result-rad-stroke-element)) | |
184 | - | ||
185 | - | (define (parse-result-kanji-stroke-element port) | |
186 | - | (let ((kanji (parse-result-string port)) | |
187 | - | (stroke (parse-result-char port))) | |
188 | - | (cons kanji stroke))) | |
189 | - | (define (parse-result-kanji-stroke port) | |
190 | - | (parse-result-list port parse-result-kanji-stroke-element)) | |
191 | - | ||
192 | - | (call-with-input-file file | |
193 | - | (lambda (port) | |
194 | - | (let* ((header (utf8->string (get-bytevector-n port 12))) | |
195 | - | (rad-kanji (parse-result-rad-kanji port)) | |
196 | - | (rad-stroke (parse-result-rad-stroke port)) | |
197 | - | (kanji-stroke (parse-result-kanji-stroke port))) | |
198 | - | (list rad-kanji rad-stroke kanji-stroke))))) | |
199 | - | ||
200 | - | (define (get-kanji-list content) | |
201 | - | (let loop ((result '()) (content content)) | |
202 | - | (match content | |
203 | - | (() result) | |
204 | - | (((_ kanji ...) content ...) | |
205 | - | (loop (append result (filter (lambda (k) (not (member k result))) kanji)) | |
206 | - | content))))) | |
207 | - | ||
208 | - | (define (kanji-count file) | |
209 | - | (match (parse-result file) | |
210 | - | ((rad-kanji _ _) | |
211 | - | (length (get-kanji-list rad-kanji))))) |
modules/nani/result.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani result) | |
20 | - | #:use-module (srfi srfi-9) | |
21 | - | #:export (make-result | |
22 | - | result? | |
23 | - | result-position | |
24 | - | result-position-set! | |
25 | - | result-points | |
26 | - | result-kanjis | |
27 | - | result-readings | |
28 | - | result-senses | |
29 | - | ||
30 | - | make-reading | |
31 | - | reading? | |
32 | - | reading-kanjis | |
33 | - | reading-info | |
34 | - | reading-readings | |
35 | - | ||
36 | - | make-sense | |
37 | - | sense? | |
38 | - | sense-references | |
39 | - | sense-limits | |
40 | - | sense-infos | |
41 | - | sense-sources | |
42 | - | sense-tags | |
43 | - | sense-glosses | |
44 | - | sense-language | |
45 | - | ||
46 | - | make-source | |
47 | - | source? | |
48 | - | source-content | |
49 | - | source-wasei? | |
50 | - | source-type | |
51 | - | source-lang | |
52 | - | ||
53 | - | update-result | |
54 | - | update-reading | |
55 | - | update-sense | |
56 | - | update-source)) | |
57 | - | ||
58 | - | (define-record-type result | |
59 | - | (make-result position points kanjis readings senses) | |
60 | - | result? | |
61 | - | (position result-position result-position-set!) ; integer | |
62 | - | (points result-points) ; integer | |
63 | - | (kanjis result-kanjis) ; string-list | |
64 | - | (readings result-readings) ; reanding-list | |
65 | - | (senses result-senses)) ; sense-list | |
66 | - | ||
67 | - | (define-record-type reading | |
68 | - | (make-reading kanjis info readings) | |
69 | - | reading? | |
70 | - | (kanjis reading-kanjis) ; string-list | |
71 | - | (info reading-info) ; string-list | |
72 | - | (readings reading-readings)) ; string-list | |
73 | - | ||
74 | - | (define-record-type sense | |
75 | - | (make-sense references limits infos sources tags glosses language) | |
76 | - | sense? | |
77 | - | (references sense-references) ; string-list | |
78 | - | (limits sense-limits) ; string-list | |
79 | - | (infos sense-infos) ; string-list | |
80 | - | (sources sense-sources) ; source-list | |
81 | - | (tags sense-tags) ; integer-list | |
82 | - | (glosses sense-glosses) ; string-list | |
83 | - | (language sense-language)) ; string | |
84 | - | ||
85 | - | (define-record-type source | |
86 | - | (make-source content wasei? type lang) | |
87 | - | source? | |
88 | - | (content source-content) ; string-list | |
89 | - | (wasei? source-wasei?) ; boolean | |
90 | - | (type source-type) ; string | |
91 | - | (lang source-lang)) ; string | |
92 | - | ||
93 | - | (define* (update-result result | |
94 | - | #:key (points (result-points result)) | |
95 | - | (kanjis (result-kanjis result)) | |
96 | - | (readings (result-readings result)) | |
97 | - | (senses (result-senses result))) | |
98 | - | (make-result (result-position result) points kanjis readings senses)) | |
99 | - | ||
100 | - | (define* (update-reading reading | |
101 | - | #:key (kanjis (reading-kanjis reading)) | |
102 | - | (info (reading-info reading)) | |
103 | - | (readings (reading-readings reading))) | |
104 | - | (make-reading kanjis info readings)) | |
105 | - | ||
106 | - | (define* (update-sense sense | |
107 | - | #:key (references (sense-references sense)) | |
108 | - | (limits (sense-limits sense)) | |
109 | - | (infos (sense-infos sense)) | |
110 | - | (sources (sense-sources sense)) | |
111 | - | (tags (sense-tags sense)) | |
112 | - | (glosses (sense-glosses sense)) | |
113 | - | (language (sense-language sense))) | |
114 | - | (make-sense references limits infos sources tags glosses language)) | |
115 | - | ||
116 | - | (define* (update-source source | |
117 | - | #:key (content (source-content source)) | |
118 | - | (wasei? (source-wasei? source)) | |
119 | - | (type (source-type source)) | |
120 | - | (lang (source-lang source))) | |
121 | - | (make-source content wasei? type lang)) |
modules/nani/result/frequency.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani result frequency) | |
20 | + | #:use-module (ice-9 rdelim) | |
21 | + | #:export (load-frequency | |
22 | + | frequency->score)) | |
23 | + | ||
24 | + | (define (load-frequency file) | |
25 | + | (call-with-input-file file | |
26 | + | (lambda (port) | |
27 | + | (let loop ((frq '()) (i 1)) | |
28 | + | (let* ((line (%read-line port)) | |
29 | + | (line (car line))) | |
30 | + | (if (eof-object? line) | |
31 | + | frq | |
32 | + | (let ((content (string-split line #\tab))) | |
33 | + | (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1))))))))) | |
34 | + | ||
35 | + | (define (frequency->score frq word) | |
36 | + | (let ((freq (assoc-ref frq word))) | |
37 | + | (cond | |
38 | + | ((not freq) 0) | |
39 | + | ((< freq 501) 32) | |
40 | + | ((< freq 1001) 16) | |
41 | + | ((< freq 2001) 8) | |
42 | + | ((< freq 5001) 4) | |
43 | + | ((< freq 10001) 2) | |
44 | + | ((< freq 20001) 1) | |
45 | + | (else 0)))) |
modules/nani/result/jibiki.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani result jibiki) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (ice-9 rdelim) | |
22 | + | #:use-module (nani result frequency) | |
23 | + | #:use-module (nani result result) | |
24 | + | #:use-module (srfi srfi-9) | |
25 | + | #:use-module (sxml ssax) | |
26 | + | #:export (xml->results)) | |
27 | + | ||
28 | + | (define (sxml->string lst) | |
29 | + | (if (string? lst) | |
30 | + | lst | |
31 | + | (let loop ((lst lst) (result "")) | |
32 | + | (if (null? lst) | |
33 | + | result | |
34 | + | (loop | |
35 | + | (cdr lst) | |
36 | + | (match (car lst) | |
37 | + | (('lang . _) result) | |
38 | + | (#f result) | |
39 | + | ((? string? s) (string-append result s)) | |
40 | + | ((? list? l) (loop l result)))))))) | |
41 | + | ||
42 | + | (define (gram->info gram) | |
43 | + | (match gram | |
44 | + | ("adjectif???" "adj. en ku") | |
45 | + | ("adjectif???" "adj. en na") | |
46 | + | ("adjectif?????????" "adj. en na (tari)") | |
47 | + | ("adjectif?????????" "adj. en na (nari)") | |
48 | + | ("contraction" "contr.") | |
49 | + | ("counter" "compteur") | |
50 | + | ("num??rique" "nombre") | |
51 | + | ("particule" "particule") | |
52 | + | ("postposition" "postpos.") | |
53 | + | ("verbe auxiliaire" "v. aux.") | |
54 | + | ("??? pronom" "pronom") | |
55 | + | ("??? adverbe" "adv.") | |
56 | + | ("??? onomatop??e" "onom.") | |
57 | + | ("?????? verbe auxiliaire" "v. aux.") | |
58 | + | ("?????? compteur" "compteur") | |
59 | + | ("??? verbe" "v.") | |
60 | + | ("??? verbe intransitif" "v.i.") | |
61 | + | ("??? verbe transitif" "v.t.") | |
62 | + | ("??? nom" "n.") | |
63 | + | ("??? adjectif" "adj.") | |
64 | + | ("???" "interj.") | |
65 | + | ("?????? interjection" "interj.") | |
66 | + | ("?????? expression" "expr.") | |
67 | + | ("?????? suffixe" "suf.") | |
68 | + | ("?????? suffixe conjonctif" "suf. conj.") | |
69 | + | ("?????? conjonction" "conj.") | |
70 | + | ("?????? pr??fixe" "pref.") | |
71 | + | ("??????" "adj. pr??nominal") | |
72 | + | ("?????? locution adverbiale" "loc. adv.") | |
73 | + | ("?????? locution conjonctive" "loc. conj.") | |
74 | + | ("?????? locution postpositive" "loc. post.") | |
75 | + | ("?????? mot compos??" "mot compos??") | |
76 | + | ("??" #f) | |
77 | + | ("" #f))) | |
78 | + | ||
79 | + | (define (info->info i) | |
80 | + | (match (sxml->string i) | |
81 | + | ("abbreviation" "abbreviation") | |
82 | + | ("archaism" "archa??sme") | |
83 | + | ("children's language" "langage des enfants") | |
84 | + | ("colloquialism" "familier") | |
85 | + | ("derogatory" "terme de m??pris") | |
86 | + | ("familiar language" "familier") | |
87 | + | ("female term or language" "langage des femmes") | |
88 | + | ("honorific or respectful (sonkeigo) language" "terme honorifique") | |
89 | + | ("humble (kenjougo) language" "terme humble") | |
90 | + | ("idiomatic expression" "idiom.") | |
91 | + | ("jocular, humorous term" "comique") | |
92 | + | ("male term or language" "langage des hommes") | |
93 | + | ("obscure term" "obscure") | |
94 | + | ("obsolete term" "obsol??te") | |
95 | + | ("onomatopoeic or mimetic word" "onom.") | |
96 | + | ("polite (teineigo) language" "terme poli") | |
97 | + | ("proverb" "proverbe") | |
98 | + | ("sensitive" "sensible") | |
99 | + | ("slang" "argot") | |
100 | + | ("vulgar expression or word" "vulgaire") | |
101 | + | ("word usually written using kana alone" "habituellement ??crit en kana") | |
102 | + | ("" #f) | |
103 | + | (() #f) | |
104 | + | (_ i))) | |
105 | + | ||
106 | + | (define (sxml->ref lst) | |
107 | + | (let loop ((lst lst) (ref '())) | |
108 | + | (if (null? lst) | |
109 | + | ref | |
110 | + | (loop | |
111 | + | (cdr lst) | |
112 | + | (match (car lst) | |
113 | + | (#f ref) | |
114 | + | (('jpn . jpn) (cons jpn ref))))))) | |
115 | + | ||
116 | + | (define (sxml->reading str) | |
117 | + | (make-reading '() '() (list str))) | |
118 | + | ||
119 | + | (define (sxml->source lst) | |
120 | + | (let loop ((lst lst) (source (make-source '() #f ""))) | |
121 | + | (if (null? lst) | |
122 | + | source | |
123 | + | (loop | |
124 | + | (cdr lst) | |
125 | + | (match (car lst) | |
126 | + | ((? string? s) | |
127 | + | (update-source source | |
128 | + | #:content (cons s (source-content source)))) | |
129 | + | (('lang . l) | |
130 | + | (update-source source #:lang l)) | |
131 | + | ((? list? l) | |
132 | + | (loop l source))))))) | |
133 | + | ||
134 | + | (define (sxml->meaning lst) | |
135 | + | (let loop ((lst lst) (meaning (make-meaning '() '() '() '() '() "fr"))) | |
136 | + | (if (null? lst) | |
137 | + | meaning | |
138 | + | (loop | |
139 | + | (cdr lst) | |
140 | + | (match (car lst) | |
141 | + | (#f meaning) | |
142 | + | (('n . _) meaning) | |
143 | + | (('info . info) (update-meaning meaning | |
144 | + | #:infos (cons info (meaning-infos meaning)))) | |
145 | + | (('content . c) (update-meaning meaning | |
146 | + | #:glosses (append | |
147 | + | (map string-downcase | |
148 | + | (string-split c #\,)) | |
149 | + | (meaning-glosses meaning)))) | |
150 | + | ((? source? s) | |
151 | + | (update-meaning meaning | |
152 | + | #:sources (cons s (meaning-sources meaning)))) | |
153 | + | ((? string? _) meaning) | |
154 | + | ((? list? l) (loop l meaning))))))) | |
155 | + | ||
156 | + | (define (sxml->result lst frq) | |
157 | + | (define result | |
158 | + | (let loop ((lst lst) (result (make-result 0 0 '() '() '())) (infos '()) | |
159 | + | (sources '())) | |
160 | + | (if (null? lst) | |
161 | + | (list result infos sources) | |
162 | + | (match (car lst) | |
163 | + | (#f (loop (cdr lst) result infos sources)) | |
164 | + | (('id . _) (loop (cdr lst) result infos sources)) | |
165 | + | (('identree . _) (loop (cdr lst) result infos sources)) | |
166 | + | (('kanji . k) | |
167 | + | (loop (cdr lst) | |
168 | + | (update-result result | |
169 | + | #:kanjis (cons k (result-kanjis result))) | |
170 | + | infos | |
171 | + | sources)) | |
172 | + | ((? reading? r) | |
173 | + | (loop | |
174 | + | (cdr lst) | |
175 | + | (update-result result | |
176 | + | #:readings (cons r (result-readings result))) | |
177 | + | infos | |
178 | + | sources)) | |
179 | + | ((? meaning? s) | |
180 | + | (loop | |
181 | + | (cdr lst) | |
182 | + | (update-result result | |
183 | + | #:meanings (cons s (result-meanings result))) | |
184 | + | infos | |
185 | + | sources)) | |
186 | + | ((? source? s) | |
187 | + | (loop (cdr lst) result infos (cons s sources))) | |
188 | + | (('info . info) | |
189 | + | (loop (cdr lst) | |
190 | + | result | |
191 | + | (cons info infos) | |
192 | + | sources)) | |
193 | + | ((? list? l) | |
194 | + | (match (loop l result infos sources) | |
195 | + | ((result infos sources) | |
196 | + | (loop (cdr lst) result infos sources)))))))) | |
197 | + | ||
198 | + | (match result | |
199 | + | ((result infos sources) | |
200 | + | (if (or (and (null? (result-readings result)) | |
201 | + | (null? (result-kanjis result))) | |
202 | + | (null? (result-meanings result))) | |
203 | + | #f | |
204 | + | (let* ((word (if (null? (result-kanjis result)) | |
205 | + | (car (reading-readings | |
206 | + | (car (result-readings result)))) | |
207 | + | (car (result-kanjis result)))) | |
208 | + | (score (frequency->score frq word)) | |
209 | + | (meanings (result-meanings result)) | |
210 | + | (meanings | |
211 | + | (map | |
212 | + | (lambda (s) | |
213 | + | (update-meaning s | |
214 | + | #:infos (append infos (meaning-infos s)) | |
215 | + | #:sources (append sources (meaning-sources s)))) | |
216 | + | meanings))) | |
217 | + | (update-result result | |
218 | + | #:score score | |
219 | + | #:meanings meanings)))))) | |
220 | + | ||
221 | + | (define (sxml->element lst elem frq) | |
222 | + | (let ((elem (match elem | |
223 | + | ((_ . elem) elem) | |
224 | + | (_ elem)))) | |
225 | + | (match elem | |
226 | + | ('vedette-romaji #f) | |
227 | + | ('vedette-hiragana (sxml->reading (sxml->string lst))) | |
228 | + | ('vedette-jpn `(kanji . ,(sxml->string lst))) | |
229 | + | ('vedette lst) | |
230 | + | ('forme lst) | |
231 | + | ('domaine (if (null? lst) #f | |
232 | + | (let ((info (sxml->string lst))) | |
233 | + | (if (or (not info) (string-null? info)) | |
234 | + | #f | |
235 | + | `(info . ,info))))) | |
236 | + | ('gram (let ((info (gram->info (sxml->string lst)))) | |
237 | + | (if (or (not info) (string-null? info)) | |
238 | + | #f | |
239 | + | `(info . ,info)))) | |
240 | + | ('??tiquettes lst) | |
241 | + | ('??tiquettes-sens lst) | |
242 | + | ('texte-sens `(content . ,(sxml->string lst))) | |
243 | + | ('sens (sxml->meaning lst)) | |
244 | + | ('bloc-gram lst) | |
245 | + | ('s??mantique lst) | |
246 | + | ('article (sxml->result lst frq)) | |
247 | + | ('registre (let ((info (sxml->string lst))) | |
248 | + | (if (or (not info) (string-null? info)) | |
249 | + | #f | |
250 | + | `(info . ,info)))) | |
251 | + | ('litt??ralement | |
252 | + | (if (null? lst) #f (make-meaning | |
253 | + | '() '() '() '("lit") | |
254 | + | (list (sxml->string lst)) "fr"))) | |
255 | + | ('etymologie (sxml->source lst)) | |
256 | + | ('??tymologie (sxml->source lst)) | |
257 | + | ('note (let ((info (sxml->string lst))) | |
258 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
259 | + | ('info (let ((info (info->info (sxml->string lst)))) | |
260 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
261 | + | ('dialecte (let ((info (sxml->string lst))) | |
262 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
263 | + | ('en lst) | |
264 | + | ('eng lst) | |
265 | + | ('xref (sxml->ref lst)) | |
266 | + | ||
267 | + | ('vr #f) | |
268 | + | ('romaji #f) | |
269 | + | ('romajidx #f) | |
270 | + | ('rt #f) | |
271 | + | ('ruby #f) | |
272 | + | ('vj #f) | |
273 | + | ('jpn `(jpn . ,(sxml->string lst))) | |
274 | + | ('??tiquettes-fran??ais #f) | |
275 | + | ('??tiquettes-sous-sens #f) | |
276 | + | ('texte-sous-sens #f) | |
277 | + | ('sous-sens #f) | |
278 | + | ('sous-bloc-gram #f) | |
279 | + | ('fran??ais #f) | |
280 | + | ('exemple #f) | |
281 | + | ('exemples #f)))) | |
282 | + | ||
283 | + | (define (create-parser frq) | |
284 | + | (ssax:make-parser | |
285 | + | NEW-LEVEL-SEED | |
286 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
287 | + | attributes) | |
288 | + | ||
289 | + | FINISH-ELEMENT | |
290 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
291 | + | (if (equal? elem-gi 'volume) | |
292 | + | (filter (lambda (a) a) seed) | |
293 | + | (let* ((seed (reverse seed)) | |
294 | + | (element (sxml->element seed elem-gi frq))) | |
295 | + | (cons element parent-seed)))) | |
296 | + | ||
297 | + | CHAR-DATA-HANDLER | |
298 | + | (lambda (string1 string2 seed) | |
299 | + | (cons (string-append string1 string2) seed)))) | |
300 | + | ||
301 | + | (define (xml->results port frq) | |
302 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
303 | + | (sort-results results))) |
modules/nani/result/jmdict.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani result jmdict) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (ice-9 rdelim) | |
22 | + | #:use-module (nani result frequency) | |
23 | + | #:use-module (nani result result) | |
24 | + | #:use-module (sxml simple) | |
25 | + | #:use-module (sxml ssax) | |
26 | + | #:export (load-dic xml->results)) | |
27 | + | ||
28 | + | (define (load-dic file) | |
29 | + | (xml->sxml (call-with-input-file file read-string))) | |
30 | + | ||
31 | + | (define (sxml->reading lst) | |
32 | + | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
33 | + | (if (null? lst) | |
34 | + | reading | |
35 | + | (loop | |
36 | + | (match (car lst) | |
37 | + | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
38 | + | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
39 | + | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
40 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
41 | + | ((? string? _) reading)) | |
42 | + | (cdr lst))))) | |
43 | + | ||
44 | + | (define (sxml->source lst) | |
45 | + | (let loop ((source (make-source '() #f "")) (lst lst)) | |
46 | + | (if (null? lst) | |
47 | + | source | |
48 | + | (loop | |
49 | + | (match (car lst) | |
50 | + | (('content c) (update-source source #:content (cons c (source-content source)))) | |
51 | + | (('ls_wasei _) (update-source source #:wasei? #t)) | |
52 | + | (('ls_type t) source); type is always "part" or nothing | |
53 | + | ((('xml . 'lang) l) (update-source source #:lang l)) | |
54 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
55 | + | ((? list? l) (loop source l)) | |
56 | + | ((? string? c) (update-source source #:content | |
57 | + | (cons c (source-content source))))) | |
58 | + | (cdr lst))))) | |
59 | + | ||
60 | + | (define (sxml->meaning lst) | |
61 | + | (let loop ((meaning (make-meaning '() '() '() '() '() "eng")) (lst lst)) | |
62 | + | (if (null? lst) | |
63 | + | meaning | |
64 | + | (loop | |
65 | + | (match (car lst) | |
66 | + | (('ref (? string? r)) (update-meaning meaning #:references (cons r (meaning-references meaning)))) | |
67 | + | (('limit (? string? r)) (update-meaning meaning #:limits (cons r (meaning-limits meaning)))) | |
68 | + | (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
69 | + | ((? source? s) (update-meaning meaning #:sources (cons s (meaning-sources meaning)))) | |
70 | + | (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
71 | + | (('gloss (? string? r)) (update-meaning meaning | |
72 | + | #:glosses | |
73 | + | (cons (string-downcase r) | |
74 | + | (meaning-glosses meaning)))) | |
75 | + | ((('xml . 'lang) (? string? l)) (update-meaning meaning #:language l)) | |
76 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
77 | + | ((? list? l) (loop meaning l)) | |
78 | + | ((? string? _) meaning)) | |
79 | + | (cdr lst))))) | |
80 | + | ||
81 | + | (define (sxml->result lst frq) | |
82 | + | (let ((result | |
83 | + | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
84 | + | (if (null? lst) | |
85 | + | result | |
86 | + | (loop | |
87 | + | (match (car lst) | |
88 | + | (('kanji kanji) | |
89 | + | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
90 | + | ((? reading? r) | |
91 | + | (update-result result #:readings (cons r (result-readings result)))) | |
92 | + | ((? meaning? s) | |
93 | + | (update-result result #:meanings (cons s (result-meanings result)))) | |
94 | + | ((? string? _) result)) | |
95 | + | (cdr lst)))))) | |
96 | + | (let* ((word (if (null? (result-kanjis result)) | |
97 | + | (car (reading-readings (car (result-readings result)))) | |
98 | + | (car (result-kanjis result)))) | |
99 | + | (score (frequency->score frq word)) | |
100 | + | (meanings (result-meanings result))) | |
101 | + | (update-result result | |
102 | + | #:score score | |
103 | + | #:meanings meanings)))) | |
104 | + | ||
105 | + | (define (field->info field) | |
106 | + | ;; TODO: translate depending on target language | |
107 | + | (match field | |
108 | + | ("agric" "agriculture") | |
109 | + | ("anat" "anatomy") | |
110 | + | ("archeol" "archeology") | |
111 | + | ("archit" "architecture") | |
112 | + | ("art" "art, aesthetics") | |
113 | + | ("astron" "astronomy") | |
114 | + | ("audvid" "audiovisual") | |
115 | + | ("aviat" "aviation") | |
116 | + | ("baseb" "baseball") | |
117 | + | ("biochem" "biochemistry") | |
118 | + | ("biol" "biology") | |
119 | + | ("bot" "botany") | |
120 | + | ("Buddh" "Buddhism") | |
121 | + | ("bus" "business") | |
122 | + | ("chem" "chemistry") | |
123 | + | ("Christn" "Christianity") | |
124 | + | ("cloth" "clothing") | |
125 | + | ("comp" "computing") | |
126 | + | ("cryst" "crystallography") | |
127 | + | ("ecol" "ecology") | |
128 | + | ("econ" "economics") | |
129 | + | ("elec" "electricity, elec. eng.") | |
130 | + | ("electr" "electronics") | |
131 | + | ("embryo" "embryology") | |
132 | + | ("engr" "engineering") | |
133 | + | ("ent" "entomology") | |
134 | + | ("finc" "finance") | |
135 | + | ("fish" "fishing") | |
136 | + | ("food" "food, cooking") | |
137 | + | ("gardn" "gardening, horticulture") | |
138 | + | ("genet" "genetics") | |
139 | + | ("geogr" "geography") | |
140 | + | ("geol" "geology") | |
141 | + | ("geom" "geometry") | |
142 | + | ("go" "go (game)") | |
143 | + | ("golf" "golf") | |
144 | + | ("gramm" "grammar") | |
145 | + | ("grmyth" "Greek mythology") | |
146 | + | ("hanaf" "hanafuda") | |
147 | + | ("horse" "horse racing") | |
148 | + | ("law" "law") | |
149 | + | ("ling" "linguistics") | |
150 | + | ("logic" "logic") | |
151 | + | ("MA" "martial arts") | |
152 | + | ("mahj" "mahjong") | |
153 | + | ("math" "mathematics") | |
154 | + | ("mech" "mechanical engineering") | |
155 | + | ("med" "medicine") | |
156 | + | ("met" "meteorology") | |
157 | + | ("mil" "military") | |
158 | + | ("music" "music") | |
159 | + | ("ornith" "ornithology") | |
160 | + | ("paleo" "paleontology") | |
161 | + | ("pathol" "pathology") | |
162 | + | ("pharm" "pharmacy") | |
163 | + | ("phil" "philosophy") | |
164 | + | ("photo" "photography") | |
165 | + | ("physics" "physics") | |
166 | + | ("physiol" "physiology") | |
167 | + | ("print" "printing") | |
168 | + | ("psy" "psychiatry") | |
169 | + | ("psych" "psychology") | |
170 | + | ("rail" "railway") | |
171 | + | ("Shinto" "Shinto") | |
172 | + | ("shogi" "shogi") | |
173 | + | ("sports" "sports") | |
174 | + | ("stat" "statistics") | |
175 | + | ("sumo" "sumo") | |
176 | + | ("telec" "telecommunications") | |
177 | + | ("tradem" "trademark") | |
178 | + | ("vidg" "video games") | |
179 | + | ("zool" "zoology"))) | |
180 | + | ||
181 | + | (define (dial->info dial) | |
182 | + | (match dial | |
183 | + | ("bra" "Brazilian") | |
184 | + | ("hob" "Hokkaido-ben") | |
185 | + | ("ksb" "Kansai-ben") | |
186 | + | ("ktb" "Kantou-ben") | |
187 | + | ("kyb" "Kyoto-ben") | |
188 | + | ("kyu" "Kyuushuu-ben") | |
189 | + | ("nab" "Nagano-ben") | |
190 | + | ("osb" "Osaka-ben") | |
191 | + | ("rkb" "Ryuukyuu-ben") | |
192 | + | ("thb" "Touhoku-ben") | |
193 | + | ("tsb" "Tosa-ben") | |
194 | + | ("tsug" "Tsugaru-ben"))) | |
195 | + | ||
196 | + | (define (misc->info misc) | |
197 | + | (match misc | |
198 | + | ("abbr" "abbreviation") | |
199 | + | ("arch" "archaism") | |
200 | + | ("char" "character") | |
201 | + | ("chn" "children's language") | |
202 | + | ("col" "colloquialism") | |
203 | + | ("company" "company name") | |
204 | + | ("creat" "creature") | |
205 | + | ("dated" "dated term") | |
206 | + | ("dei" "deity") | |
207 | + | ("derog" "derogatory") | |
208 | + | ("doc" "document") | |
209 | + | ("ev" "event") | |
210 | + | ("fam" "familiar language") | |
211 | + | ("fem" "female term or language") | |
212 | + | ("fict" "fiction") | |
213 | + | ("form" "formal or literary term") | |
214 | + | ("given" "given name or forename, gender not specified") | |
215 | + | ("group" "group") | |
216 | + | ("hist" "historical term") | |
217 | + | ("hon" "honorific or respectful (sonkeigo) language") | |
218 | + | ("hum" "humble (kenjougo) language") | |
219 | + | ("id" "idiomatic expression") | |
220 | + | ("joc" "jocular, humorous term") | |
221 | + | ("leg" "legend") | |
222 | + | ("m-sl" "manga slang") | |
223 | + | ("male" "male term or language") | |
224 | + | ("myth" "mythology") | |
225 | + | ("net-sl" "Internet slang") | |
226 | + | ("obj" "object") | |
227 | + | ("obs" "obsolete term") | |
228 | + | ("obsc" "obscure term") | |
229 | + | ("on-mim" "onomatopoeic or mimetic word") | |
230 | + | ("organization" "organization name") | |
231 | + | ("oth" "other") | |
232 | + | ("person" "full name of a particular person") | |
233 | + | ("place" "place name") | |
234 | + | ("poet" "poetical term") | |
235 | + | ("pol" "polite (teineigo) language") | |
236 | + | ("product" "product name") | |
237 | + | ("proverb" "proverb") | |
238 | + | ("quote" "quotation") | |
239 | + | ("rare" "rare") | |
240 | + | ("relig" "religion") | |
241 | + | ("sens" "sensitive") | |
242 | + | ("serv" "service") | |
243 | + | ("sl" "slang") | |
244 | + | ("station" "railway station") | |
245 | + | ("surname" "family or surname") | |
246 | + | ("uk" "word usually written using kana alone") | |
247 | + | ("unclass" "unclassified name") | |
248 | + | ("vulg" "vulgar expression or word") | |
249 | + | ("work" "work of art, literature, music, etc. name") | |
250 | + | ("X" "rude or X-rated term") | |
251 | + | ("yoji" "yojijukugo"))) | |
252 | + | ||
253 | + | (define (pos->info pos) | |
254 | + | (match pos | |
255 | + | ("adj-f" "noun or verb acting prenominally") | |
256 | + | ("adj-i" "adjective (keiyoushi)") | |
257 | + | ("adj-ix" "adjective (keiyoushi) - yoi/ii class") | |
258 | + | ("adj-kari" "'kari' adjective (archaic)") | |
259 | + | ("adj-ku" "'ku' adjective (archaic)") | |
260 | + | ("adj-na" "adjectival nouns or quasi-adjectives (keiyodoshi)") | |
261 | + | ("adj-nari" "archaic/formal form of na-adjective") | |
262 | + | ("adj-no" "nouns which may take the genitive case particle 'no'") | |
263 | + | ("adj-pn" "pre-noun adjectival (rentaishi)") | |
264 | + | ("adj-shiku" "'shiku' adjective (archaic)") | |
265 | + | ("adj-t" "'taru' adjective") | |
266 | + | ("adv" "adverb (fukushi)") | |
267 | + | ("adv-to" "adverb taking the 'to' particle") | |
268 | + | ("aux" "auxiliary") | |
269 | + | ("aux-adj" "auxiliary adjective") | |
270 | + | ("aux-v" "auxiliary verb") | |
271 | + | ("conj" "conjunction") | |
272 | + | ("cop" "copula") | |
273 | + | ("ctr" "counter") | |
274 | + | ("exp" "expression") | |
275 | + | ("int" "interjection (kandoushi)") | |
276 | + | ("n" "noun (common) (futsuumeishi)") | |
277 | + | ("n-adv" "adverbial noun (fukushitekimeishi)") | |
278 | + | ("n-pr" "proper noun") | |
279 | + | ("n-pref" "noun, used as a prefix") | |
280 | + | ("n-suf" "noun, used as a suffix") | |
281 | + | ("n-t" "noun (temporal) (jisoumeishi)") | |
282 | + | ("num" "numeric") | |
283 | + | ("pn" "pronoun") | |
284 | + | ("pref" "prefix") | |
285 | + | ("prt" "particle") | |
286 | + | ("suf" "suffix") | |
287 | + | ("unc" "unclassified") | |
288 | + | ("v-unspec" "verb unspecified") | |
289 | + | ("v1" "Ichidan verb") | |
290 | + | ("v1-s" "Ichidan verb - kureru special class") | |
291 | + | ("v2a-s" "Nidan verb with 'u' ending (archaic)") | |
292 | + | ("v2b-k" "Nidan verb (upper class) with 'bu' ending (archaic)") | |
293 | + | ("v2b-s" "Nidan verb (lower class) with 'bu' ending (archaic)") | |
294 | + | ("v2d-k" "Nidan verb (upper class) with 'dzu' ending (archaic)") | |
295 | + | ("v2d-s" "Nidan verb (lower class) with 'dzu' ending (archaic)") | |
296 | + | ("v2g-k" "Nidan verb (upper class) with 'gu' ending (archaic)") | |
297 | + | ("v2g-s" "Nidan verb (lower class) with 'gu' ending (archaic)") | |
298 | + | ("v2h-k" "Nidan verb (upper class) with 'hu/fu' ending (archaic)") | |
299 | + | ("v2h-s" "Nidan verb (lower class) with 'hu/fu' ending (archaic)") | |
300 | + | ("v2k-k" "Nidan verb (upper class) with 'ku' ending (archaic)") | |
301 | + | ("v2k-s" "Nidan verb (lower class) with 'ku' ending (archaic)") | |
302 | + | ("v2m-k" "Nidan verb (upper class) with 'mu' ending (archaic)") | |
303 | + | ("v2m-s" "Nidan verb (lower class) with 'mu' ending (archaic)") | |
304 | + | ("v2n-s" "Nidan verb (lower class) with 'nu' ending (archaic)") | |
305 | + | ("v2r-k" "Nidan verb (upper class) with 'ru' ending (archaic)") | |
306 | + | ("v2r-s" "Nidan verb (lower class) with 'ru' ending (archaic)") | |
307 | + | ("v2s-s" "Nidan verb (lower class) with 'su' ending (archaic)") | |
308 | + | ("v2t-k" "Nidan verb (upper class) with 'tsu' ending (archaic)") | |
309 | + | ("v2t-s" "Nidan verb (lower class) with 'tsu' ending (archaic)") | |
310 | + | ("v2w-s" "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)") | |
311 | + | ("v2y-k" "Nidan verb (upper class) with 'yu' ending (archaic)") | |
312 | + | ("v2y-s" "Nidan verb (lower class) with 'yu' ending (archaic)") | |
313 | + | ("v2z-s" "Nidan verb (lower class) with 'zu' ending (archaic)") | |
314 | + | ("v4b" "Yodan verb with 'bu' ending (archaic)") | |
315 | + | ("v4g" "Yodan verb with 'gu' ending (archaic)") | |
316 | + | ("v4h" "Yodan verb with 'hu/fu' ending (archaic)") | |
317 | + | ("v4k" "Yodan verb with 'ku' ending (archaic)") | |
318 | + | ("v4m" "Yodan verb with 'mu' ending (archaic)") | |
319 | + | ("v4n" "Yodan verb with 'nu' ending (archaic)") | |
320 | + | ("v4r" "Yodan verb with 'ru' ending (archaic)") | |
321 | + | ("v4s" "Yodan verb with 'su' ending (archaic)") | |
322 | + | ("v4t" "Yodan verb with 'tsu' ending (archaic)") | |
323 | + | ("v5aru" "Godan verb - -aru special class") | |
324 | + | ("v5b" "Godan verb with 'bu' ending") | |
325 | + | ("v5g" "Godan verb with 'gu' ending") | |
326 | + | ("v5k" "Godan verb with 'ku' ending") | |
327 | + | ("v5k-s" "Godan verb - Iku/Yuku special class") | |
328 | + | ("v5m" "Godan verb with 'mu' ending") | |
329 | + | ("v5n" "Godan verb with 'nu' ending") | |
330 | + | ("v5r" "Godan verb with 'ru' ending") | |
331 | + | ("v5r-i" "Godan verb with 'ru' ending (irregular verb)") | |
332 | + | ("v5s" "Godan verb with 'su' ending") | |
333 | + | ("v5t" "Godan verb with 'tsu' ending") | |
334 | + | ("v5u" "Godan verb with 'u' ending") | |
335 | + | ("v5u-s" "Godan verb with 'u' ending (special class)") | |
336 | + | ("v5uru" "Godan verb - Uru old class verb (old form of Eru)") | |
337 | + | ("vi" "intransitive verb") | |
338 | + | ("vk" "Kuru verb - special class") | |
339 | + | ("vn" "irregular nu verb") | |
340 | + | ("vr" "irregular ru verb, plain form ends with -ri") | |
341 | + | ("vs" "noun or participle which takes the aux. verb suru") | |
342 | + | ("vs-c" "su verb - precursor to the modern suru") | |
343 | + | ("vs-i" "suru verb - included") | |
344 | + | ("vs-s" "suru verb - special class") | |
345 | + | ("vt" "transitive verb") | |
346 | + | ("vz" "Ichidan verb - zuru verb (alternative form of -jiru verbs)"))) | |
347 | + | ||
348 | + | (define (re_inf->info i) | |
349 | + | (match i | |
350 | + | ("gikun" "gikun (meaning as reading) or jukujikun (special kanji reading)") | |
351 | + | ("ik" "word containing irregular kana usage") | |
352 | + | ("ok" "out-dated or obsolete kana usage") | |
353 | + | ("uK" "word usually written using kanji alone"))) | |
354 | + | ||
355 | + | (define (sxml->element lst elem frq) | |
356 | + | (match elem | |
357 | + | ('ent_seq "") | |
358 | + | ('ke_pri "") | |
359 | + | ('re_nokanji "") | |
360 | + | ('re_pri "") | |
361 | + | ('ke_inf "") | |
362 | + | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
363 | + | (misc->info (car lst)) | |
364 | + | (throw 'invalid-misc lst))) | |
365 | + | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
366 | + | `(limit ,(car lst)) | |
367 | + | (throw 'invalid-re_restr lst))) | |
368 | + | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
369 | + | `(kanji ,(car lst)) | |
370 | + | (throw 'invalid-keb lst))) | |
371 | + | ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst)) | |
372 | + | `(info ,(apply string-append lst)) | |
373 | + | (throw 'invalid-s_inf lst))) | |
374 | + | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
375 | + | (dial->info (car lst)) | |
376 | + | (throw 'invalid-dial lst))) | |
377 | + | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
378 | + | (re_inf->info (car lst)) | |
379 | + | (throw 'invalid-re_inf lst))) | |
380 | + | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
381 | + | `(limit ,(car lst)) | |
382 | + | (throw 'invalid-stagk lst))) | |
383 | + | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
384 | + | `(limit ,(car lst)) | |
385 | + | (throw 'invalid-stagr lst))) | |
386 | + | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
387 | + | (field->info (car lst)) | |
388 | + | (throw 'invalid-field lst))) | |
389 | + | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
390 | + | `(ref ,(car lst)) | |
391 | + | (throw 'invalid-ant lst))) | |
392 | + | ('reb (if (and (= (length lst) 1) (string? (car lst))) | |
393 | + | `(reading ,(car lst)) | |
394 | + | (throw 'invalid-reb lst))) | |
395 | + | ('r_ele (sxml->reading lst)) | |
396 | + | ('k_ele (car (filter list? lst))) | |
397 | + | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
398 | + | (pos->info (car lst)) | |
399 | + | (throw 'invalid-pos lst))) | |
400 | + | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
401 | + | `(ref ,(car lst)) | |
402 | + | (throw 'invalid-xref lst))) | |
403 | + | ('gloss (cons | |
404 | + | `(gloss ,(apply string-append (filter string? lst))) | |
405 | + | (filter list? lst))) | |
406 | + | ('lsource (sxml->source lst)) | |
407 | + | ('sense (sxml->meaning lst)) | |
408 | + | ('entry (sxml->result lst frq)))) | |
409 | + | ||
410 | + | (define (create-parser frq) | |
411 | + | (ssax:make-parser | |
412 | + | NEW-LEVEL-SEED | |
413 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
414 | + | (map | |
415 | + | (match-lambda | |
416 | + | ((k . v) (list k v))) | |
417 | + | (filter | |
418 | + | (match-lambda | |
419 | + | ((k . v) (not (member k '(g_type))))) | |
420 | + | attributes))) | |
421 | + | ||
422 | + | FINISH-ELEMENT | |
423 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
424 | + | (if (equal? elem-gi 'JMdict) | |
425 | + | seed | |
426 | + | (let* ((seed (reverse seed)) | |
427 | + | (element (sxml->element seed elem-gi frq))) | |
428 | + | (cons element parent-seed)))) | |
429 | + | ||
430 | + | CHAR-DATA-HANDLER | |
431 | + | (lambda (string1 string2 seed) | |
432 | + | (cons (string-append string1 string2) seed)))) | |
433 | + | ||
434 | + | (define (xml->results port frq) | |
435 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
436 | + | (sort-results results))) |
modules/nani/result/result.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani result result) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (ice-9 match) | |
22 | + | #:use-module (rnrs bytevectors) | |
23 | + | #:use-module (srfi srfi-9) | |
24 | + | #:use-module (nani encoding serialize) | |
25 | + | #:use-module (nani encoding trie) | |
26 | + | #:use-module (nani encoding huffman) | |
27 | + | #:export (make-result | |
28 | + | result? | |
29 | + | result-position | |
30 | + | result-position-set! | |
31 | + | result-score | |
32 | + | result-kanjis | |
33 | + | result-readings | |
34 | + | result-meanings | |
35 | + | ||
36 | + | make-reading | |
37 | + | reading? | |
38 | + | reading-kanjis | |
39 | + | reading-info | |
40 | + | reading-readings | |
41 | + | ||
42 | + | make-meaning | |
43 | + | meaning? | |
44 | + | meaning-references | |
45 | + | meaning-limits | |
46 | + | meaning-sources | |
47 | + | meaning-infos | |
48 | + | meaning-glosses | |
49 | + | meaning-language | |
50 | + | ||
51 | + | make-source | |
52 | + | source? | |
53 | + | source-content | |
54 | + | source-wasei? | |
55 | + | source-lang | |
56 | + | ||
57 | + | update-result | |
58 | + | update-reading | |
59 | + | update-meaning | |
60 | + | update-source | |
61 | + | ||
62 | + | serialize-result result-size | |
63 | + | serialize-reading reading-size | |
64 | + | serialize-meaning meaning-size | |
65 | + | serialize-source source-size | |
66 | + | ||
67 | + | serialize-dictionary | |
68 | + | dictionary-entry-count | |
69 | + | sort-results)) | |
70 | + | ||
71 | + | (define-record-type result | |
72 | + | (make-result position score kanjis readings meanings) | |
73 | + | result? | |
74 | + | (position result-position result-position-set!) ; integer | |
75 | + | (score result-score) ; integer | |
76 | + | (kanjis result-kanjis) ; string-list | |
77 | + | (readings result-readings) ; reanding-list | |
78 | + | (meanings result-meanings)) ; meaning-list | |
79 | + | ||
80 | + | (define-record-type reading | |
81 | + | (make-reading kanjis info readings) | |
82 | + | reading? | |
83 | + | (kanjis reading-kanjis) ; string-list | |
84 | + | (info reading-info) ; string-list | |
85 | + | (readings reading-readings)) ; string-list | |
86 | + | ||
87 | + | (define-record-type meaning | |
88 | + | (make-meaning references limits sources infos glosses language) | |
89 | + | meaning? | |
90 | + | (references meaning-references) ; string-list | |
91 | + | (limits meaning-limits) ; string-list | |
92 | + | (sources meaning-sources) ; source-list | |
93 | + | (infos meaning-infos) ; string-list | |
94 | + | (glosses meaning-glosses) ; string-list | |
95 | + | (language meaning-language)) ; string | |
96 | + | ||
97 | + | (define-record-type source | |
98 | + | (make-source content wasei? lang) | |
99 | + | source? | |
100 | + | (content source-content) ; string-list | |
101 | + | (wasei? source-wasei?) ; boolean | |
102 | + | (lang source-lang)) ; string | |
103 | + | ||
104 | + | (define* (update-result result | |
105 | + | #:key (score (result-score result)) | |
106 | + | (kanjis (result-kanjis result)) | |
107 | + | (readings (result-readings result)) | |
108 | + | (meanings (result-meanings result))) | |
109 | + | (make-result (result-position result) score kanjis readings meanings)) | |
110 | + | ||
111 | + | (define* (update-reading reading | |
112 | + | #:key (kanjis (reading-kanjis reading)) | |
113 | + | (info (reading-info reading)) | |
114 | + | (readings (reading-readings reading))) | |
115 | + | (make-reading kanjis info readings)) | |
116 | + | ||
117 | + | (define* (update-meaning meaning | |
118 | + | #:key (references (meaning-references meaning)) | |
119 | + | (limits (meaning-limits meaning)) | |
120 | + | (sources (meaning-sources meaning)) | |
121 | + | (infos (meaning-infos meaning)) | |
122 | + | (glosses (meaning-glosses meaning)) | |
123 | + | (language (meaning-language meaning))) | |
124 | + | (make-meaning references limits sources infos glosses language)) | |
125 | + | ||
126 | + | (define* (update-source source | |
127 | + | #:key (content (source-content source)) | |
128 | + | (wasei? (source-wasei? source)) | |
129 | + | (lang (source-lang source))) | |
130 | + | (make-source content wasei? lang)) | |
131 | + | ||
132 | + | ;; Note how sources are not compressed, that's because they contain more weird characters | |
133 | + | ;; and represent only a very small fraction of the content, even after compression. | |
134 | + | (define (serialize-source source pos bv) | |
135 | + | (when (not (source? source)) (throw 'not-source source)) | |
136 | + | (let* ((pos ((serialize-list serialize-string) (source-content source) pos bv)) | |
137 | + | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
138 | + | (pos (serialize-string (source-lang source) pos bv))) | |
139 | + | pos)) | |
140 | + | (define (source-size source) | |
141 | + | (when (not (source? source)) (throw 'not-source source)) | |
142 | + | (+ ((list-size string-size) (source-content source)) | |
143 | + | (boolean-size (source-wasei? source)) | |
144 | + | (string-size (source-lang source)))) | |
145 | + | ||
146 | + | (define (serialize-reading reading-huffman-code) | |
147 | + | (lambda (reading pos bv) | |
148 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
149 | + | (let* ((pos ((serialize-list serialize-string) (reading-kanjis reading) pos bv)) | |
150 | + | (pos ((serialize-list serialize-string) (reading-info reading) pos bv)) | |
151 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman-code)) | |
152 | + | (reading-readings reading) pos bv))) | |
153 | + | pos))) | |
154 | + | (define (reading-size reading-huffman-code) | |
155 | + | (lambda (reading) | |
156 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
157 | + | (+ ((list-size string-size) (reading-kanjis reading)) | |
158 | + | ((list-size string-size) (reading-info reading)) | |
159 | + | ((list-size (huffman-string-size reading-huffman-code)) (reading-readings reading))))) | |
160 | + | ||
161 | + | (define (serialize-meaning meaning-huffman-code) | |
162 | + | (lambda (meaning pos bv) | |
163 | + | (when (not (meaning? meaning)) (throw 'not-meaning meaning)) | |
164 | + | (let* ((pos ((serialize-list serialize-string) (meaning-references meaning) pos bv)) | |
165 | + | (pos ((serialize-list serialize-string) (meaning-limits meaning) pos bv)) | |
166 | + | (pos ((serialize-list serialize-source) (meaning-sources meaning) pos bv)) | |
167 | + | (pos ((serialize-list (serialize-huffman-string meaning-huffman-code)) | |
168 | + | (meaning-infos meaning) pos bv)) | |
169 | + | (pos ((serialize-list (serialize-huffman-string meaning-huffman-code)) | |
170 | + | (meaning-glosses meaning) pos bv)) | |
171 | + | (pos (serialize-string (meaning-language meaning) pos bv))) | |
172 | + | pos))) | |
173 | + | (define (meaning-size meaning-huffman-code) | |
174 | + | (lambda (meaning) | |
175 | + | (when (not (meaning? meaning)) (throw 'not-meaning meaning)) | |
176 | + | (+ ((list-size string-size) (meaning-references meaning)) | |
177 | + | ((list-size string-size) (meaning-limits meaning)) | |
178 | + | ((list-size source-size) (meaning-sources meaning)) | |
179 | + | ((list-size (huffman-string-size meaning-huffman-code)) | |
180 | + | (meaning-infos meaning)) | |
181 | + | ((list-size (huffman-string-size meaning-huffman-code)) | |
182 | + | (meaning-glosses meaning)) | |
183 | + | (string-size (meaning-language meaning))))) | |
184 | + | ||
185 | + | (define (serialize-result kanji-huffman-code reading-huffman-code meaning-huffman-code) | |
186 | + | (lambda (result pos bv) | |
187 | + | (when (not (result? result)) (throw 'not-result result)) | |
188 | + | (result-position-set! result pos) | |
189 | + | (let* ((pos ((serialize-list (serialize-huffman-string kanji-huffman-code)) | |
190 | + | (result-kanjis result) pos bv)) | |
191 | + | (pos ((serialize-list (serialize-reading reading-huffman-code)) | |
192 | + | (result-readings result) pos bv)) | |
193 | + | (pos ((serialize-list (serialize-meaning meaning-huffman-code)) | |
194 | + | (result-meanings result) pos bv)) | |
195 | + | (pos (serialize-char (result-score result) pos bv))) | |
196 | + | pos))) | |
197 | + | (define (result-size kanji-huffman-code reading-huffman-code meaning-huffman-code) | |
198 | + | (lambda (result) | |
199 | + | (when (not (result? result)) (throw 'not-result result)) | |
200 | + | (+ ((list-size (huffman-string-size kanji-huffman-code)) (result-kanjis result)) | |
201 | + | ((list-size (reading-size reading-huffman-code)) (result-readings result)) | |
202 | + | ((list-size (meaning-size meaning-huffman-code)) (result-meanings result))))) | |
203 | + | ||
204 | + | ;; creating tries | |
205 | + | (define (make-key key) | |
206 | + | (apply append | |
207 | + | (map | |
208 | + | (lambda (c) | |
209 | + | (list (quotient c 16) (modulo c 16))) | |
210 | + | (bytevector->u8-list (string->utf8 key))))) | |
211 | + | ||
212 | + | (define (make-kanji-trie results) | |
213 | + | (let ((trie (make-empty-trie))) | |
214 | + | (let loop ((results results) (i 0)) | |
215 | + | (if (null? results) | |
216 | + | (compress-trie trie) | |
217 | + | (begin | |
218 | + | (for-each | |
219 | + | (lambda (key) | |
220 | + | (add-to-trie! trie (make-key key) i)) | |
221 | + | (result-kanjis (car results))) | |
222 | + | (loop (cdr results) (+ i 1))))))) | |
223 | + | ||
224 | + | (define (make-reading-trie results) | |
225 | + | (let ((trie (make-empty-trie))) | |
226 | + | (let loop ((results results) (i 0)) | |
227 | + | (if (null? results) | |
228 | + | (compress-trie trie) | |
229 | + | (begin | |
230 | + | (for-each | |
231 | + | (lambda (reading) | |
232 | + | (for-each | |
233 | + | (lambda (key) | |
234 | + | (add-to-trie! trie (make-key key) i)) | |
235 | + | (reading-readings reading))) | |
236 | + | (result-readings (car results))) | |
237 | + | (loop (cdr results) (+ i 1))))))) | |
238 | + | ||
239 | + | (define (make-meaning-trie results) | |
240 | + | (let ((trie (make-empty-trie))) | |
241 | + | (let loop ((results results) (i 0)) | |
242 | + | (if (null? results) | |
243 | + | (compress-trie trie) | |
244 | + | (begin | |
245 | + | (for-each | |
246 | + | (lambda (meaning) | |
247 | + | (for-each | |
248 | + | (lambda (key) | |
249 | + | (add-to-trie! trie (make-key key) i)) | |
250 | + | (meaning-glosses meaning))) | |
251 | + | (result-meanings (car results))) | |
252 | + | (loop (cdr results) (+ i 1))))))) | |
253 | + | ||
254 | + | (define (update-trie-pos! trie results) | |
255 | + | (let* ((vals (trie-vals trie)) | |
256 | + | (vals (map (lambda (i) (result-position (array-ref results i))) vals))) | |
257 | + | (trie-vals-set! trie vals)) | |
258 | + | (for-each | |
259 | + | (match-lambda | |
260 | + | ((char . child) | |
261 | + | (update-trie-pos! child results))) | |
262 | + | (trie-transitions trie))) | |
263 | + | ||
264 | + | (define (serialize-dictionary results) | |
265 | + | (define kanji-huffman | |
266 | + | (let ((kanjis (apply append (map result-kanjis results)))) | |
267 | + | (create-huffman kanjis))) | |
268 | + | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
269 | + | (define reading-huffman | |
270 | + | (let* ((readings (apply append (map result-readings results))) | |
271 | + | (readings (apply append (map reading-readings readings)))) | |
272 | + | (create-huffman readings))) | |
273 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
274 | + | (define meaning-huffman | |
275 | + | (let* ((meanings (apply append (map result-meanings results))) | |
276 | + | (infos (apply append (map meaning-infos meanings))) | |
277 | + | (glosses (apply append (map meaning-glosses meanings)))) | |
278 | + | (create-huffman (append infos glosses)))) | |
279 | + | (define meaning-huffman-code (huffman->code meaning-huffman)) | |
280 | + | ||
281 | + | (define (trie-node-size trie) | |
282 | + | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
283 | + | ||
284 | + | (let* ((header (string->utf8 "NANI_JMDICT003")) | |
285 | + | (header-size (bytevector-length header)) | |
286 | + | (pointers (make-bytevector 16 0)) | |
287 | + | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
288 | + | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
289 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
290 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
291 | + | (meaning-huffman-bv (serialize-huffman meaning-huffman)) | |
292 | + | (meaning-huffman-size (bytevector-length meaning-huffman-bv)) | |
293 | + | (serialize-trie (serialize-trie serialize-int int-size)) | |
294 | + | (trie-size (trie-size int-size)) | |
295 | + | (kanji-trie (make-kanji-trie results)) | |
296 | + | (kanji-trie-size (trie-size kanji-trie)) | |
297 | + | (reading-trie (make-reading-trie results)) | |
298 | + | (reading-trie-size (trie-size reading-trie)) | |
299 | + | (meaning-trie (make-meaning-trie results)) | |
300 | + | (meaning-trie-size (trie-size meaning-trie)) | |
301 | + | (trie-sizes (+ kanji-trie-size reading-trie-size meaning-trie-size)) | |
302 | + | (results-size | |
303 | + | ((list-size (result-size kanji-huffman-code reading-huffman-code | |
304 | + | meaning-huffman-code) | |
305 | + | #:size? #f) | |
306 | + | results)) | |
307 | + | (huffman-size (+ reading-huffman-size meaning-huffman-size kanji-huffman-size)) | |
308 | + | (pos-kanji (+ header-size 16 kanji-huffman-size reading-huffman-size | |
309 | + | meaning-huffman-size results-size 4)) | |
310 | + | (bv (make-bytevector (+ header-size 16 kanji-huffman-size | |
311 | + | reading-huffman-size | |
312 | + | meaning-huffman-size | |
313 | + | results-size 4 trie-sizes)))) | |
314 | + | (format #t "Number of nodes in kanjis: ~a~%" | |
315 | + | (trie-node-size kanji-trie)) | |
316 | + | (format #t "Number of nodes in readings: ~a~%" | |
317 | + | (trie-node-size reading-trie)) | |
318 | + | (format #t "Number of nodes in meanings: ~a~%" | |
319 | + | (trie-node-size meaning-trie)) | |
320 | + | ((serialize-list (serialize-result kanji-huffman-code reading-huffman-code | |
321 | + | meaning-huffman-code) | |
322 | + | #:size? #f) | |
323 | + | results (+ header-size 16 huffman-size) bv) | |
324 | + | ;; Serializing results also updated result-pos for each of them | |
325 | + | (let ((results (list->array 1 results))) | |
326 | + | (update-trie-pos! kanji-trie results) | |
327 | + | (update-trie-pos! reading-trie results) | |
328 | + | (update-trie-pos! meaning-trie results)) | |
329 | + | ;; number of entries | |
330 | + | (serialize-int (length results) (+ header-size 16 huffman-size results-size) | |
331 | + | bv) | |
332 | + | (let* ((results (list->array 1 results)) | |
333 | + | (pos pos-kanji) | |
334 | + | (pos (serialize-trie kanji-trie pos bv)) | |
335 | + | (pos-reading pos) | |
336 | + | (pos (serialize-trie reading-trie pos bv)) | |
337 | + | (pos-meaning pos) | |
338 | + | (pos (serialize-trie meaning-trie pos bv))) | |
339 | + | ;; Point to the trie structures | |
340 | + | (bytevector-u32-set! | |
341 | + | pointers 0 | |
342 | + | (+ header-size 16 huffman-size results-size (int-size 0)) | |
343 | + | (endianness big)) | |
344 | + | ;; point to the kanji trie structure | |
345 | + | (bytevector-u32-set! pointers 4 pos-kanji (endianness big)) | |
346 | + | ;; point to the reading trie structure | |
347 | + | (bytevector-u32-set! pointers 8 pos-reading (endianness big)) | |
348 | + | ;; point to the meaning trie structure | |
349 | + | (bytevector-u32-set! pointers 12 pos-meaning (endianness big)) | |
350 | + | ;; copy to result bytevector | |
351 | + | (bytevector-copy! header 0 bv 0 header-size) | |
352 | + | (bytevector-copy! pointers 0 bv header-size 16) | |
353 | + | (bytevector-copy! kanji-huffman-bv 0 bv (+ header-size 16) kanji-huffman-size) | |
354 | + | (bytevector-copy! reading-huffman-bv 0 bv | |
355 | + | (+ header-size 16 kanji-huffman-size) | |
356 | + | reading-huffman-size) | |
357 | + | (bytevector-copy! meaning-huffman-bv 0 bv | |
358 | + | (+ header-size 16 kanji-huffman-size reading-huffman-size) | |
359 | + | meaning-huffman-size) | |
360 | + | ;; give some feedback on the size of file's structures | |
361 | + | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
362 | + | (format #t "results is ~a bytes long~%" results-size) | |
363 | + | (format #t "kanji trie is ~a bytes long~%" kanji-trie-size) | |
364 | + | (format #t "reading trie is ~a bytes long~%" reading-trie-size) | |
365 | + | (format #t "meaning trie is ~a bytes long~%" meaning-trie-size) | |
366 | + | bv))) | |
367 | + | ||
368 | + | (define (dictionary-entry-count file) | |
369 | + | (call-with-input-file file | |
370 | + | (lambda (port) | |
371 | + | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
372 | + | (pointers (get-bytevector-n port 16)) | |
373 | + | (end-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
374 | + | (seek port (- end-pos 4) SEEK_SET) | |
375 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) | |
376 | + | ||
377 | + | (define (sort-results results) | |
378 | + | (define (get-string res) | |
379 | + | (if (null? (result-kanjis res)) | |
380 | + | (car (reading-readings (car (result-readings res)))) | |
381 | + | (car (result-kanjis res)))) | |
382 | + | (sort | |
383 | + | results | |
384 | + | (lambda (a b) | |
385 | + | (cond | |
386 | + | ((> (result-score a) (result-score b)) #t) | |
387 | + | ((= (result-score a) (result-score b)) | |
388 | + | (string>? (get-string a) (get-string b))))))) |
modules/nani/result/wadoku.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani result wadoku) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (ice-9 rdelim) | |
22 | + | #:use-module (nani result frequency) | |
23 | + | #:use-module (nani result result) | |
24 | + | #:use-module (srfi srfi-9) | |
25 | + | #:use-module (sxml ssax) | |
26 | + | #:export (xml->results)) | |
27 | + | ||
28 | + | (define (usg->infos lst) | |
29 | + | (let loop ((infos '()) (lst lst)) | |
30 | + | (if (null? lst) | |
31 | + | infos | |
32 | + | (loop | |
33 | + | (match (car lst) | |
34 | + | (('reg . reg) (cons reg infos)) | |
35 | + | (('type . type) | |
36 | + | (if (equal? type "abrev") | |
37 | + | (cons type infos) | |
38 | + | infos)) | |
39 | + | ((? string? info) (cons info infos)) | |
40 | + | (_ infos)) | |
41 | + | (cdr lst))))) | |
42 | + | ||
43 | + | (define (ruigo->ref lst) | |
44 | + | (let loop ((ref #f) (lst lst)) | |
45 | + | (match lst | |
46 | + | (() ref) | |
47 | + | ((('id id) lst ...) | |
48 | + | (loop id lst)) | |
49 | + | (((a . b) lst ...) | |
50 | + | (loop (loop ref (list (list a b))) lst)) | |
51 | + | (((? list? l) lst ...) | |
52 | + | (loop (loop ref l) lst))))) | |
53 | + | ||
54 | + | (define (merge-meanings s1 s2) | |
55 | + | (update-meaning s2 | |
56 | + | #:references (append (meaning-references s1) (meaning-references s2)) | |
57 | + | #:infos (append (meaning-infos s1) (meaning-infos s2)) | |
58 | + | #:glosses (append (meaning-glosses s1) (meaning-glosses s2)))) | |
59 | + | ||
60 | + | (define (sxml->meaning lst) | |
61 | + | (let loop ((meaning (make-meaning '() '() '() '() '() "ger")) (lst lst)) | |
62 | + | (if (null? lst) | |
63 | + | meaning | |
64 | + | (loop | |
65 | + | (match (car lst) | |
66 | + | (#f meaning) | |
67 | + | ((? meaning? s) | |
68 | + | (merge-meanings s meaning)) | |
69 | + | ((? source? s) | |
70 | + | (update-meaning meaning #:sources (cons s (meaning-sources meaning)))) | |
71 | + | (('ref (? string? r)) | |
72 | + | (update-meaning meaning #:references (cons r (meaning-references meaning)))) | |
73 | + | (('info (? string? r)) | |
74 | + | (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
75 | + | (('infos (? string? r)) | |
76 | + | (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
77 | + | (('infos (? list? r)) | |
78 | + | (update-meaning meaning #:infos (append r (meaning-infos meaning)))) | |
79 | + | (('trans (? string? r)) | |
80 | + | (update-meaning meaning #:glosses (cons (string-downcase r) | |
81 | + | (meaning-glosses meaning)))) | |
82 | + | (('related . _) meaning) | |
83 | + | (('transcr . _) meaning) | |
84 | + | (('pitch . _) meaning) | |
85 | + | (((? symbol? s) v) | |
86 | + | (throw 'unknown-symbol s v)) | |
87 | + | ((? list? l) (loop meaning l)) | |
88 | + | ((? string? _) meaning)) | |
89 | + | (cdr lst))))) | |
90 | + | ||
91 | + | ;; TODO | |
92 | + | (define (sxml->ref lst) | |
93 | + | (define (sxml->ref-name lst) | |
94 | + | (let loop ((result #f) (lst lst)) | |
95 | + | (if (null? lst) | |
96 | + | result | |
97 | + | (loop | |
98 | + | (match (car lst) | |
99 | + | (('id . id) id) | |
100 | + | (((? symbol? s) . _) result) | |
101 | + | (((? symbol? s) _) result) | |
102 | + | ((? list? l) (loop result l)) | |
103 | + | (_ result)) | |
104 | + | (cdr lst))))) | |
105 | + | (let ((ref (sxml->ref-name lst))) | |
106 | + | (if (string? ref) | |
107 | + | `(ref ,ref) | |
108 | + | (throw 'no-ref ref 'from lst)))) | |
109 | + | ||
110 | + | (define (sxml->source lst) | |
111 | + | (let loop ((source (make-source '() #f "")) (lst lst)) | |
112 | + | (if (null? lst) | |
113 | + | source | |
114 | + | (loop | |
115 | + | (match (car lst) | |
116 | + | (('impli impli) (update-source source #:lang impli)) | |
117 | + | (('foreign foreign) | |
118 | + | (update-source source | |
119 | + | #:content (if (list? foreign) | |
120 | + | (append foreign (source-content source)) | |
121 | + | (cons foreign (source-content source))))) | |
122 | + | ((? list? l) (loop source l)) | |
123 | + | (_ source)) | |
124 | + | (cdr lst))))) | |
125 | + | ||
126 | + | (define (sxml->reading lst) | |
127 | + | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
128 | + | (if (null? lst) | |
129 | + | reading | |
130 | + | (loop | |
131 | + | (match (car lst) | |
132 | + | (('reading r) | |
133 | + | (update-reading reading #:readings (cons r (reading-readings reading)))) | |
134 | + | (_ reading)) | |
135 | + | (cdr lst))))) | |
136 | + | ||
137 | + | (define (sxml->result sxml frq) | |
138 | + | (define (sxml->result-aux sxml) | |
139 | + | (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml)) | |
140 | + | (if (null? lst) | |
141 | + | result | |
142 | + | (match (car lst) | |
143 | + | (('kanji kanji) | |
144 | + | (loop | |
145 | + | (update-result result #:kanjis (cons kanji (result-kanjis result))) | |
146 | + | last-source (cdr lst))) | |
147 | + | ((? reading? reading) | |
148 | + | (loop | |
149 | + | (update-result result #:readings (cons reading (result-readings result))) | |
150 | + | last-source (cdr lst))) | |
151 | + | ((? meaning? meaning) | |
152 | + | (loop | |
153 | + | (update-result result | |
154 | + | #:meanings | |
155 | + | (cons | |
156 | + | (if last-source | |
157 | + | (update-meaning meaning | |
158 | + | #:sources (cons last-source (meaning-sources meaning))) | |
159 | + | meaning) | |
160 | + | (result-meanings result))) | |
161 | + | last-source (cdr lst))) | |
162 | + | ((? list? l) | |
163 | + | (loop (loop result last-source l) last-source (cdr lst))) | |
164 | + | (_ (loop result last-source (cdr lst))))))) | |
165 | + | (let* ((result (sxml->result-aux sxml)) | |
166 | + | (word (if (null? (result-kanjis result)) | |
167 | + | (car (reading-readings (car (result-readings result)))) | |
168 | + | (car (result-kanjis result)))) | |
169 | + | (score (frequency->score frq word)) | |
170 | + | (meanings (result-meanings result))) | |
171 | + | (update-result result | |
172 | + | #:score score | |
173 | + | #:meanings meanings))) | |
174 | + | ||
175 | + | (define (meishi->info lst) | |
176 | + | (match (assoc-ref lst 'suru) | |
177 | + | ("both" "N., mit suru intrans. od. trans.") | |
178 | + | ("intrans" "N., mit suru intrans.") | |
179 | + | ("trans" "N., mit suru trans."))) | |
180 | + | ||
181 | + | (define (keiyoushi->info lst) | |
182 | + | (cond | |
183 | + | ((equal? (assoc-ref lst 'ku) "true") | |
184 | + | "Adj. auf ???ku") | |
185 | + | ((equal? (assoc-ref lst 'shiku) "true") | |
186 | + | "Adj. auf -shiku"))) | |
187 | + | ||
188 | + | (define (keiyoudoushi->info lst) | |
189 | + | (if (equal? (assoc-ref lst 'nari) "true") | |
190 | + | "Na.???Adj. mit nari" | |
191 | + | "Na.???Adj. mit na od. no")) | |
192 | + | ||
193 | + | (define (fukushi->info lst) | |
194 | + | (define (get attr) | |
195 | + | (match (assoc-ref lst attr) | |
196 | + | ("false" #f) | |
197 | + | ("true" #t) | |
198 | + | (v v))) | |
199 | + | (string-append | |
200 | + | "Adv" | |
201 | + | (if (get 'ni) ", mit ni und Adn. mit naru" "") | |
202 | + | (if (get 'to) | |
203 | + | (if (get 'naru) | |
204 | + | ", mit to und Adn. mit taru" | |
205 | + | ", mit to") | |
206 | + | "") | |
207 | + | (match (get 'suru) | |
208 | + | ("intrans" ", mit suru intrans. V.") | |
209 | + | ("trans" ", mit suru trans. V.") | |
210 | + | ("both" ", mit suru trans. od. intrans. V.") | |
211 | + | (_ "")))) | |
212 | + | ||
213 | + | (define (doushi->info lst) | |
214 | + | (define transitivity-str | |
215 | + | (match (assoc-ref lst 'transitivity) | |
216 | + | ("intrans" "intrans.") | |
217 | + | ("trans" "trans.") | |
218 | + | ("both" "trans. od. intrans."))) | |
219 | + | ||
220 | + | (define onbin (assoc-ref lst 'onbin)) | |
221 | + | ||
222 | + | (match (assoc-ref lst 'level) | |
223 | + | ("kuru" (string-append "unregelm. " transitivity-str " V. auf ka")) | |
224 | + | ("ra" (string-append transitivity-str " V. auf -ra")) | |
225 | + | ("suru" (string-append transitivity-str " V. auf -suru")) | |
226 | + | ("1e" (string-append "1???st. " transitivity-str " V. auf -e")) | |
227 | + | ("1i" (string-append "1???st. " transitivity-str " V. auf -i")) | |
228 | + | ("2e" (string-append "2???st. " transitivity-str " V. auf -e bzw. -u")) | |
229 | + | ("2i" (string-append "2???st. " transitivity-str " V. auf -i bzw. -u")) | |
230 | + | ("4" (string-append "4???st. " transitivity-str " V.")) | |
231 | + | ("5" (string-append "5-st. " transitivity-str " V." | |
232 | + | (match (assoc-ref lst 'godanrow) | |
233 | + | ("ba" (if onbin | |
234 | + | " auf -ba mit regelm. Nasal-Onbin = ???nde" | |
235 | + | " auf -ba")) | |
236 | + | ("ga" (if onbin | |
237 | + | " auf -ga mit regelm. i-Onbin = ???ide" | |
238 | + | " auf -ga")) | |
239 | + | ("ka_i_yu" (if onbin | |
240 | + | " auf -ka mit Geminaten-Onbin = ???tte" | |
241 | + | " auf -ka")) | |
242 | + | ("ka" (if onbin | |
243 | + | " auf -ka mit i-Onbin = ???ite" | |
244 | + | " auf -ka")) | |
245 | + | ("ma" (if onbin | |
246 | + | " auf -ma regelm. Nasal-Onbin = ???nde" | |
247 | + | " auf -ma")) | |
248 | + | ("na" (if onbin | |
249 | + | " auf -na mit regelm. Nasal-Onbin = ???nde" | |
250 | + | " auf -na")) | |
251 | + | ("ra_i" (if onbin | |
252 | + | " auf -ra, Sonderform mit Reny????kei ???i" | |
253 | + | " auf -ra")) | |
254 | + | ("ra" (if onbin | |
255 | + | " auf -ra mit regelm. Geminaten-Onbin = ???tte" | |
256 | + | " auf -ra")) | |
257 | + | ("sa" " auf -sa") | |
258 | + | ("ta" (if onbin | |
259 | + | " auf -ta mit regelm. Geminaten-Onbin = ???tte" | |
260 | + | " auf -ta")) | |
261 | + | ("wa" (if onbin | |
262 | + | " auf -[w]a mit Geminaten-Onbin = ???tte" | |
263 | + | " auf -[w]a")) | |
264 | + | ("wa_o" (if onbin | |
265 | + | " auf -[w]a mit u-Onbin = ?????/??te" | |
266 | + | " auf -[w]a")) | |
267 | + | (#f "")))))) | |
268 | + | ||
269 | + | (define (gram->info lst) | |
270 | + | (map | |
271 | + | (lambda (gram) | |
272 | + | (match gram | |
273 | + | (('daimeishi) "Pron.") | |
274 | + | (('doushi doushi ...) (doushi->info doushi)) | |
275 | + | (('fukujoshi) "adv. Part.") | |
276 | + | (('fukushi) "Adv.") | |
277 | + | (('fukushi fukushi ...) (fukushi->info fukushi)) | |
278 | + | (('jodoushi) "Hilfsv.") | |
279 | + | (('joshi) "Part.") | |
280 | + | (('kakarijoshi) "Themenpart.") | |
281 | + | (('kakujoshi) "Kasuspart.") | |
282 | + | (('kandoushi) "Interj.") | |
283 | + | (('kanji) "Kanji") | |
284 | + | (('keiyoudoushi) "Na.???Adj.") | |
285 | + | (('keiyoudoushi keiyoudoushi ...) (keiyoudoushi->info keiyoudoushi)) | |
286 | + | (('keiyoushi) "Adj.") | |
287 | + | (('keiyoushi keiyoushi ...) (keiyoushi->info keiyoushi)) | |
288 | + | (('meishi) "N.") | |
289 | + | (('meishi meishi ...) (meishi->info meishi)) | |
290 | + | (('prefix) "Pr??f.") | |
291 | + | (('rengo) "Zus.") | |
292 | + | (('rentaishi) "Adn") | |
293 | + | (('setsuzokujoshi) "konjunktionale Part.") | |
294 | + | (('setsuzokushi) "Konj.") | |
295 | + | (('shuujoshi) "satzbeendende Part.") | |
296 | + | (('specialcharacter) "Sonderzeichen") | |
297 | + | (('suffix) "Suff.") | |
298 | + | (('wordcomponent) "Wortkomp."))) | |
299 | + | lst)) | |
300 | + | ||
301 | + | (define (sxml->string lst) | |
302 | + | (define (sub-loop loop infos result lst l) | |
303 | + | (let ((result (loop infos result l))) | |
304 | + | (if (list? result) | |
305 | + | (loop (append infos (filter list? result)) | |
306 | + | (apply string-append (filter string? result)) | |
307 | + | lst) | |
308 | + | (loop infos result lst)))) | |
309 | + | ||
310 | + | (let loop ((infos '()) (result "") (lst lst)) | |
311 | + | (match lst | |
312 | + | (() (if (null? infos) | |
313 | + | result | |
314 | + | (append infos (list result)))) | |
315 | + | ((? string? s) | |
316 | + | (loop infos (string-append result s) '())) | |
317 | + | (((? string? s) lst ...) | |
318 | + | (loop infos (string-append result s) lst)) | |
319 | + | ((('prior . _) lst ...) | |
320 | + | (loop infos result lst)) | |
321 | + | ((('options . _) lst ...) | |
322 | + | (loop infos result lst)) | |
323 | + | ((('firstname . _) lst ...) | |
324 | + | (loop infos result lst)) | |
325 | + | ((('ausn . _) lst ...) | |
326 | + | (loop infos result lst)) | |
327 | + | ((('lang . _) lst ...) | |
328 | + | (loop infos result lst)) | |
329 | + | ((('meta . _) lst ...) | |
330 | + | (loop infos result lst)) | |
331 | + | ((('genki . genki) lst ...) | |
332 | + | ;(loop (cons `(info ,genki) infos) result lst)) | |
333 | + | (loop infos result lst)) | |
334 | + | ((('jlpt . jlpt) lst ...) | |
335 | + | (loop (cons `(info ,(string-append "jlpt-" jlpt)) infos) result lst)) | |
336 | + | ((('ref . (? string? ref)) lst ...) | |
337 | + | (loop (cons `(ref ,ref) infos) result lst)) | |
338 | + | ((('ref (? string? ref)) lst ...) | |
339 | + | (loop (cons `(ref ,ref) infos) result lst)) | |
340 | + | ((('jap l) lst ...) | |
341 | + | (sub-loop loop infos result lst l)) | |
342 | + | ((('foreign l) lst ...) | |
343 | + | (sub-loop loop infos result lst l)) | |
344 | + | ((('transcr l) lst ...) | |
345 | + | (sub-loop loop infos result lst l)) | |
346 | + | ((((? symbol? s) v) lst ...) | |
347 | + | (throw 'unsupported-symbol s v)) | |
348 | + | ((((? symbol? s) . v) lst ...) | |
349 | + | (throw 'unsupported-symbol-pair s v)) | |
350 | + | (((? list? l) lst ...) | |
351 | + | (sub-loop loop infos result lst l))))) | |
352 | + | ||
353 | + | (define (sxml->element lst elem frq) | |
354 | + | (let ((elem (match elem | |
355 | + | ((_ . elem) elem) | |
356 | + | (_ elem)))) | |
357 | + | (match elem | |
358 | + | ('orth (let ((kanji (filter string? lst))) | |
359 | + | (if (null? kanji) | |
360 | + | #f | |
361 | + | `(kanji | |
362 | + | ,(string-filter | |
363 | + | (lambda (c) | |
364 | + | ;; Remove characters that interfere with kanji | |
365 | + | ;; based search | |
366 | + | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
367 | + | #\?? #\???)))) | |
368 | + | (car kanji)))))) | |
369 | + | ('count #f) | |
370 | + | ('entry (sxml->result lst frq)) | |
371 | + | ('hira `(reading ,(car lst))) | |
372 | + | ('hatsuon `(hatsuon ,(car lst))) | |
373 | + | ('accent `(pitch ,(car lst))) | |
374 | + | ('reading (sxml->reading lst)) | |
375 | + | ('form lst) | |
376 | + | ('impli `(impli ,(car lst))) | |
377 | + | ('text (let loop ((text "") (lst lst)) | |
378 | + | (match lst | |
379 | + | (() text) | |
380 | + | ((('hasPrecedingSpace . _) lst ...) | |
381 | + | (loop (string-append " " text) lst)) | |
382 | + | ((('hasFollowingSpace . _) lst ...) | |
383 | + | (string-append (loop text lst) " ")) | |
384 | + | (((? string? s) lst ...) | |
385 | + | (loop (string-append text s) lst))))) | |
386 | + | ('famn (sxml->string lst)) | |
387 | + | ('expl (sxml->string lst)) | |
388 | + | ('expli (filter list? lst)) | |
389 | + | ('abbrev (filter list? lst)) | |
390 | + | ('token (filter string? lst)) | |
391 | + | ('tr (sxml->string lst)) | |
392 | + | ('transcr `(transcr ,lst)) | |
393 | + | ('trans | |
394 | + | (append (filter list? lst) | |
395 | + | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
396 | + | ('jap `(jap ,lst)) | |
397 | + | ('emph lst) | |
398 | + | ('title (filter string? lst)) | |
399 | + | ('transl (filter string? lst)) | |
400 | + | ('topic (filter string? lst)) | |
401 | + | ('iron (filter string? lst)) | |
402 | + | ('specchar (filter string? lst)) | |
403 | + | ('scientif (filter string? lst)) | |
404 | + | ('wikide #f) | |
405 | + | ('wikija #f) | |
406 | + | ('link #f) | |
407 | + | ('ref (sxml->ref lst)) | |
408 | + | ('sref (sxml->ref lst)) | |
409 | + | ('etym (sxml->source lst)) | |
410 | + | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
411 | + | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
412 | + | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
413 | + | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
414 | + | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
415 | + | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
416 | + | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
417 | + | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
418 | + | ('usg `(infos . ,(usg->infos lst))) | |
419 | + | ('sense (sxml->meaning lst)) | |
420 | + | ('steinhaus (let ((ref (sxml->string lst))) | |
421 | + | (if (string? ref) | |
422 | + | `(ref ,(sxml->string lst)) | |
423 | + | (throw 'not-steinhaus ref)))) | |
424 | + | ('pos '()); TODO: actually find what info to use | |
425 | + | ('wordcomponent (cons 'wordcomponent lst)) | |
426 | + | ('meishi (cons 'meishi lst)) | |
427 | + | ('setsuzokushi (cons 'setsuzokushi lst)) | |
428 | + | ('daimeishi (cons 'daimeishi lst)) | |
429 | + | ('doushi (cons 'doushi lst)) | |
430 | + | ('kandoushi (cons 'kandoushi lst)) | |
431 | + | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
432 | + | ('keiyoushi (cons 'keiyoushi lst)) | |
433 | + | ('fukushi (cons 'fukushi lst)) | |
434 | + | ('rengo (cons 'rengo lst)) | |
435 | + | ('suffix (cons 'suffix lst)) | |
436 | + | ('prefix (cons 'prefix lst)) | |
437 | + | ('kanji (cons 'kanji lst)) | |
438 | + | ('rentaishi (cons 'rentaishi lst)) | |
439 | + | ('specialcharacter (cons 'specialcharacter lst)) | |
440 | + | ('joshi (cons 'joshi lst)) | |
441 | + | ('fukujoshi (cons 'fukujoshi lst)) | |
442 | + | ('kakujoshi (cons 'kakujoshi lst)) | |
443 | + | ('kakarijoshi (cons 'kakarijoshi lst)) | |
444 | + | ('shuujoshi (cons 'shuujoshi lst)) | |
445 | + | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
446 | + | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
447 | + | ('jodoushi (cons 'jodoushi lst)) | |
448 | + | ('ruigos lst) | |
449 | + | ('ruigo (ruigo->ref lst)) | |
450 | + | ('gramGrp (gram->info lst))))) | |
451 | + | ||
452 | + | (define (create-parser frq) | |
453 | + | (ssax:make-parser | |
454 | + | NEW-LEVEL-SEED | |
455 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
456 | + | attributes) | |
457 | + | ||
458 | + | FINISH-ELEMENT | |
459 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
460 | + | (if (equal? elem-gi 'entries) | |
461 | + | seed | |
462 | + | (let* ((seed (reverse seed)) | |
463 | + | (element (sxml->element seed elem-gi frq))) | |
464 | + | (cons element parent-seed)))) | |
465 | + | ||
466 | + | CHAR-DATA-HANDLER | |
467 | + | (lambda (string1 string2 seed) | |
468 | + | (cons (string-append string1 string2) seed)))) | |
469 | + | ||
470 | + | (define (xml->results port frq) | |
471 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
472 | + | (sort-results results))) |
modules/nani/serialize.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani serialize) | |
20 | - | #:use-module (rnrs bytevectors) | |
21 | - | #:export (merge-bvs | |
22 | - | serialize-list list-size | |
23 | - | serialize-char char-size | |
24 | - | serialize-int int-size | |
25 | - | serialize-boolean boolean-size | |
26 | - | serialize-string string-size)) | |
27 | - | ||
28 | - | (define (merge-bvs bvs) | |
29 | - | (let* ((size (apply + (map bytevector-length bvs))) | |
30 | - | (bv (make-bytevector size 0))) | |
31 | - | (let loop ((bvs bvs) (pos 0)) | |
32 | - | (unless (null? bvs) | |
33 | - | (let ((sz (bytevector-length (car bvs)))) | |
34 | - | (bytevector-copy! (car bvs) 0 bv pos sz) | |
35 | - | (loop (cdr bvs) (+ pos sz))))) | |
36 | - | bv)) | |
37 | - | ||
38 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
39 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
40 | - | (when size? | |
41 | - | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
42 | - | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
43 | - | (if (null? lst) | |
44 | - | pos | |
45 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
46 | - | (define* (list-size lst size #:key (size? #t)) | |
47 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
48 | - | (apply + (if size? 2 0) (map size lst))) | |
49 | - | ||
50 | - | (define (serialize-char int pos bv) | |
51 | - | (bytevector-u8-set! bv pos int) | |
52 | - | (+ pos 1)) | |
53 | - | (define char-size (const 1)) | |
54 | - | ||
55 | - | (define (serialize-int int pos bv) | |
56 | - | (bytevector-u32-set! bv pos int (endianness big)) | |
57 | - | (+ pos 4)) | |
58 | - | (define int-size (const 4)) | |
59 | - | ||
60 | - | (define (serialize-boolean bool pos bv) | |
61 | - | (bytevector-u8-set! bv pos (if bool 1 0)) | |
62 | - | (+ pos 1)) | |
63 | - | (define boolean-size (const 1)) | |
64 | - | ||
65 | - | (define (serialize-string str pos bv) | |
66 | - | (let ((sbv (string->utf8 str))) | |
67 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
68 | - | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
69 | - | (+ pos 1 (bytevector-length sbv)))) | |
70 | - | (define (string-size str) | |
71 | - | (let ((sbv (string->utf8 str))) | |
72 | - | (+ 1 (bytevector-length sbv)))) |
modules/nani/tags.scm unknown status 2
1 | - | (define-module (nani tags) | |
2 | - | #:use-module (nani result) | |
3 | - | #:export (get-tag get-points)) | |
4 | - | ||
5 | - | (define tags | |
6 | - | '("MA" "X" "abbr" "adj-i" "adj-ix" "adj-na" "adj-no" "adj-pn" "adj-t" "adj-f" | |
7 | - | "adv" "adv-to" "arch" "ateji" "aux" "aux-v" "aux-adj" "Buddh" "chem" "chn" | |
8 | - | "col" "comp" "conj" "cop" "ctr" "derog" "eK" "ek" "exp" "fam" "fem" "food" | |
9 | - | "geom" "gikun" "hon" "hum" "iK" "id" "ik" "int" "io" "iv" "ling" "m-sl" | |
10 | - | "male" "male-sl" "math" "meteor" "mil" "n" "n-adv" "n-suf" "n-pref" "n-t" | |
11 | - | "num" "oK" "obs" "obsc" "ok" "oik" "on-mim" "pn" "poet" "pol" "pref" "proverb" | |
12 | - | "prt" "physics" "quote" "rare" "sens" "sl" "suf" "uK" "uk" "unc" "yoji" | |
13 | - | "v1" "v1-s" "v2a-s" "v4h" "v4r" "v5aru" "v5b" "v5g" "v5k" "v5k-s" "v5m" | |
14 | - | "v5n" "v5r" "v5r-i" "v5s" "v5t" "v5u" "v5u-s" "v5uru" "vz" "vi" "vk" "vn" | |
15 | - | "vr" "vs" "vs-c" "vs-s" "vs-i" "kyb" "osb" "ksb" "ktb" "tsb" "thb" "tsug" | |
16 | - | "kyu" "rkb" "nab" "hob" "vt" "vulg" "adj-kari" "adj-ku" "adj-shiku" | |
17 | - | "adj-nari" "n-pr" "v-unspec" "v4k" "v4g" "v4s" "v4t" "v4n" "v4b" "v4m" | |
18 | - | "v2k-k" "v2g-k" "v2t-k" "v2d-k" "v2h-k" "v2b-k" "v2m-k" "v2y-k" "v2r-k" | |
19 | - | "v2k-s" "v2g-s" "v2s-s" "v2z-s" "v2t-s" "v2d-s" "v2n-s" "v2h-s" "v2b-s" | |
20 | - | "v2m-s" "v2y-s" "v2r-s" "v2w-s" "archit" "astron" "baseb" "biol" "bot" | |
21 | - | "bus" "econ" "engr" "finc" "geol" "law" "mahj" "med" "music" "Shinto" | |
22 | - | "shogi" "sports" "sumo" "zool" "joc" "anat" "Christn" "net-sl" "dated" | |
23 | - | "hist" "litf" "surname" "place" "unclass" "company" "product" "work" | |
24 | - | "person" "given" "station" "organization" | |
25 | - | ||
26 | - | ;; wadoku special | |
27 | - | "young" "thief" "secret" "baby" "stud" "sail" "fashion" "archeo" | |
28 | - | "rel" "color" "psy" "print" "aero" "politics" "anth" "biblio" | |
29 | - | "game" "agri" "alco" "demo" "train" "philo" "ethno" "photo" "trans" | |
30 | - | "telecom" "build" "sci" "school" "art" "hobby" | |
31 | - | ||
32 | - | "season-spring" "season-summer" "season-winter" "season-autumn" | |
33 | - | "season-newyear" | |
34 | - | ||
35 | - | "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5" | |
36 | - | ||
37 | - | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
38 | - | ||
39 | - | (define (get-tag tag) | |
40 | - | (let ((val (member tag tags))) | |
41 | - | (if val (- (length val) 1) (throw 'unknown-tag tag)))) | |
42 | - | ||
43 | - | (define (get-points result) | |
44 | - | (define (tag-point tag) | |
45 | - | (cond | |
46 | - | ;; frequency | |
47 | - | ((eq? tag (get-tag "frq500")) | |
48 | - | 32) | |
49 | - | ((eq? tag (get-tag "frq1000")) | |
50 | - | 16) | |
51 | - | ((eq? tag (get-tag "frq2000")) | |
52 | - | 8) | |
53 | - | ((eq? tag (get-tag "frq5000")) | |
54 | - | 4) | |
55 | - | ((eq? tag (get-tag "frq10000")) | |
56 | - | 2) | |
57 | - | ((eq? tag (get-tag "frq20000")) | |
58 | - | 1) | |
59 | - | ;; archaic term | |
60 | - | ((eq? tag (get-tag "arch")) | |
61 | - | -3) | |
62 | - | ;; children language | |
63 | - | ((eq? tag (get-tag "chn")) | |
64 | - | -1) | |
65 | - | ;; colloquialism | |
66 | - | ((eq? tag (get-tag "col")) | |
67 | - | 5) | |
68 | - | ;; expression | |
69 | - | ((eq? tag (get-tag "exp")) | |
70 | - | 1) | |
71 | - | ;; sonkeigo (honorific or respectful) | |
72 | - | ((eq? tag (get-tag "hon")) | |
73 | - | -1) | |
74 | - | ;; kenjougo (humble) | |
75 | - | ((eq? tag (get-tag "hum")) | |
76 | - | -1) | |
77 | - | ;; teineigo (polite) | |
78 | - | ((eq? tag (get-tag "pol")) | |
79 | - | -1) | |
80 | - | ;; irregular kanji | |
81 | - | ((eq? tag (get-tag "iK")) | |
82 | - | -2) | |
83 | - | ;; idiomatic expression | |
84 | - | ((eq? tag (get-tag "id")) | |
85 | - | 3) | |
86 | - | ;; slang | |
87 | - | ((member tag (list | |
88 | - | (get-tag "sl") | |
89 | - | (get-tag "m-sl") | |
90 | - | (get-tag "male-sl"))) | |
91 | - | -4) | |
92 | - | ;; outdated kanji | |
93 | - | ((eq? tag (get-tag "oK")) | |
94 | - | -2) | |
95 | - | ;; obsolete term | |
96 | - | ((eq? tag (get-tag "obs")) | |
97 | - | -5) | |
98 | - | ;; obscure term | |
99 | - | ((eq? tag (get-tag "obsc")) | |
100 | - | -10) | |
101 | - | ;; onomatopea | |
102 | - | ((eq? tag (get-tag "on-mim")) | |
103 | - | 1) | |
104 | - | ;; poetical term | |
105 | - | ((eq? tag (get-tag "poet")) | |
106 | - | -1) | |
107 | - | ((eq? tag (get-tag "proverb")) | |
108 | - | 1) | |
109 | - | ((eq? tag (get-tag "rare")) | |
110 | - | -3) | |
111 | - | ;; sensitive | |
112 | - | ((eq? tag (get-tag "sens")) | |
113 | - | -2) | |
114 | - | ;; dialect | |
115 | - | ((member tag (list | |
116 | - | (get-tag "kyb") | |
117 | - | (get-tag "osb") | |
118 | - | (get-tag "ksb") | |
119 | - | (get-tag "ktb") | |
120 | - | (get-tag "tsb") | |
121 | - | (get-tag "thb") | |
122 | - | (get-tag "tsug") | |
123 | - | (get-tag "kyu") | |
124 | - | (get-tag "rkb") | |
125 | - | (get-tag "nab") | |
126 | - | (get-tag "hob"))) | |
127 | - | -1) | |
128 | - | (else 0))) | |
129 | - | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
130 | - | (apply + (map tag-point tags)))) |
modules/nani/trie.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani trie) | |
20 | - | #:use-module (nani serialize) | |
21 | - | #:use-module (rnrs bytevectors) | |
22 | - | #:use-module (srfi srfi-9) | |
23 | - | #:export (make-trie | |
24 | - | trie? | |
25 | - | trie-position | |
26 | - | trie-position-set! | |
27 | - | trie-vals | |
28 | - | trie-vals-set! | |
29 | - | trie-transitions | |
30 | - | trie-transitions-set! | |
31 | - | ||
32 | - | make-empty-trie | |
33 | - | add-to-trie! | |
34 | - | compress-trie | |
35 | - | ||
36 | - | serialize-trie | |
37 | - | trie-size)) | |
38 | - | ||
39 | - | (define-record-type trie | |
40 | - | (make-trie position vals transitions) | |
41 | - | trie? | |
42 | - | (position trie-position trie-position-set!) ; integer | |
43 | - | (vals trie-vals trie-vals-set!) ; list | |
44 | - | (transitions trie-transitions trie-transitions-set!)) ; array or alist | |
45 | - | ||
46 | - | (define (make-empty-trie) | |
47 | - | (make-trie 0 '() (make-array #f 16))) | |
48 | - | ||
49 | - | (define (add-to-trie! trie key value) | |
50 | - | (if (null? key) | |
51 | - | (trie-vals-set! trie (cons value (trie-vals trie))) | |
52 | - | (let ((next-trie (array-ref (trie-transitions trie) (car key)))) | |
53 | - | (if next-trie | |
54 | - | (add-to-trie! next-trie (cdr key) value) | |
55 | - | (let ((next-trie (make-empty-trie))) | |
56 | - | (array-set! (trie-transitions trie) next-trie (car key)) | |
57 | - | (add-to-trie! next-trie (cdr key) value)))))) | |
58 | - | ||
59 | - | (define (convert-trie-transitions! trie) | |
60 | - | (define (get-new-transitions transitions) | |
61 | - | (let loop ((i 0) (tr '())) | |
62 | - | (if (= i 16) | |
63 | - | tr | |
64 | - | (let ((elem (array-ref transitions i))) | |
65 | - | (if elem | |
66 | - | (begin | |
67 | - | (convert-trie-transitions! elem) | |
68 | - | (loop (+ i 1) (cons (cons i elem) tr))) | |
69 | - | (loop (+ i 1) tr)))))) | |
70 | - | (let* ((transitions (trie-transitions trie)) | |
71 | - | (transitions (get-new-transitions transitions))) | |
72 | - | (trie-transitions-set! trie transitions))) | |
73 | - | ||
74 | - | (define (compress-trie trie) | |
75 | - | (define (compress-aux trie) | |
76 | - | (make-trie | |
77 | - | (trie-position trie) | |
78 | - | (trie-vals trie) | |
79 | - | (apply append | |
80 | - | (map | |
81 | - | (lambda (tr) | |
82 | - | (let ((trie (cdr tr))) | |
83 | - | (map | |
84 | - | (lambda (tr2) | |
85 | - | (cons (+ (car tr2) (* 16 (car tr))) | |
86 | - | (compress-aux (cdr tr2)))) | |
87 | - | (trie-transitions trie)))) | |
88 | - | (trie-transitions trie))))) | |
89 | - | (convert-trie-transitions! trie) | |
90 | - | (compress-aux trie)) | |
91 | - | ||
92 | - | (define (pointer-size ptr) | |
93 | - | 5) | |
94 | - | ||
95 | - | (define (serialize-pointer ptr pos bv) | |
96 | - | (bytevector-u8-set! bv pos (car ptr)) | |
97 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
98 | - | (+ pos 5)) | |
99 | - | ||
100 | - | (define (serialize-trie trie serialize size results pos) | |
101 | - | (define (serialize-trie-aux transitions pos) | |
102 | - | (let loop ((pos pos) | |
103 | - | (trs transitions) | |
104 | - | (bvs '())) | |
105 | - | (if (null? trs) | |
106 | - | (cons pos bvs) | |
107 | - | (let* ((next-trie (cdr (car trs))) | |
108 | - | (bv (get-trie-bv next-trie results pos)) | |
109 | - | (pos (car bv)) | |
110 | - | (bv (cdr bv))) | |
111 | - | (loop pos (cdr trs) (append bvs bv)))))) | |
112 | - | ||
113 | - | (define (get-trie-bv trie results pos) | |
114 | - | (trie-position-set! trie pos) | |
115 | - | (let* ((vals-sz (size (trie-vals trie))) | |
116 | - | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
117 | - | (sz (+ vals-sz 1 trs-sz)) | |
118 | - | (bv (make-bytevector sz 0))) | |
119 | - | (serialize (trie-vals trie) 0 bv) | |
120 | - | (let* ((bvs (serialize-trie-aux | |
121 | - | (trie-transitions trie) | |
122 | - | (+ pos sz))) | |
123 | - | (next-pos (car bvs)) | |
124 | - | (bvs (cdr bvs))) | |
125 | - | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
126 | - | (serialize-list (trie-transitions trie) serialize-pointer | |
127 | - | (+ vals-sz 1) bv #:size? #f) | |
128 | - | (cons next-pos (cons bv bvs))))) | |
129 | - | ||
130 | - | (let* ((trie-bv (get-trie-bv trie results pos)) | |
131 | - | (new-pos (car trie-bv)) | |
132 | - | (trie-bv (merge-bvs (cdr trie-bv)))) | |
133 | - | (bytevector-copy! trie-bv 0 results pos (bytevector-length trie-bv)) | |
134 | - | new-pos)) | |
135 | - | ||
136 | - | (define (trie-size trie size) | |
137 | - | (apply + | |
138 | - | (size (trie-vals trie)) | |
139 | - | 1 | |
140 | - | (list-size (trie-transitions trie) pointer-size #:size? #f) | |
141 | - | (map (lambda (trie) (trie-size trie size)) | |
142 | - | (map cdr (trie-transitions trie))))) |
modules/nani/wadoku/entities.scm unknown status 2
1 | - | (define-module (nani wadoku entities) | |
2 | - | #:use-module (nani result) | |
3 | - | #:use-module ((nani tags) #:prefix tag:) | |
4 | - | #:use-module (ice-9 match) | |
5 | - | #:export (get-tag hint->tag) | |
6 | - | #:re-export (tag:get-points)) | |
7 | - | ||
8 | - | (define wadoku-entities | |
9 | - | `(("Aerodyn." . "aero") | |
10 | - | ("Agrochem." . "chem") | |
11 | - | ("??gypt. Gesch." . "hist") | |
12 | - | ("??gypt. Mythol." . "rel") | |
13 | - | ("Aktien" . "econ") | |
14 | - | ("Akupunktur" . "med") | |
15 | - | ("Akustik" . "music") | |
16 | - | ("American Football" . "sports") | |
17 | - | ("amerik. Gesch." . "hist") | |
18 | - | ("amerik. Pol." . "politics") | |
19 | - | ("amerik. Rechtsw." . "law") | |
20 | - | ("Anat." . "anat") | |
21 | - | ("Angeln" . "sports") | |
22 | - | ("Anime" . "work") | |
23 | - | ("Anthropol." . "anth") | |
24 | - | ("arab. Gesch." . "hist") | |
25 | - | ("Arch??ol." . "archeo") | |
26 | - | ("Archit." . "archit") | |
27 | - | ("Astrol." . "astron") | |
28 | - | ("Astron." . "astron") | |
29 | - | ("atmosph??r. Optik" . "physics") | |
30 | - | ("Atomphys." . "physics") | |
31 | - | ("Audio" . "music") | |
32 | - | ("Augenheilk." . "med") | |
33 | - | ("Autorennen" . "sports") | |
34 | - | ("Backw." . "food") | |
35 | - | ("Badminton" . "sports") | |
36 | - | ("Bahnhofsn." . "station") | |
37 | - | ("Bakteriol." . "biol") | |
38 | - | ("Ballet" . "music") | |
39 | - | ("Ballett" . "music") | |
40 | - | ("Ballsport" . "sports") | |
41 | - | ("Bankw." . "finc") | |
42 | - | ("Baseb." . "baseb") | |
43 | - | ("Baseball" . "baseb") | |
44 | - | ("Basketball" . "sports") | |
45 | - | ("Bauingenieurw." . "archit") | |
46 | - | ("Bautechnik" . "archit") | |
47 | - | ("Bauw." . "archit") | |
48 | - | ("Befestigungsw." . "archit") | |
49 | - | ("Beh??rde" . "law") | |
50 | - | ("Bergb." . "archit") | |
51 | - | ("Bergbau" . "archit") | |
52 | - | ("Bergn." . "place") | |
53 | - | ("Bergsteigen" . "sports") | |
54 | - | ("Bibel" . "Christn") | |
55 | - | ("Bibliotheksw." . "biblio") | |
56 | - | ("Bildhauerei" . "work") | |
57 | - | ("bild. Kunst" . "work") | |
58 | - | ("Billard" . "game") | |
59 | - | ("Biochem." . "biol") | |
60 | - | ("Biol." . "biol") | |
61 | - | ("Bodenkunde" . "agri") | |
62 | - | ("Bogenschie??en" . "sports") | |
63 | - | ("B??rse" . "econ") | |
64 | - | ("Bot." . "bot") | |
65 | - | ("Bowling" . "sports") | |
66 | - | ("Boxen" . "sports") | |
67 | - | ("Brauereiw." . "alco") | |
68 | - | ("Brettspiel" . "game") | |
69 | - | ("brit. Gesch." . "hist") | |
70 | - | ("brit. Rechtsw." . "law") | |
71 | - | ("Br??ckenbau" . "engr") | |
72 | - | ("Bsp." . "exp") | |
73 | - | ("Buchbinden" . "print") | |
74 | - | ("B??chereiw." . "biblio") | |
75 | - | ("Buchtitel" . "work") | |
76 | - | ("Buchw." . "biblio") | |
77 | - | ("Buddh." . "Buddh") | |
78 | - | ("Bud??" . "MA") | |
79 | - | ("B??d??" . "MA") | |
80 | - | ("Chem." . "chem") | |
81 | - | ("chin. Arch??ol." . "archeo") | |
82 | - | ("chin. Astron." . "astron") | |
83 | - | ("chin. Gesch." . "hist") | |
84 | - | ("chin. Gramm." . "ling") | |
85 | - | ("chin. Literaturw." . "work") | |
86 | - | ("chin. Mus." . "music") | |
87 | - | ("chin. Musikinstr." . "music") | |
88 | - | ("chin. Mythol." . "rel") | |
89 | - | ("chin. Philos." . "rel") | |
90 | - | ("chin. Phon." . "ling") | |
91 | - | ("chin. Poet." . "poet") | |
92 | - | ("chin. Pol." . "politics") | |
93 | - | ("chin. Theater" . "work") | |
94 | - | ("Chirurgie" . "med") | |
95 | - | ("Christent." . "Christn") | |
96 | - | ("Cocktail" . "alco") | |
97 | - | ("Computerling." . "comp") | |
98 | - | ("Curling" . "sports") | |
99 | - | ("Dammbau" . "engr") | |
100 | - | ("Demogr." . "demo") | |
101 | - | ("Diplomat." . "politics") | |
102 | - | ("Druckw." . "print") | |
103 | - | ("dtsch. Gesch." . "hist") | |
104 | - | ("dtsch. Gramm." . "ling") | |
105 | - | ("EDV" . "comp") | |
106 | - | ("Einzel-Kanji" . "ling") | |
107 | - | ("Eisenb." . "engr") | |
108 | - | ("Eishockey" . "sports") | |
109 | - | ("Eiskunstl." . "sports") | |
110 | - | ("Eiskunstlauf" . "sports") | |
111 | - | ("Eisschnelllauf" . "sports") | |
112 | - | ("Elektrochem." . "chem") | |
113 | - | ("Elektrot." . "engr") | |
114 | - | ("Embryol." . "biol") | |
115 | - | ("engl. Gesch." . "hist") | |
116 | - | ("Entwicklungsphysiol." . "biol") | |
117 | - | ("Entwicklungspsychol." . "psy") | |
118 | - | ("Ergonomie" . "engr") | |
119 | - | ("Ethik" . "philo") | |
120 | - | ("Ethnol." . "ethno") | |
121 | - | ("europ. Gesch." . "hist") | |
122 | - | ("Fahrrad" . "sports") | |
123 | - | ("Familienn." . "surname") | |
124 | - | ("Familienn.." . "surname") | |
125 | - | ("Farbe" . "color") | |
126 | - | ("F??rberei" . "build") | |
127 | - | ("Fechten" . "build") | |
128 | - | ("Fertigungstechnik" . "engr") | |
129 | - | ("Film" . "work") | |
130 | - | ("Filmtitel" . "work") | |
131 | - | ("Finanzw." . "finc") | |
132 | - | ("Firmenn." . "company") | |
133 | - | ("Fischerei" . "sports") | |
134 | - | ("Fischk." . "food") | |
135 | - | ("Flugw." . "engr") | |
136 | - | ("Flussn." . "place") | |
137 | - | ("Forstw." . "bot") | |
138 | - | ("Fotog." . "photo") | |
139 | - | ("Frachtw." . "trans") | |
140 | - | ("franz. Gesch." . "hist") | |
141 | - | ("Frisur" . "fashion") | |
142 | - | ("Funkt." . "physics") | |
143 | - | ("Fu??ball" . "sports") | |
144 | - | ("Gagaku" . "music") | |
145 | - | ("Garten" . "bot") | |
146 | - | ("Gartenk." . "bot") | |
147 | - | ("Gateball" . "sports") | |
148 | - | ("Gebietsn." . "place") | |
149 | - | ("Geldw." . "finc") | |
150 | - | ("Genetik" . "biol") | |
151 | - | ("Geogr." . "place") | |
152 | - | ("Geol." . "geol") | |
153 | - | ("Geom." . "geom") | |
154 | - | ("Geophys." . "physics") | |
155 | - | ("Gerberei" . "build") | |
156 | - | ("Gesch." . "hist") | |
157 | - | ("Gewichtheben" . "sports") | |
158 | - | ("Glasherst." . "build") | |
159 | - | ("Glaziol." . "sci") | |
160 | - | ("Go" . "game") | |
161 | - | ("Golf" . "sports") | |
162 | - | ("Gramm." . "ling") | |
163 | - | ("griech. Christent." . "Christn") | |
164 | - | ("griech. Gesch." . "hist") | |
165 | - | ("griech. Myth." . "rel") | |
166 | - | ("griech. Mythol." . "rel") | |
167 | - | ("griech. Theat." . "work") | |
168 | - | ("Gymnastik" . "sports") | |
169 | - | ("Handball" . "sports") | |
170 | - | ("Handel" . "econ") | |
171 | - | ("Handwerkskunst" . "build") | |
172 | - | ("Heraldik" . "sci") | |
173 | - | ("Hindu." . "rel") | |
174 | - | ("Hochsprung" . "sports") | |
175 | - | ("Hockey" . "sports") | |
176 | - | ("Holzbau" . "engr") | |
177 | - | ("Holzbearb." . "engr") | |
178 | - | ("Holzschnitt" . "engr") | |
179 | - | ("Hunderasse" . "zool") | |
180 | - | ("Hydrodynamik" . "physics") | |
181 | - | ("I Ging" . "rel") | |
182 | - | ("Ikebana" . "build") | |
183 | - | ("Immunol." . "med") | |
184 | - | ("ind. Gesch." . "hist") | |
185 | - | ("indian. Mythol." . "rel") | |
186 | - | ("ind. Mythol." . "rel") | |
187 | - | ("indones. Gesch." . "hist") | |
188 | - | ("ind. Philos." . "philo") | |
189 | - | ("Insektenk." . "zool") | |
190 | - | ("Inseln." . "place") | |
191 | - | ("Internet" . "comp") | |
192 | - | ("Islam" . "rel") | |
193 | - | ("islam. Mythol." . "rel") | |
194 | - | ("islam. Rechtsw." . "law") | |
195 | - | ("islam. Rel." . "rel") | |
196 | - | ("israel. Gesch." . "hist") | |
197 | - | ("ital. Gesch." . "hist") | |
198 | - | ("japan. Archit." . "archit") | |
199 | - | ("japan. Astrologie" . "aston") | |
200 | - | ("japan. bild. Kunst" . "work") | |
201 | - | ("japan. Christent." . "Christn") | |
202 | - | ("japan. Gesch." . "hist") | |
203 | - | ("japan. Gramm." . "ling") | |
204 | - | ("japan. Literaturw." . "work") | |
205 | - | ("japan. Med." . "med") | |
206 | - | ("japan. Mus." . "music") | |
207 | - | ("japan. Mythol." . "rel") | |
208 | - | ("japan. Philos." . "philo") | |
209 | - | ("japan. Pol." . "politics") | |
210 | - | ("japan. Schwimmen" . "sports") | |
211 | - | ("japan. Soziol." . "sci") | |
212 | - | ("japan. Sprachw." . "ling") | |
213 | - | ("J??ruri" . "music") | |
214 | - | ("Judent." . "rel") | |
215 | - | ("J??d??" . "MA") | |
216 | - | ("Kabuki" . "music") | |
217 | - | ("Kagura" . "music") | |
218 | - | ("Kalligraphie" . "build") | |
219 | - | ("Kanbun" . "ling") | |
220 | - | ("Kanji" . "ling") | |
221 | - | ("Kanp??" . "med") | |
222 | - | ("Karten" . "game") | |
223 | - | ("Kartenspiel" . "game") | |
224 | - | ("Kartogr." . "place") | |
225 | - | ("kath. Christent." . "Christn") | |
226 | - | ("Kend??" . "MA") | |
227 | - | ("Keramik" . "build") | |
228 | - | ("Kernphys." . "physics") | |
229 | - | ("Kfz-W." . "engr") | |
230 | - | ("KI" . "comp") | |
231 | - | ("Kleidung" . "fashion") | |
232 | - | ("Kochk." . "food") | |
233 | - | ("Konditoreiw." . "food") | |
234 | - | ("Konfuz." . "rel") | |
235 | - | ("korean. Gesch." . "hist") | |
236 | - | ("korean. Rel." . "rel") | |
237 | - | ("korean. Theater" . "work") | |
238 | - | ("Kosmetik" . "fashion") | |
239 | - | ("Kricket" . "sports") | |
240 | - | ("Kristallogr." . "physics") | |
241 | - | ("Krocket" . "sports") | |
242 | - | ("Kunst" . "work") | |
243 | - | ("Kunstgesch." . "hist") | |
244 | - | ("Kunst??? und Turmsprung" . "build") | |
245 | - | ("Ky??gen" . "music") | |
246 | - | ("Lack" . "work") | |
247 | - | ("L??ndern." . "place") | |
248 | - | ("Landw." . "agri") | |
249 | - | ("Leichtathl." . "sports") | |
250 | - | ("Liedtitel" . "work") | |
251 | - | ("Literaturw." . "work") | |
252 | - | ("Logik" . "math") | |
253 | - | ("Luftf." . "aero") | |
254 | - | ("Machinenb." . "engr") | |
255 | - | ("Mah-Jongg" . "mahj") | |
256 | - | ("Management" . "bus") | |
257 | - | ("Manga" . "work") | |
258 | - | ("m??nnl. Name" . "given") | |
259 | - | ("m??nnl. Vorn." . "given") | |
260 | - | ("m??nnl. Vorname" . "given") | |
261 | - | ("Mantik" . "rel") | |
262 | - | ("Marine" . "sail") | |
263 | - | ("Marketing" . "bus") | |
264 | - | ("Maschinenb." . "engr") | |
265 | - | ("Ma??" . "engr") | |
266 | - | ("Math." . "math") | |
267 | - | ("Mech." . "physics") | |
268 | - | ("Med." . "med") | |
269 | - | ("Meeresk." . "place") | |
270 | - | ("Meeresn." . "place") | |
271 | - | ("Meinungsforschung" . "politics") | |
272 | - | ("Messw." . "hobby") | |
273 | - | ("Metall." . "engr") | |
274 | - | ("Metallbearb." . "engr") | |
275 | - | ("Metallguss" . "engr") | |
276 | - | ("Meteor." . "meteor") | |
277 | - | ("mexikan. Kochk." . "food") | |
278 | - | ("Mikrobiol." . "biol") | |
279 | - | ("Milit." . "mil") | |
280 | - | ("Mineral." . "geol") | |
281 | - | ("M??belbau" . "build") | |
282 | - | ("Mode" . "fashion") | |
283 | - | ("Motorsport" . "sports") | |
284 | - | ("Mus." . "music") | |
285 | - | ("Muschelk." . "zool") | |
286 | - | ("Musikinstr." . "music") | |
287 | - | ("Mykol." . "bot") | |
288 | - | ("Mythol." . "rel") | |
289 | - | ("Nachrichtent." . "politics") | |
290 | - | ("N??hen" . "fashion") | |
291 | - | ("Naniwabushi" . "music") | |
292 | - | ("Naturmed." . "med") | |
293 | - | ("Naturph??n." . "physics") | |
294 | - | ("Naturphilos." . "philo") | |
295 | - | ("Neng??" . "hist") | |
296 | - | ("Netzwerktechnik" . "comp") | |
297 | - | ("Neurol." . "med") | |
298 | - | ("N??" . "music") | |
299 | - | ("nord. Mythol." . "rel") | |
300 | - | ("Numismatik" . "build") | |
301 | - | ("??kol." . "biol") | |
302 | - | ("Oper" . "music") | |
303 | - | ("Optik" . "physics") | |
304 | - | ("Org." . "organization") | |
305 | - | ("Origami" . "build") | |
306 | - | ("Ortsn." . "place") | |
307 | - | ("P??d." . "school") | |
308 | - | ("Pal??ontol." . "hist") | |
309 | - | ("Papierherst." . "build") | |
310 | - | ("Parapsych." . "psy") | |
311 | - | ("Patentw." . "engr") | |
312 | - | ("Pathol." . "med") | |
313 | - | ("pers. Gesch." . "hist") | |
314 | - | ("Pers??nlichk." . "person") | |
315 | - | ("Petrochem." . "chem") | |
316 | - | ("Pferderennen" . "sports") | |
317 | - | ("Pferdesport" . "sports") | |
318 | - | ("Pharm." . "med") | |
319 | - | ("Philos." . "philo") | |
320 | - | ("philos. Anthrop." . "anth") | |
321 | - | ("Phon." . "ling") | |
322 | - | ("Phys." . "physics") | |
323 | - | ("Physiol." . "anat") | |
324 | - | ("Poetik" . "poet") | |
325 | - | ("Pol." . "politics") | |
326 | - | ("Postw." . "telecom") | |
327 | - | ("Psych." . "psy") | |
328 | - | ("Quantenphys." . "physics") | |
329 | - | ("Radio" . "telecom") | |
330 | - | ("Radrennen" . "sports") | |
331 | - | ("Radsport" . "sports") | |
332 | - | ("Rakugo" . "music") | |
333 | - | ("Raumf." . "engr") | |
334 | - | ("Reaktort." . "engr") | |
335 | - | ("Rechnungsw." . "bus") | |
336 | - | ("Rechtsw." . "law") | |
337 | - | ("Redew." . "exp") | |
338 | - | ("Regelungstechnik" . "engr") | |
339 | - | ("Reiten" . "sports") | |
340 | - | ("Rel." . "rel") | |
341 | - | ("Rennsport" . "sports") | |
342 | - | ("Rhetorik" . "philo") | |
343 | - | ("Ringen" . "sports") | |
344 | - | ("r??m. Gesch." . "hist") | |
345 | - | ("r??m. Mythol." . "rel") | |
346 | - | ("Rudern" . "sports") | |
347 | - | ("Rugby" . "sports") | |
348 | - | ("Rundfunk" . "telecom") | |
349 | - | ("russ. Gesch." . "hist") | |
350 | - | ("R??stung" . "mil") | |
351 | - | ("Schach" . "game") | |
352 | - | ("Schie??sport" . "sports") | |
353 | - | ("Schiff" . "sail") | |
354 | - | ("Schiffbau" . "sail") | |
355 | - | ("Schifffahrt" . "sail") | |
356 | - | ("Schmuck" . "build") | |
357 | - | ("Schneckenk." . "zool") | |
358 | - | ("Schneiderei" . "fashion") | |
359 | - | ("Schuhe" . "fashion") | |
360 | - | ("Schule" . "school") | |
361 | - | ("Schwei??en" . "build") | |
362 | - | ("Schwert" . "mil") | |
363 | - | ("Schwimmen" . "sports") | |
364 | - | ("Seef." . "sail") | |
365 | - | ("Seen." . "place") | |
366 | - | ("Seerechtsw." . "law") | |
367 | - | ("Seew." . "sail") | |
368 | - | ("Segeln" . "sail") | |
369 | - | ("Seide" . "build") | |
370 | - | ("Seismol." . "physics") | |
371 | - | ("SF" . "work") | |
372 | - | ("Shingon" . "Buddh") | |
373 | - | ("Shint??" . "Shinto") | |
374 | - | ("Sh??gi" . "shogi") | |
375 | - | ("Ski" . "sports") | |
376 | - | ("Skispringen" . "sports") | |
377 | - | ("Softball" . "sports") | |
378 | - | ("Sozialpsych." . "psy") | |
379 | - | ("Soziol." . "sci") | |
380 | - | ("span. Gesch." . "hist") | |
381 | - | ("Spiel" . "game") | |
382 | - | ("Spiele-Titel" . "game") | |
383 | - | ("Spieltheorie" . "game") | |
384 | - | ("Spinnen" . "zool") | |
385 | - | ("Spinnenk." . "zool") | |
386 | - | ("Sport" . "sports") | |
387 | - | ("Sprache" . "ling") | |
388 | - | ("Sprachw." . "ling") | |
389 | - | ("Sprichw." . "ling") | |
390 | - | ("Stadtn." . "place") | |
391 | - | ("Stadtplanung" . "engr") | |
392 | - | ("Statistik" . "math") | |
393 | - | ("Steuerw." . "econ") | |
394 | - | ("Sticken" . "build") | |
395 | - | ("Stra??enbau" . "engr") | |
396 | - | ("Streckenkilometertafel" . "train") | |
397 | - | ("Stricken" . "build") | |
398 | - | ("Strukturalismus" . "philo") | |
399 | - | ("Sum??" . "sumo") | |
400 | - | ("Surfen" . "sports") | |
401 | - | ("Systemanalyse" . "engr") | |
402 | - | ("Systemtheorie" . "engr") | |
403 | - | ("taiwan. Pol." . "politics") | |
404 | - | ("Tanz" . "art") | |
405 | - | ("Tanzen" . "art") | |
406 | - | ("Taoismus" . "rel") | |
407 | - | ("Tauchen" . "sports") | |
408 | - | ("Technik" . "engr") | |
409 | - | ("Tee" . "hobby") | |
410 | - | ("Telegrafie" . "telecom") | |
411 | - | ("Telekom." . "telecom") | |
412 | - | ("Tempeln." . "place") | |
413 | - | ("Tennis" . "sports") | |
414 | - | ("Tenn??" . "surname") | |
415 | - | ("Textilt." . "engr") | |
416 | - | ("Theat." . "work") | |
417 | - | ("Theol." . "rel") | |
418 | - | ("tibet. Buddh." . "Buddh") | |
419 | - | ("Tiefb." . "engr") | |
420 | - | ("Tiermed." . "med") | |
421 | - | ("Tischtennis" . "sports") | |
422 | - | ("Tourismus" . "hobby") | |
423 | - | ("Tunnelbau" . "engr") | |
424 | - | ("t??rk. Gesch." . "hist") | |
425 | - | ("Turnen" . "sports") | |
426 | - | ("TV" . "telecom") | |
427 | - | ("TV-Prog." . "telecom") | |
428 | - | ("U-Bahn" . "train") | |
429 | - | ("Umwelt" . "biol") | |
430 | - | ("Univ.-N." . "place") | |
431 | - | ("Verhaltensbiol." . "biol") | |
432 | - | ("Verkehrsw." . "engr") | |
433 | - | ("Verlagsn." . "company") | |
434 | - | ("Verlagsw." . "econ") | |
435 | - | ("Versicherungsw." . "bus") | |
436 | - | ("Video" . "telecom") | |
437 | - | ("Videospiel" . "game") | |
438 | - | ("vietnam. Gesch." . "hist") | |
439 | - | ("Vogelk." . "zool") | |
440 | - | ("V??lkerk." . "ethno") | |
441 | - | ("V??lkerr." . "law") | |
442 | - | ("Volleyball" . "sports") | |
443 | - | ("Vorgesch." . "hist") | |
444 | - | ("VWL" . "econ") | |
445 | - | ("Waffenk." . "mil") | |
446 | - | ("Walfang" . "hobby") | |
447 | - | ("Wasserball" . "sports") | |
448 | - | ("Wasserbau" . "engr") | |
449 | - | ("Wassersport" . "sports") | |
450 | - | ("Weben" . "build") | |
451 | - | ("weibl. Name" . "given") | |
452 | - | ("weibl. Vorn." . "given") | |
453 | - | ("Weichtierk." . "zool") | |
454 | - | ("Wein" . "alco") | |
455 | - | ("Werbung" . "bus") | |
456 | - | ("Werktitel" . "work") | |
457 | - | ("Wintersport" . "sports") | |
458 | - | ("Wirtsch." . "econ") | |
459 | - | ("Wrestling" . "sports") | |
460 | - | ("Wz." . "exp") | |
461 | - | ("Yoga" . "sports") | |
462 | - | ("Z??hlw." . "finc") | |
463 | - | ("Zahnmed." . "med") | |
464 | - | ("Zeitschriftenn." . "work") | |
465 | - | ("Zeitungsn." . "work") | |
466 | - | ("Zeitungsw." . "print") | |
467 | - | ("Zellbiol." . "biol") | |
468 | - | ("Zen" . "rel") | |
469 | - | ("Zirkus" . "hobby") | |
470 | - | ("Zitat" . "exp") | |
471 | - | ("Zollw." . "bus") | |
472 | - | ("Zool." . "biol"))) | |
473 | - | ||
474 | - | (define (hint->tag hint) | |
475 | - | (match hint | |
476 | - | ("Kansai-Dial." "ksb") | |
477 | - | ("Ky??to-Dial." "kyb") | |
478 | - | ("??saka-Dial." "osb") | |
479 | - | ("Kant??-Dial." "ktb") | |
480 | - | ("T??hoku-Dial." "thb") | |
481 | - | ("Tsugaru-Dial." "tsug") | |
482 | - | ("Ky??sh??-Dial." "kyu") | |
483 | - | ("Hokkaid??-Dial." "hob") | |
484 | - | ("altert??ml." "arch") | |
485 | - | ("arch." "arch") | |
486 | - | ("Edo-zeitl." "arch") | |
487 | - | ("ehrerb.-h??fl." "hon") | |
488 | - | ("h??fl.-ehrerb." "hon") | |
489 | - | ("besch.-h??fl." "hum") | |
490 | - | ("abwertend" "derog") | |
491 | - | ("etwas altmodisch" "obs") | |
492 | - | ("Gaunerjargon" "thief") | |
493 | - | ("Frauenspr. der Edo-Zeit" "arch") | |
494 | - | ("Geheimspr. der Gauner" "thief") | |
495 | - | ("hist." "hist") | |
496 | - | ("h??fl." "pol") | |
497 | - | ("kinderspr." "chn") | |
498 | - | ("M??nnersprache" "male") | |
499 | - | ("Medizinerjargon" "med") | |
500 | - | ("Milit??rjargon" "mil") | |
501 | - | ("obsol." "obs") | |
502 | - | ("onomat." "on-mim") | |
503 | - | ("poet." "poet") | |
504 | - | ("Polizeijargon" "law") | |
505 | - | ("Slang" "sl") | |
506 | - | ("veraltet" "arch") | |
507 | - | (_ #f))) | |
508 | - | ||
509 | - | (define (wadoku->tag-name wadoku-tag) | |
510 | - | (let ((tag (assoc-ref wadoku-entities wadoku-tag))) | |
511 | - | (or tag wadoku-tag))) | |
512 | - | ||
513 | - | (define (get-tag tag) | |
514 | - | (tag:get-tag (wadoku->tag-name tag))) |
modules/nani/wadoku/pitch.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani wadoku pitch) | |
20 | - | #:use-module (ice-9 binary-ports) | |
21 | - | #:use-module (ice-9 match) | |
22 | - | #:use-module (ice-9 rdelim) | |
23 | - | #:use-module (nani frequency) | |
24 | - | #:use-module (nani huffman) | |
25 | - | #:use-module (nani result) | |
26 | - | #:use-module (nani trie) | |
27 | - | #:use-module (nani wadoku entities) | |
28 | - | #:use-module (srfi srfi-1) | |
29 | - | #:use-module (srfi srfi-9) | |
30 | - | #:use-module (sxml ssax) | |
31 | - | #:use-module (rnrs bytevectors) | |
32 | - | #:export (xml->pitch serialize-pitch pitch-entry-count)) | |
33 | - | ||
34 | - | (define-record-type pitch | |
35 | - | (make-pitch kanjis accents) | |
36 | - | pitch? | |
37 | - | (kanjis pitch-kanjis) | |
38 | - | (accents pitch-accents)) | |
39 | - | ||
40 | - | (define (sxml->element lst elem) | |
41 | - | (let ((elem (match elem | |
42 | - | ((_ . elem) elem) | |
43 | - | (_ elem)))) | |
44 | - | (match elem | |
45 | - | ('accent `(pitch ,(car lst))) | |
46 | - | ('orth (let ((kanji (filter string? lst))) | |
47 | - | (if (null? kanji) #f `(kanji . ,(car kanji))))) | |
48 | - | ('form | |
49 | - | `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst))) | |
50 | - | ('reading (filter pair? lst)) | |
51 | - | ('entry | |
52 | - | (let loop ((lst lst) (kanjis '()) (accents '())) | |
53 | - | (if (null? lst) | |
54 | - | (if (or (null? kanjis) (null? accents)) | |
55 | - | #f | |
56 | - | (make-pitch kanjis accents)) | |
57 | - | (match (car lst) | |
58 | - | (('form . f) (loop (append f (cdr lst)) kanjis accents)) | |
59 | - | (('pitch . pitch) (loop (cdr lst) kanjis (cons pitch accents))) | |
60 | - | (('kanji . kanji) (loop (cdr lst) (cons kanji kanjis) accents)) | |
61 | - | (_ (loop (cdr lst) kanjis accents)))))) | |
62 | - | (_ #f)))) | |
63 | - | ||
64 | - | (define parser | |
65 | - | (ssax:make-parser | |
66 | - | NEW-LEVEL-SEED | |
67 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
68 | - | attributes) | |
69 | - | ||
70 | - | FINISH-ELEMENT | |
71 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
72 | - | (if (equal? elem-gi 'entries) | |
73 | - | seed | |
74 | - | (let* ((seed (reverse seed)) | |
75 | - | (element (sxml->element seed elem-gi))) | |
76 | - | (cons element parent-seed)))) | |
77 | - | ||
78 | - | CHAR-DATA-HANDLER | |
79 | - | (lambda (string1 string2 seed) | |
80 | - | (cons (string-append string1 string2) seed)))) | |
81 | - | ||
82 | - | (define (xml->pitch port) | |
83 | - | (filter pitch? (parser port '()))) | |
84 | - | ||
85 | - | (define (make-trie-key key) | |
86 | - | (append-map | |
87 | - | (lambda (c) | |
88 | - | (list (quotient c 16) (modulo c 16))) | |
89 | - | (bytevector->u8-list (string->utf8 key)))) | |
90 | - | ||
91 | - | (define (get-pitch-trie pitches) | |
92 | - | (let ((trie (make-empty-trie))) | |
93 | - | (for-each | |
94 | - | (lambda (pitch) | |
95 | - | (for-each | |
96 | - | (lambda (kanji) | |
97 | - | (for-each | |
98 | - | (lambda (accent) | |
99 | - | (cond | |
100 | - | ((string? accent) | |
101 | - | (add-to-trie! trie (make-trie-key kanji) accent)) | |
102 | - | ((list? accent) | |
103 | - | (add-to-trie! trie (make-trie-key kanji) (car accent))) | |
104 | - | (else | |
105 | - | (throw 'accent accent)))) | |
106 | - | (pitch-accents pitch))) | |
107 | - | (pitch-kanjis pitch))) | |
108 | - | pitches) | |
109 | - | (compress-trie trie))) | |
110 | - | ||
111 | - | (define (collapse-vals! trie) | |
112 | - | (let ((transitions (trie-transitions trie)) | |
113 | - | (vals (trie-vals trie))) | |
114 | - | (trie-vals-set! trie (string-join vals ", ")) | |
115 | - | (for-each collapse-vals! (map cdr transitions)))) | |
116 | - | ||
117 | - | (define (collect-vals trie) | |
118 | - | (let ((transitions (trie-transitions trie)) | |
119 | - | (vals (trie-vals trie))) | |
120 | - | (cons vals (append-map collect-vals (map cdr transitions))))) | |
121 | - | ||
122 | - | (define (entry-number trie) | |
123 | - | (let ((transitions (trie-transitions trie)) | |
124 | - | (vals (trie-vals trie))) | |
125 | - | (apply + (if (string-null? vals) 0 1) | |
126 | - | (map entry-number (map cdr transitions))))) | |
127 | - | ||
128 | - | (define (serialize-pitch pitches) | |
129 | - | (let ((trie (get-pitch-trie pitches))) | |
130 | - | (collapse-vals! trie) | |
131 | - | (let* ((huffman (create-huffman (collect-vals trie))) | |
132 | - | (code (huffman->code huffman)) | |
133 | - | (entries (entry-number trie))) | |
134 | - | (let* ((header (string->utf8 "NANI_PITCH001")) | |
135 | - | (header-size (bytevector-length header)) | |
136 | - | (huffman-bv (serialize-huffman huffman)) | |
137 | - | (huffman-size (bytevector-length huffman-bv)) | |
138 | - | (trie-size (trie-size trie (huffman-string-size code))) | |
139 | - | (result (make-bytevector (+ header-size 4 huffman-size trie-size)))) | |
140 | - | (bytevector-copy! header 0 result 0 header-size) | |
141 | - | (bytevector-u32-set! result header-size entries (endianness big)) | |
142 | - | (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size) | |
143 | - | (serialize-trie trie (serialize-huffman-string code) | |
144 | - | (huffman-string-size code) | |
145 | - | result (+ header-size 4 huffman-size)) | |
146 | - | result)))) | |
147 | - | ||
148 | - | (define (pitch-entry-count file) | |
149 | - | (call-with-input-file file | |
150 | - | (lambda (port) | |
151 | - | ;; header | |
152 | - | (get-bytevector-n port 13) | |
153 | - | ;; size | |
154 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))) |
modules/nani/wadoku/xml.scm unknown status 2
1 | - | ;;; Nani Project website | |
2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | - | ;;; | |
4 | - | ;;; This file is part of the Nani Project website. | |
5 | - | ;;; | |
6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | - | ;;; your option) any later version. | |
10 | - | ;;; | |
11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | - | ;;; GNU Affero General Public License for more details. | |
15 | - | ;;; | |
16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | - | ||
19 | - | (define-module (nani wadoku xml) | |
20 | - | #:use-module (ice-9 match) | |
21 | - | #:use-module (ice-9 rdelim) | |
22 | - | #:use-module (nani frequency) | |
23 | - | #:use-module (nani result) | |
24 | - | #:use-module (nani wadoku entities) | |
25 | - | #:use-module (srfi srfi-9) | |
26 | - | #:use-module (sxml ssax) | |
27 | - | #:export (xml->results)) | |
28 | - | ||
29 | - | (define-record-type usage | |
30 | - | (make-usage type reg content) | |
31 | - | usage? | |
32 | - | (type usage-type) | |
33 | - | (reg usage-reg) | |
34 | - | (content usage-content)) | |
35 | - | ||
36 | - | (define (flatten lst) | |
37 | - | (match lst | |
38 | - | (() '()) | |
39 | - | ((head lst ...) | |
40 | - | (if (list? head) | |
41 | - | (append (flatten head) (flatten lst)) | |
42 | - | (cons head (flatten lst)))) | |
43 | - | (lst (list lst)))) | |
44 | - | ||
45 | - | (define* (update-usage usage #:key (type (usage-type usage)) | |
46 | - | (reg (usage-reg usage)) | |
47 | - | (content (usage-content usage))) | |
48 | - | (make-usage type reg content)) | |
49 | - | ||
50 | - | ;; TODO | |
51 | - | (define (usage->tag usage) | |
52 | - | (let ((content (usage-content usage)) | |
53 | - | (type (usage-type usage)) | |
54 | - | (reg (usage-reg usage))) | |
55 | - | `(,@(if (string-null? content) | |
56 | - | (match type | |
57 | - | ("" '()) | |
58 | - | ("time" '()) | |
59 | - | ("hint" '()) | |
60 | - | ("abrev" '(tag "abbr"))) | |
61 | - | (match type | |
62 | - | ("dom" `((tag ,content))) | |
63 | - | ("time" `((tag "obs"))); always obsol. | |
64 | - | ("hint" (let ((tag (hint->tag content))) | |
65 | - | (if tag `(tag ,tag) '()))))) | |
66 | - | ,@(match reg | |
67 | - | ("" '()) | |
68 | - | ("dial." '()) | |
69 | - | ("lit" `((tag "litf"))) | |
70 | - | ("kinderspr." `((tag "chn"))) | |
71 | - | ("kleinkindspr." `((tag "baby"))) | |
72 | - | ("jugendspr." `((tag "young"))) | |
73 | - | ("besch.-h??fl." `((tag "hum"))) | |
74 | - | ("coll" `((tag "col"))) | |
75 | - | ("vulg." `((tag "vulg"))) | |
76 | - | ("Slang" `((tag "sl"))) | |
77 | - | ("poet." `((tag "poet"))) | |
78 | - | ("hist." `((tag "hist"))) | |
79 | - | ("hist" `((tag "hist"))) | |
80 | - | ("obsol." `((tag "obs"))) | |
81 | - | ("veraltet" `((tag "arch"))) | |
82 | - | ("sch??lerspr." `((tag "stud"))) | |
83 | - | ("studentenspr." `((tag "stud"))) | |
84 | - | ("frauenspr." `((tag "fem"))) | |
85 | - | ("gaunerspr." `((tag "thief"))) | |
86 | - | ("geheimspr." `((tag "secret"))) | |
87 | - | ("hofdamenspr." `((tag "fem") (tag "pol"))) | |
88 | - | ("seemannsspr." `((tag "sail"))))))) | |
89 | - | ||
90 | - | (define (usg->tag lst) | |
91 | - | (define (usg->usage lst) | |
92 | - | (let loop ((result (make-usage "" "" "")) (lst lst)) | |
93 | - | (if (null? lst) | |
94 | - | result | |
95 | - | (loop | |
96 | - | (match (car lst) | |
97 | - | (('type . type) (update-usage result #:type type)) | |
98 | - | (('reg . reg) (update-usage result #:reg reg)) | |
99 | - | (((? symbol? s) . v) | |
100 | - | (throw 'unknow-usage-attr s v)) | |
101 | - | ((? string? content) (update-usage result #:content content)) | |
102 | - | ((? list? l) (loop result l)) | |
103 | - | (_ result)) | |
104 | - | (cdr lst))))) | |
105 | - | (usage->tag (usg->usage lst))) | |
106 | - | ||
107 | - | (define (ruigo->ref lst) | |
108 | - | (let loop ((ref #f) (lst lst)) | |
109 | - | (match lst | |
110 | - | (() ref) | |
111 | - | ((('id id) lst ...) | |
112 | - | (loop id lst)) | |
113 | - | (((a . b) lst ...) | |
114 | - | (loop (loop ref (list (list a b))) lst)) | |
115 | - | (((? list? l) lst ...) | |
116 | - | (loop (loop ref l) lst))))) | |
117 | - | ||
118 | - | (define (merge-senses s1 s2) | |
119 | - | (update-sense s2 | |
120 | - | #:references (append (sense-references s1) (sense-references s2)) | |
121 | - | #:tags (append (sense-tags s1) (sense-tags s2)) | |
122 | - | #:glosses (append (sense-glosses s1) (sense-glosses s2)))) | |
123 | - | ||
124 | - | (define (sxml->sense lst) | |
125 | - | (let loop ((sense (make-sense '() '() '() '() '() '() "ger")) (lst lst)) | |
126 | - | (if (null? lst) | |
127 | - | sense | |
128 | - | (loop | |
129 | - | (match (car lst) | |
130 | - | (#f sense) | |
131 | - | ((? sense? s) | |
132 | - | (merge-senses s sense)) | |
133 | - | ((? source? s) | |
134 | - | (update-sense sense #:sources (cons s (sense-sources sense)))) | |
135 | - | (('ref (? string? r)) | |
136 | - | (update-sense sense #:references (cons r (sense-references sense)))) | |
137 | - | (('tag (? string? r)) | |
138 | - | (update-sense sense #:tags (cons (get-tag r) (sense-tags sense)))) | |
139 | - | (('trans (? string? r)) | |
140 | - | (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
141 | - | (('related . _) sense) | |
142 | - | (('transcr . _) sense) | |
143 | - | (('pitch . _) sense) | |
144 | - | (((? symbol? s) v) | |
145 | - | (throw 'unknown-symbol s v)) | |
146 | - | ((? list? l) (loop sense l)) | |
147 | - | ((? string? _) sense)) | |
148 | - | (cdr lst))))) | |
149 | - | ||
150 | - | ;; TODO | |
151 | - | (define (sxml->ref lst) | |
152 | - | (define (sxml->ref-name lst) | |
153 | - | (let loop ((result #f) (lst lst)) | |
154 | - | (if (null? lst) | |
155 | - | result | |
156 | - | (loop | |
157 | - | (match (car lst) | |
158 | - | (('id . id) id) | |
159 | - | (((? symbol? s) . _) result) | |
160 | - | (((? symbol? s) _) result) | |
161 | - | ((? list? l) (loop result l)) | |
162 | - | (_ result)) | |
163 | - | (cdr lst))))) | |
164 | - | (let ((ref (sxml->ref-name lst))) | |
165 | - | (if (string? ref) | |
166 | - | `(ref ,ref) | |
167 | - | (throw 'no-ref ref 'from lst)))) | |
168 | - | ||
169 | - | (define (sxml->source lst) | |
170 | - | (let loop ((source (make-source '() #f "" "")) (lst lst)) | |
171 | - | (if (null? lst) | |
172 | - | source | |
173 | - | (loop | |
174 | - | (match (car lst) | |
175 | - | (('impli impli) (update-source source #:lang impli)) | |
176 | - | (('foreign foreign) | |
177 | - | (update-source source | |
178 | - | #:content (if (list? foreign) | |
179 | - | (append foreign (source-content source)) | |
180 | - | (cons foreign (source-content source))))) | |
181 | - | ((? list? l) (loop source l)) | |
182 | - | (_ source)) | |
183 | - | (cdr lst))))) | |
184 | - | ||
185 | - | (define (sxml->reading lst) | |
186 | - | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
187 | - | (if (null? lst) | |
188 | - | reading | |
189 | - | (loop | |
190 | - | (match (car lst) | |
191 | - | (('reading r) | |
192 | - | (update-reading reading #:readings (cons r (reading-readings reading)))) | |
193 | - | (_ reading)) | |
194 | - | (cdr lst))))) | |
195 | - | ||
196 | - | (define (sxml->result sxml frq) | |
197 | - | (define (sxml->result-aux sxml) | |
198 | - | (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml)) | |
199 | - | (if (null? lst) | |
200 | - | result | |
201 | - | (match (car lst) | |
202 | - | (('kanji kanji) | |
203 | - | (loop | |
204 | - | (update-result result #:kanjis (cons kanji (result-kanjis result))) | |
205 | - | last-source (cdr lst))) | |
206 | - | ((? reading? reading) | |
207 | - | (loop | |
208 | - | (update-result result #:readings (cons reading (result-readings result))) | |
209 | - | last-source (cdr lst))) | |
210 | - | ((? sense? sense) | |
211 | - | (loop | |
212 | - | (update-result result | |
213 | - | #:senses | |
214 | - | (cons | |
215 | - | (if last-source | |
216 | - | (update-sense sense | |
217 | - | #:sources (cons last-source (sense-sources sense))) | |
218 | - | sense) | |
219 | - | (result-senses result))) | |
220 | - | last-source (cdr lst))) | |
221 | - | ((? list? l) | |
222 | - | (loop (loop result last-source l) last-source (cdr lst))) | |
223 | - | (_ (loop result last-source (cdr lst))))))) | |
224 | - | (let* ((result (sxml->result-aux sxml)) | |
225 | - | (word (if (null? (result-kanjis result)) | |
226 | - | (car (reading-readings (car (result-readings result)))) | |
227 | - | (car (result-kanjis result)))) | |
228 | - | (entity (frequency-entity frq word)) | |
229 | - | (sense1 (car (result-senses result))) | |
230 | - | (sense1 (if entity | |
231 | - | (update-sense sense1 | |
232 | - | #:tags (cons (get-tag entity) (sense-tags sense1))) | |
233 | - | sense1)) | |
234 | - | (senses (cons sense1 (cdr (result-senses result))))) | |
235 | - | (update-result result | |
236 | - | #:points (tag:get-points (update-result result #:senses senses)) | |
237 | - | #:senses senses))) | |
238 | - | ||
239 | - | (define (gram-tag lst) | |
240 | - | (map | |
241 | - | (lambda (attr) | |
242 | - | (match attr | |
243 | - | (('suru . suru) | |
244 | - | `(tag ,(string-append "suru-" suru))) | |
245 | - | (('ni . ni) | |
246 | - | `(tag ,(string-append "ni-" ni))) | |
247 | - | (('shiku . shiku) | |
248 | - | `(tag ,(string-append "shiku-" shiku))) | |
249 | - | (('ku . ku) | |
250 | - | `(tag ,(string-append "ku-" ku))) | |
251 | - | (('to . to) | |
252 | - | `(tag ,(string-append "to-" to))) | |
253 | - | (('no . no) | |
254 | - | `(tag ,(string-append "no-" no))) | |
255 | - | (('nari . nari) | |
256 | - | `(tag ,(string-append "nari-" nari))) | |
257 | - | (('taru . taru) | |
258 | - | `(tag ,(string-append "taru-" taru))) | |
259 | - | (('level . level) | |
260 | - | `(tag ,(string-append "level-" level))) | |
261 | - | (('godanrow . tr) | |
262 | - | `(tag ,(string-append "godanrow-" tr))) | |
263 | - | (('onbin . tr) | |
264 | - | `(tag ,(string-append "onbin-" tr))) | |
265 | - | (('transitivity . tr) | |
266 | - | `(tag ,(string-append "transitivity-" tr))))) | |
267 | - | lst)) | |
268 | - | ||
269 | - | (define (sxml->string lst) | |
270 | - | (define (sub-loop loop tags result lst l) | |
271 | - | (let ((result (loop tags result l))) | |
272 | - | (if (list? result) | |
273 | - | (loop (append tags (filter list? result)) | |
274 | - | (apply string-append (filter string? result)) | |
275 | - | lst) | |
276 | - | (loop tags result lst)))) | |
277 | - | ||
278 | - | (let loop ((tags '()) (result "") (lst lst)) | |
279 | - | (match lst | |
280 | - | (() (if (null? tags) | |
281 | - | result | |
282 | - | (append tags (list result)))) | |
283 | - | ((? string? s) | |
284 | - | (loop tags (string-append result s) '())) | |
285 | - | (((? string? s) lst ...) | |
286 | - | (loop tags (string-append result s) lst)) | |
287 | - | ((('prior . _) lst ...) | |
288 | - | (loop tags result lst)) | |
289 | - | ((('options . _) lst ...) | |
290 | - | (loop tags result lst)) | |
291 | - | ((('firstname . _) lst ...) | |
292 | - | (loop tags result lst)) | |
293 | - | ((('ausn . _) lst ...) | |
294 | - | (loop tags result lst)) | |
295 | - | ((('lang . _) lst ...) | |
296 | - | (loop tags result lst)) | |
297 | - | ((('meta . _) lst ...) | |
298 | - | (loop tags result lst)) | |
299 | - | ((('genki . genki) lst ...) | |
300 | - | ;(loop (cons `(tag ,genki) tags) result lst)) | |
301 | - | (loop tags result lst)) | |
302 | - | ((('jlpt . jlpt) lst ...) | |
303 | - | (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst)) | |
304 | - | ((('ref . (? string? ref)) lst ...) | |
305 | - | (loop (cons `(ref ,ref) tags) result lst)) | |
306 | - | ((('ref (? string? ref)) lst ...) | |
307 | - | (loop (cons `(ref ,ref) tags) result lst)) | |
308 | - | ((('jap l) lst ...) | |
309 | - | (sub-loop loop tags result lst l)) | |
310 | - | ((('foreign l) lst ...) | |
311 | - | (sub-loop loop tags result lst l)) | |
312 | - | ((('transcr l) lst ...) | |
313 | - | (sub-loop loop tags result lst l)) | |
314 | - | ((((? symbol? s) v) lst ...) | |
315 | - | (throw 'unsupported-symbol s v)) | |
316 | - | ((((? symbol? s) . v) lst ...) | |
317 | - | (throw 'unsupported-symbol-pair s v)) | |
318 | - | (((? list? l) lst ...) | |
319 | - | (sub-loop loop tags result lst l))))) | |
320 | - | ||
321 | - | (define (sxml->element lst elem frq) | |
322 | - | (let ((elem (match elem | |
323 | - | ((_ . elem) elem) | |
324 | - | (_ elem)))) | |
325 | - | (match elem | |
326 | - | ('orth (let ((kanji (filter string? lst))) | |
327 | - | (if (null? kanji) #f `(kanji ,(car kanji))))) | |
328 | - | ('entry (sxml->result lst frq)) | |
329 | - | ('hira `(reading ,(car lst))) | |
330 | - | ('hatsuon `(hatsuon ,(car lst))) | |
331 | - | ('accent `(pitch ,(car lst))) | |
332 | - | ('reading (sxml->reading lst)) | |
333 | - | ('form lst) | |
334 | - | ('impli `(impli ,(car lst))) | |
335 | - | ('text (let loop ((text "") (lst lst)) | |
336 | - | (match lst | |
337 | - | (() text) | |
338 | - | ((('hasPrecedingSpace . _) lst ...) | |
339 | - | (loop (string-append " " text) lst)) | |
340 | - | ((('hasFollowingSpace . _) lst ...) | |
341 | - | (string-append (loop text lst) " ")) | |
342 | - | (((? string? s) lst ...) | |
343 | - | (loop (string-append text s) lst))))) | |
344 | - | ('famn (sxml->string lst)) | |
345 | - | ('expl (sxml->string lst)) | |
346 | - | ('expli (filter list? lst)) | |
347 | - | ('abbrev (filter list? lst)) | |
348 | - | ('token (filter string? lst)) | |
349 | - | ('tr (sxml->string lst)) | |
350 | - | ('transcr `(transcr ,lst)) | |
351 | - | ('trans | |
352 | - | (append (filter list? lst) | |
353 | - | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
354 | - | ('jap `(jap ,lst)) | |
355 | - | ('emph lst) | |
356 | - | ('title (filter string? lst)) | |
357 | - | ('transl (filter string? lst)) | |
358 | - | ('topic (filter string? lst)) | |
359 | - | ('iron (filter string? lst)) | |
360 | - | ('specchar (filter string? lst)) | |
361 | - | ('scientif (filter string? lst)) | |
362 | - | ('wikide #f) | |
363 | - | ('wikija #f) | |
364 | - | ('link #f) | |
365 | - | ('ref (sxml->ref lst)) | |
366 | - | ('sref (sxml->ref lst)) | |
367 | - | ('etym (sxml->source lst)) | |
368 | - | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
369 | - | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
370 | - | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
371 | - | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
372 | - | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
373 | - | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
374 | - | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
375 | - | ('seasonword `(tag ,(string-append "season-" (assoc-ref lst 'type)))) | |
376 | - | ('usg (usg->tag lst)) | |
377 | - | ('sense (sxml->sense lst)) | |
378 | - | ('steinhaus (let ((ref (sxml->string lst))) | |
379 | - | (if (string? ref) | |
380 | - | `(ref ,(sxml->string lst)) | |
381 | - | (throw 'not-steinhaus ref)))) | |
382 | - | ('pos '()); TODO: actually find what tag to use | |
383 | - | ('wordcomponent (cons `(tag "wordcomponent") (gram-tag lst))) | |
384 | - | ('meishi (cons `(tag "meishi") (gram-tag lst))) | |
385 | - | ('setsuzokushi (cons `(tag "setsuzokushi") (gram-tag lst))) | |
386 | - | ('daimeishi (cons `(tag "daimeishi") (gram-tag lst))) | |
387 | - | ('doushi (cons `(tag "doushi") (gram-tag lst))) | |
388 | - | ('kandoushi (cons `(tag "kandoushi") (gram-tag lst))) | |
389 | - | ('keiyoudoushi (cons `(tag "keiyoudoushi") (gram-tag lst))) | |
390 | - | ('keiyoushi (cons `(tag "keiyoushi") (gram-tag lst))) | |
391 | - | ('fukushi (cons `(tag "fukushi") (gram-tag lst))) | |
392 | - | ('rengo (cons `(tag "rengo") (gram-tag lst))) | |
393 | - | ('suffix (cons `(tag "suffix") (gram-tag lst))) | |
394 | - | ('prefix (cons `(tag "prefix") (gram-tag lst))) | |
395 | - | ('kanji (cons `(tag "kanji") (gram-tag lst))) | |
396 | - | ('rentaishi (cons `(tag "rentaishi") (gram-tag lst))) | |
397 | - | ('specialcharacter (cons `(tag "specialcharacter") (gram-tag lst))) | |
398 | - | ('joshi (cons `(tag "joshi") (gram-tag lst))) | |
399 | - | ('fukujoshi (cons `(tag "fukujoshi") (gram-tag lst))) | |
400 | - | ('kakujoshi (cons `(tag "kakujoshi") (gram-tag lst))) | |
401 | - | ('kakarijoshi (cons `(tag "kakarijoshi") (gram-tag lst))) | |
402 | - | ('shuujoshi (cons `(tag "shuujoshi") (gram-tag lst))) | |
403 | - | ('setsuzokujoshi (cons `(tag "setsuzokujoshi") (gram-tag lst))) | |
404 | - | ('jokeiyoushi (cons `(tag "jokeiyoushi") (gram-tag lst))) | |
405 | - | ('jodoushi (cons `(tag "jodoushi") (gram-tag lst))) | |
406 | - | ('ruigos lst) | |
407 | - | ('ruigo (ruigo->ref lst)) | |
408 | - | ('gramGrp lst)))) | |
409 | - | ||
410 | - | (define (create-parser frq) | |
411 | - | (ssax:make-parser | |
412 | - | NEW-LEVEL-SEED | |
413 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
414 | - | attributes) | |
415 | - | ||
416 | - | FINISH-ELEMENT | |
417 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
418 | - | (if (equal? elem-gi 'entries) | |
419 | - | seed | |
420 | - | (let* ((seed (reverse seed)) | |
421 | - | (element (sxml->element seed elem-gi frq))) | |
422 | - | (cons element parent-seed)))) | |
423 | - | ||
424 | - | CHAR-DATA-HANDLER | |
425 | - | (lambda (string1 string2 seed) | |
426 | - | (cons (string-append string1 string2) seed)))) | |
427 | - | ||
428 | - | (define (xml->results port frq) | |
429 | - | (let ((results (filter result? ((create-parser frq) port '())))) | |
430 | - | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) |
po/fr.po
7 | 7 | msgstr "" | |
8 | 8 | "Project-Id-Version: PACKAGE VERSION\n" | |
9 | 9 | "Report-Msgid-Bugs-To: \n" | |
10 | - | "POT-Creation-Date: 2020-06-05 15:21+0200\n" | |
10 | + | "POT-Creation-Date: 2020-06-25 02:57+0200\n" | |
11 | 11 | "PO-Revision-Date: 2020-06-15 14:56+0000\n" | |
12 | 12 | "Last-Translator: full name <fedora-account@lepiller.eu>\n" | |
13 | 13 | "Language-Team: French <https://translate.fedoraproject.org/projects/nani/" | |
… | |||
107 | 107 | msgid "JMdict" | |
108 | 108 | msgstr "JMdict" | |
109 | 109 | ||
110 | - | #: tools/list.scm:55 | |
110 | + | #: tools/list.scm:63 | |
111 | 111 | msgid "" | |
112 | 112 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
113 | 113 | "Development Group." | |
… | |||
115 | 115 | "Dictionnaire japonais/anglais de l???Electronic Dictionary Research and " | |
116 | 116 | "Development Group." | |
117 | 117 | ||
118 | - | #: tools/list.scm:54 | |
118 | + | #: tools/list.scm:62 | |
119 | 119 | msgid "" | |
120 | 120 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
121 | 121 | "Development Group." | |
… | |||
123 | 123 | "Dictionnaire japonais/anglais de l???Electronic Dictionary Research and " | |
124 | 124 | "Development Group." | |
125 | 125 | ||
126 | - | #: tools/list.scm:56 | |
126 | + | #: tools/list.scm:64 | |
127 | 127 | msgid "" | |
128 | 128 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
129 | 129 | "Development Group." | |
… | |||
131 | 131 | "Dictionnaire japonais/fran??ais de l???Electronic Dictionary Research and " | |
132 | 132 | "Development Group." | |
133 | 133 | ||
134 | + | #: tools/list.scm:53 | |
135 | + | #, fuzzy | |
136 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
137 | + | msgstr "Dictionnaire japonais/allemand de Wadoku." | |
138 | + | ||
134 | 139 | #: tools/list.scm:37 | |
135 | 140 | msgid "Japanese/German dictionary from Wadoku." | |
136 | 141 | msgstr "Dictionnaire japonais/allemand de Wadoku." | |
137 | 142 | ||
138 | - | #: tools/list.scm:57 | |
143 | + | #: tools/list.scm:65 | |
139 | 144 | msgid "" | |
140 | 145 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
141 | 146 | "Development Group." | |
… | |||
143 | 148 | "Dictionnaire japonais/allemand de l???Electronic Dictionary Research and " | |
144 | 149 | "Development Group." | |
145 | 150 | ||
146 | - | #: tools/list.scm:58 | |
151 | + | #: tools/list.scm:66 | |
147 | 152 | msgid "" | |
148 | 153 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
149 | 154 | "Development Group." | |
… | |||
151 | 156 | "Dictionnaire japonais/hongrois de l???Electronic Dictionary Research and " | |
152 | 157 | "Development Group." | |
153 | 158 | ||
154 | - | #: tools/list.scm:59 | |
159 | + | #: tools/list.scm:67 | |
155 | 160 | msgid "" | |
156 | 161 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
157 | 162 | "Development Group." | |
… | |||
159 | 164 | "Dictionnaire japonais/russe de l???Electronic Dictionary Research and " | |
160 | 165 | "Development Group." | |
161 | 166 | ||
162 | - | #: tools/list.scm:60 | |
167 | + | #: tools/list.scm:68 | |
163 | 168 | msgid "" | |
164 | 169 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
165 | 170 | "Development Group." | |
… | |||
167 | 172 | "Dictionnaire japonais/slov??ne de l???Electronic Dictionary Research and " | |
168 | 173 | "Development Group." | |
169 | 174 | ||
170 | - | #: tools/list.scm:61 | |
175 | + | #: tools/list.scm:69 | |
171 | 176 | msgid "" | |
172 | 177 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
173 | 178 | "Development Group." | |
… | |||
175 | 180 | "Dictionnaire japonais/espagnol de l???Electronic Dictionary Research and " | |
176 | 181 | "Development Group." | |
177 | 182 | ||
178 | - | #: tools/list.scm:62 | |
183 | + | #: tools/list.scm:70 | |
179 | 184 | msgid "" | |
180 | 185 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
181 | 186 | "Development Group." | |
… | |||
382 | 387 | " aidera ?? mieux prononcer les mots, avec l'accent de hauteur du japonais\n" | |
383 | 388 | " standard." | |
384 | 389 | ||
390 | + | #: tools/list.scm:55 | |
391 | + | #, fuzzy | |
392 | + | msgid "" | |
393 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
394 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
395 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
396 | + | "\tby kanji, reading (kana) and by French translation." | |
397 | + | msgstr "" | |
398 | + | "Ce dictionnaire vous permet d???effectuer des recherches sur la vue\n" | |
399 | + | " principale de cette appli. Si vous n???en t??l??chargez aucun, l???appli\n" | |
400 | + | " sera inutilisable puisque vous ne pourrez rien rechercher. Ce\n" | |
401 | + | " dictionnaire permet d???effectuer des recherches par kanji, par\n" | |
402 | + | " prononciation (kana) et par traduction allemande." | |
403 | + | ||
385 | 404 | #: tools/list.scm:39 | |
405 | + | #, fuzzy | |
386 | 406 | msgid "" | |
387 | 407 | "This dictionary allows you to do searches on the main view of this app.\n" | |
388 | - | " Failing to download on of these dictionaries will make the app " | |
408 | + | " Failing to download one of these dictionaries will make the app " | |
389 | 409 | "unusable\n" | |
390 | 410 | " as you can't search for anything. This dictionary can be searched " | |
391 | 411 | "for\n" | |
… | |||
397 | 417 | " dictionnaire permet d???effectuer des recherches par kanji, par\n" | |
398 | 418 | " prononciation (kana) et par traduction allemande." | |
399 | 419 | ||
400 | - | #: tools/list.scm:64 | |
420 | + | #: tools/list.scm:72 | |
401 | 421 | msgid "" | |
402 | 422 | "This dictionary allows you to do searches on the main view of this app.\n" | |
403 | 423 | " Failing to download one of these dictionaries will make the app " |
po/nani.pot
8 | 8 | msgstr "" | |
9 | 9 | "Project-Id-Version: PACKAGE VERSION\n" | |
10 | 10 | "Report-Msgid-Bugs-To: \n" | |
11 | - | "POT-Creation-Date: 2020-06-06 01:47+0200\n" | |
11 | + | "POT-Creation-Date: 2021-07-18 16:26+0200\n" | |
12 | 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" | |
13 | 13 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" | |
14 | 14 | "Language-Team: LANGUAGE <LL@li.org>\n" | |
… | |||
97 | 97 | msgid "JMdict" | |
98 | 98 | msgstr "" | |
99 | 99 | ||
100 | - | #: tools/list.scm:55 | |
100 | + | #: tools/list.scm:63 | |
101 | 101 | msgid "" | |
102 | 102 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
103 | 103 | "Development Group." | |
104 | 104 | msgstr "" | |
105 | 105 | ||
106 | - | #: tools/list.scm:54 | |
106 | + | #: tools/list.scm:62 | |
107 | 107 | msgid "" | |
108 | 108 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
109 | 109 | "Development Group." | |
110 | 110 | msgstr "" | |
111 | 111 | ||
112 | - | #: tools/list.scm:56 | |
112 | + | #: tools/list.scm:64 | |
113 | 113 | msgid "" | |
114 | 114 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
115 | 115 | "Development Group." | |
116 | 116 | msgstr "" | |
117 | 117 | ||
118 | + | #: tools/list.scm:53 | |
119 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
120 | + | msgstr "" | |
121 | + | ||
118 | 122 | #: tools/list.scm:37 | |
119 | 123 | msgid "Japanese/German dictionary from Wadoku." | |
120 | 124 | msgstr "" | |
121 | 125 | ||
122 | - | #: tools/list.scm:57 | |
126 | + | #: tools/list.scm:65 | |
123 | 127 | msgid "" | |
124 | 128 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
125 | 129 | "Development Group." | |
126 | 130 | msgstr "" | |
127 | 131 | ||
128 | - | #: tools/list.scm:58 | |
132 | + | #: tools/list.scm:66 | |
129 | 133 | msgid "" | |
130 | 134 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
131 | 135 | "Development Group." | |
132 | 136 | msgstr "" | |
133 | 137 | ||
134 | - | #: tools/list.scm:59 | |
138 | + | #: tools/list.scm:67 | |
135 | 139 | msgid "" | |
136 | 140 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
137 | 141 | "Development Group." | |
138 | 142 | msgstr "" | |
139 | 143 | ||
140 | - | #: tools/list.scm:60 | |
144 | + | #: tools/list.scm:68 | |
141 | 145 | msgid "" | |
142 | 146 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
143 | 147 | "Development Group." | |
144 | 148 | msgstr "" | |
145 | 149 | ||
146 | - | #: tools/list.scm:61 | |
150 | + | #: tools/list.scm:69 | |
147 | 151 | msgid "" | |
148 | 152 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
149 | 153 | "Development Group." | |
150 | 154 | msgstr "" | |
151 | 155 | ||
152 | - | #: tools/list.scm:62 | |
156 | + | #: tools/list.scm:70 | |
153 | 157 | msgid "" | |
154 | 158 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
155 | 159 | "Development Group." | |
… | |||
311 | 315 | " words better, with a standard Japanese pitch accent." | |
312 | 316 | msgstr "" | |
313 | 317 | ||
318 | + | #: tools/list.scm:55 | |
319 | + | msgid "" | |
320 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
321 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
322 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
323 | + | "\tby kanji, reading (kana) and by French translation." | |
324 | + | msgstr "" | |
325 | + | ||
314 | 326 | #: tools/list.scm:39 | |
315 | 327 | msgid "" | |
316 | 328 | "This dictionary allows you to do searches on the main view of this app.\n" | |
317 | - | " Failing to download on of these dictionaries will make the app " | |
329 | + | " Failing to download one of these dictionaries will make the app " | |
318 | 330 | "unusable\n" | |
319 | 331 | " as you can't search for anything. This dictionary can be searched " | |
320 | 332 | "for\n" | |
321 | 333 | " by kanji, reading (kana) and by German translation." | |
322 | 334 | msgstr "" | |
323 | 335 | ||
324 | - | #: tools/list.scm:64 | |
336 | + | #: tools/list.scm:72 | |
325 | 337 | msgid "" | |
326 | 338 | "This dictionary allows you to do searches on the main view of this app.\n" | |
327 | 339 | " Failing to download one of these dictionaries will make the app " |
po/uk.po
107 | 107 | msgid "JMdict" | |
108 | 108 | msgstr "JMdict" | |
109 | 109 | ||
110 | - | #: tools/list.scm:55 | |
110 | + | #: tools/list.scm:63 | |
111 | 111 | msgid "" | |
112 | 112 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
113 | 113 | "Development Group." | |
… | |||
115 | 115 | "????????????????-???????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
116 | 116 | "Development Group." | |
117 | 117 | ||
118 | - | #: tools/list.scm:54 | |
118 | + | #: tools/list.scm:62 | |
119 | 119 | msgid "" | |
120 | 120 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
121 | 121 | "Development Group." | |
… | |||
123 | 123 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
124 | 124 | "Development Group." | |
125 | 125 | ||
126 | - | #: tools/list.scm:56 | |
126 | + | #: tools/list.scm:64 | |
127 | 127 | msgid "" | |
128 | 128 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
129 | 129 | "Development Group." | |
… | |||
131 | 131 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
132 | 132 | "Development Group." | |
133 | 133 | ||
134 | + | #: tools/list.scm:53 | |
135 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
136 | + | msgstr "" | |
137 | + | ||
134 | 138 | #: tools/list.scm:37 | |
135 | 139 | msgid "Japanese/German dictionary from Wadoku." | |
136 | 140 | msgstr "????????????????-?????????????????? ?????????????? ?????? Wadoku." | |
137 | 141 | ||
138 | - | #: tools/list.scm:57 | |
142 | + | #: tools/list.scm:65 | |
139 | 143 | msgid "" | |
140 | 144 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
141 | 145 | "Development Group." | |
… | |||
143 | 147 | "????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
144 | 148 | "Development Group." | |
145 | 149 | ||
146 | - | #: tools/list.scm:58 | |
150 | + | #: tools/list.scm:66 | |
147 | 151 | msgid "" | |
148 | 152 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
149 | 153 | "Development Group." | |
… | |||
151 | 155 | "????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
152 | 156 | "Development Group." | |
153 | 157 | ||
154 | - | #: tools/list.scm:59 | |
158 | + | #: tools/list.scm:67 | |
155 | 159 | msgid "" | |
156 | 160 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
157 | 161 | "Development Group." | |
… | |||
159 | 163 | "????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
160 | 164 | "Development Group." | |
161 | 165 | ||
162 | - | #: tools/list.scm:60 | |
166 | + | #: tools/list.scm:68 | |
163 | 167 | msgid "" | |
164 | 168 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
165 | 169 | "Development Group." | |
… | |||
167 | 171 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
168 | 172 | "Development Group." | |
169 | 173 | ||
170 | - | #: tools/list.scm:61 | |
174 | + | #: tools/list.scm:69 | |
171 | 175 | msgid "" | |
172 | 176 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
173 | 177 | "Development Group." | |
… | |||
175 | 179 | "????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
176 | 180 | "Development Group." | |
177 | 181 | ||
178 | - | #: tools/list.scm:62 | |
182 | + | #: tools/list.scm:70 | |
179 | 183 | msgid "" | |
180 | 184 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
181 | 185 | "Development Group." | |
… | |||
383 | 387 | "?????????????????? ???? ?? ????????????????????, ?? ?????? ?????????????? ?????????????? ????????, ?????? ???????????????????? ??????\n" | |
384 | 388 | "?????????????????? ?????????? ??????????, ?? ?????????????????????????? ???????????????????? ???????????????? ????????????????." | |
385 | 389 | ||
390 | + | #: tools/list.scm:55 | |
391 | + | msgid "" | |
392 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
393 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
394 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
395 | + | "\tby kanji, reading (kana) and by French translation." | |
396 | + | msgstr "" | |
397 | + | ||
386 | 398 | #: tools/list.scm:39 | |
387 | 399 | msgid "" | |
388 | 400 | "This dictionary allows you to do searches on the main view of this app.\n" | |
389 | - | " Failing to download on of these dictionaries will make the app " | |
401 | + | " Failing to download one of these dictionaries will make the app " | |
390 | 402 | "unusable\n" | |
391 | 403 | " as you can't search for anything. This dictionary can be searched " | |
392 | 404 | "for\n" | |
… | |||
399 | 411 | "???????????????? ???? ???? ?????????????? ???????????? ?? ?????? ???????????? ??????????. ?? ?????????? ???????????????? ??????????\n" | |
400 | 412 | "???????????? ???? ????????????, ?????????????? ?????????????????? (????????) ???? ???????????????????? ??????????????????." | |
401 | 413 | ||
402 | - | #: tools/list.scm:64 | |
414 | + | #: tools/list.scm:72 | |
403 | 415 | msgid "" | |
404 | 416 | "This dictionary allows you to do searches on the main view of this app.\n" | |
405 | 417 | " Failing to download one of these dictionaries will make the app " | |
… | |||
445 | 457 | "multilingual,\n" | |
446 | 458 | "including Dutch, French, German and others." | |
447 | 459 | msgstr "" | |
448 | - | "???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a\"" | |
449 | - | ">Creative Commons\n" | |
460 | + | "???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a" | |
461 | + | "\">Creative Commons\n" | |
450 | 462 | "Share-Alike</a>. ???? ???????? ?? ???????????????????? ?? ???????????????????? (????????????????????) ?? " | |
451 | 463 | "???????????????????????? ??????????, ??????????????\n" | |
452 | 464 | "????????????????????????, ??????????????????????, ?????????????????? ???? ???????????? ????????????." |
radicals.mk
1 | - | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
1 | + | RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm | |
2 | 2 | DICOS+=dicos/radicals.nani | |
3 | 3 | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
4 | 4 | ||
5 | 5 | dictionaries/radkfilex.utf8: | |
6 | - | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
6 | + | #wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
7 | + | wget http://ftp.usf.edu/pub/ftp.monash.edu.au/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
7 | 8 | unzip dictionaries/kradzip.zip radkfilex -d dictionaries | |
8 | 9 | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
9 | 10 | rm dictionaries/radkfilex |
tools/jibiki.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (use-modules (nani result result)) | |
20 | + | (use-modules (nani result jibiki)) | |
21 | + | (use-modules (nani result frequency)) | |
22 | + | (use-modules (ice-9 match)) | |
23 | + | (use-modules (ice-9 binary-ports)) | |
24 | + | ||
25 | + | ;; Break these steps to try and let the GC reclaim these big objects | |
26 | + | (define (get-results1 input frq) | |
27 | + | (call-with-input-file input | |
28 | + | (lambda (port) | |
29 | + | (xml->results port frq)))) | |
30 | + | ||
31 | + | (define (get-results input meaning-filter frq) | |
32 | + | (let* ((results (get-results1 input frq)) | |
33 | + | (results (map (lambda (result) | |
34 | + | (update-result | |
35 | + | result | |
36 | + | #:meanings (filter meaning-filter | |
37 | + | (result-meanings result)))) | |
38 | + | results)) | |
39 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
40 | + | results))) | |
41 | + | results)) | |
42 | + | ||
43 | + | (define (compile input meaning-filter output) | |
44 | + | (let* ((results (get-results input meaning-filter | |
45 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
46 | + | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
47 | + | (call-with-output-file output | |
48 | + | (lambda (port) | |
49 | + | (put-bytevector port | |
50 | + | (serialize-dictionary results)))))) | |
51 | + | ||
52 | + | (match (command-line) | |
53 | + | ((_ cmd input output) | |
54 | + | (cond | |
55 | + | ((equal? cmd "build") | |
56 | + | (compile input (const #t) output)) | |
57 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |
tools/jmdict.scm
16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | - | (use-modules (nani jmdict trie)) | |
20 | - | (use-modules (nani jmdict serialize)) | |
21 | - | (use-modules (nani jmdict xml)) | |
22 | - | (use-modules (nani frequency)) | |
23 | - | (use-modules (nani trie)) | |
24 | - | (use-modules (nani result)) | |
19 | + | (use-modules (nani result frequency)) | |
20 | + | (use-modules (nani result jmdict)) | |
21 | + | (use-modules (nani result result)) | |
25 | 22 | (use-modules (ice-9 match)) | |
26 | 23 | (use-modules (ice-9 binary-ports)) | |
27 | 24 | ||
… | |||
31 | 28 | (lambda (port) | |
32 | 29 | (xml->results port frq)))) | |
33 | 30 | ||
34 | - | (define (get-results input sense-filter frq) | |
31 | + | (define (get-results input meaning-filter frq) | |
35 | 32 | (let* ((results (get-results1 input frq)) | |
36 | 33 | (results (map (lambda (result) | |
37 | 34 | (update-result | |
38 | 35 | result | |
39 | - | #:senses (filter sense-filter | |
40 | - | (result-senses result)))) | |
36 | + | #:meanings (filter meaning-filter | |
37 | + | (result-meanings result)))) | |
41 | 38 | results)) | |
42 | - | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
39 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
43 | 40 | results))) | |
44 | 41 | results)) | |
45 | 42 | ||
46 | - | (define (compile input sense-filter output) | |
47 | - | (let* ((results (get-results input sense-filter | |
48 | - | (load-frequency "dictionaries/frequency.tsv"))) | |
49 | - | (kanji-trie (compress-trie (make-kanji-trie results))) | |
50 | - | (reading-trie (compress-trie (make-reading-trie results))) | |
51 | - | (meaning-trie (compress-trie (make-meaning-trie results)))) | |
43 | + | (define (compile input meaning-filter output) | |
44 | + | (let* ((results (get-results input meaning-filter | |
45 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
52 | 46 | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
53 | 47 | (call-with-output-file output | |
54 | 48 | (lambda (port) | |
55 | 49 | (put-bytevector port | |
56 | - | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
50 | + | (serialize-dictionary results)))))) | |
57 | 51 | ||
58 | 52 | (define (print word dict) | |
59 | 53 | #t) | |
… | |||
64 | 58 | ((equal? cmd "build") | |
65 | 59 | (if (equal? lang "e") | |
66 | 60 | (compile input (const #t) output) | |
67 | - | (compile input (lambda (sense) (equal? (sense-language sense) lang)) output))) | |
61 | + | (compile input (lambda (meaning) (equal? (meaning-language meaning) lang)) output))) | |
68 | 62 | (else (format #t "Unknown cmd ~a.~%" cmd)))) | |
69 | 63 | ((_ "print" word input) | |
70 | 64 | (print word input))) |
tools/list.scm
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | 19 | (use-modules (tools i18n)) | |
20 | - | (use-modules (nani radk)) | |
21 | - | (use-modules (nani jmdict serialize)) | |
22 | - | (use-modules (nani wadoku pitch)) | |
20 | + | (use-modules (nani kanji radk)) | |
21 | + | (use-modules (nani result result)) | |
22 | + | (use-modules (nani pitch pitch)) | |
23 | 23 | (use-modules (gcrypt hash)) | |
24 | 24 | (use-modules (ice-9 match)) | |
25 | 25 | (use-modules (ice-9 format)) | |
… | |||
37 | 37 | `(_ "Japanese/German dictionary from Wadoku.")) | |
38 | 38 | (define wadoku-description | |
39 | 39 | `(_ "This dictionary allows you to do searches on the main view of this app. | |
40 | - | Failing to download on of these dictionaries will make the app unusable | |
40 | + | Failing to download one of these dictionaries will make the app unusable | |
41 | 41 | as you can't search for anything. This dictionary can be searched for | |
42 | 42 | by kanji, reading (kana) and by German translation.")) | |
43 | 43 | ||
… | |||
49 | 49 | and this dictionary will add information that will help you pronounce | |
50 | 50 | words better, with a standard Japanese pitch accent.")) | |
51 | 51 | ||
52 | + | (define jibiki-synopsis | |
53 | + | `(_ "Japanese/French dictionary from the Jibiki project.")) | |
54 | + | (define jibiki-description | |
55 | + | `(_ "This dictionary allows you to do searches on the main view of this app. | |
56 | + | Failing to download one of these dictionaries will make the app unusable | |
57 | + | as you can't search for anything. This dictionary can be searched for | |
58 | + | by kanji, reading (kana) and by French translation.")) | |
59 | + | ||
52 | 60 | (define (jmdict-synopsis lang) | |
53 | 61 | (match lang | |
54 | 62 | ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group.")) | |
… | |||
80 | 88 | (if long? | |
81 | 89 | wadoku-pitch-description | |
82 | 90 | wadoku-pitch-synopsis)) | |
91 | + | ((equal? (dico-type dico) "jibiki") | |
92 | + | (if long? | |
93 | + | jibiki-description | |
94 | + | jibiki-synopsis)) | |
83 | 95 | ((equal? (dico-type dico) "jmdict") | |
84 | 96 | (let ((dico-lang (substring dico 7))) | |
85 | 97 | (if long? | |
… | |||
107 | 119 | ((equal? file "radicals") "radk") | |
108 | 120 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
109 | 121 | "jmdict") | |
122 | + | ((equal? file "jibiki_fre") "jibiki") | |
110 | 123 | ((equal? file "wadoku_ger") "wadoku") | |
111 | 124 | ((equal? file "wadoku_pitch") "wadoku_pitch"))) | |
112 | 125 | ||
… | |||
114 | 127 | (cond | |
115 | 128 | ((equal? (dico-type (dico-name file)) "radk") | |
116 | 129 | (kanji-count file)) | |
117 | - | ((member (dico-type (dico-name file)) '("jmdict" "wadoku")) | |
118 | - | (jmdict-entry-count file)) | |
130 | + | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
131 | + | (dictionary-entry-count file)) | |
119 | 132 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
120 | 133 | (pitch-entry-count file)))) | |
121 | 134 |
tools/radk.scm
16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | - | (use-modules (nani radk)) | |
19 | + | (use-modules (nani kanji radk)) | |
20 | 20 | (use-modules (ice-9 match)) | |
21 | 21 | (use-modules (ice-9 binary-ports)) | |
22 | 22 |
tools/wadoku.scm
16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | - | (use-modules (nani jmdict trie)) | |
20 | - | (use-modules (nani jmdict serialize)) | |
21 | - | (use-modules (nani wadoku xml)) | |
22 | - | (use-modules (nani wadoku pitch)) | |
23 | - | (use-modules (nani frequency)) | |
24 | - | (use-modules (nani trie)) | |
25 | - | (use-modules (nani result)) | |
19 | + | (use-modules (nani pitch wadoku)) | |
20 | + | (use-modules (nani pitch pitch)) | |
21 | + | (use-modules (nani result frequency)) | |
22 | + | (use-modules (nani result result)) | |
23 | + | (use-modules (nani result wadoku)) | |
26 | 24 | (use-modules (ice-9 match)) | |
27 | 25 | (use-modules (ice-9 binary-ports)) | |
28 | 26 | ||
… | |||
32 | 30 | (lambda (port) | |
33 | 31 | (xml->results port frq)))) | |
34 | 32 | ||
35 | - | (define (get-results input sense-filter frq) | |
33 | + | (define (get-results input meaning-filter frq) | |
36 | 34 | (let* ((results (get-results1 input frq)) | |
37 | 35 | (results (map (lambda (result) | |
38 | 36 | (update-result | |
39 | 37 | result | |
40 | - | #:senses (filter sense-filter | |
41 | - | (result-senses result)))) | |
38 | + | #:meanings (filter meaning-filter | |
39 | + | (result-meanings result)))) | |
42 | 40 | results)) | |
43 | - | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
41 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
44 | 42 | results))) | |
45 | 43 | results)) | |
46 | 44 | ||
47 | - | (define (compile input sense-filter output) | |
48 | - | (let* ((results (get-results input sense-filter | |
49 | - | (load-frequency "dictionaries/frequency.tsv"))) | |
50 | - | (kanji-trie (compress-trie (make-kanji-trie results))) | |
51 | - | (reading-trie (compress-trie (make-reading-trie results))) | |
52 | - | (meaning-trie (compress-trie (make-meaning-trie results)))) | |
45 | + | (define (compile input meaning-filter output) | |
46 | + | (let* ((results (get-results input meaning-filter | |
47 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
53 | 48 | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
54 | 49 | (call-with-output-file output | |
55 | 50 | (lambda (port) | |
56 | 51 | (put-bytevector port | |
57 | - | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
52 | + | (serialize-dictionary results)))))) | |
58 | 53 | ||
59 | 54 | (define (get-pitch input) | |
60 | 55 | (call-with-input-file input | |
… | |||
63 | 58 | ||
64 | 59 | (define (pitch input output) | |
65 | 60 | (let ((results (get-pitch input))) | |
66 | - | (format #t "~a results." (length results)) | |
61 | + | (format #t "~a results.~%" (length results)) | |
67 | 62 | (call-with-output-file output | |
68 | 63 | (lambda (port) | |
69 | 64 | (put-bytevector port |