Better compression and sort results by relevance
Makefile
4 | 4 | ||
5 | 5 | DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS))) | |
6 | 6 | ||
7 | - | PAGES=blog.scm e404.scm feeds.scm index.scm mentions.scm | |
7 | + | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
8 | 8 | ||
9 | 9 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
10 | 10 | tools/i18n.scm tools/theme.scm | |
… | |||
14 | 14 | ||
15 | 15 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
16 | 16 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
17 | - | modules/nani/jmdict/entities.scm | |
17 | + | modules/nani/jmdict/entities.scm modules/nani/frequency.scm | |
18 | 18 | ||
19 | - | all: po/fr/LC_MESSAGES/nani.mo po/eo/LC_MESSAGES/nani.mo $(WEB_FILES) | |
19 | + | LANGS=fr | |
20 | + | ||
21 | + | all: $(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) $(WEB_FILES) | |
20 | 22 | haunt build | |
21 | 23 | rm -rf public.bak | |
22 | 24 | mv public public.bak | |
… | |||
25 | 27 | ||
26 | 28 | download: | |
27 | 29 | @rm -f dictionaries/* | |
28 | - | @$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml | |
30 | + | @$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml dictionaries/frequency.tsv | |
29 | 31 | ||
32 | + | # Download JMdict dictionaries from ERDRG | |
30 | 33 | dictionaries/%.xml: | |
31 | - | wget http://ftp.monash.edu/pub/nihongo/$(shell basename $<).gz -O $<.gz | |
32 | - | gunzip $<.gz | |
33 | - | sed -i -e 's|<|\&\<;|g' -e 's|>|\&\>;|g' $< | |
34 | - | sed -i -e 's|&\([^;]\+\);|\1|g' $< | |
35 | - | cp $< $@ | |
34 | + | dl_filename="$(shell echo "$@" | rev | cut -c5- | rev)"; \ | |
35 | + | wget http://ftp.monash.edu/pub/nihongo/"$$dl_filename".gz -O "$$dl_filename.gz"; \ | |
36 | + | gunzip "$$dl_filename.gz"; \ | |
37 | + | sed -i -e 's|<|\&\<;|g' -e 's|>|\&\>;|g' "$$dl_filename"; \ | |
38 | + | sed -i -e 's|&\([^;]\+\);|\1|g' "$$dl_filename"; \ | |
39 | + | cp "$$dl_filename" "$@" | |
40 | + | ||
41 | + | # Download frequency analysis run on Wikipedia in 2015 | |
42 | + | # https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Japanese2015_10000 | |
43 | + | dictionaries/frequency.tsv: | |
44 | + | wget --no-check-certificate \ | |
45 | + | https://namakajiri.net/data/wikipedia-20150422-lemmas.tsv -O $@.tmp | |
46 | + | sed -i 's| ||g' $@.tmp | |
47 | + | mv $@.tmp $@ | |
36 | 48 | ||
37 | 49 | dictionaries/%.sxml: dictionaries/%.xml tools/jmdict.scm | |
38 | 50 | guile -L modules tools/jmdict.scm convert $< nolang $@ | |
39 | 51 | ||
40 | - | dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm $(DICO_MODULES) | |
52 | + | dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
41 | 53 | guile -L modules tools/jmdict.scm build \ | |
42 | 54 | $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@ | |
43 | 55 | ||
44 | - | dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm $(DICO_MODULES) | |
56 | + | dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
45 | 57 | guile -L modules tools/jmdict.scm build $< e $@ | |
46 | 58 | ||
47 | 59 | po/%/LC_MESSAGES/nani.mo: po/%.po |
modules/nani/frequency.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani frequency) | |
20 | + | #:use-module (ice-9 rdelim) | |
21 | + | #:export (load-frequency | |
22 | + | frequency-entity)) | |
23 | + | ||
24 | + | (define (load-frequency file) | |
25 | + | (call-with-input-file file | |
26 | + | (lambda (port) | |
27 | + | (let loop ((frq '())) | |
28 | + | (let* ((line (%read-line port)) | |
29 | + | (line (car line))) | |
30 | + | (if (eof-object? line) | |
31 | + | frq | |
32 | + | (let ((content (string-split line #\tab))) | |
33 | + | (loop (cons (cons (cadr (cdr content)) | |
34 | + | (string->number (car content))) | |
35 | + | frq))))))))) | |
36 | + | ||
37 | + | (define (frequency-entity frq word) | |
38 | + | (let ((freq (assoc-ref frq word))) | |
39 | + | (cond | |
40 | + | ((not freq) | |
41 | + | #f) | |
42 | + | ((< freq 501) | |
43 | + | "frq500") | |
44 | + | ((< freq 1001) | |
45 | + | "frq1000") | |
46 | + | ((< freq 2001) | |
47 | + | "frq2000") | |
48 | + | ((< freq 5001) | |
49 | + | "frq1000") | |
50 | + | ((< freq 10001) | |
51 | + | "frq10000") | |
52 | + | ((< freq 20001) | |
53 | + | "frq20000") | |
54 | + | (else #f)))) |
modules/nani/huffman.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani huffman) | |
20 | + | #:use-module (ice-9 match) | |
21 | + | #:use-module (rnrs bytevectors) | |
22 | + | #:export (create-huffman | |
23 | + | huffman->code | |
24 | + | huffman-encode | |
25 | + | huffman-decode | |
26 | + | serialize-huffman)) | |
27 | + | ||
28 | + | (define (add-occurence occ char) | |
29 | + | (let* ((o (assoc-ref occ char)) | |
30 | + | (o (+ (if o o 0) 1))) | |
31 | + | (assoc-set! occ char o))) | |
32 | + | ||
33 | + | (define (add-occurences occ str) | |
34 | + | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
35 | + | (if (null? lst) | |
36 | + | occ | |
37 | + | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
38 | + | ||
39 | + | (define (get-occurences string-list) | |
40 | + | (let loop ((lst string-list) (occ '())) | |
41 | + | (if (null? lst) | |
42 | + | occ | |
43 | + | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
44 | + | ||
45 | + | (define (order tree) | |
46 | + | (if (null? tree) | |
47 | + | tree | |
48 | + | (add-in-order (car tree) (order (cdr tree))))) | |
49 | + | ||
50 | + | (define (add-in-order elem tree) | |
51 | + | (if (null? tree) | |
52 | + | (list elem) | |
53 | + | (let ((elem-pos (cdr elem)) | |
54 | + | (tree-pos (cdr (car tree)))) | |
55 | + | (if (> elem-pos tree-pos) | |
56 | + | (cons (car tree) (add-in-order elem (cdr tree))) | |
57 | + | (cons elem tree))))) | |
58 | + | ||
59 | + | (define (create-huffman string-list) | |
60 | + | (let ((total (apply + (map string-length string-list))) | |
61 | + | (occurences (get-occurences string-list))) | |
62 | + | (let loop ((tree (order occurences))) | |
63 | + | (if (equal? (length tree) 1) | |
64 | + | tree | |
65 | + | (let* ((e1 (car tree)) | |
66 | + | (tree (cdr tree)) | |
67 | + | (e2 (car tree)) | |
68 | + | (tree (cdr tree))) | |
69 | + | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
70 | + | ||
71 | + | (define (huffman->code huffman) | |
72 | + | (match huffman | |
73 | + | ('() '()) | |
74 | + | ((((h1 h2) . weight)) | |
75 | + | (append | |
76 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
77 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
78 | + | (((h1 h2) . weight) | |
79 | + | (append | |
80 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
81 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
82 | + | ((((? char? char) . weight)) | |
83 | + | (list (cons char '()))) | |
84 | + | (((? char? char) . weight) | |
85 | + | (list (cons char '()))))) | |
86 | + | ||
87 | + | (define (bits->bytes b) | |
88 | + | (match b | |
89 | + | ('() '()) | |
90 | + | ((a b c d e f g h bs ...) | |
91 | + | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
92 | + | (bits->bytes bs))) | |
93 | + | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
94 | + | ||
95 | + | (define (huffman-encode code str) | |
96 | + | (u8-list->bytevector | |
97 | + | (bits->bytes | |
98 | + | (apply append (map (lambda (c) (assoc-ref code c)) | |
99 | + | (append (string->list str) (list #\nul))))))) | |
100 | + | ||
101 | + | (define (byte->bits b) | |
102 | + | (let loop ((i 2) (b b) (by '())) | |
103 | + | (if (eq? i 512) | |
104 | + | by | |
105 | + | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
106 | + | ||
107 | + | (define (huffman-decode huffman bv) | |
108 | + | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
109 | + | (list->string | |
110 | + | (reverse | |
111 | + | (let loop ((seq seq) (result '()) (huff huffman)) | |
112 | + | (if (null? seq) | |
113 | + | result | |
114 | + | (match huff | |
115 | + | ((((h1 h2) . weight)) | |
116 | + | (if (= (car seq) 0) | |
117 | + | (loop (cdr seq) result h1) | |
118 | + | (loop (cdr seq) result h2))) | |
119 | + | (((h1 h2) . weight) | |
120 | + | (if (= (car seq) 0) | |
121 | + | (loop (cdr seq) result h1) | |
122 | + | (loop (cdr seq) result h2))) | |
123 | + | ((((? char? char) . weight)) | |
124 | + | (if (equal? char #\nul) | |
125 | + | result | |
126 | + | (loop seq (cons char result) huffman))) | |
127 | + | (((? char? char) . weight) | |
128 | + | (if (equal? char #\nul) | |
129 | + | result | |
130 | + | (loop seq (cons char result) huffman)))))))))) | |
131 | + | ||
132 | + | (define (serialize-huffman huffman) | |
133 | + | (define (serialize huffman) | |
134 | + | (match huffman | |
135 | + | ((((h1 h2) . weight)) | |
136 | + | (append '(1) (serialize h1) (serialize h2) '(2))) | |
137 | + | (((h1 h2) . weight) | |
138 | + | (append '(1) (serialize h1) (serialize h2) '(2))) | |
139 | + | ((((? char? char) . weight)) | |
140 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
141 | + | (((? char? char) . weight) | |
142 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
143 | + | (u8-list->bytevector (serialize huffman))) |
modules/nani/jmdict/entities.scm
174 | 174 | ("sumo" . 170) | |
175 | 175 | ("zool" . 171) | |
176 | 176 | ("joc" . 172) | |
177 | - | ("anat" . 173))) | |
177 | + | ("anat" . 173) | |
178 | + | ("frq500" . 174) | |
179 | + | ("frq1000" . 175) | |
180 | + | ("frq2000" . 176) | |
181 | + | ("frq5000" . 177) | |
182 | + | ("frq10000" . 178) | |
183 | + | ("frq20000" . 179))) | |
178 | 184 | ||
179 | 185 | (define (get-entity ent) | |
180 | 186 | (let ((val (assoc-ref entities ent))) |
modules/nani/jmdict/serialize.scm
17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | 18 | ||
19 | 19 | (define-module (nani jmdict serialize) | |
20 | + | #:use-module (nani huffman) | |
20 | 21 | #:use-module (nani result) | |
21 | 22 | #:use-module (nani trie) | |
22 | 23 | #:use-module (rnrs bytevectors) | |
… | |||
46 | 47 | (loop pos (cdr trs) (append bvs bv)))))) | |
47 | 48 | (trie-position-set! trie pos) | |
48 | 49 | (let* ((vals-sz (list-size (trie-vals trie) int-size)) | |
49 | - | (trs-sz (list-size (trie-transitions trie) (const 5))) | |
50 | - | (sz (+ vals-sz trs-sz)) | |
50 | + | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
51 | + | (sz (+ vals-sz 1 trs-sz)) | |
51 | 52 | (bv (make-bytevector sz 0))) | |
52 | 53 | (serialize-list (map (lambda (pos) (result-position (array-ref results pos))) | |
53 | 54 | (trie-vals trie)) | |
… | |||
57 | 58 | (+ pos (bytevector-length bv)))) | |
58 | 59 | (next-pos (car bvs)) | |
59 | 60 | (bvs (cdr bvs))) | |
61 | + | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
60 | 62 | (serialize-list (trie-transitions trie) serialize-pointer | |
61 | - | vals-sz bv) | |
63 | + | (+ vals-sz 1) bv #:size? #f) | |
62 | 64 | (cons next-pos (cons bv bvs))))) | |
63 | 65 | ||
66 | + | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
67 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
68 | + | (when size? | |
69 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
70 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
71 | + | (if (null? lst) | |
72 | + | pos | |
73 | + | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
74 | + | (define* (list-size lst size #:key (size? #t)) | |
75 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
76 | + | (apply + (if size? 2 0) (map size lst))) | |
77 | + | ||
64 | 78 | (define (serialize-pointer ptr pos bv) | |
65 | 79 | (bytevector-u8-set! bv pos (car ptr)) | |
66 | 80 | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
… | |||
81 | 95 | (+ pos 1)) | |
82 | 96 | (define boolean-size (const 1)) | |
83 | 97 | ||
84 | - | (define (serialize-string str pos bv) | |
85 | - | (let ((sbv (string->utf8 str))) | |
86 | - | (bytevector-u32-set! bv pos (bytevector-length sbv) (endianness big)) | |
87 | - | (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv)) | |
88 | - | (+ pos 4 (bytevector-length sbv)))) | |
89 | - | (define (string-size str) | |
90 | - | (let ((sbv (string->utf8 str))) | |
91 | - | (+ 4 (bytevector-length sbv)))) | |
92 | - | ||
93 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
94 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
95 | - | (when size? | |
96 | - | (bytevector-u32-set! bv pos (length lst) (endianness big))) | |
97 | - | (let loop ((lst lst) (pos (+ pos (if size? 4 0)))) | |
98 | - | (if (null? lst) | |
99 | - | pos | |
100 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
101 | - | (define* (list-size lst size #:key (size? #t)) | |
102 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
103 | - | (apply + (if size? 4 0) (map size lst))) | |
104 | - | ||
105 | - | (define (serialize-source source pos bv) | |
106 | - | (when (not (source? source)) (throw 'not-source source)) | |
107 | - | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
108 | - | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
109 | - | (pos (serialize-string (source-type source) pos bv)) | |
110 | - | (pos (serialize-string (source-lang source) pos bv))) | |
111 | - | pos)) | |
112 | - | (define (source-size source) | |
113 | - | (when (not (source? source)) (throw 'not-source source)) | |
114 | - | (+ (list-size (source-content source) string-size) | |
115 | - | (boolean-size (source-wasei? source)) | |
116 | - | (string-size (source-type source)) | |
117 | - | (string-size (source-lang source)))) | |
98 | + | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
99 | + | (define kanji-huffman | |
100 | + | (let ((kanjis (apply append (map result-kanjis results)))) | |
101 | + | (create-huffman kanjis))) | |
102 | + | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
103 | + | (define reading-huffman | |
104 | + | (let* ((readings (apply append (map result-readings results))) | |
105 | + | (readings (apply append (map reading-readings readings)))) | |
106 | + | (create-huffman readings))) | |
107 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
108 | + | (define sense-huffman | |
109 | + | (let* ((senses (apply append (map result-senses results))) | |
110 | + | (glosses (apply append (map sense-glosses senses)))) | |
111 | + | (create-huffman glosses))) | |
112 | + | (define sense-huffman-code (huffman->code sense-huffman)) | |
118 | 113 | ||
119 | - | (define (serialize-reading reading pos bv) | |
120 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
121 | - | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
122 | - | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
123 | - | (pos (serialize-list (reading-readings reading) serialize-string pos bv))) | |
124 | - | pos)) | |
125 | - | (define (reading-size reading) | |
126 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
127 | - | (+ (list-size (reading-kanjis reading) string-size) | |
128 | - | (list-size (reading-info reading) string-size) | |
129 | - | (list-size (reading-readings reading) string-size))) | |
114 | + | (define (serialize-string str pos bv) | |
115 | + | (let ((sbv (string->utf8 str))) | |
116 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
117 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
118 | + | (+ pos 1 (bytevector-length sbv)))) | |
119 | + | (define (string-size str) | |
120 | + | (let ((sbv (string->utf8 str))) | |
121 | + | (+ 1 (bytevector-length sbv)))) | |
130 | 122 | ||
131 | - | (define (serialize-sense sense pos bv) | |
132 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
133 | - | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
134 | - | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
135 | - | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
136 | - | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
137 | - | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
138 | - | (pos (serialize-list (sense-glosses sense) serialize-string pos bv)) | |
139 | - | (pos (serialize-string (sense-language sense) pos bv))) | |
140 | - | pos)) | |
141 | - | (define (sense-size sense) | |
142 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
143 | - | (+ (list-size (sense-references sense) string-size) | |
144 | - | (list-size (sense-limits sense) string-size) | |
145 | - | (list-size (sense-infos sense) string-size) | |
146 | - | (list-size (sense-sources sense) source-size) | |
147 | - | (list-size (sense-tags sense) char-size) | |
148 | - | (list-size (sense-glosses sense) string-size) | |
149 | - | (string-size (sense-language sense)))) | |
123 | + | (define (serialize-huffman-string huffman-code) | |
124 | + | (lambda (str pos bv) | |
125 | + | (let ((sbv (huffman-encode huffman-code str))) | |
126 | + | (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv)) | |
127 | + | (+ pos (bytevector-length sbv))))) | |
128 | + | (define (huffman-string-size huffman-code) | |
129 | + | (lambda (str) | |
130 | + | (let ((sbv (huffman-encode huffman-code str))) | |
131 | + | (+ (bytevector-length sbv))))) | |
132 | + | ||
133 | + | (define (serialize-source source pos bv) | |
134 | + | (when (not (source? source)) (throw 'not-source source)) | |
135 | + | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
136 | + | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
137 | + | (pos (serialize-string (source-type source) pos bv)) | |
138 | + | (pos (serialize-string (source-lang source) pos bv))) | |
139 | + | pos)) | |
140 | + | (define (source-size source) | |
141 | + | (when (not (source? source)) (throw 'not-source source)) | |
142 | + | (+ (list-size (source-content source) string-size) | |
143 | + | (boolean-size (source-wasei? source)) | |
144 | + | (string-size (source-type source)) | |
145 | + | (string-size (source-lang source)))) | |
146 | + | ||
147 | + | (define (serialize-reading reading pos bv) | |
148 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
149 | + | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
150 | + | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
151 | + | (pos (serialize-list (reading-readings reading) | |
152 | + | (serialize-huffman-string reading-huffman-code) pos bv))) | |
153 | + | pos)) | |
154 | + | (define (reading-size reading) | |
155 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
156 | + | (+ (list-size (reading-kanjis reading) string-size) | |
157 | + | (list-size (reading-info reading) string-size) | |
158 | + | (list-size (reading-readings reading) (huffman-string-size reading-huffman-code)))) | |
159 | + | ||
160 | + | (define (serialize-sense sense pos bv) | |
161 | + | (when (not (sense? sense)) (throw 'not-sense sense)) | |
162 | + | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
163 | + | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
164 | + | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
165 | + | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
166 | + | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
167 | + | (pos (serialize-list (sense-glosses sense) | |
168 | + | (serialize-huffman-string sense-huffman-code) pos bv)) | |
169 | + | (pos (serialize-string (sense-language sense) pos bv))) | |
170 | + | pos)) | |
171 | + | (define (sense-size sense) | |
172 | + | (when (not (sense? sense)) (throw 'not-sense sense)) | |
173 | + | (+ (list-size (sense-references sense) string-size) | |
174 | + | (list-size (sense-limits sense) string-size) | |
175 | + | (list-size (sense-infos sense) string-size) | |
176 | + | (list-size (sense-sources sense) source-size) | |
177 | + | (list-size (sense-tags sense) char-size) | |
178 | + | (list-size (sense-glosses sense) | |
179 | + | (huffman-string-size sense-huffman-code)) | |
180 | + | (string-size (sense-language sense)))) | |
181 | + | ||
182 | + | (define (serialize-result result pos bv) | |
183 | + | (when (not (result? result)) (throw 'not-result result)) | |
184 | + | (result-position-set! result pos) | |
185 | + | (let* ((pos (serialize-list (result-kanjis result) | |
186 | + | (serialize-huffman-string kanji-huffman-code) pos bv)) | |
187 | + | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
188 | + | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
189 | + | pos)) | |
190 | + | (define (result-size result) | |
191 | + | (when (not (result? result)) (throw 'not-result result)) | |
192 | + | (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code)) | |
193 | + | (list-size (result-readings result) reading-size) | |
194 | + | (list-size (result-senses result) sense-size))) | |
150 | 195 | ||
151 | - | (define (serialize-result result pos bv) | |
152 | - | (when (not (result? result)) (throw 'not-result result)) | |
153 | - | (result-position-set! result pos) | |
154 | - | (let* ((pos (serialize-list (result-kanjis result) serialize-string pos bv)) | |
155 | - | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
156 | - | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
157 | - | pos)) | |
158 | - | (define (result-size result) | |
159 | - | (when (not (result? result)) (throw 'not-result result)) | |
160 | - | (+ (list-size (result-kanjis result) string-size) | |
161 | - | (list-size (result-readings result) reading-size) | |
162 | - | (list-size (result-senses result) sense-size))) | |
196 | + | (define (trie-node-size trie) | |
197 | + | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
163 | 198 | ||
164 | - | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
165 | 199 | (let* ((header (string->utf8 "NANI_JMDICT001")) | |
166 | 200 | (header-size (bytevector-length header)) | |
167 | 201 | (pointers (make-bytevector 12 0)) | |
202 | + | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
203 | + | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
204 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
205 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
206 | + | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
207 | + | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
168 | 208 | (results-size (list-size results result-size #:size? #f)) | |
169 | - | (results-bv (make-bytevector (+ header-size 12 results-size 0)))) | |
170 | - | (serialize-list results serialize-result (+ header-size 12) results-bv #:size? #f) | |
209 | + | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
210 | + | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
211 | + | reading-huffman-size sense-huffman-size | |
212 | + | results-size 0)))) | |
213 | + | (format #t "Number of nodes in kanjis: ~a~%" | |
214 | + | (trie-node-size kanji-trie)) | |
215 | + | (format #t "Number of nodes in readings: ~a~%" | |
216 | + | (trie-node-size reading-trie)) | |
217 | + | (format #t "Number of nodes in senses: ~a~%" | |
218 | + | (trie-node-size sense-trie)) | |
219 | + | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
220 | + | results-bv #:size? #f) | |
171 | 221 | (let* ((results (list->array 1 results)) | |
172 | 222 | (pos (bytevector-length results-bv)) | |
173 | 223 | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
… | |||
180 | 230 | (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big)) | |
181 | 231 | (bytevector-copy! header 0 results-bv 0 header-size) | |
182 | 232 | (bytevector-copy! pointers 0 results-bv header-size 12) | |
233 | + | (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size) | |
234 | + | (bytevector-copy! reading-huffman-bv 0 results-bv | |
235 | + | (+ header-size 12 kanji-huffman-size) | |
236 | + | reading-huffman-size) | |
237 | + | (bytevector-copy! sense-huffman-bv 0 results-bv | |
238 | + | (+ header-size 12 kanji-huffman-size reading-huffman-size) | |
239 | + | sense-huffman-size) | |
183 | 240 | ;; give some feedback on the size of file's structures | |
241 | + | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
184 | 242 | (format #t "results is ~a bytes long~%" (bytevector-length results-bv)) | |
185 | 243 | (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs)))) | |
186 | 244 | (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs)))) |
modules/nani/jmdict/xml.scm
20 | 20 | #:use-module (ice-9 match) | |
21 | 21 | #:use-module (ice-9 rdelim) | |
22 | 22 | #:use-module (sxml fold) | |
23 | + | #:use-module (nani frequency) | |
23 | 24 | #:use-module (sxml simple) | |
24 | 25 | #:use-module (nani result) | |
25 | 26 | #:use-module (nani jmdict entities) | |
… | |||
71 | 72 | ((? string? _) sense)) | |
72 | 73 | (cdr lst))))) | |
73 | 74 | ||
74 | - | (define (sxml->result lst) | |
75 | - | (let loop ((result (make-result 0 '() '() '())) (lst lst)) | |
76 | - | (if (null? lst) | |
77 | - | result | |
78 | - | (loop | |
79 | - | (match (car lst) | |
80 | - | (('kanji kanji) (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
81 | - | ((? reading? r) (update-result result #:readings (cons r (result-readings result)))) | |
82 | - | ((? sense? s) (update-result result #:senses (cons s (result-senses result)))) | |
83 | - | ((? string? _) result)) | |
84 | - | (cdr lst))))) | |
75 | + | (define (sxml->result lst frq) | |
76 | + | (let ((result | |
77 | + | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
78 | + | (if (null? lst) | |
79 | + | result | |
80 | + | (loop | |
81 | + | (match (car lst) | |
82 | + | (('kanji kanji) | |
83 | + | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
84 | + | ((? reading? r) | |
85 | + | (update-result result #:readings (cons r (result-readings result)))) | |
86 | + | ((? sense? s) | |
87 | + | (update-result result #:senses (cons s (result-senses result)))) | |
88 | + | ((? string? _) result)) | |
89 | + | (cdr lst)))))) | |
90 | + | (let* ((word (if (null? (result-kanjis result)) | |
91 | + | (car (reading-readings (car (result-readings result)))) | |
92 | + | (car (result-kanjis result)))) | |
93 | + | (entity (frequency-entity frq word)) | |
94 | + | (sense1 (car (result-senses result))) | |
95 | + | (sense1 (update-sense sense1 #:tags (cons entity (sense-tags sense1)))) | |
96 | + | (senses (cons sense1 (cdr (result-senses result))))) | |
97 | + | (update-result result #:points (get-points (update-result result #:senses senses)))))) | |
98 | + | ||
99 | + | (define (get-points result) | |
100 | + | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
101 | + | (apply + (map (lambda (tag) | |
102 | + | (cond | |
103 | + | ;; frequency | |
104 | + | ((eq? tag (get-entity "frq500")) | |
105 | + | 32) | |
106 | + | ((eq? tag (get-entity "frq1000")) | |
107 | + | 16) | |
108 | + | ((eq? tag (get-entity "frq2000")) | |
109 | + | 8) | |
110 | + | ((eq? tag (get-entity "frq5000")) | |
111 | + | 4) | |
112 | + | ((eq? tag (get-entity "frq10000")) | |
113 | + | 2) | |
114 | + | ((eq? tag (get-entity "frq20000")) | |
115 | + | 1) | |
116 | + | ;; archaic term | |
117 | + | ((eq? tag (get-entity "arch")) | |
118 | + | -3) | |
119 | + | ;; children language | |
120 | + | ((eq? tag (get-entity "chn")) | |
121 | + | -1) | |
122 | + | ;; colloquialism | |
123 | + | ((eq? tag (get-entity "col")) | |
124 | + | 5) | |
125 | + | ;; expression | |
126 | + | ((eq? tag (get-entity "exp")) | |
127 | + | 1) | |
128 | + | ;; sonkeigo (honorific or respectful) | |
129 | + | ((eq? tag (get-entity "hon")) | |
130 | + | -1) | |
131 | + | ;; kenjougo (humble) | |
132 | + | ((eq? tag (get-entity "hum")) | |
133 | + | -1) | |
134 | + | ;; teineigo (polite) | |
135 | + | ((eq? tag (get-entity "pol")) | |
136 | + | -1) | |
137 | + | ;; irregular kanji | |
138 | + | ((eq? tag (get-entity "iK")) | |
139 | + | -2) | |
140 | + | ;; idiomatic expression | |
141 | + | ((eq? tag (get-entity "id")) | |
142 | + | 3) | |
143 | + | ;; slang | |
144 | + | ((member tag (list | |
145 | + | (get-entity "sl") | |
146 | + | (get-entity "m-sl") | |
147 | + | (get-entity "male-sl"))) | |
148 | + | -4) | |
149 | + | ;; outdated kanji | |
150 | + | ((eq? tag (get-entity "oK")) | |
151 | + | -2) | |
152 | + | ;; obsolete term | |
153 | + | ((eq? tag (get-entity "obs")) | |
154 | + | -5) | |
155 | + | ;; obscure term | |
156 | + | ((eq? tag (get-entity "obsc")) | |
157 | + | -10) | |
158 | + | ;; onomatopea | |
159 | + | ((eq? tag (get-entity "on-mim")) | |
160 | + | 1) | |
161 | + | ;; poetical term | |
162 | + | ((eq? tag (get-entity "poet")) | |
163 | + | -1) | |
164 | + | ((eq? tag (get-entity "proverb")) | |
165 | + | 1) | |
166 | + | ((eq? tag (get-entity "rare")) | |
167 | + | -3) | |
168 | + | ;; sensitive | |
169 | + | ((eq? tag (get-entity "sens")) | |
170 | + | -2) | |
171 | + | ;; dialect | |
172 | + | ((member tag (list | |
173 | + | (get-entity "kyb") | |
174 | + | (get-entity "osb") | |
175 | + | (get-entity "ksb") | |
176 | + | (get-entity "ktb") | |
177 | + | (get-entity "tsb") | |
178 | + | (get-entity "thb") | |
179 | + | (get-entity "tsug") | |
180 | + | (get-entity "kyu") | |
181 | + | (get-entity "rkb") | |
182 | + | (get-entity "nab") | |
183 | + | (get-entity "hob"))) | |
184 | + | -1) | |
185 | + | (else 0))) | |
186 | + | tags)))) | |
85 | 187 | ||
86 | - | (define (sxml->results sxml) | |
87 | - | (foldt | |
88 | - | (lambda (xml) | |
89 | - | (match xml | |
90 | - | (('ent_seq _) "") | |
91 | - | (('ke_pri _) "") | |
92 | - | (('re_pri _) "") | |
93 | - | (('pri _) "") | |
94 | - | (('keb kanji) `(kanji ,kanji)) | |
95 | - | (('ke_inf _) "") | |
96 | - | (('k_ele lst ...) (car (filter list? lst))) | |
97 | - | (('reb reading) `(reading ,reading)) | |
98 | - | (('re_nokanji _ ...) "") | |
99 | - | (('re_restr r) `(limit ,r)) | |
100 | - | (('re_inf r) `(info ,r)) | |
101 | - | (('r_ele lst ...) (sxml->reading lst)) | |
102 | - | (('ant pos) `(ref ,pos)) | |
103 | - | (('dial pos) `(tag ,(get-entity pos))) | |
104 | - | (('field pos) `(tag ,(get-entity pos))) | |
105 | - | (('misc pos) `(tag ,(get-entity pos))) | |
106 | - | (('pos pos) `(tag ,(get-entity pos))) | |
107 | - | (('xref pos) `(ref ,pos)) | |
108 | - | (('g_type _ ...) "") | |
109 | - | (('gloss (? string? g)) xml) | |
110 | - | (('gloss (? list? g)) g) | |
111 | - | (('gloss attr g) (cons `(gloss ,g) attr)) | |
112 | - | (('stagk k) `(limit ,k)) | |
113 | - | (('stagr r) `(limit ,r)) | |
114 | - | (('s_inf r) `(info ,r)) | |
115 | - | (('lsource lst ...) (sxml->source lst)) | |
116 | - | (('sense lst ...) (sxml->sense lst)) | |
117 | - | (('entry lst ...) (sxml->result lst)) | |
118 | - | (('JMdict lst ...) lst) | |
119 | - | (('xml:lang l) `(lang ,l)) | |
120 | - | (('ls_wasei _) '(wasei #t)) | |
121 | - | (('ls_type t) `(type ,t)) | |
122 | - | (('@ lst ...) lst) | |
123 | - | (('*TOP* _ l) (filter result? l)) | |
124 | - | (('*PI* _ ...) #f))) | |
125 | - | (lambda (a) a) | |
126 | - | sxml)) | |
188 | + | (define (sxml->results sxml frq) | |
189 | + | (sort | |
190 | + | (foldt | |
191 | + | (lambda (xml) | |
192 | + | (match xml | |
193 | + | (('ent_seq _) "") | |
194 | + | (('ke_pri _) "") | |
195 | + | (('re_pri _) "") | |
196 | + | (('pri _) "") | |
197 | + | (('keb kanji) `(kanji ,kanji)) | |
198 | + | (('ke_inf _) "") | |
199 | + | (('k_ele lst ...) (car (filter list? lst))) | |
200 | + | (('reb reading) `(reading ,reading)) | |
201 | + | (('re_nokanji _ ...) "") | |
202 | + | (('re_restr r) `(limit ,r)) | |
203 | + | (('re_inf r) `(info ,r)) | |
204 | + | (('r_ele lst ...) (sxml->reading lst)) | |
205 | + | (('ant pos) `(ref ,pos)) | |
206 | + | (('dial pos) `(tag ,(get-entity pos))) | |
207 | + | (('field pos) `(tag ,(get-entity pos))) | |
208 | + | (('misc pos) `(tag ,(get-entity pos))) | |
209 | + | (('pos pos) `(tag ,(get-entity pos))) | |
210 | + | (('xref pos) `(ref ,pos)) | |
211 | + | (('g_type _ ...) "") | |
212 | + | (('gloss (? string? g)) xml) | |
213 | + | (('gloss (? list? g)) g) | |
214 | + | (('gloss attr g) (cons `(gloss ,g) attr)) | |
215 | + | (('stagk k) `(limit ,k)) | |
216 | + | (('stagr r) `(limit ,r)) | |
217 | + | (('s_inf r) `(info ,r)) | |
218 | + | (('lsource lst ...) (sxml->source lst)) | |
219 | + | (('sense lst ...) (sxml->sense lst)) | |
220 | + | (('entry lst ...) (sxml->result lst frq)) | |
221 | + | (('JMdict lst ...) lst) | |
222 | + | (('xml:lang l) `(lang ,l)) | |
223 | + | (('ls_wasei _) '(wasei #t)) | |
224 | + | (('ls_type t) `(type ,t)) | |
225 | + | (('@ lst ...) lst) | |
226 | + | (('*TOP* _ l) (filter result? l)) | |
227 | + | (('*PI* _ ...) #f))) | |
228 | + | (lambda (a) a) | |
229 | + | sxml) | |
230 | + | (lambda (a b) (< (result-points a) (result-points b))))) |
modules/nani/result.scm
22 | 22 | result? | |
23 | 23 | result-position | |
24 | 24 | result-position-set! | |
25 | + | result-points | |
25 | 26 | result-kanjis | |
26 | 27 | result-readings | |
27 | 28 | result-senses | |
… | |||
55 | 56 | update-source)) | |
56 | 57 | ||
57 | 58 | (define-record-type result | |
58 | - | (make-result position kanjis readings senses) | |
59 | + | (make-result position points kanjis readings senses) | |
59 | 60 | result? | |
60 | 61 | (position result-position result-position-set!) ; integer | |
62 | + | (points result-points) ; integer | |
61 | 63 | (kanjis result-kanjis) ; string-list | |
62 | 64 | (readings result-readings) ; reanding-list | |
63 | 65 | (senses result-senses)) ; sense-list | |
… | |||
89 | 91 | (lang source-lang)) ; string | |
90 | 92 | ||
91 | 93 | (define* (update-result result | |
92 | - | #:key (kanjis (result-kanjis result)) | |
94 | + | #:key (points (result-points result)) | |
95 | + | (kanjis (result-kanjis result)) | |
93 | 96 | (readings (result-readings result)) | |
94 | 97 | (senses (result-senses result))) | |
95 | - | (make-result (result-position result) kanjis readings senses)) | |
98 | + | (make-result (result-position result) points kanjis readings senses)) | |
96 | 99 | ||
97 | 100 | (define* (update-reading reading | |
98 | 101 | #:key (kanjis (reading-kanjis reading)) |
tools/jmdict.scm
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
1 | 19 | (use-modules (nani jmdict trie)) | |
2 | 20 | (use-modules (nani jmdict serialize)) | |
3 | 21 | (use-modules (nani jmdict xml)) | |
22 | + | (use-modules (nani frequency)) | |
4 | 23 | (use-modules (nani trie)) | |
5 | 24 | (use-modules (nani result)) | |
6 | 25 | (use-modules (ice-9 match)) | |
… | |||
13 | 32 | (write sxml port))))) | |
14 | 33 | ||
15 | 34 | (define (compile input sense-filter output) | |
16 | - | (let* ((sxml (if (equal? (substring input (- (string-length input) 3)) "xml") | |
35 | + | (let* ((frq (load-frequency "dictionaries/frequency.tsv")) | |
36 | + | (sxml (if (equal? (substring input (- (string-length input) 4)) ".xml") | |
17 | 37 | (load-dic input) | |
18 | - | (read input))) | |
19 | - | (results (sxml->results sxml)) | |
38 | + | (call-with-input-file input read))) | |
39 | + | (results (sxml->results sxml frq)) | |
20 | 40 | (results (map (lambda (result) | |
21 | 41 | (update-result | |
22 | 42 | result |