Better compression and sort results by relevance
Makefile
| 4 | 4 | ||
| 5 | 5 | DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS))) | |
| 6 | 6 | ||
| 7 | - | PAGES=blog.scm e404.scm feeds.scm index.scm mentions.scm | |
| 7 | + | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
| 8 | 8 | ||
| 9 | 9 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
| 10 | 10 | tools/i18n.scm tools/theme.scm | |
… | |||
| 14 | 14 | ||
| 15 | 15 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
| 16 | 16 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
| 17 | - | modules/nani/jmdict/entities.scm | |
| 17 | + | modules/nani/jmdict/entities.scm modules/nani/frequency.scm | |
| 18 | 18 | ||
| 19 | - | all: po/fr/LC_MESSAGES/nani.mo po/eo/LC_MESSAGES/nani.mo $(WEB_FILES) | |
| 19 | + | LANGS=fr | |
| 20 | + | ||
| 21 | + | all: $(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) $(WEB_FILES) | |
| 20 | 22 | haunt build | |
| 21 | 23 | rm -rf public.bak | |
| 22 | 24 | mv public public.bak | |
… | |||
| 25 | 27 | ||
| 26 | 28 | download: | |
| 27 | 29 | @rm -f dictionaries/* | |
| 28 | - | @$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml | |
| 30 | + | @$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml dictionaries/frequency.tsv | |
| 29 | 31 | ||
| 32 | + | # Download JMdict dictionaries from ERDRG | |
| 30 | 33 | dictionaries/%.xml: | |
| 31 | - | wget http://ftp.monash.edu/pub/nihongo/$(shell basename $<).gz -O $<.gz | |
| 32 | - | gunzip $<.gz | |
| 33 | - | sed -i -e 's|<|\&\<;|g' -e 's|>|\&\>;|g' $< | |
| 34 | - | sed -i -e 's|&\([^;]\+\);|\1|g' $< | |
| 35 | - | cp $< $@ | |
| 34 | + | dl_filename="$(shell echo "$@" | rev | cut -c5- | rev)"; \ | |
| 35 | + | wget http://ftp.monash.edu/pub/nihongo/"$$dl_filename".gz -O "$$dl_filename.gz"; \ | |
| 36 | + | gunzip "$$dl_filename.gz"; \ | |
| 37 | + | sed -i -e 's|<|\&\<;|g' -e 's|>|\&\>;|g' "$$dl_filename"; \ | |
| 38 | + | sed -i -e 's|&\([^;]\+\);|\1|g' "$$dl_filename"; \ | |
| 39 | + | cp "$$dl_filename" "$@" | |
| 40 | + | ||
| 41 | + | # Download frequency analysis run on Wikipedia in 2015 | |
| 42 | + | # https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Japanese2015_10000 | |
| 43 | + | dictionaries/frequency.tsv: | |
| 44 | + | wget --no-check-certificate \ | |
| 45 | + | https://namakajiri.net/data/wikipedia-20150422-lemmas.tsv -O $@.tmp | |
| 46 | + | sed -i 's| ||g' $@.tmp | |
| 47 | + | mv $@.tmp $@ | |
| 36 | 48 | ||
| 37 | 49 | dictionaries/%.sxml: dictionaries/%.xml tools/jmdict.scm | |
| 38 | 50 | guile -L modules tools/jmdict.scm convert $< nolang $@ | |
| 39 | 51 | ||
| 40 | - | dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm $(DICO_MODULES) | |
| 52 | + | dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
| 41 | 53 | guile -L modules tools/jmdict.scm build \ | |
| 42 | 54 | $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@ | |
| 43 | 55 | ||
| 44 | - | dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm $(DICO_MODULES) | |
| 56 | + | dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
| 45 | 57 | guile -L modules tools/jmdict.scm build $< e $@ | |
| 46 | 58 | ||
| 47 | 59 | po/%/LC_MESSAGES/nani.mo: po/%.po | |
modules/nani/frequency.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani frequency) | |
| 20 | + | #:use-module (ice-9 rdelim) | |
| 21 | + | #:export (load-frequency | |
| 22 | + | frequency-entity)) | |
| 23 | + | ||
| 24 | + | (define (load-frequency file) | |
| 25 | + | (call-with-input-file file | |
| 26 | + | (lambda (port) | |
| 27 | + | (let loop ((frq '())) | |
| 28 | + | (let* ((line (%read-line port)) | |
| 29 | + | (line (car line))) | |
| 30 | + | (if (eof-object? line) | |
| 31 | + | frq | |
| 32 | + | (let ((content (string-split line #\tab))) | |
| 33 | + | (loop (cons (cons (cadr (cdr content)) | |
| 34 | + | (string->number (car content))) | |
| 35 | + | frq))))))))) | |
| 36 | + | ||
| 37 | + | (define (frequency-entity frq word) | |
| 38 | + | (let ((freq (assoc-ref frq word))) | |
| 39 | + | (cond | |
| 40 | + | ((not freq) | |
| 41 | + | #f) | |
| 42 | + | ((< freq 501) | |
| 43 | + | "frq500") | |
| 44 | + | ((< freq 1001) | |
| 45 | + | "frq1000") | |
| 46 | + | ((< freq 2001) | |
| 47 | + | "frq2000") | |
| 48 | + | ((< freq 5001) | |
| 49 | + | "frq1000") | |
| 50 | + | ((< freq 10001) | |
| 51 | + | "frq10000") | |
| 52 | + | ((< freq 20001) | |
| 53 | + | "frq20000") | |
| 54 | + | (else #f)))) |
modules/nani/huffman.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani huffman) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (rnrs bytevectors) | |
| 22 | + | #:export (create-huffman | |
| 23 | + | huffman->code | |
| 24 | + | huffman-encode | |
| 25 | + | huffman-decode | |
| 26 | + | serialize-huffman)) | |
| 27 | + | ||
| 28 | + | (define (add-occurence occ char) | |
| 29 | + | (let* ((o (assoc-ref occ char)) | |
| 30 | + | (o (+ (if o o 0) 1))) | |
| 31 | + | (assoc-set! occ char o))) | |
| 32 | + | ||
| 33 | + | (define (add-occurences occ str) | |
| 34 | + | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
| 35 | + | (if (null? lst) | |
| 36 | + | occ | |
| 37 | + | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
| 38 | + | ||
| 39 | + | (define (get-occurences string-list) | |
| 40 | + | (let loop ((lst string-list) (occ '())) | |
| 41 | + | (if (null? lst) | |
| 42 | + | occ | |
| 43 | + | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
| 44 | + | ||
| 45 | + | (define (order tree) | |
| 46 | + | (if (null? tree) | |
| 47 | + | tree | |
| 48 | + | (add-in-order (car tree) (order (cdr tree))))) | |
| 49 | + | ||
| 50 | + | (define (add-in-order elem tree) | |
| 51 | + | (if (null? tree) | |
| 52 | + | (list elem) | |
| 53 | + | (let ((elem-pos (cdr elem)) | |
| 54 | + | (tree-pos (cdr (car tree)))) | |
| 55 | + | (if (> elem-pos tree-pos) | |
| 56 | + | (cons (car tree) (add-in-order elem (cdr tree))) | |
| 57 | + | (cons elem tree))))) | |
| 58 | + | ||
| 59 | + | (define (create-huffman string-list) | |
| 60 | + | (let ((total (apply + (map string-length string-list))) | |
| 61 | + | (occurences (get-occurences string-list))) | |
| 62 | + | (let loop ((tree (order occurences))) | |
| 63 | + | (if (equal? (length tree) 1) | |
| 64 | + | tree | |
| 65 | + | (let* ((e1 (car tree)) | |
| 66 | + | (tree (cdr tree)) | |
| 67 | + | (e2 (car tree)) | |
| 68 | + | (tree (cdr tree))) | |
| 69 | + | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
| 70 | + | ||
| 71 | + | (define (huffman->code huffman) | |
| 72 | + | (match huffman | |
| 73 | + | ('() '()) | |
| 74 | + | ((((h1 h2) . weight)) | |
| 75 | + | (append | |
| 76 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 77 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 78 | + | (((h1 h2) . weight) | |
| 79 | + | (append | |
| 80 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 81 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 82 | + | ((((? char? char) . weight)) | |
| 83 | + | (list (cons char '()))) | |
| 84 | + | (((? char? char) . weight) | |
| 85 | + | (list (cons char '()))))) | |
| 86 | + | ||
| 87 | + | (define (bits->bytes b) | |
| 88 | + | (match b | |
| 89 | + | ('() '()) | |
| 90 | + | ((a b c d e f g h bs ...) | |
| 91 | + | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
| 92 | + | (bits->bytes bs))) | |
| 93 | + | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
| 94 | + | ||
| 95 | + | (define (huffman-encode code str) | |
| 96 | + | (u8-list->bytevector | |
| 97 | + | (bits->bytes | |
| 98 | + | (apply append (map (lambda (c) (assoc-ref code c)) | |
| 99 | + | (append (string->list str) (list #\nul))))))) | |
| 100 | + | ||
| 101 | + | (define (byte->bits b) | |
| 102 | + | (let loop ((i 2) (b b) (by '())) | |
| 103 | + | (if (eq? i 512) | |
| 104 | + | by | |
| 105 | + | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
| 106 | + | ||
| 107 | + | (define (huffman-decode huffman bv) | |
| 108 | + | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
| 109 | + | (list->string | |
| 110 | + | (reverse | |
| 111 | + | (let loop ((seq seq) (result '()) (huff huffman)) | |
| 112 | + | (if (null? seq) | |
| 113 | + | result | |
| 114 | + | (match huff | |
| 115 | + | ((((h1 h2) . weight)) | |
| 116 | + | (if (= (car seq) 0) | |
| 117 | + | (loop (cdr seq) result h1) | |
| 118 | + | (loop (cdr seq) result h2))) | |
| 119 | + | (((h1 h2) . weight) | |
| 120 | + | (if (= (car seq) 0) | |
| 121 | + | (loop (cdr seq) result h1) | |
| 122 | + | (loop (cdr seq) result h2))) | |
| 123 | + | ((((? char? char) . weight)) | |
| 124 | + | (if (equal? char #\nul) | |
| 125 | + | result | |
| 126 | + | (loop seq (cons char result) huffman))) | |
| 127 | + | (((? char? char) . weight) | |
| 128 | + | (if (equal? char #\nul) | |
| 129 | + | result | |
| 130 | + | (loop seq (cons char result) huffman)))))))))) | |
| 131 | + | ||
| 132 | + | (define (serialize-huffman huffman) | |
| 133 | + | (define (serialize huffman) | |
| 134 | + | (match huffman | |
| 135 | + | ((((h1 h2) . weight)) | |
| 136 | + | (append '(1) (serialize h1) (serialize h2) '(2))) | |
| 137 | + | (((h1 h2) . weight) | |
| 138 | + | (append '(1) (serialize h1) (serialize h2) '(2))) | |
| 139 | + | ((((? char? char) . weight)) | |
| 140 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
| 141 | + | (((? char? char) . weight) | |
| 142 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
| 143 | + | (u8-list->bytevector (serialize huffman))) |
modules/nani/jmdict/entities.scm
| 174 | 174 | ("sumo" . 170) | |
| 175 | 175 | ("zool" . 171) | |
| 176 | 176 | ("joc" . 172) | |
| 177 | - | ("anat" . 173))) | |
| 177 | + | ("anat" . 173) | |
| 178 | + | ("frq500" . 174) | |
| 179 | + | ("frq1000" . 175) | |
| 180 | + | ("frq2000" . 176) | |
| 181 | + | ("frq5000" . 177) | |
| 182 | + | ("frq10000" . 178) | |
| 183 | + | ("frq20000" . 179))) | |
| 178 | 184 | ||
| 179 | 185 | (define (get-entity ent) | |
| 180 | 186 | (let ((val (assoc-ref entities ent))) |
modules/nani/jmdict/serialize.scm
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | 19 | (define-module (nani jmdict serialize) | |
| 20 | + | #:use-module (nani huffman) | |
| 20 | 21 | #:use-module (nani result) | |
| 21 | 22 | #:use-module (nani trie) | |
| 22 | 23 | #:use-module (rnrs bytevectors) | |
… | |||
| 46 | 47 | (loop pos (cdr trs) (append bvs bv)))))) | |
| 47 | 48 | (trie-position-set! trie pos) | |
| 48 | 49 | (let* ((vals-sz (list-size (trie-vals trie) int-size)) | |
| 49 | - | (trs-sz (list-size (trie-transitions trie) (const 5))) | |
| 50 | - | (sz (+ vals-sz trs-sz)) | |
| 50 | + | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
| 51 | + | (sz (+ vals-sz 1 trs-sz)) | |
| 51 | 52 | (bv (make-bytevector sz 0))) | |
| 52 | 53 | (serialize-list (map (lambda (pos) (result-position (array-ref results pos))) | |
| 53 | 54 | (trie-vals trie)) | |
… | |||
| 57 | 58 | (+ pos (bytevector-length bv)))) | |
| 58 | 59 | (next-pos (car bvs)) | |
| 59 | 60 | (bvs (cdr bvs))) | |
| 61 | + | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
| 60 | 62 | (serialize-list (trie-transitions trie) serialize-pointer | |
| 61 | - | vals-sz bv) | |
| 63 | + | (+ vals-sz 1) bv #:size? #f) | |
| 62 | 64 | (cons next-pos (cons bv bvs))))) | |
| 63 | 65 | ||
| 66 | + | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
| 67 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 68 | + | (when size? | |
| 69 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
| 70 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
| 71 | + | (if (null? lst) | |
| 72 | + | pos | |
| 73 | + | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
| 74 | + | (define* (list-size lst size #:key (size? #t)) | |
| 75 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 76 | + | (apply + (if size? 2 0) (map size lst))) | |
| 77 | + | ||
| 64 | 78 | (define (serialize-pointer ptr pos bv) | |
| 65 | 79 | (bytevector-u8-set! bv pos (car ptr)) | |
| 66 | 80 | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
… | |||
| 81 | 95 | (+ pos 1)) | |
| 82 | 96 | (define boolean-size (const 1)) | |
| 83 | 97 | ||
| 84 | - | (define (serialize-string str pos bv) | |
| 85 | - | (let ((sbv (string->utf8 str))) | |
| 86 | - | (bytevector-u32-set! bv pos (bytevector-length sbv) (endianness big)) | |
| 87 | - | (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv)) | |
| 88 | - | (+ pos 4 (bytevector-length sbv)))) | |
| 89 | - | (define (string-size str) | |
| 90 | - | (let ((sbv (string->utf8 str))) | |
| 91 | - | (+ 4 (bytevector-length sbv)))) | |
| 92 | - | ||
| 93 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
| 94 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 95 | - | (when size? | |
| 96 | - | (bytevector-u32-set! bv pos (length lst) (endianness big))) | |
| 97 | - | (let loop ((lst lst) (pos (+ pos (if size? 4 0)))) | |
| 98 | - | (if (null? lst) | |
| 99 | - | pos | |
| 100 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
| 101 | - | (define* (list-size lst size #:key (size? #t)) | |
| 102 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 103 | - | (apply + (if size? 4 0) (map size lst))) | |
| 104 | - | ||
| 105 | - | (define (serialize-source source pos bv) | |
| 106 | - | (when (not (source? source)) (throw 'not-source source)) | |
| 107 | - | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
| 108 | - | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
| 109 | - | (pos (serialize-string (source-type source) pos bv)) | |
| 110 | - | (pos (serialize-string (source-lang source) pos bv))) | |
| 111 | - | pos)) | |
| 112 | - | (define (source-size source) | |
| 113 | - | (when (not (source? source)) (throw 'not-source source)) | |
| 114 | - | (+ (list-size (source-content source) string-size) | |
| 115 | - | (boolean-size (source-wasei? source)) | |
| 116 | - | (string-size (source-type source)) | |
| 117 | - | (string-size (source-lang source)))) | |
| 98 | + | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
| 99 | + | (define kanji-huffman | |
| 100 | + | (let ((kanjis (apply append (map result-kanjis results)))) | |
| 101 | + | (create-huffman kanjis))) | |
| 102 | + | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
| 103 | + | (define reading-huffman | |
| 104 | + | (let* ((readings (apply append (map result-readings results))) | |
| 105 | + | (readings (apply append (map reading-readings readings)))) | |
| 106 | + | (create-huffman readings))) | |
| 107 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
| 108 | + | (define sense-huffman | |
| 109 | + | (let* ((senses (apply append (map result-senses results))) | |
| 110 | + | (glosses (apply append (map sense-glosses senses)))) | |
| 111 | + | (create-huffman glosses))) | |
| 112 | + | (define sense-huffman-code (huffman->code sense-huffman)) | |
| 118 | 113 | ||
| 119 | - | (define (serialize-reading reading pos bv) | |
| 120 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 121 | - | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
| 122 | - | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
| 123 | - | (pos (serialize-list (reading-readings reading) serialize-string pos bv))) | |
| 124 | - | pos)) | |
| 125 | - | (define (reading-size reading) | |
| 126 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 127 | - | (+ (list-size (reading-kanjis reading) string-size) | |
| 128 | - | (list-size (reading-info reading) string-size) | |
| 129 | - | (list-size (reading-readings reading) string-size))) | |
| 114 | + | (define (serialize-string str pos bv) | |
| 115 | + | (let ((sbv (string->utf8 str))) | |
| 116 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 117 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
| 118 | + | (+ pos 1 (bytevector-length sbv)))) | |
| 119 | + | (define (string-size str) | |
| 120 | + | (let ((sbv (string->utf8 str))) | |
| 121 | + | (+ 1 (bytevector-length sbv)))) | |
| 130 | 122 | ||
| 131 | - | (define (serialize-sense sense pos bv) | |
| 132 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 133 | - | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
| 134 | - | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
| 135 | - | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
| 136 | - | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
| 137 | - | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
| 138 | - | (pos (serialize-list (sense-glosses sense) serialize-string pos bv)) | |
| 139 | - | (pos (serialize-string (sense-language sense) pos bv))) | |
| 140 | - | pos)) | |
| 141 | - | (define (sense-size sense) | |
| 142 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 143 | - | (+ (list-size (sense-references sense) string-size) | |
| 144 | - | (list-size (sense-limits sense) string-size) | |
| 145 | - | (list-size (sense-infos sense) string-size) | |
| 146 | - | (list-size (sense-sources sense) source-size) | |
| 147 | - | (list-size (sense-tags sense) char-size) | |
| 148 | - | (list-size (sense-glosses sense) string-size) | |
| 149 | - | (string-size (sense-language sense)))) | |
| 123 | + | (define (serialize-huffman-string huffman-code) | |
| 124 | + | (lambda (str pos bv) | |
| 125 | + | (let ((sbv (huffman-encode huffman-code str))) | |
| 126 | + | (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv)) | |
| 127 | + | (+ pos (bytevector-length sbv))))) | |
| 128 | + | (define (huffman-string-size huffman-code) | |
| 129 | + | (lambda (str) | |
| 130 | + | (let ((sbv (huffman-encode huffman-code str))) | |
| 131 | + | (+ (bytevector-length sbv))))) | |
| 132 | + | ||
| 133 | + | (define (serialize-source source pos bv) | |
| 134 | + | (when (not (source? source)) (throw 'not-source source)) | |
| 135 | + | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
| 136 | + | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
| 137 | + | (pos (serialize-string (source-type source) pos bv)) | |
| 138 | + | (pos (serialize-string (source-lang source) pos bv))) | |
| 139 | + | pos)) | |
| 140 | + | (define (source-size source) | |
| 141 | + | (when (not (source? source)) (throw 'not-source source)) | |
| 142 | + | (+ (list-size (source-content source) string-size) | |
| 143 | + | (boolean-size (source-wasei? source)) | |
| 144 | + | (string-size (source-type source)) | |
| 145 | + | (string-size (source-lang source)))) | |
| 146 | + | ||
| 147 | + | (define (serialize-reading reading pos bv) | |
| 148 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 149 | + | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
| 150 | + | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
| 151 | + | (pos (serialize-list (reading-readings reading) | |
| 152 | + | (serialize-huffman-string reading-huffman-code) pos bv))) | |
| 153 | + | pos)) | |
| 154 | + | (define (reading-size reading) | |
| 155 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 156 | + | (+ (list-size (reading-kanjis reading) string-size) | |
| 157 | + | (list-size (reading-info reading) string-size) | |
| 158 | + | (list-size (reading-readings reading) (huffman-string-size reading-huffman-code)))) | |
| 159 | + | ||
| 160 | + | (define (serialize-sense sense pos bv) | |
| 161 | + | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 162 | + | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
| 163 | + | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
| 164 | + | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
| 165 | + | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
| 166 | + | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
| 167 | + | (pos (serialize-list (sense-glosses sense) | |
| 168 | + | (serialize-huffman-string sense-huffman-code) pos bv)) | |
| 169 | + | (pos (serialize-string (sense-language sense) pos bv))) | |
| 170 | + | pos)) | |
| 171 | + | (define (sense-size sense) | |
| 172 | + | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 173 | + | (+ (list-size (sense-references sense) string-size) | |
| 174 | + | (list-size (sense-limits sense) string-size) | |
| 175 | + | (list-size (sense-infos sense) string-size) | |
| 176 | + | (list-size (sense-sources sense) source-size) | |
| 177 | + | (list-size (sense-tags sense) char-size) | |
| 178 | + | (list-size (sense-glosses sense) | |
| 179 | + | (huffman-string-size sense-huffman-code)) | |
| 180 | + | (string-size (sense-language sense)))) | |
| 181 | + | ||
| 182 | + | (define (serialize-result result pos bv) | |
| 183 | + | (when (not (result? result)) (throw 'not-result result)) | |
| 184 | + | (result-position-set! result pos) | |
| 185 | + | (let* ((pos (serialize-list (result-kanjis result) | |
| 186 | + | (serialize-huffman-string kanji-huffman-code) pos bv)) | |
| 187 | + | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
| 188 | + | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
| 189 | + | pos)) | |
| 190 | + | (define (result-size result) | |
| 191 | + | (when (not (result? result)) (throw 'not-result result)) | |
| 192 | + | (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code)) | |
| 193 | + | (list-size (result-readings result) reading-size) | |
| 194 | + | (list-size (result-senses result) sense-size))) | |
| 150 | 195 | ||
| 151 | - | (define (serialize-result result pos bv) | |
| 152 | - | (when (not (result? result)) (throw 'not-result result)) | |
| 153 | - | (result-position-set! result pos) | |
| 154 | - | (let* ((pos (serialize-list (result-kanjis result) serialize-string pos bv)) | |
| 155 | - | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
| 156 | - | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
| 157 | - | pos)) | |
| 158 | - | (define (result-size result) | |
| 159 | - | (when (not (result? result)) (throw 'not-result result)) | |
| 160 | - | (+ (list-size (result-kanjis result) string-size) | |
| 161 | - | (list-size (result-readings result) reading-size) | |
| 162 | - | (list-size (result-senses result) sense-size))) | |
| 196 | + | (define (trie-node-size trie) | |
| 197 | + | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
| 163 | 198 | ||
| 164 | - | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
| 165 | 199 | (let* ((header (string->utf8 "NANI_JMDICT001")) | |
| 166 | 200 | (header-size (bytevector-length header)) | |
| 167 | 201 | (pointers (make-bytevector 12 0)) | |
| 202 | + | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
| 203 | + | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
| 204 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
| 205 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
| 206 | + | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
| 207 | + | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
| 168 | 208 | (results-size (list-size results result-size #:size? #f)) | |
| 169 | - | (results-bv (make-bytevector (+ header-size 12 results-size 0)))) | |
| 170 | - | (serialize-list results serialize-result (+ header-size 12) results-bv #:size? #f) | |
| 209 | + | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
| 210 | + | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
| 211 | + | reading-huffman-size sense-huffman-size | |
| 212 | + | results-size 0)))) | |
| 213 | + | (format #t "Number of nodes in kanjis: ~a~%" | |
| 214 | + | (trie-node-size kanji-trie)) | |
| 215 | + | (format #t "Number of nodes in readings: ~a~%" | |
| 216 | + | (trie-node-size reading-trie)) | |
| 217 | + | (format #t "Number of nodes in senses: ~a~%" | |
| 218 | + | (trie-node-size sense-trie)) | |
| 219 | + | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
| 220 | + | results-bv #:size? #f) | |
| 171 | 221 | (let* ((results (list->array 1 results)) | |
| 172 | 222 | (pos (bytevector-length results-bv)) | |
| 173 | 223 | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
… | |||
| 180 | 230 | (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big)) | |
| 181 | 231 | (bytevector-copy! header 0 results-bv 0 header-size) | |
| 182 | 232 | (bytevector-copy! pointers 0 results-bv header-size 12) | |
| 233 | + | (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size) | |
| 234 | + | (bytevector-copy! reading-huffman-bv 0 results-bv | |
| 235 | + | (+ header-size 12 kanji-huffman-size) | |
| 236 | + | reading-huffman-size) | |
| 237 | + | (bytevector-copy! sense-huffman-bv 0 results-bv | |
| 238 | + | (+ header-size 12 kanji-huffman-size reading-huffman-size) | |
| 239 | + | sense-huffman-size) | |
| 183 | 240 | ;; give some feedback on the size of file's structures | |
| 241 | + | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
| 184 | 242 | (format #t "results is ~a bytes long~%" (bytevector-length results-bv)) | |
| 185 | 243 | (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs)))) | |
| 186 | 244 | (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs)))) | |
modules/nani/jmdict/xml.scm
| 20 | 20 | #:use-module (ice-9 match) | |
| 21 | 21 | #:use-module (ice-9 rdelim) | |
| 22 | 22 | #:use-module (sxml fold) | |
| 23 | + | #:use-module (nani frequency) | |
| 23 | 24 | #:use-module (sxml simple) | |
| 24 | 25 | #:use-module (nani result) | |
| 25 | 26 | #:use-module (nani jmdict entities) | |
… | |||
| 71 | 72 | ((? string? _) sense)) | |
| 72 | 73 | (cdr lst))))) | |
| 73 | 74 | ||
| 74 | - | (define (sxml->result lst) | |
| 75 | - | (let loop ((result (make-result 0 '() '() '())) (lst lst)) | |
| 76 | - | (if (null? lst) | |
| 77 | - | result | |
| 78 | - | (loop | |
| 79 | - | (match (car lst) | |
| 80 | - | (('kanji kanji) (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
| 81 | - | ((? reading? r) (update-result result #:readings (cons r (result-readings result)))) | |
| 82 | - | ((? sense? s) (update-result result #:senses (cons s (result-senses result)))) | |
| 83 | - | ((? string? _) result)) | |
| 84 | - | (cdr lst))))) | |
| 75 | + | (define (sxml->result lst frq) | |
| 76 | + | (let ((result | |
| 77 | + | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
| 78 | + | (if (null? lst) | |
| 79 | + | result | |
| 80 | + | (loop | |
| 81 | + | (match (car lst) | |
| 82 | + | (('kanji kanji) | |
| 83 | + | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
| 84 | + | ((? reading? r) | |
| 85 | + | (update-result result #:readings (cons r (result-readings result)))) | |
| 86 | + | ((? sense? s) | |
| 87 | + | (update-result result #:senses (cons s (result-senses result)))) | |
| 88 | + | ((? string? _) result)) | |
| 89 | + | (cdr lst)))))) | |
| 90 | + | (let* ((word (if (null? (result-kanjis result)) | |
| 91 | + | (car (reading-readings (car (result-readings result)))) | |
| 92 | + | (car (result-kanjis result)))) | |
| 93 | + | (entity (frequency-entity frq word)) | |
| 94 | + | (sense1 (car (result-senses result))) | |
| 95 | + | (sense1 (update-sense sense1 #:tags (cons entity (sense-tags sense1)))) | |
| 96 | + | (senses (cons sense1 (cdr (result-senses result))))) | |
| 97 | + | (update-result result #:points (get-points (update-result result #:senses senses)))))) | |
| 98 | + | ||
| 99 | + | (define (get-points result) | |
| 100 | + | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
| 101 | + | (apply + (map (lambda (tag) | |
| 102 | + | (cond | |
| 103 | + | ;; frequency | |
| 104 | + | ((eq? tag (get-entity "frq500")) | |
| 105 | + | 32) | |
| 106 | + | ((eq? tag (get-entity "frq1000")) | |
| 107 | + | 16) | |
| 108 | + | ((eq? tag (get-entity "frq2000")) | |
| 109 | + | 8) | |
| 110 | + | ((eq? tag (get-entity "frq5000")) | |
| 111 | + | 4) | |
| 112 | + | ((eq? tag (get-entity "frq10000")) | |
| 113 | + | 2) | |
| 114 | + | ((eq? tag (get-entity "frq20000")) | |
| 115 | + | 1) | |
| 116 | + | ;; archaic term | |
| 117 | + | ((eq? tag (get-entity "arch")) | |
| 118 | + | -3) | |
| 119 | + | ;; children language | |
| 120 | + | ((eq? tag (get-entity "chn")) | |
| 121 | + | -1) | |
| 122 | + | ;; colloquialism | |
| 123 | + | ((eq? tag (get-entity "col")) | |
| 124 | + | 5) | |
| 125 | + | ;; expression | |
| 126 | + | ((eq? tag (get-entity "exp")) | |
| 127 | + | 1) | |
| 128 | + | ;; sonkeigo (honorific or respectful) | |
| 129 | + | ((eq? tag (get-entity "hon")) | |
| 130 | + | -1) | |
| 131 | + | ;; kenjougo (humble) | |
| 132 | + | ((eq? tag (get-entity "hum")) | |
| 133 | + | -1) | |
| 134 | + | ;; teineigo (polite) | |
| 135 | + | ((eq? tag (get-entity "pol")) | |
| 136 | + | -1) | |
| 137 | + | ;; irregular kanji | |
| 138 | + | ((eq? tag (get-entity "iK")) | |
| 139 | + | -2) | |
| 140 | + | ;; idiomatic expression | |
| 141 | + | ((eq? tag (get-entity "id")) | |
| 142 | + | 3) | |
| 143 | + | ;; slang | |
| 144 | + | ((member tag (list | |
| 145 | + | (get-entity "sl") | |
| 146 | + | (get-entity "m-sl") | |
| 147 | + | (get-entity "male-sl"))) | |
| 148 | + | -4) | |
| 149 | + | ;; outdated kanji | |
| 150 | + | ((eq? tag (get-entity "oK")) | |
| 151 | + | -2) | |
| 152 | + | ;; obsolete term | |
| 153 | + | ((eq? tag (get-entity "obs")) | |
| 154 | + | -5) | |
| 155 | + | ;; obscure term | |
| 156 | + | ((eq? tag (get-entity "obsc")) | |
| 157 | + | -10) | |
| 158 | + | ;; onomatopea | |
| 159 | + | ((eq? tag (get-entity "on-mim")) | |
| 160 | + | 1) | |
| 161 | + | ;; poetical term | |
| 162 | + | ((eq? tag (get-entity "poet")) | |
| 163 | + | -1) | |
| 164 | + | ((eq? tag (get-entity "proverb")) | |
| 165 | + | 1) | |
| 166 | + | ((eq? tag (get-entity "rare")) | |
| 167 | + | -3) | |
| 168 | + | ;; sensitive | |
| 169 | + | ((eq? tag (get-entity "sens")) | |
| 170 | + | -2) | |
| 171 | + | ;; dialect | |
| 172 | + | ((member tag (list | |
| 173 | + | (get-entity "kyb") | |
| 174 | + | (get-entity "osb") | |
| 175 | + | (get-entity "ksb") | |
| 176 | + | (get-entity "ktb") | |
| 177 | + | (get-entity "tsb") | |
| 178 | + | (get-entity "thb") | |
| 179 | + | (get-entity "tsug") | |
| 180 | + | (get-entity "kyu") | |
| 181 | + | (get-entity "rkb") | |
| 182 | + | (get-entity "nab") | |
| 183 | + | (get-entity "hob"))) | |
| 184 | + | -1) | |
| 185 | + | (else 0))) | |
| 186 | + | tags)))) | |
| 85 | 187 | ||
| 86 | - | (define (sxml->results sxml) | |
| 87 | - | (foldt | |
| 88 | - | (lambda (xml) | |
| 89 | - | (match xml | |
| 90 | - | (('ent_seq _) "") | |
| 91 | - | (('ke_pri _) "") | |
| 92 | - | (('re_pri _) "") | |
| 93 | - | (('pri _) "") | |
| 94 | - | (('keb kanji) `(kanji ,kanji)) | |
| 95 | - | (('ke_inf _) "") | |
| 96 | - | (('k_ele lst ...) (car (filter list? lst))) | |
| 97 | - | (('reb reading) `(reading ,reading)) | |
| 98 | - | (('re_nokanji _ ...) "") | |
| 99 | - | (('re_restr r) `(limit ,r)) | |
| 100 | - | (('re_inf r) `(info ,r)) | |
| 101 | - | (('r_ele lst ...) (sxml->reading lst)) | |
| 102 | - | (('ant pos) `(ref ,pos)) | |
| 103 | - | (('dial pos) `(tag ,(get-entity pos))) | |
| 104 | - | (('field pos) `(tag ,(get-entity pos))) | |
| 105 | - | (('misc pos) `(tag ,(get-entity pos))) | |
| 106 | - | (('pos pos) `(tag ,(get-entity pos))) | |
| 107 | - | (('xref pos) `(ref ,pos)) | |
| 108 | - | (('g_type _ ...) "") | |
| 109 | - | (('gloss (? string? g)) xml) | |
| 110 | - | (('gloss (? list? g)) g) | |
| 111 | - | (('gloss attr g) (cons `(gloss ,g) attr)) | |
| 112 | - | (('stagk k) `(limit ,k)) | |
| 113 | - | (('stagr r) `(limit ,r)) | |
| 114 | - | (('s_inf r) `(info ,r)) | |
| 115 | - | (('lsource lst ...) (sxml->source lst)) | |
| 116 | - | (('sense lst ...) (sxml->sense lst)) | |
| 117 | - | (('entry lst ...) (sxml->result lst)) | |
| 118 | - | (('JMdict lst ...) lst) | |
| 119 | - | (('xml:lang l) `(lang ,l)) | |
| 120 | - | (('ls_wasei _) '(wasei #t)) | |
| 121 | - | (('ls_type t) `(type ,t)) | |
| 122 | - | (('@ lst ...) lst) | |
| 123 | - | (('*TOP* _ l) (filter result? l)) | |
| 124 | - | (('*PI* _ ...) #f))) | |
| 125 | - | (lambda (a) a) | |
| 126 | - | sxml)) | |
| 188 | + | (define (sxml->results sxml frq) | |
| 189 | + | (sort | |
| 190 | + | (foldt | |
| 191 | + | (lambda (xml) | |
| 192 | + | (match xml | |
| 193 | + | (('ent_seq _) "") | |
| 194 | + | (('ke_pri _) "") | |
| 195 | + | (('re_pri _) "") | |
| 196 | + | (('pri _) "") | |
| 197 | + | (('keb kanji) `(kanji ,kanji)) | |
| 198 | + | (('ke_inf _) "") | |
| 199 | + | (('k_ele lst ...) (car (filter list? lst))) | |
| 200 | + | (('reb reading) `(reading ,reading)) | |
| 201 | + | (('re_nokanji _ ...) "") | |
| 202 | + | (('re_restr r) `(limit ,r)) | |
| 203 | + | (('re_inf r) `(info ,r)) | |
| 204 | + | (('r_ele lst ...) (sxml->reading lst)) | |
| 205 | + | (('ant pos) `(ref ,pos)) | |
| 206 | + | (('dial pos) `(tag ,(get-entity pos))) | |
| 207 | + | (('field pos) `(tag ,(get-entity pos))) | |
| 208 | + | (('misc pos) `(tag ,(get-entity pos))) | |
| 209 | + | (('pos pos) `(tag ,(get-entity pos))) | |
| 210 | + | (('xref pos) `(ref ,pos)) | |
| 211 | + | (('g_type _ ...) "") | |
| 212 | + | (('gloss (? string? g)) xml) | |
| 213 | + | (('gloss (? list? g)) g) | |
| 214 | + | (('gloss attr g) (cons `(gloss ,g) attr)) | |
| 215 | + | (('stagk k) `(limit ,k)) | |
| 216 | + | (('stagr r) `(limit ,r)) | |
| 217 | + | (('s_inf r) `(info ,r)) | |
| 218 | + | (('lsource lst ...) (sxml->source lst)) | |
| 219 | + | (('sense lst ...) (sxml->sense lst)) | |
| 220 | + | (('entry lst ...) (sxml->result lst frq)) | |
| 221 | + | (('JMdict lst ...) lst) | |
| 222 | + | (('xml:lang l) `(lang ,l)) | |
| 223 | + | (('ls_wasei _) '(wasei #t)) | |
| 224 | + | (('ls_type t) `(type ,t)) | |
| 225 | + | (('@ lst ...) lst) | |
| 226 | + | (('*TOP* _ l) (filter result? l)) | |
| 227 | + | (('*PI* _ ...) #f))) | |
| 228 | + | (lambda (a) a) | |
| 229 | + | sxml) | |
| 230 | + | (lambda (a b) (< (result-points a) (result-points b))))) | |
modules/nani/result.scm
| 22 | 22 | result? | |
| 23 | 23 | result-position | |
| 24 | 24 | result-position-set! | |
| 25 | + | result-points | |
| 25 | 26 | result-kanjis | |
| 26 | 27 | result-readings | |
| 27 | 28 | result-senses | |
… | |||
| 55 | 56 | update-source)) | |
| 56 | 57 | ||
| 57 | 58 | (define-record-type result | |
| 58 | - | (make-result position kanjis readings senses) | |
| 59 | + | (make-result position points kanjis readings senses) | |
| 59 | 60 | result? | |
| 60 | 61 | (position result-position result-position-set!) ; integer | |
| 62 | + | (points result-points) ; integer | |
| 61 | 63 | (kanjis result-kanjis) ; string-list | |
| 62 | 64 | (readings result-readings) ; reanding-list | |
| 63 | 65 | (senses result-senses)) ; sense-list | |
… | |||
| 89 | 91 | (lang source-lang)) ; string | |
| 90 | 92 | ||
| 91 | 93 | (define* (update-result result | |
| 92 | - | #:key (kanjis (result-kanjis result)) | |
| 94 | + | #:key (points (result-points result)) | |
| 95 | + | (kanjis (result-kanjis result)) | |
| 93 | 96 | (readings (result-readings result)) | |
| 94 | 97 | (senses (result-senses result))) | |
| 95 | - | (make-result (result-position result) kanjis readings senses)) | |
| 98 | + | (make-result (result-position result) points kanjis readings senses)) | |
| 96 | 99 | ||
| 97 | 100 | (define* (update-reading reading | |
| 98 | 101 | #:key (kanjis (reading-kanjis reading)) | |
tools/jmdict.scm
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 1 | 19 | (use-modules (nani jmdict trie)) | |
| 2 | 20 | (use-modules (nani jmdict serialize)) | |
| 3 | 21 | (use-modules (nani jmdict xml)) | |
| 22 | + | (use-modules (nani frequency)) | |
| 4 | 23 | (use-modules (nani trie)) | |
| 5 | 24 | (use-modules (nani result)) | |
| 6 | 25 | (use-modules (ice-9 match)) | |
… | |||
| 13 | 32 | (write sxml port))))) | |
| 14 | 33 | ||
| 15 | 34 | (define (compile input sense-filter output) | |
| 16 | - | (let* ((sxml (if (equal? (substring input (- (string-length input) 3)) "xml") | |
| 35 | + | (let* ((frq (load-frequency "dictionaries/frequency.tsv")) | |
| 36 | + | (sxml (if (equal? (substring input (- (string-length input) 4)) ".xml") | |
| 17 | 37 | (load-dic input) | |
| 18 | - | (read input))) | |
| 19 | - | (results (sxml->results sxml)) | |
| 38 | + | (call-with-input-file input read))) | |
| 39 | + | (results (sxml->results sxml frq)) | |
| 20 | 40 | (results (map (lambda (result) | |
| 21 | 41 | (update-result | |
| 22 | 42 | result | |