Rework dictionary generation
Makefile
| 1 | + | # keep po files, even if they are sometimes generated, and keep downloaded dictionaries around | |
| 1 | 2 | .PRECIOUS: po/%.po dictionaries/% | |
| 2 | 3 | ||
| 3 | 4 | all: site | |
| 4 | 5 | ||
| 6 | + | # To be filled by included files | |
| 7 | + | # DICOS is the list of generated dictionaries | |
| 8 | + | # DOWNLOADS is the list of downloaded files | |
| 5 | 9 | DICOS= | |
| 6 | 10 | DOWNLOADS= | |
| 7 | 11 | ||
| 8 | 12 | include radicals.mk | |
| 9 | 13 | include wadoku.mk | |
| 10 | 14 | include jmdict.mk | |
| 15 | + | include jibiki.mk | |
| 11 | 16 | ||
| 17 | + | # Files that constitute the website | |
| 12 | 18 | PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm | |
| 13 | - | ||
| 14 | 19 | HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \ | |
| 15 | 20 | tools/i18n.scm tools/theme.scm | |
| 16 | - | ||
| 21 | + | SHA_DICOS=$(addsuffix .sha256, $(DICOS)) | |
| 17 | 22 | WEB_FILES= $(HAUNT_FILES) \ | |
| 18 | - | $(shell find css) $(shell find images) $(DICOS) $(addsuffix .sha256, $(DICOS)) \ | |
| 23 | + | $(shell find css) $(shell find images) $(DICOS) $(SHA_DICOS) \ | |
| 19 | 24 | dicos/list | |
| 20 | 25 | ||
| 26 | + | # Guile modules used to build dictionaries | |
| 21 | 27 | DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \ | |
| 22 | 28 | modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \ | |
| 23 | 29 | modules/nani/jmdict/entities.scm modules/nani/frequency.scm \ | |
| 24 | 30 | modules/nani/serialize.scm | |
| 25 | 31 | ||
| 32 | + | # Available languages | |
| 26 | 33 | LANGS=fr uk | |
| 27 | 34 | MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) | |
| 28 | 35 | ||
| 36 | + | dicos: $(DICOS) $(SHA_DICOS) dicos/list | |
| 37 | + | ||
| 29 | 38 | site: $(MOFILES) $(WEB_FILES) | |
| 30 | 39 | haunt build | |
| 31 | 40 | rm -rf public.bak |
jibiki.mk unknown status 1
| 1 | + | DICOS+=dicos/jibiki_fre.nani | |
| 2 | + | DOWNLOADS+=dictionaries/jibiki.xml | |
| 3 | + | ||
| 4 | + | dictionaries/jibiki.xml: | |
| 5 | + | wget https://jibiki.fr/data/Jibiki.fr/jibiki.fr_jpn_fra.xml.gz -O $@.gz | |
| 6 | + | gunzip $@.gz | |
| 7 | + | ||
| 8 | + | dicos/jibiki_fre.nani: dictionaries/jibiki.xml tools/jibiki.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
| 9 | + | guile -L modules tools/jibiki.scm build $< $@ |
jmdict.mk
| 22 | 22 | ||
| 23 | 23 | dicos/JMdict_%.nani: dictionaries/JMdict.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
| 24 | 24 | guile -L modules tools/jmdict.scm build \ | |
| 25 | - | $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@ | |
| 25 | + | $< $(shell basename $@ .nani | sed 's|^JMdict_||g') $@ | |
| 26 | 26 | ||
| 27 | 27 | dicos/JMdict_e.nani: dictionaries/JMdict_e.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES) | |
| 28 | 28 | guile -L modules tools/jmdict.scm build $< e $@ |
modules/nani/encoding/huffman.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani encoding huffman) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (rnrs bytevectors) | |
| 22 | + | #:export (create-huffman | |
| 23 | + | huffman->code | |
| 24 | + | huffman-encode | |
| 25 | + | huffman-decode | |
| 26 | + | serialize-huffman | |
| 27 | + | serialize-huffman-string huffman-string-size)) | |
| 28 | + | ||
| 29 | + | (define (add-occurence occ char) | |
| 30 | + | (let* ((o (assoc-ref occ char)) | |
| 31 | + | (o (+ (if o o 0) 1))) | |
| 32 | + | (assoc-set! occ char o))) | |
| 33 | + | ||
| 34 | + | (define (add-occurences occ str) | |
| 35 | + | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
| 36 | + | (if (null? lst) | |
| 37 | + | occ | |
| 38 | + | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
| 39 | + | ||
| 40 | + | (define (get-occurences string-list) | |
| 41 | + | (let loop ((lst string-list) (occ '())) | |
| 42 | + | (if (null? lst) | |
| 43 | + | occ | |
| 44 | + | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
| 45 | + | ||
| 46 | + | (define (order tree) | |
| 47 | + | (if (null? tree) | |
| 48 | + | tree | |
| 49 | + | (add-in-order (car tree) (order (cdr tree))))) | |
| 50 | + | ||
| 51 | + | (define (add-in-order elem tree) | |
| 52 | + | (if (null? tree) | |
| 53 | + | (list elem) | |
| 54 | + | (let ((elem-pos (cdr elem)) | |
| 55 | + | (tree-pos (cdr (car tree)))) | |
| 56 | + | (if (> elem-pos tree-pos) | |
| 57 | + | (cons (car tree) (add-in-order elem (cdr tree))) | |
| 58 | + | (cons elem tree))))) | |
| 59 | + | ||
| 60 | + | (define (create-huffman string-list) | |
| 61 | + | (let ((total (apply + (map string-length string-list))) | |
| 62 | + | (occurences (get-occurences string-list))) | |
| 63 | + | (let loop ((tree (order occurences))) | |
| 64 | + | (if (equal? (length tree) 1) | |
| 65 | + | tree | |
| 66 | + | (let* ((e1 (car tree)) | |
| 67 | + | (tree (cdr tree)) | |
| 68 | + | (e2 (car tree)) | |
| 69 | + | (tree (cdr tree))) | |
| 70 | + | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
| 71 | + | ||
| 72 | + | (define (huffman->code huffman) | |
| 73 | + | (match huffman | |
| 74 | + | ('() '()) | |
| 75 | + | ((((h1 h2) . weight)) | |
| 76 | + | (append | |
| 77 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 78 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 79 | + | (((h1 h2) . weight) | |
| 80 | + | (append | |
| 81 | + | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 82 | + | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 83 | + | ((((? char? char) . weight)) | |
| 84 | + | (list (cons char '()))) | |
| 85 | + | (((? char? char) . weight) | |
| 86 | + | (list (cons char '()))))) | |
| 87 | + | ||
| 88 | + | (define (bits->bytes b) | |
| 89 | + | (match b | |
| 90 | + | ('() '()) | |
| 91 | + | ((a b c d e f g h bs ...) | |
| 92 | + | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
| 93 | + | (bits->bytes bs))) | |
| 94 | + | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
| 95 | + | ||
| 96 | + | (define (huffman-encode code str) | |
| 97 | + | (u8-list->bytevector | |
| 98 | + | (bits->bytes | |
| 99 | + | (apply append (map (lambda (c) (or (assoc-ref code c) (throw "not in huffman code" c))) | |
| 100 | + | (append (string->list str) (list #\nul))))))) | |
| 101 | + | ||
| 102 | + | (define (byte->bits b) | |
| 103 | + | (let loop ((i 2) (b b) (by '())) | |
| 104 | + | (if (eq? i 512) | |
| 105 | + | by | |
| 106 | + | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
| 107 | + | ||
| 108 | + | (define (huffman-decode huffman bv) | |
| 109 | + | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
| 110 | + | (list->string | |
| 111 | + | (reverse | |
| 112 | + | (let loop ((seq seq) (result '()) (huff huffman)) | |
| 113 | + | (if (null? seq) | |
| 114 | + | result | |
| 115 | + | (match huff | |
| 116 | + | ((((h1 h2) . weight)) | |
| 117 | + | (if (= (car seq) 0) | |
| 118 | + | (loop (cdr seq) result h1) | |
| 119 | + | (loop (cdr seq) result h2))) | |
| 120 | + | (((h1 h2) . weight) | |
| 121 | + | (if (= (car seq) 0) | |
| 122 | + | (loop (cdr seq) result h1) | |
| 123 | + | (loop (cdr seq) result h2))) | |
| 124 | + | ((((? char? char) . weight)) | |
| 125 | + | (if (equal? char #\nul) | |
| 126 | + | result | |
| 127 | + | (loop seq (cons char result) huffman))) | |
| 128 | + | (((? char? char) . weight) | |
| 129 | + | (if (equal? char #\nul) | |
| 130 | + | result | |
| 131 | + | (loop seq (cons char result) huffman)))))))))) | |
| 132 | + | ||
| 133 | + | (define (serialize-huffman huffman) | |
| 134 | + | (define (serialize huffman) | |
| 135 | + | (match huffman | |
| 136 | + | ((((h1 h2) . weight)) | |
| 137 | + | (append '(1) (serialize h1) (serialize h2))) | |
| 138 | + | (((h1 h2) . weight) | |
| 139 | + | (append '(1) (serialize h1) (serialize h2))) | |
| 140 | + | ((((? char? char) . weight)) | |
| 141 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
| 142 | + | (((? char? char) . weight) | |
| 143 | + | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
| 144 | + | (u8-list->bytevector (serialize huffman))) | |
| 145 | + | ||
| 146 | + | (define (serialize-huffman-string huffman-code) | |
| 147 | + | (lambda (str pos bv) | |
| 148 | + | (let ((sbv (huffman-encode huffman-code str))) | |
| 149 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 150 | + | (+ pos (bytevector-length sbv))))) | |
| 151 | + | ||
| 152 | + | (define (huffman-string-size huffman-code) | |
| 153 | + | (lambda (str) | |
| 154 | + | (let ((sbv (huffman-encode huffman-code str))) | |
| 155 | + | (+ (bytevector-length sbv))))) |
modules/nani/encoding/parse.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani encoding parse) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (rnrs bytevectors) | |
| 22 | + | #:export (parse-list | |
| 23 | + | parse-char | |
| 24 | + | parse-int | |
| 25 | + | parse-boolean | |
| 26 | + | parse-string)) | |
| 27 | + | ||
| 28 | + | (define* (parse-list parse-element #:key (size #f)) | |
| 29 | + | (lambda (port) | |
| 30 | + | (let ((size (or size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big))))) | |
| 31 | + | (let loop ((result '()) (remaining size)) | |
| 32 | + | (if (= remaining 0) | |
| 33 | + | (reverse result) | |
| 34 | + | (loop (cons (parse-element port) result) (- remaining 1))))))) | |
| 35 | + | ||
| 36 | + | (define (parse-char port) | |
| 37 | + | (get-u8 port)) | |
| 38 | + | ||
| 39 | + | (define (parse-int port) | |
| 40 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))) | |
| 41 | + | ||
| 42 | + | (define (parse-boolean port) | |
| 43 | + | (= (get-u8 port) 1)) | |
| 44 | + | ||
| 45 | + | (define (parse-string port) | |
| 46 | + | (define (get-result-string port) | |
| 47 | + | (let loop ((lu8 '()) (char (get-u8 port))) | |
| 48 | + | (if (= char 0) | |
| 49 | + | lu8 | |
| 50 | + | (loop (cons char lu8) (get-u8 port))))) | |
| 51 | + | (utf8->string (u8-list->bytevector (reverse (get-result-string port))))) |
modules/nani/encoding/serialize.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani encoding serialize) | |
| 20 | + | #:use-module (rnrs bytevectors) | |
| 21 | + | #:export (merge-bvs | |
| 22 | + | serialize-list list-size | |
| 23 | + | serialize-char char-size | |
| 24 | + | serialize-u16 u16-size | |
| 25 | + | serialize-int int-size | |
| 26 | + | serialize-boolean boolean-size | |
| 27 | + | serialize-string string-size)) | |
| 28 | + | ||
| 29 | + | ;;; merge-bvs bvs | |
| 30 | + | ;;; Returns a bytevector that contains the data from `bvs`, a list of bytevectors. For instance, | |
| 31 | + | ;;; (merge-bvs '(#u8(1 2) #u8(3 4))) gives #u8(1 2 3 4). | |
| 32 | + | ;;; | |
| 33 | + | ;;; Serializers and Sizers | |
| 34 | + | ;;; ---------------------- | |
| 35 | + | ;;; | |
| 36 | + | ;;; A serializer is a procedure that takes a value to serialize, a position at which to serialize | |
| 37 | + | ;;; the value, and a bytevector in which to serialize: `serialize val pos bv`. | |
| 38 | + | ;;; | |
| 39 | + | ;;; A sizer is a procedure that takes a value to serialize, and returns the expected size it will | |
| 40 | + | ;;; take in the serialization. | |
| 41 | + | ;;; | |
| 42 | + | ;;; Common Serialization Functions | |
| 43 | + | ;;; ------------------------------ | |
| 44 | + | ;;; | |
| 45 | + | ;;; serialize-list serialize-element [#:size? #t] | |
| 46 | + | ;;; Returns a serializer for a list of elements. `serialize-element` is the serializer function for | |
| 47 | + | ;;; an element. When `#:size` is `#t`, add the size of the list to the serialization, as a 16-bits | |
| 48 | + | ;;; unsigned integer. | |
| 49 | + | ;;; | |
| 50 | + | ;;; list-size element-size [#:size? #t] | |
| 51 | + | ;;; Returns a sizer for lists, where elements are sized by `element-size`. | |
| 52 | + | ||
| 53 | + | (define (merge-bvs bvs) | |
| 54 | + | (let* ((size (apply + (map bytevector-length bvs))) | |
| 55 | + | (bv (make-bytevector size 0))) | |
| 56 | + | (let loop ((bvs bvs) (pos 0)) | |
| 57 | + | (unless (null? bvs) | |
| 58 | + | (let ((sz (bytevector-length (car bvs)))) | |
| 59 | + | (bytevector-copy! (car bvs) 0 bv pos sz) | |
| 60 | + | (loop (cdr bvs) (+ pos sz))))) | |
| 61 | + | bv)) | |
| 62 | + | ||
| 63 | + | (define* (serialize-list serialize-element #:key (size? #t)) | |
| 64 | + | (lambda (lst pos bv) | |
| 65 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 66 | + | (when size? | |
| 67 | + | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
| 68 | + | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
| 69 | + | (if (null? lst) | |
| 70 | + | pos | |
| 71 | + | (loop (cdr lst) (serialize-element (car lst) pos bv)))))) | |
| 72 | + | (define* (list-size element-size #:key (size? #t)) | |
| 73 | + | (lambda (lst) | |
| 74 | + | (when (not (list? lst)) (throw 'not-list lst)) | |
| 75 | + | (apply + (if size? 2 0) (map element-size lst)))) | |
| 76 | + | ||
| 77 | + | (define (serialize-char int pos bv) | |
| 78 | + | (bytevector-u8-set! bv pos int) | |
| 79 | + | (+ pos 1)) | |
| 80 | + | (define char-size (const 1)) | |
| 81 | + | ||
| 82 | + | (define (serialize-u16 int pos bv) | |
| 83 | + | (bytevector-u16-set! bv pos int (endianness big)) | |
| 84 | + | (+ pos 2)) | |
| 85 | + | (define u16-size (const 2)) | |
| 86 | + | ||
| 87 | + | (define (serialize-int int pos bv) | |
| 88 | + | (bytevector-u32-set! bv pos int (endianness big)) | |
| 89 | + | (+ pos 4)) | |
| 90 | + | (define int-size (const 4)) | |
| 91 | + | ||
| 92 | + | (define (serialize-boolean bool pos bv) | |
| 93 | + | (bytevector-u8-set! bv pos (if bool 1 0)) | |
| 94 | + | (+ pos 1)) | |
| 95 | + | (define boolean-size (const 1)) | |
| 96 | + | ||
| 97 | + | (define (serialize-string str pos bv) | |
| 98 | + | (let ((sbv (string->utf8 str))) | |
| 99 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 100 | + | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
| 101 | + | (+ pos 1 (bytevector-length sbv)))) | |
| 102 | + | (define (string-size str) | |
| 103 | + | (let ((sbv (string->utf8 str))) | |
| 104 | + | (+ 1 (bytevector-length sbv)))) |
modules/nani/encoding/trie.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani encoding trie) | |
| 20 | + | #:use-module (nani encoding serialize) | |
| 21 | + | #:use-module (rnrs bytevectors) | |
| 22 | + | #:use-module (srfi srfi-9) | |
| 23 | + | #:export (make-trie | |
| 24 | + | trie? | |
| 25 | + | trie-position | |
| 26 | + | trie-position-set! | |
| 27 | + | trie-vals | |
| 28 | + | trie-vals-set! | |
| 29 | + | trie-transitions | |
| 30 | + | trie-transitions-set! | |
| 31 | + | ||
| 32 | + | make-empty-trie | |
| 33 | + | add-to-trie! | |
| 34 | + | compress-trie | |
| 35 | + | ||
| 36 | + | serialize-trie | |
| 37 | + | serialize-trie-single | |
| 38 | + | trie-size | |
| 39 | + | trie-size-single)) | |
| 40 | + | ||
| 41 | + | (define-record-type trie | |
| 42 | + | (make-trie position vals transitions) | |
| 43 | + | trie? | |
| 44 | + | (position trie-position trie-position-set!) ; integer | |
| 45 | + | (vals trie-vals trie-vals-set!) ; list | |
| 46 | + | (transitions trie-transitions trie-transitions-set!)) ; array or alist | |
| 47 | + | ||
| 48 | + | (define (make-empty-trie) | |
| 49 | + | (make-trie 0 '() (make-array #f 16))) | |
| 50 | + | ||
| 51 | + | (define (add-to-trie! trie key value) | |
| 52 | + | (if (null? key) | |
| 53 | + | (trie-vals-set! trie (cons value (trie-vals trie))) | |
| 54 | + | (let ((next-trie (array-ref (trie-transitions trie) (car key)))) | |
| 55 | + | (if next-trie | |
| 56 | + | (add-to-trie! next-trie (cdr key) value) | |
| 57 | + | (let ((next-trie (make-empty-trie))) | |
| 58 | + | (array-set! (trie-transitions trie) next-trie (car key)) | |
| 59 | + | (add-to-trie! next-trie (cdr key) value)))))) | |
| 60 | + | ||
| 61 | + | (define (convert-trie-transitions! trie) | |
| 62 | + | (define (get-new-transitions transitions) | |
| 63 | + | (let loop ((i 0) (tr '())) | |
| 64 | + | (if (= i 16) | |
| 65 | + | tr | |
| 66 | + | (let ((elem (array-ref transitions i))) | |
| 67 | + | (if elem | |
| 68 | + | (begin | |
| 69 | + | (convert-trie-transitions! elem) | |
| 70 | + | (loop (+ i 1) (cons (cons i elem) tr))) | |
| 71 | + | (loop (+ i 1) tr)))))) | |
| 72 | + | (let* ((transitions (trie-transitions trie)) | |
| 73 | + | (transitions (get-new-transitions transitions))) | |
| 74 | + | (trie-transitions-set! trie transitions))) | |
| 75 | + | ||
| 76 | + | (define (compress-trie trie) | |
| 77 | + | (define (compress-aux trie) | |
| 78 | + | (make-trie | |
| 79 | + | (trie-position trie) | |
| 80 | + | (trie-vals trie) | |
| 81 | + | (apply append | |
| 82 | + | (map | |
| 83 | + | (lambda (tr) | |
| 84 | + | (let ((trie (cdr tr))) | |
| 85 | + | (map | |
| 86 | + | (lambda (tr2) | |
| 87 | + | (cons (+ (car tr2) (* 16 (car tr))) | |
| 88 | + | (compress-aux (cdr tr2)))) | |
| 89 | + | (trie-transitions trie)))) | |
| 90 | + | (trie-transitions trie))))) | |
| 91 | + | (convert-trie-transitions! trie) | |
| 92 | + | (compress-aux trie)) | |
| 93 | + | ||
| 94 | + | (define (pointer-size ptr) | |
| 95 | + | 5) | |
| 96 | + | ||
| 97 | + | (define (serialize-pointer ptr pos bv) | |
| 98 | + | (bytevector-u8-set! bv pos (car ptr)) | |
| 99 | + | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
| 100 | + | (+ pos 5)) | |
| 101 | + | ||
| 102 | + | (define (serialize-trie-single serialize-value value-size) | |
| 103 | + | (define (serialize-trie-aux transitions pos) | |
| 104 | + | (let loop ((pos pos) | |
| 105 | + | (trs transitions) | |
| 106 | + | (bvs '())) | |
| 107 | + | (if (null? trs) | |
| 108 | + | (cons pos bvs) | |
| 109 | + | (let* ((next-trie (cdr (car trs))) | |
| 110 | + | (bv (get-trie-bv next-trie pos)) | |
| 111 | + | (pos (car bv)) | |
| 112 | + | (bv (cdr bv))) | |
| 113 | + | (loop pos (cdr trs) (append bvs bv)))))) | |
| 114 | + | ||
| 115 | + | (define (get-trie-bv trie pos) | |
| 116 | + | (trie-position-set! trie pos) | |
| 117 | + | (let* ((vals-sz (value-size (trie-vals trie))) | |
| 118 | + | (trs-sz ((list-size (const 5) #:size? #f) (trie-transitions trie))) | |
| 119 | + | (sz (+ vals-sz 1 trs-sz)) | |
| 120 | + | (bv (make-bytevector sz 0))) | |
| 121 | + | (serialize-value (trie-vals trie) 0 bv) | |
| 122 | + | (let* ((bvs (serialize-trie-aux | |
| 123 | + | (trie-transitions trie) | |
| 124 | + | (+ pos sz))) | |
| 125 | + | (next-pos (car bvs)) | |
| 126 | + | (bvs (cdr bvs))) | |
| 127 | + | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
| 128 | + | ((serialize-list serialize-pointer #:size? #f) | |
| 129 | + | (trie-transitions trie) (+ vals-sz 1) bv) | |
| 130 | + | (cons next-pos (cons bv bvs))))) | |
| 131 | + | ||
| 132 | + | (lambda (trie pos bv) | |
| 133 | + | (let* ((trie-bv (get-trie-bv trie pos)) | |
| 134 | + | (new-pos (car trie-bv)) | |
| 135 | + | (trie-bv (merge-bvs (cdr trie-bv)))) | |
| 136 | + | (bytevector-copy! trie-bv 0 bv pos (bytevector-length trie-bv)) | |
| 137 | + | new-pos))) | |
| 138 | + | ||
| 139 | + | (define (serialize-trie serialize-value value-size) | |
| 140 | + | (serialize-trie-single (serialize-list serialize-value) (list-size value-size))) | |
| 141 | + | ||
| 142 | + | (define (trie-size-single value-size) | |
| 143 | + | (lambda (trie) | |
| 144 | + | (apply + | |
| 145 | + | (value-size (trie-vals trie)) | |
| 146 | + | 1 | |
| 147 | + | ((list-size pointer-size #:size? #f) (trie-transitions trie)) | |
| 148 | + | (map (lambda (trie) ((trie-size-single value-size) trie)) | |
| 149 | + | (map cdr (trie-transitions trie)))))) | |
| 150 | + | ||
| 151 | + | (define (trie-size value-size) | |
| 152 | + | (trie-size-single (list-size value-size))) |
modules/nani/frequency.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani frequency) | |
| 20 | - | #:use-module (ice-9 rdelim) | |
| 21 | - | #:export (load-frequency | |
| 22 | - | frequency-entity)) | |
| 23 | - | ||
| 24 | - | (define (load-frequency file) | |
| 25 | - | (call-with-input-file file | |
| 26 | - | (lambda (port) | |
| 27 | - | (let loop ((frq '()) (i 1)) | |
| 28 | - | (let* ((line (%read-line port)) | |
| 29 | - | (line (car line))) | |
| 30 | - | (if (eof-object? line) | |
| 31 | - | frq | |
| 32 | - | (let ((content (string-split line #\tab))) | |
| 33 | - | (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1))))))))) | |
| 34 | - | ||
| 35 | - | (define (frequency-entity frq word) | |
| 36 | - | (let ((freq (assoc-ref frq word))) | |
| 37 | - | (cond | |
| 38 | - | ((not freq) | |
| 39 | - | #f) | |
| 40 | - | ((< freq 501) | |
| 41 | - | "frq500") | |
| 42 | - | ((< freq 1001) | |
| 43 | - | "frq1000") | |
| 44 | - | ((< freq 2001) | |
| 45 | - | "frq2000") | |
| 46 | - | ((< freq 5001) | |
| 47 | - | "frq1000") | |
| 48 | - | ((< freq 10001) | |
| 49 | - | "frq10000") | |
| 50 | - | ((< freq 20001) | |
| 51 | - | "frq20000") | |
| 52 | - | (else #f)))) |
modules/nani/huffman.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani huffman) | |
| 20 | - | #:use-module (ice-9 match) | |
| 21 | - | #:use-module (rnrs bytevectors) | |
| 22 | - | #:export (create-huffman | |
| 23 | - | huffman->code | |
| 24 | - | huffman-encode | |
| 25 | - | huffman-decode | |
| 26 | - | serialize-huffman | |
| 27 | - | serialize-huffman-string huffman-string-size)) | |
| 28 | - | ||
| 29 | - | (define (add-occurence occ char) | |
| 30 | - | (let* ((o (assoc-ref occ char)) | |
| 31 | - | (o (+ (if o o 0) 1))) | |
| 32 | - | (assoc-set! occ char o))) | |
| 33 | - | ||
| 34 | - | (define (add-occurences occ str) | |
| 35 | - | (let loop ((lst (append (string->list str) (list #\nul))) (occ occ)) | |
| 36 | - | (if (null? lst) | |
| 37 | - | occ | |
| 38 | - | (loop (cdr lst) (add-occurence occ (car lst)))))) | |
| 39 | - | ||
| 40 | - | (define (get-occurences string-list) | |
| 41 | - | (let loop ((lst string-list) (occ '())) | |
| 42 | - | (if (null? lst) | |
| 43 | - | occ | |
| 44 | - | (loop (cdr lst) (add-occurences occ (car lst)))))) | |
| 45 | - | ||
| 46 | - | (define (order tree) | |
| 47 | - | (if (null? tree) | |
| 48 | - | tree | |
| 49 | - | (add-in-order (car tree) (order (cdr tree))))) | |
| 50 | - | ||
| 51 | - | (define (add-in-order elem tree) | |
| 52 | - | (if (null? tree) | |
| 53 | - | (list elem) | |
| 54 | - | (let ((elem-pos (cdr elem)) | |
| 55 | - | (tree-pos (cdr (car tree)))) | |
| 56 | - | (if (> elem-pos tree-pos) | |
| 57 | - | (cons (car tree) (add-in-order elem (cdr tree))) | |
| 58 | - | (cons elem tree))))) | |
| 59 | - | ||
| 60 | - | (define (create-huffman string-list) | |
| 61 | - | (let ((total (apply + (map string-length string-list))) | |
| 62 | - | (occurences (get-occurences string-list))) | |
| 63 | - | (let loop ((tree (order occurences))) | |
| 64 | - | (if (equal? (length tree) 1) | |
| 65 | - | tree | |
| 66 | - | (let* ((e1 (car tree)) | |
| 67 | - | (tree (cdr tree)) | |
| 68 | - | (e2 (car tree)) | |
| 69 | - | (tree (cdr tree))) | |
| 70 | - | (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree))))))) | |
| 71 | - | ||
| 72 | - | (define (huffman->code huffman) | |
| 73 | - | (match huffman | |
| 74 | - | ('() '()) | |
| 75 | - | ((((h1 h2) . weight)) | |
| 76 | - | (append | |
| 77 | - | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 78 | - | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 79 | - | (((h1 h2) . weight) | |
| 80 | - | (append | |
| 81 | - | (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1)) | |
| 82 | - | (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2)))) | |
| 83 | - | ((((? char? char) . weight)) | |
| 84 | - | (list (cons char '()))) | |
| 85 | - | (((? char? char) . weight) | |
| 86 | - | (list (cons char '()))))) | |
| 87 | - | ||
| 88 | - | (define (bits->bytes b) | |
| 89 | - | (match b | |
| 90 | - | ('() '()) | |
| 91 | - | ((a b c d e f g h bs ...) | |
| 92 | - | (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a)) | |
| 93 | - | (bits->bytes bs))) | |
| 94 | - | (_ (bits->bytes (append b (make-list (- 8 (length b)) 0)))))) | |
| 95 | - | ||
| 96 | - | (define (huffman-encode code str) | |
| 97 | - | (u8-list->bytevector | |
| 98 | - | (bits->bytes | |
| 99 | - | (apply append (map (lambda (c) (assoc-ref code c)) | |
| 100 | - | (append (string->list str) (list #\nul))))))) | |
| 101 | - | ||
| 102 | - | (define (byte->bits b) | |
| 103 | - | (let loop ((i 2) (b b) (by '())) | |
| 104 | - | (if (eq? i 512) | |
| 105 | - | by | |
| 106 | - | (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by))))) | |
| 107 | - | ||
| 108 | - | (define (huffman-decode huffman bv) | |
| 109 | - | (let ((seq (apply append (map byte->bits (bytevector->u8-list bv))))) | |
| 110 | - | (list->string | |
| 111 | - | (reverse | |
| 112 | - | (let loop ((seq seq) (result '()) (huff huffman)) | |
| 113 | - | (if (null? seq) | |
| 114 | - | result | |
| 115 | - | (match huff | |
| 116 | - | ((((h1 h2) . weight)) | |
| 117 | - | (if (= (car seq) 0) | |
| 118 | - | (loop (cdr seq) result h1) | |
| 119 | - | (loop (cdr seq) result h2))) | |
| 120 | - | (((h1 h2) . weight) | |
| 121 | - | (if (= (car seq) 0) | |
| 122 | - | (loop (cdr seq) result h1) | |
| 123 | - | (loop (cdr seq) result h2))) | |
| 124 | - | ((((? char? char) . weight)) | |
| 125 | - | (if (equal? char #\nul) | |
| 126 | - | result | |
| 127 | - | (loop seq (cons char result) huffman))) | |
| 128 | - | (((? char? char) . weight) | |
| 129 | - | (if (equal? char #\nul) | |
| 130 | - | result | |
| 131 | - | (loop seq (cons char result) huffman)))))))))) | |
| 132 | - | ||
| 133 | - | (define (serialize-huffman huffman) | |
| 134 | - | (define (serialize huffman) | |
| 135 | - | (match huffman | |
| 136 | - | ((((h1 h2) . weight)) | |
| 137 | - | (append '(1) (serialize h1) (serialize h2))) | |
| 138 | - | (((h1 h2) . weight) | |
| 139 | - | (append '(1) (serialize h1) (serialize h2))) | |
| 140 | - | ((((? char? char) . weight)) | |
| 141 | - | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
| 142 | - | (((? char? char) . weight) | |
| 143 | - | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))))) | |
| 144 | - | (u8-list->bytevector (serialize huffman))) | |
| 145 | - | ||
| 146 | - | (define (serialize-huffman-string huffman-code) | |
| 147 | - | (lambda (str pos bv) | |
| 148 | - | (let ((sbv (huffman-encode huffman-code str))) | |
| 149 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 150 | - | (+ pos (bytevector-length sbv))))) | |
| 151 | - | ||
| 152 | - | (define (huffman-string-size huffman-code) | |
| 153 | - | (lambda (str) | |
| 154 | - | (let ((sbv (huffman-encode huffman-code str))) | |
| 155 | - | (+ (bytevector-length sbv))))) |
modules/nani/jmdict/entities.scm unknown status 2
| 1 | - | (define-module (nani jmdict entities) | |
| 2 | - | #:use-module (nani result) | |
| 3 | - | #:use-module (nani tags) | |
| 4 | - | #:re-export (get-tag get-points)) |
modules/nani/jmdict/serialize.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani jmdict serialize) | |
| 20 | - | #:use-module (nani huffman) | |
| 21 | - | #:use-module (nani result) | |
| 22 | - | #:use-module (nani serialize) | |
| 23 | - | #:use-module (nani trie) | |
| 24 | - | #:use-module (ice-9 binary-ports) | |
| 25 | - | #:use-module (rnrs bytevectors) | |
| 26 | - | #:export (serialize-jmdict | |
| 27 | - | jmdict-entry-count)) | |
| 28 | - | ||
| 29 | - | (define (serialize-pointer ptr pos bv) | |
| 30 | - | (bytevector-u8-set! bv pos (car ptr)) | |
| 31 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
| 32 | - | (+ pos 5)) | |
| 33 | - | ||
| 34 | - | (define (serialize-trie trie results pos) | |
| 35 | - | (define (serialize-trie-aux transitions pos) | |
| 36 | - | (let loop ((pos pos) | |
| 37 | - | (trs transitions) | |
| 38 | - | (bvs '())) | |
| 39 | - | (if (null? trs) | |
| 40 | - | (cons pos bvs) | |
| 41 | - | (let* ((next-trie (cdr (car trs))) | |
| 42 | - | (bv (serialize-trie next-trie results pos)) | |
| 43 | - | (pos (car bv)) | |
| 44 | - | (bv (cdr bv))) | |
| 45 | - | (loop pos (cdr trs) (append bvs bv)))))) | |
| 46 | - | (trie-position-set! trie pos) | |
| 47 | - | (let* ((vals-sz (list-size (trie-vals trie) int-size)) | |
| 48 | - | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
| 49 | - | (sz (+ vals-sz 1 trs-sz)) | |
| 50 | - | (bv (make-bytevector sz 0))) | |
| 51 | - | (serialize-list (map (lambda (pos) (result-position (array-ref results pos))) | |
| 52 | - | (trie-vals trie)) | |
| 53 | - | serialize-int 0 bv) | |
| 54 | - | (let* ((bvs (serialize-trie-aux | |
| 55 | - | (trie-transitions trie) | |
| 56 | - | (+ pos (bytevector-length bv)))) | |
| 57 | - | (next-pos (car bvs)) | |
| 58 | - | (bvs (cdr bvs))) | |
| 59 | - | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
| 60 | - | (serialize-list (trie-transitions trie) serialize-pointer | |
| 61 | - | (+ vals-sz 1) bv #:size? #f) | |
| 62 | - | (cons next-pos (cons bv bvs))))) | |
| 63 | - | ||
| 64 | - | (define (serialize-jmdict results kanji-trie reading-trie sense-trie) | |
| 65 | - | (define kanji-huffman | |
| 66 | - | (let ((kanjis (apply append (map result-kanjis results)))) | |
| 67 | - | (create-huffman kanjis))) | |
| 68 | - | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
| 69 | - | (define reading-huffman | |
| 70 | - | (let* ((readings (apply append (map result-readings results))) | |
| 71 | - | (readings (apply append (map reading-readings readings)))) | |
| 72 | - | (create-huffman readings))) | |
| 73 | - | (define reading-huffman-code (huffman->code reading-huffman)) | |
| 74 | - | (define sense-huffman | |
| 75 | - | (let* ((senses (apply append (map result-senses results))) | |
| 76 | - | (glosses (apply append (map sense-glosses senses)))) | |
| 77 | - | (create-huffman glosses))) | |
| 78 | - | (define sense-huffman-code (huffman->code sense-huffman)) | |
| 79 | - | ||
| 80 | - | (define (serialize-source source pos bv) | |
| 81 | - | (when (not (source? source)) (throw 'not-source source)) | |
| 82 | - | (let* ((pos (serialize-list (source-content source) serialize-string pos bv)) | |
| 83 | - | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
| 84 | - | (pos (serialize-string (source-type source) pos bv)) | |
| 85 | - | (pos (serialize-string (source-lang source) pos bv))) | |
| 86 | - | pos)) | |
| 87 | - | (define (source-size source) | |
| 88 | - | (when (not (source? source)) (throw 'not-source source)) | |
| 89 | - | (+ (list-size (source-content source) string-size) | |
| 90 | - | (boolean-size (source-wasei? source)) | |
| 91 | - | (string-size (source-type source)) | |
| 92 | - | (string-size (source-lang source)))) | |
| 93 | - | ||
| 94 | - | (define (serialize-reading reading pos bv) | |
| 95 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 96 | - | (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv)) | |
| 97 | - | (pos (serialize-list (reading-info reading) serialize-string pos bv)) | |
| 98 | - | (pos (serialize-list (reading-readings reading) | |
| 99 | - | (serialize-huffman-string reading-huffman-code) pos bv))) | |
| 100 | - | pos)) | |
| 101 | - | (define (reading-size reading) | |
| 102 | - | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 103 | - | (+ (list-size (reading-kanjis reading) string-size) | |
| 104 | - | (list-size (reading-info reading) string-size) | |
| 105 | - | (list-size (reading-readings reading) (huffman-string-size reading-huffman-code)))) | |
| 106 | - | ||
| 107 | - | (define (serialize-sense sense pos bv) | |
| 108 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 109 | - | (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv)) | |
| 110 | - | (pos (serialize-list (sense-limits sense) serialize-string pos bv)) | |
| 111 | - | (pos (serialize-list (sense-infos sense) serialize-string pos bv)) | |
| 112 | - | (pos (serialize-list (sense-sources sense) serialize-source pos bv)) | |
| 113 | - | (pos (serialize-list (sense-tags sense) serialize-char pos bv)) | |
| 114 | - | (pos (serialize-list (sense-glosses sense) | |
| 115 | - | (serialize-huffman-string sense-huffman-code) pos bv)) | |
| 116 | - | (pos (serialize-string (sense-language sense) pos bv))) | |
| 117 | - | pos)) | |
| 118 | - | (define (sense-size sense) | |
| 119 | - | (when (not (sense? sense)) (throw 'not-sense sense)) | |
| 120 | - | (+ (list-size (sense-references sense) string-size) | |
| 121 | - | (list-size (sense-limits sense) string-size) | |
| 122 | - | (list-size (sense-infos sense) string-size) | |
| 123 | - | (list-size (sense-sources sense) source-size) | |
| 124 | - | (list-size (sense-tags sense) char-size) | |
| 125 | - | (list-size (sense-glosses sense) | |
| 126 | - | (huffman-string-size sense-huffman-code)) | |
| 127 | - | (string-size (sense-language sense)))) | |
| 128 | - | ||
| 129 | - | (define (serialize-result result pos bv) | |
| 130 | - | (when (not (result? result)) (throw 'not-result result)) | |
| 131 | - | (result-position-set! result pos) | |
| 132 | - | (let* ((pos (serialize-list (result-kanjis result) | |
| 133 | - | (serialize-huffman-string kanji-huffman-code) pos bv)) | |
| 134 | - | (pos (serialize-list (result-readings result) serialize-reading pos bv)) | |
| 135 | - | (pos (serialize-list (result-senses result) serialize-sense pos bv))) | |
| 136 | - | pos)) | |
| 137 | - | (define (result-size result) | |
| 138 | - | (when (not (result? result)) (throw 'not-result result)) | |
| 139 | - | (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code)) | |
| 140 | - | (list-size (result-readings result) reading-size) | |
| 141 | - | (list-size (result-senses result) sense-size))) | |
| 142 | - | ||
| 143 | - | (define (trie-node-size trie) | |
| 144 | - | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
| 145 | - | ||
| 146 | - | (let* ((header (string->utf8 "NANI_JMDICT001")) | |
| 147 | - | (header-size (bytevector-length header)) | |
| 148 | - | (pointers (make-bytevector 12 0)) | |
| 149 | - | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
| 150 | - | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
| 151 | - | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
| 152 | - | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
| 153 | - | (sense-huffman-bv (serialize-huffman sense-huffman)) | |
| 154 | - | (sense-huffman-size (bytevector-length sense-huffman-bv)) | |
| 155 | - | (results-size (list-size results result-size #:size? #f)) | |
| 156 | - | (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size)) | |
| 157 | - | (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size | |
| 158 | - | reading-huffman-size sense-huffman-size | |
| 159 | - | results-size 4)))) | |
| 160 | - | (format #t "Number of nodes in kanjis: ~a~%" | |
| 161 | - | (trie-node-size kanji-trie)) | |
| 162 | - | (format #t "Number of nodes in readings: ~a~%" | |
| 163 | - | (trie-node-size reading-trie)) | |
| 164 | - | (format #t "Number of nodes in senses: ~a~%" | |
| 165 | - | (trie-node-size sense-trie)) | |
| 166 | - | (serialize-list results serialize-result (+ header-size 12 huffman-size) | |
| 167 | - | results-bv #:size? #f) | |
| 168 | - | ;; number of entries | |
| 169 | - | (serialize-int (length results) (+ header-size 12 huffman-size results-size) | |
| 170 | - | results-bv) | |
| 171 | - | (let* ((results (list->array 1 results)) | |
| 172 | - | (pos (bytevector-length results-bv)) | |
| 173 | - | (kanji-bvs (serialize-trie kanji-trie results pos)) | |
| 174 | - | (pos (car kanji-bvs)) | |
| 175 | - | (reading-bvs (serialize-trie reading-trie results pos)) | |
| 176 | - | (pos (car reading-bvs)) | |
| 177 | - | (meaning-bvs (serialize-trie sense-trie results pos))) | |
| 178 | - | (bytevector-u32-set! pointers 0 (bytevector-length results-bv) (endianness big)) | |
| 179 | - | (bytevector-u32-set! pointers 4 (car kanji-bvs) (endianness big)) | |
| 180 | - | (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big)) | |
| 181 | - | (bytevector-copy! header 0 results-bv 0 header-size) | |
| 182 | - | (bytevector-copy! pointers 0 results-bv header-size 12) | |
| 183 | - | (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size) | |
| 184 | - | (bytevector-copy! reading-huffman-bv 0 results-bv | |
| 185 | - | (+ header-size 12 kanji-huffman-size) | |
| 186 | - | reading-huffman-size) | |
| 187 | - | (bytevector-copy! sense-huffman-bv 0 results-bv | |
| 188 | - | (+ header-size 12 kanji-huffman-size reading-huffman-size) | |
| 189 | - | sense-huffman-size) | |
| 190 | - | ;; give some feedback on the size of file's structures | |
| 191 | - | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
| 192 | - | (format #t "results is ~a bytes long~%" (bytevector-length results-bv)) | |
| 193 | - | (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs)))) | |
| 194 | - | (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs)))) | |
| 195 | - | (format #t "senses is ~a bytes long~%" (apply + (map bytevector-length (cdr meaning-bvs)))) | |
| 196 | - | (merge-bvs (append (list results-bv) (cdr kanji-bvs) (cdr reading-bvs) | |
| 197 | - | (cdr meaning-bvs)))))) | |
| 198 | - | ||
| 199 | - | (define (jmdict-entry-count file) | |
| 200 | - | (call-with-input-file file | |
| 201 | - | (lambda (port) | |
| 202 | - | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
| 203 | - | (pointers (get-bytevector-n port 12)) | |
| 204 | - | (kanji-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
| 205 | - | (seek port (- kanji-pos 4) SEEK_SET) | |
| 206 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) |
modules/nani/jmdict/trie.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani jmdict trie) | |
| 20 | - | #:use-module (nani trie) | |
| 21 | - | #:use-module (nani result) | |
| 22 | - | #:use-module (rnrs bytevectors) | |
| 23 | - | #:export (make-kanji-trie | |
| 24 | - | make-reading-trie | |
| 25 | - | make-meaning-trie)) | |
| 26 | - | ||
| 27 | - | (define (make-key key) | |
| 28 | - | (apply append | |
| 29 | - | (map | |
| 30 | - | (lambda (c) | |
| 31 | - | (list (quotient c 16) (modulo c 16))) | |
| 32 | - | (bytevector->u8-list (string->utf8 key))))) | |
| 33 | - | ||
| 34 | - | (define (make-kanji-trie results) | |
| 35 | - | (let ((trie (make-empty-trie))) | |
| 36 | - | (let loop ((results results) (i 0)) | |
| 37 | - | (if (null? results) | |
| 38 | - | trie | |
| 39 | - | (begin | |
| 40 | - | (for-each | |
| 41 | - | (lambda (key) | |
| 42 | - | (add-to-trie! trie (make-key key) i)) | |
| 43 | - | (result-kanjis (car results))) | |
| 44 | - | (loop (cdr results) (+ i 1))))))) | |
| 45 | - | ||
| 46 | - | (define (make-reading-trie results) | |
| 47 | - | (let ((trie (make-empty-trie))) | |
| 48 | - | (let loop ((results results) (i 0)) | |
| 49 | - | (if (null? results) | |
| 50 | - | trie | |
| 51 | - | (begin | |
| 52 | - | (for-each | |
| 53 | - | (lambda (reading) | |
| 54 | - | (for-each | |
| 55 | - | (lambda (key) | |
| 56 | - | (add-to-trie! trie (make-key key) i)) | |
| 57 | - | (reading-readings reading))) | |
| 58 | - | (result-readings (car results))) | |
| 59 | - | (loop (cdr results) (+ i 1))))))) | |
| 60 | - | ||
| 61 | - | (define (make-meaning-trie results) | |
| 62 | - | (let ((trie (make-empty-trie))) | |
| 63 | - | (let loop ((results results) (i 0)) | |
| 64 | - | (if (null? results) | |
| 65 | - | trie | |
| 66 | - | (begin | |
| 67 | - | (for-each | |
| 68 | - | (lambda (meaning) | |
| 69 | - | (for-each | |
| 70 | - | (lambda (key) | |
| 71 | - | (add-to-trie! trie (make-key key) i)) | |
| 72 | - | (sense-glosses meaning))) | |
| 73 | - | (result-senses (car results))) | |
| 74 | - | (loop (cdr results) (+ i 1))))))) |
modules/nani/jmdict/xml.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani jmdict xml) | |
| 20 | - | #:use-module (ice-9 match) | |
| 21 | - | #:use-module (ice-9 rdelim) | |
| 22 | - | #:use-module (sxml ssax) | |
| 23 | - | #:use-module (nani frequency) | |
| 24 | - | #:use-module (sxml simple) | |
| 25 | - | #:use-module (nani result) | |
| 26 | - | #:use-module (nani jmdict entities) | |
| 27 | - | #:export (load-dic xml->results)) | |
| 28 | - | ||
| 29 | - | (define (load-dic file) | |
| 30 | - | (xml->sxml (call-with-input-file file read-string))) | |
| 31 | - | ||
| 32 | - | (define (sxml->reading lst) | |
| 33 | - | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
| 34 | - | (if (null? lst) | |
| 35 | - | reading | |
| 36 | - | (loop | |
| 37 | - | (match (car lst) | |
| 38 | - | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
| 39 | - | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
| 40 | - | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
| 41 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 42 | - | ((? string? _) reading)) | |
| 43 | - | (cdr lst))))) | |
| 44 | - | ||
| 45 | - | (define (sxml->source lst) | |
| 46 | - | (let loop ((source (make-source '() #f "" "")) (lst lst)) | |
| 47 | - | (if (null? lst) | |
| 48 | - | source | |
| 49 | - | (loop | |
| 50 | - | (match (car lst) | |
| 51 | - | (('content c) (update-source source #:content (cons c (source-content source)))) | |
| 52 | - | (('ls_wasei _) (update-source source #:wasei? #t)) | |
| 53 | - | (('ls_type t) (update-source source #:type t)) | |
| 54 | - | ((('xml . 'lang) l) (update-source source #:lang l)) | |
| 55 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 56 | - | ((? list? l) (loop source l)) | |
| 57 | - | ((? string? c) (update-source source #:content | |
| 58 | - | (cons c (source-content source))))) | |
| 59 | - | (cdr lst))))) | |
| 60 | - | ||
| 61 | - | (define (sxml->sense lst) | |
| 62 | - | (let loop ((sense (make-sense '() '() '() '() '() '() "eng")) (lst lst)) | |
| 63 | - | (if (null? lst) | |
| 64 | - | sense | |
| 65 | - | (loop | |
| 66 | - | (match (car lst) | |
| 67 | - | (('ref (? string? r)) (update-sense sense #:references (cons r (sense-references sense)))) | |
| 68 | - | (('limit (? string? r)) (update-sense sense #:limits (cons r (sense-limits sense)))) | |
| 69 | - | (('info (? string? r)) (update-sense sense #:infos (cons r (sense-infos sense)))) | |
| 70 | - | ((? source? s) (update-sense sense #:sources (cons s (sense-sources sense)))) | |
| 71 | - | (('tag (? integer? r)) (update-sense sense #:tags (cons r (sense-tags sense)))) | |
| 72 | - | (('gloss (? string? r)) (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
| 73 | - | ((('xml . 'lang) (? string? l)) (update-sense sense #:language l)) | |
| 74 | - | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 75 | - | ((? list? l) (loop sense l)) | |
| 76 | - | ((? string? _) sense)) | |
| 77 | - | (cdr lst))))) | |
| 78 | - | ||
| 79 | - | (define (sxml->result lst frq) | |
| 80 | - | (let ((result | |
| 81 | - | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
| 82 | - | (if (null? lst) | |
| 83 | - | result | |
| 84 | - | (loop | |
| 85 | - | (match (car lst) | |
| 86 | - | (('kanji kanji) | |
| 87 | - | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
| 88 | - | ((? reading? r) | |
| 89 | - | (update-result result #:readings (cons r (result-readings result)))) | |
| 90 | - | ((? sense? s) | |
| 91 | - | (update-result result #:senses (cons s (result-senses result)))) | |
| 92 | - | ((? string? _) result)) | |
| 93 | - | (cdr lst)))))) | |
| 94 | - | (let* ((word (if (null? (result-kanjis result)) | |
| 95 | - | (car (reading-readings (car (result-readings result)))) | |
| 96 | - | (car (result-kanjis result)))) | |
| 97 | - | (entity (frequency-entity frq word)) | |
| 98 | - | (sense1 (car (result-senses result))) | |
| 99 | - | (sense1 (if entity | |
| 100 | - | (update-sense sense1 | |
| 101 | - | #:tags (cons (get-tag entity) (sense-tags sense1))) | |
| 102 | - | sense1)) | |
| 103 | - | (senses (cons sense1 (cdr (result-senses result))))) | |
| 104 | - | (update-result result | |
| 105 | - | #:points (get-points (update-result result #:senses senses)) | |
| 106 | - | #:senses senses)))) | |
| 107 | - | ||
| 108 | - | (define (sxml->element lst elem frq) | |
| 109 | - | (match elem | |
| 110 | - | ('ent_seq "") | |
| 111 | - | ('ke_pri "") | |
| 112 | - | ('re_nokanji "") | |
| 113 | - | ('re_pri "") | |
| 114 | - | ('ke_inf "") | |
| 115 | - | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
| 116 | - | `(tag ,(get-tag (car lst))) | |
| 117 | - | (throw 'invalid-misc lst))) | |
| 118 | - | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
| 119 | - | `(limit ,(car lst)) | |
| 120 | - | (throw 'invalid-re_restr lst))) | |
| 121 | - | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
| 122 | - | `(kanji ,(car lst)) | |
| 123 | - | (throw 'invalid-keb lst))) | |
| 124 | - | ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst)) | |
| 125 | - | `(info ,(apply string-append lst)) | |
| 126 | - | (throw 'invalid-s_inf lst))) | |
| 127 | - | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
| 128 | - | `(tag ,(get-tag (car lst))) | |
| 129 | - | (throw 'invalid-dial lst))) | |
| 130 | - | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
| 131 | - | `(info ,(car lst)) | |
| 132 | - | (throw 'invalid-re_inf lst))) | |
| 133 | - | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
| 134 | - | `(limit ,(car lst)) | |
| 135 | - | (throw 'invalid-stagk lst))) | |
| 136 | - | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
| 137 | - | `(limit ,(car lst)) | |
| 138 | - | (throw 'invalid-stagr lst))) | |
| 139 | - | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
| 140 | - | `(tag ,(get-tag (car lst))) | |
| 141 | - | (throw 'invalid-field lst))) | |
| 142 | - | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
| 143 | - | `(ref ,(car lst)) | |
| 144 | - | (throw 'invalid-ant lst))) | |
| 145 | - | ('reb (if (and (= (length lst) 1) (string? (car lst))) | |
| 146 | - | `(reading ,(car lst)) | |
| 147 | - | (throw 'invalid-reb lst))) | |
| 148 | - | ('r_ele (sxml->reading lst)) | |
| 149 | - | ('k_ele (car (filter list? lst))) | |
| 150 | - | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
| 151 | - | `(tag ,(get-tag (car lst))) | |
| 152 | - | (throw 'invalid-pos lst))) | |
| 153 | - | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
| 154 | - | `(ref ,(car lst)) | |
| 155 | - | (throw 'invalid-xref lst))) | |
| 156 | - | ('gloss (cons | |
| 157 | - | `(gloss ,(apply string-append (filter string? lst))) | |
| 158 | - | (filter list? lst))) | |
| 159 | - | ('lsource (sxml->source lst)) | |
| 160 | - | ('sense (sxml->sense lst)) | |
| 161 | - | ('entry (sxml->result lst frq)))) | |
| 162 | - | ||
| 163 | - | (define (create-parser frq) | |
| 164 | - | (ssax:make-parser | |
| 165 | - | NEW-LEVEL-SEED | |
| 166 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 167 | - | (map | |
| 168 | - | (match-lambda | |
| 169 | - | ((k . v) (list k v))) | |
| 170 | - | (filter | |
| 171 | - | (match-lambda | |
| 172 | - | ((k . v) (not (member k '(g_type))))) | |
| 173 | - | attributes))) | |
| 174 | - | ||
| 175 | - | FINISH-ELEMENT | |
| 176 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 177 | - | (if (equal? elem-gi 'JMdict) | |
| 178 | - | seed | |
| 179 | - | (let* ((seed (reverse seed)) | |
| 180 | - | (element (sxml->element seed elem-gi frq))) | |
| 181 | - | (cons element parent-seed)))) | |
| 182 | - | ||
| 183 | - | CHAR-DATA-HANDLER | |
| 184 | - | (lambda (string1 string2 seed) | |
| 185 | - | (cons (string-append string1 string2) seed)))) | |
| 186 | - | ||
| 187 | - | (define (xml->results port frq) | |
| 188 | - | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 189 | - | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) |
modules/nani/kanji/radk.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani kanji radk) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (ice-9 match) | |
| 22 | + | #:use-module (ice-9 peg) | |
| 23 | + | #:use-module (ice-9 rdelim) | |
| 24 | + | #:use-module (nani encoding parse) | |
| 25 | + | #:use-module (nani encoding serialize) | |
| 26 | + | #:use-module (rnrs bytevectors) | |
| 27 | + | #:use-module (sxml simple) | |
| 28 | + | #:export (parse-radk | |
| 29 | + | get-kanji-stroke | |
| 30 | + | get-rad-kanji | |
| 31 | + | get-rad-stroke | |
| 32 | + | serialize-radk | |
| 33 | + | kanji-count)) | |
| 34 | + | ||
| 35 | + | ;; PEG parser for the radk file | |
| 36 | + | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
| 37 | + | (define-peg-pattern space none " ") | |
| 38 | + | (define-peg-pattern return none "\n") | |
| 39 | + | (define-peg-pattern entry all | |
| 40 | + | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
| 41 | + | (+ (or char (ignore "\n"))))) | |
| 42 | + | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
| 43 | + | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
| 44 | + | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
| 45 | + | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
| 46 | + | ||
| 47 | + | ;; parse the file with the peg parser, to low-level alist structure | |
| 48 | + | (define (parse-radk file) | |
| 49 | + | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
| 50 | + | ||
| 51 | + | (define (get-rad-kanji content) | |
| 52 | + | (let loop ((result '()) (content content)) | |
| 53 | + | (match content | |
| 54 | + | (() result) | |
| 55 | + | ((('entry ('char radical) ('num stroke) ('char kanji)) content ...) | |
| 56 | + | (loop (cons (list radical kanji) result) | |
| 57 | + | content)) | |
| 58 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 59 | + | (loop (cons (cons radical kanji) result) | |
| 60 | + | content))))) | |
| 61 | + | ||
| 62 | + | (define (get-rad-stroke content) | |
| 63 | + | (let loop ((result '()) (content content)) | |
| 64 | + | (match content | |
| 65 | + | (() result) | |
| 66 | + | ((('entry ('char radical) ('num stroke) ('char kanji)) content ...) | |
| 67 | + | (loop (cons (cons radical (string->number stroke)) result) | |
| 68 | + | content)) | |
| 69 | + | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 70 | + | (loop (cons (cons radical (string->number stroke)) result) | |
| 71 | + | content))))) | |
| 72 | + | ||
| 73 | + | ;; return an alist where keys are kanjis, and values the associated stroke count. | |
| 74 | + | (define (get-kanji-stroke file) | |
| 75 | + | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
| 76 | + | ||
| 77 | + | (match strokes | |
| 78 | + | (('*TOP* _ ('kanjidic2 content ...)) | |
| 79 | + | (map | |
| 80 | + | (lambda (entry) | |
| 81 | + | (let* ((literal (car (assoc-ref entry 'literal))) | |
| 82 | + | (misc (assoc-ref entry 'misc)) | |
| 83 | + | (misc (filter list? misc)) | |
| 84 | + | (stroke (car (assoc-ref misc 'stroke_count)))) | |
| 85 | + | (cons literal (string->number stroke)))) | |
| 86 | + | (filter | |
| 87 | + | (lambda (entry) | |
| 88 | + | (and | |
| 89 | + | (list? entry) | |
| 90 | + | (equal? (car entry) 'character))) | |
| 91 | + | content))))) | |
| 92 | + | ||
| 93 | + | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
| 94 | + | (define (serialize-rad-kanji-element element pos bv) | |
| 95 | + | (match element | |
| 96 | + | ((radical kanji ...) | |
| 97 | + | (let* ((pos (serialize-string (radical-character radical) pos bv))) | |
| 98 | + | (serialize-string (string-join kanji "") pos bv))))) | |
| 99 | + | (define (rad-kanji-element-size element) | |
| 100 | + | (match element | |
| 101 | + | ((radical kanji ...) | |
| 102 | + | (+ (string-size (radical-character radical)) | |
| 103 | + | (string-size (string-join kanji "")))))) | |
| 104 | + | ||
| 105 | + | (define (serialize-rad-kanji rad-kanji pos bv) | |
| 106 | + | ((serialize-list serialize-rad-kanji-element) rad-kanji pos bv)) | |
| 107 | + | (define (rad-kanji-size rad-kanji) | |
| 108 | + | ((list-size rad-kanji-element-size) rad-kanji)) | |
| 109 | + | ||
| 110 | + | (define (serialize-rad-stroke-element element pos bv) | |
| 111 | + | (match element | |
| 112 | + | ((radical . stroke) | |
| 113 | + | (let ((pos (serialize-string (radical-character radical) pos bv))) | |
| 114 | + | (serialize-char stroke pos bv))))) | |
| 115 | + | (define (rad-stroke-element-size element) | |
| 116 | + | (match element | |
| 117 | + | ((radical . stroke) | |
| 118 | + | (+ (string-size (radical-character radical)) (char-size stroke))))) | |
| 119 | + | ||
| 120 | + | (define (serialize-rad-stroke rad-stroke pos bv) | |
| 121 | + | ((serialize-list serialize-rad-stroke-element) rad-stroke pos bv)) | |
| 122 | + | (define (rad-stroke-size rad-stroke) | |
| 123 | + | ((list-size rad-stroke-element-size) rad-stroke)) | |
| 124 | + | ||
| 125 | + | (define (serialize-kanji-stroke-element element pos bv) | |
| 126 | + | (match element | |
| 127 | + | ((kanji . stroke) | |
| 128 | + | (let ((pos (serialize-string kanji pos bv))) | |
| 129 | + | (serialize-char stroke pos bv))))) | |
| 130 | + | (define (kanji-stroke-element-size element) | |
| 131 | + | (match element | |
| 132 | + | ((kanji . stroke) | |
| 133 | + | (+ (string-size kanji) (char-size stroke))))) | |
| 134 | + | ||
| 135 | + | (define (serialize-kanji-stroke kanji-stroke pos bv) | |
| 136 | + | ((serialize-list serialize-kanji-stroke-element) kanji-stroke pos bv)) | |
| 137 | + | (define (kanji-stroke-size kanji-stroke) | |
| 138 | + | ((list-size kanji-stroke-element-size) kanji-stroke)) | |
| 139 | + | ||
| 140 | + | (let* ((header (string->utf8 "NANI_RADK001")) | |
| 141 | + | (header-size (bytevector-length header)) | |
| 142 | + | (bv (make-bytevector (+ header-size 12 | |
| 143 | + | (rad-kanji-size rad-kanji) | |
| 144 | + | (rad-stroke-size rad-stroke) | |
| 145 | + | (kanji-stroke-size kanji-stroke))))) | |
| 146 | + | (bytevector-copy! header 0 bv 0 header-size) | |
| 147 | + | (let* ((pos header-size) | |
| 148 | + | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
| 149 | + | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
| 150 | + | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
| 151 | + | bv))) | |
| 152 | + | ||
| 153 | + | (define (radical-character kanji) | |
| 154 | + | (match kanji | |
| 155 | + | ("???" "???") | |
| 156 | + | ("???" "????") | |
| 157 | + | ("???" "???") | |
| 158 | + | ("???" "???") | |
| 159 | + | ("???" "????") | |
| 160 | + | ("???" "???") | |
| 161 | + | ("???" "???") | |
| 162 | + | ("???" "???") | |
| 163 | + | ("???" "???") | |
| 164 | + | ("???" "???") | |
| 165 | + | ("???" "???") | |
| 166 | + | ("???" "???") | |
| 167 | + | ("???" "???") | |
| 168 | + | ("???" "???") | |
| 169 | + | ("???" "???") | |
| 170 | + | ("???" "???") | |
| 171 | + | ("???" "???") | |
| 172 | + | ("???" "???") | |
| 173 | + | ("???" "???") | |
| 174 | + | ("???" "???") | |
| 175 | + | ("???" "???") | |
| 176 | + | ("???" "???") | |
| 177 | + | (_ kanji))) | |
| 178 | + | ||
| 179 | + | ;; parse a serialized file back to the guile structure | |
| 180 | + | (define (parse file) | |
| 181 | + | (define (parse-rad-kanji-element port) | |
| 182 | + | (let ((radical (parse-string port)) | |
| 183 | + | (kanji-list (parse-string port))) | |
| 184 | + | (cons radical (string->list kanji-list)))) | |
| 185 | + | (define (parse-rad-kanji port) | |
| 186 | + | ((parse-list parse-rad-kanji-element) port)) | |
| 187 | + | ||
| 188 | + | (define (parse-rad-stroke-element port) | |
| 189 | + | (let ((radical (parse-string port)) | |
| 190 | + | (stroke (parse-char port))) | |
| 191 | + | (cons radical stroke))) | |
| 192 | + | (define (parse-rad-stroke port) | |
| 193 | + | ((parse-list parse-rad-stroke-element) port)) | |
| 194 | + | ||
| 195 | + | (define (parse-kanji-stroke-element port) | |
| 196 | + | (let ((kanji (parse-string port)) | |
| 197 | + | (stroke (parse-char port))) | |
| 198 | + | (cons kanji stroke))) | |
| 199 | + | (define (parse-kanji-stroke port) | |
| 200 | + | ((parse-list parse-kanji-stroke-element) port)) | |
| 201 | + | ||
| 202 | + | (call-with-input-file file | |
| 203 | + | (lambda (port) | |
| 204 | + | (let* ((header (utf8->string (get-bytevector-n port 12))) | |
| 205 | + | (rad-kanji (parse-rad-kanji port)) | |
| 206 | + | (rad-stroke (parse-rad-stroke port)) | |
| 207 | + | (kanji-stroke (parse-kanji-stroke port))) | |
| 208 | + | (list rad-kanji rad-stroke kanji-stroke))))) | |
| 209 | + | ||
| 210 | + | (define (get-kanji-list content) | |
| 211 | + | (let loop ((result '()) (content content)) | |
| 212 | + | (match content | |
| 213 | + | (() result) | |
| 214 | + | (((_ kanji ...) content ...) | |
| 215 | + | (loop (append result (filter (lambda (k) (not (member k result))) kanji)) | |
| 216 | + | content))))) | |
| 217 | + | ||
| 218 | + | (define (kanji-count file) | |
| 219 | + | (match (parse file) | |
| 220 | + | ((rad-kanji _ _) | |
| 221 | + | (length (get-kanji-list rad-kanji))))) |
modules/nani/parse-result.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani parse-result) | |
| 20 | - | #:use-module (ice-9 binary-ports) | |
| 21 | - | #:use-module (rnrs bytevectors) | |
| 22 | - | #:export (parse-result-list | |
| 23 | - | parse-result-char | |
| 24 | - | parse-result-int | |
| 25 | - | parse-result-boolean | |
| 26 | - | parse-result-string)) | |
| 27 | - | ||
| 28 | - | (define* (parse-result-list port parse-result-element) | |
| 29 | - | (let ((size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big)))) | |
| 30 | - | (let loop ((result '()) (remaining size)) | |
| 31 | - | (if (= remaining 0) | |
| 32 | - | (reverse result) | |
| 33 | - | (loop (cons (parse-result-element port) result) (- remaining 1)))))) | |
| 34 | - | ||
| 35 | - | (define (parse-result-char port) | |
| 36 | - | (get-u8 port)) | |
| 37 | - | ||
| 38 | - | (define (parse-result-int port) | |
| 39 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))) | |
| 40 | - | ||
| 41 | - | (define (parse-result-boolean port) | |
| 42 | - | (= (get-u8 port) 1)) | |
| 43 | - | ||
| 44 | - | (define (parse-result-string port) | |
| 45 | - | (define (get-result-string port) | |
| 46 | - | (let loop ((lu8 '()) (char (get-u8 port))) | |
| 47 | - | (if (= char 0) | |
| 48 | - | lu8 | |
| 49 | - | (loop (cons char lu8) (get-u8 port))))) | |
| 50 | - | (utf8->string (u8-list->bytevector (reverse (get-result-string port))))) |
modules/nani/pitch/pitch.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani pitch pitch) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (ice-9 match) | |
| 22 | + | #:use-module (nani encoding huffman) | |
| 23 | + | #:use-module (nani encoding trie) | |
| 24 | + | #:use-module (rnrs bytevectors) | |
| 25 | + | #:use-module (srfi srfi-1) | |
| 26 | + | #:use-module (srfi srfi-9) | |
| 27 | + | #:export (make-pitch | |
| 28 | + | pitch? | |
| 29 | + | pitch-kanji | |
| 30 | + | pitch-reading | |
| 31 | + | pitch-accents | |
| 32 | + | ||
| 33 | + | serialize-pitch | |
| 34 | + | pitch-entry-count)) | |
| 35 | + | ||
| 36 | + | (define-record-type pitch | |
| 37 | + | (make-pitch kanji reading accents) | |
| 38 | + | pitch? | |
| 39 | + | (kanji pitch-kanji) | |
| 40 | + | (reading pitch-reading) | |
| 41 | + | (accents pitch-accents)) | |
| 42 | + | ||
| 43 | + | (define (make-trie-key key) | |
| 44 | + | (append-map | |
| 45 | + | (lambda (c) | |
| 46 | + | (list (quotient c 16) (modulo c 16))) | |
| 47 | + | (bytevector->u8-list (string->utf8 key)))) | |
| 48 | + | ||
| 49 | + | (define (get-pitch-trie pitches) | |
| 50 | + | (let ((trie (make-empty-trie))) | |
| 51 | + | (for-each | |
| 52 | + | (lambda (pitch) | |
| 53 | + | (let ((key (string-append (pitch-kanji pitch) (pitch-reading pitch)))) | |
| 54 | + | (for-each | |
| 55 | + | (lambda (accent) | |
| 56 | + | (add-to-trie! trie (make-trie-key key) accent)) | |
| 57 | + | (pitch-accents pitch)))) | |
| 58 | + | pitches) | |
| 59 | + | (compress-trie trie))) | |
| 60 | + | ||
| 61 | + | (define (collapse-vals! trie) | |
| 62 | + | (let ((transitions (trie-transitions trie)) | |
| 63 | + | (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i))) | |
| 64 | + | (trie-vals trie)))) | |
| 65 | + | (trie-vals-set! trie (string-join vals ", ")) | |
| 66 | + | (for-each collapse-vals! (map cdr transitions)))) | |
| 67 | + | ||
| 68 | + | (define (collect-vals trie) | |
| 69 | + | (let ((transitions (trie-transitions trie)) | |
| 70 | + | (vals (trie-vals trie))) | |
| 71 | + | (cons vals (append-map collect-vals (map cdr transitions))))) | |
| 72 | + | ||
| 73 | + | (define (serialize-pitch pitches) | |
| 74 | + | (let ((trie (get-pitch-trie pitches))) | |
| 75 | + | (collapse-vals! trie) | |
| 76 | + | (let* ((huffman (create-huffman (collect-vals trie))) | |
| 77 | + | (code (huffman->code huffman))) | |
| 78 | + | (let* ((header (string->utf8 "NANI_PITCH001")) | |
| 79 | + | (header-size (bytevector-length header)) | |
| 80 | + | (huffman-bv (serialize-huffman huffman)) | |
| 81 | + | (huffman-size (bytevector-length huffman-bv)) | |
| 82 | + | (trie-size ((trie-size-single (huffman-string-size code)) trie)) | |
| 83 | + | (result (make-bytevector (+ header-size 4 huffman-size trie-size)))) | |
| 84 | + | (bytevector-copy! header 0 result 0 header-size) | |
| 85 | + | (bytevector-u32-set! result header-size (length pitches) (endianness big)) | |
| 86 | + | (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size) | |
| 87 | + | ((serialize-trie-single (serialize-huffman-string code) | |
| 88 | + | (huffman-string-size code)) | |
| 89 | + | trie (+ header-size 4 huffman-size) result) | |
| 90 | + | result)))) | |
| 91 | + | ||
| 92 | + | (define (pitch-entry-count file) | |
| 93 | + | (call-with-input-file file | |
| 94 | + | (lambda (port) | |
| 95 | + | ;; header | |
| 96 | + | (get-bytevector-n port 13) | |
| 97 | + | ;; size | |
| 98 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))) |
modules/nani/pitch/wadoku.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani pitch wadoku) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (nani pitch pitch) | |
| 22 | + | #:use-module (srfi srfi-1) | |
| 23 | + | #:use-module (sxml ssax) | |
| 24 | + | #:export (xml->pitch)) | |
| 25 | + | ||
| 26 | + | (define (sxml->element lst elem) | |
| 27 | + | (let ((elem (match elem | |
| 28 | + | ((_ . elem) elem) | |
| 29 | + | (_ elem)))) | |
| 30 | + | (match elem | |
| 31 | + | ('accent `(pitch . ,(car lst))) | |
| 32 | + | ('hira `(reading . ,(car lst))) | |
| 33 | + | ('orth (let ((kanji (filter string? lst))) | |
| 34 | + | (if (null? kanji) #f `(kanji . ,(car kanji))))) | |
| 35 | + | ('form | |
| 36 | + | `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst))) | |
| 37 | + | ('reading (filter pair? lst)) | |
| 38 | + | ('entry | |
| 39 | + | (let loop ((lst lst) (kanjis '()) (readings '()) (accents '())) | |
| 40 | + | (if (null? lst) | |
| 41 | + | (if (or (and (null? kanjis) (null? readings)) (null? accents)) | |
| 42 | + | #f | |
| 43 | + | (append-map | |
| 44 | + | (lambda (reading) | |
| 45 | + | (map | |
| 46 | + | (lambda (kanji) | |
| 47 | + | (make-pitch kanji reading accents)) | |
| 48 | + | kanjis)) | |
| 49 | + | readings)) | |
| 50 | + | (match (car lst) | |
| 51 | + | (('form . f) | |
| 52 | + | (loop (append f (cdr lst)) kanjis readings accents)) | |
| 53 | + | (('pitch . pitch) | |
| 54 | + | (loop (cdr lst) kanjis readings (cons pitch accents))) | |
| 55 | + | (('reading . reading) | |
| 56 | + | (loop (cdr lst) kanjis (cons reading readings) accents)) | |
| 57 | + | (('kanji . kanji) | |
| 58 | + | (loop (cdr lst) (cons kanji kanjis) readings accents)) | |
| 59 | + | (_ (loop (cdr lst) kanjis readings accents)))))) | |
| 60 | + | (_ #f)))) | |
| 61 | + | ||
| 62 | + | (define parser | |
| 63 | + | (ssax:make-parser | |
| 64 | + | NEW-LEVEL-SEED | |
| 65 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 66 | + | attributes) | |
| 67 | + | ||
| 68 | + | FINISH-ELEMENT | |
| 69 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 70 | + | (if (equal? elem-gi 'entries) | |
| 71 | + | seed | |
| 72 | + | (let* ((seed (reverse seed)) | |
| 73 | + | (element (sxml->element seed elem-gi))) | |
| 74 | + | (cons element parent-seed)))) | |
| 75 | + | ||
| 76 | + | CHAR-DATA-HANDLER | |
| 77 | + | (lambda (string1 string2 seed) | |
| 78 | + | (cons (string-append string1 string2) seed)))) | |
| 79 | + | ||
| 80 | + | (define (xml->pitch port) | |
| 81 | + | (filter pitch? (apply append (filter list? (parser port '()))))) |
modules/nani/radk.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani radk) | |
| 20 | - | #:use-module (ice-9 binary-ports) | |
| 21 | - | #:use-module (ice-9 match) | |
| 22 | - | #:use-module (ice-9 peg) | |
| 23 | - | #:use-module (ice-9 rdelim) | |
| 24 | - | #:use-module (nani parse-result) | |
| 25 | - | #:use-module (nani serialize) | |
| 26 | - | #:use-module (rnrs bytevectors) | |
| 27 | - | #:use-module (sxml simple) | |
| 28 | - | #:export (parse-radk | |
| 29 | - | get-kanji-stroke | |
| 30 | - | get-rad-kanji | |
| 31 | - | get-rad-stroke | |
| 32 | - | serialize-radk | |
| 33 | - | kanji-count)) | |
| 34 | - | ||
| 35 | - | (define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n")) | |
| 36 | - | (define-peg-pattern space none " ") | |
| 37 | - | (define-peg-pattern return none "\n") | |
| 38 | - | (define-peg-pattern entry all | |
| 39 | - | (and (ignore "$") space char space num (? (and space name)) (ignore "\n") | |
| 40 | - | (+ (or char (ignore "\n"))))) | |
| 41 | - | (define-peg-pattern num all (+ (or (range #\0 #\9)))) | |
| 42 | - | (define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z)))) | |
| 43 | - | (define-peg-pattern char all (and (range #\xff #\x10ffff))) | |
| 44 | - | (define-peg-pattern radk-doc body (* (or return comment entry))) | |
| 45 | - | ||
| 46 | - | (define (parse-radk file) | |
| 47 | - | (peg:tree (match-pattern radk-doc (call-with-input-file file read-string)))) | |
| 48 | - | ||
| 49 | - | (define (get-rad-kanji content) | |
| 50 | - | (let loop ((result '()) (content content)) | |
| 51 | - | (match content | |
| 52 | - | (() result) | |
| 53 | - | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 54 | - | (loop (cons (cons radical kanji) result) | |
| 55 | - | content))))) | |
| 56 | - | ||
| 57 | - | (define (get-rad-stroke content) | |
| 58 | - | (let loop ((result '()) (content content)) | |
| 59 | - | (match content | |
| 60 | - | (() result) | |
| 61 | - | ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...) | |
| 62 | - | (loop (cons (cons radical (string->number stroke)) result) | |
| 63 | - | content))))) | |
| 64 | - | ||
| 65 | - | (define (get-kanji-stroke file) | |
| 66 | - | (define strokes (xml->sxml (call-with-input-file file read-string))) | |
| 67 | - | ||
| 68 | - | (match strokes | |
| 69 | - | (('*TOP* _ ('kanjidic2 content ...)) | |
| 70 | - | (map | |
| 71 | - | (lambda (entry) | |
| 72 | - | (let* ((literal (car (assoc-ref entry 'literal))) | |
| 73 | - | (misc (assoc-ref entry 'misc)) | |
| 74 | - | (misc (filter list? misc)) | |
| 75 | - | (stroke (car (assoc-ref misc 'stroke_count)))) | |
| 76 | - | (cons literal (string->number stroke)))) | |
| 77 | - | (filter | |
| 78 | - | (lambda (entry) | |
| 79 | - | (and | |
| 80 | - | (list? entry) | |
| 81 | - | (equal? (car entry) 'character))) | |
| 82 | - | content))))) | |
| 83 | - | ||
| 84 | - | (define (serialize-radk rad-kanji rad-stroke kanji-stroke) | |
| 85 | - | (define (serialize-rad-kanji-element element pos bv) | |
| 86 | - | (match element | |
| 87 | - | ((radical kanji ...) | |
| 88 | - | (let* ((pos (serialize-string (radical-character radical) pos bv))) | |
| 89 | - | (serialize-string (string-join kanji "") pos bv))))) | |
| 90 | - | (define (rad-kanji-element-size element) | |
| 91 | - | (match element | |
| 92 | - | ((radical kanji ...) | |
| 93 | - | (+ (string-size (radical-character radical)) | |
| 94 | - | (string-size (string-join kanji "")))))) | |
| 95 | - | ||
| 96 | - | (define (serialize-rad-kanji rad-kanji pos bv) | |
| 97 | - | (serialize-list rad-kanji serialize-rad-kanji-element pos bv)) | |
| 98 | - | (define (rad-kanji-size rad-kanji) | |
| 99 | - | (list-size rad-kanji rad-kanji-element-size)) | |
| 100 | - | ||
| 101 | - | (define (serialize-rad-stroke-element element pos bv) | |
| 102 | - | (match element | |
| 103 | - | ((radical . stroke) | |
| 104 | - | (let ((pos (serialize-string (radical-character radical) pos bv))) | |
| 105 | - | (serialize-char stroke pos bv))))) | |
| 106 | - | (define (rad-stroke-element-size element) | |
| 107 | - | (match element | |
| 108 | - | ((radical . stroke) | |
| 109 | - | (+ (string-size (radical-character radical)) (char-size stroke))))) | |
| 110 | - | ||
| 111 | - | (define (serialize-rad-stroke rad-stroke pos bv) | |
| 112 | - | (serialize-list rad-stroke serialize-rad-stroke-element pos bv)) | |
| 113 | - | (define (rad-stroke-size rad-stroke) | |
| 114 | - | (list-size rad-stroke rad-stroke-element-size)) | |
| 115 | - | ||
| 116 | - | (define (serialize-kanji-stroke-element element pos bv) | |
| 117 | - | (match element | |
| 118 | - | ((kanji . stroke) | |
| 119 | - | (let ((pos (serialize-string kanji pos bv))) | |
| 120 | - | (serialize-char stroke pos bv))))) | |
| 121 | - | (define (kanji-stroke-element-size element) | |
| 122 | - | (match element | |
| 123 | - | ((kanji . stroke) | |
| 124 | - | (+ (string-size kanji) (char-size stroke))))) | |
| 125 | - | ||
| 126 | - | (define (serialize-kanji-stroke kanji-stroke pos bv) | |
| 127 | - | (serialize-list kanji-stroke serialize-kanji-stroke-element pos bv)) | |
| 128 | - | (define (kanji-stroke-size kanji-stroke) | |
| 129 | - | (list-size kanji-stroke kanji-stroke-element-size)) | |
| 130 | - | ||
| 131 | - | (let* ((header (string->utf8 "NANI_RADK001")) | |
| 132 | - | (header-size (bytevector-length header)) | |
| 133 | - | (bv (make-bytevector (+ header-size 12 | |
| 134 | - | (rad-kanji-size rad-kanji) | |
| 135 | - | (rad-stroke-size rad-stroke) | |
| 136 | - | (kanji-stroke-size kanji-stroke))))) | |
| 137 | - | (bytevector-copy! header 0 bv 0 header-size) | |
| 138 | - | (let* ((pos header-size) | |
| 139 | - | (pos (serialize-rad-kanji rad-kanji pos bv)) | |
| 140 | - | (pos (serialize-rad-stroke rad-stroke pos bv)) | |
| 141 | - | (pos (serialize-kanji-stroke kanji-stroke pos bv))) | |
| 142 | - | bv))) | |
| 143 | - | ||
| 144 | - | (define (radical-character kanji) | |
| 145 | - | (match kanji | |
| 146 | - | ("???" "???") | |
| 147 | - | ("???" "????") | |
| 148 | - | ("???" "???") | |
| 149 | - | ("???" "???") | |
| 150 | - | ("???" "????") | |
| 151 | - | ("???" "???") | |
| 152 | - | ("???" "???") | |
| 153 | - | ("???" "???") | |
| 154 | - | ("???" "???") | |
| 155 | - | ("???" "???") | |
| 156 | - | ("???" "???") | |
| 157 | - | ("???" "???") | |
| 158 | - | ("???" "???") | |
| 159 | - | ("???" "???") | |
| 160 | - | ("???" "???") | |
| 161 | - | ("???" "???") | |
| 162 | - | ("???" "???") | |
| 163 | - | ("???" "???") | |
| 164 | - | ("???" "???") | |
| 165 | - | ("???" "???") | |
| 166 | - | ("???" "???") | |
| 167 | - | ("???" "???") | |
| 168 | - | (_ kanji))) | |
| 169 | - | ||
| 170 | - | (define (parse-result file) | |
| 171 | - | (define (parse-result-rad-kanji-element port) | |
| 172 | - | (let ((radical (parse-result-string port)) | |
| 173 | - | (kanji-list (parse-result-string port))) | |
| 174 | - | (cons radical (string->list kanji-list)))) | |
| 175 | - | (define (parse-result-rad-kanji port) | |
| 176 | - | (parse-result-list port parse-result-rad-kanji-element)) | |
| 177 | - | ||
| 178 | - | (define (parse-result-rad-stroke-element port) | |
| 179 | - | (let ((radical (parse-result-string port)) | |
| 180 | - | (stroke (parse-result-char port))) | |
| 181 | - | (cons radical stroke))) | |
| 182 | - | (define (parse-result-rad-stroke port) | |
| 183 | - | (parse-result-list port parse-result-rad-stroke-element)) | |
| 184 | - | ||
| 185 | - | (define (parse-result-kanji-stroke-element port) | |
| 186 | - | (let ((kanji (parse-result-string port)) | |
| 187 | - | (stroke (parse-result-char port))) | |
| 188 | - | (cons kanji stroke))) | |
| 189 | - | (define (parse-result-kanji-stroke port) | |
| 190 | - | (parse-result-list port parse-result-kanji-stroke-element)) | |
| 191 | - | ||
| 192 | - | (call-with-input-file file | |
| 193 | - | (lambda (port) | |
| 194 | - | (let* ((header (utf8->string (get-bytevector-n port 12))) | |
| 195 | - | (rad-kanji (parse-result-rad-kanji port)) | |
| 196 | - | (rad-stroke (parse-result-rad-stroke port)) | |
| 197 | - | (kanji-stroke (parse-result-kanji-stroke port))) | |
| 198 | - | (list rad-kanji rad-stroke kanji-stroke))))) | |
| 199 | - | ||
| 200 | - | (define (get-kanji-list content) | |
| 201 | - | (let loop ((result '()) (content content)) | |
| 202 | - | (match content | |
| 203 | - | (() result) | |
| 204 | - | (((_ kanji ...) content ...) | |
| 205 | - | (loop (append result (filter (lambda (k) (not (member k result))) kanji)) | |
| 206 | - | content))))) | |
| 207 | - | ||
| 208 | - | (define (kanji-count file) | |
| 209 | - | (match (parse-result file) | |
| 210 | - | ((rad-kanji _ _) | |
| 211 | - | (length (get-kanji-list rad-kanji))))) |
modules/nani/result.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani result) | |
| 20 | - | #:use-module (srfi srfi-9) | |
| 21 | - | #:export (make-result | |
| 22 | - | result? | |
| 23 | - | result-position | |
| 24 | - | result-position-set! | |
| 25 | - | result-points | |
| 26 | - | result-kanjis | |
| 27 | - | result-readings | |
| 28 | - | result-senses | |
| 29 | - | ||
| 30 | - | make-reading | |
| 31 | - | reading? | |
| 32 | - | reading-kanjis | |
| 33 | - | reading-info | |
| 34 | - | reading-readings | |
| 35 | - | ||
| 36 | - | make-sense | |
| 37 | - | sense? | |
| 38 | - | sense-references | |
| 39 | - | sense-limits | |
| 40 | - | sense-infos | |
| 41 | - | sense-sources | |
| 42 | - | sense-tags | |
| 43 | - | sense-glosses | |
| 44 | - | sense-language | |
| 45 | - | ||
| 46 | - | make-source | |
| 47 | - | source? | |
| 48 | - | source-content | |
| 49 | - | source-wasei? | |
| 50 | - | source-type | |
| 51 | - | source-lang | |
| 52 | - | ||
| 53 | - | update-result | |
| 54 | - | update-reading | |
| 55 | - | update-sense | |
| 56 | - | update-source)) | |
| 57 | - | ||
| 58 | - | (define-record-type result | |
| 59 | - | (make-result position points kanjis readings senses) | |
| 60 | - | result? | |
| 61 | - | (position result-position result-position-set!) ; integer | |
| 62 | - | (points result-points) ; integer | |
| 63 | - | (kanjis result-kanjis) ; string-list | |
| 64 | - | (readings result-readings) ; reanding-list | |
| 65 | - | (senses result-senses)) ; sense-list | |
| 66 | - | ||
| 67 | - | (define-record-type reading | |
| 68 | - | (make-reading kanjis info readings) | |
| 69 | - | reading? | |
| 70 | - | (kanjis reading-kanjis) ; string-list | |
| 71 | - | (info reading-info) ; string-list | |
| 72 | - | (readings reading-readings)) ; string-list | |
| 73 | - | ||
| 74 | - | (define-record-type sense | |
| 75 | - | (make-sense references limits infos sources tags glosses language) | |
| 76 | - | sense? | |
| 77 | - | (references sense-references) ; string-list | |
| 78 | - | (limits sense-limits) ; string-list | |
| 79 | - | (infos sense-infos) ; string-list | |
| 80 | - | (sources sense-sources) ; source-list | |
| 81 | - | (tags sense-tags) ; integer-list | |
| 82 | - | (glosses sense-glosses) ; string-list | |
| 83 | - | (language sense-language)) ; string | |
| 84 | - | ||
| 85 | - | (define-record-type source | |
| 86 | - | (make-source content wasei? type lang) | |
| 87 | - | source? | |
| 88 | - | (content source-content) ; string-list | |
| 89 | - | (wasei? source-wasei?) ; boolean | |
| 90 | - | (type source-type) ; string | |
| 91 | - | (lang source-lang)) ; string | |
| 92 | - | ||
| 93 | - | (define* (update-result result | |
| 94 | - | #:key (points (result-points result)) | |
| 95 | - | (kanjis (result-kanjis result)) | |
| 96 | - | (readings (result-readings result)) | |
| 97 | - | (senses (result-senses result))) | |
| 98 | - | (make-result (result-position result) points kanjis readings senses)) | |
| 99 | - | ||
| 100 | - | (define* (update-reading reading | |
| 101 | - | #:key (kanjis (reading-kanjis reading)) | |
| 102 | - | (info (reading-info reading)) | |
| 103 | - | (readings (reading-readings reading))) | |
| 104 | - | (make-reading kanjis info readings)) | |
| 105 | - | ||
| 106 | - | (define* (update-sense sense | |
| 107 | - | #:key (references (sense-references sense)) | |
| 108 | - | (limits (sense-limits sense)) | |
| 109 | - | (infos (sense-infos sense)) | |
| 110 | - | (sources (sense-sources sense)) | |
| 111 | - | (tags (sense-tags sense)) | |
| 112 | - | (glosses (sense-glosses sense)) | |
| 113 | - | (language (sense-language sense))) | |
| 114 | - | (make-sense references limits infos sources tags glosses language)) | |
| 115 | - | ||
| 116 | - | (define* (update-source source | |
| 117 | - | #:key (content (source-content source)) | |
| 118 | - | (wasei? (source-wasei? source)) | |
| 119 | - | (type (source-type source)) | |
| 120 | - | (lang (source-lang source))) | |
| 121 | - | (make-source content wasei? type lang)) |
modules/nani/result/frequency.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani result frequency) | |
| 20 | + | #:use-module (ice-9 rdelim) | |
| 21 | + | #:export (load-frequency | |
| 22 | + | frequency->score)) | |
| 23 | + | ||
| 24 | + | (define (load-frequency file) | |
| 25 | + | (call-with-input-file file | |
| 26 | + | (lambda (port) | |
| 27 | + | (let loop ((frq '()) (i 1)) | |
| 28 | + | (let* ((line (%read-line port)) | |
| 29 | + | (line (car line))) | |
| 30 | + | (if (eof-object? line) | |
| 31 | + | frq | |
| 32 | + | (let ((content (string-split line #\tab))) | |
| 33 | + | (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1))))))))) | |
| 34 | + | ||
| 35 | + | (define (frequency->score frq word) | |
| 36 | + | (let ((freq (assoc-ref frq word))) | |
| 37 | + | (cond | |
| 38 | + | ((not freq) 0) | |
| 39 | + | ((< freq 501) 32) | |
| 40 | + | ((< freq 1001) 16) | |
| 41 | + | ((< freq 2001) 8) | |
| 42 | + | ((< freq 5001) 4) | |
| 43 | + | ((< freq 10001) 2) | |
| 44 | + | ((< freq 20001) 1) | |
| 45 | + | (else 0)))) |
modules/nani/result/jibiki.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani result jibiki) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (ice-9 rdelim) | |
| 22 | + | #:use-module (nani result frequency) | |
| 23 | + | #:use-module (nani result result) | |
| 24 | + | #:use-module (srfi srfi-9) | |
| 25 | + | #:use-module (sxml ssax) | |
| 26 | + | #:export (xml->results)) | |
| 27 | + | ||
| 28 | + | (define (sxml->string lst) | |
| 29 | + | (if (string? lst) | |
| 30 | + | lst | |
| 31 | + | (let loop ((lst lst) (result "")) | |
| 32 | + | (if (null? lst) | |
| 33 | + | result | |
| 34 | + | (loop | |
| 35 | + | (cdr lst) | |
| 36 | + | (match (car lst) | |
| 37 | + | (('lang . _) result) | |
| 38 | + | (#f result) | |
| 39 | + | ((? string? s) (string-append result s)) | |
| 40 | + | ((? list? l) (loop l result)))))))) | |
| 41 | + | ||
| 42 | + | (define (gram->info gram) | |
| 43 | + | (match gram | |
| 44 | + | ("adjectif???" "adj. en ku") | |
| 45 | + | ("adjectif???" "adj. en na") | |
| 46 | + | ("adjectif?????????" "adj. en na (tari)") | |
| 47 | + | ("adjectif?????????" "adj. en na (nari)") | |
| 48 | + | ("contraction" "contr.") | |
| 49 | + | ("counter" "compteur") | |
| 50 | + | ("num??rique" "nombre") | |
| 51 | + | ("particule" "particule") | |
| 52 | + | ("postposition" "postpos.") | |
| 53 | + | ("verbe auxiliaire" "v. aux.") | |
| 54 | + | ("??? pronom" "pronom") | |
| 55 | + | ("??? adverbe" "adv.") | |
| 56 | + | ("??? onomatop??e" "onom.") | |
| 57 | + | ("?????? verbe auxiliaire" "v. aux.") | |
| 58 | + | ("?????? compteur" "compteur") | |
| 59 | + | ("??? verbe" "v.") | |
| 60 | + | ("??? verbe intransitif" "v.i.") | |
| 61 | + | ("??? verbe transitif" "v.t.") | |
| 62 | + | ("??? nom" "n.") | |
| 63 | + | ("??? adjectif" "adj.") | |
| 64 | + | ("???" "interj.") | |
| 65 | + | ("?????? interjection" "interj.") | |
| 66 | + | ("?????? expression" "expr.") | |
| 67 | + | ("?????? suffixe" "suf.") | |
| 68 | + | ("?????? suffixe conjonctif" "suf. conj.") | |
| 69 | + | ("?????? conjonction" "conj.") | |
| 70 | + | ("?????? pr??fixe" "pref.") | |
| 71 | + | ("??????" "adj. pr??nominal") | |
| 72 | + | ("?????? locution adverbiale" "loc. adv.") | |
| 73 | + | ("?????? locution conjonctive" "loc. conj.") | |
| 74 | + | ("?????? locution postpositive" "loc. post.") | |
| 75 | + | ("?????? mot compos??" "mot compos??") | |
| 76 | + | ("??" #f) | |
| 77 | + | ("" #f))) | |
| 78 | + | ||
| 79 | + | (define (info->info i) | |
| 80 | + | (match (sxml->string i) | |
| 81 | + | ("abbreviation" "abbreviation") | |
| 82 | + | ("archaism" "archa??sme") | |
| 83 | + | ("children's language" "langage des enfants") | |
| 84 | + | ("colloquialism" "familier") | |
| 85 | + | ("derogatory" "terme de m??pris") | |
| 86 | + | ("familiar language" "familier") | |
| 87 | + | ("female term or language" "langage des femmes") | |
| 88 | + | ("honorific or respectful (sonkeigo) language" "terme honorifique") | |
| 89 | + | ("humble (kenjougo) language" "terme humble") | |
| 90 | + | ("idiomatic expression" "idiom.") | |
| 91 | + | ("jocular, humorous term" "comique") | |
| 92 | + | ("male term or language" "langage des hommes") | |
| 93 | + | ("obscure term" "obscure") | |
| 94 | + | ("obsolete term" "obsol??te") | |
| 95 | + | ("onomatopoeic or mimetic word" "onom.") | |
| 96 | + | ("polite (teineigo) language" "terme poli") | |
| 97 | + | ("proverb" "proverbe") | |
| 98 | + | ("sensitive" "sensible") | |
| 99 | + | ("slang" "argot") | |
| 100 | + | ("vulgar expression or word" "vulgaire") | |
| 101 | + | ("word usually written using kana alone" "habituellement ??crit en kana") | |
| 102 | + | ("" #f) | |
| 103 | + | (() #f) | |
| 104 | + | (_ i))) | |
| 105 | + | ||
| 106 | + | (define (sxml->ref lst) | |
| 107 | + | (let loop ((lst lst) (ref '())) | |
| 108 | + | (if (null? lst) | |
| 109 | + | ref | |
| 110 | + | (loop | |
| 111 | + | (cdr lst) | |
| 112 | + | (match (car lst) | |
| 113 | + | (#f ref) | |
| 114 | + | (('jpn . jpn) (cons jpn ref))))))) | |
| 115 | + | ||
| 116 | + | (define (sxml->reading str) | |
| 117 | + | (make-reading '() '() (list str))) | |
| 118 | + | ||
| 119 | + | (define (sxml->source lst) | |
| 120 | + | (let loop ((lst lst) (source (make-source '() #f ""))) | |
| 121 | + | (if (null? lst) | |
| 122 | + | source | |
| 123 | + | (loop | |
| 124 | + | (cdr lst) | |
| 125 | + | (match (car lst) | |
| 126 | + | ((? string? s) | |
| 127 | + | (update-source source | |
| 128 | + | #:content (cons s (source-content source)))) | |
| 129 | + | (('lang . l) | |
| 130 | + | (update-source source #:lang l)) | |
| 131 | + | ((? list? l) | |
| 132 | + | (loop l source))))))) | |
| 133 | + | ||
| 134 | + | (define (sxml->meaning lst) | |
| 135 | + | (let loop ((lst lst) (meaning (make-meaning '() '() '() '() '() "fr"))) | |
| 136 | + | (if (null? lst) | |
| 137 | + | meaning | |
| 138 | + | (loop | |
| 139 | + | (cdr lst) | |
| 140 | + | (match (car lst) | |
| 141 | + | (#f meaning) | |
| 142 | + | (('n . _) meaning) | |
| 143 | + | (('info . info) (update-meaning meaning | |
| 144 | + | #:infos (cons info (meaning-infos meaning)))) | |
| 145 | + | (('content . c) (update-meaning meaning | |
| 146 | + | #:glosses (append | |
| 147 | + | (map string-downcase | |
| 148 | + | (string-split c #\,)) | |
| 149 | + | (meaning-glosses meaning)))) | |
| 150 | + | ((? source? s) | |
| 151 | + | (update-meaning meaning | |
| 152 | + | #:sources (cons s (meaning-sources meaning)))) | |
| 153 | + | ((? string? _) meaning) | |
| 154 | + | ((? list? l) (loop l meaning))))))) | |
| 155 | + | ||
| 156 | + | (define (sxml->result lst frq) | |
| 157 | + | (define result | |
| 158 | + | (let loop ((lst lst) (result (make-result 0 0 '() '() '())) (infos '()) | |
| 159 | + | (sources '())) | |
| 160 | + | (if (null? lst) | |
| 161 | + | (list result infos sources) | |
| 162 | + | (match (car lst) | |
| 163 | + | (#f (loop (cdr lst) result infos sources)) | |
| 164 | + | (('id . _) (loop (cdr lst) result infos sources)) | |
| 165 | + | (('identree . _) (loop (cdr lst) result infos sources)) | |
| 166 | + | (('kanji . k) | |
| 167 | + | (loop (cdr lst) | |
| 168 | + | (update-result result | |
| 169 | + | #:kanjis (cons k (result-kanjis result))) | |
| 170 | + | infos | |
| 171 | + | sources)) | |
| 172 | + | ((? reading? r) | |
| 173 | + | (loop | |
| 174 | + | (cdr lst) | |
| 175 | + | (update-result result | |
| 176 | + | #:readings (cons r (result-readings result))) | |
| 177 | + | infos | |
| 178 | + | sources)) | |
| 179 | + | ((? meaning? s) | |
| 180 | + | (loop | |
| 181 | + | (cdr lst) | |
| 182 | + | (update-result result | |
| 183 | + | #:meanings (cons s (result-meanings result))) | |
| 184 | + | infos | |
| 185 | + | sources)) | |
| 186 | + | ((? source? s) | |
| 187 | + | (loop (cdr lst) result infos (cons s sources))) | |
| 188 | + | (('info . info) | |
| 189 | + | (loop (cdr lst) | |
| 190 | + | result | |
| 191 | + | (cons info infos) | |
| 192 | + | sources)) | |
| 193 | + | ((? list? l) | |
| 194 | + | (match (loop l result infos sources) | |
| 195 | + | ((result infos sources) | |
| 196 | + | (loop (cdr lst) result infos sources)))))))) | |
| 197 | + | ||
| 198 | + | (match result | |
| 199 | + | ((result infos sources) | |
| 200 | + | (if (or (and (null? (result-readings result)) | |
| 201 | + | (null? (result-kanjis result))) | |
| 202 | + | (null? (result-meanings result))) | |
| 203 | + | #f | |
| 204 | + | (let* ((word (if (null? (result-kanjis result)) | |
| 205 | + | (car (reading-readings | |
| 206 | + | (car (result-readings result)))) | |
| 207 | + | (car (result-kanjis result)))) | |
| 208 | + | (score (frequency->score frq word)) | |
| 209 | + | (meanings (result-meanings result)) | |
| 210 | + | (meanings | |
| 211 | + | (map | |
| 212 | + | (lambda (s) | |
| 213 | + | (update-meaning s | |
| 214 | + | #:infos (append infos (meaning-infos s)) | |
| 215 | + | #:sources (append sources (meaning-sources s)))) | |
| 216 | + | meanings))) | |
| 217 | + | (update-result result | |
| 218 | + | #:score score | |
| 219 | + | #:meanings meanings)))))) | |
| 220 | + | ||
| 221 | + | (define (sxml->element lst elem frq) | |
| 222 | + | (let ((elem (match elem | |
| 223 | + | ((_ . elem) elem) | |
| 224 | + | (_ elem)))) | |
| 225 | + | (match elem | |
| 226 | + | ('vedette-romaji #f) | |
| 227 | + | ('vedette-hiragana (sxml->reading (sxml->string lst))) | |
| 228 | + | ('vedette-jpn `(kanji . ,(sxml->string lst))) | |
| 229 | + | ('vedette lst) | |
| 230 | + | ('forme lst) | |
| 231 | + | ('domaine (if (null? lst) #f | |
| 232 | + | (let ((info (sxml->string lst))) | |
| 233 | + | (if (or (not info) (string-null? info)) | |
| 234 | + | #f | |
| 235 | + | `(info . ,info))))) | |
| 236 | + | ('gram (let ((info (gram->info (sxml->string lst)))) | |
| 237 | + | (if (or (not info) (string-null? info)) | |
| 238 | + | #f | |
| 239 | + | `(info . ,info)))) | |
| 240 | + | ('??tiquettes lst) | |
| 241 | + | ('??tiquettes-sens lst) | |
| 242 | + | ('texte-sens `(content . ,(sxml->string lst))) | |
| 243 | + | ('sens (sxml->meaning lst)) | |
| 244 | + | ('bloc-gram lst) | |
| 245 | + | ('s??mantique lst) | |
| 246 | + | ('article (sxml->result lst frq)) | |
| 247 | + | ('registre (let ((info (sxml->string lst))) | |
| 248 | + | (if (or (not info) (string-null? info)) | |
| 249 | + | #f | |
| 250 | + | `(info . ,info)))) | |
| 251 | + | ('litt??ralement | |
| 252 | + | (if (null? lst) #f (make-meaning | |
| 253 | + | '() '() '() '("lit") | |
| 254 | + | (list (sxml->string lst)) "fr"))) | |
| 255 | + | ('etymologie (sxml->source lst)) | |
| 256 | + | ('??tymologie (sxml->source lst)) | |
| 257 | + | ('note (let ((info (sxml->string lst))) | |
| 258 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
| 259 | + | ('info (let ((info (info->info (sxml->string lst)))) | |
| 260 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
| 261 | + | ('dialecte (let ((info (sxml->string lst))) | |
| 262 | + | (if (or (not info) (string-null? info)) #f `(info . ,info)))) | |
| 263 | + | ('en lst) | |
| 264 | + | ('eng lst) | |
| 265 | + | ('xref (sxml->ref lst)) | |
| 266 | + | ||
| 267 | + | ('vr #f) | |
| 268 | + | ('romaji #f) | |
| 269 | + | ('romajidx #f) | |
| 270 | + | ('rt #f) | |
| 271 | + | ('ruby #f) | |
| 272 | + | ('vj #f) | |
| 273 | + | ('jpn `(jpn . ,(sxml->string lst))) | |
| 274 | + | ('??tiquettes-fran??ais #f) | |
| 275 | + | ('??tiquettes-sous-sens #f) | |
| 276 | + | ('texte-sous-sens #f) | |
| 277 | + | ('sous-sens #f) | |
| 278 | + | ('sous-bloc-gram #f) | |
| 279 | + | ('fran??ais #f) | |
| 280 | + | ('exemple #f) | |
| 281 | + | ('exemples #f)))) | |
| 282 | + | ||
| 283 | + | (define (create-parser frq) | |
| 284 | + | (ssax:make-parser | |
| 285 | + | NEW-LEVEL-SEED | |
| 286 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 287 | + | attributes) | |
| 288 | + | ||
| 289 | + | FINISH-ELEMENT | |
| 290 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 291 | + | (if (equal? elem-gi 'volume) | |
| 292 | + | (filter (lambda (a) a) seed) | |
| 293 | + | (let* ((seed (reverse seed)) | |
| 294 | + | (element (sxml->element seed elem-gi frq))) | |
| 295 | + | (cons element parent-seed)))) | |
| 296 | + | ||
| 297 | + | CHAR-DATA-HANDLER | |
| 298 | + | (lambda (string1 string2 seed) | |
| 299 | + | (cons (string-append string1 string2) seed)))) | |
| 300 | + | ||
| 301 | + | (define (xml->results port frq) | |
| 302 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 303 | + | (sort-results results))) |
modules/nani/result/jmdict.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani result jmdict) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (ice-9 rdelim) | |
| 22 | + | #:use-module (nani result frequency) | |
| 23 | + | #:use-module (nani result result) | |
| 24 | + | #:use-module (sxml simple) | |
| 25 | + | #:use-module (sxml ssax) | |
| 26 | + | #:export (load-dic xml->results)) | |
| 27 | + | ||
| 28 | + | (define (load-dic file) | |
| 29 | + | (xml->sxml (call-with-input-file file read-string))) | |
| 30 | + | ||
| 31 | + | (define (sxml->reading lst) | |
| 32 | + | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
| 33 | + | (if (null? lst) | |
| 34 | + | reading | |
| 35 | + | (loop | |
| 36 | + | (match (car lst) | |
| 37 | + | (('reading r) (update-reading reading #:readings (cons r (reading-readings reading)))) | |
| 38 | + | (('info r) (update-reading reading #:info (cons r (reading-info reading)))) | |
| 39 | + | (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading)))) | |
| 40 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 41 | + | ((? string? _) reading)) | |
| 42 | + | (cdr lst))))) | |
| 43 | + | ||
| 44 | + | (define (sxml->source lst) | |
| 45 | + | (let loop ((source (make-source '() #f "")) (lst lst)) | |
| 46 | + | (if (null? lst) | |
| 47 | + | source | |
| 48 | + | (loop | |
| 49 | + | (match (car lst) | |
| 50 | + | (('content c) (update-source source #:content (cons c (source-content source)))) | |
| 51 | + | (('ls_wasei _) (update-source source #:wasei? #t)) | |
| 52 | + | (('ls_type t) source); type is always "part" or nothing | |
| 53 | + | ((('xml . 'lang) l) (update-source source #:lang l)) | |
| 54 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 55 | + | ((? list? l) (loop source l)) | |
| 56 | + | ((? string? c) (update-source source #:content | |
| 57 | + | (cons c (source-content source))))) | |
| 58 | + | (cdr lst))))) | |
| 59 | + | ||
| 60 | + | (define (sxml->meaning lst) | |
| 61 | + | (let loop ((meaning (make-meaning '() '() '() '() '() "eng")) (lst lst)) | |
| 62 | + | (if (null? lst) | |
| 63 | + | meaning | |
| 64 | + | (loop | |
| 65 | + | (match (car lst) | |
| 66 | + | (('ref (? string? r)) (update-meaning meaning #:references (cons r (meaning-references meaning)))) | |
| 67 | + | (('limit (? string? r)) (update-meaning meaning #:limits (cons r (meaning-limits meaning)))) | |
| 68 | + | (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
| 69 | + | ((? source? s) (update-meaning meaning #:sources (cons s (meaning-sources meaning)))) | |
| 70 | + | (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
| 71 | + | (('gloss (? string? r)) (update-meaning meaning | |
| 72 | + | #:glosses | |
| 73 | + | (cons (string-downcase r) | |
| 74 | + | (meaning-glosses meaning)))) | |
| 75 | + | ((('xml . 'lang) (? string? l)) (update-meaning meaning #:language l)) | |
| 76 | + | (((? symbol? s) v) (throw 'unknown-content s v)) | |
| 77 | + | ((? list? l) (loop meaning l)) | |
| 78 | + | ((? string? _) meaning)) | |
| 79 | + | (cdr lst))))) | |
| 80 | + | ||
| 81 | + | (define (sxml->result lst frq) | |
| 82 | + | (let ((result | |
| 83 | + | (let loop ((result (make-result 0 0 '() '() '())) (lst lst)) | |
| 84 | + | (if (null? lst) | |
| 85 | + | result | |
| 86 | + | (loop | |
| 87 | + | (match (car lst) | |
| 88 | + | (('kanji kanji) | |
| 89 | + | (update-result result #:kanjis (cons kanji (result-kanjis result)))) | |
| 90 | + | ((? reading? r) | |
| 91 | + | (update-result result #:readings (cons r (result-readings result)))) | |
| 92 | + | ((? meaning? s) | |
| 93 | + | (update-result result #:meanings (cons s (result-meanings result)))) | |
| 94 | + | ((? string? _) result)) | |
| 95 | + | (cdr lst)))))) | |
| 96 | + | (let* ((word (if (null? (result-kanjis result)) | |
| 97 | + | (car (reading-readings (car (result-readings result)))) | |
| 98 | + | (car (result-kanjis result)))) | |
| 99 | + | (score (frequency->score frq word)) | |
| 100 | + | (meanings (result-meanings result))) | |
| 101 | + | (update-result result | |
| 102 | + | #:score score | |
| 103 | + | #:meanings meanings)))) | |
| 104 | + | ||
| 105 | + | (define (field->info field) | |
| 106 | + | ;; TODO: translate depending on target language | |
| 107 | + | (match field | |
| 108 | + | ("agric" "agriculture") | |
| 109 | + | ("anat" "anatomy") | |
| 110 | + | ("archeol" "archeology") | |
| 111 | + | ("archit" "architecture") | |
| 112 | + | ("art" "art, aesthetics") | |
| 113 | + | ("astron" "astronomy") | |
| 114 | + | ("audvid" "audiovisual") | |
| 115 | + | ("aviat" "aviation") | |
| 116 | + | ("baseb" "baseball") | |
| 117 | + | ("biochem" "biochemistry") | |
| 118 | + | ("biol" "biology") | |
| 119 | + | ("bot" "botany") | |
| 120 | + | ("Buddh" "Buddhism") | |
| 121 | + | ("bus" "business") | |
| 122 | + | ("chem" "chemistry") | |
| 123 | + | ("Christn" "Christianity") | |
| 124 | + | ("cloth" "clothing") | |
| 125 | + | ("comp" "computing") | |
| 126 | + | ("cryst" "crystallography") | |
| 127 | + | ("ecol" "ecology") | |
| 128 | + | ("econ" "economics") | |
| 129 | + | ("elec" "electricity, elec. eng.") | |
| 130 | + | ("electr" "electronics") | |
| 131 | + | ("embryo" "embryology") | |
| 132 | + | ("engr" "engineering") | |
| 133 | + | ("ent" "entomology") | |
| 134 | + | ("finc" "finance") | |
| 135 | + | ("fish" "fishing") | |
| 136 | + | ("food" "food, cooking") | |
| 137 | + | ("gardn" "gardening, horticulture") | |
| 138 | + | ("genet" "genetics") | |
| 139 | + | ("geogr" "geography") | |
| 140 | + | ("geol" "geology") | |
| 141 | + | ("geom" "geometry") | |
| 142 | + | ("go" "go (game)") | |
| 143 | + | ("golf" "golf") | |
| 144 | + | ("gramm" "grammar") | |
| 145 | + | ("grmyth" "Greek mythology") | |
| 146 | + | ("hanaf" "hanafuda") | |
| 147 | + | ("horse" "horse racing") | |
| 148 | + | ("law" "law") | |
| 149 | + | ("ling" "linguistics") | |
| 150 | + | ("logic" "logic") | |
| 151 | + | ("MA" "martial arts") | |
| 152 | + | ("mahj" "mahjong") | |
| 153 | + | ("math" "mathematics") | |
| 154 | + | ("mech" "mechanical engineering") | |
| 155 | + | ("med" "medicine") | |
| 156 | + | ("met" "meteorology") | |
| 157 | + | ("mil" "military") | |
| 158 | + | ("music" "music") | |
| 159 | + | ("ornith" "ornithology") | |
| 160 | + | ("paleo" "paleontology") | |
| 161 | + | ("pathol" "pathology") | |
| 162 | + | ("pharm" "pharmacy") | |
| 163 | + | ("phil" "philosophy") | |
| 164 | + | ("photo" "photography") | |
| 165 | + | ("physics" "physics") | |
| 166 | + | ("physiol" "physiology") | |
| 167 | + | ("print" "printing") | |
| 168 | + | ("psy" "psychiatry") | |
| 169 | + | ("psych" "psychology") | |
| 170 | + | ("rail" "railway") | |
| 171 | + | ("Shinto" "Shinto") | |
| 172 | + | ("shogi" "shogi") | |
| 173 | + | ("sports" "sports") | |
| 174 | + | ("stat" "statistics") | |
| 175 | + | ("sumo" "sumo") | |
| 176 | + | ("telec" "telecommunications") | |
| 177 | + | ("tradem" "trademark") | |
| 178 | + | ("vidg" "video games") | |
| 179 | + | ("zool" "zoology"))) | |
| 180 | + | ||
| 181 | + | (define (dial->info dial) | |
| 182 | + | (match dial | |
| 183 | + | ("bra" "Brazilian") | |
| 184 | + | ("hob" "Hokkaido-ben") | |
| 185 | + | ("ksb" "Kansai-ben") | |
| 186 | + | ("ktb" "Kantou-ben") | |
| 187 | + | ("kyb" "Kyoto-ben") | |
| 188 | + | ("kyu" "Kyuushuu-ben") | |
| 189 | + | ("nab" "Nagano-ben") | |
| 190 | + | ("osb" "Osaka-ben") | |
| 191 | + | ("rkb" "Ryuukyuu-ben") | |
| 192 | + | ("thb" "Touhoku-ben") | |
| 193 | + | ("tsb" "Tosa-ben") | |
| 194 | + | ("tsug" "Tsugaru-ben"))) | |
| 195 | + | ||
| 196 | + | (define (misc->info misc) | |
| 197 | + | (match misc | |
| 198 | + | ("abbr" "abbreviation") | |
| 199 | + | ("arch" "archaism") | |
| 200 | + | ("char" "character") | |
| 201 | + | ("chn" "children's language") | |
| 202 | + | ("col" "colloquialism") | |
| 203 | + | ("company" "company name") | |
| 204 | + | ("creat" "creature") | |
| 205 | + | ("dated" "dated term") | |
| 206 | + | ("dei" "deity") | |
| 207 | + | ("derog" "derogatory") | |
| 208 | + | ("doc" "document") | |
| 209 | + | ("ev" "event") | |
| 210 | + | ("fam" "familiar language") | |
| 211 | + | ("fem" "female term or language") | |
| 212 | + | ("fict" "fiction") | |
| 213 | + | ("form" "formal or literary term") | |
| 214 | + | ("given" "given name or forename, gender not specified") | |
| 215 | + | ("group" "group") | |
| 216 | + | ("hist" "historical term") | |
| 217 | + | ("hon" "honorific or respectful (sonkeigo) language") | |
| 218 | + | ("hum" "humble (kenjougo) language") | |
| 219 | + | ("id" "idiomatic expression") | |
| 220 | + | ("joc" "jocular, humorous term") | |
| 221 | + | ("leg" "legend") | |
| 222 | + | ("m-sl" "manga slang") | |
| 223 | + | ("male" "male term or language") | |
| 224 | + | ("myth" "mythology") | |
| 225 | + | ("net-sl" "Internet slang") | |
| 226 | + | ("obj" "object") | |
| 227 | + | ("obs" "obsolete term") | |
| 228 | + | ("obsc" "obscure term") | |
| 229 | + | ("on-mim" "onomatopoeic or mimetic word") | |
| 230 | + | ("organization" "organization name") | |
| 231 | + | ("oth" "other") | |
| 232 | + | ("person" "full name of a particular person") | |
| 233 | + | ("place" "place name") | |
| 234 | + | ("poet" "poetical term") | |
| 235 | + | ("pol" "polite (teineigo) language") | |
| 236 | + | ("product" "product name") | |
| 237 | + | ("proverb" "proverb") | |
| 238 | + | ("quote" "quotation") | |
| 239 | + | ("rare" "rare") | |
| 240 | + | ("relig" "religion") | |
| 241 | + | ("sens" "sensitive") | |
| 242 | + | ("serv" "service") | |
| 243 | + | ("sl" "slang") | |
| 244 | + | ("station" "railway station") | |
| 245 | + | ("surname" "family or surname") | |
| 246 | + | ("uk" "word usually written using kana alone") | |
| 247 | + | ("unclass" "unclassified name") | |
| 248 | + | ("vulg" "vulgar expression or word") | |
| 249 | + | ("work" "work of art, literature, music, etc. name") | |
| 250 | + | ("X" "rude or X-rated term") | |
| 251 | + | ("yoji" "yojijukugo"))) | |
| 252 | + | ||
| 253 | + | (define (pos->info pos) | |
| 254 | + | (match pos | |
| 255 | + | ("adj-f" "noun or verb acting prenominally") | |
| 256 | + | ("adj-i" "adjective (keiyoushi)") | |
| 257 | + | ("adj-ix" "adjective (keiyoushi) - yoi/ii class") | |
| 258 | + | ("adj-kari" "'kari' adjective (archaic)") | |
| 259 | + | ("adj-ku" "'ku' adjective (archaic)") | |
| 260 | + | ("adj-na" "adjectival nouns or quasi-adjectives (keiyodoshi)") | |
| 261 | + | ("adj-nari" "archaic/formal form of na-adjective") | |
| 262 | + | ("adj-no" "nouns which may take the genitive case particle 'no'") | |
| 263 | + | ("adj-pn" "pre-noun adjectival (rentaishi)") | |
| 264 | + | ("adj-shiku" "'shiku' adjective (archaic)") | |
| 265 | + | ("adj-t" "'taru' adjective") | |
| 266 | + | ("adv" "adverb (fukushi)") | |
| 267 | + | ("adv-to" "adverb taking the 'to' particle") | |
| 268 | + | ("aux" "auxiliary") | |
| 269 | + | ("aux-adj" "auxiliary adjective") | |
| 270 | + | ("aux-v" "auxiliary verb") | |
| 271 | + | ("conj" "conjunction") | |
| 272 | + | ("cop" "copula") | |
| 273 | + | ("ctr" "counter") | |
| 274 | + | ("exp" "expression") | |
| 275 | + | ("int" "interjection (kandoushi)") | |
| 276 | + | ("n" "noun (common) (futsuumeishi)") | |
| 277 | + | ("n-adv" "adverbial noun (fukushitekimeishi)") | |
| 278 | + | ("n-pr" "proper noun") | |
| 279 | + | ("n-pref" "noun, used as a prefix") | |
| 280 | + | ("n-suf" "noun, used as a suffix") | |
| 281 | + | ("n-t" "noun (temporal) (jisoumeishi)") | |
| 282 | + | ("num" "numeric") | |
| 283 | + | ("pn" "pronoun") | |
| 284 | + | ("pref" "prefix") | |
| 285 | + | ("prt" "particle") | |
| 286 | + | ("suf" "suffix") | |
| 287 | + | ("unc" "unclassified") | |
| 288 | + | ("v-unspec" "verb unspecified") | |
| 289 | + | ("v1" "Ichidan verb") | |
| 290 | + | ("v1-s" "Ichidan verb - kureru special class") | |
| 291 | + | ("v2a-s" "Nidan verb with 'u' ending (archaic)") | |
| 292 | + | ("v2b-k" "Nidan verb (upper class) with 'bu' ending (archaic)") | |
| 293 | + | ("v2b-s" "Nidan verb (lower class) with 'bu' ending (archaic)") | |
| 294 | + | ("v2d-k" "Nidan verb (upper class) with 'dzu' ending (archaic)") | |
| 295 | + | ("v2d-s" "Nidan verb (lower class) with 'dzu' ending (archaic)") | |
| 296 | + | ("v2g-k" "Nidan verb (upper class) with 'gu' ending (archaic)") | |
| 297 | + | ("v2g-s" "Nidan verb (lower class) with 'gu' ending (archaic)") | |
| 298 | + | ("v2h-k" "Nidan verb (upper class) with 'hu/fu' ending (archaic)") | |
| 299 | + | ("v2h-s" "Nidan verb (lower class) with 'hu/fu' ending (archaic)") | |
| 300 | + | ("v2k-k" "Nidan verb (upper class) with 'ku' ending (archaic)") | |
| 301 | + | ("v2k-s" "Nidan verb (lower class) with 'ku' ending (archaic)") | |
| 302 | + | ("v2m-k" "Nidan verb (upper class) with 'mu' ending (archaic)") | |
| 303 | + | ("v2m-s" "Nidan verb (lower class) with 'mu' ending (archaic)") | |
| 304 | + | ("v2n-s" "Nidan verb (lower class) with 'nu' ending (archaic)") | |
| 305 | + | ("v2r-k" "Nidan verb (upper class) with 'ru' ending (archaic)") | |
| 306 | + | ("v2r-s" "Nidan verb (lower class) with 'ru' ending (archaic)") | |
| 307 | + | ("v2s-s" "Nidan verb (lower class) with 'su' ending (archaic)") | |
| 308 | + | ("v2t-k" "Nidan verb (upper class) with 'tsu' ending (archaic)") | |
| 309 | + | ("v2t-s" "Nidan verb (lower class) with 'tsu' ending (archaic)") | |
| 310 | + | ("v2w-s" "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)") | |
| 311 | + | ("v2y-k" "Nidan verb (upper class) with 'yu' ending (archaic)") | |
| 312 | + | ("v2y-s" "Nidan verb (lower class) with 'yu' ending (archaic)") | |
| 313 | + | ("v2z-s" "Nidan verb (lower class) with 'zu' ending (archaic)") | |
| 314 | + | ("v4b" "Yodan verb with 'bu' ending (archaic)") | |
| 315 | + | ("v4g" "Yodan verb with 'gu' ending (archaic)") | |
| 316 | + | ("v4h" "Yodan verb with 'hu/fu' ending (archaic)") | |
| 317 | + | ("v4k" "Yodan verb with 'ku' ending (archaic)") | |
| 318 | + | ("v4m" "Yodan verb with 'mu' ending (archaic)") | |
| 319 | + | ("v4n" "Yodan verb with 'nu' ending (archaic)") | |
| 320 | + | ("v4r" "Yodan verb with 'ru' ending (archaic)") | |
| 321 | + | ("v4s" "Yodan verb with 'su' ending (archaic)") | |
| 322 | + | ("v4t" "Yodan verb with 'tsu' ending (archaic)") | |
| 323 | + | ("v5aru" "Godan verb - -aru special class") | |
| 324 | + | ("v5b" "Godan verb with 'bu' ending") | |
| 325 | + | ("v5g" "Godan verb with 'gu' ending") | |
| 326 | + | ("v5k" "Godan verb with 'ku' ending") | |
| 327 | + | ("v5k-s" "Godan verb - Iku/Yuku special class") | |
| 328 | + | ("v5m" "Godan verb with 'mu' ending") | |
| 329 | + | ("v5n" "Godan verb with 'nu' ending") | |
| 330 | + | ("v5r" "Godan verb with 'ru' ending") | |
| 331 | + | ("v5r-i" "Godan verb with 'ru' ending (irregular verb)") | |
| 332 | + | ("v5s" "Godan verb with 'su' ending") | |
| 333 | + | ("v5t" "Godan verb with 'tsu' ending") | |
| 334 | + | ("v5u" "Godan verb with 'u' ending") | |
| 335 | + | ("v5u-s" "Godan verb with 'u' ending (special class)") | |
| 336 | + | ("v5uru" "Godan verb - Uru old class verb (old form of Eru)") | |
| 337 | + | ("vi" "intransitive verb") | |
| 338 | + | ("vk" "Kuru verb - special class") | |
| 339 | + | ("vn" "irregular nu verb") | |
| 340 | + | ("vr" "irregular ru verb, plain form ends with -ri") | |
| 341 | + | ("vs" "noun or participle which takes the aux. verb suru") | |
| 342 | + | ("vs-c" "su verb - precursor to the modern suru") | |
| 343 | + | ("vs-i" "suru verb - included") | |
| 344 | + | ("vs-s" "suru verb - special class") | |
| 345 | + | ("vt" "transitive verb") | |
| 346 | + | ("vz" "Ichidan verb - zuru verb (alternative form of -jiru verbs)"))) | |
| 347 | + | ||
| 348 | + | (define (re_inf->info i) | |
| 349 | + | (match i | |
| 350 | + | ("gikun" "gikun (meaning as reading) or jukujikun (special kanji reading)") | |
| 351 | + | ("ik" "word containing irregular kana usage") | |
| 352 | + | ("ok" "out-dated or obsolete kana usage") | |
| 353 | + | ("uK" "word usually written using kanji alone"))) | |
| 354 | + | ||
| 355 | + | (define (sxml->element lst elem frq) | |
| 356 | + | (match elem | |
| 357 | + | ('ent_seq "") | |
| 358 | + | ('ke_pri "") | |
| 359 | + | ('re_nokanji "") | |
| 360 | + | ('re_pri "") | |
| 361 | + | ('ke_inf "") | |
| 362 | + | ('misc (if (and (= (length lst) 1) (string? (car lst))) | |
| 363 | + | (misc->info (car lst)) | |
| 364 | + | (throw 'invalid-misc lst))) | |
| 365 | + | ('re_restr (if (and (= (length lst) 1) (string? (car lst))) | |
| 366 | + | `(limit ,(car lst)) | |
| 367 | + | (throw 'invalid-re_restr lst))) | |
| 368 | + | ('keb (if (and (= (length lst) 1) (string? (car lst))) | |
| 369 | + | `(kanji ,(car lst)) | |
| 370 | + | (throw 'invalid-keb lst))) | |
| 371 | + | ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst)) | |
| 372 | + | `(info ,(apply string-append lst)) | |
| 373 | + | (throw 'invalid-s_inf lst))) | |
| 374 | + | ('dial (if (and (= (length lst) 1) (string? (car lst))) | |
| 375 | + | (dial->info (car lst)) | |
| 376 | + | (throw 'invalid-dial lst))) | |
| 377 | + | ('re_inf (if (and (= (length lst) 1) (string? (car lst))) | |
| 378 | + | (re_inf->info (car lst)) | |
| 379 | + | (throw 'invalid-re_inf lst))) | |
| 380 | + | ('stagk (if (and (= (length lst) 1) (string? (car lst))) | |
| 381 | + | `(limit ,(car lst)) | |
| 382 | + | (throw 'invalid-stagk lst))) | |
| 383 | + | ('stagr (if (and (= (length lst) 1) (string? (car lst))) | |
| 384 | + | `(limit ,(car lst)) | |
| 385 | + | (throw 'invalid-stagr lst))) | |
| 386 | + | ('field (if (and (= (length lst) 1) (string? (car lst))) | |
| 387 | + | (field->info (car lst)) | |
| 388 | + | (throw 'invalid-field lst))) | |
| 389 | + | ('ant (if (and (= (length lst) 1) (string? (car lst))) | |
| 390 | + | `(ref ,(car lst)) | |
| 391 | + | (throw 'invalid-ant lst))) | |
| 392 | + | ('reb (if (and (= (length lst) 1) (string? (car lst))) | |
| 393 | + | `(reading ,(car lst)) | |
| 394 | + | (throw 'invalid-reb lst))) | |
| 395 | + | ('r_ele (sxml->reading lst)) | |
| 396 | + | ('k_ele (car (filter list? lst))) | |
| 397 | + | ('pos (if (and (= (length lst) 1) (string? (car lst))) | |
| 398 | + | (pos->info (car lst)) | |
| 399 | + | (throw 'invalid-pos lst))) | |
| 400 | + | ('xref (if (and (= (length lst) 1) (string? (car lst))) | |
| 401 | + | `(ref ,(car lst)) | |
| 402 | + | (throw 'invalid-xref lst))) | |
| 403 | + | ('gloss (cons | |
| 404 | + | `(gloss ,(apply string-append (filter string? lst))) | |
| 405 | + | (filter list? lst))) | |
| 406 | + | ('lsource (sxml->source lst)) | |
| 407 | + | ('sense (sxml->meaning lst)) | |
| 408 | + | ('entry (sxml->result lst frq)))) | |
| 409 | + | ||
| 410 | + | (define (create-parser frq) | |
| 411 | + | (ssax:make-parser | |
| 412 | + | NEW-LEVEL-SEED | |
| 413 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 414 | + | (map | |
| 415 | + | (match-lambda | |
| 416 | + | ((k . v) (list k v))) | |
| 417 | + | (filter | |
| 418 | + | (match-lambda | |
| 419 | + | ((k . v) (not (member k '(g_type))))) | |
| 420 | + | attributes))) | |
| 421 | + | ||
| 422 | + | FINISH-ELEMENT | |
| 423 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 424 | + | (if (equal? elem-gi 'JMdict) | |
| 425 | + | seed | |
| 426 | + | (let* ((seed (reverse seed)) | |
| 427 | + | (element (sxml->element seed elem-gi frq))) | |
| 428 | + | (cons element parent-seed)))) | |
| 429 | + | ||
| 430 | + | CHAR-DATA-HANDLER | |
| 431 | + | (lambda (string1 string2 seed) | |
| 432 | + | (cons (string-append string1 string2) seed)))) | |
| 433 | + | ||
| 434 | + | (define (xml->results port frq) | |
| 435 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 436 | + | (sort-results results))) |
modules/nani/result/result.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani result result) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (ice-9 match) | |
| 22 | + | #:use-module (rnrs bytevectors) | |
| 23 | + | #:use-module (srfi srfi-9) | |
| 24 | + | #:use-module (nani encoding serialize) | |
| 25 | + | #:use-module (nani encoding trie) | |
| 26 | + | #:use-module (nani encoding huffman) | |
| 27 | + | #:export (make-result | |
| 28 | + | result? | |
| 29 | + | result-position | |
| 30 | + | result-position-set! | |
| 31 | + | result-score | |
| 32 | + | result-kanjis | |
| 33 | + | result-readings | |
| 34 | + | result-meanings | |
| 35 | + | ||
| 36 | + | make-reading | |
| 37 | + | reading? | |
| 38 | + | reading-kanjis | |
| 39 | + | reading-info | |
| 40 | + | reading-readings | |
| 41 | + | ||
| 42 | + | make-meaning | |
| 43 | + | meaning? | |
| 44 | + | meaning-references | |
| 45 | + | meaning-limits | |
| 46 | + | meaning-sources | |
| 47 | + | meaning-infos | |
| 48 | + | meaning-glosses | |
| 49 | + | meaning-language | |
| 50 | + | ||
| 51 | + | make-source | |
| 52 | + | source? | |
| 53 | + | source-content | |
| 54 | + | source-wasei? | |
| 55 | + | source-lang | |
| 56 | + | ||
| 57 | + | update-result | |
| 58 | + | update-reading | |
| 59 | + | update-meaning | |
| 60 | + | update-source | |
| 61 | + | ||
| 62 | + | serialize-result result-size | |
| 63 | + | serialize-reading reading-size | |
| 64 | + | serialize-meaning meaning-size | |
| 65 | + | serialize-source source-size | |
| 66 | + | ||
| 67 | + | serialize-dictionary | |
| 68 | + | dictionary-entry-count | |
| 69 | + | sort-results)) | |
| 70 | + | ||
| 71 | + | (define-record-type result | |
| 72 | + | (make-result position score kanjis readings meanings) | |
| 73 | + | result? | |
| 74 | + | (position result-position result-position-set!) ; integer | |
| 75 | + | (score result-score) ; integer | |
| 76 | + | (kanjis result-kanjis) ; string-list | |
| 77 | + | (readings result-readings) ; reanding-list | |
| 78 | + | (meanings result-meanings)) ; meaning-list | |
| 79 | + | ||
| 80 | + | (define-record-type reading | |
| 81 | + | (make-reading kanjis info readings) | |
| 82 | + | reading? | |
| 83 | + | (kanjis reading-kanjis) ; string-list | |
| 84 | + | (info reading-info) ; string-list | |
| 85 | + | (readings reading-readings)) ; string-list | |
| 86 | + | ||
| 87 | + | (define-record-type meaning | |
| 88 | + | (make-meaning references limits sources infos glosses language) | |
| 89 | + | meaning? | |
| 90 | + | (references meaning-references) ; string-list | |
| 91 | + | (limits meaning-limits) ; string-list | |
| 92 | + | (sources meaning-sources) ; source-list | |
| 93 | + | (infos meaning-infos) ; string-list | |
| 94 | + | (glosses meaning-glosses) ; string-list | |
| 95 | + | (language meaning-language)) ; string | |
| 96 | + | ||
| 97 | + | (define-record-type source | |
| 98 | + | (make-source content wasei? lang) | |
| 99 | + | source? | |
| 100 | + | (content source-content) ; string-list | |
| 101 | + | (wasei? source-wasei?) ; boolean | |
| 102 | + | (lang source-lang)) ; string | |
| 103 | + | ||
| 104 | + | (define* (update-result result | |
| 105 | + | #:key (score (result-score result)) | |
| 106 | + | (kanjis (result-kanjis result)) | |
| 107 | + | (readings (result-readings result)) | |
| 108 | + | (meanings (result-meanings result))) | |
| 109 | + | (make-result (result-position result) score kanjis readings meanings)) | |
| 110 | + | ||
| 111 | + | (define* (update-reading reading | |
| 112 | + | #:key (kanjis (reading-kanjis reading)) | |
| 113 | + | (info (reading-info reading)) | |
| 114 | + | (readings (reading-readings reading))) | |
| 115 | + | (make-reading kanjis info readings)) | |
| 116 | + | ||
| 117 | + | (define* (update-meaning meaning | |
| 118 | + | #:key (references (meaning-references meaning)) | |
| 119 | + | (limits (meaning-limits meaning)) | |
| 120 | + | (sources (meaning-sources meaning)) | |
| 121 | + | (infos (meaning-infos meaning)) | |
| 122 | + | (glosses (meaning-glosses meaning)) | |
| 123 | + | (language (meaning-language meaning))) | |
| 124 | + | (make-meaning references limits sources infos glosses language)) | |
| 125 | + | ||
| 126 | + | (define* (update-source source | |
| 127 | + | #:key (content (source-content source)) | |
| 128 | + | (wasei? (source-wasei? source)) | |
| 129 | + | (lang (source-lang source))) | |
| 130 | + | (make-source content wasei? lang)) | |
| 131 | + | ||
| 132 | + | ;; Note how sources are not compressed, that's because they contain more weird characters | |
| 133 | + | ;; and represent only a very small fraction of the content, even after compression. | |
| 134 | + | (define (serialize-source source pos bv) | |
| 135 | + | (when (not (source? source)) (throw 'not-source source)) | |
| 136 | + | (let* ((pos ((serialize-list serialize-string) (source-content source) pos bv)) | |
| 137 | + | (pos (serialize-boolean (source-wasei? source) pos bv)) | |
| 138 | + | (pos (serialize-string (source-lang source) pos bv))) | |
| 139 | + | pos)) | |
| 140 | + | (define (source-size source) | |
| 141 | + | (when (not (source? source)) (throw 'not-source source)) | |
| 142 | + | (+ ((list-size string-size) (source-content source)) | |
| 143 | + | (boolean-size (source-wasei? source)) | |
| 144 | + | (string-size (source-lang source)))) | |
| 145 | + | ||
| 146 | + | (define (serialize-reading reading-huffman-code) | |
| 147 | + | (lambda (reading pos bv) | |
| 148 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 149 | + | (let* ((pos ((serialize-list serialize-string) (reading-kanjis reading) pos bv)) | |
| 150 | + | (pos ((serialize-list serialize-string) (reading-info reading) pos bv)) | |
| 151 | + | (pos ((serialize-list (serialize-huffman-string reading-huffman-code)) | |
| 152 | + | (reading-readings reading) pos bv))) | |
| 153 | + | pos))) | |
| 154 | + | (define (reading-size reading-huffman-code) | |
| 155 | + | (lambda (reading) | |
| 156 | + | (when (not (reading? reading)) (throw 'not-reading reading)) | |
| 157 | + | (+ ((list-size string-size) (reading-kanjis reading)) | |
| 158 | + | ((list-size string-size) (reading-info reading)) | |
| 159 | + | ((list-size (huffman-string-size reading-huffman-code)) (reading-readings reading))))) | |
| 160 | + | ||
| 161 | + | (define (serialize-meaning meaning-huffman-code) | |
| 162 | + | (lambda (meaning pos bv) | |
| 163 | + | (when (not (meaning? meaning)) (throw 'not-meaning meaning)) | |
| 164 | + | (let* ((pos ((serialize-list serialize-string) (meaning-references meaning) pos bv)) | |
| 165 | + | (pos ((serialize-list serialize-string) (meaning-limits meaning) pos bv)) | |
| 166 | + | (pos ((serialize-list serialize-source) (meaning-sources meaning) pos bv)) | |
| 167 | + | (pos ((serialize-list (serialize-huffman-string meaning-huffman-code)) | |
| 168 | + | (meaning-infos meaning) pos bv)) | |
| 169 | + | (pos ((serialize-list (serialize-huffman-string meaning-huffman-code)) | |
| 170 | + | (meaning-glosses meaning) pos bv)) | |
| 171 | + | (pos (serialize-string (meaning-language meaning) pos bv))) | |
| 172 | + | pos))) | |
| 173 | + | (define (meaning-size meaning-huffman-code) | |
| 174 | + | (lambda (meaning) | |
| 175 | + | (when (not (meaning? meaning)) (throw 'not-meaning meaning)) | |
| 176 | + | (+ ((list-size string-size) (meaning-references meaning)) | |
| 177 | + | ((list-size string-size) (meaning-limits meaning)) | |
| 178 | + | ((list-size source-size) (meaning-sources meaning)) | |
| 179 | + | ((list-size (huffman-string-size meaning-huffman-code)) | |
| 180 | + | (meaning-infos meaning)) | |
| 181 | + | ((list-size (huffman-string-size meaning-huffman-code)) | |
| 182 | + | (meaning-glosses meaning)) | |
| 183 | + | (string-size (meaning-language meaning))))) | |
| 184 | + | ||
| 185 | + | (define (serialize-result kanji-huffman-code reading-huffman-code meaning-huffman-code) | |
| 186 | + | (lambda (result pos bv) | |
| 187 | + | (when (not (result? result)) (throw 'not-result result)) | |
| 188 | + | (result-position-set! result pos) | |
| 189 | + | (let* ((pos ((serialize-list (serialize-huffman-string kanji-huffman-code)) | |
| 190 | + | (result-kanjis result) pos bv)) | |
| 191 | + | (pos ((serialize-list (serialize-reading reading-huffman-code)) | |
| 192 | + | (result-readings result) pos bv)) | |
| 193 | + | (pos ((serialize-list (serialize-meaning meaning-huffman-code)) | |
| 194 | + | (result-meanings result) pos bv)) | |
| 195 | + | (pos (serialize-char (result-score result) pos bv))) | |
| 196 | + | pos))) | |
| 197 | + | (define (result-size kanji-huffman-code reading-huffman-code meaning-huffman-code) | |
| 198 | + | (lambda (result) | |
| 199 | + | (when (not (result? result)) (throw 'not-result result)) | |
| 200 | + | (+ ((list-size (huffman-string-size kanji-huffman-code)) (result-kanjis result)) | |
| 201 | + | ((list-size (reading-size reading-huffman-code)) (result-readings result)) | |
| 202 | + | ((list-size (meaning-size meaning-huffman-code)) (result-meanings result))))) | |
| 203 | + | ||
| 204 | + | ;; creating tries | |
| 205 | + | (define (make-key key) | |
| 206 | + | (apply append | |
| 207 | + | (map | |
| 208 | + | (lambda (c) | |
| 209 | + | (list (quotient c 16) (modulo c 16))) | |
| 210 | + | (bytevector->u8-list (string->utf8 key))))) | |
| 211 | + | ||
| 212 | + | (define (make-kanji-trie results) | |
| 213 | + | (let ((trie (make-empty-trie))) | |
| 214 | + | (let loop ((results results) (i 0)) | |
| 215 | + | (if (null? results) | |
| 216 | + | (compress-trie trie) | |
| 217 | + | (begin | |
| 218 | + | (for-each | |
| 219 | + | (lambda (key) | |
| 220 | + | (add-to-trie! trie (make-key key) i)) | |
| 221 | + | (result-kanjis (car results))) | |
| 222 | + | (loop (cdr results) (+ i 1))))))) | |
| 223 | + | ||
| 224 | + | (define (make-reading-trie results) | |
| 225 | + | (let ((trie (make-empty-trie))) | |
| 226 | + | (let loop ((results results) (i 0)) | |
| 227 | + | (if (null? results) | |
| 228 | + | (compress-trie trie) | |
| 229 | + | (begin | |
| 230 | + | (for-each | |
| 231 | + | (lambda (reading) | |
| 232 | + | (for-each | |
| 233 | + | (lambda (key) | |
| 234 | + | (add-to-trie! trie (make-key key) i)) | |
| 235 | + | (reading-readings reading))) | |
| 236 | + | (result-readings (car results))) | |
| 237 | + | (loop (cdr results) (+ i 1))))))) | |
| 238 | + | ||
| 239 | + | (define (make-meaning-trie results) | |
| 240 | + | (let ((trie (make-empty-trie))) | |
| 241 | + | (let loop ((results results) (i 0)) | |
| 242 | + | (if (null? results) | |
| 243 | + | (compress-trie trie) | |
| 244 | + | (begin | |
| 245 | + | (for-each | |
| 246 | + | (lambda (meaning) | |
| 247 | + | (for-each | |
| 248 | + | (lambda (key) | |
| 249 | + | (add-to-trie! trie (make-key key) i)) | |
| 250 | + | (meaning-glosses meaning))) | |
| 251 | + | (result-meanings (car results))) | |
| 252 | + | (loop (cdr results) (+ i 1))))))) | |
| 253 | + | ||
| 254 | + | (define (update-trie-pos! trie results) | |
| 255 | + | (let* ((vals (trie-vals trie)) | |
| 256 | + | (vals (map (lambda (i) (result-position (array-ref results i))) vals))) | |
| 257 | + | (trie-vals-set! trie vals)) | |
| 258 | + | (for-each | |
| 259 | + | (match-lambda | |
| 260 | + | ((char . child) | |
| 261 | + | (update-trie-pos! child results))) | |
| 262 | + | (trie-transitions trie))) | |
| 263 | + | ||
| 264 | + | (define (serialize-dictionary results) | |
| 265 | + | (define kanji-huffman | |
| 266 | + | (let ((kanjis (apply append (map result-kanjis results)))) | |
| 267 | + | (create-huffman kanjis))) | |
| 268 | + | (define kanji-huffman-code (huffman->code kanji-huffman)) | |
| 269 | + | (define reading-huffman | |
| 270 | + | (let* ((readings (apply append (map result-readings results))) | |
| 271 | + | (readings (apply append (map reading-readings readings)))) | |
| 272 | + | (create-huffman readings))) | |
| 273 | + | (define reading-huffman-code (huffman->code reading-huffman)) | |
| 274 | + | (define meaning-huffman | |
| 275 | + | (let* ((meanings (apply append (map result-meanings results))) | |
| 276 | + | (infos (apply append (map meaning-infos meanings))) | |
| 277 | + | (glosses (apply append (map meaning-glosses meanings)))) | |
| 278 | + | (create-huffman (append infos glosses)))) | |
| 279 | + | (define meaning-huffman-code (huffman->code meaning-huffman)) | |
| 280 | + | ||
| 281 | + | (define (trie-node-size trie) | |
| 282 | + | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
| 283 | + | ||
| 284 | + | (let* ((header (string->utf8 "NANI_JMDICT003")) | |
| 285 | + | (header-size (bytevector-length header)) | |
| 286 | + | (pointers (make-bytevector 16 0)) | |
| 287 | + | (kanji-huffman-bv (serialize-huffman kanji-huffman)) | |
| 288 | + | (kanji-huffman-size (bytevector-length kanji-huffman-bv)) | |
| 289 | + | (reading-huffman-bv (serialize-huffman reading-huffman)) | |
| 290 | + | (reading-huffman-size (bytevector-length reading-huffman-bv)) | |
| 291 | + | (meaning-huffman-bv (serialize-huffman meaning-huffman)) | |
| 292 | + | (meaning-huffman-size (bytevector-length meaning-huffman-bv)) | |
| 293 | + | (serialize-trie (serialize-trie serialize-int int-size)) | |
| 294 | + | (trie-size (trie-size int-size)) | |
| 295 | + | (kanji-trie (make-kanji-trie results)) | |
| 296 | + | (kanji-trie-size (trie-size kanji-trie)) | |
| 297 | + | (reading-trie (make-reading-trie results)) | |
| 298 | + | (reading-trie-size (trie-size reading-trie)) | |
| 299 | + | (meaning-trie (make-meaning-trie results)) | |
| 300 | + | (meaning-trie-size (trie-size meaning-trie)) | |
| 301 | + | (trie-sizes (+ kanji-trie-size reading-trie-size meaning-trie-size)) | |
| 302 | + | (results-size | |
| 303 | + | ((list-size (result-size kanji-huffman-code reading-huffman-code | |
| 304 | + | meaning-huffman-code) | |
| 305 | + | #:size? #f) | |
| 306 | + | results)) | |
| 307 | + | (huffman-size (+ reading-huffman-size meaning-huffman-size kanji-huffman-size)) | |
| 308 | + | (pos-kanji (+ header-size 16 kanji-huffman-size reading-huffman-size | |
| 309 | + | meaning-huffman-size results-size 4)) | |
| 310 | + | (bv (make-bytevector (+ header-size 16 kanji-huffman-size | |
| 311 | + | reading-huffman-size | |
| 312 | + | meaning-huffman-size | |
| 313 | + | results-size 4 trie-sizes)))) | |
| 314 | + | (format #t "Number of nodes in kanjis: ~a~%" | |
| 315 | + | (trie-node-size kanji-trie)) | |
| 316 | + | (format #t "Number of nodes in readings: ~a~%" | |
| 317 | + | (trie-node-size reading-trie)) | |
| 318 | + | (format #t "Number of nodes in meanings: ~a~%" | |
| 319 | + | (trie-node-size meaning-trie)) | |
| 320 | + | ((serialize-list (serialize-result kanji-huffman-code reading-huffman-code | |
| 321 | + | meaning-huffman-code) | |
| 322 | + | #:size? #f) | |
| 323 | + | results (+ header-size 16 huffman-size) bv) | |
| 324 | + | ;; Serializing results also updated result-pos for each of them | |
| 325 | + | (let ((results (list->array 1 results))) | |
| 326 | + | (update-trie-pos! kanji-trie results) | |
| 327 | + | (update-trie-pos! reading-trie results) | |
| 328 | + | (update-trie-pos! meaning-trie results)) | |
| 329 | + | ;; number of entries | |
| 330 | + | (serialize-int (length results) (+ header-size 16 huffman-size results-size) | |
| 331 | + | bv) | |
| 332 | + | (let* ((results (list->array 1 results)) | |
| 333 | + | (pos pos-kanji) | |
| 334 | + | (pos (serialize-trie kanji-trie pos bv)) | |
| 335 | + | (pos-reading pos) | |
| 336 | + | (pos (serialize-trie reading-trie pos bv)) | |
| 337 | + | (pos-meaning pos) | |
| 338 | + | (pos (serialize-trie meaning-trie pos bv))) | |
| 339 | + | ;; Point to the trie structures | |
| 340 | + | (bytevector-u32-set! | |
| 341 | + | pointers 0 | |
| 342 | + | (+ header-size 16 huffman-size results-size (int-size 0)) | |
| 343 | + | (endianness big)) | |
| 344 | + | ;; point to the kanji trie structure | |
| 345 | + | (bytevector-u32-set! pointers 4 pos-kanji (endianness big)) | |
| 346 | + | ;; point to the reading trie structure | |
| 347 | + | (bytevector-u32-set! pointers 8 pos-reading (endianness big)) | |
| 348 | + | ;; point to the meaning trie structure | |
| 349 | + | (bytevector-u32-set! pointers 12 pos-meaning (endianness big)) | |
| 350 | + | ;; copy to result bytevector | |
| 351 | + | (bytevector-copy! header 0 bv 0 header-size) | |
| 352 | + | (bytevector-copy! pointers 0 bv header-size 16) | |
| 353 | + | (bytevector-copy! kanji-huffman-bv 0 bv (+ header-size 16) kanji-huffman-size) | |
| 354 | + | (bytevector-copy! reading-huffman-bv 0 bv | |
| 355 | + | (+ header-size 16 kanji-huffman-size) | |
| 356 | + | reading-huffman-size) | |
| 357 | + | (bytevector-copy! meaning-huffman-bv 0 bv | |
| 358 | + | (+ header-size 16 kanji-huffman-size reading-huffman-size) | |
| 359 | + | meaning-huffman-size) | |
| 360 | + | ;; give some feedback on the size of file's structures | |
| 361 | + | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
| 362 | + | (format #t "results is ~a bytes long~%" results-size) | |
| 363 | + | (format #t "kanji trie is ~a bytes long~%" kanji-trie-size) | |
| 364 | + | (format #t "reading trie is ~a bytes long~%" reading-trie-size) | |
| 365 | + | (format #t "meaning trie is ~a bytes long~%" meaning-trie-size) | |
| 366 | + | bv))) | |
| 367 | + | ||
| 368 | + | (define (dictionary-entry-count file) | |
| 369 | + | (call-with-input-file file | |
| 370 | + | (lambda (port) | |
| 371 | + | (let* ((header (utf8->string (get-bytevector-n port 14))) | |
| 372 | + | (pointers (get-bytevector-n port 16)) | |
| 373 | + | (end-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
| 374 | + | (seek port (- end-pos 4) SEEK_SET) | |
| 375 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) | |
| 376 | + | ||
| 377 | + | (define (sort-results results) | |
| 378 | + | (define (get-string res) | |
| 379 | + | (if (null? (result-kanjis res)) | |
| 380 | + | (car (reading-readings (car (result-readings res)))) | |
| 381 | + | (car (result-kanjis res)))) | |
| 382 | + | (sort | |
| 383 | + | results | |
| 384 | + | (lambda (a b) | |
| 385 | + | (cond | |
| 386 | + | ((> (result-score a) (result-score b)) #t) | |
| 387 | + | ((= (result-score a) (result-score b)) | |
| 388 | + | (string>? (get-string a) (get-string b))))))) |
modules/nani/result/wadoku.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani result wadoku) | |
| 20 | + | #:use-module (ice-9 match) | |
| 21 | + | #:use-module (ice-9 rdelim) | |
| 22 | + | #:use-module (nani result frequency) | |
| 23 | + | #:use-module (nani result result) | |
| 24 | + | #:use-module (srfi srfi-9) | |
| 25 | + | #:use-module (sxml ssax) | |
| 26 | + | #:export (xml->results)) | |
| 27 | + | ||
| 28 | + | (define (usg->infos lst) | |
| 29 | + | (let loop ((infos '()) (lst lst)) | |
| 30 | + | (if (null? lst) | |
| 31 | + | infos | |
| 32 | + | (loop | |
| 33 | + | (match (car lst) | |
| 34 | + | (('reg . reg) (cons reg infos)) | |
| 35 | + | (('type . type) | |
| 36 | + | (if (equal? type "abrev") | |
| 37 | + | (cons type infos) | |
| 38 | + | infos)) | |
| 39 | + | ((? string? info) (cons info infos)) | |
| 40 | + | (_ infos)) | |
| 41 | + | (cdr lst))))) | |
| 42 | + | ||
| 43 | + | (define (ruigo->ref lst) | |
| 44 | + | (let loop ((ref #f) (lst lst)) | |
| 45 | + | (match lst | |
| 46 | + | (() ref) | |
| 47 | + | ((('id id) lst ...) | |
| 48 | + | (loop id lst)) | |
| 49 | + | (((a . b) lst ...) | |
| 50 | + | (loop (loop ref (list (list a b))) lst)) | |
| 51 | + | (((? list? l) lst ...) | |
| 52 | + | (loop (loop ref l) lst))))) | |
| 53 | + | ||
| 54 | + | (define (merge-meanings s1 s2) | |
| 55 | + | (update-meaning s2 | |
| 56 | + | #:references (append (meaning-references s1) (meaning-references s2)) | |
| 57 | + | #:infos (append (meaning-infos s1) (meaning-infos s2)) | |
| 58 | + | #:glosses (append (meaning-glosses s1) (meaning-glosses s2)))) | |
| 59 | + | ||
| 60 | + | (define (sxml->meaning lst) | |
| 61 | + | (let loop ((meaning (make-meaning '() '() '() '() '() "ger")) (lst lst)) | |
| 62 | + | (if (null? lst) | |
| 63 | + | meaning | |
| 64 | + | (loop | |
| 65 | + | (match (car lst) | |
| 66 | + | (#f meaning) | |
| 67 | + | ((? meaning? s) | |
| 68 | + | (merge-meanings s meaning)) | |
| 69 | + | ((? source? s) | |
| 70 | + | (update-meaning meaning #:sources (cons s (meaning-sources meaning)))) | |
| 71 | + | (('ref (? string? r)) | |
| 72 | + | (update-meaning meaning #:references (cons r (meaning-references meaning)))) | |
| 73 | + | (('info (? string? r)) | |
| 74 | + | (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
| 75 | + | (('infos (? string? r)) | |
| 76 | + | (update-meaning meaning #:infos (cons r (meaning-infos meaning)))) | |
| 77 | + | (('infos (? list? r)) | |
| 78 | + | (update-meaning meaning #:infos (append r (meaning-infos meaning)))) | |
| 79 | + | (('trans (? string? r)) | |
| 80 | + | (update-meaning meaning #:glosses (cons (string-downcase r) | |
| 81 | + | (meaning-glosses meaning)))) | |
| 82 | + | (('related . _) meaning) | |
| 83 | + | (('transcr . _) meaning) | |
| 84 | + | (('pitch . _) meaning) | |
| 85 | + | (((? symbol? s) v) | |
| 86 | + | (throw 'unknown-symbol s v)) | |
| 87 | + | ((? list? l) (loop meaning l)) | |
| 88 | + | ((? string? _) meaning)) | |
| 89 | + | (cdr lst))))) | |
| 90 | + | ||
| 91 | + | ;; TODO | |
| 92 | + | (define (sxml->ref lst) | |
| 93 | + | (define (sxml->ref-name lst) | |
| 94 | + | (let loop ((result #f) (lst lst)) | |
| 95 | + | (if (null? lst) | |
| 96 | + | result | |
| 97 | + | (loop | |
| 98 | + | (match (car lst) | |
| 99 | + | (('id . id) id) | |
| 100 | + | (((? symbol? s) . _) result) | |
| 101 | + | (((? symbol? s) _) result) | |
| 102 | + | ((? list? l) (loop result l)) | |
| 103 | + | (_ result)) | |
| 104 | + | (cdr lst))))) | |
| 105 | + | (let ((ref (sxml->ref-name lst))) | |
| 106 | + | (if (string? ref) | |
| 107 | + | `(ref ,ref) | |
| 108 | + | (throw 'no-ref ref 'from lst)))) | |
| 109 | + | ||
| 110 | + | (define (sxml->source lst) | |
| 111 | + | (let loop ((source (make-source '() #f "")) (lst lst)) | |
| 112 | + | (if (null? lst) | |
| 113 | + | source | |
| 114 | + | (loop | |
| 115 | + | (match (car lst) | |
| 116 | + | (('impli impli) (update-source source #:lang impli)) | |
| 117 | + | (('foreign foreign) | |
| 118 | + | (update-source source | |
| 119 | + | #:content (if (list? foreign) | |
| 120 | + | (append foreign (source-content source)) | |
| 121 | + | (cons foreign (source-content source))))) | |
| 122 | + | ((? list? l) (loop source l)) | |
| 123 | + | (_ source)) | |
| 124 | + | (cdr lst))))) | |
| 125 | + | ||
| 126 | + | (define (sxml->reading lst) | |
| 127 | + | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
| 128 | + | (if (null? lst) | |
| 129 | + | reading | |
| 130 | + | (loop | |
| 131 | + | (match (car lst) | |
| 132 | + | (('reading r) | |
| 133 | + | (update-reading reading #:readings (cons r (reading-readings reading)))) | |
| 134 | + | (_ reading)) | |
| 135 | + | (cdr lst))))) | |
| 136 | + | ||
| 137 | + | (define (sxml->result sxml frq) | |
| 138 | + | (define (sxml->result-aux sxml) | |
| 139 | + | (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml)) | |
| 140 | + | (if (null? lst) | |
| 141 | + | result | |
| 142 | + | (match (car lst) | |
| 143 | + | (('kanji kanji) | |
| 144 | + | (loop | |
| 145 | + | (update-result result #:kanjis (cons kanji (result-kanjis result))) | |
| 146 | + | last-source (cdr lst))) | |
| 147 | + | ((? reading? reading) | |
| 148 | + | (loop | |
| 149 | + | (update-result result #:readings (cons reading (result-readings result))) | |
| 150 | + | last-source (cdr lst))) | |
| 151 | + | ((? meaning? meaning) | |
| 152 | + | (loop | |
| 153 | + | (update-result result | |
| 154 | + | #:meanings | |
| 155 | + | (cons | |
| 156 | + | (if last-source | |
| 157 | + | (update-meaning meaning | |
| 158 | + | #:sources (cons last-source (meaning-sources meaning))) | |
| 159 | + | meaning) | |
| 160 | + | (result-meanings result))) | |
| 161 | + | last-source (cdr lst))) | |
| 162 | + | ((? list? l) | |
| 163 | + | (loop (loop result last-source l) last-source (cdr lst))) | |
| 164 | + | (_ (loop result last-source (cdr lst))))))) | |
| 165 | + | (let* ((result (sxml->result-aux sxml)) | |
| 166 | + | (word (if (null? (result-kanjis result)) | |
| 167 | + | (car (reading-readings (car (result-readings result)))) | |
| 168 | + | (car (result-kanjis result)))) | |
| 169 | + | (score (frequency->score frq word)) | |
| 170 | + | (meanings (result-meanings result))) | |
| 171 | + | (update-result result | |
| 172 | + | #:score score | |
| 173 | + | #:meanings meanings))) | |
| 174 | + | ||
| 175 | + | (define (meishi->info lst) | |
| 176 | + | (match (assoc-ref lst 'suru) | |
| 177 | + | ("both" "N., mit suru intrans. od. trans.") | |
| 178 | + | ("intrans" "N., mit suru intrans.") | |
| 179 | + | ("trans" "N., mit suru trans."))) | |
| 180 | + | ||
| 181 | + | (define (keiyoushi->info lst) | |
| 182 | + | (cond | |
| 183 | + | ((equal? (assoc-ref lst 'ku) "true") | |
| 184 | + | "Adj. auf ???ku") | |
| 185 | + | ((equal? (assoc-ref lst 'shiku) "true") | |
| 186 | + | "Adj. auf -shiku"))) | |
| 187 | + | ||
| 188 | + | (define (keiyoudoushi->info lst) | |
| 189 | + | (if (equal? (assoc-ref lst 'nari) "true") | |
| 190 | + | "Na.???Adj. mit nari" | |
| 191 | + | "Na.???Adj. mit na od. no")) | |
| 192 | + | ||
| 193 | + | (define (fukushi->info lst) | |
| 194 | + | (define (get attr) | |
| 195 | + | (match (assoc-ref lst attr) | |
| 196 | + | ("false" #f) | |
| 197 | + | ("true" #t) | |
| 198 | + | (v v))) | |
| 199 | + | (string-append | |
| 200 | + | "Adv" | |
| 201 | + | (if (get 'ni) ", mit ni und Adn. mit naru" "") | |
| 202 | + | (if (get 'to) | |
| 203 | + | (if (get 'naru) | |
| 204 | + | ", mit to und Adn. mit taru" | |
| 205 | + | ", mit to") | |
| 206 | + | "") | |
| 207 | + | (match (get 'suru) | |
| 208 | + | ("intrans" ", mit suru intrans. V.") | |
| 209 | + | ("trans" ", mit suru trans. V.") | |
| 210 | + | ("both" ", mit suru trans. od. intrans. V.") | |
| 211 | + | (_ "")))) | |
| 212 | + | ||
| 213 | + | (define (doushi->info lst) | |
| 214 | + | (define transitivity-str | |
| 215 | + | (match (assoc-ref lst 'transitivity) | |
| 216 | + | ("intrans" "intrans.") | |
| 217 | + | ("trans" "trans.") | |
| 218 | + | ("both" "trans. od. intrans."))) | |
| 219 | + | ||
| 220 | + | (define onbin (assoc-ref lst 'onbin)) | |
| 221 | + | ||
| 222 | + | (match (assoc-ref lst 'level) | |
| 223 | + | ("kuru" (string-append "unregelm. " transitivity-str " V. auf ka")) | |
| 224 | + | ("ra" (string-append transitivity-str " V. auf -ra")) | |
| 225 | + | ("suru" (string-append transitivity-str " V. auf -suru")) | |
| 226 | + | ("1e" (string-append "1???st. " transitivity-str " V. auf -e")) | |
| 227 | + | ("1i" (string-append "1???st. " transitivity-str " V. auf -i")) | |
| 228 | + | ("2e" (string-append "2???st. " transitivity-str " V. auf -e bzw. -u")) | |
| 229 | + | ("2i" (string-append "2???st. " transitivity-str " V. auf -i bzw. -u")) | |
| 230 | + | ("4" (string-append "4???st. " transitivity-str " V.")) | |
| 231 | + | ("5" (string-append "5-st. " transitivity-str " V." | |
| 232 | + | (match (assoc-ref lst 'godanrow) | |
| 233 | + | ("ba" (if onbin | |
| 234 | + | " auf -ba mit regelm. Nasal-Onbin = ???nde" | |
| 235 | + | " auf -ba")) | |
| 236 | + | ("ga" (if onbin | |
| 237 | + | " auf -ga mit regelm. i-Onbin = ???ide" | |
| 238 | + | " auf -ga")) | |
| 239 | + | ("ka_i_yu" (if onbin | |
| 240 | + | " auf -ka mit Geminaten-Onbin = ???tte" | |
| 241 | + | " auf -ka")) | |
| 242 | + | ("ka" (if onbin | |
| 243 | + | " auf -ka mit i-Onbin = ???ite" | |
| 244 | + | " auf -ka")) | |
| 245 | + | ("ma" (if onbin | |
| 246 | + | " auf -ma regelm. Nasal-Onbin = ???nde" | |
| 247 | + | " auf -ma")) | |
| 248 | + | ("na" (if onbin | |
| 249 | + | " auf -na mit regelm. Nasal-Onbin = ???nde" | |
| 250 | + | " auf -na")) | |
| 251 | + | ("ra_i" (if onbin | |
| 252 | + | " auf -ra, Sonderform mit Reny????kei ???i" | |
| 253 | + | " auf -ra")) | |
| 254 | + | ("ra" (if onbin | |
| 255 | + | " auf -ra mit regelm. Geminaten-Onbin = ???tte" | |
| 256 | + | " auf -ra")) | |
| 257 | + | ("sa" " auf -sa") | |
| 258 | + | ("ta" (if onbin | |
| 259 | + | " auf -ta mit regelm. Geminaten-Onbin = ???tte" | |
| 260 | + | " auf -ta")) | |
| 261 | + | ("wa" (if onbin | |
| 262 | + | " auf -[w]a mit Geminaten-Onbin = ???tte" | |
| 263 | + | " auf -[w]a")) | |
| 264 | + | ("wa_o" (if onbin | |
| 265 | + | " auf -[w]a mit u-Onbin = ?????/??te" | |
| 266 | + | " auf -[w]a")) | |
| 267 | + | (#f "")))))) | |
| 268 | + | ||
| 269 | + | (define (gram->info lst) | |
| 270 | + | (map | |
| 271 | + | (lambda (gram) | |
| 272 | + | (match gram | |
| 273 | + | (('daimeishi) "Pron.") | |
| 274 | + | (('doushi doushi ...) (doushi->info doushi)) | |
| 275 | + | (('fukujoshi) "adv. Part.") | |
| 276 | + | (('fukushi) "Adv.") | |
| 277 | + | (('fukushi fukushi ...) (fukushi->info fukushi)) | |
| 278 | + | (('jodoushi) "Hilfsv.") | |
| 279 | + | (('joshi) "Part.") | |
| 280 | + | (('kakarijoshi) "Themenpart.") | |
| 281 | + | (('kakujoshi) "Kasuspart.") | |
| 282 | + | (('kandoushi) "Interj.") | |
| 283 | + | (('kanji) "Kanji") | |
| 284 | + | (('keiyoudoushi) "Na.???Adj.") | |
| 285 | + | (('keiyoudoushi keiyoudoushi ...) (keiyoudoushi->info keiyoudoushi)) | |
| 286 | + | (('keiyoushi) "Adj.") | |
| 287 | + | (('keiyoushi keiyoushi ...) (keiyoushi->info keiyoushi)) | |
| 288 | + | (('meishi) "N.") | |
| 289 | + | (('meishi meishi ...) (meishi->info meishi)) | |
| 290 | + | (('prefix) "Pr??f.") | |
| 291 | + | (('rengo) "Zus.") | |
| 292 | + | (('rentaishi) "Adn") | |
| 293 | + | (('setsuzokujoshi) "konjunktionale Part.") | |
| 294 | + | (('setsuzokushi) "Konj.") | |
| 295 | + | (('shuujoshi) "satzbeendende Part.") | |
| 296 | + | (('specialcharacter) "Sonderzeichen") | |
| 297 | + | (('suffix) "Suff.") | |
| 298 | + | (('wordcomponent) "Wortkomp."))) | |
| 299 | + | lst)) | |
| 300 | + | ||
| 301 | + | (define (sxml->string lst) | |
| 302 | + | (define (sub-loop loop infos result lst l) | |
| 303 | + | (let ((result (loop infos result l))) | |
| 304 | + | (if (list? result) | |
| 305 | + | (loop (append infos (filter list? result)) | |
| 306 | + | (apply string-append (filter string? result)) | |
| 307 | + | lst) | |
| 308 | + | (loop infos result lst)))) | |
| 309 | + | ||
| 310 | + | (let loop ((infos '()) (result "") (lst lst)) | |
| 311 | + | (match lst | |
| 312 | + | (() (if (null? infos) | |
| 313 | + | result | |
| 314 | + | (append infos (list result)))) | |
| 315 | + | ((? string? s) | |
| 316 | + | (loop infos (string-append result s) '())) | |
| 317 | + | (((? string? s) lst ...) | |
| 318 | + | (loop infos (string-append result s) lst)) | |
| 319 | + | ((('prior . _) lst ...) | |
| 320 | + | (loop infos result lst)) | |
| 321 | + | ((('options . _) lst ...) | |
| 322 | + | (loop infos result lst)) | |
| 323 | + | ((('firstname . _) lst ...) | |
| 324 | + | (loop infos result lst)) | |
| 325 | + | ((('ausn . _) lst ...) | |
| 326 | + | (loop infos result lst)) | |
| 327 | + | ((('lang . _) lst ...) | |
| 328 | + | (loop infos result lst)) | |
| 329 | + | ((('meta . _) lst ...) | |
| 330 | + | (loop infos result lst)) | |
| 331 | + | ((('genki . genki) lst ...) | |
| 332 | + | ;(loop (cons `(info ,genki) infos) result lst)) | |
| 333 | + | (loop infos result lst)) | |
| 334 | + | ((('jlpt . jlpt) lst ...) | |
| 335 | + | (loop (cons `(info ,(string-append "jlpt-" jlpt)) infos) result lst)) | |
| 336 | + | ((('ref . (? string? ref)) lst ...) | |
| 337 | + | (loop (cons `(ref ,ref) infos) result lst)) | |
| 338 | + | ((('ref (? string? ref)) lst ...) | |
| 339 | + | (loop (cons `(ref ,ref) infos) result lst)) | |
| 340 | + | ((('jap l) lst ...) | |
| 341 | + | (sub-loop loop infos result lst l)) | |
| 342 | + | ((('foreign l) lst ...) | |
| 343 | + | (sub-loop loop infos result lst l)) | |
| 344 | + | ((('transcr l) lst ...) | |
| 345 | + | (sub-loop loop infos result lst l)) | |
| 346 | + | ((((? symbol? s) v) lst ...) | |
| 347 | + | (throw 'unsupported-symbol s v)) | |
| 348 | + | ((((? symbol? s) . v) lst ...) | |
| 349 | + | (throw 'unsupported-symbol-pair s v)) | |
| 350 | + | (((? list? l) lst ...) | |
| 351 | + | (sub-loop loop infos result lst l))))) | |
| 352 | + | ||
| 353 | + | (define (sxml->element lst elem frq) | |
| 354 | + | (let ((elem (match elem | |
| 355 | + | ((_ . elem) elem) | |
| 356 | + | (_ elem)))) | |
| 357 | + | (match elem | |
| 358 | + | ('orth (let ((kanji (filter string? lst))) | |
| 359 | + | (if (null? kanji) | |
| 360 | + | #f | |
| 361 | + | `(kanji | |
| 362 | + | ,(string-filter | |
| 363 | + | (lambda (c) | |
| 364 | + | ;; Remove characters that interfere with kanji | |
| 365 | + | ;; based search | |
| 366 | + | (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\??? | |
| 367 | + | #\?? #\???)))) | |
| 368 | + | (car kanji)))))) | |
| 369 | + | ('count #f) | |
| 370 | + | ('entry (sxml->result lst frq)) | |
| 371 | + | ('hira `(reading ,(car lst))) | |
| 372 | + | ('hatsuon `(hatsuon ,(car lst))) | |
| 373 | + | ('accent `(pitch ,(car lst))) | |
| 374 | + | ('reading (sxml->reading lst)) | |
| 375 | + | ('form lst) | |
| 376 | + | ('impli `(impli ,(car lst))) | |
| 377 | + | ('text (let loop ((text "") (lst lst)) | |
| 378 | + | (match lst | |
| 379 | + | (() text) | |
| 380 | + | ((('hasPrecedingSpace . _) lst ...) | |
| 381 | + | (loop (string-append " " text) lst)) | |
| 382 | + | ((('hasFollowingSpace . _) lst ...) | |
| 383 | + | (string-append (loop text lst) " ")) | |
| 384 | + | (((? string? s) lst ...) | |
| 385 | + | (loop (string-append text s) lst))))) | |
| 386 | + | ('famn (sxml->string lst)) | |
| 387 | + | ('expl (sxml->string lst)) | |
| 388 | + | ('expli (filter list? lst)) | |
| 389 | + | ('abbrev (filter list? lst)) | |
| 390 | + | ('token (filter string? lst)) | |
| 391 | + | ('tr (sxml->string lst)) | |
| 392 | + | ('transcr `(transcr ,lst)) | |
| 393 | + | ('trans | |
| 394 | + | (append (filter list? lst) | |
| 395 | + | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
| 396 | + | ('jap `(jap ,lst)) | |
| 397 | + | ('emph lst) | |
| 398 | + | ('title (filter string? lst)) | |
| 399 | + | ('transl (filter string? lst)) | |
| 400 | + | ('topic (filter string? lst)) | |
| 401 | + | ('iron (filter string? lst)) | |
| 402 | + | ('specchar (filter string? lst)) | |
| 403 | + | ('scientif (filter string? lst)) | |
| 404 | + | ('wikide #f) | |
| 405 | + | ('wikija #f) | |
| 406 | + | ('link #f) | |
| 407 | + | ('ref (sxml->ref lst)) | |
| 408 | + | ('sref (sxml->ref lst)) | |
| 409 | + | ('etym (sxml->source lst)) | |
| 410 | + | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
| 411 | + | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 412 | + | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 413 | + | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 414 | + | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 415 | + | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
| 416 | + | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
| 417 | + | ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type)))) | |
| 418 | + | ('usg `(infos . ,(usg->infos lst))) | |
| 419 | + | ('sense (sxml->meaning lst)) | |
| 420 | + | ('steinhaus (let ((ref (sxml->string lst))) | |
| 421 | + | (if (string? ref) | |
| 422 | + | `(ref ,(sxml->string lst)) | |
| 423 | + | (throw 'not-steinhaus ref)))) | |
| 424 | + | ('pos '()); TODO: actually find what info to use | |
| 425 | + | ('wordcomponent (cons 'wordcomponent lst)) | |
| 426 | + | ('meishi (cons 'meishi lst)) | |
| 427 | + | ('setsuzokushi (cons 'setsuzokushi lst)) | |
| 428 | + | ('daimeishi (cons 'daimeishi lst)) | |
| 429 | + | ('doushi (cons 'doushi lst)) | |
| 430 | + | ('kandoushi (cons 'kandoushi lst)) | |
| 431 | + | ('keiyoudoushi (cons 'keiyoudoushi lst)) | |
| 432 | + | ('keiyoushi (cons 'keiyoushi lst)) | |
| 433 | + | ('fukushi (cons 'fukushi lst)) | |
| 434 | + | ('rengo (cons 'rengo lst)) | |
| 435 | + | ('suffix (cons 'suffix lst)) | |
| 436 | + | ('prefix (cons 'prefix lst)) | |
| 437 | + | ('kanji (cons 'kanji lst)) | |
| 438 | + | ('rentaishi (cons 'rentaishi lst)) | |
| 439 | + | ('specialcharacter (cons 'specialcharacter lst)) | |
| 440 | + | ('joshi (cons 'joshi lst)) | |
| 441 | + | ('fukujoshi (cons 'fukujoshi lst)) | |
| 442 | + | ('kakujoshi (cons 'kakujoshi lst)) | |
| 443 | + | ('kakarijoshi (cons 'kakarijoshi lst)) | |
| 444 | + | ('shuujoshi (cons 'shuujoshi lst)) | |
| 445 | + | ('setsuzokujoshi (cons 'setsuzokujoshi lst)) | |
| 446 | + | ('jokeiyoushi (cons 'jokeiyoushi lst)) | |
| 447 | + | ('jodoushi (cons 'jodoushi lst)) | |
| 448 | + | ('ruigos lst) | |
| 449 | + | ('ruigo (ruigo->ref lst)) | |
| 450 | + | ('gramGrp (gram->info lst))))) | |
| 451 | + | ||
| 452 | + | (define (create-parser frq) | |
| 453 | + | (ssax:make-parser | |
| 454 | + | NEW-LEVEL-SEED | |
| 455 | + | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 456 | + | attributes) | |
| 457 | + | ||
| 458 | + | FINISH-ELEMENT | |
| 459 | + | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 460 | + | (if (equal? elem-gi 'entries) | |
| 461 | + | seed | |
| 462 | + | (let* ((seed (reverse seed)) | |
| 463 | + | (element (sxml->element seed elem-gi frq))) | |
| 464 | + | (cons element parent-seed)))) | |
| 465 | + | ||
| 466 | + | CHAR-DATA-HANDLER | |
| 467 | + | (lambda (string1 string2 seed) | |
| 468 | + | (cons (string-append string1 string2) seed)))) | |
| 469 | + | ||
| 470 | + | (define (xml->results port frq) | |
| 471 | + | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 472 | + | (sort-results results))) |
modules/nani/serialize.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani serialize) | |
| 20 | - | #:use-module (rnrs bytevectors) | |
| 21 | - | #:export (merge-bvs | |
| 22 | - | serialize-list list-size | |
| 23 | - | serialize-char char-size | |
| 24 | - | serialize-int int-size | |
| 25 | - | serialize-boolean boolean-size | |
| 26 | - | serialize-string string-size)) | |
| 27 | - | ||
| 28 | - | (define (merge-bvs bvs) | |
| 29 | - | (let* ((size (apply + (map bytevector-length bvs))) | |
| 30 | - | (bv (make-bytevector size 0))) | |
| 31 | - | (let loop ((bvs bvs) (pos 0)) | |
| 32 | - | (unless (null? bvs) | |
| 33 | - | (let ((sz (bytevector-length (car bvs)))) | |
| 34 | - | (bytevector-copy! (car bvs) 0 bv pos sz) | |
| 35 | - | (loop (cdr bvs) (+ pos sz))))) | |
| 36 | - | bv)) | |
| 37 | - | ||
| 38 | - | (define* (serialize-list lst serialize pos bv #:key (size? #t)) | |
| 39 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 40 | - | (when size? | |
| 41 | - | (bytevector-u16-set! bv pos (length lst) (endianness big))) | |
| 42 | - | (let loop ((lst lst) (pos (+ pos (if size? 2 0)))) | |
| 43 | - | (if (null? lst) | |
| 44 | - | pos | |
| 45 | - | (loop (cdr lst) (serialize (car lst) pos bv))))) | |
| 46 | - | (define* (list-size lst size #:key (size? #t)) | |
| 47 | - | (when (not (list? lst)) (throw 'not-list lst)) | |
| 48 | - | (apply + (if size? 2 0) (map size lst))) | |
| 49 | - | ||
| 50 | - | (define (serialize-char int pos bv) | |
| 51 | - | (bytevector-u8-set! bv pos int) | |
| 52 | - | (+ pos 1)) | |
| 53 | - | (define char-size (const 1)) | |
| 54 | - | ||
| 55 | - | (define (serialize-int int pos bv) | |
| 56 | - | (bytevector-u32-set! bv pos int (endianness big)) | |
| 57 | - | (+ pos 4)) | |
| 58 | - | (define int-size (const 4)) | |
| 59 | - | ||
| 60 | - | (define (serialize-boolean bool pos bv) | |
| 61 | - | (bytevector-u8-set! bv pos (if bool 1 0)) | |
| 62 | - | (+ pos 1)) | |
| 63 | - | (define boolean-size (const 1)) | |
| 64 | - | ||
| 65 | - | (define (serialize-string str pos bv) | |
| 66 | - | (let ((sbv (string->utf8 str))) | |
| 67 | - | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
| 68 | - | (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0) | |
| 69 | - | (+ pos 1 (bytevector-length sbv)))) | |
| 70 | - | (define (string-size str) | |
| 71 | - | (let ((sbv (string->utf8 str))) | |
| 72 | - | (+ 1 (bytevector-length sbv)))) |
modules/nani/tags.scm unknown status 2
| 1 | - | (define-module (nani tags) | |
| 2 | - | #:use-module (nani result) | |
| 3 | - | #:export (get-tag get-points)) | |
| 4 | - | ||
| 5 | - | (define tags | |
| 6 | - | '("MA" "X" "abbr" "adj-i" "adj-ix" "adj-na" "adj-no" "adj-pn" "adj-t" "adj-f" | |
| 7 | - | "adv" "adv-to" "arch" "ateji" "aux" "aux-v" "aux-adj" "Buddh" "chem" "chn" | |
| 8 | - | "col" "comp" "conj" "cop" "ctr" "derog" "eK" "ek" "exp" "fam" "fem" "food" | |
| 9 | - | "geom" "gikun" "hon" "hum" "iK" "id" "ik" "int" "io" "iv" "ling" "m-sl" | |
| 10 | - | "male" "male-sl" "math" "meteor" "mil" "n" "n-adv" "n-suf" "n-pref" "n-t" | |
| 11 | - | "num" "oK" "obs" "obsc" "ok" "oik" "on-mim" "pn" "poet" "pol" "pref" "proverb" | |
| 12 | - | "prt" "physics" "quote" "rare" "sens" "sl" "suf" "uK" "uk" "unc" "yoji" | |
| 13 | - | "v1" "v1-s" "v2a-s" "v4h" "v4r" "v5aru" "v5b" "v5g" "v5k" "v5k-s" "v5m" | |
| 14 | - | "v5n" "v5r" "v5r-i" "v5s" "v5t" "v5u" "v5u-s" "v5uru" "vz" "vi" "vk" "vn" | |
| 15 | - | "vr" "vs" "vs-c" "vs-s" "vs-i" "kyb" "osb" "ksb" "ktb" "tsb" "thb" "tsug" | |
| 16 | - | "kyu" "rkb" "nab" "hob" "vt" "vulg" "adj-kari" "adj-ku" "adj-shiku" | |
| 17 | - | "adj-nari" "n-pr" "v-unspec" "v4k" "v4g" "v4s" "v4t" "v4n" "v4b" "v4m" | |
| 18 | - | "v2k-k" "v2g-k" "v2t-k" "v2d-k" "v2h-k" "v2b-k" "v2m-k" "v2y-k" "v2r-k" | |
| 19 | - | "v2k-s" "v2g-s" "v2s-s" "v2z-s" "v2t-s" "v2d-s" "v2n-s" "v2h-s" "v2b-s" | |
| 20 | - | "v2m-s" "v2y-s" "v2r-s" "v2w-s" "archit" "astron" "baseb" "biol" "bot" | |
| 21 | - | "bus" "econ" "engr" "finc" "geol" "law" "mahj" "med" "music" "Shinto" | |
| 22 | - | "shogi" "sports" "sumo" "zool" "joc" "anat" "Christn" "net-sl" "dated" | |
| 23 | - | "hist" "litf" "surname" "place" "unclass" "company" "product" "work" | |
| 24 | - | "person" "given" "station" "organization" | |
| 25 | - | ||
| 26 | - | ;; wadoku special | |
| 27 | - | "young" "thief" "secret" "baby" "stud" "sail" "fashion" "archeo" | |
| 28 | - | "rel" "color" "psy" "print" "aero" "politics" "anth" "biblio" | |
| 29 | - | "game" "agri" "alco" "demo" "train" "philo" "ethno" "photo" "trans" | |
| 30 | - | "telecom" "build" "sci" "school" "art" "hobby" | |
| 31 | - | ||
| 32 | - | "season-spring" "season-summer" "season-winter" "season-autumn" | |
| 33 | - | "season-newyear" | |
| 34 | - | ||
| 35 | - | "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5" | |
| 36 | - | ||
| 37 | - | "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000")) | |
| 38 | - | ||
| 39 | - | (define (get-tag tag) | |
| 40 | - | (let ((val (member tag tags))) | |
| 41 | - | (if val (- (length val) 1) (throw 'unknown-tag tag)))) | |
| 42 | - | ||
| 43 | - | (define (get-points result) | |
| 44 | - | (define (tag-point tag) | |
| 45 | - | (cond | |
| 46 | - | ;; frequency | |
| 47 | - | ((eq? tag (get-tag "frq500")) | |
| 48 | - | 32) | |
| 49 | - | ((eq? tag (get-tag "frq1000")) | |
| 50 | - | 16) | |
| 51 | - | ((eq? tag (get-tag "frq2000")) | |
| 52 | - | 8) | |
| 53 | - | ((eq? tag (get-tag "frq5000")) | |
| 54 | - | 4) | |
| 55 | - | ((eq? tag (get-tag "frq10000")) | |
| 56 | - | 2) | |
| 57 | - | ((eq? tag (get-tag "frq20000")) | |
| 58 | - | 1) | |
| 59 | - | ;; archaic term | |
| 60 | - | ((eq? tag (get-tag "arch")) | |
| 61 | - | -3) | |
| 62 | - | ;; children language | |
| 63 | - | ((eq? tag (get-tag "chn")) | |
| 64 | - | -1) | |
| 65 | - | ;; colloquialism | |
| 66 | - | ((eq? tag (get-tag "col")) | |
| 67 | - | 5) | |
| 68 | - | ;; expression | |
| 69 | - | ((eq? tag (get-tag "exp")) | |
| 70 | - | 1) | |
| 71 | - | ;; sonkeigo (honorific or respectful) | |
| 72 | - | ((eq? tag (get-tag "hon")) | |
| 73 | - | -1) | |
| 74 | - | ;; kenjougo (humble) | |
| 75 | - | ((eq? tag (get-tag "hum")) | |
| 76 | - | -1) | |
| 77 | - | ;; teineigo (polite) | |
| 78 | - | ((eq? tag (get-tag "pol")) | |
| 79 | - | -1) | |
| 80 | - | ;; irregular kanji | |
| 81 | - | ((eq? tag (get-tag "iK")) | |
| 82 | - | -2) | |
| 83 | - | ;; idiomatic expression | |
| 84 | - | ((eq? tag (get-tag "id")) | |
| 85 | - | 3) | |
| 86 | - | ;; slang | |
| 87 | - | ((member tag (list | |
| 88 | - | (get-tag "sl") | |
| 89 | - | (get-tag "m-sl") | |
| 90 | - | (get-tag "male-sl"))) | |
| 91 | - | -4) | |
| 92 | - | ;; outdated kanji | |
| 93 | - | ((eq? tag (get-tag "oK")) | |
| 94 | - | -2) | |
| 95 | - | ;; obsolete term | |
| 96 | - | ((eq? tag (get-tag "obs")) | |
| 97 | - | -5) | |
| 98 | - | ;; obscure term | |
| 99 | - | ((eq? tag (get-tag "obsc")) | |
| 100 | - | -10) | |
| 101 | - | ;; onomatopea | |
| 102 | - | ((eq? tag (get-tag "on-mim")) | |
| 103 | - | 1) | |
| 104 | - | ;; poetical term | |
| 105 | - | ((eq? tag (get-tag "poet")) | |
| 106 | - | -1) | |
| 107 | - | ((eq? tag (get-tag "proverb")) | |
| 108 | - | 1) | |
| 109 | - | ((eq? tag (get-tag "rare")) | |
| 110 | - | -3) | |
| 111 | - | ;; sensitive | |
| 112 | - | ((eq? tag (get-tag "sens")) | |
| 113 | - | -2) | |
| 114 | - | ;; dialect | |
| 115 | - | ((member tag (list | |
| 116 | - | (get-tag "kyb") | |
| 117 | - | (get-tag "osb") | |
| 118 | - | (get-tag "ksb") | |
| 119 | - | (get-tag "ktb") | |
| 120 | - | (get-tag "tsb") | |
| 121 | - | (get-tag "thb") | |
| 122 | - | (get-tag "tsug") | |
| 123 | - | (get-tag "kyu") | |
| 124 | - | (get-tag "rkb") | |
| 125 | - | (get-tag "nab") | |
| 126 | - | (get-tag "hob"))) | |
| 127 | - | -1) | |
| 128 | - | (else 0))) | |
| 129 | - | (let ((tags (apply append (map sense-tags (result-senses result))))) | |
| 130 | - | (apply + (map tag-point tags)))) |
modules/nani/trie.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani trie) | |
| 20 | - | #:use-module (nani serialize) | |
| 21 | - | #:use-module (rnrs bytevectors) | |
| 22 | - | #:use-module (srfi srfi-9) | |
| 23 | - | #:export (make-trie | |
| 24 | - | trie? | |
| 25 | - | trie-position | |
| 26 | - | trie-position-set! | |
| 27 | - | trie-vals | |
| 28 | - | trie-vals-set! | |
| 29 | - | trie-transitions | |
| 30 | - | trie-transitions-set! | |
| 31 | - | ||
| 32 | - | make-empty-trie | |
| 33 | - | add-to-trie! | |
| 34 | - | compress-trie | |
| 35 | - | ||
| 36 | - | serialize-trie | |
| 37 | - | trie-size)) | |
| 38 | - | ||
| 39 | - | (define-record-type trie | |
| 40 | - | (make-trie position vals transitions) | |
| 41 | - | trie? | |
| 42 | - | (position trie-position trie-position-set!) ; integer | |
| 43 | - | (vals trie-vals trie-vals-set!) ; list | |
| 44 | - | (transitions trie-transitions trie-transitions-set!)) ; array or alist | |
| 45 | - | ||
| 46 | - | (define (make-empty-trie) | |
| 47 | - | (make-trie 0 '() (make-array #f 16))) | |
| 48 | - | ||
| 49 | - | (define (add-to-trie! trie key value) | |
| 50 | - | (if (null? key) | |
| 51 | - | (trie-vals-set! trie (cons value (trie-vals trie))) | |
| 52 | - | (let ((next-trie (array-ref (trie-transitions trie) (car key)))) | |
| 53 | - | (if next-trie | |
| 54 | - | (add-to-trie! next-trie (cdr key) value) | |
| 55 | - | (let ((next-trie (make-empty-trie))) | |
| 56 | - | (array-set! (trie-transitions trie) next-trie (car key)) | |
| 57 | - | (add-to-trie! next-trie (cdr key) value)))))) | |
| 58 | - | ||
| 59 | - | (define (convert-trie-transitions! trie) | |
| 60 | - | (define (get-new-transitions transitions) | |
| 61 | - | (let loop ((i 0) (tr '())) | |
| 62 | - | (if (= i 16) | |
| 63 | - | tr | |
| 64 | - | (let ((elem (array-ref transitions i))) | |
| 65 | - | (if elem | |
| 66 | - | (begin | |
| 67 | - | (convert-trie-transitions! elem) | |
| 68 | - | (loop (+ i 1) (cons (cons i elem) tr))) | |
| 69 | - | (loop (+ i 1) tr)))))) | |
| 70 | - | (let* ((transitions (trie-transitions trie)) | |
| 71 | - | (transitions (get-new-transitions transitions))) | |
| 72 | - | (trie-transitions-set! trie transitions))) | |
| 73 | - | ||
| 74 | - | (define (compress-trie trie) | |
| 75 | - | (define (compress-aux trie) | |
| 76 | - | (make-trie | |
| 77 | - | (trie-position trie) | |
| 78 | - | (trie-vals trie) | |
| 79 | - | (apply append | |
| 80 | - | (map | |
| 81 | - | (lambda (tr) | |
| 82 | - | (let ((trie (cdr tr))) | |
| 83 | - | (map | |
| 84 | - | (lambda (tr2) | |
| 85 | - | (cons (+ (car tr2) (* 16 (car tr))) | |
| 86 | - | (compress-aux (cdr tr2)))) | |
| 87 | - | (trie-transitions trie)))) | |
| 88 | - | (trie-transitions trie))))) | |
| 89 | - | (convert-trie-transitions! trie) | |
| 90 | - | (compress-aux trie)) | |
| 91 | - | ||
| 92 | - | (define (pointer-size ptr) | |
| 93 | - | 5) | |
| 94 | - | ||
| 95 | - | (define (serialize-pointer ptr pos bv) | |
| 96 | - | (bytevector-u8-set! bv pos (car ptr)) | |
| 97 | - | (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big)) | |
| 98 | - | (+ pos 5)) | |
| 99 | - | ||
| 100 | - | (define (serialize-trie trie serialize size results pos) | |
| 101 | - | (define (serialize-trie-aux transitions pos) | |
| 102 | - | (let loop ((pos pos) | |
| 103 | - | (trs transitions) | |
| 104 | - | (bvs '())) | |
| 105 | - | (if (null? trs) | |
| 106 | - | (cons pos bvs) | |
| 107 | - | (let* ((next-trie (cdr (car trs))) | |
| 108 | - | (bv (get-trie-bv next-trie results pos)) | |
| 109 | - | (pos (car bv)) | |
| 110 | - | (bv (cdr bv))) | |
| 111 | - | (loop pos (cdr trs) (append bvs bv)))))) | |
| 112 | - | ||
| 113 | - | (define (get-trie-bv trie results pos) | |
| 114 | - | (trie-position-set! trie pos) | |
| 115 | - | (let* ((vals-sz (size (trie-vals trie))) | |
| 116 | - | (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f)) | |
| 117 | - | (sz (+ vals-sz 1 trs-sz)) | |
| 118 | - | (bv (make-bytevector sz 0))) | |
| 119 | - | (serialize (trie-vals trie) 0 bv) | |
| 120 | - | (let* ((bvs (serialize-trie-aux | |
| 121 | - | (trie-transitions trie) | |
| 122 | - | (+ pos sz))) | |
| 123 | - | (next-pos (car bvs)) | |
| 124 | - | (bvs (cdr bvs))) | |
| 125 | - | (bytevector-u8-set! bv vals-sz (length (trie-transitions trie))) | |
| 126 | - | (serialize-list (trie-transitions trie) serialize-pointer | |
| 127 | - | (+ vals-sz 1) bv #:size? #f) | |
| 128 | - | (cons next-pos (cons bv bvs))))) | |
| 129 | - | ||
| 130 | - | (let* ((trie-bv (get-trie-bv trie results pos)) | |
| 131 | - | (new-pos (car trie-bv)) | |
| 132 | - | (trie-bv (merge-bvs (cdr trie-bv)))) | |
| 133 | - | (bytevector-copy! trie-bv 0 results pos (bytevector-length trie-bv)) | |
| 134 | - | new-pos)) | |
| 135 | - | ||
| 136 | - | (define (trie-size trie size) | |
| 137 | - | (apply + | |
| 138 | - | (size (trie-vals trie)) | |
| 139 | - | 1 | |
| 140 | - | (list-size (trie-transitions trie) pointer-size #:size? #f) | |
| 141 | - | (map (lambda (trie) (trie-size trie size)) | |
| 142 | - | (map cdr (trie-transitions trie))))) |
modules/nani/wadoku/entities.scm unknown status 2
| 1 | - | (define-module (nani wadoku entities) | |
| 2 | - | #:use-module (nani result) | |
| 3 | - | #:use-module ((nani tags) #:prefix tag:) | |
| 4 | - | #:use-module (ice-9 match) | |
| 5 | - | #:export (get-tag hint->tag) | |
| 6 | - | #:re-export (tag:get-points)) | |
| 7 | - | ||
| 8 | - | (define wadoku-entities | |
| 9 | - | `(("Aerodyn." . "aero") | |
| 10 | - | ("Agrochem." . "chem") | |
| 11 | - | ("??gypt. Gesch." . "hist") | |
| 12 | - | ("??gypt. Mythol." . "rel") | |
| 13 | - | ("Aktien" . "econ") | |
| 14 | - | ("Akupunktur" . "med") | |
| 15 | - | ("Akustik" . "music") | |
| 16 | - | ("American Football" . "sports") | |
| 17 | - | ("amerik. Gesch." . "hist") | |
| 18 | - | ("amerik. Pol." . "politics") | |
| 19 | - | ("amerik. Rechtsw." . "law") | |
| 20 | - | ("Anat." . "anat") | |
| 21 | - | ("Angeln" . "sports") | |
| 22 | - | ("Anime" . "work") | |
| 23 | - | ("Anthropol." . "anth") | |
| 24 | - | ("arab. Gesch." . "hist") | |
| 25 | - | ("Arch??ol." . "archeo") | |
| 26 | - | ("Archit." . "archit") | |
| 27 | - | ("Astrol." . "astron") | |
| 28 | - | ("Astron." . "astron") | |
| 29 | - | ("atmosph??r. Optik" . "physics") | |
| 30 | - | ("Atomphys." . "physics") | |
| 31 | - | ("Audio" . "music") | |
| 32 | - | ("Augenheilk." . "med") | |
| 33 | - | ("Autorennen" . "sports") | |
| 34 | - | ("Backw." . "food") | |
| 35 | - | ("Badminton" . "sports") | |
| 36 | - | ("Bahnhofsn." . "station") | |
| 37 | - | ("Bakteriol." . "biol") | |
| 38 | - | ("Ballet" . "music") | |
| 39 | - | ("Ballett" . "music") | |
| 40 | - | ("Ballsport" . "sports") | |
| 41 | - | ("Bankw." . "finc") | |
| 42 | - | ("Baseb." . "baseb") | |
| 43 | - | ("Baseball" . "baseb") | |
| 44 | - | ("Basketball" . "sports") | |
| 45 | - | ("Bauingenieurw." . "archit") | |
| 46 | - | ("Bautechnik" . "archit") | |
| 47 | - | ("Bauw." . "archit") | |
| 48 | - | ("Befestigungsw." . "archit") | |
| 49 | - | ("Beh??rde" . "law") | |
| 50 | - | ("Bergb." . "archit") | |
| 51 | - | ("Bergbau" . "archit") | |
| 52 | - | ("Bergn." . "place") | |
| 53 | - | ("Bergsteigen" . "sports") | |
| 54 | - | ("Bibel" . "Christn") | |
| 55 | - | ("Bibliotheksw." . "biblio") | |
| 56 | - | ("Bildhauerei" . "work") | |
| 57 | - | ("bild. Kunst" . "work") | |
| 58 | - | ("Billard" . "game") | |
| 59 | - | ("Biochem." . "biol") | |
| 60 | - | ("Biol." . "biol") | |
| 61 | - | ("Bodenkunde" . "agri") | |
| 62 | - | ("Bogenschie??en" . "sports") | |
| 63 | - | ("B??rse" . "econ") | |
| 64 | - | ("Bot." . "bot") | |
| 65 | - | ("Bowling" . "sports") | |
| 66 | - | ("Boxen" . "sports") | |
| 67 | - | ("Brauereiw." . "alco") | |
| 68 | - | ("Brettspiel" . "game") | |
| 69 | - | ("brit. Gesch." . "hist") | |
| 70 | - | ("brit. Rechtsw." . "law") | |
| 71 | - | ("Br??ckenbau" . "engr") | |
| 72 | - | ("Bsp." . "exp") | |
| 73 | - | ("Buchbinden" . "print") | |
| 74 | - | ("B??chereiw." . "biblio") | |
| 75 | - | ("Buchtitel" . "work") | |
| 76 | - | ("Buchw." . "biblio") | |
| 77 | - | ("Buddh." . "Buddh") | |
| 78 | - | ("Bud??" . "MA") | |
| 79 | - | ("B??d??" . "MA") | |
| 80 | - | ("Chem." . "chem") | |
| 81 | - | ("chin. Arch??ol." . "archeo") | |
| 82 | - | ("chin. Astron." . "astron") | |
| 83 | - | ("chin. Gesch." . "hist") | |
| 84 | - | ("chin. Gramm." . "ling") | |
| 85 | - | ("chin. Literaturw." . "work") | |
| 86 | - | ("chin. Mus." . "music") | |
| 87 | - | ("chin. Musikinstr." . "music") | |
| 88 | - | ("chin. Mythol." . "rel") | |
| 89 | - | ("chin. Philos." . "rel") | |
| 90 | - | ("chin. Phon." . "ling") | |
| 91 | - | ("chin. Poet." . "poet") | |
| 92 | - | ("chin. Pol." . "politics") | |
| 93 | - | ("chin. Theater" . "work") | |
| 94 | - | ("Chirurgie" . "med") | |
| 95 | - | ("Christent." . "Christn") | |
| 96 | - | ("Cocktail" . "alco") | |
| 97 | - | ("Computerling." . "comp") | |
| 98 | - | ("Curling" . "sports") | |
| 99 | - | ("Dammbau" . "engr") | |
| 100 | - | ("Demogr." . "demo") | |
| 101 | - | ("Diplomat." . "politics") | |
| 102 | - | ("Druckw." . "print") | |
| 103 | - | ("dtsch. Gesch." . "hist") | |
| 104 | - | ("dtsch. Gramm." . "ling") | |
| 105 | - | ("EDV" . "comp") | |
| 106 | - | ("Einzel-Kanji" . "ling") | |
| 107 | - | ("Eisenb." . "engr") | |
| 108 | - | ("Eishockey" . "sports") | |
| 109 | - | ("Eiskunstl." . "sports") | |
| 110 | - | ("Eiskunstlauf" . "sports") | |
| 111 | - | ("Eisschnelllauf" . "sports") | |
| 112 | - | ("Elektrochem." . "chem") | |
| 113 | - | ("Elektrot." . "engr") | |
| 114 | - | ("Embryol." . "biol") | |
| 115 | - | ("engl. Gesch." . "hist") | |
| 116 | - | ("Entwicklungsphysiol." . "biol") | |
| 117 | - | ("Entwicklungspsychol." . "psy") | |
| 118 | - | ("Ergonomie" . "engr") | |
| 119 | - | ("Ethik" . "philo") | |
| 120 | - | ("Ethnol." . "ethno") | |
| 121 | - | ("europ. Gesch." . "hist") | |
| 122 | - | ("Fahrrad" . "sports") | |
| 123 | - | ("Familienn." . "surname") | |
| 124 | - | ("Familienn.." . "surname") | |
| 125 | - | ("Farbe" . "color") | |
| 126 | - | ("F??rberei" . "build") | |
| 127 | - | ("Fechten" . "build") | |
| 128 | - | ("Fertigungstechnik" . "engr") | |
| 129 | - | ("Film" . "work") | |
| 130 | - | ("Filmtitel" . "work") | |
| 131 | - | ("Finanzw." . "finc") | |
| 132 | - | ("Firmenn." . "company") | |
| 133 | - | ("Fischerei" . "sports") | |
| 134 | - | ("Fischk." . "food") | |
| 135 | - | ("Flugw." . "engr") | |
| 136 | - | ("Flussn." . "place") | |
| 137 | - | ("Forstw." . "bot") | |
| 138 | - | ("Fotog." . "photo") | |
| 139 | - | ("Frachtw." . "trans") | |
| 140 | - | ("franz. Gesch." . "hist") | |
| 141 | - | ("Frisur" . "fashion") | |
| 142 | - | ("Funkt." . "physics") | |
| 143 | - | ("Fu??ball" . "sports") | |
| 144 | - | ("Gagaku" . "music") | |
| 145 | - | ("Garten" . "bot") | |
| 146 | - | ("Gartenk." . "bot") | |
| 147 | - | ("Gateball" . "sports") | |
| 148 | - | ("Gebietsn." . "place") | |
| 149 | - | ("Geldw." . "finc") | |
| 150 | - | ("Genetik" . "biol") | |
| 151 | - | ("Geogr." . "place") | |
| 152 | - | ("Geol." . "geol") | |
| 153 | - | ("Geom." . "geom") | |
| 154 | - | ("Geophys." . "physics") | |
| 155 | - | ("Gerberei" . "build") | |
| 156 | - | ("Gesch." . "hist") | |
| 157 | - | ("Gewichtheben" . "sports") | |
| 158 | - | ("Glasherst." . "build") | |
| 159 | - | ("Glaziol." . "sci") | |
| 160 | - | ("Go" . "game") | |
| 161 | - | ("Golf" . "sports") | |
| 162 | - | ("Gramm." . "ling") | |
| 163 | - | ("griech. Christent." . "Christn") | |
| 164 | - | ("griech. Gesch." . "hist") | |
| 165 | - | ("griech. Myth." . "rel") | |
| 166 | - | ("griech. Mythol." . "rel") | |
| 167 | - | ("griech. Theat." . "work") | |
| 168 | - | ("Gymnastik" . "sports") | |
| 169 | - | ("Handball" . "sports") | |
| 170 | - | ("Handel" . "econ") | |
| 171 | - | ("Handwerkskunst" . "build") | |
| 172 | - | ("Heraldik" . "sci") | |
| 173 | - | ("Hindu." . "rel") | |
| 174 | - | ("Hochsprung" . "sports") | |
| 175 | - | ("Hockey" . "sports") | |
| 176 | - | ("Holzbau" . "engr") | |
| 177 | - | ("Holzbearb." . "engr") | |
| 178 | - | ("Holzschnitt" . "engr") | |
| 179 | - | ("Hunderasse" . "zool") | |
| 180 | - | ("Hydrodynamik" . "physics") | |
| 181 | - | ("I Ging" . "rel") | |
| 182 | - | ("Ikebana" . "build") | |
| 183 | - | ("Immunol." . "med") | |
| 184 | - | ("ind. Gesch." . "hist") | |
| 185 | - | ("indian. Mythol." . "rel") | |
| 186 | - | ("ind. Mythol." . "rel") | |
| 187 | - | ("indones. Gesch." . "hist") | |
| 188 | - | ("ind. Philos." . "philo") | |
| 189 | - | ("Insektenk." . "zool") | |
| 190 | - | ("Inseln." . "place") | |
| 191 | - | ("Internet" . "comp") | |
| 192 | - | ("Islam" . "rel") | |
| 193 | - | ("islam. Mythol." . "rel") | |
| 194 | - | ("islam. Rechtsw." . "law") | |
| 195 | - | ("islam. Rel." . "rel") | |
| 196 | - | ("israel. Gesch." . "hist") | |
| 197 | - | ("ital. Gesch." . "hist") | |
| 198 | - | ("japan. Archit." . "archit") | |
| 199 | - | ("japan. Astrologie" . "aston") | |
| 200 | - | ("japan. bild. Kunst" . "work") | |
| 201 | - | ("japan. Christent." . "Christn") | |
| 202 | - | ("japan. Gesch." . "hist") | |
| 203 | - | ("japan. Gramm." . "ling") | |
| 204 | - | ("japan. Literaturw." . "work") | |
| 205 | - | ("japan. Med." . "med") | |
| 206 | - | ("japan. Mus." . "music") | |
| 207 | - | ("japan. Mythol." . "rel") | |
| 208 | - | ("japan. Philos." . "philo") | |
| 209 | - | ("japan. Pol." . "politics") | |
| 210 | - | ("japan. Schwimmen" . "sports") | |
| 211 | - | ("japan. Soziol." . "sci") | |
| 212 | - | ("japan. Sprachw." . "ling") | |
| 213 | - | ("J??ruri" . "music") | |
| 214 | - | ("Judent." . "rel") | |
| 215 | - | ("J??d??" . "MA") | |
| 216 | - | ("Kabuki" . "music") | |
| 217 | - | ("Kagura" . "music") | |
| 218 | - | ("Kalligraphie" . "build") | |
| 219 | - | ("Kanbun" . "ling") | |
| 220 | - | ("Kanji" . "ling") | |
| 221 | - | ("Kanp??" . "med") | |
| 222 | - | ("Karten" . "game") | |
| 223 | - | ("Kartenspiel" . "game") | |
| 224 | - | ("Kartogr." . "place") | |
| 225 | - | ("kath. Christent." . "Christn") | |
| 226 | - | ("Kend??" . "MA") | |
| 227 | - | ("Keramik" . "build") | |
| 228 | - | ("Kernphys." . "physics") | |
| 229 | - | ("Kfz-W." . "engr") | |
| 230 | - | ("KI" . "comp") | |
| 231 | - | ("Kleidung" . "fashion") | |
| 232 | - | ("Kochk." . "food") | |
| 233 | - | ("Konditoreiw." . "food") | |
| 234 | - | ("Konfuz." . "rel") | |
| 235 | - | ("korean. Gesch." . "hist") | |
| 236 | - | ("korean. Rel." . "rel") | |
| 237 | - | ("korean. Theater" . "work") | |
| 238 | - | ("Kosmetik" . "fashion") | |
| 239 | - | ("Kricket" . "sports") | |
| 240 | - | ("Kristallogr." . "physics") | |
| 241 | - | ("Krocket" . "sports") | |
| 242 | - | ("Kunst" . "work") | |
| 243 | - | ("Kunstgesch." . "hist") | |
| 244 | - | ("Kunst??? und Turmsprung" . "build") | |
| 245 | - | ("Ky??gen" . "music") | |
| 246 | - | ("Lack" . "work") | |
| 247 | - | ("L??ndern." . "place") | |
| 248 | - | ("Landw." . "agri") | |
| 249 | - | ("Leichtathl." . "sports") | |
| 250 | - | ("Liedtitel" . "work") | |
| 251 | - | ("Literaturw." . "work") | |
| 252 | - | ("Logik" . "math") | |
| 253 | - | ("Luftf." . "aero") | |
| 254 | - | ("Machinenb." . "engr") | |
| 255 | - | ("Mah-Jongg" . "mahj") | |
| 256 | - | ("Management" . "bus") | |
| 257 | - | ("Manga" . "work") | |
| 258 | - | ("m??nnl. Name" . "given") | |
| 259 | - | ("m??nnl. Vorn." . "given") | |
| 260 | - | ("m??nnl. Vorname" . "given") | |
| 261 | - | ("Mantik" . "rel") | |
| 262 | - | ("Marine" . "sail") | |
| 263 | - | ("Marketing" . "bus") | |
| 264 | - | ("Maschinenb." . "engr") | |
| 265 | - | ("Ma??" . "engr") | |
| 266 | - | ("Math." . "math") | |
| 267 | - | ("Mech." . "physics") | |
| 268 | - | ("Med." . "med") | |
| 269 | - | ("Meeresk." . "place") | |
| 270 | - | ("Meeresn." . "place") | |
| 271 | - | ("Meinungsforschung" . "politics") | |
| 272 | - | ("Messw." . "hobby") | |
| 273 | - | ("Metall." . "engr") | |
| 274 | - | ("Metallbearb." . "engr") | |
| 275 | - | ("Metallguss" . "engr") | |
| 276 | - | ("Meteor." . "meteor") | |
| 277 | - | ("mexikan. Kochk." . "food") | |
| 278 | - | ("Mikrobiol." . "biol") | |
| 279 | - | ("Milit." . "mil") | |
| 280 | - | ("Mineral." . "geol") | |
| 281 | - | ("M??belbau" . "build") | |
| 282 | - | ("Mode" . "fashion") | |
| 283 | - | ("Motorsport" . "sports") | |
| 284 | - | ("Mus." . "music") | |
| 285 | - | ("Muschelk." . "zool") | |
| 286 | - | ("Musikinstr." . "music") | |
| 287 | - | ("Mykol." . "bot") | |
| 288 | - | ("Mythol." . "rel") | |
| 289 | - | ("Nachrichtent." . "politics") | |
| 290 | - | ("N??hen" . "fashion") | |
| 291 | - | ("Naniwabushi" . "music") | |
| 292 | - | ("Naturmed." . "med") | |
| 293 | - | ("Naturph??n." . "physics") | |
| 294 | - | ("Naturphilos." . "philo") | |
| 295 | - | ("Neng??" . "hist") | |
| 296 | - | ("Netzwerktechnik" . "comp") | |
| 297 | - | ("Neurol." . "med") | |
| 298 | - | ("N??" . "music") | |
| 299 | - | ("nord. Mythol." . "rel") | |
| 300 | - | ("Numismatik" . "build") | |
| 301 | - | ("??kol." . "biol") | |
| 302 | - | ("Oper" . "music") | |
| 303 | - | ("Optik" . "physics") | |
| 304 | - | ("Org." . "organization") | |
| 305 | - | ("Origami" . "build") | |
| 306 | - | ("Ortsn." . "place") | |
| 307 | - | ("P??d." . "school") | |
| 308 | - | ("Pal??ontol." . "hist") | |
| 309 | - | ("Papierherst." . "build") | |
| 310 | - | ("Parapsych." . "psy") | |
| 311 | - | ("Patentw." . "engr") | |
| 312 | - | ("Pathol." . "med") | |
| 313 | - | ("pers. Gesch." . "hist") | |
| 314 | - | ("Pers??nlichk." . "person") | |
| 315 | - | ("Petrochem." . "chem") | |
| 316 | - | ("Pferderennen" . "sports") | |
| 317 | - | ("Pferdesport" . "sports") | |
| 318 | - | ("Pharm." . "med") | |
| 319 | - | ("Philos." . "philo") | |
| 320 | - | ("philos. Anthrop." . "anth") | |
| 321 | - | ("Phon." . "ling") | |
| 322 | - | ("Phys." . "physics") | |
| 323 | - | ("Physiol." . "anat") | |
| 324 | - | ("Poetik" . "poet") | |
| 325 | - | ("Pol." . "politics") | |
| 326 | - | ("Postw." . "telecom") | |
| 327 | - | ("Psych." . "psy") | |
| 328 | - | ("Quantenphys." . "physics") | |
| 329 | - | ("Radio" . "telecom") | |
| 330 | - | ("Radrennen" . "sports") | |
| 331 | - | ("Radsport" . "sports") | |
| 332 | - | ("Rakugo" . "music") | |
| 333 | - | ("Raumf." . "engr") | |
| 334 | - | ("Reaktort." . "engr") | |
| 335 | - | ("Rechnungsw." . "bus") | |
| 336 | - | ("Rechtsw." . "law") | |
| 337 | - | ("Redew." . "exp") | |
| 338 | - | ("Regelungstechnik" . "engr") | |
| 339 | - | ("Reiten" . "sports") | |
| 340 | - | ("Rel." . "rel") | |
| 341 | - | ("Rennsport" . "sports") | |
| 342 | - | ("Rhetorik" . "philo") | |
| 343 | - | ("Ringen" . "sports") | |
| 344 | - | ("r??m. Gesch." . "hist") | |
| 345 | - | ("r??m. Mythol." . "rel") | |
| 346 | - | ("Rudern" . "sports") | |
| 347 | - | ("Rugby" . "sports") | |
| 348 | - | ("Rundfunk" . "telecom") | |
| 349 | - | ("russ. Gesch." . "hist") | |
| 350 | - | ("R??stung" . "mil") | |
| 351 | - | ("Schach" . "game") | |
| 352 | - | ("Schie??sport" . "sports") | |
| 353 | - | ("Schiff" . "sail") | |
| 354 | - | ("Schiffbau" . "sail") | |
| 355 | - | ("Schifffahrt" . "sail") | |
| 356 | - | ("Schmuck" . "build") | |
| 357 | - | ("Schneckenk." . "zool") | |
| 358 | - | ("Schneiderei" . "fashion") | |
| 359 | - | ("Schuhe" . "fashion") | |
| 360 | - | ("Schule" . "school") | |
| 361 | - | ("Schwei??en" . "build") | |
| 362 | - | ("Schwert" . "mil") | |
| 363 | - | ("Schwimmen" . "sports") | |
| 364 | - | ("Seef." . "sail") | |
| 365 | - | ("Seen." . "place") | |
| 366 | - | ("Seerechtsw." . "law") | |
| 367 | - | ("Seew." . "sail") | |
| 368 | - | ("Segeln" . "sail") | |
| 369 | - | ("Seide" . "build") | |
| 370 | - | ("Seismol." . "physics") | |
| 371 | - | ("SF" . "work") | |
| 372 | - | ("Shingon" . "Buddh") | |
| 373 | - | ("Shint??" . "Shinto") | |
| 374 | - | ("Sh??gi" . "shogi") | |
| 375 | - | ("Ski" . "sports") | |
| 376 | - | ("Skispringen" . "sports") | |
| 377 | - | ("Softball" . "sports") | |
| 378 | - | ("Sozialpsych." . "psy") | |
| 379 | - | ("Soziol." . "sci") | |
| 380 | - | ("span. Gesch." . "hist") | |
| 381 | - | ("Spiel" . "game") | |
| 382 | - | ("Spiele-Titel" . "game") | |
| 383 | - | ("Spieltheorie" . "game") | |
| 384 | - | ("Spinnen" . "zool") | |
| 385 | - | ("Spinnenk." . "zool") | |
| 386 | - | ("Sport" . "sports") | |
| 387 | - | ("Sprache" . "ling") | |
| 388 | - | ("Sprachw." . "ling") | |
| 389 | - | ("Sprichw." . "ling") | |
| 390 | - | ("Stadtn." . "place") | |
| 391 | - | ("Stadtplanung" . "engr") | |
| 392 | - | ("Statistik" . "math") | |
| 393 | - | ("Steuerw." . "econ") | |
| 394 | - | ("Sticken" . "build") | |
| 395 | - | ("Stra??enbau" . "engr") | |
| 396 | - | ("Streckenkilometertafel" . "train") | |
| 397 | - | ("Stricken" . "build") | |
| 398 | - | ("Strukturalismus" . "philo") | |
| 399 | - | ("Sum??" . "sumo") | |
| 400 | - | ("Surfen" . "sports") | |
| 401 | - | ("Systemanalyse" . "engr") | |
| 402 | - | ("Systemtheorie" . "engr") | |
| 403 | - | ("taiwan. Pol." . "politics") | |
| 404 | - | ("Tanz" . "art") | |
| 405 | - | ("Tanzen" . "art") | |
| 406 | - | ("Taoismus" . "rel") | |
| 407 | - | ("Tauchen" . "sports") | |
| 408 | - | ("Technik" . "engr") | |
| 409 | - | ("Tee" . "hobby") | |
| 410 | - | ("Telegrafie" . "telecom") | |
| 411 | - | ("Telekom." . "telecom") | |
| 412 | - | ("Tempeln." . "place") | |
| 413 | - | ("Tennis" . "sports") | |
| 414 | - | ("Tenn??" . "surname") | |
| 415 | - | ("Textilt." . "engr") | |
| 416 | - | ("Theat." . "work") | |
| 417 | - | ("Theol." . "rel") | |
| 418 | - | ("tibet. Buddh." . "Buddh") | |
| 419 | - | ("Tiefb." . "engr") | |
| 420 | - | ("Tiermed." . "med") | |
| 421 | - | ("Tischtennis" . "sports") | |
| 422 | - | ("Tourismus" . "hobby") | |
| 423 | - | ("Tunnelbau" . "engr") | |
| 424 | - | ("t??rk. Gesch." . "hist") | |
| 425 | - | ("Turnen" . "sports") | |
| 426 | - | ("TV" . "telecom") | |
| 427 | - | ("TV-Prog." . "telecom") | |
| 428 | - | ("U-Bahn" . "train") | |
| 429 | - | ("Umwelt" . "biol") | |
| 430 | - | ("Univ.-N." . "place") | |
| 431 | - | ("Verhaltensbiol." . "biol") | |
| 432 | - | ("Verkehrsw." . "engr") | |
| 433 | - | ("Verlagsn." . "company") | |
| 434 | - | ("Verlagsw." . "econ") | |
| 435 | - | ("Versicherungsw." . "bus") | |
| 436 | - | ("Video" . "telecom") | |
| 437 | - | ("Videospiel" . "game") | |
| 438 | - | ("vietnam. Gesch." . "hist") | |
| 439 | - | ("Vogelk." . "zool") | |
| 440 | - | ("V??lkerk." . "ethno") | |
| 441 | - | ("V??lkerr." . "law") | |
| 442 | - | ("Volleyball" . "sports") | |
| 443 | - | ("Vorgesch." . "hist") | |
| 444 | - | ("VWL" . "econ") | |
| 445 | - | ("Waffenk." . "mil") | |
| 446 | - | ("Walfang" . "hobby") | |
| 447 | - | ("Wasserball" . "sports") | |
| 448 | - | ("Wasserbau" . "engr") | |
| 449 | - | ("Wassersport" . "sports") | |
| 450 | - | ("Weben" . "build") | |
| 451 | - | ("weibl. Name" . "given") | |
| 452 | - | ("weibl. Vorn." . "given") | |
| 453 | - | ("Weichtierk." . "zool") | |
| 454 | - | ("Wein" . "alco") | |
| 455 | - | ("Werbung" . "bus") | |
| 456 | - | ("Werktitel" . "work") | |
| 457 | - | ("Wintersport" . "sports") | |
| 458 | - | ("Wirtsch." . "econ") | |
| 459 | - | ("Wrestling" . "sports") | |
| 460 | - | ("Wz." . "exp") | |
| 461 | - | ("Yoga" . "sports") | |
| 462 | - | ("Z??hlw." . "finc") | |
| 463 | - | ("Zahnmed." . "med") | |
| 464 | - | ("Zeitschriftenn." . "work") | |
| 465 | - | ("Zeitungsn." . "work") | |
| 466 | - | ("Zeitungsw." . "print") | |
| 467 | - | ("Zellbiol." . "biol") | |
| 468 | - | ("Zen" . "rel") | |
| 469 | - | ("Zirkus" . "hobby") | |
| 470 | - | ("Zitat" . "exp") | |
| 471 | - | ("Zollw." . "bus") | |
| 472 | - | ("Zool." . "biol"))) | |
| 473 | - | ||
| 474 | - | (define (hint->tag hint) | |
| 475 | - | (match hint | |
| 476 | - | ("Kansai-Dial." "ksb") | |
| 477 | - | ("Ky??to-Dial." "kyb") | |
| 478 | - | ("??saka-Dial." "osb") | |
| 479 | - | ("Kant??-Dial." "ktb") | |
| 480 | - | ("T??hoku-Dial." "thb") | |
| 481 | - | ("Tsugaru-Dial." "tsug") | |
| 482 | - | ("Ky??sh??-Dial." "kyu") | |
| 483 | - | ("Hokkaid??-Dial." "hob") | |
| 484 | - | ("altert??ml." "arch") | |
| 485 | - | ("arch." "arch") | |
| 486 | - | ("Edo-zeitl." "arch") | |
| 487 | - | ("ehrerb.-h??fl." "hon") | |
| 488 | - | ("h??fl.-ehrerb." "hon") | |
| 489 | - | ("besch.-h??fl." "hum") | |
| 490 | - | ("abwertend" "derog") | |
| 491 | - | ("etwas altmodisch" "obs") | |
| 492 | - | ("Gaunerjargon" "thief") | |
| 493 | - | ("Frauenspr. der Edo-Zeit" "arch") | |
| 494 | - | ("Geheimspr. der Gauner" "thief") | |
| 495 | - | ("hist." "hist") | |
| 496 | - | ("h??fl." "pol") | |
| 497 | - | ("kinderspr." "chn") | |
| 498 | - | ("M??nnersprache" "male") | |
| 499 | - | ("Medizinerjargon" "med") | |
| 500 | - | ("Milit??rjargon" "mil") | |
| 501 | - | ("obsol." "obs") | |
| 502 | - | ("onomat." "on-mim") | |
| 503 | - | ("poet." "poet") | |
| 504 | - | ("Polizeijargon" "law") | |
| 505 | - | ("Slang" "sl") | |
| 506 | - | ("veraltet" "arch") | |
| 507 | - | (_ #f))) | |
| 508 | - | ||
| 509 | - | (define (wadoku->tag-name wadoku-tag) | |
| 510 | - | (let ((tag (assoc-ref wadoku-entities wadoku-tag))) | |
| 511 | - | (or tag wadoku-tag))) | |
| 512 | - | ||
| 513 | - | (define (get-tag tag) | |
| 514 | - | (tag:get-tag (wadoku->tag-name tag))) |
modules/nani/wadoku/pitch.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani wadoku pitch) | |
| 20 | - | #:use-module (ice-9 binary-ports) | |
| 21 | - | #:use-module (ice-9 match) | |
| 22 | - | #:use-module (ice-9 rdelim) | |
| 23 | - | #:use-module (nani frequency) | |
| 24 | - | #:use-module (nani huffman) | |
| 25 | - | #:use-module (nani result) | |
| 26 | - | #:use-module (nani trie) | |
| 27 | - | #:use-module (nani wadoku entities) | |
| 28 | - | #:use-module (srfi srfi-1) | |
| 29 | - | #:use-module (srfi srfi-9) | |
| 30 | - | #:use-module (sxml ssax) | |
| 31 | - | #:use-module (rnrs bytevectors) | |
| 32 | - | #:export (xml->pitch serialize-pitch pitch-entry-count)) | |
| 33 | - | ||
| 34 | - | (define-record-type pitch | |
| 35 | - | (make-pitch kanjis accents) | |
| 36 | - | pitch? | |
| 37 | - | (kanjis pitch-kanjis) | |
| 38 | - | (accents pitch-accents)) | |
| 39 | - | ||
| 40 | - | (define (sxml->element lst elem) | |
| 41 | - | (let ((elem (match elem | |
| 42 | - | ((_ . elem) elem) | |
| 43 | - | (_ elem)))) | |
| 44 | - | (match elem | |
| 45 | - | ('accent `(pitch ,(car lst))) | |
| 46 | - | ('orth (let ((kanji (filter string? lst))) | |
| 47 | - | (if (null? kanji) #f `(kanji . ,(car kanji))))) | |
| 48 | - | ('form | |
| 49 | - | `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst))) | |
| 50 | - | ('reading (filter pair? lst)) | |
| 51 | - | ('entry | |
| 52 | - | (let loop ((lst lst) (kanjis '()) (accents '())) | |
| 53 | - | (if (null? lst) | |
| 54 | - | (if (or (null? kanjis) (null? accents)) | |
| 55 | - | #f | |
| 56 | - | (make-pitch kanjis accents)) | |
| 57 | - | (match (car lst) | |
| 58 | - | (('form . f) (loop (append f (cdr lst)) kanjis accents)) | |
| 59 | - | (('pitch . pitch) (loop (cdr lst) kanjis (cons pitch accents))) | |
| 60 | - | (('kanji . kanji) (loop (cdr lst) (cons kanji kanjis) accents)) | |
| 61 | - | (_ (loop (cdr lst) kanjis accents)))))) | |
| 62 | - | (_ #f)))) | |
| 63 | - | ||
| 64 | - | (define parser | |
| 65 | - | (ssax:make-parser | |
| 66 | - | NEW-LEVEL-SEED | |
| 67 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 68 | - | attributes) | |
| 69 | - | ||
| 70 | - | FINISH-ELEMENT | |
| 71 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 72 | - | (if (equal? elem-gi 'entries) | |
| 73 | - | seed | |
| 74 | - | (let* ((seed (reverse seed)) | |
| 75 | - | (element (sxml->element seed elem-gi))) | |
| 76 | - | (cons element parent-seed)))) | |
| 77 | - | ||
| 78 | - | CHAR-DATA-HANDLER | |
| 79 | - | (lambda (string1 string2 seed) | |
| 80 | - | (cons (string-append string1 string2) seed)))) | |
| 81 | - | ||
| 82 | - | (define (xml->pitch port) | |
| 83 | - | (filter pitch? (parser port '()))) | |
| 84 | - | ||
| 85 | - | (define (make-trie-key key) | |
| 86 | - | (append-map | |
| 87 | - | (lambda (c) | |
| 88 | - | (list (quotient c 16) (modulo c 16))) | |
| 89 | - | (bytevector->u8-list (string->utf8 key)))) | |
| 90 | - | ||
| 91 | - | (define (get-pitch-trie pitches) | |
| 92 | - | (let ((trie (make-empty-trie))) | |
| 93 | - | (for-each | |
| 94 | - | (lambda (pitch) | |
| 95 | - | (for-each | |
| 96 | - | (lambda (kanji) | |
| 97 | - | (for-each | |
| 98 | - | (lambda (accent) | |
| 99 | - | (cond | |
| 100 | - | ((string? accent) | |
| 101 | - | (add-to-trie! trie (make-trie-key kanji) accent)) | |
| 102 | - | ((list? accent) | |
| 103 | - | (add-to-trie! trie (make-trie-key kanji) (car accent))) | |
| 104 | - | (else | |
| 105 | - | (throw 'accent accent)))) | |
| 106 | - | (pitch-accents pitch))) | |
| 107 | - | (pitch-kanjis pitch))) | |
| 108 | - | pitches) | |
| 109 | - | (compress-trie trie))) | |
| 110 | - | ||
| 111 | - | (define (collapse-vals! trie) | |
| 112 | - | (let ((transitions (trie-transitions trie)) | |
| 113 | - | (vals (trie-vals trie))) | |
| 114 | - | (trie-vals-set! trie (string-join vals ", ")) | |
| 115 | - | (for-each collapse-vals! (map cdr transitions)))) | |
| 116 | - | ||
| 117 | - | (define (collect-vals trie) | |
| 118 | - | (let ((transitions (trie-transitions trie)) | |
| 119 | - | (vals (trie-vals trie))) | |
| 120 | - | (cons vals (append-map collect-vals (map cdr transitions))))) | |
| 121 | - | ||
| 122 | - | (define (entry-number trie) | |
| 123 | - | (let ((transitions (trie-transitions trie)) | |
| 124 | - | (vals (trie-vals trie))) | |
| 125 | - | (apply + (if (string-null? vals) 0 1) | |
| 126 | - | (map entry-number (map cdr transitions))))) | |
| 127 | - | ||
| 128 | - | (define (serialize-pitch pitches) | |
| 129 | - | (let ((trie (get-pitch-trie pitches))) | |
| 130 | - | (collapse-vals! trie) | |
| 131 | - | (let* ((huffman (create-huffman (collect-vals trie))) | |
| 132 | - | (code (huffman->code huffman)) | |
| 133 | - | (entries (entry-number trie))) | |
| 134 | - | (let* ((header (string->utf8 "NANI_PITCH001")) | |
| 135 | - | (header-size (bytevector-length header)) | |
| 136 | - | (huffman-bv (serialize-huffman huffman)) | |
| 137 | - | (huffman-size (bytevector-length huffman-bv)) | |
| 138 | - | (trie-size (trie-size trie (huffman-string-size code))) | |
| 139 | - | (result (make-bytevector (+ header-size 4 huffman-size trie-size)))) | |
| 140 | - | (bytevector-copy! header 0 result 0 header-size) | |
| 141 | - | (bytevector-u32-set! result header-size entries (endianness big)) | |
| 142 | - | (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size) | |
| 143 | - | (serialize-trie trie (serialize-huffman-string code) | |
| 144 | - | (huffman-string-size code) | |
| 145 | - | result (+ header-size 4 huffman-size)) | |
| 146 | - | result)))) | |
| 147 | - | ||
| 148 | - | (define (pitch-entry-count file) | |
| 149 | - | (call-with-input-file file | |
| 150 | - | (lambda (port) | |
| 151 | - | ;; header | |
| 152 | - | (get-bytevector-n port 13) | |
| 153 | - | ;; size | |
| 154 | - | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))) |
modules/nani/wadoku/xml.scm unknown status 2
| 1 | - | ;;; Nani Project website | |
| 2 | - | ;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu> | |
| 3 | - | ;;; | |
| 4 | - | ;;; This file is part of the Nani Project website. | |
| 5 | - | ;;; | |
| 6 | - | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | - | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | - | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | - | ;;; your option) any later version. | |
| 10 | - | ;;; | |
| 11 | - | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | - | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | - | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | - | ;;; GNU Affero General Public License for more details. | |
| 15 | - | ;;; | |
| 16 | - | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | - | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | - | ||
| 19 | - | (define-module (nani wadoku xml) | |
| 20 | - | #:use-module (ice-9 match) | |
| 21 | - | #:use-module (ice-9 rdelim) | |
| 22 | - | #:use-module (nani frequency) | |
| 23 | - | #:use-module (nani result) | |
| 24 | - | #:use-module (nani wadoku entities) | |
| 25 | - | #:use-module (srfi srfi-9) | |
| 26 | - | #:use-module (sxml ssax) | |
| 27 | - | #:export (xml->results)) | |
| 28 | - | ||
| 29 | - | (define-record-type usage | |
| 30 | - | (make-usage type reg content) | |
| 31 | - | usage? | |
| 32 | - | (type usage-type) | |
| 33 | - | (reg usage-reg) | |
| 34 | - | (content usage-content)) | |
| 35 | - | ||
| 36 | - | (define (flatten lst) | |
| 37 | - | (match lst | |
| 38 | - | (() '()) | |
| 39 | - | ((head lst ...) | |
| 40 | - | (if (list? head) | |
| 41 | - | (append (flatten head) (flatten lst)) | |
| 42 | - | (cons head (flatten lst)))) | |
| 43 | - | (lst (list lst)))) | |
| 44 | - | ||
| 45 | - | (define* (update-usage usage #:key (type (usage-type usage)) | |
| 46 | - | (reg (usage-reg usage)) | |
| 47 | - | (content (usage-content usage))) | |
| 48 | - | (make-usage type reg content)) | |
| 49 | - | ||
| 50 | - | ;; TODO | |
| 51 | - | (define (usage->tag usage) | |
| 52 | - | (let ((content (usage-content usage)) | |
| 53 | - | (type (usage-type usage)) | |
| 54 | - | (reg (usage-reg usage))) | |
| 55 | - | `(,@(if (string-null? content) | |
| 56 | - | (match type | |
| 57 | - | ("" '()) | |
| 58 | - | ("time" '()) | |
| 59 | - | ("hint" '()) | |
| 60 | - | ("abrev" '(tag "abbr"))) | |
| 61 | - | (match type | |
| 62 | - | ("dom" `((tag ,content))) | |
| 63 | - | ("time" `((tag "obs"))); always obsol. | |
| 64 | - | ("hint" (let ((tag (hint->tag content))) | |
| 65 | - | (if tag `(tag ,tag) '()))))) | |
| 66 | - | ,@(match reg | |
| 67 | - | ("" '()) | |
| 68 | - | ("dial." '()) | |
| 69 | - | ("lit" `((tag "litf"))) | |
| 70 | - | ("kinderspr." `((tag "chn"))) | |
| 71 | - | ("kleinkindspr." `((tag "baby"))) | |
| 72 | - | ("jugendspr." `((tag "young"))) | |
| 73 | - | ("besch.-h??fl." `((tag "hum"))) | |
| 74 | - | ("coll" `((tag "col"))) | |
| 75 | - | ("vulg." `((tag "vulg"))) | |
| 76 | - | ("Slang" `((tag "sl"))) | |
| 77 | - | ("poet." `((tag "poet"))) | |
| 78 | - | ("hist." `((tag "hist"))) | |
| 79 | - | ("hist" `((tag "hist"))) | |
| 80 | - | ("obsol." `((tag "obs"))) | |
| 81 | - | ("veraltet" `((tag "arch"))) | |
| 82 | - | ("sch??lerspr." `((tag "stud"))) | |
| 83 | - | ("studentenspr." `((tag "stud"))) | |
| 84 | - | ("frauenspr." `((tag "fem"))) | |
| 85 | - | ("gaunerspr." `((tag "thief"))) | |
| 86 | - | ("geheimspr." `((tag "secret"))) | |
| 87 | - | ("hofdamenspr." `((tag "fem") (tag "pol"))) | |
| 88 | - | ("seemannsspr." `((tag "sail"))))))) | |
| 89 | - | ||
| 90 | - | (define (usg->tag lst) | |
| 91 | - | (define (usg->usage lst) | |
| 92 | - | (let loop ((result (make-usage "" "" "")) (lst lst)) | |
| 93 | - | (if (null? lst) | |
| 94 | - | result | |
| 95 | - | (loop | |
| 96 | - | (match (car lst) | |
| 97 | - | (('type . type) (update-usage result #:type type)) | |
| 98 | - | (('reg . reg) (update-usage result #:reg reg)) | |
| 99 | - | (((? symbol? s) . v) | |
| 100 | - | (throw 'unknow-usage-attr s v)) | |
| 101 | - | ((? string? content) (update-usage result #:content content)) | |
| 102 | - | ((? list? l) (loop result l)) | |
| 103 | - | (_ result)) | |
| 104 | - | (cdr lst))))) | |
| 105 | - | (usage->tag (usg->usage lst))) | |
| 106 | - | ||
| 107 | - | (define (ruigo->ref lst) | |
| 108 | - | (let loop ((ref #f) (lst lst)) | |
| 109 | - | (match lst | |
| 110 | - | (() ref) | |
| 111 | - | ((('id id) lst ...) | |
| 112 | - | (loop id lst)) | |
| 113 | - | (((a . b) lst ...) | |
| 114 | - | (loop (loop ref (list (list a b))) lst)) | |
| 115 | - | (((? list? l) lst ...) | |
| 116 | - | (loop (loop ref l) lst))))) | |
| 117 | - | ||
| 118 | - | (define (merge-senses s1 s2) | |
| 119 | - | (update-sense s2 | |
| 120 | - | #:references (append (sense-references s1) (sense-references s2)) | |
| 121 | - | #:tags (append (sense-tags s1) (sense-tags s2)) | |
| 122 | - | #:glosses (append (sense-glosses s1) (sense-glosses s2)))) | |
| 123 | - | ||
| 124 | - | (define (sxml->sense lst) | |
| 125 | - | (let loop ((sense (make-sense '() '() '() '() '() '() "ger")) (lst lst)) | |
| 126 | - | (if (null? lst) | |
| 127 | - | sense | |
| 128 | - | (loop | |
| 129 | - | (match (car lst) | |
| 130 | - | (#f sense) | |
| 131 | - | ((? sense? s) | |
| 132 | - | (merge-senses s sense)) | |
| 133 | - | ((? source? s) | |
| 134 | - | (update-sense sense #:sources (cons s (sense-sources sense)))) | |
| 135 | - | (('ref (? string? r)) | |
| 136 | - | (update-sense sense #:references (cons r (sense-references sense)))) | |
| 137 | - | (('tag (? string? r)) | |
| 138 | - | (update-sense sense #:tags (cons (get-tag r) (sense-tags sense)))) | |
| 139 | - | (('trans (? string? r)) | |
| 140 | - | (update-sense sense #:glosses (cons r (sense-glosses sense)))) | |
| 141 | - | (('related . _) sense) | |
| 142 | - | (('transcr . _) sense) | |
| 143 | - | (('pitch . _) sense) | |
| 144 | - | (((? symbol? s) v) | |
| 145 | - | (throw 'unknown-symbol s v)) | |
| 146 | - | ((? list? l) (loop sense l)) | |
| 147 | - | ((? string? _) sense)) | |
| 148 | - | (cdr lst))))) | |
| 149 | - | ||
| 150 | - | ;; TODO | |
| 151 | - | (define (sxml->ref lst) | |
| 152 | - | (define (sxml->ref-name lst) | |
| 153 | - | (let loop ((result #f) (lst lst)) | |
| 154 | - | (if (null? lst) | |
| 155 | - | result | |
| 156 | - | (loop | |
| 157 | - | (match (car lst) | |
| 158 | - | (('id . id) id) | |
| 159 | - | (((? symbol? s) . _) result) | |
| 160 | - | (((? symbol? s) _) result) | |
| 161 | - | ((? list? l) (loop result l)) | |
| 162 | - | (_ result)) | |
| 163 | - | (cdr lst))))) | |
| 164 | - | (let ((ref (sxml->ref-name lst))) | |
| 165 | - | (if (string? ref) | |
| 166 | - | `(ref ,ref) | |
| 167 | - | (throw 'no-ref ref 'from lst)))) | |
| 168 | - | ||
| 169 | - | (define (sxml->source lst) | |
| 170 | - | (let loop ((source (make-source '() #f "" "")) (lst lst)) | |
| 171 | - | (if (null? lst) | |
| 172 | - | source | |
| 173 | - | (loop | |
| 174 | - | (match (car lst) | |
| 175 | - | (('impli impli) (update-source source #:lang impli)) | |
| 176 | - | (('foreign foreign) | |
| 177 | - | (update-source source | |
| 178 | - | #:content (if (list? foreign) | |
| 179 | - | (append foreign (source-content source)) | |
| 180 | - | (cons foreign (source-content source))))) | |
| 181 | - | ((? list? l) (loop source l)) | |
| 182 | - | (_ source)) | |
| 183 | - | (cdr lst))))) | |
| 184 | - | ||
| 185 | - | (define (sxml->reading lst) | |
| 186 | - | (let loop ((reading (make-reading '() '() '())) (lst lst)) | |
| 187 | - | (if (null? lst) | |
| 188 | - | reading | |
| 189 | - | (loop | |
| 190 | - | (match (car lst) | |
| 191 | - | (('reading r) | |
| 192 | - | (update-reading reading #:readings (cons r (reading-readings reading)))) | |
| 193 | - | (_ reading)) | |
| 194 | - | (cdr lst))))) | |
| 195 | - | ||
| 196 | - | (define (sxml->result sxml frq) | |
| 197 | - | (define (sxml->result-aux sxml) | |
| 198 | - | (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml)) | |
| 199 | - | (if (null? lst) | |
| 200 | - | result | |
| 201 | - | (match (car lst) | |
| 202 | - | (('kanji kanji) | |
| 203 | - | (loop | |
| 204 | - | (update-result result #:kanjis (cons kanji (result-kanjis result))) | |
| 205 | - | last-source (cdr lst))) | |
| 206 | - | ((? reading? reading) | |
| 207 | - | (loop | |
| 208 | - | (update-result result #:readings (cons reading (result-readings result))) | |
| 209 | - | last-source (cdr lst))) | |
| 210 | - | ((? sense? sense) | |
| 211 | - | (loop | |
| 212 | - | (update-result result | |
| 213 | - | #:senses | |
| 214 | - | (cons | |
| 215 | - | (if last-source | |
| 216 | - | (update-sense sense | |
| 217 | - | #:sources (cons last-source (sense-sources sense))) | |
| 218 | - | sense) | |
| 219 | - | (result-senses result))) | |
| 220 | - | last-source (cdr lst))) | |
| 221 | - | ((? list? l) | |
| 222 | - | (loop (loop result last-source l) last-source (cdr lst))) | |
| 223 | - | (_ (loop result last-source (cdr lst))))))) | |
| 224 | - | (let* ((result (sxml->result-aux sxml)) | |
| 225 | - | (word (if (null? (result-kanjis result)) | |
| 226 | - | (car (reading-readings (car (result-readings result)))) | |
| 227 | - | (car (result-kanjis result)))) | |
| 228 | - | (entity (frequency-entity frq word)) | |
| 229 | - | (sense1 (car (result-senses result))) | |
| 230 | - | (sense1 (if entity | |
| 231 | - | (update-sense sense1 | |
| 232 | - | #:tags (cons (get-tag entity) (sense-tags sense1))) | |
| 233 | - | sense1)) | |
| 234 | - | (senses (cons sense1 (cdr (result-senses result))))) | |
| 235 | - | (update-result result | |
| 236 | - | #:points (tag:get-points (update-result result #:senses senses)) | |
| 237 | - | #:senses senses))) | |
| 238 | - | ||
| 239 | - | (define (gram-tag lst) | |
| 240 | - | (map | |
| 241 | - | (lambda (attr) | |
| 242 | - | (match attr | |
| 243 | - | (('suru . suru) | |
| 244 | - | `(tag ,(string-append "suru-" suru))) | |
| 245 | - | (('ni . ni) | |
| 246 | - | `(tag ,(string-append "ni-" ni))) | |
| 247 | - | (('shiku . shiku) | |
| 248 | - | `(tag ,(string-append "shiku-" shiku))) | |
| 249 | - | (('ku . ku) | |
| 250 | - | `(tag ,(string-append "ku-" ku))) | |
| 251 | - | (('to . to) | |
| 252 | - | `(tag ,(string-append "to-" to))) | |
| 253 | - | (('no . no) | |
| 254 | - | `(tag ,(string-append "no-" no))) | |
| 255 | - | (('nari . nari) | |
| 256 | - | `(tag ,(string-append "nari-" nari))) | |
| 257 | - | (('taru . taru) | |
| 258 | - | `(tag ,(string-append "taru-" taru))) | |
| 259 | - | (('level . level) | |
| 260 | - | `(tag ,(string-append "level-" level))) | |
| 261 | - | (('godanrow . tr) | |
| 262 | - | `(tag ,(string-append "godanrow-" tr))) | |
| 263 | - | (('onbin . tr) | |
| 264 | - | `(tag ,(string-append "onbin-" tr))) | |
| 265 | - | (('transitivity . tr) | |
| 266 | - | `(tag ,(string-append "transitivity-" tr))))) | |
| 267 | - | lst)) | |
| 268 | - | ||
| 269 | - | (define (sxml->string lst) | |
| 270 | - | (define (sub-loop loop tags result lst l) | |
| 271 | - | (let ((result (loop tags result l))) | |
| 272 | - | (if (list? result) | |
| 273 | - | (loop (append tags (filter list? result)) | |
| 274 | - | (apply string-append (filter string? result)) | |
| 275 | - | lst) | |
| 276 | - | (loop tags result lst)))) | |
| 277 | - | ||
| 278 | - | (let loop ((tags '()) (result "") (lst lst)) | |
| 279 | - | (match lst | |
| 280 | - | (() (if (null? tags) | |
| 281 | - | result | |
| 282 | - | (append tags (list result)))) | |
| 283 | - | ((? string? s) | |
| 284 | - | (loop tags (string-append result s) '())) | |
| 285 | - | (((? string? s) lst ...) | |
| 286 | - | (loop tags (string-append result s) lst)) | |
| 287 | - | ((('prior . _) lst ...) | |
| 288 | - | (loop tags result lst)) | |
| 289 | - | ((('options . _) lst ...) | |
| 290 | - | (loop tags result lst)) | |
| 291 | - | ((('firstname . _) lst ...) | |
| 292 | - | (loop tags result lst)) | |
| 293 | - | ((('ausn . _) lst ...) | |
| 294 | - | (loop tags result lst)) | |
| 295 | - | ((('lang . _) lst ...) | |
| 296 | - | (loop tags result lst)) | |
| 297 | - | ((('meta . _) lst ...) | |
| 298 | - | (loop tags result lst)) | |
| 299 | - | ((('genki . genki) lst ...) | |
| 300 | - | ;(loop (cons `(tag ,genki) tags) result lst)) | |
| 301 | - | (loop tags result lst)) | |
| 302 | - | ((('jlpt . jlpt) lst ...) | |
| 303 | - | (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst)) | |
| 304 | - | ((('ref . (? string? ref)) lst ...) | |
| 305 | - | (loop (cons `(ref ,ref) tags) result lst)) | |
| 306 | - | ((('ref (? string? ref)) lst ...) | |
| 307 | - | (loop (cons `(ref ,ref) tags) result lst)) | |
| 308 | - | ((('jap l) lst ...) | |
| 309 | - | (sub-loop loop tags result lst l)) | |
| 310 | - | ((('foreign l) lst ...) | |
| 311 | - | (sub-loop loop tags result lst l)) | |
| 312 | - | ((('transcr l) lst ...) | |
| 313 | - | (sub-loop loop tags result lst l)) | |
| 314 | - | ((((? symbol? s) v) lst ...) | |
| 315 | - | (throw 'unsupported-symbol s v)) | |
| 316 | - | ((((? symbol? s) . v) lst ...) | |
| 317 | - | (throw 'unsupported-symbol-pair s v)) | |
| 318 | - | (((? list? l) lst ...) | |
| 319 | - | (sub-loop loop tags result lst l))))) | |
| 320 | - | ||
| 321 | - | (define (sxml->element lst elem frq) | |
| 322 | - | (let ((elem (match elem | |
| 323 | - | ((_ . elem) elem) | |
| 324 | - | (_ elem)))) | |
| 325 | - | (match elem | |
| 326 | - | ('orth (let ((kanji (filter string? lst))) | |
| 327 | - | (if (null? kanji) #f `(kanji ,(car kanji))))) | |
| 328 | - | ('entry (sxml->result lst frq)) | |
| 329 | - | ('hira `(reading ,(car lst))) | |
| 330 | - | ('hatsuon `(hatsuon ,(car lst))) | |
| 331 | - | ('accent `(pitch ,(car lst))) | |
| 332 | - | ('reading (sxml->reading lst)) | |
| 333 | - | ('form lst) | |
| 334 | - | ('impli `(impli ,(car lst))) | |
| 335 | - | ('text (let loop ((text "") (lst lst)) | |
| 336 | - | (match lst | |
| 337 | - | (() text) | |
| 338 | - | ((('hasPrecedingSpace . _) lst ...) | |
| 339 | - | (loop (string-append " " text) lst)) | |
| 340 | - | ((('hasFollowingSpace . _) lst ...) | |
| 341 | - | (string-append (loop text lst) " ")) | |
| 342 | - | (((? string? s) lst ...) | |
| 343 | - | (loop (string-append text s) lst))))) | |
| 344 | - | ('famn (sxml->string lst)) | |
| 345 | - | ('expl (sxml->string lst)) | |
| 346 | - | ('expli (filter list? lst)) | |
| 347 | - | ('abbrev (filter list? lst)) | |
| 348 | - | ('token (filter string? lst)) | |
| 349 | - | ('tr (sxml->string lst)) | |
| 350 | - | ('transcr `(transcr ,lst)) | |
| 351 | - | ('trans | |
| 352 | - | (append (filter list? lst) | |
| 353 | - | (map (lambda (s) `(trans ,s)) (filter string? lst)))) | |
| 354 | - | ('jap `(jap ,lst)) | |
| 355 | - | ('emph lst) | |
| 356 | - | ('title (filter string? lst)) | |
| 357 | - | ('transl (filter string? lst)) | |
| 358 | - | ('topic (filter string? lst)) | |
| 359 | - | ('iron (filter string? lst)) | |
| 360 | - | ('specchar (filter string? lst)) | |
| 361 | - | ('scientif (filter string? lst)) | |
| 362 | - | ('wikide #f) | |
| 363 | - | ('wikija #f) | |
| 364 | - | ('link #f) | |
| 365 | - | ('ref (sxml->ref lst)) | |
| 366 | - | ('sref (sxml->ref lst)) | |
| 367 | - | ('etym (sxml->source lst)) | |
| 368 | - | ('literal (sxml->string (list "???" (sxml->string lst) "???"))) | |
| 369 | - | ('def (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 370 | - | ('date (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 371 | - | ('birthdeath (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 372 | - | ('descr (sxml->string (list "(" (sxml->string lst) ")"))) | |
| 373 | - | ('bracket (sxml->string (list "[" (sxml->string lst) "]"))) | |
| 374 | - | ('foreign (if (null? lst) #f `(foreign ,(car lst)))) | |
| 375 | - | ('seasonword `(tag ,(string-append "season-" (assoc-ref lst 'type)))) | |
| 376 | - | ('usg (usg->tag lst)) | |
| 377 | - | ('sense (sxml->sense lst)) | |
| 378 | - | ('steinhaus (let ((ref (sxml->string lst))) | |
| 379 | - | (if (string? ref) | |
| 380 | - | `(ref ,(sxml->string lst)) | |
| 381 | - | (throw 'not-steinhaus ref)))) | |
| 382 | - | ('pos '()); TODO: actually find what tag to use | |
| 383 | - | ('wordcomponent (cons `(tag "wordcomponent") (gram-tag lst))) | |
| 384 | - | ('meishi (cons `(tag "meishi") (gram-tag lst))) | |
| 385 | - | ('setsuzokushi (cons `(tag "setsuzokushi") (gram-tag lst))) | |
| 386 | - | ('daimeishi (cons `(tag "daimeishi") (gram-tag lst))) | |
| 387 | - | ('doushi (cons `(tag "doushi") (gram-tag lst))) | |
| 388 | - | ('kandoushi (cons `(tag "kandoushi") (gram-tag lst))) | |
| 389 | - | ('keiyoudoushi (cons `(tag "keiyoudoushi") (gram-tag lst))) | |
| 390 | - | ('keiyoushi (cons `(tag "keiyoushi") (gram-tag lst))) | |
| 391 | - | ('fukushi (cons `(tag "fukushi") (gram-tag lst))) | |
| 392 | - | ('rengo (cons `(tag "rengo") (gram-tag lst))) | |
| 393 | - | ('suffix (cons `(tag "suffix") (gram-tag lst))) | |
| 394 | - | ('prefix (cons `(tag "prefix") (gram-tag lst))) | |
| 395 | - | ('kanji (cons `(tag "kanji") (gram-tag lst))) | |
| 396 | - | ('rentaishi (cons `(tag "rentaishi") (gram-tag lst))) | |
| 397 | - | ('specialcharacter (cons `(tag "specialcharacter") (gram-tag lst))) | |
| 398 | - | ('joshi (cons `(tag "joshi") (gram-tag lst))) | |
| 399 | - | ('fukujoshi (cons `(tag "fukujoshi") (gram-tag lst))) | |
| 400 | - | ('kakujoshi (cons `(tag "kakujoshi") (gram-tag lst))) | |
| 401 | - | ('kakarijoshi (cons `(tag "kakarijoshi") (gram-tag lst))) | |
| 402 | - | ('shuujoshi (cons `(tag "shuujoshi") (gram-tag lst))) | |
| 403 | - | ('setsuzokujoshi (cons `(tag "setsuzokujoshi") (gram-tag lst))) | |
| 404 | - | ('jokeiyoushi (cons `(tag "jokeiyoushi") (gram-tag lst))) | |
| 405 | - | ('jodoushi (cons `(tag "jodoushi") (gram-tag lst))) | |
| 406 | - | ('ruigos lst) | |
| 407 | - | ('ruigo (ruigo->ref lst)) | |
| 408 | - | ('gramGrp lst)))) | |
| 409 | - | ||
| 410 | - | (define (create-parser frq) | |
| 411 | - | (ssax:make-parser | |
| 412 | - | NEW-LEVEL-SEED | |
| 413 | - | (lambda (elem-gi attributes namespaces expected-content seed) | |
| 414 | - | attributes) | |
| 415 | - | ||
| 416 | - | FINISH-ELEMENT | |
| 417 | - | (lambda (elem-gi attributes namespaces parent-seed seed) | |
| 418 | - | (if (equal? elem-gi 'entries) | |
| 419 | - | seed | |
| 420 | - | (let* ((seed (reverse seed)) | |
| 421 | - | (element (sxml->element seed elem-gi frq))) | |
| 422 | - | (cons element parent-seed)))) | |
| 423 | - | ||
| 424 | - | CHAR-DATA-HANDLER | |
| 425 | - | (lambda (string1 string2 seed) | |
| 426 | - | (cons (string-append string1 string2) seed)))) | |
| 427 | - | ||
| 428 | - | (define (xml->results port frq) | |
| 429 | - | (let ((results (filter result? ((create-parser frq) port '())))) | |
| 430 | - | (sort results (lambda (a b) (> (result-points a) (result-points b)))))) |
po/fr.po
| 7 | 7 | msgstr "" | |
| 8 | 8 | "Project-Id-Version: PACKAGE VERSION\n" | |
| 9 | 9 | "Report-Msgid-Bugs-To: \n" | |
| 10 | - | "POT-Creation-Date: 2020-06-05 15:21+0200\n" | |
| 10 | + | "POT-Creation-Date: 2020-06-25 02:57+0200\n" | |
| 11 | 11 | "PO-Revision-Date: 2020-06-15 14:56+0000\n" | |
| 12 | 12 | "Last-Translator: full name <fedora-account@lepiller.eu>\n" | |
| 13 | 13 | "Language-Team: French <https://translate.fedoraproject.org/projects/nani/" | |
… | |||
| 107 | 107 | msgid "JMdict" | |
| 108 | 108 | msgstr "JMdict" | |
| 109 | 109 | ||
| 110 | - | #: tools/list.scm:55 | |
| 110 | + | #: tools/list.scm:63 | |
| 111 | 111 | msgid "" | |
| 112 | 112 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
| 113 | 113 | "Development Group." | |
… | |||
| 115 | 115 | "Dictionnaire japonais/anglais de l???Electronic Dictionary Research and " | |
| 116 | 116 | "Development Group." | |
| 117 | 117 | ||
| 118 | - | #: tools/list.scm:54 | |
| 118 | + | #: tools/list.scm:62 | |
| 119 | 119 | msgid "" | |
| 120 | 120 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
| 121 | 121 | "Development Group." | |
… | |||
| 123 | 123 | "Dictionnaire japonais/anglais de l???Electronic Dictionary Research and " | |
| 124 | 124 | "Development Group." | |
| 125 | 125 | ||
| 126 | - | #: tools/list.scm:56 | |
| 126 | + | #: tools/list.scm:64 | |
| 127 | 127 | msgid "" | |
| 128 | 128 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
| 129 | 129 | "Development Group." | |
… | |||
| 131 | 131 | "Dictionnaire japonais/fran??ais de l???Electronic Dictionary Research and " | |
| 132 | 132 | "Development Group." | |
| 133 | 133 | ||
| 134 | + | #: tools/list.scm:53 | |
| 135 | + | #, fuzzy | |
| 136 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
| 137 | + | msgstr "Dictionnaire japonais/allemand de Wadoku." | |
| 138 | + | ||
| 134 | 139 | #: tools/list.scm:37 | |
| 135 | 140 | msgid "Japanese/German dictionary from Wadoku." | |
| 136 | 141 | msgstr "Dictionnaire japonais/allemand de Wadoku." | |
| 137 | 142 | ||
| 138 | - | #: tools/list.scm:57 | |
| 143 | + | #: tools/list.scm:65 | |
| 139 | 144 | msgid "" | |
| 140 | 145 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
| 141 | 146 | "Development Group." | |
… | |||
| 143 | 148 | "Dictionnaire japonais/allemand de l???Electronic Dictionary Research and " | |
| 144 | 149 | "Development Group." | |
| 145 | 150 | ||
| 146 | - | #: tools/list.scm:58 | |
| 151 | + | #: tools/list.scm:66 | |
| 147 | 152 | msgid "" | |
| 148 | 153 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
| 149 | 154 | "Development Group." | |
… | |||
| 151 | 156 | "Dictionnaire japonais/hongrois de l???Electronic Dictionary Research and " | |
| 152 | 157 | "Development Group." | |
| 153 | 158 | ||
| 154 | - | #: tools/list.scm:59 | |
| 159 | + | #: tools/list.scm:67 | |
| 155 | 160 | msgid "" | |
| 156 | 161 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
| 157 | 162 | "Development Group." | |
… | |||
| 159 | 164 | "Dictionnaire japonais/russe de l???Electronic Dictionary Research and " | |
| 160 | 165 | "Development Group." | |
| 161 | 166 | ||
| 162 | - | #: tools/list.scm:60 | |
| 167 | + | #: tools/list.scm:68 | |
| 163 | 168 | msgid "" | |
| 164 | 169 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
| 165 | 170 | "Development Group." | |
… | |||
| 167 | 172 | "Dictionnaire japonais/slov??ne de l???Electronic Dictionary Research and " | |
| 168 | 173 | "Development Group." | |
| 169 | 174 | ||
| 170 | - | #: tools/list.scm:61 | |
| 175 | + | #: tools/list.scm:69 | |
| 171 | 176 | msgid "" | |
| 172 | 177 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
| 173 | 178 | "Development Group." | |
… | |||
| 175 | 180 | "Dictionnaire japonais/espagnol de l???Electronic Dictionary Research and " | |
| 176 | 181 | "Development Group." | |
| 177 | 182 | ||
| 178 | - | #: tools/list.scm:62 | |
| 183 | + | #: tools/list.scm:70 | |
| 179 | 184 | msgid "" | |
| 180 | 185 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
| 181 | 186 | "Development Group." | |
… | |||
| 382 | 387 | " aidera ?? mieux prononcer les mots, avec l'accent de hauteur du japonais\n" | |
| 383 | 388 | " standard." | |
| 384 | 389 | ||
| 390 | + | #: tools/list.scm:55 | |
| 391 | + | #, fuzzy | |
| 392 | + | msgid "" | |
| 393 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 394 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
| 395 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
| 396 | + | "\tby kanji, reading (kana) and by French translation." | |
| 397 | + | msgstr "" | |
| 398 | + | "Ce dictionnaire vous permet d???effectuer des recherches sur la vue\n" | |
| 399 | + | " principale de cette appli. Si vous n???en t??l??chargez aucun, l???appli\n" | |
| 400 | + | " sera inutilisable puisque vous ne pourrez rien rechercher. Ce\n" | |
| 401 | + | " dictionnaire permet d???effectuer des recherches par kanji, par\n" | |
| 402 | + | " prononciation (kana) et par traduction allemande." | |
| 403 | + | ||
| 385 | 404 | #: tools/list.scm:39 | |
| 405 | + | #, fuzzy | |
| 386 | 406 | msgid "" | |
| 387 | 407 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 388 | - | " Failing to download on of these dictionaries will make the app " | |
| 408 | + | " Failing to download one of these dictionaries will make the app " | |
| 389 | 409 | "unusable\n" | |
| 390 | 410 | " as you can't search for anything. This dictionary can be searched " | |
| 391 | 411 | "for\n" | |
… | |||
| 397 | 417 | " dictionnaire permet d???effectuer des recherches par kanji, par\n" | |
| 398 | 418 | " prononciation (kana) et par traduction allemande." | |
| 399 | 419 | ||
| 400 | - | #: tools/list.scm:64 | |
| 420 | + | #: tools/list.scm:72 | |
| 401 | 421 | msgid "" | |
| 402 | 422 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 403 | 423 | " Failing to download one of these dictionaries will make the app " | |
po/nani.pot
| 8 | 8 | msgstr "" | |
| 9 | 9 | "Project-Id-Version: PACKAGE VERSION\n" | |
| 10 | 10 | "Report-Msgid-Bugs-To: \n" | |
| 11 | - | "POT-Creation-Date: 2020-06-06 01:47+0200\n" | |
| 11 | + | "POT-Creation-Date: 2021-07-18 16:26+0200\n" | |
| 12 | 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" | |
| 13 | 13 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" | |
| 14 | 14 | "Language-Team: LANGUAGE <LL@li.org>\n" | |
… | |||
| 97 | 97 | msgid "JMdict" | |
| 98 | 98 | msgstr "" | |
| 99 | 99 | ||
| 100 | - | #: tools/list.scm:55 | |
| 100 | + | #: tools/list.scm:63 | |
| 101 | 101 | msgid "" | |
| 102 | 102 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
| 103 | 103 | "Development Group." | |
| 104 | 104 | msgstr "" | |
| 105 | 105 | ||
| 106 | - | #: tools/list.scm:54 | |
| 106 | + | #: tools/list.scm:62 | |
| 107 | 107 | msgid "" | |
| 108 | 108 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
| 109 | 109 | "Development Group." | |
| 110 | 110 | msgstr "" | |
| 111 | 111 | ||
| 112 | - | #: tools/list.scm:56 | |
| 112 | + | #: tools/list.scm:64 | |
| 113 | 113 | msgid "" | |
| 114 | 114 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
| 115 | 115 | "Development Group." | |
| 116 | 116 | msgstr "" | |
| 117 | 117 | ||
| 118 | + | #: tools/list.scm:53 | |
| 119 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
| 120 | + | msgstr "" | |
| 121 | + | ||
| 118 | 122 | #: tools/list.scm:37 | |
| 119 | 123 | msgid "Japanese/German dictionary from Wadoku." | |
| 120 | 124 | msgstr "" | |
| 121 | 125 | ||
| 122 | - | #: tools/list.scm:57 | |
| 126 | + | #: tools/list.scm:65 | |
| 123 | 127 | msgid "" | |
| 124 | 128 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
| 125 | 129 | "Development Group." | |
| 126 | 130 | msgstr "" | |
| 127 | 131 | ||
| 128 | - | #: tools/list.scm:58 | |
| 132 | + | #: tools/list.scm:66 | |
| 129 | 133 | msgid "" | |
| 130 | 134 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
| 131 | 135 | "Development Group." | |
| 132 | 136 | msgstr "" | |
| 133 | 137 | ||
| 134 | - | #: tools/list.scm:59 | |
| 138 | + | #: tools/list.scm:67 | |
| 135 | 139 | msgid "" | |
| 136 | 140 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
| 137 | 141 | "Development Group." | |
| 138 | 142 | msgstr "" | |
| 139 | 143 | ||
| 140 | - | #: tools/list.scm:60 | |
| 144 | + | #: tools/list.scm:68 | |
| 141 | 145 | msgid "" | |
| 142 | 146 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
| 143 | 147 | "Development Group." | |
| 144 | 148 | msgstr "" | |
| 145 | 149 | ||
| 146 | - | #: tools/list.scm:61 | |
| 150 | + | #: tools/list.scm:69 | |
| 147 | 151 | msgid "" | |
| 148 | 152 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
| 149 | 153 | "Development Group." | |
| 150 | 154 | msgstr "" | |
| 151 | 155 | ||
| 152 | - | #: tools/list.scm:62 | |
| 156 | + | #: tools/list.scm:70 | |
| 153 | 157 | msgid "" | |
| 154 | 158 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
| 155 | 159 | "Development Group." | |
… | |||
| 311 | 315 | " words better, with a standard Japanese pitch accent." | |
| 312 | 316 | msgstr "" | |
| 313 | 317 | ||
| 318 | + | #: tools/list.scm:55 | |
| 319 | + | msgid "" | |
| 320 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 321 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
| 322 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
| 323 | + | "\tby kanji, reading (kana) and by French translation." | |
| 324 | + | msgstr "" | |
| 325 | + | ||
| 314 | 326 | #: tools/list.scm:39 | |
| 315 | 327 | msgid "" | |
| 316 | 328 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 317 | - | " Failing to download on of these dictionaries will make the app " | |
| 329 | + | " Failing to download one of these dictionaries will make the app " | |
| 318 | 330 | "unusable\n" | |
| 319 | 331 | " as you can't search for anything. This dictionary can be searched " | |
| 320 | 332 | "for\n" | |
| 321 | 333 | " by kanji, reading (kana) and by German translation." | |
| 322 | 334 | msgstr "" | |
| 323 | 335 | ||
| 324 | - | #: tools/list.scm:64 | |
| 336 | + | #: tools/list.scm:72 | |
| 325 | 337 | msgid "" | |
| 326 | 338 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 327 | 339 | " Failing to download one of these dictionaries will make the app " | |
po/uk.po
| 107 | 107 | msgid "JMdict" | |
| 108 | 108 | msgstr "JMdict" | |
| 109 | 109 | ||
| 110 | - | #: tools/list.scm:55 | |
| 110 | + | #: tools/list.scm:63 | |
| 111 | 111 | msgid "" | |
| 112 | 112 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
| 113 | 113 | "Development Group." | |
… | |||
| 115 | 115 | "????????????????-???????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 116 | 116 | "Development Group." | |
| 117 | 117 | ||
| 118 | - | #: tools/list.scm:54 | |
| 118 | + | #: tools/list.scm:62 | |
| 119 | 119 | msgid "" | |
| 120 | 120 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
| 121 | 121 | "Development Group." | |
… | |||
| 123 | 123 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 124 | 124 | "Development Group." | |
| 125 | 125 | ||
| 126 | - | #: tools/list.scm:56 | |
| 126 | + | #: tools/list.scm:64 | |
| 127 | 127 | msgid "" | |
| 128 | 128 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
| 129 | 129 | "Development Group." | |
… | |||
| 131 | 131 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 132 | 132 | "Development Group." | |
| 133 | 133 | ||
| 134 | + | #: tools/list.scm:53 | |
| 135 | + | msgid "Japanese/French dictionary from the Jibiki project." | |
| 136 | + | msgstr "" | |
| 137 | + | ||
| 134 | 138 | #: tools/list.scm:37 | |
| 135 | 139 | msgid "Japanese/German dictionary from Wadoku." | |
| 136 | 140 | msgstr "????????????????-?????????????????? ?????????????? ?????? Wadoku." | |
| 137 | 141 | ||
| 138 | - | #: tools/list.scm:57 | |
| 142 | + | #: tools/list.scm:65 | |
| 139 | 143 | msgid "" | |
| 140 | 144 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
| 141 | 145 | "Development Group." | |
… | |||
| 143 | 147 | "????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 144 | 148 | "Development Group." | |
| 145 | 149 | ||
| 146 | - | #: tools/list.scm:58 | |
| 150 | + | #: tools/list.scm:66 | |
| 147 | 151 | msgid "" | |
| 148 | 152 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
| 149 | 153 | "Development Group." | |
… | |||
| 151 | 155 | "????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 152 | 156 | "Development Group." | |
| 153 | 157 | ||
| 154 | - | #: tools/list.scm:59 | |
| 158 | + | #: tools/list.scm:67 | |
| 155 | 159 | msgid "" | |
| 156 | 160 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
| 157 | 161 | "Development Group." | |
… | |||
| 159 | 163 | "????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 160 | 164 | "Development Group." | |
| 161 | 165 | ||
| 162 | - | #: tools/list.scm:60 | |
| 166 | + | #: tools/list.scm:68 | |
| 163 | 167 | msgid "" | |
| 164 | 168 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
| 165 | 169 | "Development Group." | |
… | |||
| 167 | 171 | "????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 168 | 172 | "Development Group." | |
| 169 | 173 | ||
| 170 | - | #: tools/list.scm:61 | |
| 174 | + | #: tools/list.scm:69 | |
| 171 | 175 | msgid "" | |
| 172 | 176 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
| 173 | 177 | "Development Group." | |
… | |||
| 175 | 179 | "????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and " | |
| 176 | 180 | "Development Group." | |
| 177 | 181 | ||
| 178 | - | #: tools/list.scm:62 | |
| 182 | + | #: tools/list.scm:70 | |
| 179 | 183 | msgid "" | |
| 180 | 184 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
| 181 | 185 | "Development Group." | |
… | |||
| 383 | 387 | "?????????????????? ???? ?? ????????????????????, ?? ?????? ?????????????? ?????????????? ????????, ?????? ???????????????????? ??????\n" | |
| 384 | 388 | "?????????????????? ?????????? ??????????, ?? ?????????????????????????? ???????????????????? ???????????????? ????????????????." | |
| 385 | 389 | ||
| 390 | + | #: tools/list.scm:55 | |
| 391 | + | msgid "" | |
| 392 | + | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 393 | + | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
| 394 | + | "\tas you can't search for anything. This dictionary can be searched for\n" | |
| 395 | + | "\tby kanji, reading (kana) and by French translation." | |
| 396 | + | msgstr "" | |
| 397 | + | ||
| 386 | 398 | #: tools/list.scm:39 | |
| 387 | 399 | msgid "" | |
| 388 | 400 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 389 | - | " Failing to download on of these dictionaries will make the app " | |
| 401 | + | " Failing to download one of these dictionaries will make the app " | |
| 390 | 402 | "unusable\n" | |
| 391 | 403 | " as you can't search for anything. This dictionary can be searched " | |
| 392 | 404 | "for\n" | |
… | |||
| 399 | 411 | "???????????????? ???? ???? ?????????????? ???????????? ?? ?????? ???????????? ??????????. ?? ?????????? ???????????????? ??????????\n" | |
| 400 | 412 | "???????????? ???? ????????????, ?????????????? ?????????????????? (????????) ???? ???????????????????? ??????????????????." | |
| 401 | 413 | ||
| 402 | - | #: tools/list.scm:64 | |
| 414 | + | #: tools/list.scm:72 | |
| 403 | 415 | msgid "" | |
| 404 | 416 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 405 | 417 | " Failing to download one of these dictionaries will make the app " | |
… | |||
| 445 | 457 | "multilingual,\n" | |
| 446 | 458 | "including Dutch, French, German and others." | |
| 447 | 459 | msgstr "" | |
| 448 | - | "???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a\"" | |
| 449 | - | ">Creative Commons\n" | |
| 460 | + | "???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a" | |
| 461 | + | "\">Creative Commons\n" | |
| 450 | 462 | "Share-Alike</a>. ???? ???????? ?? ???????????????????? ?? ???????????????????? (????????????????????) ?? " | |
| 451 | 463 | "???????????????????????? ??????????, ??????????????\n" | |
| 452 | 464 | "????????????????????????, ??????????????????????, ?????????????????? ???? ???????????? ????????????." | |
radicals.mk
| 1 | - | RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm | |
| 1 | + | RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm | |
| 2 | 2 | DICOS+=dicos/radicals.nani | |
| 3 | 3 | DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml | |
| 4 | 4 | ||
| 5 | 5 | dictionaries/radkfilex.utf8: | |
| 6 | - | wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
| 6 | + | #wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
| 7 | + | wget http://ftp.usf.edu/pub/ftp.monash.edu.au/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip | |
| 7 | 8 | unzip dictionaries/kradzip.zip radkfilex -d dictionaries | |
| 8 | 9 | iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@ | |
| 9 | 10 | rm dictionaries/radkfilex |
tools/jibiki.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (use-modules (nani result result)) | |
| 20 | + | (use-modules (nani result jibiki)) | |
| 21 | + | (use-modules (nani result frequency)) | |
| 22 | + | (use-modules (ice-9 match)) | |
| 23 | + | (use-modules (ice-9 binary-ports)) | |
| 24 | + | ||
| 25 | + | ;; Break these steps to try and let the GC reclaim these big objects | |
| 26 | + | (define (get-results1 input frq) | |
| 27 | + | (call-with-input-file input | |
| 28 | + | (lambda (port) | |
| 29 | + | (xml->results port frq)))) | |
| 30 | + | ||
| 31 | + | (define (get-results input meaning-filter frq) | |
| 32 | + | (let* ((results (get-results1 input frq)) | |
| 33 | + | (results (map (lambda (result) | |
| 34 | + | (update-result | |
| 35 | + | result | |
| 36 | + | #:meanings (filter meaning-filter | |
| 37 | + | (result-meanings result)))) | |
| 38 | + | results)) | |
| 39 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
| 40 | + | results))) | |
| 41 | + | results)) | |
| 42 | + | ||
| 43 | + | (define (compile input meaning-filter output) | |
| 44 | + | (let* ((results (get-results input meaning-filter | |
| 45 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
| 46 | + | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
| 47 | + | (call-with-output-file output | |
| 48 | + | (lambda (port) | |
| 49 | + | (put-bytevector port | |
| 50 | + | (serialize-dictionary results)))))) | |
| 51 | + | ||
| 52 | + | (match (command-line) | |
| 53 | + | ((_ cmd input output) | |
| 54 | + | (cond | |
| 55 | + | ((equal? cmd "build") | |
| 56 | + | (compile input (const #t) output)) | |
| 57 | + | (else (format #t "Unknown cmd ~a.~%" cmd))))) |
tools/jmdict.scm
| 16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | - | (use-modules (nani jmdict trie)) | |
| 20 | - | (use-modules (nani jmdict serialize)) | |
| 21 | - | (use-modules (nani jmdict xml)) | |
| 22 | - | (use-modules (nani frequency)) | |
| 23 | - | (use-modules (nani trie)) | |
| 24 | - | (use-modules (nani result)) | |
| 19 | + | (use-modules (nani result frequency)) | |
| 20 | + | (use-modules (nani result jmdict)) | |
| 21 | + | (use-modules (nani result result)) | |
| 25 | 22 | (use-modules (ice-9 match)) | |
| 26 | 23 | (use-modules (ice-9 binary-ports)) | |
| 27 | 24 | ||
… | |||
| 31 | 28 | (lambda (port) | |
| 32 | 29 | (xml->results port frq)))) | |
| 33 | 30 | ||
| 34 | - | (define (get-results input sense-filter frq) | |
| 31 | + | (define (get-results input meaning-filter frq) | |
| 35 | 32 | (let* ((results (get-results1 input frq)) | |
| 36 | 33 | (results (map (lambda (result) | |
| 37 | 34 | (update-result | |
| 38 | 35 | result | |
| 39 | - | #:senses (filter sense-filter | |
| 40 | - | (result-senses result)))) | |
| 36 | + | #:meanings (filter meaning-filter | |
| 37 | + | (result-meanings result)))) | |
| 41 | 38 | results)) | |
| 42 | - | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
| 39 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
| 43 | 40 | results))) | |
| 44 | 41 | results)) | |
| 45 | 42 | ||
| 46 | - | (define (compile input sense-filter output) | |
| 47 | - | (let* ((results (get-results input sense-filter | |
| 48 | - | (load-frequency "dictionaries/frequency.tsv"))) | |
| 49 | - | (kanji-trie (compress-trie (make-kanji-trie results))) | |
| 50 | - | (reading-trie (compress-trie (make-reading-trie results))) | |
| 51 | - | (meaning-trie (compress-trie (make-meaning-trie results)))) | |
| 43 | + | (define (compile input meaning-filter output) | |
| 44 | + | (let* ((results (get-results input meaning-filter | |
| 45 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
| 52 | 46 | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
| 53 | 47 | (call-with-output-file output | |
| 54 | 48 | (lambda (port) | |
| 55 | 49 | (put-bytevector port | |
| 56 | - | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
| 50 | + | (serialize-dictionary results)))))) | |
| 57 | 51 | ||
| 58 | 52 | (define (print word dict) | |
| 59 | 53 | #t) | |
… | |||
| 64 | 58 | ((equal? cmd "build") | |
| 65 | 59 | (if (equal? lang "e") | |
| 66 | 60 | (compile input (const #t) output) | |
| 67 | - | (compile input (lambda (sense) (equal? (sense-language sense) lang)) output))) | |
| 61 | + | (compile input (lambda (meaning) (equal? (meaning-language meaning) lang)) output))) | |
| 68 | 62 | (else (format #t "Unknown cmd ~a.~%" cmd)))) | |
| 69 | 63 | ((_ "print" word input) | |
| 70 | 64 | (print word input))) | |
tools/list.scm
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | 19 | (use-modules (tools i18n)) | |
| 20 | - | (use-modules (nani radk)) | |
| 21 | - | (use-modules (nani jmdict serialize)) | |
| 22 | - | (use-modules (nani wadoku pitch)) | |
| 20 | + | (use-modules (nani kanji radk)) | |
| 21 | + | (use-modules (nani result result)) | |
| 22 | + | (use-modules (nani pitch pitch)) | |
| 23 | 23 | (use-modules (gcrypt hash)) | |
| 24 | 24 | (use-modules (ice-9 match)) | |
| 25 | 25 | (use-modules (ice-9 format)) | |
… | |||
| 37 | 37 | `(_ "Japanese/German dictionary from Wadoku.")) | |
| 38 | 38 | (define wadoku-description | |
| 39 | 39 | `(_ "This dictionary allows you to do searches on the main view of this app. | |
| 40 | - | Failing to download on of these dictionaries will make the app unusable | |
| 40 | + | Failing to download one of these dictionaries will make the app unusable | |
| 41 | 41 | as you can't search for anything. This dictionary can be searched for | |
| 42 | 42 | by kanji, reading (kana) and by German translation.")) | |
| 43 | 43 | ||
… | |||
| 49 | 49 | and this dictionary will add information that will help you pronounce | |
| 50 | 50 | words better, with a standard Japanese pitch accent.")) | |
| 51 | 51 | ||
| 52 | + | (define jibiki-synopsis | |
| 53 | + | `(_ "Japanese/French dictionary from the Jibiki project.")) | |
| 54 | + | (define jibiki-description | |
| 55 | + | `(_ "This dictionary allows you to do searches on the main view of this app. | |
| 56 | + | Failing to download one of these dictionaries will make the app unusable | |
| 57 | + | as you can't search for anything. This dictionary can be searched for | |
| 58 | + | by kanji, reading (kana) and by French translation.")) | |
| 59 | + | ||
| 52 | 60 | (define (jmdict-synopsis lang) | |
| 53 | 61 | (match lang | |
| 54 | 62 | ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group.")) | |
… | |||
| 80 | 88 | (if long? | |
| 81 | 89 | wadoku-pitch-description | |
| 82 | 90 | wadoku-pitch-synopsis)) | |
| 91 | + | ((equal? (dico-type dico) "jibiki") | |
| 92 | + | (if long? | |
| 93 | + | jibiki-description | |
| 94 | + | jibiki-synopsis)) | |
| 83 | 95 | ((equal? (dico-type dico) "jmdict") | |
| 84 | 96 | (let ((dico-lang (substring dico 7))) | |
| 85 | 97 | (if long? | |
… | |||
| 107 | 119 | ((equal? file "radicals") "radk") | |
| 108 | 120 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
| 109 | 121 | "jmdict") | |
| 122 | + | ((equal? file "jibiki_fre") "jibiki") | |
| 110 | 123 | ((equal? file "wadoku_ger") "wadoku") | |
| 111 | 124 | ((equal? file "wadoku_pitch") "wadoku_pitch"))) | |
| 112 | 125 | ||
… | |||
| 114 | 127 | (cond | |
| 115 | 128 | ((equal? (dico-type (dico-name file)) "radk") | |
| 116 | 129 | (kanji-count file)) | |
| 117 | - | ((member (dico-type (dico-name file)) '("jmdict" "wadoku")) | |
| 118 | - | (jmdict-entry-count file)) | |
| 130 | + | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
| 131 | + | (dictionary-entry-count file)) | |
| 119 | 132 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
| 120 | 133 | (pitch-entry-count file)))) | |
| 121 | 134 | ||
tools/radk.scm
| 16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | - | (use-modules (nani radk)) | |
| 19 | + | (use-modules (nani kanji radk)) | |
| 20 | 20 | (use-modules (ice-9 match)) | |
| 21 | 21 | (use-modules (ice-9 binary-ports)) | |
| 22 | 22 |
tools/wadoku.scm
| 16 | 16 | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | 18 | ||
| 19 | - | (use-modules (nani jmdict trie)) | |
| 20 | - | (use-modules (nani jmdict serialize)) | |
| 21 | - | (use-modules (nani wadoku xml)) | |
| 22 | - | (use-modules (nani wadoku pitch)) | |
| 23 | - | (use-modules (nani frequency)) | |
| 24 | - | (use-modules (nani trie)) | |
| 25 | - | (use-modules (nani result)) | |
| 19 | + | (use-modules (nani pitch wadoku)) | |
| 20 | + | (use-modules (nani pitch pitch)) | |
| 21 | + | (use-modules (nani result frequency)) | |
| 22 | + | (use-modules (nani result result)) | |
| 23 | + | (use-modules (nani result wadoku)) | |
| 26 | 24 | (use-modules (ice-9 match)) | |
| 27 | 25 | (use-modules (ice-9 binary-ports)) | |
| 28 | 26 | ||
… | |||
| 32 | 30 | (lambda (port) | |
| 33 | 31 | (xml->results port frq)))) | |
| 34 | 32 | ||
| 35 | - | (define (get-results input sense-filter frq) | |
| 33 | + | (define (get-results input meaning-filter frq) | |
| 36 | 34 | (let* ((results (get-results1 input frq)) | |
| 37 | 35 | (results (map (lambda (result) | |
| 38 | 36 | (update-result | |
| 39 | 37 | result | |
| 40 | - | #:senses (filter sense-filter | |
| 41 | - | (result-senses result)))) | |
| 38 | + | #:meanings (filter meaning-filter | |
| 39 | + | (result-meanings result)))) | |
| 42 | 40 | results)) | |
| 43 | - | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
| 41 | + | (results (filter (lambda (result) (not (null? (result-meanings result)))) | |
| 44 | 42 | results))) | |
| 45 | 43 | results)) | |
| 46 | 44 | ||
| 47 | - | (define (compile input sense-filter output) | |
| 48 | - | (let* ((results (get-results input sense-filter | |
| 49 | - | (load-frequency "dictionaries/frequency.tsv"))) | |
| 50 | - | (kanji-trie (compress-trie (make-kanji-trie results))) | |
| 51 | - | (reading-trie (compress-trie (make-reading-trie results))) | |
| 52 | - | (meaning-trie (compress-trie (make-meaning-trie results)))) | |
| 45 | + | (define (compile input meaning-filter output) | |
| 46 | + | (let* ((results (get-results input meaning-filter | |
| 47 | + | (load-frequency "dictionaries/frequency.tsv")))) | |
| 53 | 48 | (format #t "Number of entries in ~a: ~a~%" output (length results)) | |
| 54 | 49 | (call-with-output-file output | |
| 55 | 50 | (lambda (port) | |
| 56 | 51 | (put-bytevector port | |
| 57 | - | (serialize-jmdict results kanji-trie reading-trie meaning-trie)))))) | |
| 52 | + | (serialize-dictionary results)))))) | |
| 58 | 53 | ||
| 59 | 54 | (define (get-pitch input) | |
| 60 | 55 | (call-with-input-file input | |
… | |||
| 63 | 58 | ||
| 64 | 59 | (define (pitch input output) | |
| 65 | 60 | (let ((results (get-pitch input))) | |
| 66 | - | (format #t "~a results." (length results)) | |
| 61 | + | (format #t "~a results.~%" (length results)) | |
| 67 | 62 | (call-with-output-file output | |
| 68 | 63 | (lambda (port) | |
| 69 | 64 | (put-bytevector port | |