Rework dictionary generation

Julien LepillerMon Jul 19 23:26:26+0200 2021

8a51332

Rework dictionary generation

Makefile

1+
# keep po files, even if they are sometimes generated, and keep downloaded dictionaries around
12
.PRECIOUS: po/%.po dictionaries/%
23
34
all: site
45
6+
# To be filled by included files
7+
# DICOS is the list of generated dictionaries
8+
# DOWNLOADS is the list of downloaded files
59
DICOS=
610
DOWNLOADS=
711
812
include radicals.mk
913
include wadoku.mk
1014
include jmdict.mk
15+
include jibiki.mk
1116
17+
# Files that constitute the website
1218
PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm
13-
1419
HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \
1520
        tools/i18n.scm tools/theme.scm
16-
21+
SHA_DICOS=$(addsuffix .sha256, $(DICOS))
1722
WEB_FILES= $(HAUNT_FILES) \
18-
        $(shell find css) $(shell find images) $(DICOS) $(addsuffix .sha256, $(DICOS)) \
23+
		   $(shell find css) $(shell find images) $(DICOS) $(SHA_DICOS) \
1924
        dicos/list
2025
26+
# Guile modules used to build dictionaries
2127
DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \
2228
        modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \
2329
        modules/nani/jmdict/entities.scm modules/nani/frequency.scm \
2430
        modules/nani/serialize.scm
2531
32+
# Available languages
2633
LANGS=fr uk
2734
MOFILES=$(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS)))
2835
36+
dicos: $(DICOS) $(SHA_DICOS) dicos/list
37+
2938
site: $(MOFILES) $(WEB_FILES)
3039
	haunt build
3140
	rm -rf public.bak

jibiki.mk unknown status 1

1+
DICOS+=dicos/jibiki_fre.nani
2+
DOWNLOADS+=dictionaries/jibiki.xml
3+
4+
dictionaries/jibiki.xml:
5+
	wget https://jibiki.fr/data/Jibiki.fr/jibiki.fr_jpn_fra.xml.gz -O $@.gz
6+
	gunzip $@.gz
7+
8+
dicos/jibiki_fre.nani: dictionaries/jibiki.xml tools/jibiki.scm dictionaries/frequency.tsv $(DICO_MODULES)
9+
	guile -L modules tools/jibiki.scm build $< $@

jmdict.mk

2222
2323
dicos/JMdict_%.nani: dictionaries/JMdict.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES)
2424
	guile -L modules tools/jmdict.scm build \
25-
        $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@
25+
        $< $(shell basename $@ .nani | sed 's|^JMdict_||g') $@
2626
2727
dicos/JMdict_e.nani: dictionaries/JMdict_e.xml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES)
2828
	guile -L modules tools/jmdict.scm build $< e $@

modules/nani/encoding/huffman.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani encoding huffman)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (rnrs bytevectors)
22+
  #:export (create-huffman
23+
            huffman->code
24+
            huffman-encode
25+
            huffman-decode
26+
            serialize-huffman
27+
            serialize-huffman-string huffman-string-size))
28+
29+
(define (add-occurence occ char)
30+
  (let* ((o (assoc-ref occ char))
31+
         (o (+ (if o o 0) 1)))
32+
    (assoc-set! occ char o)))
33+
34+
(define (add-occurences occ str)
35+
  (let loop ((lst (append (string->list str) (list #\nul))) (occ occ))
36+
    (if (null? lst)
37+
      occ
38+
      (loop (cdr lst) (add-occurence occ (car lst))))))
39+
40+
(define (get-occurences string-list)
41+
  (let loop ((lst string-list) (occ '()))
42+
    (if (null? lst)
43+
      occ
44+
      (loop (cdr lst) (add-occurences occ (car lst))))))
45+
46+
(define (order tree)
47+
  (if (null? tree)
48+
    tree
49+
    (add-in-order (car tree) (order (cdr tree)))))
50+
51+
(define (add-in-order elem tree)
52+
  (if (null? tree)
53+
    (list elem)
54+
    (let ((elem-pos (cdr elem))
55+
          (tree-pos (cdr (car tree))))
56+
      (if (> elem-pos tree-pos)
57+
        (cons (car tree) (add-in-order elem (cdr tree)))
58+
        (cons elem tree)))))
59+
60+
(define (create-huffman string-list)
61+
  (let ((total (apply + (map string-length string-list)))
62+
        (occurences (get-occurences string-list)))
63+
    (let loop ((tree (order occurences)))
64+
      (if (equal? (length tree) 1)
65+
        tree
66+
        (let* ((e1 (car tree))
67+
               (tree (cdr tree))
68+
               (e2 (car tree))
69+
               (tree (cdr tree)))
70+
          (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree)))))))
71+
72+
(define (huffman->code huffman)
73+
  (match huffman
74+
    ('() '())
75+
    ((((h1 h2) . weight))
76+
     (append
77+
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
78+
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
79+
    (((h1 h2) . weight)
80+
     (append
81+
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
82+
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
83+
    ((((? char? char) . weight))
84+
     (list (cons char '())))
85+
    (((? char? char) . weight)
86+
     (list (cons char '())))))
87+
88+
(define (bits->bytes b)
89+
  (match b
90+
   ('() '())
91+
   ((a b c d e f g h bs ...)
92+
    (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a))
93+
      (bits->bytes bs)))
94+
   (_ (bits->bytes (append b (make-list (- 8 (length b)) 0))))))
95+
96+
(define (huffman-encode code str)
97+
  (u8-list->bytevector
98+
    (bits->bytes
99+
      (apply append (map (lambda (c) (or (assoc-ref code c) (throw "not in huffman code" c)))
100+
                         (append (string->list str) (list #\nul)))))))
101+
102+
(define (byte->bits b)
103+
  (let loop ((i 2) (b b) (by '()))
104+
    (if (eq? i 512)
105+
      by
106+
      (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by)))))
107+
108+
(define (huffman-decode huffman bv)
109+
  (let ((seq (apply append (map byte->bits (bytevector->u8-list bv)))))
110+
    (list->string
111+
      (reverse
112+
        (let loop ((seq seq) (result '()) (huff huffman))
113+
          (if (null? seq)
114+
            result
115+
            (match huff
116+
              ((((h1 h2) . weight))
117+
               (if (= (car seq) 0)
118+
                 (loop (cdr seq) result h1)
119+
                 (loop (cdr seq) result h2)))
120+
              (((h1 h2) . weight)
121+
               (if (= (car seq) 0)
122+
                 (loop (cdr seq) result h1)
123+
                 (loop (cdr seq) result h2)))
124+
              ((((? char? char) . weight))
125+
               (if (equal? char #\nul)
126+
                 result
127+
                 (loop seq (cons char result) huffman)))
128+
              (((? char? char) . weight)
129+
               (if (equal? char #\nul)
130+
                 result
131+
                 (loop seq (cons char result) huffman))))))))))
132+
133+
(define (serialize-huffman huffman)
134+
  (define (serialize huffman)
135+
    (match huffman
136+
      ((((h1 h2) . weight))
137+
       (append '(1) (serialize h1) (serialize h2)))
138+
      (((h1 h2) . weight)
139+
       (append '(1) (serialize h1) (serialize h2)))
140+
      ((((? char? char) . weight))
141+
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))
142+
      (((? char? char) . weight)
143+
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))))
144+
  (u8-list->bytevector (serialize huffman)))
145+
146+
(define (serialize-huffman-string huffman-code)
147+
  (lambda (str pos bv)
148+
    (let ((sbv (huffman-encode huffman-code str)))
149+
      (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
150+
      (+ pos (bytevector-length sbv)))))
151+
152+
(define (huffman-string-size huffman-code)
153+
  (lambda (str)
154+
    (let ((sbv (huffman-encode huffman-code str)))
155+
      (+ (bytevector-length sbv)))))

modules/nani/encoding/parse.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani encoding parse)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (rnrs bytevectors)
22+
  #:export (parse-list
23+
            parse-char
24+
            parse-int
25+
            parse-boolean
26+
            parse-string))
27+
28+
(define* (parse-list parse-element #:key (size #f))
29+
  (lambda (port)
30+
    (let ((size (or size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big)))))
31+
      (let loop ((result '()) (remaining size))
32+
        (if (= remaining 0)
33+
            (reverse result)
34+
            (loop (cons (parse-element port) result) (- remaining 1)))))))
35+
36+
(define (parse-char port)
37+
  (get-u8 port))
38+
39+
(define (parse-int port)
40+
  (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))
41+
42+
(define (parse-boolean port)
43+
  (= (get-u8 port) 1))
44+
45+
(define (parse-string port)
46+
  (define (get-result-string port)
47+
    (let loop ((lu8 '()) (char (get-u8 port)))
48+
      (if (= char 0)
49+
          lu8
50+
          (loop (cons char lu8) (get-u8 port)))))
51+
  (utf8->string (u8-list->bytevector (reverse (get-result-string port)))))

modules/nani/encoding/serialize.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani encoding serialize)
20+
  #:use-module (rnrs bytevectors)
21+
  #:export (merge-bvs
22+
            serialize-list list-size
23+
            serialize-char char-size
24+
            serialize-u16 u16-size
25+
            serialize-int int-size
26+
            serialize-boolean boolean-size
27+
            serialize-string string-size))
28+
29+
;;; merge-bvs bvs
30+
;;;   Returns a bytevector that contains the data from `bvs`, a list of bytevectors.  For instance,
31+
;;;   (merge-bvs '(#u8(1 2) #u8(3 4))) gives #u8(1 2 3 4).
32+
;;;
33+
;;; Serializers and Sizers
34+
;;; ----------------------
35+
;;; 
36+
;;; A serializer is a procedure that takes a value to serialize, a position at which to serialize
37+
;;; the value, and a bytevector in which to serialize: `serialize val pos bv`.
38+
;;;
39+
;;; A sizer is a procedure that takes a value to serialize, and returns the expected size it will
40+
;;; take in the serialization.
41+
;;;
42+
;;; Common Serialization Functions
43+
;;; ------------------------------
44+
;;;
45+
;;; serialize-list serialize-element [#:size? #t]
46+
;;;   Returns a serializer for a list of elements. `serialize-element` is the serializer function for
47+
;;;   an element.  When `#:size` is `#t`, add the size of the list to the serialization, as a 16-bits
48+
;;;   unsigned integer.
49+
;;;
50+
;;; list-size element-size [#:size? #t]
51+
;;;   Returns a sizer for lists, where elements are sized by `element-size`.
52+
53+
(define (merge-bvs bvs)
54+
  (let* ((size (apply + (map bytevector-length bvs)))
55+
         (bv (make-bytevector size 0)))
56+
    (let loop ((bvs bvs) (pos 0))
57+
      (unless (null? bvs)
58+
        (let ((sz (bytevector-length (car bvs))))
59+
          (bytevector-copy! (car bvs) 0 bv pos sz)
60+
          (loop (cdr bvs) (+ pos sz)))))
61+
    bv))
62+
63+
(define* (serialize-list serialize-element #:key (size? #t))
64+
  (lambda (lst pos bv)
65+
    (when (not (list? lst)) (throw 'not-list lst))
66+
    (when size?
67+
      (bytevector-u16-set! bv pos (length lst) (endianness big)))
68+
    (let loop ((lst lst) (pos (+ pos (if size? 2 0))))
69+
      (if (null? lst)
70+
        pos
71+
        (loop (cdr lst) (serialize-element (car lst) pos bv))))))
72+
(define* (list-size element-size #:key (size? #t))
73+
  (lambda (lst)
74+
    (when (not (list? lst)) (throw 'not-list lst))
75+
    (apply + (if size? 2 0) (map element-size lst))))
76+
77+
(define (serialize-char int pos bv)
78+
  (bytevector-u8-set! bv pos int)
79+
  (+ pos 1))
80+
(define char-size (const 1))
81+
82+
(define (serialize-u16 int pos bv)
83+
  (bytevector-u16-set! bv pos int (endianness big))
84+
  (+ pos 2))
85+
(define u16-size (const 2))
86+
87+
(define (serialize-int int pos bv)
88+
  (bytevector-u32-set! bv pos int (endianness big))
89+
  (+ pos 4))
90+
(define int-size (const 4))
91+
92+
(define (serialize-boolean bool pos bv)
93+
  (bytevector-u8-set! bv pos (if bool 1 0))
94+
  (+ pos 1))
95+
(define boolean-size (const 1))
96+
97+
(define (serialize-string str pos bv)
98+
  (let ((sbv (string->utf8 str)))
99+
    (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
100+
    (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0)
101+
    (+ pos 1 (bytevector-length sbv))))
102+
(define (string-size str)
103+
  (let ((sbv (string->utf8 str)))
104+
    (+ 1 (bytevector-length sbv))))

modules/nani/encoding/trie.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani encoding trie)
20+
  #:use-module (nani encoding serialize)
21+
  #:use-module (rnrs bytevectors)
22+
  #:use-module (srfi srfi-9)
23+
  #:export (make-trie
24+
            trie?
25+
            trie-position
26+
            trie-position-set!
27+
            trie-vals
28+
            trie-vals-set!
29+
            trie-transitions
30+
            trie-transitions-set!
31+
            
32+
            make-empty-trie
33+
            add-to-trie!
34+
            compress-trie
35+
            
36+
            serialize-trie
37+
            serialize-trie-single
38+
            trie-size
39+
            trie-size-single))
40+
41+
(define-record-type trie
42+
  (make-trie position vals transitions)
43+
  trie?
44+
  (position trie-position trie-position-set!) ; integer
45+
  (vals trie-vals trie-vals-set!) ; list
46+
  (transitions trie-transitions trie-transitions-set!)) ; array or alist
47+
48+
(define (make-empty-trie)
49+
  (make-trie 0 '() (make-array #f 16)))
50+
51+
(define (add-to-trie! trie key value)
52+
  (if (null? key)
53+
    (trie-vals-set! trie (cons value (trie-vals trie)))
54+
    (let ((next-trie (array-ref (trie-transitions trie) (car key))))
55+
      (if next-trie
56+
          (add-to-trie! next-trie (cdr key) value)
57+
          (let ((next-trie (make-empty-trie)))
58+
            (array-set! (trie-transitions trie) next-trie (car key))
59+
            (add-to-trie! next-trie (cdr key) value))))))
60+
61+
(define (convert-trie-transitions! trie)
62+
  (define (get-new-transitions transitions)
63+
    (let loop ((i 0) (tr '()))
64+
      (if (= i 16)
65+
        tr
66+
        (let ((elem (array-ref transitions i)))
67+
          (if elem
68+
            (begin
69+
              (convert-trie-transitions! elem)
70+
              (loop (+ i 1) (cons (cons i elem) tr)))
71+
            (loop (+ i 1) tr))))))
72+
  (let* ((transitions (trie-transitions trie))
73+
         (transitions (get-new-transitions transitions)))
74+
    (trie-transitions-set! trie transitions)))
75+
76+
(define (compress-trie trie)
77+
  (define (compress-aux trie)
78+
    (make-trie
79+
      (trie-position trie)
80+
      (trie-vals trie)
81+
      (apply append
82+
        (map
83+
          (lambda (tr)
84+
            (let ((trie (cdr tr)))
85+
              (map
86+
                (lambda (tr2)
87+
                  (cons (+ (car tr2) (* 16 (car tr)))
88+
                        (compress-aux (cdr tr2))))
89+
                (trie-transitions trie))))
90+
          (trie-transitions trie)))))
91+
  (convert-trie-transitions! trie)
92+
  (compress-aux trie))
93+
94+
(define (pointer-size ptr)
95+
  5)
96+
97+
(define (serialize-pointer ptr pos bv)
98+
  (bytevector-u8-set! bv pos (car ptr))
99+
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))
100+
  (+ pos 5))
101+
102+
(define (serialize-trie-single serialize-value value-size)
103+
  (define (serialize-trie-aux transitions pos)
104+
    (let loop ((pos pos)
105+
               (trs transitions)
106+
               (bvs '()))
107+
      (if (null? trs)
108+
        (cons pos bvs)
109+
        (let* ((next-trie (cdr (car trs)))
110+
               (bv (get-trie-bv next-trie pos))
111+
               (pos (car bv))
112+
               (bv (cdr bv)))
113+
          (loop pos (cdr trs) (append bvs bv))))))
114+
115+
  (define (get-trie-bv trie pos)
116+
    (trie-position-set! trie pos)
117+
    (let* ((vals-sz (value-size (trie-vals trie)))
118+
           (trs-sz ((list-size (const 5) #:size? #f) (trie-transitions trie)))
119+
           (sz (+ vals-sz 1 trs-sz))
120+
           (bv (make-bytevector sz 0)))
121+
      (serialize-value (trie-vals trie) 0 bv)
122+
      (let* ((bvs (serialize-trie-aux
123+
                    (trie-transitions trie)
124+
                    (+ pos sz)))
125+
             (next-pos (car bvs))
126+
             (bvs (cdr bvs)))
127+
        (bytevector-u8-set! bv vals-sz (length (trie-transitions trie)))
128+
        ((serialize-list serialize-pointer #:size? #f)
129+
         (trie-transitions trie) (+ vals-sz 1) bv)
130+
        (cons next-pos (cons bv bvs)))))
131+
  
132+
  (lambda (trie pos bv)
133+
    (let* ((trie-bv (get-trie-bv trie pos))
134+
           (new-pos (car trie-bv))
135+
           (trie-bv (merge-bvs (cdr trie-bv))))
136+
      (bytevector-copy! trie-bv 0 bv pos (bytevector-length trie-bv))
137+
      new-pos)))
138+
139+
(define (serialize-trie serialize-value value-size)
140+
  (serialize-trie-single (serialize-list serialize-value) (list-size value-size)))
141+
 
142+
(define (trie-size-single value-size)
143+
  (lambda (trie)
144+
    (apply +
145+
      (value-size (trie-vals trie))
146+
      1
147+
      ((list-size pointer-size #:size? #f) (trie-transitions trie))
148+
      (map (lambda (trie) ((trie-size-single value-size) trie))
149+
           (map cdr (trie-transitions trie))))))
150+
151+
(define (trie-size value-size)
152+
  (trie-size-single (list-size value-size)))

modules/nani/frequency.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani frequency)
20-
  #:use-module (ice-9 rdelim)
21-
  #:export (load-frequency
22-
            frequency-entity))
23-
24-
(define (load-frequency file)
25-
  (call-with-input-file file
26-
    (lambda (port)
27-
      (let loop ((frq '()) (i 1))
28-
        (let* ((line (%read-line port))
29-
               (line (car line)))
30-
          (if (eof-object? line)
31-
            frq
32-
            (let ((content (string-split line #\tab)))
33-
              (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1)))))))))
34-
35-
(define (frequency-entity frq word)
36-
  (let ((freq (assoc-ref frq word)))
37-
    (cond
38-
      ((not freq)
39-
       #f)
40-
      ((< freq 501)
41-
       "frq500")
42-
      ((< freq 1001)
43-
       "frq1000")
44-
      ((< freq 2001)
45-
       "frq2000")
46-
      ((< freq 5001)
47-
       "frq1000")
48-
      ((< freq 10001)
49-
       "frq10000")
50-
      ((< freq 20001)
51-
       "frq20000")
52-
      (else #f))))

modules/nani/huffman.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani huffman)
20-
  #:use-module (ice-9 match)
21-
  #:use-module (rnrs bytevectors)
22-
  #:export (create-huffman
23-
            huffman->code
24-
            huffman-encode
25-
            huffman-decode
26-
            serialize-huffman
27-
            serialize-huffman-string huffman-string-size))
28-
29-
(define (add-occurence occ char)
30-
  (let* ((o (assoc-ref occ char))
31-
         (o (+ (if o o 0) 1)))
32-
    (assoc-set! occ char o)))
33-
34-
(define (add-occurences occ str)
35-
  (let loop ((lst (append (string->list str) (list #\nul))) (occ occ))
36-
    (if (null? lst)
37-
      occ
38-
      (loop (cdr lst) (add-occurence occ (car lst))))))
39-
40-
(define (get-occurences string-list)
41-
  (let loop ((lst string-list) (occ '()))
42-
    (if (null? lst)
43-
      occ
44-
      (loop (cdr lst) (add-occurences occ (car lst))))))
45-
46-
(define (order tree)
47-
  (if (null? tree)
48-
    tree
49-
    (add-in-order (car tree) (order (cdr tree)))))
50-
51-
(define (add-in-order elem tree)
52-
  (if (null? tree)
53-
    (list elem)
54-
    (let ((elem-pos (cdr elem))
55-
          (tree-pos (cdr (car tree))))
56-
      (if (> elem-pos tree-pos)
57-
        (cons (car tree) (add-in-order elem (cdr tree)))
58-
        (cons elem tree)))))
59-
60-
(define (create-huffman string-list)
61-
  (let ((total (apply + (map string-length string-list)))
62-
        (occurences (get-occurences string-list)))
63-
    (let loop ((tree (order occurences)))
64-
      (if (equal? (length tree) 1)
65-
        tree
66-
        (let* ((e1 (car tree))
67-
               (tree (cdr tree))
68-
               (e2 (car tree))
69-
               (tree (cdr tree)))
70-
          (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree)))))))
71-
72-
(define (huffman->code huffman)
73-
  (match huffman
74-
    ('() '())
75-
    ((((h1 h2) . weight))
76-
     (append
77-
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
78-
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
79-
    (((h1 h2) . weight)
80-
     (append
81-
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
82-
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
83-
    ((((? char? char) . weight))
84-
     (list (cons char '())))
85-
    (((? char? char) . weight)
86-
     (list (cons char '())))))
87-
88-
(define (bits->bytes b)
89-
  (match b
90-
   ('() '())
91-
   ((a b c d e f g h bs ...)
92-
    (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a))
93-
      (bits->bytes bs)))
94-
   (_ (bits->bytes (append b (make-list (- 8 (length b)) 0))))))
95-
96-
(define (huffman-encode code str)
97-
  (u8-list->bytevector
98-
    (bits->bytes
99-
      (apply append (map (lambda (c) (assoc-ref code c))
100-
                         (append (string->list str) (list #\nul)))))))
101-
102-
(define (byte->bits b)
103-
  (let loop ((i 2) (b b) (by '()))
104-
    (if (eq? i 512)
105-
      by
106-
      (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by)))))
107-
108-
(define (huffman-decode huffman bv)
109-
  (let ((seq (apply append (map byte->bits (bytevector->u8-list bv)))))
110-
    (list->string
111-
      (reverse
112-
        (let loop ((seq seq) (result '()) (huff huffman))
113-
          (if (null? seq)
114-
            result
115-
            (match huff
116-
              ((((h1 h2) . weight))
117-
               (if (= (car seq) 0)
118-
                 (loop (cdr seq) result h1)
119-
                 (loop (cdr seq) result h2)))
120-
              (((h1 h2) . weight)
121-
               (if (= (car seq) 0)
122-
                 (loop (cdr seq) result h1)
123-
                 (loop (cdr seq) result h2)))
124-
              ((((? char? char) . weight))
125-
               (if (equal? char #\nul)
126-
                 result
127-
                 (loop seq (cons char result) huffman)))
128-
              (((? char? char) . weight)
129-
               (if (equal? char #\nul)
130-
                 result
131-
                 (loop seq (cons char result) huffman))))))))))
132-
133-
(define (serialize-huffman huffman)
134-
  (define (serialize huffman)
135-
    (match huffman
136-
      ((((h1 h2) . weight))
137-
       (append '(1) (serialize h1) (serialize h2)))
138-
      (((h1 h2) . weight)
139-
       (append '(1) (serialize h1) (serialize h2)))
140-
      ((((? char? char) . weight))
141-
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))
142-
      (((? char? char) . weight)
143-
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))))
144-
  (u8-list->bytevector (serialize huffman)))
145-
146-
(define (serialize-huffman-string huffman-code)
147-
  (lambda (str pos bv)
148-
    (let ((sbv (huffman-encode huffman-code str)))
149-
      (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
150-
      (+ pos (bytevector-length sbv)))))
151-
152-
(define (huffman-string-size huffman-code)
153-
  (lambda (str)
154-
    (let ((sbv (huffman-encode huffman-code str)))
155-
      (+ (bytevector-length sbv)))))

modules/nani/jmdict/entities.scm unknown status 2

1-
(define-module (nani jmdict entities)
2-
  #:use-module (nani result)
3-
  #:use-module (nani tags)
4-
  #:re-export (get-tag get-points))

modules/nani/jmdict/serialize.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani jmdict serialize)
20-
  #:use-module (nani huffman)
21-
  #:use-module (nani result)
22-
  #:use-module (nani serialize)
23-
  #:use-module (nani trie)
24-
  #:use-module (ice-9 binary-ports)
25-
  #:use-module (rnrs bytevectors)
26-
  #:export (serialize-jmdict
27-
            jmdict-entry-count))
28-
29-
(define (serialize-pointer ptr pos bv)
30-
  (bytevector-u8-set! bv pos (car ptr))
31-
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))
32-
  (+ pos 5))
33-
34-
(define (serialize-trie trie results pos)
35-
  (define (serialize-trie-aux transitions pos)
36-
    (let loop ((pos pos)
37-
               (trs transitions)
38-
               (bvs '()))
39-
      (if (null? trs)
40-
        (cons pos bvs)
41-
        (let* ((next-trie (cdr (car trs)))
42-
               (bv (serialize-trie next-trie results pos))
43-
               (pos (car bv))
44-
               (bv (cdr bv)))
45-
          (loop pos (cdr trs) (append bvs bv))))))
46-
  (trie-position-set! trie pos)
47-
  (let* ((vals-sz (list-size (trie-vals trie) int-size))
48-
         (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f))
49-
         (sz (+ vals-sz 1 trs-sz))
50-
         (bv (make-bytevector sz 0)))
51-
    (serialize-list (map (lambda (pos) (result-position (array-ref results pos)))
52-
                         (trie-vals trie))
53-
                    serialize-int 0 bv)
54-
    (let* ((bvs (serialize-trie-aux
55-
                  (trie-transitions trie)
56-
                  (+ pos (bytevector-length bv))))
57-
           (next-pos (car bvs))
58-
           (bvs (cdr bvs)))
59-
      (bytevector-u8-set! bv vals-sz (length (trie-transitions trie)))
60-
      (serialize-list (trie-transitions trie) serialize-pointer
61-
                      (+ vals-sz 1) bv #:size? #f)
62-
      (cons next-pos (cons bv bvs)))))
63-
64-
(define (serialize-jmdict results kanji-trie reading-trie sense-trie)
65-
  (define kanji-huffman
66-
    (let ((kanjis (apply append (map result-kanjis results))))
67-
      (create-huffman kanjis)))
68-
  (define kanji-huffman-code (huffman->code kanji-huffman))
69-
  (define reading-huffman
70-
    (let* ((readings (apply append (map result-readings results)))
71-
           (readings (apply append (map reading-readings readings))))
72-
      (create-huffman readings)))
73-
  (define reading-huffman-code (huffman->code reading-huffman))
74-
  (define sense-huffman
75-
    (let* ((senses (apply append (map result-senses results)))
76-
           (glosses (apply append (map sense-glosses senses))))
77-
      (create-huffman glosses)))
78-
  (define sense-huffman-code (huffman->code sense-huffman))
79-
80-
  (define (serialize-source source pos bv)
81-
    (when (not (source? source)) (throw 'not-source source))
82-
    (let* ((pos (serialize-list (source-content source) serialize-string pos bv))
83-
           (pos (serialize-boolean (source-wasei? source) pos bv))
84-
           (pos (serialize-string (source-type source) pos bv))
85-
           (pos (serialize-string (source-lang source) pos bv)))
86-
      pos))
87-
  (define (source-size source)
88-
    (when (not (source? source)) (throw 'not-source source))
89-
    (+ (list-size (source-content source) string-size)
90-
       (boolean-size (source-wasei? source))
91-
       (string-size (source-type source))
92-
       (string-size (source-lang source))))
93-
  
94-
  (define (serialize-reading reading pos bv)
95-
    (when (not (reading? reading)) (throw 'not-reading reading))
96-
    (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv))
97-
           (pos (serialize-list (reading-info reading) serialize-string pos bv))
98-
           (pos (serialize-list (reading-readings reading)
99-
                                (serialize-huffman-string reading-huffman-code) pos bv)))
100-
      pos))
101-
  (define (reading-size reading)
102-
    (when (not (reading? reading)) (throw 'not-reading reading))
103-
    (+ (list-size (reading-kanjis reading) string-size)
104-
       (list-size (reading-info reading) string-size)
105-
       (list-size (reading-readings reading) (huffman-string-size reading-huffman-code))))
106-
  
107-
  (define (serialize-sense sense pos bv)
108-
    (when (not (sense? sense)) (throw 'not-sense sense))
109-
    (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv))
110-
           (pos (serialize-list (sense-limits sense) serialize-string pos bv))
111-
           (pos (serialize-list (sense-infos sense) serialize-string pos bv))
112-
           (pos (serialize-list (sense-sources sense) serialize-source pos bv))
113-
           (pos (serialize-list (sense-tags sense) serialize-char pos bv))
114-
           (pos (serialize-list (sense-glosses sense)
115-
                (serialize-huffman-string sense-huffman-code) pos bv))
116-
           (pos (serialize-string (sense-language sense) pos bv)))
117-
      pos))
118-
  (define (sense-size sense)
119-
    (when (not (sense? sense)) (throw 'not-sense sense))
120-
    (+ (list-size (sense-references sense) string-size)
121-
       (list-size (sense-limits sense) string-size)
122-
       (list-size (sense-infos sense) string-size)
123-
       (list-size (sense-sources sense) source-size)
124-
       (list-size (sense-tags sense) char-size)
125-
       (list-size (sense-glosses sense)
126-
                  (huffman-string-size sense-huffman-code))
127-
       (string-size (sense-language sense))))
128-
  
129-
  (define (serialize-result result pos bv)
130-
    (when (not (result? result)) (throw 'not-result result))
131-
    (result-position-set! result pos)
132-
    (let* ((pos (serialize-list (result-kanjis result)
133-
                                (serialize-huffman-string kanji-huffman-code) pos bv))
134-
           (pos (serialize-list (result-readings result) serialize-reading pos bv))
135-
           (pos (serialize-list (result-senses result) serialize-sense pos bv)))
136-
      pos))
137-
  (define (result-size result)
138-
    (when (not (result? result)) (throw 'not-result result))
139-
    (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code))
140-
       (list-size (result-readings result) reading-size)
141-
       (list-size (result-senses result) sense-size)))
142-
143-
  (define (trie-node-size trie)
144-
    (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))
145-
146-
  (let* ((header (string->utf8 "NANI_JMDICT001"))
147-
         (header-size (bytevector-length header))
148-
         (pointers (make-bytevector 12 0))
149-
         (kanji-huffman-bv (serialize-huffman kanji-huffman))
150-
         (kanji-huffman-size (bytevector-length kanji-huffman-bv))
151-
         (reading-huffman-bv (serialize-huffman reading-huffman))
152-
         (reading-huffman-size (bytevector-length reading-huffman-bv))
153-
         (sense-huffman-bv (serialize-huffman sense-huffman))
154-
         (sense-huffman-size (bytevector-length sense-huffman-bv))
155-
         (results-size (list-size results result-size #:size? #f))
156-
         (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size))
157-
         (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size
158-
                                         reading-huffman-size sense-huffman-size
159-
                                         results-size 4))))
160-
    (format #t "Number of nodes in kanjis: ~a~%"
161-
      (trie-node-size kanji-trie))
162-
    (format #t "Number of nodes in readings: ~a~%"
163-
      (trie-node-size reading-trie))
164-
    (format #t "Number of nodes in senses: ~a~%"
165-
      (trie-node-size sense-trie))
166-
    (serialize-list results serialize-result (+ header-size 12 huffman-size)
167-
                    results-bv #:size? #f)
168-
    ;; number of entries
169-
    (serialize-int (length results) (+ header-size 12 huffman-size results-size)
170-
                   results-bv)
171-
    (let* ((results (list->array 1 results))
172-
           (pos (bytevector-length results-bv))
173-
           (kanji-bvs (serialize-trie kanji-trie results pos))
174-
           (pos (car kanji-bvs))
175-
           (reading-bvs (serialize-trie reading-trie results pos))
176-
           (pos (car reading-bvs))
177-
           (meaning-bvs (serialize-trie sense-trie results pos)))
178-
      (bytevector-u32-set! pointers 0 (bytevector-length results-bv) (endianness big))
179-
      (bytevector-u32-set! pointers 4 (car kanji-bvs) (endianness big))
180-
      (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big))
181-
      (bytevector-copy! header 0 results-bv 0 header-size)
182-
      (bytevector-copy! pointers 0 results-bv header-size 12)
183-
      (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size)
184-
      (bytevector-copy! reading-huffman-bv 0 results-bv
185-
                        (+ header-size 12 kanji-huffman-size)
186-
                        reading-huffman-size)
187-
      (bytevector-copy! sense-huffman-bv 0 results-bv
188-
                        (+ header-size 12 kanji-huffman-size reading-huffman-size)
189-
                        sense-huffman-size)
190-
      ;; give some feedback on the size of file's structures
191-
      (format #t "huffmans are ~a bytes long~%" huffman-size)
192-
      (format #t "results is ~a bytes long~%" (bytevector-length results-bv))
193-
      (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs))))
194-
      (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs))))
195-
      (format #t "senses is ~a bytes long~%" (apply + (map bytevector-length (cdr meaning-bvs))))
196-
      (merge-bvs (append (list results-bv) (cdr kanji-bvs) (cdr reading-bvs)
197-
                         (cdr meaning-bvs))))))
198-
199-
(define (jmdict-entry-count file)
200-
  (call-with-input-file file
201-
    (lambda (port)
202-
      (let* ((header (utf8->string (get-bytevector-n port 14)))
203-
             (pointers (get-bytevector-n port 12))
204-
             (kanji-pos (bytevector-u32-ref pointers 0 (endianness big))))
205-
        (seek port (- kanji-pos 4) SEEK_SET)
206-
        (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))))

modules/nani/jmdict/trie.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani jmdict trie)
20-
  #:use-module (nani trie)
21-
  #:use-module (nani result)
22-
  #:use-module (rnrs bytevectors)
23-
  #:export (make-kanji-trie
24-
            make-reading-trie
25-
            make-meaning-trie))
26-
27-
(define (make-key key)
28-
  (apply append
29-
    (map
30-
      (lambda (c)
31-
        (list (quotient c 16) (modulo c 16)))
32-
      (bytevector->u8-list (string->utf8 key)))))
33-
34-
(define (make-kanji-trie results)
35-
  (let ((trie (make-empty-trie)))
36-
    (let loop ((results results) (i 0))
37-
      (if (null? results)
38-
        trie
39-
        (begin
40-
          (for-each
41-
            (lambda (key)
42-
              (add-to-trie! trie (make-key key) i))
43-
            (result-kanjis (car results)))
44-
          (loop (cdr results) (+ i 1)))))))
45-
46-
(define (make-reading-trie results)
47-
  (let ((trie (make-empty-trie)))
48-
    (let loop ((results results) (i 0))
49-
      (if (null? results)
50-
        trie
51-
        (begin
52-
          (for-each
53-
            (lambda (reading)
54-
              (for-each
55-
                (lambda (key)
56-
                  (add-to-trie! trie (make-key key) i))
57-
                (reading-readings reading)))
58-
            (result-readings (car results)))
59-
          (loop (cdr results) (+ i 1)))))))
60-
61-
(define (make-meaning-trie results)
62-
  (let ((trie (make-empty-trie)))
63-
    (let loop ((results results) (i 0))
64-
      (if (null? results)
65-
        trie
66-
        (begin
67-
          (for-each
68-
            (lambda (meaning)
69-
              (for-each
70-
                (lambda (key)
71-
                  (add-to-trie! trie (make-key key) i))
72-
                (sense-glosses meaning)))
73-
            (result-senses (car results)))
74-
          (loop (cdr results) (+ i 1)))))))

modules/nani/jmdict/xml.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani jmdict xml)
20-
  #:use-module (ice-9 match)
21-
  #:use-module (ice-9 rdelim)
22-
  #:use-module (sxml ssax)
23-
  #:use-module (nani frequency)
24-
  #:use-module (sxml simple)
25-
  #:use-module (nani result)
26-
  #:use-module (nani jmdict entities)
27-
  #:export (load-dic xml->results))
28-
29-
(define (load-dic file)
30-
  (xml->sxml (call-with-input-file file read-string)))
31-
32-
(define (sxml->reading lst)
33-
  (let loop ((reading (make-reading '() '() '())) (lst lst))
34-
    (if (null? lst)
35-
      reading
36-
      (loop
37-
        (match (car lst)
38-
          (('reading r) (update-reading reading #:readings (cons r (reading-readings reading))))
39-
          (('info r) (update-reading reading #:info (cons r (reading-info reading))))
40-
          (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading))))
41-
          (((? symbol? s) v) (throw 'unknown-content s v))
42-
          ((? string? _) reading))
43-
        (cdr lst)))))
44-
45-
(define (sxml->source lst)
46-
  (let loop ((source (make-source '() #f "" "")) (lst lst))
47-
    (if (null? lst)
48-
      source
49-
      (loop
50-
        (match (car lst)
51-
          (('content c) (update-source source #:content (cons c (source-content source))))
52-
          (('ls_wasei _) (update-source source #:wasei? #t))
53-
          (('ls_type t) (update-source source #:type t))
54-
          ((('xml . 'lang) l) (update-source source #:lang l))
55-
          (((? symbol? s) v) (throw 'unknown-content s v))
56-
          ((? list? l) (loop source l))
57-
          ((? string? c) (update-source source #:content
58-
                                        (cons c (source-content source)))))
59-
        (cdr lst)))))
60-
61-
(define (sxml->sense lst)
62-
  (let loop ((sense (make-sense '() '() '() '() '() '() "eng")) (lst lst))
63-
    (if (null? lst)
64-
      sense
65-
      (loop
66-
        (match (car lst)
67-
          (('ref (? string? r)) (update-sense sense #:references (cons r (sense-references sense))))
68-
          (('limit (? string? r)) (update-sense sense #:limits (cons r (sense-limits sense))))
69-
          (('info (? string? r)) (update-sense sense #:infos (cons r (sense-infos sense))))
70-
          ((? source? s) (update-sense sense #:sources (cons s (sense-sources sense))))
71-
          (('tag (? integer? r)) (update-sense sense #:tags (cons r (sense-tags sense))))
72-
          (('gloss (? string? r)) (update-sense sense #:glosses (cons r (sense-glosses sense))))
73-
          ((('xml . 'lang) (? string? l)) (update-sense sense #:language l))
74-
          (((? symbol? s) v) (throw 'unknown-content s v))
75-
          ((? list? l) (loop sense l))
76-
          ((? string? _) sense))
77-
        (cdr lst)))))
78-
79-
(define (sxml->result lst frq)
80-
  (let ((result
81-
        (let loop ((result (make-result 0 0 '() '() '())) (lst lst))
82-
          (if (null? lst)
83-
            result
84-
            (loop
85-
              (match (car lst)
86-
                (('kanji kanji)
87-
                 (update-result result #:kanjis (cons kanji (result-kanjis result))))
88-
                ((? reading? r)
89-
                 (update-result result #:readings (cons r (result-readings result))))
90-
                ((? sense? s)
91-
                 (update-result result #:senses (cons s (result-senses result))))
92-
                ((? string? _) result))
93-
              (cdr lst))))))
94-
    (let* ((word (if (null? (result-kanjis result))
95-
                   (car (reading-readings (car (result-readings result))))
96-
                   (car (result-kanjis result))))
97-
           (entity (frequency-entity frq word))
98-
           (sense1 (car (result-senses result)))
99-
           (sense1 (if entity
100-
                       (update-sense sense1
101-
                         #:tags (cons (get-tag entity) (sense-tags sense1)))
102-
                       sense1))
103-
           (senses (cons sense1 (cdr (result-senses result)))))
104-
      (update-result result
105-
        #:points (get-points (update-result result #:senses senses))
106-
        #:senses senses))))
107-
108-
(define (sxml->element lst elem frq)
109-
  (match elem
110-
    ('ent_seq "")
111-
    ('ke_pri "")
112-
    ('re_nokanji "")
113-
    ('re_pri "")
114-
    ('ke_inf "")
115-
    ('misc (if (and (= (length lst) 1) (string? (car lst)))
116-
               `(tag ,(get-tag (car lst)))
117-
               (throw 'invalid-misc lst)))
118-
    ('re_restr (if (and (= (length lst) 1) (string? (car lst)))
119-
                   `(limit ,(car lst))
120-
                   (throw 'invalid-re_restr lst)))
121-
    ('keb (if (and (= (length lst) 1) (string? (car lst)))
122-
              `(kanji ,(car lst))
123-
              (throw 'invalid-keb lst)))
124-
    ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst))
125-
                `(info ,(apply string-append lst))
126-
                (throw 'invalid-s_inf lst)))
127-
    ('dial (if (and (= (length lst) 1) (string? (car lst)))
128-
               `(tag ,(get-tag (car lst)))
129-
               (throw 'invalid-dial lst)))
130-
    ('re_inf (if (and (= (length lst) 1) (string? (car lst)))
131-
                 `(info ,(car lst))
132-
                 (throw 'invalid-re_inf lst)))
133-
    ('stagk (if (and (= (length lst) 1) (string? (car lst)))
134-
                `(limit ,(car lst))
135-
                (throw 'invalid-stagk lst)))
136-
    ('stagr (if (and (= (length lst) 1) (string? (car lst)))
137-
                `(limit ,(car lst))
138-
                (throw 'invalid-stagr lst)))
139-
    ('field (if (and (= (length lst) 1) (string? (car lst)))
140-
                `(tag ,(get-tag (car lst)))
141-
                (throw 'invalid-field lst)))
142-
    ('ant (if (and (= (length lst) 1) (string? (car lst)))
143-
              `(ref ,(car lst))
144-
              (throw 'invalid-ant lst)))
145-
    ('reb (if (and (= (length lst) 1) (string? (car lst)))
146-
              `(reading ,(car lst))
147-
              (throw 'invalid-reb lst)))
148-
    ('r_ele (sxml->reading lst))
149-
    ('k_ele (car (filter list? lst)))
150-
    ('pos (if (and (= (length lst) 1) (string? (car lst)))
151-
              `(tag ,(get-tag (car lst)))
152-
              (throw 'invalid-pos lst)))
153-
    ('xref (if (and (= (length lst) 1) (string? (car lst)))
154-
               `(ref ,(car lst))
155-
               (throw 'invalid-xref lst)))
156-
    ('gloss (cons
157-
              `(gloss ,(apply string-append (filter string? lst)))
158-
              (filter list? lst)))
159-
    ('lsource (sxml->source lst))
160-
    ('sense (sxml->sense lst))
161-
    ('entry (sxml->result lst frq))))
162-
163-
(define (create-parser frq)
164-
  (ssax:make-parser
165-
    NEW-LEVEL-SEED
166-
    (lambda (elem-gi attributes namespaces expected-content seed)
167-
      (map
168-
        (match-lambda
169-
          ((k . v) (list k v)))
170-
        (filter
171-
          (match-lambda
172-
            ((k . v) (not (member k '(g_type)))))
173-
          attributes)))
174-
175-
    FINISH-ELEMENT
176-
    (lambda (elem-gi attributes namespaces parent-seed seed)
177-
      (if (equal? elem-gi 'JMdict)
178-
          seed
179-
          (let* ((seed (reverse seed))
180-
                 (element (sxml->element seed elem-gi frq)))
181-
            (cons element parent-seed))))
182-
183-
    CHAR-DATA-HANDLER
184-
    (lambda (string1 string2 seed)
185-
      (cons (string-append string1 string2) seed))))
186-
187-
(define (xml->results port frq)
188-
  (let ((results (filter result? ((create-parser frq) port '()))))
189-
    (sort results (lambda (a b) (> (result-points a) (result-points b))))))

modules/nani/kanji/radk.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani kanji radk)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (ice-9 match)
22+
  #:use-module (ice-9 peg)
23+
  #:use-module (ice-9 rdelim)
24+
  #:use-module (nani encoding parse)
25+
  #:use-module (nani encoding serialize)
26+
  #:use-module (rnrs bytevectors)
27+
  #:use-module (sxml simple)
28+
  #:export (parse-radk
29+
            get-kanji-stroke
30+
            get-rad-kanji
31+
            get-rad-stroke
32+
            serialize-radk
33+
            kanji-count))
34+
35+
;; PEG parser for the radk file
36+
(define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n"))
37+
(define-peg-pattern space none " ")
38+
(define-peg-pattern return none "\n")
39+
(define-peg-pattern entry all
40+
  (and (ignore "$") space char space num (? (and space name)) (ignore "\n")
41+
       (+ (or char (ignore "\n")))))
42+
(define-peg-pattern num all (+ (or (range #\0 #\9))))
43+
(define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z))))
44+
(define-peg-pattern char all (and (range #\xff #\x10ffff)))
45+
(define-peg-pattern radk-doc body (* (or return comment entry)))
46+
47+
;; parse the file with the peg parser, to low-level alist structure
48+
(define (parse-radk file)
49+
  (peg:tree (match-pattern radk-doc (call-with-input-file file read-string))))
50+
51+
(define (get-rad-kanji content)
52+
  (let loop ((result '()) (content content))
53+
    (match content
54+
      (() result)
55+
      ((('entry ('char radical) ('num stroke) ('char kanji)) content ...)
56+
       (loop (cons (list radical kanji) result)
57+
             content))
58+
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
59+
       (loop (cons (cons radical kanji) result)
60+
             content)))))
61+
62+
(define (get-rad-stroke content)
63+
  (let loop ((result '()) (content content))
64+
    (match content
65+
      (() result)
66+
      ((('entry ('char radical) ('num stroke) ('char kanji)) content ...)
67+
       (loop (cons (cons radical (string->number stroke)) result)
68+
             content))
69+
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
70+
       (loop (cons (cons radical (string->number stroke)) result)
71+
             content)))))
72+
73+
;; return an alist where keys are kanjis, and values the associated stroke count.
74+
(define (get-kanji-stroke file)
75+
  (define strokes (xml->sxml (call-with-input-file file read-string)))
76+
77+
  (match strokes
78+
    (('*TOP* _ ('kanjidic2 content ...))
79+
     (map
80+
       (lambda (entry)
81+
         (let* ((literal (car (assoc-ref entry 'literal)))
82+
                (misc (assoc-ref entry 'misc))
83+
                (misc (filter list? misc))
84+
                (stroke (car (assoc-ref misc 'stroke_count))))
85+
           (cons literal (string->number stroke))))
86+
       (filter
87+
         (lambda (entry)
88+
           (and
89+
             (list? entry)
90+
             (equal? (car entry) 'character)))
91+
         content)))))
92+
93+
(define (serialize-radk rad-kanji rad-stroke kanji-stroke)
94+
  (define (serialize-rad-kanji-element element pos bv)
95+
    (match element
96+
      ((radical kanji ...)
97+
       (let* ((pos (serialize-string (radical-character radical) pos bv)))
98+
         (serialize-string (string-join kanji "") pos bv)))))
99+
  (define (rad-kanji-element-size element)
100+
    (match element
101+
      ((radical kanji ...)
102+
       (+ (string-size (radical-character radical))
103+
          (string-size (string-join kanji ""))))))
104+
105+
  (define (serialize-rad-kanji rad-kanji pos bv)
106+
    ((serialize-list serialize-rad-kanji-element) rad-kanji pos bv))
107+
  (define (rad-kanji-size rad-kanji)
108+
    ((list-size rad-kanji-element-size) rad-kanji))
109+
110+
  (define (serialize-rad-stroke-element element pos bv)
111+
    (match element
112+
      ((radical . stroke)
113+
       (let ((pos (serialize-string (radical-character radical) pos bv)))
114+
         (serialize-char stroke pos bv)))))
115+
  (define (rad-stroke-element-size element)
116+
    (match element
117+
      ((radical . stroke)
118+
       (+ (string-size (radical-character radical)) (char-size stroke)))))
119+
120+
  (define (serialize-rad-stroke rad-stroke pos bv)
121+
    ((serialize-list serialize-rad-stroke-element) rad-stroke pos bv))
122+
  (define (rad-stroke-size rad-stroke)
123+
    ((list-size rad-stroke-element-size) rad-stroke))
124+
125+
  (define (serialize-kanji-stroke-element element pos bv)
126+
    (match element
127+
      ((kanji . stroke)
128+
       (let ((pos (serialize-string kanji pos bv)))
129+
         (serialize-char stroke pos bv)))))
130+
  (define (kanji-stroke-element-size element)
131+
    (match element
132+
      ((kanji . stroke)
133+
       (+ (string-size kanji) (char-size stroke)))))
134+
135+
  (define (serialize-kanji-stroke kanji-stroke pos bv)
136+
    ((serialize-list serialize-kanji-stroke-element) kanji-stroke pos bv))
137+
  (define (kanji-stroke-size kanji-stroke)
138+
    ((list-size kanji-stroke-element-size) kanji-stroke))
139+
140+
  (let* ((header (string->utf8 "NANI_RADK001"))
141+
         (header-size (bytevector-length header))
142+
         (bv (make-bytevector (+ header-size 12
143+
                                 (rad-kanji-size rad-kanji)
144+
                                 (rad-stroke-size rad-stroke)
145+
                                 (kanji-stroke-size kanji-stroke)))))
146+
    (bytevector-copy! header 0 bv 0 header-size)
147+
    (let* ((pos header-size)
148+
           (pos (serialize-rad-kanji rad-kanji pos bv))
149+
           (pos (serialize-rad-stroke rad-stroke pos bv))
150+
           (pos (serialize-kanji-stroke kanji-stroke pos bv)))
151+
      bv)))
152+
153+
(define (radical-character kanji)
154+
  (match kanji
155+
    ("???" "???")
156+
    ("???" "????")
157+
    ("???" "???")
158+
    ("???" "???")
159+
    ("???" "????")
160+
    ("???" "???")
161+
    ("???" "???")
162+
    ("???" "???")
163+
    ("???" "???")
164+
    ("???" "???")
165+
    ("???" "???")
166+
    ("???" "???")
167+
    ("???" "???")
168+
    ("???" "???")
169+
    ("???" "???")
170+
    ("???" "???")
171+
    ("???" "???")
172+
    ("???" "???")
173+
    ("???" "???")
174+
    ("???" "???")
175+
    ("???" "???")
176+
    ("???" "???")
177+
    (_ kanji)))
178+
179+
;; parse a serialized file back to the guile structure
180+
(define (parse file)
181+
  (define (parse-rad-kanji-element port)
182+
    (let ((radical (parse-string port))
183+
          (kanji-list (parse-string port)))
184+
       (cons radical (string->list kanji-list))))
185+
  (define (parse-rad-kanji port)
186+
    ((parse-list parse-rad-kanji-element) port))
187+
188+
  (define (parse-rad-stroke-element port)
189+
    (let ((radical (parse-string port))
190+
          (stroke (parse-char port)))
191+
       (cons radical stroke)))
192+
  (define (parse-rad-stroke port)
193+
    ((parse-list parse-rad-stroke-element) port))
194+
195+
  (define (parse-kanji-stroke-element port)
196+
    (let ((kanji (parse-string port))
197+
          (stroke (parse-char port)))
198+
      (cons kanji stroke)))
199+
  (define (parse-kanji-stroke port)
200+
    ((parse-list parse-kanji-stroke-element) port))
201+
202+
  (call-with-input-file file
203+
    (lambda (port)
204+
      (let* ((header (utf8->string (get-bytevector-n port 12)))
205+
             (rad-kanji (parse-rad-kanji port))
206+
             (rad-stroke (parse-rad-stroke port))
207+
             (kanji-stroke (parse-kanji-stroke port)))
208+
        (list rad-kanji rad-stroke kanji-stroke)))))
209+
210+
(define (get-kanji-list content)
211+
  (let loop ((result '()) (content content))
212+
    (match content
213+
      (() result)
214+
      (((_ kanji ...) content ...)
215+
       (loop (append result (filter (lambda (k) (not (member k result))) kanji))
216+
             content)))))
217+
218+
(define (kanji-count file)
219+
  (match (parse file)
220+
    ((rad-kanji _ _)
221+
     (length (get-kanji-list rad-kanji)))))

modules/nani/parse-result.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani parse-result)
20-
  #:use-module (ice-9 binary-ports)
21-
  #:use-module (rnrs bytevectors)
22-
  #:export (parse-result-list
23-
            parse-result-char
24-
            parse-result-int
25-
            parse-result-boolean
26-
            parse-result-string))
27-
28-
(define* (parse-result-list port parse-result-element)
29-
  (let ((size (bytevector-u16-ref (get-bytevector-n port 2) 0 (endianness big))))
30-
    (let loop ((result '()) (remaining size))
31-
      (if (= remaining 0)
32-
          (reverse result)
33-
          (loop (cons (parse-result-element port) result) (- remaining 1))))))
34-
35-
(define (parse-result-char port)
36-
  (get-u8 port))
37-
38-
(define (parse-result-int port)
39-
  (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))
40-
41-
(define (parse-result-boolean port)
42-
  (= (get-u8 port) 1))
43-
44-
(define (parse-result-string port)
45-
  (define (get-result-string port)
46-
    (let loop ((lu8 '()) (char (get-u8 port)))
47-
      (if (= char 0)
48-
          lu8
49-
          (loop (cons char lu8) (get-u8 port)))))
50-
  (utf8->string (u8-list->bytevector (reverse (get-result-string port)))))

modules/nani/pitch/pitch.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2021 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani pitch pitch)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (ice-9 match)
22+
  #:use-module (nani encoding huffman)
23+
  #:use-module (nani encoding trie)
24+
  #:use-module (rnrs bytevectors)
25+
  #:use-module (srfi srfi-1)
26+
  #:use-module (srfi srfi-9)
27+
  #:export (make-pitch
28+
            pitch?
29+
            pitch-kanji
30+
            pitch-reading
31+
            pitch-accents
32+
33+
            serialize-pitch
34+
            pitch-entry-count))
35+
36+
(define-record-type pitch
37+
  (make-pitch kanji reading accents)
38+
  pitch?
39+
  (kanji   pitch-kanji)
40+
  (reading pitch-reading)
41+
  (accents pitch-accents))
42+
43+
(define (make-trie-key key)
44+
  (append-map
45+
    (lambda (c)
46+
      (list (quotient c 16) (modulo c 16)))
47+
    (bytevector->u8-list (string->utf8 key))))
48+
49+
(define (get-pitch-trie pitches)
50+
  (let ((trie (make-empty-trie)))
51+
    (for-each
52+
      (lambda (pitch)
53+
        (let ((key (string-append (pitch-kanji pitch) (pitch-reading pitch))))
54+
          (for-each
55+
            (lambda (accent)
56+
              (add-to-trie! trie (make-trie-key key) accent))
57+
            (pitch-accents pitch))))
58+
      pitches)
59+
    (compress-trie trie)))
60+
61+
(define (collapse-vals! trie)
62+
  (let ((transitions (trie-transitions trie))
63+
        (vals (map (match-lambda ((? string? s) s) ((? number? i) (number->string i)))
64+
                   (trie-vals trie))))
65+
    (trie-vals-set! trie (string-join vals ", "))
66+
    (for-each collapse-vals! (map cdr transitions))))
67+
68+
(define (collect-vals trie)
69+
  (let ((transitions (trie-transitions trie))
70+
        (vals (trie-vals trie)))
71+
    (cons vals (append-map collect-vals (map cdr transitions)))))
72+
73+
(define (serialize-pitch pitches)
74+
  (let ((trie (get-pitch-trie pitches)))
75+
    (collapse-vals! trie)
76+
    (let* ((huffman (create-huffman (collect-vals trie)))
77+
           (code (huffman->code huffman)))
78+
      (let* ((header (string->utf8 "NANI_PITCH001"))
79+
             (header-size (bytevector-length header))
80+
             (huffman-bv (serialize-huffman huffman))
81+
             (huffman-size (bytevector-length huffman-bv))
82+
             (trie-size ((trie-size-single (huffman-string-size code)) trie))
83+
             (result (make-bytevector (+ header-size 4 huffman-size trie-size))))
84+
        (bytevector-copy! header 0 result 0 header-size)
85+
        (bytevector-u32-set! result header-size (length pitches) (endianness big))
86+
        (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size)
87+
        ((serialize-trie-single (serialize-huffman-string code)
88+
                                (huffman-string-size code))
89+
         trie (+ header-size 4 huffman-size) result)
90+
        result))))
91+
92+
(define (pitch-entry-count file)
93+
  (call-with-input-file file
94+
    (lambda (port)
95+
      ;; header
96+
      (get-bytevector-n port 13)
97+
      ;; size
98+
      (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))

modules/nani/pitch/wadoku.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani pitch wadoku)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (nani pitch pitch)
22+
  #:use-module (srfi srfi-1)
23+
  #:use-module (sxml ssax)
24+
  #:export (xml->pitch))
25+
26+
(define (sxml->element lst elem)
27+
  (let ((elem (match elem
28+
                ((_ . elem) elem)
29+
                (_ elem))))
30+
    (match elem
31+
      ('accent `(pitch . ,(car lst)))
32+
      ('hira `(reading . ,(car lst)))
33+
      ('orth (let ((kanji (filter string? lst)))
34+
               (if (null? kanji) #f `(kanji . ,(car kanji)))))
35+
      ('form
36+
       `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst)))
37+
      ('reading (filter pair? lst))
38+
      ('entry
39+
        (let loop ((lst lst) (kanjis '()) (readings '()) (accents '()))
40+
          (if (null? lst)
41+
              (if (or (and (null? kanjis) (null? readings)) (null? accents))
42+
                  #f
43+
                  (append-map
44+
                    (lambda (reading)
45+
                      (map
46+
                        (lambda (kanji)
47+
                          (make-pitch kanji reading accents))
48+
                        kanjis))
49+
                    readings))
50+
              (match (car lst)
51+
                (('form . f)
52+
                 (loop (append f (cdr lst)) kanjis readings accents))
53+
                (('pitch . pitch)
54+
                 (loop (cdr lst) kanjis readings (cons pitch accents)))
55+
                (('reading . reading)
56+
                 (loop (cdr lst) kanjis (cons reading readings) accents))
57+
                (('kanji . kanji)
58+
                 (loop (cdr lst) (cons kanji kanjis) readings accents))
59+
                (_ (loop (cdr lst) kanjis readings accents))))))
60+
      (_ #f))))
61+
62+
(define parser
63+
  (ssax:make-parser
64+
    NEW-LEVEL-SEED
65+
    (lambda (elem-gi attributes namespaces expected-content seed)
66+
      attributes)
67+
    
68+
    FINISH-ELEMENT
69+
    (lambda (elem-gi attributes namespaces parent-seed seed)
70+
      (if (equal? elem-gi 'entries)
71+
          seed
72+
          (let* ((seed (reverse seed))
73+
                 (element (sxml->element seed elem-gi)))
74+
            (cons element parent-seed))))
75+
    
76+
    CHAR-DATA-HANDLER
77+
    (lambda (string1 string2 seed)
78+
      (cons (string-append string1 string2) seed))))
79+
80+
(define (xml->pitch port)
81+
  (filter pitch? (apply append (filter list? (parser port '())))))

modules/nani/radk.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani radk)
20-
  #:use-module (ice-9 binary-ports)
21-
  #:use-module (ice-9 match)
22-
  #:use-module (ice-9 peg)
23-
  #:use-module (ice-9 rdelim)
24-
  #:use-module (nani parse-result)
25-
  #:use-module (nani serialize)
26-
  #:use-module (rnrs bytevectors)
27-
  #:use-module (sxml simple)
28-
  #:export (parse-radk
29-
            get-kanji-stroke
30-
            get-rad-kanji
31-
            get-rad-stroke
32-
            serialize-radk
33-
            kanji-count))
34-
35-
(define-peg-pattern comment none (and "#" (* (or "\t" (range #\x20 #\x10ffff))) "\n"))
36-
(define-peg-pattern space none " ")
37-
(define-peg-pattern return none "\n")
38-
(define-peg-pattern entry all
39-
  (and (ignore "$") space char space num (? (and space name)) (ignore "\n")
40-
       (+ (or char (ignore "\n")))))
41-
(define-peg-pattern num all (+ (or (range #\0 #\9))))
42-
(define-peg-pattern name none (+ (or (range #\0 #\9) (range #\a #\z) (range #\A #\Z))))
43-
(define-peg-pattern char all (and (range #\xff #\x10ffff)))
44-
(define-peg-pattern radk-doc body (* (or return comment entry)))
45-
46-
(define (parse-radk file)
47-
  (peg:tree (match-pattern radk-doc (call-with-input-file file read-string))))
48-
49-
(define (get-rad-kanji content)
50-
  (let loop ((result '()) (content content))
51-
    (match content
52-
      (() result)
53-
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
54-
       (loop (cons (cons radical kanji) result)
55-
             content)))))
56-
57-
(define (get-rad-stroke content)
58-
  (let loop ((result '()) (content content))
59-
    (match content
60-
      (() result)
61-
      ((('entry ('char radical) ('num stroke) (('char kanji) ...)) content ...)
62-
       (loop (cons (cons radical (string->number stroke)) result)
63-
             content)))))
64-
65-
(define (get-kanji-stroke file)
66-
  (define strokes (xml->sxml (call-with-input-file file read-string)))
67-
68-
  (match strokes
69-
    (('*TOP* _ ('kanjidic2 content ...))
70-
     (map
71-
       (lambda (entry)
72-
         (let* ((literal (car (assoc-ref entry 'literal)))
73-
                (misc (assoc-ref entry 'misc))
74-
                (misc (filter list? misc))
75-
                (stroke (car (assoc-ref misc 'stroke_count))))
76-
           (cons literal (string->number stroke))))
77-
       (filter
78-
         (lambda (entry)
79-
           (and
80-
             (list? entry)
81-
             (equal? (car entry) 'character)))
82-
         content)))))
83-
84-
(define (serialize-radk rad-kanji rad-stroke kanji-stroke)
85-
  (define (serialize-rad-kanji-element element pos bv)
86-
    (match element
87-
      ((radical kanji ...)
88-
       (let* ((pos (serialize-string (radical-character radical) pos bv)))
89-
         (serialize-string (string-join kanji "") pos bv)))))
90-
  (define (rad-kanji-element-size element)
91-
    (match element
92-
      ((radical kanji ...)
93-
       (+ (string-size (radical-character radical))
94-
          (string-size (string-join kanji ""))))))
95-
96-
  (define (serialize-rad-kanji rad-kanji pos bv)
97-
    (serialize-list rad-kanji serialize-rad-kanji-element pos bv))
98-
  (define (rad-kanji-size rad-kanji)
99-
    (list-size rad-kanji rad-kanji-element-size))
100-
101-
  (define (serialize-rad-stroke-element element pos bv)
102-
    (match element
103-
      ((radical . stroke)
104-
       (let ((pos (serialize-string (radical-character radical) pos bv)))
105-
         (serialize-char stroke pos bv)))))
106-
  (define (rad-stroke-element-size element)
107-
    (match element
108-
      ((radical . stroke)
109-
       (+ (string-size (radical-character radical)) (char-size stroke)))))
110-
111-
  (define (serialize-rad-stroke rad-stroke pos bv)
112-
    (serialize-list rad-stroke serialize-rad-stroke-element pos bv))
113-
  (define (rad-stroke-size rad-stroke)
114-
    (list-size rad-stroke rad-stroke-element-size))
115-
116-
  (define (serialize-kanji-stroke-element element pos bv)
117-
    (match element
118-
      ((kanji . stroke)
119-
       (let ((pos (serialize-string kanji pos bv)))
120-
         (serialize-char stroke pos bv)))))
121-
  (define (kanji-stroke-element-size element)
122-
    (match element
123-
      ((kanji . stroke)
124-
       (+ (string-size kanji) (char-size stroke)))))
125-
126-
  (define (serialize-kanji-stroke kanji-stroke pos bv)
127-
    (serialize-list kanji-stroke serialize-kanji-stroke-element pos bv))
128-
  (define (kanji-stroke-size kanji-stroke)
129-
    (list-size kanji-stroke kanji-stroke-element-size))
130-
131-
  (let* ((header (string->utf8 "NANI_RADK001"))
132-
         (header-size (bytevector-length header))
133-
         (bv (make-bytevector (+ header-size 12
134-
                                 (rad-kanji-size rad-kanji)
135-
                                 (rad-stroke-size rad-stroke)
136-
                                 (kanji-stroke-size kanji-stroke)))))
137-
    (bytevector-copy! header 0 bv 0 header-size)
138-
    (let* ((pos header-size)
139-
           (pos (serialize-rad-kanji rad-kanji pos bv))
140-
           (pos (serialize-rad-stroke rad-stroke pos bv))
141-
           (pos (serialize-kanji-stroke kanji-stroke pos bv)))
142-
      bv)))
143-
144-
(define (radical-character kanji)
145-
  (match kanji
146-
    ("???" "???")
147-
    ("???" "????")
148-
    ("???" "???")
149-
    ("???" "???")
150-
    ("???" "????")
151-
    ("???" "???")
152-
    ("???" "???")
153-
    ("???" "???")
154-
    ("???" "???")
155-
    ("???" "???")
156-
    ("???" "???")
157-
    ("???" "???")
158-
    ("???" "???")
159-
    ("???" "???")
160-
    ("???" "???")
161-
    ("???" "???")
162-
    ("???" "???")
163-
    ("???" "???")
164-
    ("???" "???")
165-
    ("???" "???")
166-
    ("???" "???")
167-
    ("???" "???")
168-
    (_ kanji)))
169-
170-
(define (parse-result file)
171-
  (define (parse-result-rad-kanji-element port)
172-
    (let ((radical (parse-result-string port))
173-
          (kanji-list (parse-result-string port)))
174-
       (cons radical (string->list kanji-list))))
175-
  (define (parse-result-rad-kanji port)
176-
    (parse-result-list port parse-result-rad-kanji-element))
177-
178-
  (define (parse-result-rad-stroke-element port)
179-
    (let ((radical (parse-result-string port))
180-
          (stroke (parse-result-char port)))
181-
       (cons radical stroke)))
182-
  (define (parse-result-rad-stroke port)
183-
    (parse-result-list port parse-result-rad-stroke-element))
184-
185-
  (define (parse-result-kanji-stroke-element port)
186-
    (let ((kanji (parse-result-string port))
187-
          (stroke (parse-result-char port)))
188-
      (cons kanji stroke)))
189-
  (define (parse-result-kanji-stroke port)
190-
    (parse-result-list port parse-result-kanji-stroke-element))
191-
192-
  (call-with-input-file file
193-
    (lambda (port)
194-
      (let* ((header (utf8->string (get-bytevector-n port 12)))
195-
             (rad-kanji (parse-result-rad-kanji port))
196-
             (rad-stroke (parse-result-rad-stroke port))
197-
             (kanji-stroke (parse-result-kanji-stroke port)))
198-
        (list rad-kanji rad-stroke kanji-stroke)))))
199-
200-
(define (get-kanji-list content)
201-
  (let loop ((result '()) (content content))
202-
    (match content
203-
      (() result)
204-
      (((_ kanji ...) content ...)
205-
       (loop (append result (filter (lambda (k) (not (member k result))) kanji))
206-
             content)))))
207-
208-
(define (kanji-count file)
209-
  (match (parse-result file)
210-
    ((rad-kanji _ _)
211-
     (length (get-kanji-list rad-kanji)))))

modules/nani/result.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani result)
20-
  #:use-module (srfi srfi-9)
21-
  #:export (make-result
22-
            result?
23-
            result-position
24-
            result-position-set!
25-
            result-points
26-
            result-kanjis
27-
            result-readings
28-
            result-senses
29-
            
30-
            make-reading
31-
            reading?
32-
            reading-kanjis
33-
            reading-info
34-
            reading-readings
35-
            
36-
            make-sense
37-
            sense?
38-
            sense-references
39-
            sense-limits
40-
            sense-infos
41-
            sense-sources
42-
            sense-tags
43-
            sense-glosses
44-
            sense-language
45-
            
46-
            make-source
47-
            source?
48-
            source-content
49-
            source-wasei?
50-
            source-type
51-
            source-lang
52-
            
53-
            update-result
54-
            update-reading
55-
            update-sense
56-
            update-source))
57-
58-
(define-record-type result
59-
  (make-result position points kanjis readings senses)
60-
  result?
61-
  (position result-position result-position-set!) ; integer
62-
  (points result-points) ; integer
63-
  (kanjis result-kanjis) ; string-list
64-
  (readings result-readings) ; reanding-list
65-
  (senses result-senses)) ; sense-list
66-
67-
(define-record-type reading
68-
  (make-reading kanjis info readings)
69-
  reading?
70-
  (kanjis reading-kanjis) ; string-list
71-
  (info reading-info) ; string-list
72-
  (readings reading-readings)) ; string-list
73-
74-
(define-record-type sense
75-
  (make-sense references limits infos sources tags glosses language)
76-
  sense?
77-
  (references sense-references) ; string-list
78-
  (limits sense-limits) ; string-list
79-
  (infos sense-infos) ; string-list
80-
  (sources sense-sources) ; source-list
81-
  (tags sense-tags) ; integer-list
82-
  (glosses sense-glosses) ; string-list
83-
  (language sense-language)) ; string
84-
85-
(define-record-type source
86-
  (make-source content wasei? type lang)
87-
  source?
88-
  (content source-content) ; string-list
89-
  (wasei? source-wasei?) ; boolean
90-
  (type source-type) ; string
91-
  (lang source-lang)) ; string
92-
93-
(define* (update-result result
94-
           #:key (points (result-points result))
95-
                 (kanjis (result-kanjis result))
96-
                 (readings (result-readings result))
97-
                 (senses (result-senses result)))
98-
  (make-result (result-position result) points kanjis readings senses))
99-
100-
(define* (update-reading reading
101-
           #:key (kanjis (reading-kanjis reading))
102-
                 (info (reading-info reading))
103-
                 (readings (reading-readings reading)))
104-
  (make-reading kanjis info readings))
105-
106-
(define* (update-sense sense
107-
           #:key (references (sense-references sense))
108-
                 (limits (sense-limits sense))
109-
                 (infos (sense-infos sense))
110-
                 (sources (sense-sources sense))
111-
                 (tags (sense-tags sense))
112-
                 (glosses (sense-glosses sense))
113-
                 (language (sense-language sense)))
114-
  (make-sense references limits infos sources tags glosses language))
115-
116-
(define* (update-source source
117-
           #:key (content (source-content source))
118-
                 (wasei? (source-wasei? source))
119-
                 (type (source-type source))
120-
                 (lang (source-lang source)))
121-
  (make-source content wasei? type lang))

modules/nani/result/frequency.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani result frequency)
20+
  #:use-module (ice-9 rdelim)
21+
  #:export (load-frequency
22+
            frequency->score))
23+
24+
(define (load-frequency file)
25+
  (call-with-input-file file
26+
    (lambda (port)
27+
      (let loop ((frq '()) (i 1))
28+
        (let* ((line (%read-line port))
29+
               (line (car line)))
30+
          (if (eof-object? line)
31+
            frq
32+
            (let ((content (string-split line #\tab)))
33+
              (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1)))))))))
34+
35+
(define (frequency->score frq word)
36+
  (let ((freq (assoc-ref frq word)))
37+
    (cond
38+
      ((not freq) 0)
39+
      ((< freq 501) 32)
40+
      ((< freq 1001) 16)
41+
      ((< freq 2001) 8)
42+
      ((< freq 5001) 4)
43+
      ((< freq 10001) 2)
44+
      ((< freq 20001) 1)
45+
      (else 0))))

modules/nani/result/jibiki.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani result jibiki)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (ice-9 rdelim)
22+
  #:use-module (nani result frequency)
23+
  #:use-module (nani result result)
24+
  #:use-module (srfi srfi-9)
25+
  #:use-module (sxml ssax)
26+
  #:export (xml->results))
27+
28+
(define (sxml->string lst)
29+
  (if (string? lst)
30+
      lst
31+
      (let loop ((lst lst) (result ""))
32+
        (if (null? lst)
33+
            result
34+
            (loop
35+
              (cdr lst)
36+
              (match (car lst)
37+
                (('lang . _) result)
38+
                (#f result)
39+
                ((? string? s) (string-append result s))
40+
                ((? list? l) (loop l result))))))))
41+
42+
(define (gram->info gram)
43+
  (match gram
44+
    ("adjectif???" "adj. en ku")
45+
    ("adjectif???" "adj. en na")
46+
    ("adjectif?????????" "adj. en na (tari)")
47+
    ("adjectif?????????" "adj. en na (nari)")
48+
    ("contraction" "contr.")
49+
    ("counter" "compteur")
50+
    ("num??rique" "nombre")
51+
    ("particule" "particule")
52+
    ("postposition" "postpos.")
53+
    ("verbe auxiliaire" "v. aux.")
54+
    ("??? pronom" "pronom")
55+
    ("??? adverbe" "adv.")
56+
    ("??? onomatop??e" "onom.")
57+
    ("?????? verbe auxiliaire" "v. aux.")
58+
    ("?????? compteur" "compteur")
59+
    ("??? verbe" "v.")
60+
    ("??? verbe intransitif" "v.i.")
61+
    ("??? verbe transitif" "v.t.")
62+
    ("??? nom" "n.")
63+
    ("??? adjectif" "adj.")
64+
    ("???" "interj.")
65+
    ("?????? interjection" "interj.")
66+
    ("?????? expression" "expr.")
67+
    ("?????? suffixe" "suf.")
68+
    ("?????? suffixe conjonctif" "suf. conj.")
69+
    ("?????? conjonction" "conj.")
70+
    ("?????? pr??fixe" "pref.")
71+
    ("??????" "adj. pr??nominal")
72+
    ("?????? locution adverbiale" "loc. adv.")
73+
    ("?????? locution conjonctive" "loc. conj.")
74+
    ("?????? locution postpositive" "loc. post.")
75+
    ("?????? mot compos??" "mot compos??")
76+
    ("??" #f)
77+
    ("" #f)))
78+
79+
(define (info->info i)
80+
  (match (sxml->string i)
81+
    ("abbreviation" "abbreviation")
82+
    ("archaism" "archa??sme")
83+
    ("children's language" "langage des enfants")
84+
    ("colloquialism" "familier")
85+
    ("derogatory" "terme de m??pris")
86+
    ("familiar language" "familier")
87+
    ("female term or language" "langage des femmes")
88+
    ("honorific or respectful (sonkeigo) language" "terme honorifique")
89+
    ("humble (kenjougo) language" "terme humble")
90+
    ("idiomatic expression" "idiom.")
91+
    ("jocular, humorous term" "comique")
92+
    ("male term or language" "langage des hommes")
93+
    ("obscure term" "obscure")
94+
    ("obsolete term" "obsol??te")
95+
    ("onomatopoeic or mimetic word" "onom.")
96+
    ("polite (teineigo) language" "terme poli")
97+
    ("proverb" "proverbe")
98+
    ("sensitive" "sensible")
99+
    ("slang" "argot")
100+
    ("vulgar expression or word" "vulgaire")
101+
    ("word usually written using kana alone" "habituellement ??crit en kana")
102+
    ("" #f)
103+
    (() #f)
104+
    (_ i)))
105+
106+
(define (sxml->ref lst)
107+
  (let loop ((lst lst) (ref '()))
108+
    (if (null? lst)
109+
        ref
110+
        (loop
111+
          (cdr lst)
112+
          (match (car lst)
113+
            (#f ref)
114+
            (('jpn . jpn) (cons jpn ref)))))))
115+
116+
(define (sxml->reading str)
117+
  (make-reading '() '() (list str)))
118+
119+
(define (sxml->source lst)
120+
  (let loop ((lst lst) (source (make-source '() #f "")))
121+
    (if (null? lst)
122+
        source
123+
        (loop
124+
          (cdr lst)
125+
          (match (car lst)
126+
            ((? string? s)
127+
             (update-source source
128+
               #:content (cons s (source-content source))))
129+
            (('lang . l)
130+
             (update-source source #:lang l))
131+
            ((? list? l)
132+
             (loop l source)))))))
133+
134+
(define (sxml->meaning lst)
135+
  (let loop ((lst lst) (meaning (make-meaning '() '() '() '() '() "fr")))
136+
    (if (null? lst)
137+
        meaning
138+
        (loop
139+
          (cdr lst)
140+
          (match (car lst)
141+
            (#f meaning)
142+
            (('n . _) meaning)
143+
            (('info . info) (update-meaning meaning
144+
                              #:infos (cons info (meaning-infos meaning))))
145+
            (('content . c) (update-meaning meaning
146+
                              #:glosses (append
147+
                                          (map string-downcase
148+
					       (string-split c #\,))
149+
                                          (meaning-glosses meaning))))
150+
            ((? source? s)
151+
             (update-meaning meaning
152+
               #:sources (cons s (meaning-sources meaning))))
153+
            ((? string? _) meaning)
154+
            ((? list? l) (loop l meaning)))))))
155+
156+
(define (sxml->result lst frq)
157+
  (define result
158+
    (let loop ((lst lst) (result (make-result 0 0 '() '() '())) (infos '())
159+
                         (sources '()))
160+
      (if (null? lst)
161+
          (list result infos sources)
162+
          (match (car lst)
163+
            (#f (loop (cdr lst) result infos sources))
164+
            (('id . _) (loop (cdr lst) result infos sources))
165+
            (('identree . _) (loop (cdr lst) result infos sources))
166+
            (('kanji . k)
167+
             (loop (cdr lst)
168+
                   (update-result result
169+
                     #:kanjis (cons k (result-kanjis result)))
170+
                   infos
171+
                   sources))
172+
            ((? reading? r)
173+
             (loop
174+
               (cdr lst)
175+
               (update-result result
176+
                 #:readings (cons r (result-readings result)))
177+
               infos
178+
               sources))
179+
            ((? meaning? s)
180+
             (loop
181+
               (cdr lst)
182+
               (update-result result
183+
                 #:meanings (cons s (result-meanings result)))
184+
               infos
185+
               sources))
186+
            ((? source? s)
187+
             (loop (cdr lst) result infos (cons s sources)))
188+
            (('info . info)
189+
             (loop (cdr lst)
190+
                   result
191+
                   (cons info infos)
192+
                   sources))
193+
            ((? list? l)
194+
             (match (loop l result infos sources)
195+
               ((result infos sources)
196+
                (loop (cdr lst) result infos sources))))))))
197+
198+
  (match result
199+
    ((result infos sources)
200+
     (if (or (and (null? (result-readings result))
201+
                  (null? (result-kanjis result)))
202+
             (null? (result-meanings result)))
203+
         #f
204+
         (let* ((word (if (null? (result-kanjis result))
205+
                          (car (reading-readings
206+
                                 (car (result-readings result))))
207+
                          (car (result-kanjis result))))
208+
                (score (frequency->score frq word))
209+
                (meanings (result-meanings result))
210+
                (meanings
211+
                  (map
212+
                    (lambda (s)
213+
                      (update-meaning s
214+
                        #:infos (append infos (meaning-infos s))
215+
                        #:sources (append sources (meaning-sources s))))
216+
                    meanings)))
217+
           (update-result result
218+
             #:score score
219+
             #:meanings meanings))))))
220+
221+
(define (sxml->element lst elem frq)
222+
  (let ((elem (match elem
223+
                ((_ . elem) elem)
224+
                (_ elem))))
225+
    (match elem
226+
      ('vedette-romaji #f)
227+
      ('vedette-hiragana (sxml->reading (sxml->string lst)))
228+
      ('vedette-jpn `(kanji . ,(sxml->string lst)))
229+
      ('vedette lst)
230+
      ('forme lst)
231+
      ('domaine (if (null? lst) #f
232+
                    (let ((info (sxml->string lst)))
233+
                      (if (or (not info) (string-null? info))
234+
                          #f
235+
                          `(info . ,info)))))
236+
      ('gram (let ((info (gram->info (sxml->string lst))))
237+
               (if (or (not info) (string-null? info))
238+
                   #f
239+
                   `(info . ,info))))
240+
      ('??tiquettes lst)
241+
      ('??tiquettes-sens lst)
242+
      ('texte-sens `(content . ,(sxml->string lst)))
243+
      ('sens (sxml->meaning lst))
244+
      ('bloc-gram lst)
245+
      ('s??mantique lst)
246+
      ('article (sxml->result lst frq))
247+
      ('registre (let ((info (sxml->string lst)))
248+
                   (if (or (not info) (string-null? info))
249+
                       #f
250+
                       `(info . ,info))))
251+
      ('litt??ralement
252+
       (if (null? lst) #f (make-meaning
253+
                            '() '() '() '("lit")
254+
                            (list (sxml->string lst)) "fr")))
255+
      ('etymologie (sxml->source lst))
256+
      ('??tymologie (sxml->source lst))
257+
      ('note (let ((info (sxml->string lst)))
258+
               (if (or (not info) (string-null? info)) #f `(info . ,info))))
259+
      ('info (let ((info (info->info (sxml->string lst))))
260+
               (if (or (not info) (string-null? info)) #f `(info . ,info))))
261+
      ('dialecte (let ((info (sxml->string lst)))
262+
                   (if (or (not info) (string-null? info)) #f `(info . ,info))))
263+
      ('en lst)
264+
      ('eng lst)
265+
      ('xref (sxml->ref lst))
266+
267+
      ('vr #f)
268+
      ('romaji #f)
269+
      ('romajidx #f)
270+
      ('rt #f)
271+
      ('ruby #f)
272+
      ('vj #f)
273+
      ('jpn `(jpn . ,(sxml->string lst)))
274+
      ('??tiquettes-fran??ais #f)
275+
      ('??tiquettes-sous-sens #f)
276+
      ('texte-sous-sens #f)
277+
      ('sous-sens #f)
278+
      ('sous-bloc-gram #f)
279+
      ('fran??ais #f)
280+
      ('exemple #f)
281+
      ('exemples #f))))
282+
283+
(define (create-parser frq)
284+
  (ssax:make-parser
285+
    NEW-LEVEL-SEED
286+
    (lambda (elem-gi attributes namespaces expected-content seed)
287+
      attributes)
288+
    
289+
    FINISH-ELEMENT
290+
    (lambda (elem-gi attributes namespaces parent-seed seed)
291+
      (if (equal? elem-gi 'volume)
292+
          (filter (lambda (a) a) seed)
293+
          (let* ((seed (reverse seed))
294+
                 (element (sxml->element seed elem-gi frq)))
295+
            (cons element parent-seed))))
296+
    
297+
    CHAR-DATA-HANDLER
298+
    (lambda (string1 string2 seed)
299+
      (cons (string-append string1 string2) seed))))
300+
301+
(define (xml->results port frq)
302+
  (let ((results (filter result? ((create-parser frq) port '()))))
303+
    (sort-results results)))

modules/nani/result/jmdict.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani result jmdict)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (ice-9 rdelim)
22+
  #:use-module (nani result frequency)
23+
  #:use-module (nani result result)
24+
  #:use-module (sxml simple)
25+
  #:use-module (sxml ssax)
26+
  #:export (load-dic xml->results))
27+
28+
(define (load-dic file)
29+
  (xml->sxml (call-with-input-file file read-string)))
30+
31+
(define (sxml->reading lst)
32+
  (let loop ((reading (make-reading '() '() '())) (lst lst))
33+
    (if (null? lst)
34+
      reading
35+
      (loop
36+
        (match (car lst)
37+
          (('reading r) (update-reading reading #:readings (cons r (reading-readings reading))))
38+
          (('info r) (update-reading reading #:info (cons r (reading-info reading))))
39+
          (('limit r) (update-reading reading #:kanjis (cons r (reading-kanjis reading))))
40+
          (((? symbol? s) v) (throw 'unknown-content s v))
41+
          ((? string? _) reading))
42+
        (cdr lst)))))
43+
44+
(define (sxml->source lst)
45+
  (let loop ((source (make-source '() #f "")) (lst lst))
46+
    (if (null? lst)
47+
      source
48+
      (loop
49+
        (match (car lst)
50+
          (('content c) (update-source source #:content (cons c (source-content source))))
51+
          (('ls_wasei _) (update-source source #:wasei? #t))
52+
          (('ls_type t) source); type is always "part" or nothing
53+
          ((('xml . 'lang) l) (update-source source #:lang l))
54+
          (((? symbol? s) v) (throw 'unknown-content s v))
55+
          ((? list? l) (loop source l))
56+
          ((? string? c) (update-source source #:content
57+
                                        (cons c (source-content source)))))
58+
        (cdr lst)))))
59+
60+
(define (sxml->meaning lst)
61+
  (let loop ((meaning (make-meaning '() '() '() '() '() "eng")) (lst lst))
62+
    (if (null? lst)
63+
      meaning
64+
      (loop
65+
        (match (car lst)
66+
          (('ref (? string? r)) (update-meaning meaning #:references (cons r (meaning-references meaning))))
67+
          (('limit (? string? r)) (update-meaning meaning #:limits (cons r (meaning-limits meaning))))
68+
          (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning))))
69+
          ((? source? s) (update-meaning meaning #:sources (cons s (meaning-sources meaning))))
70+
          (('info (? string? r)) (update-meaning meaning #:infos (cons r (meaning-infos meaning))))
71+
          (('gloss (? string? r)) (update-meaning meaning
72+
                                    #:glosses
73+
                                    (cons (string-downcase r)
74+
                                          (meaning-glosses meaning))))
75+
          ((('xml . 'lang) (? string? l)) (update-meaning meaning #:language l))
76+
          (((? symbol? s) v) (throw 'unknown-content s v))
77+
          ((? list? l) (loop meaning l))
78+
          ((? string? _) meaning))
79+
        (cdr lst)))))
80+
81+
(define (sxml->result lst frq)
82+
  (let ((result
83+
        (let loop ((result (make-result 0 0 '() '() '())) (lst lst))
84+
          (if (null? lst)
85+
            result
86+
            (loop
87+
              (match (car lst)
88+
                (('kanji kanji)
89+
                 (update-result result #:kanjis (cons kanji (result-kanjis result))))
90+
                ((? reading? r)
91+
                 (update-result result #:readings (cons r (result-readings result))))
92+
                ((? meaning? s)
93+
                 (update-result result #:meanings (cons s (result-meanings result))))
94+
                ((? string? _) result))
95+
              (cdr lst))))))
96+
    (let* ((word (if (null? (result-kanjis result))
97+
                   (car (reading-readings (car (result-readings result))))
98+
                   (car (result-kanjis result))))
99+
           (score (frequency->score frq word))
100+
           (meanings (result-meanings result)))
101+
      (update-result result
102+
        #:score score
103+
        #:meanings meanings))))
104+
105+
(define (field->info field)
106+
  ;; TODO: translate depending on target language
107+
  (match field
108+
    ("agric" "agriculture")
109+
    ("anat" "anatomy")
110+
    ("archeol" "archeology")
111+
    ("archit" "architecture")
112+
    ("art" "art, aesthetics")
113+
    ("astron" "astronomy")
114+
    ("audvid" "audiovisual")
115+
    ("aviat" "aviation")
116+
    ("baseb" "baseball")
117+
    ("biochem" "biochemistry")
118+
    ("biol" "biology")
119+
    ("bot" "botany")
120+
    ("Buddh" "Buddhism")
121+
    ("bus" "business")
122+
    ("chem" "chemistry")
123+
    ("Christn" "Christianity")
124+
    ("cloth" "clothing")
125+
    ("comp" "computing")
126+
    ("cryst" "crystallography")
127+
    ("ecol" "ecology")
128+
    ("econ" "economics")
129+
    ("elec" "electricity, elec. eng.")
130+
    ("electr" "electronics")
131+
    ("embryo" "embryology")
132+
    ("engr" "engineering")
133+
    ("ent" "entomology")
134+
    ("finc" "finance")
135+
    ("fish" "fishing")
136+
    ("food" "food, cooking")
137+
    ("gardn" "gardening, horticulture")
138+
    ("genet" "genetics")
139+
    ("geogr" "geography")
140+
    ("geol" "geology")
141+
    ("geom" "geometry")
142+
    ("go" "go (game)")
143+
    ("golf" "golf")
144+
    ("gramm" "grammar")
145+
    ("grmyth" "Greek mythology")
146+
    ("hanaf" "hanafuda")
147+
    ("horse" "horse racing")
148+
    ("law" "law")
149+
    ("ling" "linguistics")
150+
    ("logic" "logic")
151+
    ("MA" "martial arts")
152+
    ("mahj" "mahjong")
153+
    ("math" "mathematics")
154+
    ("mech" "mechanical engineering")
155+
    ("med" "medicine")
156+
    ("met" "meteorology")
157+
    ("mil" "military")
158+
    ("music" "music")
159+
    ("ornith" "ornithology")
160+
    ("paleo" "paleontology")
161+
    ("pathol" "pathology")
162+
    ("pharm" "pharmacy")
163+
    ("phil" "philosophy")
164+
    ("photo" "photography")
165+
    ("physics" "physics")
166+
    ("physiol" "physiology")
167+
    ("print" "printing")
168+
    ("psy" "psychiatry")
169+
    ("psych" "psychology")
170+
    ("rail" "railway")
171+
    ("Shinto" "Shinto")
172+
    ("shogi" "shogi")
173+
    ("sports" "sports")
174+
    ("stat" "statistics")
175+
    ("sumo" "sumo")
176+
    ("telec" "telecommunications")
177+
    ("tradem" "trademark")
178+
    ("vidg" "video games")
179+
    ("zool" "zoology")))
180+
181+
(define (dial->info dial)
182+
  (match dial
183+
    ("bra" "Brazilian")
184+
    ("hob" "Hokkaido-ben")
185+
    ("ksb" "Kansai-ben")
186+
    ("ktb" "Kantou-ben")
187+
    ("kyb" "Kyoto-ben")
188+
    ("kyu" "Kyuushuu-ben")
189+
    ("nab" "Nagano-ben")
190+
    ("osb" "Osaka-ben")
191+
    ("rkb" "Ryuukyuu-ben")
192+
    ("thb" "Touhoku-ben")
193+
    ("tsb" "Tosa-ben")
194+
    ("tsug" "Tsugaru-ben")))
195+
196+
(define (misc->info misc)
197+
  (match misc
198+
    ("abbr" "abbreviation")
199+
    ("arch" "archaism")
200+
    ("char" "character")
201+
    ("chn" "children's language")
202+
    ("col" "colloquialism")
203+
    ("company" "company name")
204+
    ("creat" "creature")
205+
    ("dated" "dated term")
206+
    ("dei" "deity")
207+
    ("derog" "derogatory")
208+
    ("doc" "document")
209+
    ("ev" "event")
210+
    ("fam" "familiar language")
211+
    ("fem" "female term or language")
212+
    ("fict" "fiction")
213+
    ("form" "formal or literary term")
214+
    ("given" "given name or forename, gender not specified")
215+
    ("group" "group")
216+
    ("hist" "historical term")
217+
    ("hon" "honorific or respectful (sonkeigo) language")
218+
    ("hum" "humble (kenjougo) language")
219+
    ("id" "idiomatic expression")
220+
    ("joc" "jocular, humorous term")
221+
    ("leg" "legend")
222+
    ("m-sl" "manga slang")
223+
    ("male" "male term or language")
224+
    ("myth" "mythology")
225+
    ("net-sl" "Internet slang")
226+
    ("obj" "object")
227+
    ("obs" "obsolete term")
228+
    ("obsc" "obscure term")
229+
    ("on-mim" "onomatopoeic or mimetic word")
230+
    ("organization" "organization name")
231+
    ("oth" "other")
232+
    ("person" "full name of a particular person")
233+
    ("place" "place name")
234+
    ("poet" "poetical term")
235+
    ("pol" "polite (teineigo) language")
236+
    ("product" "product name")
237+
    ("proverb" "proverb")
238+
    ("quote" "quotation")
239+
    ("rare" "rare")
240+
    ("relig" "religion")
241+
    ("sens" "sensitive")
242+
    ("serv" "service")
243+
    ("sl" "slang")
244+
    ("station" "railway station")
245+
    ("surname" "family or surname")
246+
    ("uk" "word usually written using kana alone")
247+
    ("unclass" "unclassified name")
248+
    ("vulg" "vulgar expression or word")
249+
    ("work" "work of art, literature, music, etc. name")
250+
    ("X" "rude or X-rated term")
251+
    ("yoji" "yojijukugo")))
252+
253+
(define (pos->info pos)
254+
  (match pos
255+
    ("adj-f" "noun or verb acting prenominally")
256+
    ("adj-i" "adjective (keiyoushi)")
257+
    ("adj-ix" "adjective (keiyoushi) - yoi/ii class")
258+
    ("adj-kari" "'kari' adjective (archaic)")
259+
    ("adj-ku" "'ku' adjective (archaic)")
260+
    ("adj-na" "adjectival nouns or quasi-adjectives (keiyodoshi)")
261+
    ("adj-nari" "archaic/formal form of na-adjective")
262+
    ("adj-no" "nouns which may take the genitive case particle 'no'")
263+
    ("adj-pn" "pre-noun adjectival (rentaishi)")
264+
    ("adj-shiku" "'shiku' adjective (archaic)")
265+
    ("adj-t" "'taru' adjective")
266+
    ("adv" "adverb (fukushi)")
267+
    ("adv-to" "adverb taking the 'to' particle")
268+
    ("aux" "auxiliary")
269+
    ("aux-adj" "auxiliary adjective")
270+
    ("aux-v" "auxiliary verb")
271+
    ("conj" "conjunction")
272+
    ("cop" "copula")
273+
    ("ctr" "counter")
274+
    ("exp" "expression")
275+
    ("int" "interjection (kandoushi)")
276+
    ("n" "noun (common) (futsuumeishi)")
277+
    ("n-adv" "adverbial noun (fukushitekimeishi)")
278+
    ("n-pr" "proper noun")
279+
    ("n-pref" "noun, used as a prefix")
280+
    ("n-suf" "noun, used as a suffix")
281+
    ("n-t" "noun (temporal) (jisoumeishi)")
282+
    ("num" "numeric")
283+
    ("pn" "pronoun")
284+
    ("pref" "prefix")
285+
    ("prt" "particle")
286+
    ("suf" "suffix")
287+
    ("unc" "unclassified")
288+
    ("v-unspec" "verb unspecified")
289+
    ("v1" "Ichidan verb")
290+
    ("v1-s" "Ichidan verb - kureru special class")
291+
    ("v2a-s" "Nidan verb with 'u' ending (archaic)")
292+
    ("v2b-k" "Nidan verb (upper class) with 'bu' ending (archaic)")
293+
    ("v2b-s" "Nidan verb (lower class) with 'bu' ending (archaic)")
294+
    ("v2d-k" "Nidan verb (upper class) with 'dzu' ending (archaic)")
295+
    ("v2d-s" "Nidan verb (lower class) with 'dzu' ending (archaic)")
296+
    ("v2g-k" "Nidan verb (upper class) with 'gu' ending (archaic)")
297+
    ("v2g-s" "Nidan verb (lower class) with 'gu' ending (archaic)")
298+
    ("v2h-k" "Nidan verb (upper class) with 'hu/fu' ending (archaic)")
299+
    ("v2h-s" "Nidan verb (lower class) with 'hu/fu' ending (archaic)")
300+
    ("v2k-k" "Nidan verb (upper class) with 'ku' ending (archaic)")
301+
    ("v2k-s" "Nidan verb (lower class) with 'ku' ending (archaic)")
302+
    ("v2m-k" "Nidan verb (upper class) with 'mu' ending (archaic)")
303+
    ("v2m-s" "Nidan verb (lower class) with 'mu' ending (archaic)")
304+
    ("v2n-s" "Nidan verb (lower class) with 'nu' ending (archaic)")
305+
    ("v2r-k" "Nidan verb (upper class) with 'ru' ending (archaic)")
306+
    ("v2r-s" "Nidan verb (lower class) with 'ru' ending (archaic)")
307+
    ("v2s-s" "Nidan verb (lower class) with 'su' ending (archaic)")
308+
    ("v2t-k" "Nidan verb (upper class) with 'tsu' ending (archaic)")
309+
    ("v2t-s" "Nidan verb (lower class) with 'tsu' ending (archaic)")
310+
    ("v2w-s" "Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic)")
311+
    ("v2y-k" "Nidan verb (upper class) with 'yu' ending (archaic)")
312+
    ("v2y-s" "Nidan verb (lower class) with 'yu' ending (archaic)")
313+
    ("v2z-s" "Nidan verb (lower class) with 'zu' ending (archaic)")
314+
    ("v4b" "Yodan verb with 'bu' ending (archaic)")
315+
    ("v4g" "Yodan verb with 'gu' ending (archaic)")
316+
    ("v4h" "Yodan verb with 'hu/fu' ending (archaic)")
317+
    ("v4k" "Yodan verb with 'ku' ending (archaic)")
318+
    ("v4m" "Yodan verb with 'mu' ending (archaic)")
319+
    ("v4n" "Yodan verb with 'nu' ending (archaic)")
320+
    ("v4r" "Yodan verb with 'ru' ending (archaic)")
321+
    ("v4s" "Yodan verb with 'su' ending (archaic)")
322+
    ("v4t" "Yodan verb with 'tsu' ending (archaic)")
323+
    ("v5aru" "Godan verb - -aru special class")
324+
    ("v5b" "Godan verb with 'bu' ending")
325+
    ("v5g" "Godan verb with 'gu' ending")
326+
    ("v5k" "Godan verb with 'ku' ending")
327+
    ("v5k-s" "Godan verb - Iku/Yuku special class")
328+
    ("v5m" "Godan verb with 'mu' ending")
329+
    ("v5n" "Godan verb with 'nu' ending")
330+
    ("v5r" "Godan verb with 'ru' ending")
331+
    ("v5r-i" "Godan verb with 'ru' ending (irregular verb)")
332+
    ("v5s" "Godan verb with 'su' ending")
333+
    ("v5t" "Godan verb with 'tsu' ending")
334+
    ("v5u" "Godan verb with 'u' ending")
335+
    ("v5u-s" "Godan verb with 'u' ending (special class)")
336+
    ("v5uru" "Godan verb - Uru old class verb (old form of Eru)")
337+
    ("vi" "intransitive verb")
338+
    ("vk" "Kuru verb - special class")
339+
    ("vn" "irregular nu verb")
340+
    ("vr" "irregular ru verb, plain form ends with -ri")
341+
    ("vs" "noun or participle which takes the aux. verb suru")
342+
    ("vs-c" "su verb - precursor to the modern suru")
343+
    ("vs-i" "suru verb - included")
344+
    ("vs-s" "suru verb - special class")
345+
    ("vt" "transitive verb")
346+
    ("vz" "Ichidan verb - zuru verb (alternative form of -jiru verbs)")))
347+
348+
(define (re_inf->info i)
349+
  (match i
350+
    ("gikun" "gikun (meaning as reading) or jukujikun (special kanji reading)")
351+
    ("ik" "word containing irregular kana usage")
352+
    ("ok" "out-dated or obsolete kana usage")
353+
    ("uK" "word usually written using kanji alone")))
354+
355+
(define (sxml->element lst elem frq)
356+
  (match elem
357+
    ('ent_seq "")
358+
    ('ke_pri "")
359+
    ('re_nokanji "")
360+
    ('re_pri "")
361+
    ('ke_inf "")
362+
    ('misc (if (and (= (length lst) 1) (string? (car lst)))
363+
               (misc->info (car lst))
364+
               (throw 'invalid-misc lst)))
365+
    ('re_restr (if (and (= (length lst) 1) (string? (car lst)))
366+
                   `(limit ,(car lst))
367+
                   (throw 'invalid-re_restr lst)))
368+
    ('keb (if (and (= (length lst) 1) (string? (car lst)))
369+
              `(kanji ,(car lst))
370+
              (throw 'invalid-keb lst)))
371+
    ('s_inf (if (null? (filter (lambda (s) (not (string? s))) lst))
372+
                `(info ,(apply string-append lst))
373+
                (throw 'invalid-s_inf lst)))
374+
    ('dial (if (and (= (length lst) 1) (string? (car lst)))
375+
               (dial->info (car lst))
376+
               (throw 'invalid-dial lst)))
377+
    ('re_inf (if (and (= (length lst) 1) (string? (car lst)))
378+
                 (re_inf->info (car lst))
379+
                 (throw 'invalid-re_inf lst)))
380+
    ('stagk (if (and (= (length lst) 1) (string? (car lst)))
381+
                `(limit ,(car lst))
382+
                (throw 'invalid-stagk lst)))
383+
    ('stagr (if (and (= (length lst) 1) (string? (car lst)))
384+
                `(limit ,(car lst))
385+
                (throw 'invalid-stagr lst)))
386+
    ('field (if (and (= (length lst) 1) (string? (car lst)))
387+
                (field->info (car lst))
388+
                (throw 'invalid-field lst)))
389+
    ('ant (if (and (= (length lst) 1) (string? (car lst)))
390+
              `(ref ,(car lst))
391+
              (throw 'invalid-ant lst)))
392+
    ('reb (if (and (= (length lst) 1) (string? (car lst)))
393+
              `(reading ,(car lst))
394+
              (throw 'invalid-reb lst)))
395+
    ('r_ele (sxml->reading lst))
396+
    ('k_ele (car (filter list? lst)))
397+
    ('pos (if (and (= (length lst) 1) (string? (car lst)))
398+
              (pos->info (car lst))
399+
              (throw 'invalid-pos lst)))
400+
    ('xref (if (and (= (length lst) 1) (string? (car lst)))
401+
               `(ref ,(car lst))
402+
               (throw 'invalid-xref lst)))
403+
    ('gloss (cons
404+
              `(gloss ,(apply string-append (filter string? lst)))
405+
              (filter list? lst)))
406+
    ('lsource (sxml->source lst))
407+
    ('sense (sxml->meaning lst))
408+
    ('entry (sxml->result lst frq))))
409+
410+
(define (create-parser frq)
411+
  (ssax:make-parser
412+
    NEW-LEVEL-SEED
413+
    (lambda (elem-gi attributes namespaces expected-content seed)
414+
      (map
415+
        (match-lambda
416+
          ((k . v) (list k v)))
417+
        (filter
418+
          (match-lambda
419+
            ((k . v) (not (member k '(g_type)))))
420+
          attributes)))
421+
422+
    FINISH-ELEMENT
423+
    (lambda (elem-gi attributes namespaces parent-seed seed)
424+
      (if (equal? elem-gi 'JMdict)
425+
          seed
426+
          (let* ((seed (reverse seed))
427+
                 (element (sxml->element seed elem-gi frq)))
428+
            (cons element parent-seed))))
429+
430+
    CHAR-DATA-HANDLER
431+
    (lambda (string1 string2 seed)
432+
      (cons (string-append string1 string2) seed))))
433+
434+
(define (xml->results port frq)
435+
  (let ((results (filter result? ((create-parser frq) port '()))))
436+
    (sort-results results)))

modules/nani/result/result.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani result result)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (ice-9 match)
22+
  #:use-module (rnrs bytevectors)
23+
  #:use-module (srfi srfi-9)
24+
  #:use-module (nani encoding serialize)
25+
  #:use-module (nani encoding trie)
26+
  #:use-module (nani encoding huffman)
27+
  #:export (make-result
28+
            result?
29+
            result-position
30+
            result-position-set!
31+
            result-score
32+
            result-kanjis
33+
            result-readings
34+
            result-meanings
35+
            
36+
            make-reading
37+
            reading?
38+
            reading-kanjis
39+
            reading-info
40+
            reading-readings
41+
            
42+
            make-meaning
43+
            meaning?
44+
            meaning-references
45+
            meaning-limits
46+
            meaning-sources
47+
            meaning-infos
48+
            meaning-glosses
49+
            meaning-language
50+
            
51+
            make-source
52+
            source?
53+
            source-content
54+
            source-wasei?
55+
            source-lang
56+
            
57+
            update-result
58+
            update-reading
59+
            update-meaning
60+
            update-source
61+
62+
            serialize-result result-size
63+
            serialize-reading reading-size
64+
            serialize-meaning meaning-size
65+
            serialize-source source-size
66+
67+
            serialize-dictionary
68+
            dictionary-entry-count
69+
            sort-results))
70+
71+
(define-record-type result
72+
  (make-result position score kanjis readings meanings)
73+
  result?
74+
  (position result-position result-position-set!) ; integer
75+
  (score result-score) ; integer
76+
  (kanjis result-kanjis) ; string-list
77+
  (readings result-readings) ; reanding-list
78+
  (meanings result-meanings)) ; meaning-list
79+
80+
(define-record-type reading
81+
  (make-reading kanjis info readings)
82+
  reading?
83+
  (kanjis reading-kanjis) ; string-list
84+
  (info reading-info) ; string-list
85+
  (readings reading-readings)) ; string-list
86+
87+
(define-record-type meaning
88+
  (make-meaning references limits sources infos glosses language)
89+
  meaning?
90+
  (references meaning-references) ; string-list
91+
  (limits meaning-limits) ; string-list
92+
  (sources meaning-sources) ; source-list
93+
  (infos meaning-infos) ; string-list
94+
  (glosses meaning-glosses) ; string-list
95+
  (language meaning-language)) ; string
96+
97+
(define-record-type source
98+
  (make-source content wasei? lang)
99+
  source?
100+
  (content source-content) ; string-list
101+
  (wasei? source-wasei?) ; boolean
102+
  (lang source-lang)) ; string
103+
104+
(define* (update-result result
105+
           #:key (score (result-score result))
106+
                 (kanjis (result-kanjis result))
107+
                 (readings (result-readings result))
108+
                 (meanings (result-meanings result)))
109+
  (make-result (result-position result) score kanjis readings meanings))
110+
111+
(define* (update-reading reading
112+
           #:key (kanjis (reading-kanjis reading))
113+
                 (info (reading-info reading))
114+
                 (readings (reading-readings reading)))
115+
  (make-reading kanjis info readings))
116+
117+
(define* (update-meaning meaning
118+
           #:key (references (meaning-references meaning))
119+
                 (limits (meaning-limits meaning))
120+
                 (sources (meaning-sources meaning))
121+
                 (infos (meaning-infos meaning))
122+
                 (glosses (meaning-glosses meaning))
123+
                 (language (meaning-language meaning)))
124+
  (make-meaning references limits sources infos glosses language))
125+
126+
(define* (update-source source
127+
           #:key (content (source-content source))
128+
                 (wasei? (source-wasei? source))
129+
                 (lang (source-lang source)))
130+
  (make-source content wasei? lang))
131+
132+
;; Note how sources are not compressed, that's because they contain more weird characters
133+
;; and represent only a very small fraction of the content, even after compression.
134+
(define (serialize-source source pos bv)
135+
  (when (not (source? source)) (throw 'not-source source))
136+
  (let* ((pos ((serialize-list serialize-string) (source-content source) pos bv))
137+
         (pos (serialize-boolean (source-wasei? source) pos bv))
138+
         (pos (serialize-string (source-lang source) pos bv)))
139+
    pos))
140+
(define (source-size source)
141+
  (when (not (source? source)) (throw 'not-source source))
142+
  (+ ((list-size string-size) (source-content source))
143+
     (boolean-size (source-wasei? source))
144+
     (string-size (source-lang source))))
145+
146+
(define (serialize-reading reading-huffman-code)
147+
  (lambda (reading pos bv)
148+
    (when (not (reading? reading)) (throw 'not-reading reading))
149+
    (let* ((pos ((serialize-list serialize-string) (reading-kanjis reading) pos bv))
150+
           (pos ((serialize-list serialize-string) (reading-info reading) pos bv))
151+
           (pos ((serialize-list (serialize-huffman-string reading-huffman-code))
152+
                 (reading-readings reading) pos bv)))
153+
      pos)))
154+
(define (reading-size reading-huffman-code)
155+
  (lambda (reading)
156+
    (when (not (reading? reading)) (throw 'not-reading reading))
157+
    (+ ((list-size string-size) (reading-kanjis reading))
158+
       ((list-size string-size) (reading-info reading))
159+
       ((list-size (huffman-string-size reading-huffman-code)) (reading-readings reading)))))
160+
161+
(define (serialize-meaning meaning-huffman-code)
162+
  (lambda (meaning pos bv)
163+
    (when (not (meaning? meaning)) (throw 'not-meaning meaning))
164+
    (let* ((pos ((serialize-list serialize-string) (meaning-references meaning) pos bv))
165+
           (pos ((serialize-list serialize-string) (meaning-limits meaning) pos bv))
166+
           (pos ((serialize-list serialize-source) (meaning-sources meaning) pos bv))
167+
           (pos ((serialize-list (serialize-huffman-string meaning-huffman-code))
168+
                 (meaning-infos meaning) pos bv))
169+
           (pos ((serialize-list (serialize-huffman-string meaning-huffman-code))
170+
                 (meaning-glosses meaning) pos bv))
171+
           (pos (serialize-string (meaning-language meaning) pos bv)))
172+
      pos)))
173+
(define (meaning-size meaning-huffman-code)
174+
  (lambda (meaning)
175+
    (when (not (meaning? meaning)) (throw 'not-meaning meaning))
176+
    (+ ((list-size string-size) (meaning-references meaning))
177+
       ((list-size string-size) (meaning-limits meaning))
178+
       ((list-size source-size) (meaning-sources meaning))
179+
       ((list-size (huffman-string-size meaning-huffman-code))
180+
        (meaning-infos meaning))
181+
       ((list-size (huffman-string-size meaning-huffman-code))
182+
        (meaning-glosses meaning))
183+
       (string-size (meaning-language meaning)))))
184+
185+
(define (serialize-result kanji-huffman-code reading-huffman-code meaning-huffman-code)
186+
  (lambda (result pos bv)
187+
    (when (not (result? result)) (throw 'not-result result))
188+
    (result-position-set! result pos)
189+
    (let* ((pos ((serialize-list (serialize-huffman-string kanji-huffman-code))
190+
                 (result-kanjis result) pos bv))
191+
           (pos ((serialize-list (serialize-reading reading-huffman-code))
192+
                 (result-readings result) pos bv))
193+
           (pos ((serialize-list (serialize-meaning meaning-huffman-code))
194+
                 (result-meanings result) pos bv))
195+
           (pos (serialize-char (result-score result) pos bv)))
196+
      pos)))
197+
(define (result-size kanji-huffman-code reading-huffman-code meaning-huffman-code)
198+
  (lambda (result)
199+
    (when (not (result? result)) (throw 'not-result result))
200+
    (+ ((list-size (huffman-string-size kanji-huffman-code)) (result-kanjis result))
201+
       ((list-size (reading-size reading-huffman-code)) (result-readings result))
202+
       ((list-size (meaning-size meaning-huffman-code)) (result-meanings result)))))
203+
204+
;; creating tries
205+
(define (make-key key)
206+
  (apply append
207+
    (map
208+
      (lambda (c)
209+
        (list (quotient c 16) (modulo c 16)))
210+
      (bytevector->u8-list (string->utf8 key)))))
211+
212+
(define (make-kanji-trie results)
213+
  (let ((trie (make-empty-trie)))
214+
    (let loop ((results results) (i 0))
215+
      (if (null? results)
216+
        (compress-trie trie)
217+
        (begin
218+
          (for-each
219+
            (lambda (key)
220+
              (add-to-trie! trie (make-key key) i))
221+
            (result-kanjis (car results)))
222+
          (loop (cdr results) (+ i 1)))))))
223+
224+
(define (make-reading-trie results)
225+
  (let ((trie (make-empty-trie)))
226+
    (let loop ((results results) (i 0))
227+
      (if (null? results)
228+
        (compress-trie trie)
229+
        (begin
230+
          (for-each
231+
            (lambda (reading)
232+
              (for-each
233+
                (lambda (key)
234+
                  (add-to-trie! trie (make-key key) i))
235+
                (reading-readings reading)))
236+
            (result-readings (car results)))
237+
          (loop (cdr results) (+ i 1)))))))
238+
239+
(define (make-meaning-trie results)
240+
  (let ((trie (make-empty-trie)))
241+
    (let loop ((results results) (i 0))
242+
      (if (null? results)
243+
        (compress-trie trie)
244+
        (begin
245+
          (for-each
246+
            (lambda (meaning)
247+
              (for-each
248+
                (lambda (key)
249+
                  (add-to-trie! trie (make-key key) i))
250+
                (meaning-glosses meaning)))
251+
            (result-meanings (car results)))
252+
          (loop (cdr results) (+ i 1)))))))
253+
254+
(define (update-trie-pos! trie results)
255+
  (let* ((vals (trie-vals trie))
256+
         (vals (map (lambda (i) (result-position (array-ref results i))) vals)))
257+
    (trie-vals-set! trie vals))
258+
  (for-each
259+
    (match-lambda
260+
      ((char . child)
261+
       (update-trie-pos! child results)))
262+
    (trie-transitions trie)))
263+
264+
(define (serialize-dictionary results)
265+
  (define kanji-huffman
266+
    (let ((kanjis (apply append (map result-kanjis results))))
267+
      (create-huffman kanjis)))
268+
  (define kanji-huffman-code (huffman->code kanji-huffman))
269+
  (define reading-huffman
270+
    (let* ((readings (apply append (map result-readings results)))
271+
           (readings (apply append (map reading-readings readings))))
272+
      (create-huffman readings)))
273+
  (define reading-huffman-code (huffman->code reading-huffman))
274+
  (define meaning-huffman
275+
    (let* ((meanings (apply append (map result-meanings results)))
276+
           (infos (apply append (map meaning-infos meanings)))
277+
           (glosses (apply append (map meaning-glosses meanings))))
278+
      (create-huffman (append infos glosses))))
279+
  (define meaning-huffman-code (huffman->code meaning-huffman))
280+
281+
  (define (trie-node-size trie)
282+
    (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))
283+
284+
  (let* ((header (string->utf8 "NANI_JMDICT003"))
285+
         (header-size (bytevector-length header))
286+
         (pointers (make-bytevector 16 0))
287+
         (kanji-huffman-bv (serialize-huffman kanji-huffman))
288+
         (kanji-huffman-size (bytevector-length kanji-huffman-bv))
289+
         (reading-huffman-bv (serialize-huffman reading-huffman))
290+
         (reading-huffman-size (bytevector-length reading-huffman-bv))
291+
         (meaning-huffman-bv (serialize-huffman meaning-huffman))
292+
         (meaning-huffman-size (bytevector-length meaning-huffman-bv))
293+
         (serialize-trie (serialize-trie serialize-int int-size))
294+
         (trie-size (trie-size int-size))
295+
         (kanji-trie (make-kanji-trie results))
296+
         (kanji-trie-size (trie-size kanji-trie))
297+
         (reading-trie (make-reading-trie results))
298+
         (reading-trie-size (trie-size reading-trie))
299+
         (meaning-trie (make-meaning-trie results))
300+
         (meaning-trie-size (trie-size meaning-trie))
301+
         (trie-sizes (+ kanji-trie-size reading-trie-size meaning-trie-size))
302+
         (results-size
303+
           ((list-size (result-size kanji-huffman-code reading-huffman-code
304+
                                    meaning-huffman-code)
305+
                       #:size? #f)
306+
            results))
307+
         (huffman-size (+ reading-huffman-size meaning-huffman-size kanji-huffman-size))
308+
         (pos-kanji (+ header-size 16 kanji-huffman-size reading-huffman-size
309+
                       meaning-huffman-size results-size 4))
310+
         (bv (make-bytevector (+ header-size 16 kanji-huffman-size
311+
                                 reading-huffman-size
312+
                                 meaning-huffman-size
313+
                                 results-size 4 trie-sizes))))
314+
    (format #t "Number of nodes in kanjis: ~a~%"
315+
      (trie-node-size kanji-trie))
316+
    (format #t "Number of nodes in readings: ~a~%"
317+
      (trie-node-size reading-trie))
318+
    (format #t "Number of nodes in meanings: ~a~%"
319+
      (trie-node-size meaning-trie))
320+
    ((serialize-list (serialize-result kanji-huffman-code reading-huffman-code
321+
                                       meaning-huffman-code)
322+
                     #:size? #f)
323+
     results (+ header-size 16 huffman-size) bv)
324+
    ;; Serializing results also updated result-pos for each of them
325+
    (let ((results (list->array 1 results)))
326+
      (update-trie-pos! kanji-trie results)
327+
      (update-trie-pos! reading-trie results)
328+
      (update-trie-pos! meaning-trie results))
329+
    ;; number of entries
330+
    (serialize-int (length results) (+ header-size 16 huffman-size results-size)
331+
                   bv)
332+
    (let* ((results (list->array 1 results))
333+
           (pos pos-kanji)
334+
           (pos (serialize-trie kanji-trie pos bv))
335+
           (pos-reading pos)
336+
           (pos (serialize-trie reading-trie pos bv))
337+
           (pos-meaning pos)
338+
           (pos (serialize-trie meaning-trie pos bv)))
339+
      ;; Point to the trie structures
340+
      (bytevector-u32-set!
341+
        pointers 0
342+
        (+ header-size 16 huffman-size results-size (int-size 0))
343+
        (endianness big))
344+
      ;; point to the kanji trie structure
345+
      (bytevector-u32-set! pointers 4 pos-kanji (endianness big))
346+
      ;; point to the reading trie structure
347+
      (bytevector-u32-set! pointers 8 pos-reading (endianness big))
348+
      ;; point to the meaning trie structure
349+
      (bytevector-u32-set! pointers 12 pos-meaning (endianness big))
350+
      ;; copy to result bytevector
351+
      (bytevector-copy! header 0 bv 0 header-size)
352+
      (bytevector-copy! pointers 0 bv header-size 16)
353+
      (bytevector-copy! kanji-huffman-bv 0 bv (+ header-size 16) kanji-huffman-size)
354+
      (bytevector-copy! reading-huffman-bv 0 bv
355+
                        (+ header-size 16 kanji-huffman-size)
356+
                        reading-huffman-size)
357+
      (bytevector-copy! meaning-huffman-bv 0 bv
358+
                        (+ header-size 16 kanji-huffman-size reading-huffman-size)
359+
                        meaning-huffman-size)
360+
      ;; give some feedback on the size of file's structures
361+
      (format #t "huffmans are ~a bytes long~%" huffman-size)
362+
      (format #t "results is ~a bytes long~%" results-size)
363+
      (format #t "kanji trie is ~a bytes long~%" kanji-trie-size)
364+
      (format #t "reading trie is ~a bytes long~%" reading-trie-size)
365+
      (format #t "meaning trie is ~a bytes long~%" meaning-trie-size)
366+
      bv)))
367+
368+
(define (dictionary-entry-count file)
369+
  (call-with-input-file file
370+
    (lambda (port)
371+
      (let* ((header (utf8->string (get-bytevector-n port 14)))
372+
             (pointers (get-bytevector-n port 16))
373+
             (end-pos (bytevector-u32-ref pointers 0 (endianness big))))
374+
        (seek port (- end-pos 4) SEEK_SET)
375+
        (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))))
376+
377+
(define (sort-results results)
378+
  (define (get-string res)
379+
    (if (null? (result-kanjis res))
380+
        (car (reading-readings (car (result-readings res))))
381+
        (car (result-kanjis res))))
382+
  (sort
383+
    results
384+
    (lambda (a b)
385+
      (cond
386+
        ((> (result-score a) (result-score b)) #t)
387+
        ((= (result-score a) (result-score b))
388+
         (string>? (get-string a) (get-string b)))))))

modules/nani/result/wadoku.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani result wadoku)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (ice-9 rdelim)
22+
  #:use-module (nani result frequency)
23+
  #:use-module (nani result result)
24+
  #:use-module (srfi srfi-9)
25+
  #:use-module (sxml ssax)
26+
  #:export (xml->results))
27+
28+
(define (usg->infos lst)
29+
  (let loop ((infos '()) (lst lst))
30+
    (if (null? lst)
31+
        infos
32+
        (loop
33+
          (match (car lst)
34+
            (('reg . reg) (cons reg infos))
35+
            (('type . type)
36+
             (if (equal? type "abrev")
37+
                 (cons type infos)
38+
                 infos))
39+
            ((? string? info) (cons info infos))
40+
            (_ infos))
41+
          (cdr lst)))))
42+
43+
(define (ruigo->ref lst)
44+
  (let loop ((ref #f) (lst lst))
45+
    (match lst
46+
      (() ref)
47+
      ((('id id) lst ...)
48+
       (loop id lst))
49+
      (((a . b) lst ...)
50+
       (loop (loop ref (list (list a b))) lst))
51+
      (((? list? l) lst ...)
52+
       (loop (loop ref l) lst)))))
53+
54+
(define (merge-meanings s1 s2)
55+
  (update-meaning s2
56+
    #:references (append (meaning-references s1) (meaning-references s2))
57+
    #:infos (append (meaning-infos s1) (meaning-infos s2))
58+
    #:glosses (append (meaning-glosses s1) (meaning-glosses s2))))
59+
60+
(define (sxml->meaning lst)
61+
  (let loop ((meaning (make-meaning '() '() '() '() '() "ger")) (lst lst))
62+
    (if (null? lst)
63+
      meaning
64+
      (loop
65+
        (match (car lst)
66+
          (#f meaning)
67+
          ((? meaning? s)
68+
           (merge-meanings s meaning))
69+
          ((? source? s)
70+
           (update-meaning meaning #:sources (cons s (meaning-sources meaning))))
71+
          (('ref (? string? r))
72+
           (update-meaning meaning #:references (cons r (meaning-references meaning))))
73+
          (('info (? string? r))
74+
           (update-meaning meaning #:infos (cons r (meaning-infos meaning))))
75+
          (('infos (? string? r))
76+
           (update-meaning meaning #:infos (cons r (meaning-infos meaning))))
77+
          (('infos (? list? r))
78+
           (update-meaning meaning #:infos (append r (meaning-infos meaning))))
79+
          (('trans (? string? r))
80+
           (update-meaning meaning #:glosses (cons (string-downcase r)
81+
                                               (meaning-glosses meaning))))
82+
          (('related . _) meaning)
83+
          (('transcr . _) meaning)
84+
          (('pitch . _) meaning)
85+
          (((? symbol? s) v)
86+
           (throw 'unknown-symbol s v))
87+
          ((? list? l) (loop meaning l))
88+
          ((? string? _) meaning))
89+
        (cdr lst)))))
90+
91+
;; TODO
92+
(define (sxml->ref lst)
93+
  (define (sxml->ref-name lst)
94+
    (let loop ((result #f) (lst lst))
95+
      (if (null? lst)
96+
        result
97+
        (loop
98+
          (match (car lst)
99+
            (('id . id) id)
100+
            (((? symbol? s) . _) result)
101+
            (((? symbol? s) _) result)
102+
            ((? list? l) (loop result l))
103+
            (_ result))
104+
          (cdr lst)))))
105+
  (let ((ref (sxml->ref-name lst)))
106+
    (if (string? ref)
107+
        `(ref ,ref)
108+
        (throw 'no-ref ref 'from lst))))
109+
110+
(define (sxml->source lst)
111+
  (let loop ((source (make-source '() #f "")) (lst lst))
112+
    (if (null? lst)
113+
      source
114+
      (loop
115+
        (match (car lst)
116+
          (('impli impli) (update-source source #:lang impli))
117+
          (('foreign foreign)
118+
           (update-source source
119+
             #:content (if (list? foreign)
120+
                           (append foreign (source-content source))
121+
                           (cons foreign (source-content source)))))
122+
          ((? list? l) (loop source l))
123+
          (_ source))
124+
        (cdr lst)))))
125+
126+
(define (sxml->reading lst)
127+
  (let loop ((reading (make-reading '() '() '())) (lst lst))
128+
    (if (null? lst)
129+
      reading
130+
      (loop
131+
        (match (car lst)
132+
          (('reading r)
133+
           (update-reading reading #:readings (cons r (reading-readings reading))))
134+
          (_ reading))
135+
        (cdr lst)))))
136+
137+
(define (sxml->result sxml frq)
138+
  (define (sxml->result-aux sxml)
139+
    (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml))
140+
      (if (null? lst)
141+
        result
142+
        (match (car lst)
143+
          (('kanji kanji)
144+
           (loop
145+
             (update-result result #:kanjis (cons kanji (result-kanjis result)))
146+
             last-source (cdr lst)))
147+
          ((? reading? reading)
148+
           (loop
149+
             (update-result result #:readings (cons reading (result-readings result)))
150+
             last-source (cdr lst)))
151+
          ((? meaning? meaning)
152+
           (loop
153+
             (update-result result
154+
               #:meanings
155+
               (cons
156+
                 (if last-source
157+
                     (update-meaning meaning
158+
                       #:sources (cons last-source (meaning-sources meaning)))
159+
                     meaning)
160+
                 (result-meanings result)))
161+
             last-source (cdr lst)))
162+
          ((? list? l)
163+
           (loop (loop result last-source l) last-source (cdr lst)))
164+
          (_ (loop result last-source (cdr lst)))))))
165+
  (let* ((result (sxml->result-aux sxml))
166+
         (word (if (null? (result-kanjis result))
167+
                   (car (reading-readings (car (result-readings result))))
168+
                   (car (result-kanjis result))))
169+
         (score (frequency->score frq word))
170+
         (meanings (result-meanings result)))
171+
    (update-result result
172+
      #:score score
173+
      #:meanings meanings)))
174+
175+
(define (meishi->info lst)
176+
  (match (assoc-ref lst 'suru)
177+
    ("both" "N., mit suru intrans. od. trans.")
178+
    ("intrans" "N., mit suru intrans.")
179+
    ("trans" "N., mit suru trans.")))
180+
181+
(define (keiyoushi->info lst)
182+
  (cond
183+
    ((equal? (assoc-ref lst 'ku) "true")
184+
     "Adj. auf ???ku")
185+
    ((equal? (assoc-ref lst 'shiku) "true")
186+
     "Adj. auf -shiku")))
187+
188+
(define (keiyoudoushi->info lst)
189+
  (if (equal? (assoc-ref lst 'nari) "true")
190+
      "Na.???Adj. mit nari"
191+
      "Na.???Adj. mit na od. no"))
192+
193+
(define (fukushi->info lst)
194+
  (define (get attr)
195+
    (match (assoc-ref lst attr)
196+
      ("false" #f)
197+
      ("true" #t)
198+
      (v v)))
199+
  (string-append
200+
    "Adv"
201+
    (if (get 'ni) ", mit ni und Adn. mit naru" "")
202+
    (if (get 'to)
203+
        (if (get 'naru)
204+
            ", mit to und Adn. mit taru"
205+
            ", mit to")
206+
        "")
207+
    (match (get 'suru)
208+
      ("intrans" ", mit suru intrans. V.")
209+
      ("trans" ", mit suru trans. V.")
210+
      ("both" ", mit suru trans. od. intrans. V.")
211+
      (_ ""))))
212+
213+
(define (doushi->info lst)
214+
  (define transitivity-str
215+
    (match (assoc-ref lst 'transitivity)
216+
      ("intrans" "intrans.")
217+
      ("trans" "trans.")
218+
      ("both" "trans. od. intrans.")))
219+
220+
  (define onbin (assoc-ref lst 'onbin))
221+
222+
  (match (assoc-ref lst 'level)
223+
    ("kuru" (string-append "unregelm. " transitivity-str " V. auf ka"))
224+
    ("ra" (string-append transitivity-str " V. auf -ra"))
225+
    ("suru" (string-append transitivity-str " V. auf -suru"))
226+
    ("1e" (string-append "1???st. " transitivity-str " V. auf -e"))
227+
    ("1i" (string-append "1???st. " transitivity-str " V. auf -i"))
228+
    ("2e" (string-append "2???st. " transitivity-str " V. auf -e bzw. -u"))
229+
    ("2i" (string-append "2???st. " transitivity-str " V. auf -i bzw. -u"))
230+
    ("4" (string-append "4???st. " transitivity-str " V."))
231+
    ("5" (string-append "5-st. " transitivity-str " V."
232+
            (match (assoc-ref lst 'godanrow)
233+
              ("ba" (if onbin
234+
                        " auf -ba mit regelm. Nasal-Onbin = ???nde"
235+
                        " auf -ba"))
236+
              ("ga" (if onbin
237+
                        " auf -ga mit regelm. i-Onbin = ???ide"
238+
                        " auf -ga"))
239+
              ("ka_i_yu" (if onbin
240+
                             " auf -ka mit Geminaten-Onbin = ???tte"
241+
                             " auf -ka"))
242+
              ("ka" (if onbin
243+
                        " auf -ka mit i-Onbin = ???ite"
244+
                        " auf -ka"))
245+
              ("ma" (if onbin
246+
                        " auf -ma regelm. Nasal-Onbin = ???nde"
247+
                        " auf -ma"))
248+
              ("na" (if onbin
249+
                        " auf -na mit regelm. Nasal-Onbin = ???nde"
250+
                        " auf -na"))
251+
              ("ra_i" (if onbin
252+
                          " auf -ra, Sonderform mit Reny????kei ???i"
253+
                          " auf -ra"))
254+
              ("ra" (if onbin
255+
                        " auf -ra mit regelm. Geminaten-Onbin = ???tte"
256+
                        " auf -ra"))
257+
              ("sa" " auf -sa")
258+
              ("ta" (if onbin
259+
                        " auf -ta mit regelm. Geminaten-Onbin = ???tte"
260+
                        " auf -ta"))
261+
              ("wa" (if onbin
262+
                        " auf -[w]a mit Geminaten-Onbin = ???tte"
263+
                        " auf -[w]a"))
264+
              ("wa_o" (if onbin
265+
                          " auf -[w]a mit u-Onbin = ?????/??te"
266+
                          " auf -[w]a"))
267+
              (#f ""))))))
268+
269+
(define (gram->info lst)
270+
  (map
271+
    (lambda (gram)
272+
      (match gram
273+
        (('daimeishi) "Pron.")
274+
        (('doushi doushi ...) (doushi->info doushi))
275+
        (('fukujoshi) "adv. Part.")
276+
        (('fukushi) "Adv.")
277+
        (('fukushi fukushi ...) (fukushi->info fukushi))
278+
        (('jodoushi) "Hilfsv.")
279+
        (('joshi) "Part.")
280+
        (('kakarijoshi) "Themenpart.")
281+
        (('kakujoshi) "Kasuspart.")
282+
        (('kandoushi) "Interj.")
283+
        (('kanji) "Kanji")
284+
        (('keiyoudoushi) "Na.???Adj.")
285+
        (('keiyoudoushi keiyoudoushi ...) (keiyoudoushi->info keiyoudoushi))
286+
        (('keiyoushi) "Adj.")
287+
        (('keiyoushi keiyoushi ...) (keiyoushi->info keiyoushi))
288+
        (('meishi) "N.")
289+
        (('meishi meishi ...) (meishi->info meishi))
290+
        (('prefix) "Pr??f.")
291+
        (('rengo) "Zus.")
292+
        (('rentaishi) "Adn")
293+
        (('setsuzokujoshi) "konjunktionale Part.")
294+
        (('setsuzokushi) "Konj.")
295+
        (('shuujoshi) "satzbeendende Part.")
296+
        (('specialcharacter) "Sonderzeichen")
297+
        (('suffix) "Suff.")
298+
        (('wordcomponent) "Wortkomp.")))
299+
    lst))
300+
301+
(define (sxml->string lst)
302+
  (define (sub-loop loop infos result lst l)
303+
    (let ((result (loop infos result l)))
304+
      (if (list? result)
305+
          (loop (append infos (filter list? result))
306+
                (apply string-append (filter string? result))
307+
                lst)
308+
          (loop infos result lst))))
309+
310+
  (let loop ((infos '()) (result "") (lst lst))
311+
    (match lst
312+
      (() (if (null? infos)
313+
              result
314+
              (append infos (list result))))
315+
      ((? string? s)
316+
       (loop infos (string-append result s) '()))
317+
      (((? string? s) lst ...)
318+
       (loop infos (string-append result s) lst))
319+
      ((('prior . _) lst ...)
320+
       (loop infos result lst))
321+
      ((('options . _) lst ...)
322+
       (loop infos result lst))
323+
      ((('firstname . _) lst ...)
324+
       (loop infos result lst))
325+
      ((('ausn . _) lst ...)
326+
       (loop infos result lst))
327+
      ((('lang . _) lst ...)
328+
       (loop infos result lst))
329+
      ((('meta . _) lst ...)
330+
       (loop infos result lst))
331+
      ((('genki . genki) lst ...)
332+
       ;(loop (cons `(info ,genki) infos) result lst))
333+
       (loop infos result lst))
334+
      ((('jlpt . jlpt) lst ...)
335+
       (loop (cons `(info ,(string-append "jlpt-" jlpt)) infos) result lst))
336+
      ((('ref . (? string? ref)) lst ...)
337+
       (loop (cons `(ref ,ref) infos) result lst))
338+
      ((('ref (? string? ref)) lst ...)
339+
       (loop (cons `(ref ,ref) infos) result lst))
340+
      ((('jap l) lst ...)
341+
       (sub-loop loop infos result lst l))
342+
      ((('foreign l) lst ...)
343+
       (sub-loop loop infos result lst l))
344+
      ((('transcr l) lst ...)
345+
       (sub-loop loop infos result lst l))
346+
      ((((? symbol? s) v) lst ...)
347+
       (throw 'unsupported-symbol s v))
348+
      ((((? symbol? s) . v) lst ...)
349+
       (throw 'unsupported-symbol-pair s v))
350+
      (((? list? l) lst ...)
351+
       (sub-loop loop infos result lst l)))))
352+
353+
(define (sxml->element lst elem frq)
354+
  (let ((elem (match elem
355+
                ((_ . elem) elem)
356+
                (_ elem))))
357+
    (match elem
358+
      ('orth (let ((kanji (filter string? lst)))
359+
               (if (null? kanji)
360+
                   #f
361+
                   `(kanji
362+
                      ,(string-filter
363+
                         (lambda (c)
364+
                           ;; Remove characters that interfere with kanji
365+
                           ;; based search
366+
                           (not (member c '(#\??? #\??? #\??? #\??? #\??? #\??? #\???
367+
                                            #\?? #\???))))
368+
                         (car kanji))))))
369+
      ('count #f)
370+
      ('entry (sxml->result lst frq))
371+
      ('hira `(reading ,(car lst)))
372+
      ('hatsuon `(hatsuon ,(car lst)))
373+
      ('accent `(pitch ,(car lst)))
374+
      ('reading (sxml->reading lst))
375+
      ('form lst)
376+
      ('impli `(impli ,(car lst)))
377+
      ('text (let loop ((text "") (lst lst))
378+
               (match lst
379+
                 (() text)
380+
                 ((('hasPrecedingSpace . _) lst ...)
381+
                  (loop (string-append " " text) lst))
382+
                 ((('hasFollowingSpace . _) lst ...)
383+
                  (string-append (loop text lst) " "))
384+
                 (((? string? s) lst ...)
385+
                  (loop (string-append text s) lst)))))
386+
      ('famn (sxml->string lst))
387+
      ('expl (sxml->string lst))
388+
      ('expli (filter list? lst))
389+
      ('abbrev (filter list? lst))
390+
      ('token (filter string? lst))
391+
      ('tr (sxml->string lst))
392+
      ('transcr `(transcr ,lst))
393+
      ('trans
394+
        (append (filter list? lst)
395+
                (map (lambda (s) `(trans ,s)) (filter string? lst))))
396+
      ('jap `(jap ,lst))
397+
      ('emph lst)
398+
      ('title (filter string? lst))
399+
      ('transl (filter string? lst))
400+
      ('topic (filter string? lst))
401+
      ('iron (filter string? lst))
402+
      ('specchar (filter string? lst))
403+
      ('scientif (filter string? lst))
404+
      ('wikide #f)
405+
      ('wikija #f)
406+
      ('link #f)
407+
      ('ref (sxml->ref lst))
408+
      ('sref (sxml->ref lst))
409+
      ('etym (sxml->source lst))
410+
      ('literal (sxml->string (list "???" (sxml->string lst) "???")))
411+
      ('def (sxml->string (list "(" (sxml->string lst) ")")))
412+
      ('date (sxml->string (list "(" (sxml->string lst) ")")))
413+
      ('birthdeath (sxml->string (list "(" (sxml->string lst) ")")))
414+
      ('descr (sxml->string (list "(" (sxml->string lst) ")")))
415+
      ('bracket (sxml->string (list "[" (sxml->string lst) "]")))
416+
      ('foreign (if (null? lst) #f `(foreign ,(car lst))))
417+
      ('seasonword `(info ,(string-append "season: " (assoc-ref lst 'type))))
418+
      ('usg `(infos . ,(usg->infos lst)))
419+
      ('sense (sxml->meaning lst))
420+
      ('steinhaus (let ((ref (sxml->string lst)))
421+
                    (if (string? ref)
422+
                        `(ref ,(sxml->string lst))
423+
                        (throw 'not-steinhaus ref))))
424+
      ('pos '()); TODO: actually find what info to use
425+
      ('wordcomponent (cons 'wordcomponent lst))
426+
      ('meishi (cons 'meishi lst))
427+
      ('setsuzokushi (cons 'setsuzokushi lst))
428+
      ('daimeishi (cons 'daimeishi lst))
429+
      ('doushi (cons 'doushi lst))
430+
      ('kandoushi (cons 'kandoushi lst))
431+
      ('keiyoudoushi (cons 'keiyoudoushi lst))
432+
      ('keiyoushi (cons 'keiyoushi lst))
433+
      ('fukushi (cons 'fukushi lst))
434+
      ('rengo (cons 'rengo lst))
435+
      ('suffix (cons 'suffix lst))
436+
      ('prefix (cons 'prefix lst))
437+
      ('kanji (cons 'kanji lst))
438+
      ('rentaishi (cons 'rentaishi lst))
439+
      ('specialcharacter (cons 'specialcharacter lst))
440+
      ('joshi (cons 'joshi lst))
441+
      ('fukujoshi (cons 'fukujoshi lst))
442+
      ('kakujoshi (cons 'kakujoshi lst))
443+
      ('kakarijoshi (cons 'kakarijoshi lst))
444+
      ('shuujoshi (cons 'shuujoshi lst))
445+
      ('setsuzokujoshi (cons 'setsuzokujoshi lst))
446+
      ('jokeiyoushi (cons 'jokeiyoushi lst))
447+
      ('jodoushi (cons 'jodoushi lst))
448+
      ('ruigos lst)
449+
      ('ruigo (ruigo->ref lst))
450+
      ('gramGrp (gram->info lst)))))
451+
452+
(define (create-parser frq)
453+
  (ssax:make-parser
454+
    NEW-LEVEL-SEED
455+
    (lambda (elem-gi attributes namespaces expected-content seed)
456+
      attributes)
457+
    
458+
    FINISH-ELEMENT
459+
    (lambda (elem-gi attributes namespaces parent-seed seed)
460+
      (if (equal? elem-gi 'entries)
461+
          seed
462+
          (let* ((seed (reverse seed))
463+
                 (element (sxml->element seed elem-gi frq)))
464+
            (cons element parent-seed))))
465+
    
466+
    CHAR-DATA-HANDLER
467+
    (lambda (string1 string2 seed)
468+
      (cons (string-append string1 string2) seed))))
469+
470+
(define (xml->results port frq)
471+
  (let ((results (filter result? ((create-parser frq) port '()))))
472+
    (sort-results results)))

modules/nani/serialize.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani serialize)
20-
  #:use-module (rnrs bytevectors)
21-
  #:export (merge-bvs
22-
            serialize-list list-size
23-
            serialize-char char-size
24-
            serialize-int int-size
25-
            serialize-boolean boolean-size
26-
            serialize-string string-size))
27-
28-
(define (merge-bvs bvs)
29-
  (let* ((size (apply + (map bytevector-length bvs)))
30-
         (bv (make-bytevector size 0)))
31-
    (let loop ((bvs bvs) (pos 0))
32-
      (unless (null? bvs)
33-
        (let ((sz (bytevector-length (car bvs))))
34-
          (bytevector-copy! (car bvs) 0 bv pos sz)
35-
          (loop (cdr bvs) (+ pos sz)))))
36-
    bv))
37-
38-
(define* (serialize-list lst serialize pos bv #:key (size? #t))
39-
  (when (not (list? lst)) (throw 'not-list lst))
40-
  (when size?
41-
    (bytevector-u16-set! bv pos (length lst) (endianness big)))
42-
  (let loop ((lst lst) (pos (+ pos (if size? 2 0))))
43-
    (if (null? lst)
44-
      pos
45-
      (loop (cdr lst) (serialize (car lst) pos bv)))))
46-
(define* (list-size lst size #:key (size? #t))
47-
  (when (not (list? lst)) (throw 'not-list lst))
48-
  (apply + (if size? 2 0) (map size lst)))
49-
50-
(define (serialize-char int pos bv)
51-
  (bytevector-u8-set! bv pos int)
52-
  (+ pos 1))
53-
(define char-size (const 1))
54-
55-
(define (serialize-int int pos bv)
56-
  (bytevector-u32-set! bv pos int (endianness big))
57-
  (+ pos 4))
58-
(define int-size (const 4))
59-
60-
(define (serialize-boolean bool pos bv)
61-
  (bytevector-u8-set! bv pos (if bool 1 0))
62-
  (+ pos 1))
63-
(define boolean-size (const 1))
64-
65-
(define (serialize-string str pos bv)
66-
  (let ((sbv (string->utf8 str)))
67-
    (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
68-
    (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0)
69-
    (+ pos 1 (bytevector-length sbv))))
70-
(define (string-size str)
71-
  (let ((sbv (string->utf8 str)))
72-
    (+ 1 (bytevector-length sbv))))

modules/nani/tags.scm unknown status 2

1-
(define-module (nani tags)
2-
  #:use-module (nani result)
3-
  #:export (get-tag get-points))
4-
5-
(define tags
6-
  '("MA" "X" "abbr" "adj-i" "adj-ix" "adj-na" "adj-no" "adj-pn" "adj-t" "adj-f"
7-
    "adv" "adv-to" "arch" "ateji" "aux" "aux-v" "aux-adj" "Buddh" "chem" "chn"
8-
    "col" "comp" "conj" "cop" "ctr" "derog" "eK" "ek" "exp" "fam" "fem" "food"
9-
    "geom" "gikun" "hon" "hum" "iK" "id" "ik" "int" "io" "iv" "ling" "m-sl"
10-
    "male" "male-sl" "math" "meteor" "mil" "n" "n-adv" "n-suf" "n-pref" "n-t"
11-
    "num" "oK" "obs" "obsc" "ok" "oik" "on-mim" "pn" "poet" "pol" "pref" "proverb"
12-
    "prt" "physics" "quote" "rare" "sens" "sl" "suf" "uK" "uk" "unc" "yoji"
13-
    "v1" "v1-s" "v2a-s" "v4h" "v4r" "v5aru" "v5b" "v5g" "v5k" "v5k-s" "v5m"
14-
    "v5n" "v5r" "v5r-i" "v5s" "v5t" "v5u" "v5u-s" "v5uru" "vz" "vi" "vk" "vn"
15-
    "vr" "vs" "vs-c" "vs-s" "vs-i" "kyb" "osb" "ksb" "ktb" "tsb" "thb" "tsug"
16-
    "kyu" "rkb" "nab" "hob" "vt" "vulg" "adj-kari" "adj-ku" "adj-shiku"
17-
    "adj-nari" "n-pr" "v-unspec" "v4k" "v4g" "v4s" "v4t" "v4n" "v4b" "v4m"
18-
    "v2k-k" "v2g-k" "v2t-k" "v2d-k" "v2h-k" "v2b-k" "v2m-k" "v2y-k" "v2r-k"
19-
    "v2k-s" "v2g-s" "v2s-s" "v2z-s" "v2t-s" "v2d-s" "v2n-s" "v2h-s" "v2b-s"
20-
    "v2m-s" "v2y-s" "v2r-s" "v2w-s" "archit" "astron" "baseb" "biol" "bot"
21-
    "bus" "econ" "engr" "finc" "geol" "law" "mahj" "med" "music" "Shinto"
22-
    "shogi" "sports" "sumo" "zool" "joc" "anat" "Christn" "net-sl" "dated"
23-
    "hist" "litf" "surname" "place" "unclass" "company" "product" "work"
24-
    "person" "given" "station" "organization"
25-
    
26-
    ;; wadoku special
27-
    "young" "thief" "secret" "baby" "stud" "sail" "fashion" "archeo"
28-
    "rel" "color" "psy" "print" "aero" "politics" "anth" "biblio"
29-
    "game" "agri" "alco" "demo" "train" "philo" "ethno" "photo" "trans"
30-
    "telecom" "build" "sci" "school" "art" "hobby"
31-
32-
    "season-spring" "season-summer" "season-winter" "season-autumn"
33-
    "season-newyear"
34-
35-
    "jlpt-1" "jlpt-2" "jlpt-3" "jlpt-4" "jlpt-5"
36-
37-
    "frq500" "frq1000" "frq2000" "frq5000" "frq10000" "frq20000"))
38-
39-
(define (get-tag tag)
40-
  (let ((val (member tag tags)))
41-
    (if val (- (length val) 1) (throw 'unknown-tag tag))))
42-
43-
(define (get-points result)
44-
  (define (tag-point tag)
45-
    (cond
46-
      ;; frequency
47-
      ((eq? tag (get-tag "frq500"))
48-
       32)
49-
      ((eq? tag (get-tag "frq1000"))
50-
       16)
51-
      ((eq? tag (get-tag "frq2000"))
52-
       8)
53-
      ((eq? tag (get-tag "frq5000"))
54-
       4)
55-
      ((eq? tag (get-tag "frq10000"))
56-
       2)
57-
      ((eq? tag (get-tag "frq20000"))
58-
       1)
59-
      ;; archaic term
60-
      ((eq? tag (get-tag "arch"))
61-
       -3)
62-
      ;; children language
63-
      ((eq? tag (get-tag "chn"))
64-
       -1)
65-
      ;; colloquialism
66-
      ((eq? tag (get-tag "col"))
67-
       5)
68-
      ;; expression
69-
      ((eq? tag (get-tag "exp"))
70-
       1)
71-
      ;; sonkeigo (honorific or respectful)
72-
      ((eq? tag (get-tag "hon"))
73-
       -1)
74-
      ;; kenjougo (humble)
75-
      ((eq? tag (get-tag "hum"))
76-
       -1)
77-
      ;; teineigo (polite)
78-
      ((eq? tag (get-tag "pol"))
79-
       -1)
80-
      ;; irregular kanji
81-
      ((eq? tag (get-tag "iK"))
82-
       -2)
83-
      ;; idiomatic expression
84-
      ((eq? tag (get-tag "id"))
85-
       3)
86-
      ;; slang
87-
      ((member tag (list
88-
                     (get-tag "sl")
89-
                     (get-tag "m-sl")
90-
                     (get-tag "male-sl")))
91-
       -4)
92-
      ;; outdated kanji
93-
      ((eq? tag (get-tag "oK"))
94-
       -2)
95-
      ;; obsolete term
96-
      ((eq? tag (get-tag "obs"))
97-
       -5)
98-
      ;; obscure term
99-
      ((eq? tag (get-tag "obsc"))
100-
       -10)
101-
      ;; onomatopea
102-
      ((eq? tag (get-tag "on-mim"))
103-
       1)
104-
      ;; poetical term
105-
      ((eq? tag (get-tag "poet"))
106-
       -1)
107-
      ((eq? tag (get-tag "proverb"))
108-
       1)
109-
      ((eq? tag (get-tag "rare"))
110-
       -3)
111-
      ;; sensitive
112-
      ((eq? tag (get-tag "sens"))
113-
       -2)
114-
      ;; dialect
115-
      ((member tag (list
116-
                     (get-tag "kyb")
117-
                     (get-tag "osb")
118-
                     (get-tag "ksb")
119-
                     (get-tag "ktb")
120-
                     (get-tag "tsb")
121-
                     (get-tag "thb")
122-
                     (get-tag "tsug")
123-
                     (get-tag "kyu")
124-
                     (get-tag "rkb")
125-
                     (get-tag "nab")
126-
                     (get-tag "hob")))
127-
       -1)
128-
      (else 0)))
129-
  (let ((tags (apply append (map sense-tags (result-senses result)))))
130-
    (apply + (map tag-point tags))))

modules/nani/trie.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani trie)
20-
  #:use-module (nani serialize)
21-
  #:use-module (rnrs bytevectors)
22-
  #:use-module (srfi srfi-9)
23-
  #:export (make-trie
24-
            trie?
25-
            trie-position
26-
            trie-position-set!
27-
            trie-vals
28-
            trie-vals-set!
29-
            trie-transitions
30-
            trie-transitions-set!
31-
            
32-
            make-empty-trie
33-
            add-to-trie!
34-
            compress-trie
35-
            
36-
            serialize-trie
37-
            trie-size))
38-
39-
(define-record-type trie
40-
  (make-trie position vals transitions)
41-
  trie?
42-
  (position trie-position trie-position-set!) ; integer
43-
  (vals trie-vals trie-vals-set!) ; list
44-
  (transitions trie-transitions trie-transitions-set!)) ; array or alist
45-
46-
(define (make-empty-trie)
47-
  (make-trie 0 '() (make-array #f 16)))
48-
49-
(define (add-to-trie! trie key value)
50-
  (if (null? key)
51-
    (trie-vals-set! trie (cons value (trie-vals trie)))
52-
    (let ((next-trie (array-ref (trie-transitions trie) (car key))))
53-
      (if next-trie
54-
          (add-to-trie! next-trie (cdr key) value)
55-
          (let ((next-trie (make-empty-trie)))
56-
            (array-set! (trie-transitions trie) next-trie (car key))
57-
            (add-to-trie! next-trie (cdr key) value))))))
58-
59-
(define (convert-trie-transitions! trie)
60-
  (define (get-new-transitions transitions)
61-
    (let loop ((i 0) (tr '()))
62-
      (if (= i 16)
63-
        tr
64-
        (let ((elem (array-ref transitions i)))
65-
          (if elem
66-
            (begin
67-
              (convert-trie-transitions! elem)
68-
              (loop (+ i 1) (cons (cons i elem) tr)))
69-
            (loop (+ i 1) tr))))))
70-
  (let* ((transitions (trie-transitions trie))
71-
         (transitions (get-new-transitions transitions)))
72-
    (trie-transitions-set! trie transitions)))
73-
74-
(define (compress-trie trie)
75-
  (define (compress-aux trie)
76-
    (make-trie
77-
      (trie-position trie)
78-
      (trie-vals trie)
79-
      (apply append
80-
        (map
81-
          (lambda (tr)
82-
            (let ((trie (cdr tr)))
83-
              (map
84-
                (lambda (tr2)
85-
                  (cons (+ (car tr2) (* 16 (car tr)))
86-
                        (compress-aux (cdr tr2))))
87-
                (trie-transitions trie))))
88-
          (trie-transitions trie)))))
89-
  (convert-trie-transitions! trie)
90-
  (compress-aux trie))
91-
92-
(define (pointer-size ptr)
93-
  5)
94-
95-
(define (serialize-pointer ptr pos bv)
96-
  (bytevector-u8-set! bv pos (car ptr))
97-
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))
98-
  (+ pos 5))
99-
100-
(define (serialize-trie trie serialize size results pos)
101-
  (define (serialize-trie-aux transitions pos)
102-
    (let loop ((pos pos)
103-
               (trs transitions)
104-
               (bvs '()))
105-
      (if (null? trs)
106-
        (cons pos bvs)
107-
        (let* ((next-trie (cdr (car trs)))
108-
               (bv (get-trie-bv next-trie results pos))
109-
               (pos (car bv))
110-
               (bv (cdr bv)))
111-
          (loop pos (cdr trs) (append bvs bv))))))
112-
113-
  (define (get-trie-bv trie results pos)
114-
    (trie-position-set! trie pos)
115-
    (let* ((vals-sz (size (trie-vals trie)))
116-
           (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f))
117-
           (sz (+ vals-sz 1 trs-sz))
118-
           (bv (make-bytevector sz 0)))
119-
      (serialize (trie-vals trie) 0 bv)
120-
      (let* ((bvs (serialize-trie-aux
121-
                    (trie-transitions trie)
122-
                    (+ pos sz)))
123-
             (next-pos (car bvs))
124-
             (bvs (cdr bvs)))
125-
        (bytevector-u8-set! bv vals-sz (length (trie-transitions trie)))
126-
        (serialize-list (trie-transitions trie) serialize-pointer
127-
                        (+ vals-sz 1) bv #:size? #f)
128-
        (cons next-pos (cons bv bvs)))))
129-
  
130-
  (let* ((trie-bv (get-trie-bv trie results pos))
131-
         (new-pos (car trie-bv))
132-
         (trie-bv (merge-bvs (cdr trie-bv))))
133-
    (bytevector-copy! trie-bv 0 results pos (bytevector-length trie-bv))
134-
    new-pos))
135-
 
136-
(define (trie-size trie size)
137-
  (apply +
138-
    (size (trie-vals trie))
139-
    1
140-
    (list-size (trie-transitions trie) pointer-size #:size? #f)
141-
    (map (lambda (trie) (trie-size trie size))
142-
         (map cdr (trie-transitions trie)))))

modules/nani/wadoku/entities.scm unknown status 2

1-
(define-module (nani wadoku entities)
2-
  #:use-module (nani result)
3-
  #:use-module ((nani tags) #:prefix tag:)
4-
  #:use-module (ice-9 match)
5-
  #:export (get-tag hint->tag)
6-
  #:re-export (tag:get-points))
7-
8-
(define wadoku-entities
9-
  `(("Aerodyn." . "aero")
10-
    ("Agrochem." . "chem")
11-
    ("??gypt. Gesch." . "hist")
12-
    ("??gypt. Mythol." . "rel")
13-
    ("Aktien" . "econ")
14-
    ("Akupunktur" . "med")
15-
    ("Akustik" . "music")
16-
    ("American Football" . "sports")
17-
    ("amerik. Gesch." . "hist")
18-
    ("amerik. Pol." . "politics")
19-
    ("amerik. Rechtsw." . "law")
20-
    ("Anat." . "anat")
21-
    ("Angeln" . "sports")
22-
    ("Anime" . "work")
23-
    ("Anthropol." . "anth")
24-
    ("arab. Gesch." . "hist")
25-
    ("Arch??ol." . "archeo")
26-
    ("Archit." . "archit")
27-
    ("Astrol." . "astron")
28-
    ("Astron." . "astron")
29-
    ("atmosph??r. Optik" . "physics")
30-
    ("Atomphys." . "physics")
31-
    ("Audio" . "music")
32-
    ("Augenheilk." . "med")
33-
    ("Autorennen" . "sports")
34-
    ("Backw." . "food")
35-
    ("Badminton" . "sports")
36-
    ("Bahnhofsn." . "station")
37-
    ("Bakteriol." . "biol")
38-
    ("Ballet" . "music")
39-
    ("Ballett" . "music")
40-
    ("Ballsport" . "sports")
41-
    ("Bankw." . "finc")
42-
    ("Baseb." . "baseb")
43-
    ("Baseball" . "baseb")
44-
    ("Basketball" . "sports")
45-
    ("Bauingenieurw." . "archit")
46-
    ("Bautechnik" . "archit")
47-
    ("Bauw." . "archit")
48-
    ("Befestigungsw." . "archit")
49-
    ("Beh??rde" . "law")
50-
    ("Bergb." . "archit")
51-
    ("Bergbau" . "archit")
52-
    ("Bergn." . "place")
53-
    ("Bergsteigen" . "sports")
54-
    ("Bibel" . "Christn")
55-
    ("Bibliotheksw." . "biblio")
56-
    ("Bildhauerei" . "work")
57-
    ("bild. Kunst" . "work")
58-
    ("Billard" . "game")
59-
    ("Biochem." . "biol")
60-
    ("Biol." . "biol")
61-
    ("Bodenkunde" . "agri")
62-
    ("Bogenschie??en" . "sports")
63-
    ("B??rse" . "econ")
64-
    ("Bot." . "bot")
65-
    ("Bowling" . "sports")
66-
    ("Boxen" . "sports")
67-
    ("Brauereiw." . "alco")
68-
    ("Brettspiel" . "game")
69-
    ("brit. Gesch." . "hist")
70-
    ("brit. Rechtsw." . "law")
71-
    ("Br??ckenbau" . "engr")
72-
    ("Bsp." . "exp")
73-
    ("Buchbinden" . "print")
74-
    ("B??chereiw." . "biblio")
75-
    ("Buchtitel" . "work")
76-
    ("Buchw." . "biblio")
77-
    ("Buddh." . "Buddh")
78-
    ("Bud??" . "MA")
79-
    ("B??d??" . "MA")
80-
    ("Chem." . "chem")
81-
    ("chin. Arch??ol." . "archeo")
82-
    ("chin. Astron." . "astron")
83-
    ("chin. Gesch." . "hist")
84-
    ("chin. Gramm." . "ling")
85-
    ("chin. Literaturw." . "work")
86-
    ("chin. Mus." . "music")
87-
    ("chin. Musikinstr." . "music")
88-
    ("chin. Mythol." . "rel")
89-
    ("chin. Philos." . "rel")
90-
    ("chin. Phon." . "ling")
91-
    ("chin. Poet." . "poet")
92-
    ("chin. Pol." . "politics")
93-
    ("chin. Theater" . "work")
94-
    ("Chirurgie" . "med")
95-
    ("Christent." . "Christn")
96-
    ("Cocktail" . "alco")
97-
    ("Computerling." . "comp")
98-
    ("Curling" . "sports")
99-
    ("Dammbau" . "engr")
100-
    ("Demogr." . "demo")
101-
    ("Diplomat." . "politics")
102-
    ("Druckw." . "print")
103-
    ("dtsch. Gesch." . "hist")
104-
    ("dtsch. Gramm." . "ling")
105-
    ("EDV" . "comp")
106-
    ("Einzel-Kanji" . "ling")
107-
    ("Eisenb." . "engr")
108-
    ("Eishockey" . "sports")
109-
    ("Eiskunstl." . "sports")
110-
    ("Eiskunstlauf" . "sports")
111-
    ("Eisschnelllauf" . "sports")
112-
    ("Elektrochem." . "chem")
113-
    ("Elektrot." . "engr")
114-
    ("Embryol." . "biol")
115-
    ("engl. Gesch." . "hist")
116-
    ("Entwicklungsphysiol." . "biol")
117-
    ("Entwicklungspsychol." . "psy")
118-
    ("Ergonomie" . "engr")
119-
    ("Ethik" . "philo")
120-
    ("Ethnol." . "ethno")
121-
    ("europ. Gesch." . "hist")
122-
    ("Fahrrad" . "sports")
123-
    ("Familienn." . "surname")
124-
    ("Familienn.." . "surname")
125-
    ("Farbe" . "color")
126-
    ("F??rberei" . "build")
127-
    ("Fechten" . "build")
128-
    ("Fertigungstechnik" . "engr")
129-
    ("Film" . "work")
130-
    ("Filmtitel" . "work")
131-
    ("Finanzw." . "finc")
132-
    ("Firmenn." . "company")
133-
    ("Fischerei" . "sports")
134-
    ("Fischk." . "food")
135-
    ("Flugw." . "engr")
136-
    ("Flussn." . "place")
137-
    ("Forstw." . "bot")
138-
    ("Fotog." . "photo")
139-
    ("Frachtw." . "trans")
140-
    ("franz. Gesch." . "hist")
141-
    ("Frisur" . "fashion")
142-
    ("Funkt." . "physics")
143-
    ("Fu??ball" . "sports")
144-
    ("Gagaku" . "music")
145-
    ("Garten" . "bot")
146-
    ("Gartenk." . "bot")
147-
    ("Gateball" . "sports")
148-
    ("Gebietsn." . "place")
149-
    ("Geldw." . "finc")
150-
    ("Genetik" . "biol")
151-
    ("Geogr." . "place")
152-
    ("Geol." . "geol")
153-
    ("Geom." . "geom")
154-
    ("Geophys." . "physics")
155-
    ("Gerberei" . "build")
156-
    ("Gesch." . "hist")
157-
    ("Gewichtheben" . "sports")
158-
    ("Glasherst." . "build")
159-
    ("Glaziol." . "sci")
160-
    ("Go" . "game")
161-
    ("Golf" . "sports")
162-
    ("Gramm." . "ling")
163-
    ("griech. Christent." . "Christn")
164-
    ("griech. Gesch." . "hist")
165-
    ("griech. Myth." . "rel")
166-
    ("griech. Mythol." . "rel")
167-
    ("griech. Theat." . "work")
168-
    ("Gymnastik" . "sports")
169-
    ("Handball" . "sports")
170-
    ("Handel" . "econ")
171-
    ("Handwerkskunst" . "build")
172-
    ("Heraldik" . "sci")
173-
    ("Hindu." . "rel")
174-
    ("Hochsprung" . "sports")
175-
    ("Hockey" . "sports")
176-
    ("Holzbau" . "engr")
177-
    ("Holzbearb." . "engr")
178-
    ("Holzschnitt" . "engr")
179-
    ("Hunderasse" . "zool")
180-
    ("Hydrodynamik" . "physics")
181-
    ("I Ging" . "rel")
182-
    ("Ikebana" . "build")
183-
    ("Immunol." . "med")
184-
    ("ind. Gesch." . "hist")
185-
    ("indian. Mythol." . "rel")
186-
    ("ind. Mythol." . "rel")
187-
    ("indones. Gesch." . "hist")
188-
    ("ind. Philos." . "philo")
189-
    ("Insektenk." . "zool")
190-
    ("Inseln." . "place")
191-
    ("Internet" . "comp")
192-
    ("Islam" . "rel")
193-
    ("islam. Mythol." . "rel")
194-
    ("islam. Rechtsw." . "law")
195-
    ("islam. Rel." . "rel")
196-
    ("israel. Gesch." . "hist")
197-
    ("ital. Gesch." . "hist")
198-
    ("japan. Archit." . "archit")
199-
    ("japan. Astrologie" . "aston")
200-
    ("japan. bild. Kunst" . "work")
201-
    ("japan. Christent." . "Christn")
202-
    ("japan. Gesch." . "hist")
203-
    ("japan. Gramm." . "ling")
204-
    ("japan. Literaturw." . "work")
205-
    ("japan. Med." . "med")
206-
    ("japan. Mus." . "music")
207-
    ("japan. Mythol." . "rel")
208-
    ("japan. Philos." . "philo")
209-
    ("japan. Pol." . "politics")
210-
    ("japan. Schwimmen" . "sports")
211-
    ("japan. Soziol." . "sci")
212-
    ("japan. Sprachw." . "ling")
213-
    ("J??ruri" . "music")
214-
    ("Judent." . "rel")
215-
    ("J??d??" . "MA")
216-
    ("Kabuki" . "music")
217-
    ("Kagura" . "music")
218-
    ("Kalligraphie" . "build")
219-
    ("Kanbun" . "ling")
220-
    ("Kanji" . "ling")
221-
    ("Kanp??" . "med")
222-
    ("Karten" . "game")
223-
    ("Kartenspiel" . "game")
224-
    ("Kartogr." . "place")
225-
    ("kath. Christent." . "Christn")
226-
    ("Kend??" . "MA")
227-
    ("Keramik" . "build")
228-
    ("Kernphys." . "physics")
229-
    ("Kfz-W." . "engr")
230-
    ("KI" . "comp")
231-
    ("Kleidung" . "fashion")
232-
    ("Kochk." . "food")
233-
    ("Konditoreiw." . "food")
234-
    ("Konfuz." . "rel")
235-
    ("korean. Gesch." . "hist")
236-
    ("korean. Rel." . "rel")
237-
    ("korean. Theater" . "work")
238-
    ("Kosmetik" . "fashion")
239-
    ("Kricket" . "sports")
240-
    ("Kristallogr." . "physics")
241-
    ("Krocket" . "sports")
242-
    ("Kunst" . "work")
243-
    ("Kunstgesch." . "hist")
244-
    ("Kunst??? und Turmsprung" . "build")
245-
    ("Ky??gen" . "music")
246-
    ("Lack" . "work")
247-
    ("L??ndern." . "place")
248-
    ("Landw." . "agri")
249-
    ("Leichtathl." . "sports")
250-
    ("Liedtitel" . "work")
251-
    ("Literaturw." . "work")
252-
    ("Logik" . "math")
253-
    ("Luftf." . "aero")
254-
    ("Machinenb." . "engr")
255-
    ("Mah-Jongg" . "mahj")
256-
    ("Management" . "bus")
257-
    ("Manga" . "work")
258-
    ("m??nnl. Name" . "given")
259-
    ("m??nnl. Vorn." . "given")
260-
    ("m??nnl. Vorname" . "given")
261-
    ("Mantik" . "rel")
262-
    ("Marine" . "sail")
263-
    ("Marketing" . "bus")
264-
    ("Maschinenb." . "engr")
265-
    ("Ma??" . "engr")
266-
    ("Math." . "math")
267-
    ("Mech." . "physics")
268-
    ("Med." . "med")
269-
    ("Meeresk." . "place")
270-
    ("Meeresn." . "place")
271-
    ("Meinungsforschung" . "politics")
272-
    ("Messw." . "hobby")
273-
    ("Metall." . "engr")
274-
    ("Metallbearb." . "engr")
275-
    ("Metallguss" . "engr")
276-
    ("Meteor." . "meteor")
277-
    ("mexikan. Kochk." . "food")
278-
    ("Mikrobiol." . "biol")
279-
    ("Milit." . "mil")
280-
    ("Mineral." . "geol")
281-
    ("M??belbau" . "build")
282-
    ("Mode" . "fashion")
283-
    ("Motorsport" . "sports")
284-
    ("Mus." . "music")
285-
    ("Muschelk." . "zool")
286-
    ("Musikinstr." . "music")
287-
    ("Mykol." . "bot")
288-
    ("Mythol." . "rel")
289-
    ("Nachrichtent." . "politics")
290-
    ("N??hen" . "fashion")
291-
    ("Naniwabushi" . "music")
292-
    ("Naturmed." . "med")
293-
    ("Naturph??n." . "physics")
294-
    ("Naturphilos." . "philo")
295-
    ("Neng??" . "hist")
296-
    ("Netzwerktechnik" . "comp")
297-
    ("Neurol." . "med")
298-
    ("N??" . "music")
299-
    ("nord. Mythol." . "rel")
300-
    ("Numismatik" . "build")
301-
    ("??kol." . "biol")
302-
    ("Oper" . "music")
303-
    ("Optik" . "physics")
304-
    ("Org." . "organization")
305-
    ("Origami" . "build")
306-
    ("Ortsn." . "place")
307-
    ("P??d." . "school")
308-
    ("Pal??ontol." . "hist")
309-
    ("Papierherst." . "build")
310-
    ("Parapsych." . "psy")
311-
    ("Patentw." . "engr")
312-
    ("Pathol." . "med")
313-
    ("pers. Gesch." . "hist")
314-
    ("Pers??nlichk." . "person")
315-
    ("Petrochem." . "chem")
316-
    ("Pferderennen" . "sports")
317-
    ("Pferdesport" . "sports")
318-
    ("Pharm." . "med")
319-
    ("Philos." . "philo")
320-
    ("philos. Anthrop." . "anth")
321-
    ("Phon." . "ling")
322-
    ("Phys." . "physics")
323-
    ("Physiol." . "anat")
324-
    ("Poetik" . "poet")
325-
    ("Pol." . "politics")
326-
    ("Postw." . "telecom")
327-
    ("Psych." . "psy")
328-
    ("Quantenphys." . "physics")
329-
    ("Radio" . "telecom")
330-
    ("Radrennen" . "sports")
331-
    ("Radsport" . "sports")
332-
    ("Rakugo" . "music")
333-
    ("Raumf." . "engr")
334-
    ("Reaktort." . "engr")
335-
    ("Rechnungsw." . "bus")
336-
    ("Rechtsw." . "law")
337-
    ("Redew." . "exp")
338-
    ("Regelungstechnik" . "engr")
339-
    ("Reiten" . "sports")
340-
    ("Rel." . "rel")
341-
    ("Rennsport" . "sports")
342-
    ("Rhetorik" . "philo")
343-
    ("Ringen" . "sports")
344-
    ("r??m. Gesch." . "hist")
345-
    ("r??m. Mythol." . "rel")
346-
    ("Rudern" . "sports")
347-
    ("Rugby" . "sports")
348-
    ("Rundfunk" . "telecom")
349-
    ("russ. Gesch." . "hist")
350-
    ("R??stung" . "mil")
351-
    ("Schach" . "game")
352-
    ("Schie??sport" . "sports")
353-
    ("Schiff" . "sail")
354-
    ("Schiffbau" . "sail")
355-
    ("Schifffahrt" . "sail")
356-
    ("Schmuck" . "build")
357-
    ("Schneckenk." . "zool")
358-
    ("Schneiderei" . "fashion")
359-
    ("Schuhe" . "fashion")
360-
    ("Schule" . "school")
361-
    ("Schwei??en" . "build")
362-
    ("Schwert" . "mil")
363-
    ("Schwimmen" . "sports")
364-
    ("Seef." . "sail")
365-
    ("Seen." . "place")
366-
    ("Seerechtsw." . "law")
367-
    ("Seew." . "sail")
368-
    ("Segeln" . "sail")
369-
    ("Seide" . "build")
370-
    ("Seismol." . "physics")
371-
    ("SF" . "work")
372-
    ("Shingon" . "Buddh")
373-
    ("Shint??" . "Shinto")
374-
    ("Sh??gi" . "shogi")
375-
    ("Ski" . "sports")
376-
    ("Skispringen" . "sports")
377-
    ("Softball" . "sports")
378-
    ("Sozialpsych." . "psy")
379-
    ("Soziol." . "sci")
380-
    ("span. Gesch." . "hist")
381-
    ("Spiel" . "game")
382-
    ("Spiele-Titel" . "game")
383-
    ("Spieltheorie" . "game")
384-
    ("Spinnen" . "zool")
385-
    ("Spinnenk." . "zool")
386-
    ("Sport" . "sports")
387-
    ("Sprache" . "ling")
388-
    ("Sprachw." . "ling")
389-
    ("Sprichw." . "ling")
390-
    ("Stadtn." . "place")
391-
    ("Stadtplanung" . "engr")
392-
    ("Statistik" . "math")
393-
    ("Steuerw." . "econ")
394-
    ("Sticken" . "build")
395-
    ("Stra??enbau" . "engr")
396-
    ("Streckenkilometertafel" . "train")
397-
    ("Stricken" . "build")
398-
    ("Strukturalismus" . "philo")
399-
    ("Sum??" . "sumo")
400-
    ("Surfen" . "sports")
401-
    ("Systemanalyse" . "engr")
402-
    ("Systemtheorie" . "engr")
403-
    ("taiwan. Pol." . "politics")
404-
    ("Tanz" . "art")
405-
    ("Tanzen" . "art")
406-
    ("Taoismus" . "rel")
407-
    ("Tauchen" . "sports")
408-
    ("Technik" . "engr")
409-
    ("Tee" . "hobby")
410-
    ("Telegrafie" . "telecom")
411-
    ("Telekom." . "telecom")
412-
    ("Tempeln." . "place")
413-
    ("Tennis" . "sports")
414-
    ("Tenn??" . "surname")
415-
    ("Textilt." . "engr")
416-
    ("Theat." . "work")
417-
    ("Theol." . "rel")
418-
    ("tibet. Buddh." . "Buddh")
419-
    ("Tiefb." . "engr")
420-
    ("Tiermed." . "med")
421-
    ("Tischtennis" . "sports")
422-
    ("Tourismus" . "hobby")
423-
    ("Tunnelbau" . "engr")
424-
    ("t??rk. Gesch." . "hist")
425-
    ("Turnen" . "sports")
426-
    ("TV" . "telecom")
427-
    ("TV-Prog." . "telecom")
428-
    ("U-Bahn" . "train")
429-
    ("Umwelt" . "biol")
430-
    ("Univ.-N." . "place")
431-
    ("Verhaltensbiol." . "biol")
432-
    ("Verkehrsw." . "engr")
433-
    ("Verlagsn." . "company")
434-
    ("Verlagsw." . "econ")
435-
    ("Versicherungsw." . "bus")
436-
    ("Video" . "telecom")
437-
    ("Videospiel" . "game")
438-
    ("vietnam. Gesch." . "hist")
439-
    ("Vogelk." . "zool")
440-
    ("V??lkerk." . "ethno")
441-
    ("V??lkerr." . "law")
442-
    ("Volleyball" . "sports")
443-
    ("Vorgesch." . "hist")
444-
    ("VWL" . "econ")
445-
    ("Waffenk." . "mil")
446-
    ("Walfang" . "hobby")
447-
    ("Wasserball" . "sports")
448-
    ("Wasserbau" . "engr")
449-
    ("Wassersport" . "sports")
450-
    ("Weben" . "build")
451-
    ("weibl. Name" . "given")
452-
    ("weibl. Vorn." . "given")
453-
    ("Weichtierk." . "zool")
454-
    ("Wein" . "alco")
455-
    ("Werbung" . "bus")
456-
    ("Werktitel" . "work")
457-
    ("Wintersport" . "sports")
458-
    ("Wirtsch." . "econ")
459-
    ("Wrestling" . "sports")
460-
    ("Wz." . "exp")
461-
    ("Yoga" . "sports")
462-
    ("Z??hlw." . "finc")
463-
    ("Zahnmed." . "med")
464-
    ("Zeitschriftenn." . "work")
465-
    ("Zeitungsn." . "work")
466-
    ("Zeitungsw." . "print")
467-
    ("Zellbiol." . "biol")
468-
    ("Zen" . "rel")
469-
    ("Zirkus" . "hobby")
470-
    ("Zitat" . "exp")
471-
    ("Zollw." . "bus")
472-
    ("Zool." . "biol")))
473-
474-
(define (hint->tag hint)
475-
  (match hint
476-
    ("Kansai-Dial." "ksb")
477-
    ("Ky??to-Dial." "kyb")
478-
    ("??saka-Dial." "osb")
479-
    ("Kant??-Dial." "ktb")
480-
    ("T??hoku-Dial." "thb")
481-
    ("Tsugaru-Dial." "tsug")
482-
    ("Ky??sh??-Dial." "kyu")
483-
    ("Hokkaid??-Dial." "hob")
484-
    ("altert??ml." "arch")
485-
    ("arch." "arch")
486-
    ("Edo-zeitl." "arch")
487-
    ("ehrerb.-h??fl." "hon")
488-
    ("h??fl.-ehrerb." "hon")
489-
    ("besch.-h??fl." "hum")
490-
    ("abwertend" "derog")
491-
    ("etwas altmodisch" "obs")
492-
    ("Gaunerjargon" "thief")
493-
    ("Frauenspr. der Edo-Zeit" "arch")
494-
    ("Geheimspr. der Gauner" "thief")
495-
    ("hist." "hist")
496-
    ("h??fl." "pol")
497-
    ("kinderspr." "chn")
498-
    ("M??nnersprache" "male")
499-
    ("Medizinerjargon" "med")
500-
    ("Milit??rjargon" "mil")
501-
    ("obsol." "obs")
502-
    ("onomat." "on-mim")
503-
    ("poet." "poet")
504-
    ("Polizeijargon" "law")
505-
    ("Slang" "sl")
506-
    ("veraltet" "arch")
507-
    (_ #f)))
508-
509-
(define (wadoku->tag-name wadoku-tag)
510-
  (let ((tag (assoc-ref wadoku-entities wadoku-tag)))
511-
    (or tag wadoku-tag)))
512-
513-
(define (get-tag tag)
514-
  (tag:get-tag (wadoku->tag-name tag)))

modules/nani/wadoku/pitch.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani wadoku pitch)
20-
  #:use-module (ice-9 binary-ports)
21-
  #:use-module (ice-9 match)
22-
  #:use-module (ice-9 rdelim)
23-
  #:use-module (nani frequency)
24-
  #:use-module (nani huffman)
25-
  #:use-module (nani result)
26-
  #:use-module (nani trie)
27-
  #:use-module (nani wadoku entities)
28-
  #:use-module (srfi srfi-1)
29-
  #:use-module (srfi srfi-9)
30-
  #:use-module (sxml ssax)
31-
  #:use-module (rnrs bytevectors)
32-
  #:export (xml->pitch serialize-pitch pitch-entry-count))
33-
34-
(define-record-type pitch
35-
  (make-pitch kanjis accents)
36-
  pitch?
37-
  (kanjis  pitch-kanjis)
38-
  (accents pitch-accents))
39-
40-
(define (sxml->element lst elem)
41-
  (let ((elem (match elem
42-
                ((_ . elem) elem)
43-
                (_ elem))))
44-
    (match elem
45-
      ('accent `(pitch ,(car lst)))
46-
      ('orth (let ((kanji (filter string? lst)))
47-
               (if (null? kanji) #f `(kanji . ,(car kanji)))))
48-
      ('form
49-
       `(form . ,(append-map (lambda (a) (if (list? a) a (list a))) lst)))
50-
      ('reading (filter pair? lst))
51-
      ('entry
52-
        (let loop ((lst lst) (kanjis '()) (accents '()))
53-
          (if (null? lst)
54-
              (if (or (null? kanjis) (null? accents))
55-
                  #f
56-
                  (make-pitch kanjis accents))
57-
              (match (car lst)
58-
                (('form . f) (loop (append f (cdr lst)) kanjis accents))
59-
                (('pitch . pitch) (loop (cdr lst) kanjis (cons pitch accents)))
60-
                (('kanji . kanji) (loop (cdr lst) (cons kanji kanjis) accents))
61-
                (_ (loop (cdr lst) kanjis accents))))))
62-
      (_ #f))))
63-
64-
(define parser
65-
  (ssax:make-parser
66-
    NEW-LEVEL-SEED
67-
    (lambda (elem-gi attributes namespaces expected-content seed)
68-
      attributes)
69-
    
70-
    FINISH-ELEMENT
71-
    (lambda (elem-gi attributes namespaces parent-seed seed)
72-
      (if (equal? elem-gi 'entries)
73-
          seed
74-
          (let* ((seed (reverse seed))
75-
                 (element (sxml->element seed elem-gi)))
76-
            (cons element parent-seed))))
77-
    
78-
    CHAR-DATA-HANDLER
79-
    (lambda (string1 string2 seed)
80-
      (cons (string-append string1 string2) seed))))
81-
82-
(define (xml->pitch port)
83-
  (filter pitch? (parser port '())))
84-
85-
(define (make-trie-key key)
86-
  (append-map
87-
    (lambda (c)
88-
      (list (quotient c 16) (modulo c 16)))
89-
    (bytevector->u8-list (string->utf8 key))))
90-
91-
(define (get-pitch-trie pitches)
92-
  (let ((trie (make-empty-trie)))
93-
    (for-each
94-
      (lambda (pitch)
95-
        (for-each
96-
          (lambda (kanji)
97-
            (for-each
98-
              (lambda (accent)
99-
                (cond
100-
                  ((string? accent)
101-
                   (add-to-trie! trie (make-trie-key kanji) accent))
102-
                  ((list? accent)
103-
                   (add-to-trie! trie (make-trie-key kanji) (car accent)))
104-
                  (else
105-
                    (throw 'accent accent))))
106-
              (pitch-accents pitch)))
107-
          (pitch-kanjis pitch)))
108-
      pitches)
109-
    (compress-trie trie)))
110-
111-
(define (collapse-vals! trie)
112-
  (let ((transitions (trie-transitions trie))
113-
        (vals (trie-vals trie)))
114-
    (trie-vals-set! trie (string-join vals ", "))
115-
    (for-each collapse-vals! (map cdr transitions))))
116-
117-
(define (collect-vals trie)
118-
  (let ((transitions (trie-transitions trie))
119-
        (vals (trie-vals trie)))
120-
    (cons vals (append-map collect-vals (map cdr transitions)))))
121-
122-
(define (entry-number trie)
123-
  (let ((transitions (trie-transitions trie))
124-
        (vals (trie-vals trie)))
125-
    (apply + (if (string-null? vals) 0 1) 
126-
           (map entry-number (map cdr transitions)))))
127-
128-
(define (serialize-pitch pitches)
129-
  (let ((trie (get-pitch-trie pitches)))
130-
    (collapse-vals! trie)
131-
    (let* ((huffman (create-huffman (collect-vals trie)))
132-
           (code (huffman->code huffman))
133-
           (entries (entry-number trie)))
134-
      (let* ((header (string->utf8 "NANI_PITCH001"))
135-
             (header-size (bytevector-length header))
136-
             (huffman-bv (serialize-huffman huffman))
137-
             (huffman-size (bytevector-length huffman-bv))
138-
             (trie-size (trie-size trie (huffman-string-size code)))
139-
             (result (make-bytevector (+ header-size 4 huffman-size trie-size))))
140-
        (bytevector-copy! header 0 result 0 header-size)
141-
        (bytevector-u32-set! result header-size entries (endianness big))
142-
        (bytevector-copy! huffman-bv 0 result (+ header-size 4) huffman-size)
143-
        (serialize-trie trie (serialize-huffman-string code)
144-
                        (huffman-string-size code)
145-
                        result (+ header-size 4 huffman-size))
146-
        result))))
147-
148-
(define (pitch-entry-count file)
149-
  (call-with-input-file file
150-
    (lambda (port)
151-
      ;; header
152-
      (get-bytevector-n port 13)
153-
      ;; size
154-
      (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))

modules/nani/wadoku/xml.scm unknown status 2

1-
;;; Nani Project website
2-
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3-
;;;
4-
;;; This file is part of the Nani Project website.
5-
;;;
6-
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7-
;;; under the terms of the GNU Affero General Public License as published by
8-
;;; the Free Software Foundation; either version 3 of the License, or (at
9-
;;; your option) any later version.
10-
;;;
11-
;;; The Nani Project website is distributed in the hope that it will be useful, but
12-
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13-
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14-
;;; GNU Affero General Public License for more details.
15-
;;;
16-
;;; You should have received a copy of the GNU Affero General Public License
17-
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18-
19-
(define-module (nani wadoku xml)
20-
  #:use-module (ice-9 match)
21-
  #:use-module (ice-9 rdelim)
22-
  #:use-module (nani frequency)
23-
  #:use-module (nani result)
24-
  #:use-module (nani wadoku entities)
25-
  #:use-module (srfi srfi-9)
26-
  #:use-module (sxml ssax)
27-
  #:export (xml->results))
28-
29-
(define-record-type usage
30-
  (make-usage type reg content)
31-
  usage?
32-
  (type    usage-type)
33-
  (reg     usage-reg)
34-
  (content usage-content))
35-
36-
(define (flatten lst)
37-
  (match lst
38-
    (() '())
39-
    ((head lst ...)
40-
     (if (list? head)
41-
         (append (flatten head) (flatten lst))
42-
         (cons head (flatten lst))))
43-
    (lst (list lst))))
44-
45-
(define* (update-usage usage #:key (type    (usage-type    usage))
46-
                                   (reg     (usage-reg     usage))
47-
                                   (content (usage-content usage)))
48-
  (make-usage type reg content))
49-
50-
;; TODO
51-
(define (usage->tag usage)
52-
  (let ((content (usage-content usage))
53-
        (type (usage-type usage))
54-
        (reg (usage-reg usage)))
55-
    `(,@(if (string-null? content)
56-
            (match type
57-
              ("" '())
58-
              ("time" '())
59-
              ("hint" '())
60-
              ("abrev" '(tag "abbr")))
61-
            (match type
62-
              ("dom" `((tag ,content)))
63-
              ("time" `((tag "obs"))); always obsol.
64-
              ("hint" (let ((tag (hint->tag content)))
65-
                        (if tag `(tag ,tag) '())))))
66-
      ,@(match reg
67-
          ("" '())
68-
          ("dial." '())
69-
          ("lit" `((tag "litf")))
70-
          ("kinderspr." `((tag "chn")))
71-
          ("kleinkindspr." `((tag "baby")))
72-
          ("jugendspr." `((tag "young")))
73-
          ("besch.-h??fl." `((tag "hum")))
74-
          ("coll" `((tag "col")))
75-
          ("vulg." `((tag "vulg")))
76-
          ("Slang" `((tag "sl")))
77-
          ("poet." `((tag "poet")))
78-
          ("hist." `((tag "hist")))
79-
          ("hist" `((tag "hist")))
80-
          ("obsol." `((tag "obs")))
81-
          ("veraltet" `((tag "arch")))
82-
          ("sch??lerspr." `((tag "stud")))
83-
          ("studentenspr." `((tag "stud")))
84-
          ("frauenspr." `((tag "fem")))
85-
          ("gaunerspr." `((tag "thief")))
86-
          ("geheimspr." `((tag "secret")))
87-
          ("hofdamenspr." `((tag "fem") (tag "pol")))
88-
          ("seemannsspr." `((tag "sail")))))))
89-
90-
(define (usg->tag lst)
91-
  (define (usg->usage lst)
92-
    (let loop ((result (make-usage "" "" "")) (lst lst))
93-
      (if (null? lst)
94-
        result
95-
        (loop
96-
          (match (car lst)
97-
            (('type . type) (update-usage result #:type type))
98-
            (('reg . reg) (update-usage result #:reg reg))
99-
            (((? symbol? s) . v)
100-
             (throw 'unknow-usage-attr s v))
101-
            ((? string? content) (update-usage result #:content content))
102-
            ((? list? l) (loop result l))
103-
            (_ result))
104-
          (cdr lst)))))
105-
  (usage->tag (usg->usage lst)))
106-
107-
(define (ruigo->ref lst)
108-
  (let loop ((ref #f) (lst lst))
109-
    (match lst
110-
      (() ref)
111-
      ((('id id) lst ...)
112-
       (loop id lst))
113-
      (((a . b) lst ...)
114-
       (loop (loop ref (list (list a b))) lst))
115-
      (((? list? l) lst ...)
116-
       (loop (loop ref l) lst)))))
117-
118-
(define (merge-senses s1 s2)
119-
  (update-sense s2
120-
    #:references (append (sense-references s1) (sense-references s2))
121-
    #:tags (append (sense-tags s1) (sense-tags s2))
122-
    #:glosses (append (sense-glosses s1) (sense-glosses s2))))
123-
124-
(define (sxml->sense lst)
125-
  (let loop ((sense (make-sense '() '() '() '() '() '() "ger")) (lst lst))
126-
    (if (null? lst)
127-
      sense
128-
      (loop
129-
        (match (car lst)
130-
          (#f sense)
131-
          ((? sense? s)
132-
           (merge-senses s sense))
133-
          ((? source? s)
134-
           (update-sense sense #:sources (cons s (sense-sources sense))))
135-
          (('ref (? string? r))
136-
           (update-sense sense #:references (cons r (sense-references sense))))
137-
          (('tag (? string? r))
138-
           (update-sense sense #:tags (cons (get-tag r) (sense-tags sense))))
139-
          (('trans (? string? r))
140-
           (update-sense sense #:glosses (cons r (sense-glosses sense))))
141-
          (('related . _) sense)
142-
          (('transcr . _) sense)
143-
          (('pitch . _) sense)
144-
          (((? symbol? s) v)
145-
           (throw 'unknown-symbol s v))
146-
          ((? list? l) (loop sense l))
147-
          ((? string? _) sense))
148-
        (cdr lst)))))
149-
150-
;; TODO
151-
(define (sxml->ref lst)
152-
  (define (sxml->ref-name lst)
153-
    (let loop ((result #f) (lst lst))
154-
      (if (null? lst)
155-
        result
156-
        (loop
157-
          (match (car lst)
158-
            (('id . id) id)
159-
            (((? symbol? s) . _) result)
160-
            (((? symbol? s) _) result)
161-
            ((? list? l) (loop result l))
162-
            (_ result))
163-
          (cdr lst)))))
164-
  (let ((ref (sxml->ref-name lst)))
165-
    (if (string? ref)
166-
        `(ref ,ref)
167-
        (throw 'no-ref ref 'from lst))))
168-
169-
(define (sxml->source lst)
170-
  (let loop ((source (make-source '() #f "" "")) (lst lst))
171-
    (if (null? lst)
172-
      source
173-
      (loop
174-
        (match (car lst)
175-
          (('impli impli) (update-source source #:lang impli))
176-
          (('foreign foreign)
177-
           (update-source source
178-
             #:content (if (list? foreign)
179-
                           (append foreign (source-content source))
180-
                           (cons foreign (source-content source)))))
181-
          ((? list? l) (loop source l))
182-
          (_ source))
183-
        (cdr lst)))))
184-
185-
(define (sxml->reading lst)
186-
  (let loop ((reading (make-reading '() '() '())) (lst lst))
187-
    (if (null? lst)
188-
      reading
189-
      (loop
190-
        (match (car lst)
191-
          (('reading r)
192-
           (update-reading reading #:readings (cons r (reading-readings reading))))
193-
          (_ reading))
194-
        (cdr lst)))))
195-
196-
(define (sxml->result sxml frq)
197-
  (define (sxml->result-aux sxml)
198-
    (let loop ((result (make-result 0 0 '() '() '())) (last-source #f) (lst sxml))
199-
      (if (null? lst)
200-
        result
201-
        (match (car lst)
202-
          (('kanji kanji)
203-
           (loop
204-
             (update-result result #:kanjis (cons kanji (result-kanjis result)))
205-
             last-source (cdr lst)))
206-
          ((? reading? reading)
207-
           (loop
208-
             (update-result result #:readings (cons reading (result-readings result)))
209-
             last-source (cdr lst)))
210-
          ((? sense? sense)
211-
           (loop
212-
             (update-result result
213-
               #:senses
214-
               (cons
215-
                 (if last-source
216-
                     (update-sense sense
217-
                       #:sources (cons last-source (sense-sources sense)))
218-
                     sense)
219-
                 (result-senses result)))
220-
             last-source (cdr lst)))
221-
          ((? list? l)
222-
           (loop (loop result last-source l) last-source (cdr lst)))
223-
          (_ (loop result last-source (cdr lst)))))))
224-
  (let* ((result (sxml->result-aux sxml))
225-
         (word (if (null? (result-kanjis result))
226-
                   (car (reading-readings (car (result-readings result))))
227-
                   (car (result-kanjis result))))
228-
         (entity (frequency-entity frq word))
229-
         (sense1 (car (result-senses result)))
230-
         (sense1 (if entity
231-
                     (update-sense sense1
232-
                       #:tags (cons (get-tag entity) (sense-tags sense1)))
233-
                     sense1))
234-
         (senses (cons sense1 (cdr (result-senses result)))))
235-
    (update-result result
236-
      #:points (tag:get-points (update-result result #:senses senses))
237-
      #:senses senses)))
238-
239-
(define (gram-tag lst)
240-
  (map
241-
    (lambda (attr)
242-
      (match attr
243-
        (('suru . suru)
244-
         `(tag ,(string-append "suru-" suru)))
245-
        (('ni . ni)
246-
         `(tag ,(string-append "ni-" ni)))
247-
        (('shiku . shiku)
248-
         `(tag ,(string-append "shiku-" shiku)))
249-
        (('ku . ku)
250-
         `(tag ,(string-append "ku-" ku)))
251-
        (('to . to)
252-
         `(tag ,(string-append "to-" to)))
253-
        (('no . no)
254-
         `(tag ,(string-append "no-" no)))
255-
        (('nari . nari)
256-
         `(tag ,(string-append "nari-" nari)))
257-
        (('taru . taru)
258-
         `(tag ,(string-append "taru-" taru)))
259-
        (('level . level)
260-
         `(tag ,(string-append "level-" level)))
261-
        (('godanrow . tr)
262-
         `(tag ,(string-append "godanrow-" tr)))
263-
        (('onbin . tr)
264-
         `(tag ,(string-append "onbin-" tr)))
265-
        (('transitivity . tr)
266-
         `(tag ,(string-append "transitivity-" tr)))))
267-
    lst))
268-
269-
(define (sxml->string lst)
270-
  (define (sub-loop loop tags result lst l)
271-
    (let ((result (loop tags result l)))
272-
      (if (list? result)
273-
          (loop (append tags (filter list? result))
274-
                (apply string-append (filter string? result))
275-
                lst)
276-
          (loop tags result lst))))
277-
278-
  (let loop ((tags '()) (result "") (lst lst))
279-
    (match lst
280-
      (() (if (null? tags)
281-
              result
282-
              (append tags (list result))))
283-
      ((? string? s)
284-
       (loop tags (string-append result s) '()))
285-
      (((? string? s) lst ...)
286-
       (loop tags (string-append result s) lst))
287-
      ((('prior . _) lst ...)
288-
       (loop tags result lst))
289-
      ((('options . _) lst ...)
290-
       (loop tags result lst))
291-
      ((('firstname . _) lst ...)
292-
       (loop tags result lst))
293-
      ((('ausn . _) lst ...)
294-
       (loop tags result lst))
295-
      ((('lang . _) lst ...)
296-
       (loop tags result lst))
297-
      ((('meta . _) lst ...)
298-
       (loop tags result lst))
299-
      ((('genki . genki) lst ...)
300-
       ;(loop (cons `(tag ,genki) tags) result lst))
301-
       (loop tags result lst))
302-
      ((('jlpt . jlpt) lst ...)
303-
       (loop (cons `(tag ,(string-append "jlpt-" jlpt)) tags) result lst))
304-
      ((('ref . (? string? ref)) lst ...)
305-
       (loop (cons `(ref ,ref) tags) result lst))
306-
      ((('ref (? string? ref)) lst ...)
307-
       (loop (cons `(ref ,ref) tags) result lst))
308-
      ((('jap l) lst ...)
309-
       (sub-loop loop tags result lst l))
310-
      ((('foreign l) lst ...)
311-
       (sub-loop loop tags result lst l))
312-
      ((('transcr l) lst ...)
313-
       (sub-loop loop tags result lst l))
314-
      ((((? symbol? s) v) lst ...)
315-
       (throw 'unsupported-symbol s v))
316-
      ((((? symbol? s) . v) lst ...)
317-
       (throw 'unsupported-symbol-pair s v))
318-
      (((? list? l) lst ...)
319-
       (sub-loop loop tags result lst l)))))
320-
321-
(define (sxml->element lst elem frq)
322-
  (let ((elem (match elem
323-
                ((_ . elem) elem)
324-
                (_ elem))))
325-
    (match elem
326-
      ('orth (let ((kanji (filter string? lst)))
327-
               (if (null? kanji) #f `(kanji ,(car kanji)))))
328-
      ('entry (sxml->result lst frq))
329-
      ('hira `(reading ,(car lst)))
330-
      ('hatsuon `(hatsuon ,(car lst)))
331-
      ('accent `(pitch ,(car lst)))
332-
      ('reading (sxml->reading lst))
333-
      ('form lst)
334-
      ('impli `(impli ,(car lst)))
335-
      ('text (let loop ((text "") (lst lst))
336-
               (match lst
337-
                 (() text)
338-
                 ((('hasPrecedingSpace . _) lst ...)
339-
                  (loop (string-append " " text) lst))
340-
                 ((('hasFollowingSpace . _) lst ...)
341-
                  (string-append (loop text lst) " "))
342-
                 (((? string? s) lst ...)
343-
                  (loop (string-append text s) lst)))))
344-
      ('famn (sxml->string lst))
345-
      ('expl (sxml->string lst))
346-
      ('expli (filter list? lst))
347-
      ('abbrev (filter list? lst))
348-
      ('token (filter string? lst))
349-
      ('tr (sxml->string lst))
350-
      ('transcr `(transcr ,lst))
351-
      ('trans
352-
        (append (filter list? lst)
353-
                (map (lambda (s) `(trans ,s)) (filter string? lst))))
354-
      ('jap `(jap ,lst))
355-
      ('emph lst)
356-
      ('title (filter string? lst))
357-
      ('transl (filter string? lst))
358-
      ('topic (filter string? lst))
359-
      ('iron (filter string? lst))
360-
      ('specchar (filter string? lst))
361-
      ('scientif (filter string? lst))
362-
      ('wikide #f)
363-
      ('wikija #f)
364-
      ('link #f)
365-
      ('ref (sxml->ref lst))
366-
      ('sref (sxml->ref lst))
367-
      ('etym (sxml->source lst))
368-
      ('literal (sxml->string (list "???" (sxml->string lst) "???")))
369-
      ('def (sxml->string (list "(" (sxml->string lst) ")")))
370-
      ('date (sxml->string (list "(" (sxml->string lst) ")")))
371-
      ('birthdeath (sxml->string (list "(" (sxml->string lst) ")")))
372-
      ('descr (sxml->string (list "(" (sxml->string lst) ")")))
373-
      ('bracket (sxml->string (list "[" (sxml->string lst) "]")))
374-
      ('foreign (if (null? lst) #f `(foreign ,(car lst))))
375-
      ('seasonword `(tag ,(string-append "season-" (assoc-ref lst 'type))))
376-
      ('usg (usg->tag lst))
377-
      ('sense (sxml->sense lst))
378-
      ('steinhaus (let ((ref (sxml->string lst)))
379-
                    (if (string? ref)
380-
                        `(ref ,(sxml->string lst))
381-
                        (throw 'not-steinhaus ref))))
382-
      ('pos '()); TODO: actually find what tag to use
383-
      ('wordcomponent (cons `(tag "wordcomponent") (gram-tag lst)))
384-
      ('meishi (cons `(tag "meishi") (gram-tag lst)))
385-
      ('setsuzokushi (cons `(tag "setsuzokushi") (gram-tag lst)))
386-
      ('daimeishi (cons `(tag "daimeishi") (gram-tag lst)))
387-
      ('doushi (cons `(tag "doushi") (gram-tag lst)))
388-
      ('kandoushi (cons `(tag "kandoushi") (gram-tag lst)))
389-
      ('keiyoudoushi (cons `(tag "keiyoudoushi") (gram-tag lst)))
390-
      ('keiyoushi (cons `(tag "keiyoushi") (gram-tag lst)))
391-
      ('fukushi (cons `(tag "fukushi") (gram-tag lst)))
392-
      ('rengo (cons `(tag "rengo") (gram-tag lst)))
393-
      ('suffix (cons `(tag "suffix") (gram-tag lst)))
394-
      ('prefix (cons `(tag "prefix") (gram-tag lst)))
395-
      ('kanji (cons `(tag "kanji") (gram-tag lst)))
396-
      ('rentaishi (cons `(tag "rentaishi") (gram-tag lst)))
397-
      ('specialcharacter (cons `(tag "specialcharacter") (gram-tag lst)))
398-
      ('joshi (cons `(tag "joshi") (gram-tag lst)))
399-
      ('fukujoshi (cons `(tag "fukujoshi") (gram-tag lst)))
400-
      ('kakujoshi (cons `(tag "kakujoshi") (gram-tag lst)))
401-
      ('kakarijoshi (cons `(tag "kakarijoshi") (gram-tag lst)))
402-
      ('shuujoshi (cons `(tag "shuujoshi") (gram-tag lst)))
403-
      ('setsuzokujoshi (cons `(tag "setsuzokujoshi") (gram-tag lst)))
404-
      ('jokeiyoushi (cons `(tag "jokeiyoushi") (gram-tag lst)))
405-
      ('jodoushi (cons `(tag "jodoushi") (gram-tag lst)))
406-
      ('ruigos lst)
407-
      ('ruigo (ruigo->ref lst))
408-
      ('gramGrp lst))))
409-
410-
(define (create-parser frq)
411-
  (ssax:make-parser
412-
    NEW-LEVEL-SEED
413-
    (lambda (elem-gi attributes namespaces expected-content seed)
414-
      attributes)
415-
    
416-
    FINISH-ELEMENT
417-
    (lambda (elem-gi attributes namespaces parent-seed seed)
418-
      (if (equal? elem-gi 'entries)
419-
          seed
420-
          (let* ((seed (reverse seed))
421-
                 (element (sxml->element seed elem-gi frq)))
422-
            (cons element parent-seed))))
423-
    
424-
    CHAR-DATA-HANDLER
425-
    (lambda (string1 string2 seed)
426-
      (cons (string-append string1 string2) seed))))
427-
428-
(define (xml->results port frq)
429-
  (let ((results (filter result? ((create-parser frq) port '()))))
430-
    (sort results (lambda (a b) (> (result-points a) (result-points b))))))

po/fr.po

77
msgstr ""
88
"Project-Id-Version: PACKAGE VERSION\n"
99
"Report-Msgid-Bugs-To: \n"
10-
"POT-Creation-Date: 2020-06-05 15:21+0200\n"
10+
"POT-Creation-Date: 2020-06-25 02:57+0200\n"
1111
"PO-Revision-Date: 2020-06-15 14:56+0000\n"
1212
"Last-Translator: full name <fedora-account@lepiller.eu>\n"
1313
"Language-Team: French <https://translate.fedoraproject.org/projects/nani/"

107107
msgid "JMdict"
108108
msgstr "JMdict"
109109
110-
#: tools/list.scm:55
110+
#: tools/list.scm:63
111111
msgid ""
112112
"Japanese/Dutch dictionary from the Electronic Dictionary Research and "
113113
"Development Group."

115115
"Dictionnaire japonais/anglais de l???Electronic Dictionary Research and "
116116
"Development Group."
117117
118-
#: tools/list.scm:54
118+
#: tools/list.scm:62
119119
msgid ""
120120
"Japanese/English dictionary from the Electronic Dictionary Research and "
121121
"Development Group."

123123
"Dictionnaire japonais/anglais de l???Electronic Dictionary Research and "
124124
"Development Group."
125125
126-
#: tools/list.scm:56
126+
#: tools/list.scm:64
127127
msgid ""
128128
"Japanese/French dictionary from the Electronic Dictionary Research and "
129129
"Development Group."

131131
"Dictionnaire japonais/fran??ais de l???Electronic Dictionary Research and "
132132
"Development Group."
133133
134+
#: tools/list.scm:53
135+
#, fuzzy
136+
msgid "Japanese/French dictionary from the Jibiki project."
137+
msgstr "Dictionnaire japonais/allemand de Wadoku."
138+
134139
#: tools/list.scm:37
135140
msgid "Japanese/German dictionary from Wadoku."
136141
msgstr "Dictionnaire japonais/allemand de Wadoku."
137142
138-
#: tools/list.scm:57
143+
#: tools/list.scm:65
139144
msgid ""
140145
"Japanese/German dictionary from the Electronic Dictionary Research and "
141146
"Development Group."

143148
"Dictionnaire japonais/allemand de l???Electronic Dictionary Research and "
144149
"Development Group."
145150
146-
#: tools/list.scm:58
151+
#: tools/list.scm:66
147152
msgid ""
148153
"Japanese/Hungarian dictionary from the Electronic Dictionary Research and "
149154
"Development Group."

151156
"Dictionnaire japonais/hongrois de l???Electronic Dictionary Research and "
152157
"Development Group."
153158
154-
#: tools/list.scm:59
159+
#: tools/list.scm:67
155160
msgid ""
156161
"Japanese/Russian dictionary from the Electronic Dictionary Research and "
157162
"Development Group."

159164
"Dictionnaire japonais/russe de l???Electronic Dictionary Research and "
160165
"Development Group."
161166
162-
#: tools/list.scm:60
167+
#: tools/list.scm:68
163168
msgid ""
164169
"Japanese/Slovenian dictionary from the Electronic Dictionary Research and "
165170
"Development Group."

167172
"Dictionnaire japonais/slov??ne de l???Electronic Dictionary Research and "
168173
"Development Group."
169174
170-
#: tools/list.scm:61
175+
#: tools/list.scm:69
171176
msgid ""
172177
"Japanese/Spanish dictionary from the Electronic Dictionary Research and "
173178
"Development Group."

175180
"Dictionnaire japonais/espagnol de l???Electronic Dictionary Research and "
176181
"Development Group."
177182
178-
#: tools/list.scm:62
183+
#: tools/list.scm:70
179184
msgid ""
180185
"Japanese/Swedish dictionary from the Electronic Dictionary Research and "
181186
"Development Group."

382387
"    aidera ?? mieux prononcer les mots, avec l'accent de hauteur du japonais\n"
383388
"    standard."
384389
390+
#: tools/list.scm:55
391+
#, fuzzy
392+
msgid ""
393+
"This dictionary allows you to do searches on the main view of this app.\n"
394+
"\tFailing to download one of these dictionaries will make the app unusable\n"
395+
"\tas you can't search for anything.  This dictionary can be searched for\n"
396+
"\tby kanji, reading (kana) and by French translation."
397+
msgstr ""
398+
"Ce dictionnaire vous permet d???effectuer des recherches sur la vue\n"
399+
"    principale de cette appli. Si vous n???en t??l??chargez aucun, l???appli\n"
400+
"    sera inutilisable puisque vous ne pourrez rien rechercher. Ce\n"
401+
"    dictionnaire permet d???effectuer des recherches par kanji, par\n"
402+
"    prononciation (kana) et par traduction allemande."
403+
385404
#: tools/list.scm:39
405+
#, fuzzy
386406
msgid ""
387407
"This dictionary allows you to do searches on the main view of this app.\n"
388-
"        Failing to download on of these dictionaries will make the app "
408+
"        Failing to download one of these dictionaries will make the app "
389409
"unusable\n"
390410
"        as you can't search for anything.  This dictionary can be searched "
391411
"for\n"

397417
"    dictionnaire permet d???effectuer des recherches par kanji, par\n"
398418
"    prononciation (kana) et par traduction allemande."
399419
400-
#: tools/list.scm:64
420+
#: tools/list.scm:72
401421
msgid ""
402422
"This dictionary allows you to do searches on the main view of this app.\n"
403423
"        Failing to download one of these dictionaries will make the app "

po/nani.pot

88
msgstr ""
99
"Project-Id-Version: PACKAGE VERSION\n"
1010
"Report-Msgid-Bugs-To: \n"
11-
"POT-Creation-Date: 2020-06-06 01:47+0200\n"
11+
"POT-Creation-Date: 2021-07-18 16:26+0200\n"
1212
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
1313
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1414
"Language-Team: LANGUAGE <LL@li.org>\n"

9797
msgid "JMdict"
9898
msgstr ""
9999
100-
#: tools/list.scm:55
100+
#: tools/list.scm:63
101101
msgid ""
102102
"Japanese/Dutch dictionary from the Electronic Dictionary Research and "
103103
"Development Group."
104104
msgstr ""
105105
106-
#: tools/list.scm:54
106+
#: tools/list.scm:62
107107
msgid ""
108108
"Japanese/English dictionary from the Electronic Dictionary Research and "
109109
"Development Group."
110110
msgstr ""
111111
112-
#: tools/list.scm:56
112+
#: tools/list.scm:64
113113
msgid ""
114114
"Japanese/French dictionary from the Electronic Dictionary Research and "
115115
"Development Group."
116116
msgstr ""
117117
118+
#: tools/list.scm:53
119+
msgid "Japanese/French dictionary from the Jibiki project."
120+
msgstr ""
121+
118122
#: tools/list.scm:37
119123
msgid "Japanese/German dictionary from Wadoku."
120124
msgstr ""
121125
122-
#: tools/list.scm:57
126+
#: tools/list.scm:65
123127
msgid ""
124128
"Japanese/German dictionary from the Electronic Dictionary Research and "
125129
"Development Group."
126130
msgstr ""
127131
128-
#: tools/list.scm:58
132+
#: tools/list.scm:66
129133
msgid ""
130134
"Japanese/Hungarian dictionary from the Electronic Dictionary Research and "
131135
"Development Group."
132136
msgstr ""
133137
134-
#: tools/list.scm:59
138+
#: tools/list.scm:67
135139
msgid ""
136140
"Japanese/Russian dictionary from the Electronic Dictionary Research and "
137141
"Development Group."
138142
msgstr ""
139143
140-
#: tools/list.scm:60
144+
#: tools/list.scm:68
141145
msgid ""
142146
"Japanese/Slovenian dictionary from the Electronic Dictionary Research and "
143147
"Development Group."
144148
msgstr ""
145149
146-
#: tools/list.scm:61
150+
#: tools/list.scm:69
147151
msgid ""
148152
"Japanese/Spanish dictionary from the Electronic Dictionary Research and "
149153
"Development Group."
150154
msgstr ""
151155
152-
#: tools/list.scm:62
156+
#: tools/list.scm:70
153157
msgid ""
154158
"Japanese/Swedish dictionary from the Electronic Dictionary Research and "
155159
"Development Group."

311315
"         words better, with a standard Japanese pitch accent."
312316
msgstr ""
313317
318+
#: tools/list.scm:55
319+
msgid ""
320+
"This dictionary allows you to do searches on the main view of this app.\n"
321+
"\tFailing to download one of these dictionaries will make the app unusable\n"
322+
"\tas you can't search for anything.  This dictionary can be searched for\n"
323+
"\tby kanji, reading (kana) and by French translation."
324+
msgstr ""
325+
314326
#: tools/list.scm:39
315327
msgid ""
316328
"This dictionary allows you to do searches on the main view of this app.\n"
317-
"        Failing to download on of these dictionaries will make the app "
329+
"        Failing to download one of these dictionaries will make the app "
318330
"unusable\n"
319331
"        as you can't search for anything.  This dictionary can be searched "
320332
"for\n"
321333
"        by kanji, reading (kana) and by German translation."
322334
msgstr ""
323335
324-
#: tools/list.scm:64
336+
#: tools/list.scm:72
325337
msgid ""
326338
"This dictionary allows you to do searches on the main view of this app.\n"
327339
"        Failing to download one of these dictionaries will make the app "

po/uk.po

107107
msgid "JMdict"
108108
msgstr "JMdict"
109109
110-
#: tools/list.scm:55
110+
#: tools/list.scm:63
111111
msgid ""
112112
"Japanese/Dutch dictionary from the Electronic Dictionary Research and "
113113
"Development Group."

115115
"????????????????-???????????????????????? ?????????????? ?????? Electronic Dictionary Research and "
116116
"Development Group."
117117
118-
#: tools/list.scm:54
118+
#: tools/list.scm:62
119119
msgid ""
120120
"Japanese/English dictionary from the Electronic Dictionary Research and "
121121
"Development Group."

123123
"????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and "
124124
"Development Group."
125125
126-
#: tools/list.scm:56
126+
#: tools/list.scm:64
127127
msgid ""
128128
"Japanese/French dictionary from the Electronic Dictionary Research and "
129129
"Development Group."

131131
"????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and "
132132
"Development Group."
133133
134+
#: tools/list.scm:53
135+
msgid "Japanese/French dictionary from the Jibiki project."
136+
msgstr ""
137+
134138
#: tools/list.scm:37
135139
msgid "Japanese/German dictionary from Wadoku."
136140
msgstr "????????????????-?????????????????? ?????????????? ?????? Wadoku."
137141
138-
#: tools/list.scm:57
142+
#: tools/list.scm:65
139143
msgid ""
140144
"Japanese/German dictionary from the Electronic Dictionary Research and "
141145
"Development Group."

143147
"????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and "
144148
"Development Group."
145149
146-
#: tools/list.scm:58
150+
#: tools/list.scm:66
147151
msgid ""
148152
"Japanese/Hungarian dictionary from the Electronic Dictionary Research and "
149153
"Development Group."

151155
"????????????????-?????????????????? ?????????????? ?????? Electronic Dictionary Research and "
152156
"Development Group."
153157
154-
#: tools/list.scm:59
158+
#: tools/list.scm:67
155159
msgid ""
156160
"Japanese/Russian dictionary from the Electronic Dictionary Research and "
157161
"Development Group."

159163
"????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and "
160164
"Development Group."
161165
162-
#: tools/list.scm:60
166+
#: tools/list.scm:68
163167
msgid ""
164168
"Japanese/Slovenian dictionary from the Electronic Dictionary Research and "
165169
"Development Group."

167171
"????????????????-?????????????????????? ?????????????? ?????? Electronic Dictionary Research and "
168172
"Development Group."
169173
170-
#: tools/list.scm:61
174+
#: tools/list.scm:69
171175
msgid ""
172176
"Japanese/Spanish dictionary from the Electronic Dictionary Research and "
173177
"Development Group."

175179
"????????????????-???????????????????? ?????????????? ?????? Electronic Dictionary Research and "
176180
"Development Group."
177181
178-
#: tools/list.scm:62
182+
#: tools/list.scm:70
179183
msgid ""
180184
"Japanese/Swedish dictionary from the Electronic Dictionary Research and "
181185
"Development Group."

383387
"?????????????????? ???? ?? ????????????????????, ?? ?????? ?????????????? ?????????????? ????????, ?????? ???????????????????? ??????\n"
384388
"?????????????????? ?????????? ??????????, ?? ?????????????????????????? ???????????????????? ???????????????? ????????????????."
385389
390+
#: tools/list.scm:55
391+
msgid ""
392+
"This dictionary allows you to do searches on the main view of this app.\n"
393+
"\tFailing to download one of these dictionaries will make the app unusable\n"
394+
"\tas you can't search for anything.  This dictionary can be searched for\n"
395+
"\tby kanji, reading (kana) and by French translation."
396+
msgstr ""
397+
386398
#: tools/list.scm:39
387399
msgid ""
388400
"This dictionary allows you to do searches on the main view of this app.\n"
389-
"        Failing to download on of these dictionaries will make the app "
401+
"        Failing to download one of these dictionaries will make the app "
390402
"unusable\n"
391403
"        as you can't search for anything.  This dictionary can be searched "
392404
"for\n"

399411
"???????????????? ???? ???? ?????????????? ???????????? ?? ?????? ???????????? ??????????. ?? ?????????? ???????????????? ??????????\n"
400412
"???????????? ???? ????????????, ?????????????? ?????????????????? (????????) ???? ???????????????????? ??????????????????."
401413
402-
#: tools/list.scm:64
414+
#: tools/list.scm:72
403415
msgid ""
404416
"This dictionary allows you to do searches on the main view of this app.\n"
405417
"        Failing to download one of these dictionaries will make the app "

445457
"multilingual,\n"
446458
"including Dutch, French, German and others."
447459
msgstr ""
448-
"???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a\""
449-
">Creative Commons\n"
460+
"???? ?????????????????? ???????? ???????????????? ???? ???????? ???????????????????? ???????????????? <a href=\"~a"
461+
"\">Creative Commons\n"
450462
"Share-Alike</a>. ???? ???????? ?? ???????????????????? ?? ???????????????????? (????????????????????) ?? "
451463
"???????????????????????? ??????????, ??????????????\n"
452464
"????????????????????????, ??????????????????????, ?????????????????? ???? ???????????? ????????????."

radicals.mk

1-
RADK_MODULES=tools/radk.scm modules/nani/radk.scm modules/nani/serialize.scm
1+
RADK_MODULES=tools/radk.scm modules/nani/kanji/radk.scm modules/nani/encoding/serialize.scm
22
DICOS+=dicos/radicals.nani
33
DOWNLOADS+=dictionaries/radkfilex.utf8 dictionaries/kanjidic2.xml
44
55
dictionaries/radkfilex.utf8:
6-
	wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip
6+
	#wget ftp://ftp.monash.edu/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip
7+
	wget http://ftp.usf.edu/pub/ftp.monash.edu.au/pub/nihongo/kradzip.zip -O dictionaries/kradzip.zip
78
	unzip dictionaries/kradzip.zip radkfilex -d dictionaries
89
	iconv -f euc-jp -t utf-8 dictionaries/radkfilex > $@
910
	rm dictionaries/radkfilex

tools/jibiki.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2020 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(use-modules (nani result result))
20+
(use-modules (nani result jibiki))
21+
(use-modules (nani result frequency))
22+
(use-modules (ice-9 match))
23+
(use-modules (ice-9 binary-ports))
24+
25+
;; Break these steps to try and let the GC reclaim these big objects
26+
(define (get-results1 input frq)
27+
  (call-with-input-file input
28+
    (lambda (port)
29+
      (xml->results port frq))))
30+
31+
(define (get-results input meaning-filter frq)
32+
  (let* ((results (get-results1 input frq))
33+
         (results (map (lambda (result)
34+
                         (update-result
35+
                           result
36+
                           #:meanings (filter meaning-filter
37+
                                            (result-meanings result))))
38+
                       results))
39+
         (results (filter (lambda (result) (not (null? (result-meanings result))))
40+
                          results)))
41+
    results))
42+
43+
(define (compile input meaning-filter output)
44+
  (let* ((results (get-results input meaning-filter
45+
                               (load-frequency "dictionaries/frequency.tsv"))))
46+
    (format #t "Number of entries in ~a: ~a~%" output (length results))
47+
    (call-with-output-file output
48+
      (lambda (port)
49+
        (put-bytevector port
50+
          (serialize-dictionary results))))))
51+
52+
(match (command-line)
53+
  ((_ cmd input output)
54+
   (cond
55+
    ((equal? cmd "build")
56+
     (compile input (const #t) output))
57+
    (else (format #t "Unknown cmd ~a.~%" cmd)))))

tools/jmdict.scm

1616
;;; You should have received a copy of the GNU Affero General Public License
1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
19-
(use-modules (nani jmdict trie))
20-
(use-modules (nani jmdict serialize))
21-
(use-modules (nani jmdict xml))
22-
(use-modules (nani frequency))
23-
(use-modules (nani trie))
24-
(use-modules (nani result))
19+
(use-modules (nani result frequency))
20+
(use-modules (nani result jmdict))
21+
(use-modules (nani result result))
2522
(use-modules (ice-9 match))
2623
(use-modules (ice-9 binary-ports))
2724

3128
    (lambda (port)
3229
      (xml->results port frq))))
3330
34-
(define (get-results input sense-filter frq)
31+
(define (get-results input meaning-filter frq)
3532
  (let* ((results (get-results1 input frq))
3633
         (results (map (lambda (result)
3734
                         (update-result
3835
                           result
39-
                           #:senses (filter sense-filter
40-
                                            (result-senses result))))
36+
                           #:meanings (filter meaning-filter
37+
                                            (result-meanings result))))
4138
                       results))
42-
         (results (filter (lambda (result) (not (null? (result-senses result))))
39+
         (results (filter (lambda (result) (not (null? (result-meanings result))))
4340
                          results)))
4441
    results))
4542
46-
(define (compile input sense-filter output)
47-
  (let* ((results (get-results input sense-filter
48-
                               (load-frequency "dictionaries/frequency.tsv")))
49-
         (kanji-trie (compress-trie (make-kanji-trie results)))
50-
         (reading-trie (compress-trie (make-reading-trie results)))
51-
         (meaning-trie (compress-trie (make-meaning-trie results))))
43+
(define (compile input meaning-filter output)
44+
  (let* ((results (get-results input meaning-filter
45+
                               (load-frequency "dictionaries/frequency.tsv"))))
5246
    (format #t "Number of entries in ~a: ~a~%" output (length results))
5347
    (call-with-output-file output
5448
      (lambda (port)
5549
        (put-bytevector port
56-
          (serialize-jmdict results kanji-trie reading-trie meaning-trie))))))
50+
          (serialize-dictionary results))))))
5751
5852
(define (print word dict)
5953
  #t)

6458
    ((equal? cmd "build")
6559
     (if (equal? lang "e")
6660
       (compile input (const #t) output)
67-
       (compile input (lambda (sense) (equal? (sense-language sense) lang)) output)))
61+
       (compile input (lambda (meaning) (equal? (meaning-language meaning) lang)) output)))
6862
    (else (format #t "Unknown cmd ~a.~%" cmd))))
6963
  ((_ "print" word input)
7064
   (print word input)))

tools/list.scm

1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
1919
(use-modules (tools i18n))
20-
(use-modules (nani radk))
21-
(use-modules (nani jmdict serialize))
22-
(use-modules (nani wadoku pitch))
20+
(use-modules (nani kanji radk))
21+
(use-modules (nani result result))
22+
(use-modules (nani pitch pitch))
2323
(use-modules (gcrypt hash))
2424
(use-modules (ice-9 match))
2525
(use-modules (ice-9 format))

3737
    `(_ "Japanese/German dictionary from Wadoku."))
3838
  (define wadoku-description
3939
    `(_ "This dictionary allows you to do searches on the main view of this app.
40-
        Failing to download on of these dictionaries will make the app unusable
40+
        Failing to download one of these dictionaries will make the app unusable
4141
        as you can't search for anything.  This dictionary can be searched for
4242
        by kanji, reading (kana) and by German translation."))
4343

4949
         and this dictionary will add information that will help you pronounce
5050
         words better, with a standard Japanese pitch accent."))
5151
52+
  (define jibiki-synopsis
53+
    `(_ "Japanese/French dictionary from the Jibiki project."))
54+
  (define jibiki-description
55+
    `(_ "This dictionary allows you to do searches on the main view of this app.
56+
	Failing to download one of these dictionaries will make the app unusable
57+
	as you can't search for anything.  This dictionary can be searched for
58+
	by kanji, reading (kana) and by French translation."))
59+
5260
  (define (jmdict-synopsis lang)
5361
    (match lang
5462
      ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group."))

8088
             (if long?
8189
                 wadoku-pitch-description
8290
                 wadoku-pitch-synopsis))
91+
            ((equal? (dico-type dico) "jibiki")
92+
             (if long?
93+
                 jibiki-description
94+
                 jibiki-synopsis))
8395
            ((equal? (dico-type dico) "jmdict")
8496
             (let ((dico-lang (substring dico 7)))
8597
               (if long?

107119
    ((equal? file "radicals") "radk")
108120
    ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict"))
109121
     "jmdict")
122+
    ((equal? file "jibiki_fre") "jibiki")
110123
    ((equal? file "wadoku_ger") "wadoku")
111124
    ((equal? file "wadoku_pitch") "wadoku_pitch")))
112125

114127
  (cond
115128
    ((equal? (dico-type (dico-name file)) "radk")
116129
     (kanji-count file))
117-
    ((member (dico-type (dico-name file)) '("jmdict" "wadoku"))
118-
     (jmdict-entry-count file))
130+
    ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki"))
131+
     (dictionary-entry-count file))
119132
    ((equal? (dico-type (dico-name file)) "wadoku_pitch")
120133
     (pitch-entry-count file))))
121134

tools/radk.scm

1616
;;; You should have received a copy of the GNU Affero General Public License
1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
19-
(use-modules (nani radk))
19+
(use-modules (nani kanji radk))
2020
(use-modules (ice-9 match))
2121
(use-modules (ice-9 binary-ports))
2222

tools/wadoku.scm

1616
;;; You should have received a copy of the GNU Affero General Public License
1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
19-
(use-modules (nani jmdict trie))
20-
(use-modules (nani jmdict serialize))
21-
(use-modules (nani wadoku xml))
22-
(use-modules (nani wadoku pitch))
23-
(use-modules (nani frequency))
24-
(use-modules (nani trie))
25-
(use-modules (nani result))
19+
(use-modules (nani pitch wadoku))
20+
(use-modules (nani pitch pitch))
21+
(use-modules (nani result frequency))
22+
(use-modules (nani result result))
23+
(use-modules (nani result wadoku))
2624
(use-modules (ice-9 match))
2725
(use-modules (ice-9 binary-ports))
2826

3230
    (lambda (port)
3331
      (xml->results port frq))))
3432
35-
(define (get-results input sense-filter frq)
33+
(define (get-results input meaning-filter frq)
3634
  (let* ((results (get-results1 input frq))
3735
         (results (map (lambda (result)
3836
                         (update-result
3937
                           result
40-
                           #:senses (filter sense-filter
41-
                                            (result-senses result))))
38+
                           #:meanings (filter meaning-filter
39+
                                            (result-meanings result))))
4240
                       results))
43-
         (results (filter (lambda (result) (not (null? (result-senses result))))
41+
         (results (filter (lambda (result) (not (null? (result-meanings result))))
4442
                          results)))
4543
    results))
4644
47-
(define (compile input sense-filter output)
48-
  (let* ((results (get-results input sense-filter
49-
                               (load-frequency "dictionaries/frequency.tsv")))
50-
         (kanji-trie (compress-trie (make-kanji-trie results)))
51-
         (reading-trie (compress-trie (make-reading-trie results)))
52-
         (meaning-trie (compress-trie (make-meaning-trie results))))
45+
(define (compile input meaning-filter output)
46+
  (let* ((results (get-results input meaning-filter
47+
                               (load-frequency "dictionaries/frequency.tsv"))))
5348
    (format #t "Number of entries in ~a: ~a~%" output (length results))
5449
    (call-with-output-file output
5550
      (lambda (port)
5651
        (put-bytevector port
57-
          (serialize-jmdict results kanji-trie reading-trie meaning-trie))))))
52+
          (serialize-dictionary results))))))
5853
5954
(define (get-pitch input)
6055
  (call-with-input-file input

6358
6459
(define (pitch input output)
6560
  (let ((results (get-pitch input)))
66-
    (format #t "~a results." (length results))
61+
    (format #t "~a results.~%" (length results))
6762
    (call-with-output-file output
6863
      (lambda (port)
6964
        (put-bytevector port