Better compression and sort results by relevance

Julien LepillerThu Apr 18 17:35:09+0200 2019

d858d11

Better compression and sort results by relevance

Makefile

44
55
DICOS=dicos/JMdict_e.nani $(addprefix dicos/JMdict_, $(addsuffix .nani, $(JMDICT_LANGS)))
66
7-
PAGES=blog.scm e404.scm feeds.scm index.scm mentions.scm
7+
PAGES=blog.scm data.scm documentation.scm e404.scm feeds.scm index.scm mentions.scm
88
99
HAUNT_FILES= haunt.scm $(addprefix pages/, $(PAGES)) \
1010
        tools/i18n.scm tools/theme.scm

1414
1515
DICO_MODULES=modules/nani/trie.scm modules/nani/result.scm modules/nani/jmdict/trie.scm \
1616
        modules/nani/jmdict/serialize.scm modules/nani/jmdict/xml.scm \
17-
        modules/nani/jmdict/entities.scm
17+
        modules/nani/jmdict/entities.scm modules/nani/frequency.scm
1818
19-
all: po/fr/LC_MESSAGES/nani.mo po/eo/LC_MESSAGES/nani.mo $(WEB_FILES)
19+
LANGS=fr
20+
21+
all: $(addprefix po/, $(addsuffix /LC_MESSAGES/nani.mo, $(LANGS))) $(WEB_FILES)
2022
	haunt build
2123
	rm -rf public.bak
2224
	mv public public.bak

2527
2628
download:
2729
	@rm -f dictionaries/*
28-
	@$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml
30+
	@$(MAKE) dictionaries/JMdic_e.xml dictionaries/JMdic.xml dictionaries/frequency.tsv
2931
32+
# Download JMdict dictionaries from ERDRG
3033
dictionaries/%.xml:
31-
	wget http://ftp.monash.edu/pub/nihongo/$(shell basename $<).gz -O $<.gz
32-
	gunzip $<.gz
33-
	sed -i -e 's|&lt;|\&\&lt;;|g' -e 's|&gt;|\&\&gt;;|g' $<
34-
	sed -i -e 's|&\([^;]\+\);|\1|g' $<
35-
	cp $< $@
34+
	dl_filename="$(shell echo "$@" | rev | cut -c5- | rev)"; \
35+
	wget http://ftp.monash.edu/pub/nihongo/"$$dl_filename".gz -O "$$dl_filename.gz"; \
36+
	gunzip "$$dl_filename.gz"; \
37+
	sed -i -e 's|&lt;|\&\&lt;;|g' -e 's|&gt;|\&\&gt;;|g' "$$dl_filename"; \
38+
	sed -i -e 's|&\([^;]\+\);|\1|g' "$$dl_filename"; \
39+
	cp "$$dl_filename" "$@"
40+
41+
# Download frequency analysis run on Wikipedia in 2015
42+
# https://en.wiktionary.org/wiki/Wiktionary:Frequency_lists/Japanese2015_10000
43+
dictionaries/frequency.tsv:
44+
	wget --no-check-certificate \
45+
        https://namakajiri.net/data/wikipedia-20150422-lemmas.tsv -O $@.tmp
46+
	sed -i 's| ||g' $@.tmp
47+
	mv $@.tmp $@
3648
3749
dictionaries/%.sxml: dictionaries/%.xml tools/jmdict.scm
3850
	guile -L modules tools/jmdict.scm convert $< nolang $@
3951
40-
dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm $(DICO_MODULES)
52+
dicos/JMdict_%.nani: dictionaries/JMdict.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES)
4153
	guile -L modules tools/jmdict.scm build \
4254
        $< $(shell echo $@ | sed 's|^.*_\([^.]*\)\..*$$|\1|g') $@
4355
44-
dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm $(DICO_MODULES)
56+
dicos/JMdict_e.nani: dictionaries/JMdict_e.sxml tools/jmdict.scm dictionaries/frequency.tsv $(DICO_MODULES)
4557
	guile -L modules tools/jmdict.scm build $< e $@
4658
4759
po/%/LC_MESSAGES/nani.mo: po/%.po

modules/nani/frequency.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani frequency)
20+
  #:use-module (ice-9 rdelim)
21+
  #:export (load-frequency
22+
            frequency-entity))
23+
24+
(define (load-frequency file)
25+
  (call-with-input-file file
26+
    (lambda (port)
27+
      (let loop ((frq '()))
28+
        (let* ((line (%read-line port))
29+
               (line (car line)))
30+
          (if (eof-object? line)
31+
            frq
32+
            (let ((content (string-split line #\tab)))
33+
              (loop (cons (cons (cadr (cdr content))
34+
                                (string->number (car content)))
35+
                          frq)))))))))
36+
37+
(define (frequency-entity frq word)
38+
  (let ((freq (assoc-ref frq word)))
39+
    (cond
40+
      ((not freq)
41+
       #f)
42+
      ((< freq 501)
43+
       "frq500")
44+
      ((< freq 1001)
45+
       "frq1000")
46+
      ((< freq 2001)
47+
       "frq2000")
48+
      ((< freq 5001)
49+
       "frq1000")
50+
      ((< freq 10001)
51+
       "frq10000")
52+
      ((< freq 20001)
53+
       "frq20000")
54+
      (else #f))))

modules/nani/huffman.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani huffman)
20+
  #:use-module (ice-9 match)
21+
  #:use-module (rnrs bytevectors)
22+
  #:export (create-huffman
23+
            huffman->code
24+
            huffman-encode
25+
            huffman-decode
26+
            serialize-huffman))
27+
28+
(define (add-occurence occ char)
29+
  (let* ((o (assoc-ref occ char))
30+
         (o (+ (if o o 0) 1)))
31+
    (assoc-set! occ char o)))
32+
33+
(define (add-occurences occ str)
34+
  (let loop ((lst (append (string->list str) (list #\nul))) (occ occ))
35+
    (if (null? lst)
36+
      occ
37+
      (loop (cdr lst) (add-occurence occ (car lst))))))
38+
39+
(define (get-occurences string-list)
40+
  (let loop ((lst string-list) (occ '()))
41+
    (if (null? lst)
42+
      occ
43+
      (loop (cdr lst) (add-occurences occ (car lst))))))
44+
45+
(define (order tree)
46+
  (if (null? tree)
47+
    tree
48+
    (add-in-order (car tree) (order (cdr tree)))))
49+
50+
(define (add-in-order elem tree)
51+
  (if (null? tree)
52+
    (list elem)
53+
    (let ((elem-pos (cdr elem))
54+
          (tree-pos (cdr (car tree))))
55+
      (if (> elem-pos tree-pos)
56+
        (cons (car tree) (add-in-order elem (cdr tree)))
57+
        (cons elem tree)))))
58+
59+
(define (create-huffman string-list)
60+
  (let ((total (apply + (map string-length string-list)))
61+
        (occurences (get-occurences string-list)))
62+
    (let loop ((tree (order occurences)))
63+
      (if (equal? (length tree) 1)
64+
        tree
65+
        (let* ((e1 (car tree))
66+
               (tree (cdr tree))
67+
               (e2 (car tree))
68+
               (tree (cdr tree)))
69+
          (loop (add-in-order (cons (list e1 e2) (+ (cdr e1) (cdr e2))) tree)))))))
70+
71+
(define (huffman->code huffman)
72+
  (match huffman
73+
    ('() '())
74+
    ((((h1 h2) . weight))
75+
     (append
76+
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
77+
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
78+
    (((h1 h2) . weight)
79+
     (append
80+
       (map (lambda (e) (cons (car e) (cons 0 (cdr e)))) (huffman->code h1))
81+
       (map (lambda (e) (cons (car e) (cons 1 (cdr e)))) (huffman->code h2))))
82+
    ((((? char? char) . weight))
83+
     (list (cons char '())))
84+
    (((? char? char) . weight)
85+
     (list (cons char '())))))
86+
87+
(define (bits->bytes b)
88+
  (match b
89+
   ('() '())
90+
   ((a b c d e f g h bs ...)
91+
    (cons (+ h (* 2 g) (* 4 f) (* 8 e) (* 16 d) (* 32 c) (* 64 b) (* 128 a))
92+
      (bits->bytes bs)))
93+
   (_ (bits->bytes (append b (make-list (- 8 (length b)) 0))))))
94+
95+
(define (huffman-encode code str)
96+
  (u8-list->bytevector
97+
    (bits->bytes
98+
      (apply append (map (lambda (c) (assoc-ref code c))
99+
                         (append (string->list str) (list #\nul)))))))
100+
101+
(define (byte->bits b)
102+
  (let loop ((i 2) (b b) (by '()))
103+
    (if (eq? i 512)
104+
      by
105+
      (loop (* i 2) (- b (modulo b i)) (cons (if (eq? (modulo b i) 0) 0 1) by)))))
106+
107+
(define (huffman-decode huffman bv)
108+
  (let ((seq (apply append (map byte->bits (bytevector->u8-list bv)))))
109+
    (list->string
110+
      (reverse
111+
        (let loop ((seq seq) (result '()) (huff huffman))
112+
          (if (null? seq)
113+
            result
114+
            (match huff
115+
              ((((h1 h2) . weight))
116+
               (if (= (car seq) 0)
117+
                 (loop (cdr seq) result h1)
118+
                 (loop (cdr seq) result h2)))
119+
              (((h1 h2) . weight)
120+
               (if (= (car seq) 0)
121+
                 (loop (cdr seq) result h1)
122+
                 (loop (cdr seq) result h2)))
123+
              ((((? char? char) . weight))
124+
               (if (equal? char #\nul)
125+
                 result
126+
                 (loop seq (cons char result) huffman)))
127+
              (((? char? char) . weight)
128+
               (if (equal? char #\nul)
129+
                 result
130+
                 (loop seq (cons char result) huffman))))))))))
131+
132+
(define (serialize-huffman huffman)
133+
  (define (serialize huffman)
134+
    (match huffman
135+
      ((((h1 h2) . weight))
136+
       (append '(1) (serialize h1) (serialize h2) '(2)))
137+
      (((h1 h2) . weight)
138+
       (append '(1) (serialize h1) (serialize h2) '(2)))
139+
      ((((? char? char) . weight))
140+
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))
141+
      (((? char? char) . weight)
142+
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))))
143+
  (u8-list->bytevector (serialize huffman)))

modules/nani/jmdict/entities.scm

174174
    ("sumo" . 170)
175175
    ("zool" . 171)
176176
    ("joc" . 172)
177-
    ("anat" . 173)))
177+
    ("anat" . 173)
178+
    ("frq500" . 174)
179+
    ("frq1000" . 175)
180+
    ("frq2000" . 176)
181+
    ("frq5000" . 177)
182+
    ("frq10000" . 178)
183+
    ("frq20000" . 179)))
178184
179185
(define (get-entity ent)
180186
  (let ((val (assoc-ref entities ent)))

modules/nani/jmdict/serialize.scm

1717
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
1818
1919
(define-module (nani jmdict serialize)
20+
  #:use-module (nani huffman)
2021
  #:use-module (nani result)
2122
  #:use-module (nani trie)
2223
  #:use-module (rnrs bytevectors)

4647
          (loop pos (cdr trs) (append bvs bv))))))
4748
  (trie-position-set! trie pos)
4849
  (let* ((vals-sz (list-size (trie-vals trie) int-size))
49-
         (trs-sz (list-size (trie-transitions trie) (const 5)))
50-
         (sz (+ vals-sz trs-sz))
50+
         (trs-sz (list-size (trie-transitions trie) (const 5) #:size? #f))
51+
         (sz (+ vals-sz 1 trs-sz))
5152
         (bv (make-bytevector sz 0)))
5253
    (serialize-list (map (lambda (pos) (result-position (array-ref results pos)))
5354
                         (trie-vals trie))

5758
                  (+ pos (bytevector-length bv))))
5859
           (next-pos (car bvs))
5960
           (bvs (cdr bvs)))
61+
      (bytevector-u8-set! bv vals-sz (length (trie-transitions trie)))
6062
      (serialize-list (trie-transitions trie) serialize-pointer
61-
                      vals-sz bv)
63+
                      (+ vals-sz 1) bv #:size? #f)
6264
      (cons next-pos (cons bv bvs)))))
6365
66+
(define* (serialize-list lst serialize pos bv #:key (size? #t))
67+
  (when (not (list? lst)) (throw 'not-list lst))
68+
  (when size?
69+
    (bytevector-u16-set! bv pos (length lst) (endianness big)))
70+
  (let loop ((lst lst) (pos (+ pos (if size? 2 0))))
71+
    (if (null? lst)
72+
      pos
73+
      (loop (cdr lst) (serialize (car lst) pos bv)))))
74+
(define* (list-size lst size #:key (size? #t))
75+
  (when (not (list? lst)) (throw 'not-list lst))
76+
  (apply + (if size? 2 0) (map size lst)))
77+
6478
(define (serialize-pointer ptr pos bv)
6579
  (bytevector-u8-set! bv pos (car ptr))
6680
  (bytevector-u32-set! bv (+ pos 1) (trie-position (cdr ptr)) (endianness big))

8195
  (+ pos 1))
8296
(define boolean-size (const 1))
8397
84-
(define (serialize-string str pos bv)
85-
  (let ((sbv (string->utf8 str)))
86-
    (bytevector-u32-set! bv pos (bytevector-length sbv) (endianness big))
87-
    (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv))
88-
    (+ pos 4 (bytevector-length sbv))))
89-
(define (string-size str)
90-
  (let ((sbv (string->utf8 str)))
91-
    (+ 4 (bytevector-length sbv))))
92-
93-
(define* (serialize-list lst serialize pos bv #:key (size? #t))
94-
  (when (not (list? lst)) (throw 'not-list lst))
95-
  (when size?
96-
    (bytevector-u32-set! bv pos (length lst) (endianness big)))
97-
  (let loop ((lst lst) (pos (+ pos (if size? 4 0))))
98-
    (if (null? lst)
99-
      pos
100-
      (loop (cdr lst) (serialize (car lst) pos bv)))))
101-
(define* (list-size lst size #:key (size? #t))
102-
  (when (not (list? lst)) (throw 'not-list lst))
103-
  (apply + (if size? 4 0) (map size lst)))
104-
105-
(define (serialize-source source pos bv)
106-
  (when (not (source? source)) (throw 'not-source source))
107-
  (let* ((pos (serialize-list (source-content source) serialize-string pos bv))
108-
         (pos (serialize-boolean (source-wasei? source) pos bv))
109-
         (pos (serialize-string (source-type source) pos bv))
110-
         (pos (serialize-string (source-lang source) pos bv)))
111-
    pos))
112-
(define (source-size source)
113-
  (when (not (source? source)) (throw 'not-source source))
114-
  (+ (list-size (source-content source) string-size)
115-
     (boolean-size (source-wasei? source))
116-
     (string-size (source-type source))
117-
     (string-size (source-lang source))))
98+
(define (serialize-jmdict results kanji-trie reading-trie sense-trie)
99+
  (define kanji-huffman
100+
    (let ((kanjis (apply append (map result-kanjis results))))
101+
      (create-huffman kanjis)))
102+
  (define kanji-huffman-code (huffman->code kanji-huffman))
103+
  (define reading-huffman
104+
    (let* ((readings (apply append (map result-readings results)))
105+
           (readings (apply append (map reading-readings readings))))
106+
      (create-huffman readings)))
107+
  (define reading-huffman-code (huffman->code reading-huffman))
108+
  (define sense-huffman
109+
    (let* ((senses (apply append (map result-senses results)))
110+
           (glosses (apply append (map sense-glosses senses))))
111+
      (create-huffman glosses)))
112+
  (define sense-huffman-code (huffman->code sense-huffman))
118113
119-
(define (serialize-reading reading pos bv)
120-
  (when (not (reading? reading)) (throw 'not-reading reading))
121-
  (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv))
122-
         (pos (serialize-list (reading-info reading) serialize-string pos bv))
123-
         (pos (serialize-list (reading-readings reading) serialize-string pos bv)))
124-
    pos))
125-
(define (reading-size reading)
126-
  (when (not (reading? reading)) (throw 'not-reading reading))
127-
  (+ (list-size (reading-kanjis reading) string-size)
128-
     (list-size (reading-info reading) string-size)
129-
     (list-size (reading-readings reading) string-size)))
114+
  (define (serialize-string str pos bv)
115+
    (let ((sbv (string->utf8 str)))
116+
      (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
117+
      (bytevector-u8-set! bv (+ pos (bytevector-length sbv)) 0)
118+
      (+ pos 1 (bytevector-length sbv))))
119+
  (define (string-size str)
120+
    (let ((sbv (string->utf8 str)))
121+
      (+ 1 (bytevector-length sbv))))
130122
131-
(define (serialize-sense sense pos bv)
132-
  (when (not (sense? sense)) (throw 'not-sense sense))
133-
  (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv))
134-
         (pos (serialize-list (sense-limits sense) serialize-string pos bv))
135-
         (pos (serialize-list (sense-infos sense) serialize-string pos bv))
136-
         (pos (serialize-list (sense-sources sense) serialize-source pos bv))
137-
         (pos (serialize-list (sense-tags sense) serialize-char pos bv))
138-
         (pos (serialize-list (sense-glosses sense) serialize-string pos bv))
139-
         (pos (serialize-string (sense-language sense) pos bv)))
140-
    pos))
141-
(define (sense-size sense)
142-
  (when (not (sense? sense)) (throw 'not-sense sense))
143-
  (+ (list-size (sense-references sense) string-size)
144-
     (list-size (sense-limits sense) string-size)
145-
     (list-size (sense-infos sense) string-size)
146-
     (list-size (sense-sources sense) source-size)
147-
     (list-size (sense-tags sense) char-size)
148-
     (list-size (sense-glosses sense) string-size)
149-
     (string-size (sense-language sense))))
123+
  (define (serialize-huffman-string huffman-code)
124+
    (lambda (str pos bv)
125+
      (let ((sbv (huffman-encode huffman-code str)))
126+
        (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv))
127+
        (+ pos (bytevector-length sbv)))))
128+
  (define (huffman-string-size huffman-code)
129+
    (lambda (str)
130+
      (let ((sbv (huffman-encode huffman-code str)))
131+
        (+ (bytevector-length sbv)))))
132+
  
133+
  (define (serialize-source source pos bv)
134+
    (when (not (source? source)) (throw 'not-source source))
135+
    (let* ((pos (serialize-list (source-content source) serialize-string pos bv))
136+
           (pos (serialize-boolean (source-wasei? source) pos bv))
137+
           (pos (serialize-string (source-type source) pos bv))
138+
           (pos (serialize-string (source-lang source) pos bv)))
139+
      pos))
140+
  (define (source-size source)
141+
    (when (not (source? source)) (throw 'not-source source))
142+
    (+ (list-size (source-content source) string-size)
143+
       (boolean-size (source-wasei? source))
144+
       (string-size (source-type source))
145+
       (string-size (source-lang source))))
146+
  
147+
  (define (serialize-reading reading pos bv)
148+
    (when (not (reading? reading)) (throw 'not-reading reading))
149+
    (let* ((pos (serialize-list (reading-kanjis reading) serialize-string pos bv))
150+
           (pos (serialize-list (reading-info reading) serialize-string pos bv))
151+
           (pos (serialize-list (reading-readings reading)
152+
                                (serialize-huffman-string reading-huffman-code) pos bv)))
153+
      pos))
154+
  (define (reading-size reading)
155+
    (when (not (reading? reading)) (throw 'not-reading reading))
156+
    (+ (list-size (reading-kanjis reading) string-size)
157+
       (list-size (reading-info reading) string-size)
158+
       (list-size (reading-readings reading) (huffman-string-size reading-huffman-code))))
159+
  
160+
  (define (serialize-sense sense pos bv)
161+
    (when (not (sense? sense)) (throw 'not-sense sense))
162+
    (let* ((pos (serialize-list (sense-references sense) serialize-string pos bv))
163+
           (pos (serialize-list (sense-limits sense) serialize-string pos bv))
164+
           (pos (serialize-list (sense-infos sense) serialize-string pos bv))
165+
           (pos (serialize-list (sense-sources sense) serialize-source pos bv))
166+
           (pos (serialize-list (sense-tags sense) serialize-char pos bv))
167+
           (pos (serialize-list (sense-glosses sense)
168+
                (serialize-huffman-string sense-huffman-code) pos bv))
169+
           (pos (serialize-string (sense-language sense) pos bv)))
170+
      pos))
171+
  (define (sense-size sense)
172+
    (when (not (sense? sense)) (throw 'not-sense sense))
173+
    (+ (list-size (sense-references sense) string-size)
174+
       (list-size (sense-limits sense) string-size)
175+
       (list-size (sense-infos sense) string-size)
176+
       (list-size (sense-sources sense) source-size)
177+
       (list-size (sense-tags sense) char-size)
178+
       (list-size (sense-glosses sense)
179+
                  (huffman-string-size sense-huffman-code))
180+
       (string-size (sense-language sense))))
181+
  
182+
  (define (serialize-result result pos bv)
183+
    (when (not (result? result)) (throw 'not-result result))
184+
    (result-position-set! result pos)
185+
    (let* ((pos (serialize-list (result-kanjis result)
186+
                                (serialize-huffman-string kanji-huffman-code) pos bv))
187+
           (pos (serialize-list (result-readings result) serialize-reading pos bv))
188+
           (pos (serialize-list (result-senses result) serialize-sense pos bv)))
189+
      pos))
190+
  (define (result-size result)
191+
    (when (not (result? result)) (throw 'not-result result))
192+
    (+ (list-size (result-kanjis result) (huffman-string-size kanji-huffman-code))
193+
       (list-size (result-readings result) reading-size)
194+
       (list-size (result-senses result) sense-size)))
150195
151-
(define (serialize-result result pos bv)
152-
  (when (not (result? result)) (throw 'not-result result))
153-
  (result-position-set! result pos)
154-
  (let* ((pos (serialize-list (result-kanjis result) serialize-string pos bv))
155-
         (pos (serialize-list (result-readings result) serialize-reading pos bv))
156-
         (pos (serialize-list (result-senses result) serialize-sense pos bv)))
157-
    pos))
158-
(define (result-size result)
159-
  (when (not (result? result)) (throw 'not-result result))
160-
  (+ (list-size (result-kanjis result) string-size)
161-
     (list-size (result-readings result) reading-size)
162-
     (list-size (result-senses result) sense-size)))
196+
  (define (trie-node-size trie)
197+
    (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))
163198
164-
(define (serialize-jmdict results kanji-trie reading-trie sense-trie)
165199
  (let* ((header (string->utf8 "NANI_JMDICT001"))
166200
         (header-size (bytevector-length header))
167201
         (pointers (make-bytevector 12 0))
202+
         (kanji-huffman-bv (serialize-huffman kanji-huffman))
203+
         (kanji-huffman-size (bytevector-length kanji-huffman-bv))
204+
         (reading-huffman-bv (serialize-huffman reading-huffman))
205+
         (reading-huffman-size (bytevector-length reading-huffman-bv))
206+
         (sense-huffman-bv (serialize-huffman sense-huffman))
207+
         (sense-huffman-size (bytevector-length sense-huffman-bv))
168208
         (results-size (list-size results result-size #:size? #f))
169-
         (results-bv (make-bytevector (+ header-size 12 results-size 0))))
170-
    (serialize-list results serialize-result (+ header-size 12) results-bv #:size? #f)
209+
         (huffman-size (+ reading-huffman-size sense-huffman-size kanji-huffman-size))
210+
         (results-bv (make-bytevector (+ header-size 12 kanji-huffman-size
211+
                                         reading-huffman-size sense-huffman-size
212+
                                         results-size 0))))
213+
    (format #t "Number of nodes in kanjis: ~a~%"
214+
      (trie-node-size kanji-trie))
215+
    (format #t "Number of nodes in readings: ~a~%"
216+
      (trie-node-size reading-trie))
217+
    (format #t "Number of nodes in senses: ~a~%"
218+
      (trie-node-size sense-trie))
219+
    (serialize-list results serialize-result (+ header-size 12 huffman-size)
220+
                    results-bv #:size? #f)
171221
    (let* ((results (list->array 1 results))
172222
           (pos (bytevector-length results-bv))
173223
           (kanji-bvs (serialize-trie kanji-trie results pos))

180230
      (bytevector-u32-set! pointers 8 (car reading-bvs) (endianness big))
181231
      (bytevector-copy! header 0 results-bv 0 header-size)
182232
      (bytevector-copy! pointers 0 results-bv header-size 12)
233+
      (bytevector-copy! kanji-huffman-bv 0 results-bv (+ header-size 12) kanji-huffman-size)
234+
      (bytevector-copy! reading-huffman-bv 0 results-bv
235+
                        (+ header-size 12 kanji-huffman-size)
236+
                        reading-huffman-size)
237+
      (bytevector-copy! sense-huffman-bv 0 results-bv
238+
                        (+ header-size 12 kanji-huffman-size reading-huffman-size)
239+
                        sense-huffman-size)
183240
      ;; give some feedback on the size of file's structures
241+
      (format #t "huffmans are ~a bytes long~%" huffman-size)
184242
      (format #t "results is ~a bytes long~%" (bytevector-length results-bv))
185243
      (format #t "kanjis is ~a bytes long~%" (apply + (map bytevector-length (cdr kanji-bvs))))
186244
      (format #t "readings is ~a bytes long~%" (apply + (map bytevector-length (cdr reading-bvs))))

modules/nani/jmdict/xml.scm

2020
  #:use-module (ice-9 match)
2121
  #:use-module (ice-9 rdelim)
2222
  #:use-module (sxml fold)
23+
  #:use-module (nani frequency)
2324
  #:use-module (sxml simple)
2425
  #:use-module (nani result)
2526
  #:use-module (nani jmdict entities)

7172
          ((? string? _) sense))
7273
        (cdr lst)))))
7374
74-
(define (sxml->result lst)
75-
  (let loop ((result (make-result 0 '() '() '())) (lst lst))
76-
    (if (null? lst)
77-
      result
78-
      (loop
79-
        (match (car lst)
80-
          (('kanji kanji) (update-result result #:kanjis (cons kanji (result-kanjis result))))
81-
          ((? reading? r) (update-result result #:readings (cons r (result-readings result))))
82-
          ((? sense? s) (update-result result #:senses (cons s (result-senses result))))
83-
          ((? string? _) result))
84-
        (cdr lst)))))
75+
(define (sxml->result lst frq)
76+
  (let ((result
77+
        (let loop ((result (make-result 0 0 '() '() '())) (lst lst))
78+
          (if (null? lst)
79+
            result
80+
            (loop
81+
              (match (car lst)
82+
                (('kanji kanji)
83+
                 (update-result result #:kanjis (cons kanji (result-kanjis result))))
84+
                ((? reading? r)
85+
                 (update-result result #:readings (cons r (result-readings result))))
86+
                ((? sense? s)
87+
                 (update-result result #:senses (cons s (result-senses result))))
88+
                ((? string? _) result))
89+
              (cdr lst))))))
90+
    (let* ((word (if (null? (result-kanjis result))
91+
                   (car (reading-readings (car (result-readings result))))
92+
                   (car (result-kanjis result))))
93+
           (entity (frequency-entity frq word))
94+
           (sense1 (car (result-senses result)))
95+
           (sense1 (update-sense sense1 #:tags (cons entity (sense-tags sense1))))
96+
           (senses (cons sense1 (cdr (result-senses result)))))
97+
      (update-result result #:points (get-points (update-result result #:senses senses))))))
98+
99+
(define (get-points result)
100+
  (let ((tags (apply append (map sense-tags (result-senses result)))))
101+
    (apply + (map (lambda (tag)
102+
                    (cond
103+
                      ;; frequency
104+
                      ((eq? tag (get-entity "frq500"))
105+
                       32)
106+
                      ((eq? tag (get-entity "frq1000"))
107+
                       16)
108+
                      ((eq? tag (get-entity "frq2000"))
109+
                       8)
110+
                      ((eq? tag (get-entity "frq5000"))
111+
                       4)
112+
                      ((eq? tag (get-entity "frq10000"))
113+
                       2)
114+
                      ((eq? tag (get-entity "frq20000"))
115+
                       1)
116+
                      ;; archaic term
117+
                      ((eq? tag (get-entity "arch"))
118+
                       -3)
119+
                      ;; children language
120+
                      ((eq? tag (get-entity "chn"))
121+
                       -1)
122+
                      ;; colloquialism
123+
                      ((eq? tag (get-entity "col"))
124+
                       5)
125+
                      ;; expression
126+
                      ((eq? tag (get-entity "exp"))
127+
                       1)
128+
                      ;; sonkeigo (honorific or respectful)
129+
                      ((eq? tag (get-entity "hon"))
130+
                       -1)
131+
                      ;; kenjougo (humble)
132+
                      ((eq? tag (get-entity "hum"))
133+
                       -1)
134+
                      ;; teineigo (polite)
135+
                      ((eq? tag (get-entity "pol"))
136+
                       -1)
137+
                      ;; irregular kanji
138+
                      ((eq? tag (get-entity "iK"))
139+
                       -2)
140+
                      ;; idiomatic expression
141+
                      ((eq? tag (get-entity "id"))
142+
                       3)
143+
                      ;; slang
144+
                      ((member tag (list
145+
                                     (get-entity "sl")
146+
                                     (get-entity "m-sl")
147+
                                     (get-entity "male-sl")))
148+
                       -4)
149+
                      ;; outdated kanji
150+
                      ((eq? tag (get-entity "oK"))
151+
                       -2)
152+
                      ;; obsolete term
153+
                      ((eq? tag (get-entity "obs"))
154+
                       -5)
155+
                      ;; obscure term
156+
                      ((eq? tag (get-entity "obsc"))
157+
                       -10)
158+
                      ;; onomatopea
159+
                      ((eq? tag (get-entity "on-mim"))
160+
                       1)
161+
                      ;; poetical term
162+
                      ((eq? tag (get-entity "poet"))
163+
                       -1)
164+
                      ((eq? tag (get-entity "proverb"))
165+
                       1)
166+
                      ((eq? tag (get-entity "rare"))
167+
                       -3)
168+
                      ;; sensitive
169+
                      ((eq? tag (get-entity "sens"))
170+
                       -2)
171+
                      ;; dialect
172+
                      ((member tag (list
173+
                                     (get-entity "kyb")
174+
                                     (get-entity "osb")
175+
                                     (get-entity "ksb")
176+
                                     (get-entity "ktb")
177+
                                     (get-entity "tsb")
178+
                                     (get-entity "thb")
179+
                                     (get-entity "tsug")
180+
                                     (get-entity "kyu")
181+
                                     (get-entity "rkb")
182+
                                     (get-entity "nab")
183+
                                     (get-entity "hob")))
184+
                       -1)
185+
                      (else 0)))
186+
                  tags))))
85187
      
86-
(define (sxml->results sxml)
87-
  (foldt
88-
    (lambda (xml)
89-
      (match xml
90-
       (('ent_seq _) "")
91-
       (('ke_pri _) "")
92-
       (('re_pri _) "")
93-
       (('pri _) "")
94-
       (('keb kanji) `(kanji ,kanji))
95-
       (('ke_inf _) "")
96-
       (('k_ele lst ...) (car (filter list? lst)))
97-
       (('reb reading) `(reading ,reading))
98-
       (('re_nokanji _ ...) "")
99-
       (('re_restr r) `(limit ,r))
100-
       (('re_inf r) `(info ,r))
101-
       (('r_ele lst ...) (sxml->reading lst))
102-
       (('ant pos) `(ref ,pos))
103-
       (('dial pos) `(tag ,(get-entity pos)))
104-
       (('field pos) `(tag ,(get-entity pos)))
105-
       (('misc pos) `(tag ,(get-entity pos)))
106-
       (('pos pos) `(tag ,(get-entity pos)))
107-
       (('xref pos) `(ref ,pos))
108-
       (('g_type _ ...) "")
109-
       (('gloss (? string? g)) xml)
110-
       (('gloss (? list? g)) g)
111-
       (('gloss attr g) (cons `(gloss ,g) attr))
112-
       (('stagk k) `(limit ,k))
113-
       (('stagr r) `(limit ,r))
114-
       (('s_inf r) `(info ,r))
115-
       (('lsource lst ...) (sxml->source lst))
116-
       (('sense lst ...) (sxml->sense lst))
117-
       (('entry lst ...) (sxml->result lst))
118-
       (('JMdict lst ...) lst)
119-
       (('xml:lang l) `(lang ,l))
120-
       (('ls_wasei _) '(wasei #t))
121-
       (('ls_type t) `(type ,t))
122-
       (('@ lst ...) lst)
123-
       (('*TOP* _ l) (filter result? l))
124-
       (('*PI* _ ...) #f)))
125-
    (lambda (a) a)
126-
    sxml))
188+
(define (sxml->results sxml frq)
189+
  (sort
190+
    (foldt
191+
      (lambda (xml)
192+
        (match xml
193+
         (('ent_seq _) "")
194+
         (('ke_pri _) "")
195+
         (('re_pri _) "")
196+
         (('pri _) "")
197+
         (('keb kanji) `(kanji ,kanji))
198+
         (('ke_inf _) "")
199+
         (('k_ele lst ...) (car (filter list? lst)))
200+
         (('reb reading) `(reading ,reading))
201+
         (('re_nokanji _ ...) "")
202+
         (('re_restr r) `(limit ,r))
203+
         (('re_inf r) `(info ,r))
204+
         (('r_ele lst ...) (sxml->reading lst))
205+
         (('ant pos) `(ref ,pos))
206+
         (('dial pos) `(tag ,(get-entity pos)))
207+
         (('field pos) `(tag ,(get-entity pos)))
208+
         (('misc pos) `(tag ,(get-entity pos)))
209+
         (('pos pos) `(tag ,(get-entity pos)))
210+
         (('xref pos) `(ref ,pos))
211+
         (('g_type _ ...) "")
212+
         (('gloss (? string? g)) xml)
213+
         (('gloss (? list? g)) g)
214+
         (('gloss attr g) (cons `(gloss ,g) attr))
215+
         (('stagk k) `(limit ,k))
216+
         (('stagr r) `(limit ,r))
217+
         (('s_inf r) `(info ,r))
218+
         (('lsource lst ...) (sxml->source lst))
219+
         (('sense lst ...) (sxml->sense lst))
220+
         (('entry lst ...) (sxml->result lst frq))
221+
         (('JMdict lst ...) lst)
222+
         (('xml:lang l) `(lang ,l))
223+
         (('ls_wasei _) '(wasei #t))
224+
         (('ls_type t) `(type ,t))
225+
         (('@ lst ...) lst)
226+
         (('*TOP* _ l) (filter result? l))
227+
         (('*PI* _ ...) #f)))
228+
      (lambda (a) a)
229+
      sxml)
230+
    (lambda (a b) (< (result-points a) (result-points b)))))

modules/nani/result.scm

2222
            result?
2323
            result-position
2424
            result-position-set!
25+
            result-points
2526
            result-kanjis
2627
            result-readings
2728
            result-senses

5556
            update-source))
5657
5758
(define-record-type result
58-
  (make-result position kanjis readings senses)
59+
  (make-result position points kanjis readings senses)
5960
  result?
6061
  (position result-position result-position-set!) ; integer
62+
  (points result-points) ; integer
6163
  (kanjis result-kanjis) ; string-list
6264
  (readings result-readings) ; reanding-list
6365
  (senses result-senses)) ; sense-list

8991
  (lang source-lang)) ; string
9092
9193
(define* (update-result result
92-
           #:key (kanjis (result-kanjis result))
94+
           #:key (points (result-points result))
95+
                 (kanjis (result-kanjis result))
9396
                 (readings (result-readings result))
9497
                 (senses (result-senses result)))
95-
  (make-result (result-position result) kanjis readings senses))
98+
  (make-result (result-position result) points kanjis readings senses))
9699
97100
(define* (update-reading reading
98101
           #:key (kanjis (reading-kanjis reading))

tools/jmdict.scm

1+
;;; Nani Project website
2+
;;; Copyright ?? 2019 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
119
(use-modules (nani jmdict trie))
220
(use-modules (nani jmdict serialize))
321
(use-modules (nani jmdict xml))
22+
(use-modules (nani frequency))
423
(use-modules (nani trie))
524
(use-modules (nani result))
625
(use-modules (ice-9 match))

1332
        (write sxml port)))))
1433
1534
(define (compile input sense-filter output)
16-
  (let* ((sxml (if (equal? (substring input (- (string-length input) 3)) "xml")
35+
  (let* ((frq (load-frequency "dictionaries/frequency.tsv"))
36+
         (sxml (if (equal? (substring input (- (string-length input) 4)) ".xml")
1737
                 (load-dic input)
18-
                 (read input)))
19-
         (results (sxml->results sxml))
38+
                 (call-with-input-file input read)))
39+
         (results (sxml->results sxml frq))
2040
         (results (map (lambda (result)
2141
                         (update-result
2242
                           result