sort jmdict by relevance

Julien Lepiller — Sat Apr 20 13:34:51 +0200 2019

a9fec24

sort jmdict by relevance

Makefile

4444
	wget --no-check-certificate \
4545
        https://namakajiri.net/data/wikipedia-20150422-lemmas.tsv -O $@.tmp
4646
	sed -i 's| ||g' $@.tmp
47-
	mv $@.tmp $@
47+
	head -n20000 $@.tmp > $@
48+
	rm $@.tmp
4849
4950
dictionaries/%.sxml: dictionaries/%.xml tools/jmdict.scm
5051
	guile -L modules tools/jmdict.scm convert $< nolang $@

modules/nani/frequency.scm

2424
(define (load-frequency file)
2525
  (call-with-input-file file
2626
    (lambda (port)
27-
      (let loop ((frq '()))
27+
      (let loop ((frq '()) (i 1))
2828
        (let* ((line (%read-line port))
2929
               (line (car line)))
3030
          (if (eof-object? line)
3131
            frq
3232
            (let ((content (string-split line #\tab)))
33-
              (loop (cons (cons (cadr (cdr content))
34-
                                (string->number (car content)))
35-
                          frq)))))))))
33+
              (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1)))))))))
3634
3735
(define (frequency-entity frq word)
3836
  (let ((freq (assoc-ref frq word)))

modules/nani/huffman.scm

133133
  (define (serialize huffman)
134134
    (match huffman
135135
      ((((h1 h2) . weight))
136-
       (append '(1) (serialize h1) (serialize h2) '(2)))
136+
       (append '(1) (serialize h1) (serialize h2)))
137137
      (((h1 h2) . weight)
138-
       (append '(1) (serialize h1) (serialize h2) '(2)))
138+
       (append '(1) (serialize h1) (serialize h2)))
139139
      ((((? char? char) . weight))
140140
       (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0)))
141141
      (((? char? char) . weight)

modules/nani/jmdict/serialize.scm

123123
  (define (serialize-huffman-string huffman-code)
124124
    (lambda (str pos bv)
125125
      (let ((sbv (huffman-encode huffman-code str)))
126-
        (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv))
126+
        (bytevector-copy! sbv 0 bv pos (bytevector-length sbv))
127127
        (+ pos (bytevector-length sbv)))))
128128
  (define (huffman-string-size huffman-code)
129129
    (lambda (str)

modules/nani/jmdict/xml.scm

227227
         (('*PI* _ ...) #f)))
228228
      (lambda (a) a)
229229
      sxml)
230-
    (lambda (a b) (< (result-points a) (result-points b)))))
230+
    ;; reverse order: bigger score first
231+
    (lambda (a b) (> (result-points a) (result-points b)))))

tools/jmdict.scm

3131
      (lambda (port)
3232
        (write sxml port)))))
3333
34-
(define (compile input sense-filter output)
35-
  (let* ((frq (load-frequency "dictionaries/frequency.tsv"))
36-
         (sxml (if (equal? (substring input (- (string-length input) 4)) ".xml")
34+
;; Break these steps to try and let the GC reclaim these big objects
35+
(define (get-results1 input frq)
36+
  (let ((sxml (if (equal? (substring input (- (string-length input) 4)) ".xml")
3737
                 (load-dic input)
38-
                 (call-with-input-file input read)))
39-
         (results (sxml->results sxml frq))
38+
                 (call-with-input-file input read))))
39+
    (format #t "Read xml~%")
40+
    (sxml->results sxml frq)))
41+
42+
(define (get-results input sense-filter frq)
43+
  (let* ((results (get-results1 input frq))
4044
         (results (map (lambda (result)
4145
                         (update-result
4246
                           result

4448
                                            (result-senses result))))
4549
                       results))
4650
         (results (filter (lambda (result) (not (null? (result-senses result))))
47-
                          results))
51+
                          results)))
52+
    results))
53+
54+
(define (compile input sense-filter output)
55+
  (let* ((results (get-results input sense-filter
56+
                               (load-frequency "dictionaries/frequency.tsv")))
4857
         (kanji-trie (compress-trie (make-kanji-trie results)))
4958
         (reading-trie (compress-trie (make-reading-trie results)))
5059
         (meaning-trie (compress-trie (make-meaning-trie results))))