sort jmdict by relevance
Makefile
44 | 44 | wget --no-check-certificate \ | |
45 | 45 | https://namakajiri.net/data/wikipedia-20150422-lemmas.tsv -O $@.tmp | |
46 | 46 | sed -i 's| ||g' $@.tmp | |
47 | - | mv $@.tmp $@ | |
47 | + | head -n20000 $@.tmp > $@ | |
48 | + | rm $@.tmp | |
48 | 49 | ||
49 | 50 | dictionaries/%.sxml: dictionaries/%.xml tools/jmdict.scm | |
50 | 51 | guile -L modules tools/jmdict.scm convert $< nolang $@ |
modules/nani/frequency.scm
24 | 24 | (define (load-frequency file) | |
25 | 25 | (call-with-input-file file | |
26 | 26 | (lambda (port) | |
27 | - | (let loop ((frq '())) | |
27 | + | (let loop ((frq '()) (i 1)) | |
28 | 28 | (let* ((line (%read-line port)) | |
29 | 29 | (line (car line))) | |
30 | 30 | (if (eof-object? line) | |
31 | 31 | frq | |
32 | 32 | (let ((content (string-split line #\tab))) | |
33 | - | (loop (cons (cons (cadr (cdr content)) | |
34 | - | (string->number (car content))) | |
35 | - | frq))))))))) | |
33 | + | (loop (cons (cons (cadr (cdr content)) i) frq) (+ i 1))))))))) | |
36 | 34 | ||
37 | 35 | (define (frequency-entity frq word) | |
38 | 36 | (let ((freq (assoc-ref frq word))) |
modules/nani/huffman.scm
133 | 133 | (define (serialize huffman) | |
134 | 134 | (match huffman | |
135 | 135 | ((((h1 h2) . weight)) | |
136 | - | (append '(1) (serialize h1) (serialize h2) '(2))) | |
136 | + | (append '(1) (serialize h1) (serialize h2))) | |
137 | 137 | (((h1 h2) . weight) | |
138 | - | (append '(1) (serialize h1) (serialize h2) '(2))) | |
138 | + | (append '(1) (serialize h1) (serialize h2))) | |
139 | 139 | ((((? char? char) . weight)) | |
140 | 140 | (append (bytevector->u8-list (string->utf8 (list->string (list char)))) '(0))) | |
141 | 141 | (((? char? char) . weight) |
modules/nani/jmdict/serialize.scm
123 | 123 | (define (serialize-huffman-string huffman-code) | |
124 | 124 | (lambda (str pos bv) | |
125 | 125 | (let ((sbv (huffman-encode huffman-code str))) | |
126 | - | (bytevector-copy! sbv 0 bv (+ pos 4) (bytevector-length sbv)) | |
126 | + | (bytevector-copy! sbv 0 bv pos (bytevector-length sbv)) | |
127 | 127 | (+ pos (bytevector-length sbv))))) | |
128 | 128 | (define (huffman-string-size huffman-code) | |
129 | 129 | (lambda (str) |
modules/nani/jmdict/xml.scm
227 | 227 | (('*PI* _ ...) #f))) | |
228 | 228 | (lambda (a) a) | |
229 | 229 | sxml) | |
230 | - | (lambda (a b) (< (result-points a) (result-points b))))) | |
230 | + | ;; reverse order: bigger score first | |
231 | + | (lambda (a b) (> (result-points a) (result-points b))))) |
tools/jmdict.scm
31 | 31 | (lambda (port) | |
32 | 32 | (write sxml port))))) | |
33 | 33 | ||
34 | - | (define (compile input sense-filter output) | |
35 | - | (let* ((frq (load-frequency "dictionaries/frequency.tsv")) | |
36 | - | (sxml (if (equal? (substring input (- (string-length input) 4)) ".xml") | |
35 | + | ;; Split these steps into separate procedures to try to let the GC reclaim these big objects | |
35 | + | (define (get-results1 input frq) | |
36 | + | (let ((sxml (if (equal? (substring input (- (string-length input) 4)) ".xml") | |
37 | 37 | (load-dic input) | |
38 | - | (call-with-input-file input read))) | |
39 | - | (results (sxml->results sxml frq)) | |
38 | + | (call-with-input-file input read)))) | |
39 | + | (format #t "Read xml~%") | |
40 | + | (sxml->results sxml frq))) | |
41 | + | ||
42 | + | (define (get-results input sense-filter frq) | |
43 | + | (let* ((results (get-results1 input frq)) | |
40 | 44 | (results (map (lambda (result) | |
41 | 45 | (update-result | |
42 | 46 | result | |
… | |||
44 | 48 | (result-senses result)))) | |
45 | 49 | results)) | |
46 | 50 | (results (filter (lambda (result) (not (null? (result-senses result)))) | |
47 | - | results)) | |
51 | + | results))) | |
52 | + | results)) | |
53 | + | ||
54 | + | (define (compile input sense-filter output) | |
55 | + | (let* ((results (get-results input sense-filter | |
56 | + | (load-frequency "dictionaries/frequency.tsv"))) | |
48 | 57 | (kanji-trie (compress-trie (make-kanji-trie results))) | |
49 | 58 | (reading-trie (compress-trie (make-reading-trie results))) | |
50 | 59 | (meaning-trie (compress-trie (make-meaning-trie results)))) |