nani/website/tools/jmdict.scm

jmdict.scm

1
;;; Nani Project website
2
;;; Copyright © 2019 Julien Lepiller <julien@lepiller.eu>
3
;;;
4
;;; This file is part of the Nani Project website.
5
;;;
6
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7
;;; under the terms of the GNU Affero General Public License as published by
8
;;; the Free Software Foundation; either version 3 of the License, or (at
9
;;; your option) any later version.
10
;;;
11
;;; The Nani Project website is distributed in the hope that it will be useful, but
12
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
;;; GNU Affero General Public License for more details.
15
;;;
16
;;; You should have received a copy of the GNU Affero General Public License
17
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18
19
(use-modules (nani jmdict trie))
20
(use-modules (nani jmdict serialize))
21
(use-modules (nani jmdict xml))
22
(use-modules (nani frequency))
23
(use-modules (nani trie))
24
(use-modules (nani result))
25
(use-modules (ice-9 match))
26
(use-modules (ice-9 binary-ports))
27
28
;; Load the dictionary file INPUT with `load-dic' and store the value it
;; returns in the file OUTPUT using `write', so that it can later be read
;; back with `read'.
(define (convert input output)
  ;; The dictionary is loaded in full before OUTPUT is opened.
  (define sxml (load-dic input))
  (call-with-output-file output
    (lambda (out)
      (write sxml out))))
33
34
;; These steps are split into separate procedures so that the GC can reclaim
;; the large intermediate objects.
35
;; Load the dictionary data stored in the file INPUT and pass it, together
;; with FRQ, to `sxml->results', whose value is returned.
;;
;; When the name of INPUT ends in ".xml" the file is loaded with `load-dic';
;; otherwise a single datum is read from it with `read' (the format written
;; by `convert').
(define (get-results1 input frq)
  ;; `string-suffix?' is used instead of taking the last four characters with
  ;; `substring', which raised an out-of-range error for names shorter than
  ;; four characters.
  (let ((sxml (if (string-suffix? ".xml" input)
                  (load-dic input)
                  (call-with-input-file input read))))
    (format #t "Read xml~%")
    (sxml->results sxml frq)))
41
42
;; Return the results obtained from INPUT and FRQ (see `get-results1'),
;; keeping in each result only the senses accepted by the predicate
;; SENSE-FILTER.  Results that have no sense left are dropped.
(define (get-results input sense-filter frq)
  ;; Copy of ENTRY whose senses are restricted to those SENSE-FILTER accepts.
  (define (restrict-senses entry)
    (update-result
      entry
      #:senses (filter sense-filter (result-senses entry))))
  ;; True when ENTRY still has at least one sense.
  (define (has-senses? entry)
    (not (null? (result-senses entry))))
  (filter has-senses?
          (map restrict-senses (get-results1 input frq))))
53
54
;; Build the serialized dictionary file OUTPUT from the dictionary INPUT.
;;
;; The results are obtained with `get-results', using SENSE-FILTER to select
;; senses and the frequency data loaded from "dictionaries/frequency.tsv"
;; (a path relative to the current working directory).  Three tries are built
;; from the results (kanji, reading and meaning), each one compressed with
;; `compress-trie', and everything is passed to `serialize-jmdict'.  The
;; bytevector it returns is written to OUTPUT.  The number of results is
;; printed on the current output port.
(define (compile input sense-filter output)
  (let* ((results (get-results input sense-filter
                               (load-frequency "dictionaries/frequency.tsv")))
         (kanji-trie (compress-trie (make-kanji-trie results)))
         (reading-trie (compress-trie (make-reading-trie results)))
         (meaning-trie (compress-trie (make-meaning-trie results))))
    (format #t "Number of entries in ~a: ~a~%" output (length results))
    ;; The payload is raw bytes, so the port is opened in binary mode rather
    ;; than as a text port with the default encoding.
    (call-with-output-file output
      (lambda (port)
        (put-bytevector port
          (serialize-jmdict results kanji-trie reading-trie meaning-trie)))
      #:binary #t)))
65
66
;; Handler for the "print" command.  This is a stub: WORD and DICT are
;; accepted but ignored, and the value is always #t.
(define print
  (lambda (word dict)
    #t))
68
69
;; Command-line dispatch.  Supported invocations:
;;   build INPUT LANG OUTPUT    build OUTPUT from INPUT; with LANG "e" every
;;                              sense is kept, otherwise only the senses
;;                              whose language is LANG
;;   convert INPUT LANG OUTPUT  convert INPUT to OUTPUT (LANG is ignored)
;;   print WORD INPUT           call `print' with WORD and INPUT
;; Any other command given with three arguments is reported as unknown.
(match (command-line)
  ((_ "build" input "e" output)
   (compile input (const #t) output))
  ((_ "build" input lang output)
   (compile input
            (lambda (sense) (equal? (sense-language sense) lang))
            output))
  ((_ "convert" input _ output)
   (convert input output))
  ((_ cmd _ _ _)
   (format #t "Unknown cmd ~a.~%" cmd))
  ((_ "print" word input)
   (print word input)))
81