nani/website/tools/tatoeba.scm

tatoeba.scm

1
;;; Nani Project website
2
;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu>
3
;;;
4
;;; This file is part of the Nani Project website.
5
;;;
6
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7
;;; under the terms of the GNU Affero General Public License as published by
8
;;; the Free Software Foundation; either version 3 of the License, or (at
9
;;; your option) any later version.
10
;;;
11
;;; The Nani Project website is distributed in the hope that it will be useful, but
12
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
;;; GNU Affero General Public License for more details.
15
;;;
16
;;; You should have received a copy of the GNU Affero General Public License
17
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18
19
(use-modules (srfi srfi-9))
20
(use-modules (srfi srfi-11))
21
(use-modules (ice-9 match))
22
(use-modules (ice-9 binary-ports))
23
(use-modules (ice-9 textual-ports))
24
(use-modules (nani sentence sentence))
25
26
(define (tatoeba-file file)
27
  (string-append (dirname (current-filename)) "/../dictionaries/tatoeba_" file))
28
29
(define (get-sentences lang native-users)
30
  (call-with-input-file (tatoeba-file "sentences_detailed.csv")
31
    (lambda (port)
32
      (let loop ((jpn '()) (trans '()) (line (get-line port)))
33
        (if (eof-object? line)
34
          (values (reverse jpn) (reverse trans))
35
          (match (string-split line #\tab)
36
            ((id slang text user _ _)
37
             (cond
38
               ((equal? lang slang)
39
                (loop jpn (cons (cons (string->number id) text) trans)
40
                      (get-line port)))
41
               ((and (equal? slang "jpn") (member user native-users))
42
                (loop (cons (cons (string->number id) text) jpn) trans (get-line port)))
43
               (else
44
                 (loop jpn trans (get-line port)))))))))))
45
46
(define (get-native-jpn-users)
47
  (call-with-input-file (tatoeba-file "user_languages.csv")
48
    (lambda (port)
49
      (let loop ((jpn '()) (line (get-line port)))
50
        (if (eof-object? line)
51
          jpn
52
          (match (string-split line #\tab)
53
            ((ulang level user _)
54
             ;; consume the comment if it's on multiple lines
55
             (let loop2 ((line line))
56
               (when (string-suffix? "\\" line)
57
                 (loop2 (get-line port))))
58
             ;; Add native japanese user
59
             (if (and (equal? ulang "jpn") (equal? level "5"))
60
               (loop (cons user jpn) (get-line port))
61
               (loop jpn (get-line port))))))))))
62
63
(define (get-translations jpn trans)
64
  (define jpn-len (vector-length jpn))
65
  (define trans-len (vector-length trans))
66
  (define (member-vect elem vect len)
67
    (let loop ((min 0) (max (- len 1)))
68
      (if (> min max)
69
        #f
70
        (let* ((mid-pos (floor (+ min (/ (- max min) 2))))
71
               (mid-elem (vector-ref vect mid-pos)))
72
          (cond
73
            ((equal? mid-elem elem) #t)
74
            ((> mid-elem elem) (loop min (- mid-pos 1)))
75
            ((< mid-elem elem) (loop (+ mid-pos 1) max)))))))
76
77
  (call-with-input-file (tatoeba-file "sentences_base.csv")
78
    (lambda (port)
79
      (let loop ((translations '()) (line (get-line port)))
80
        (if (or (eof-object? line) (null? trans))
81
          translations
82
          (match (string-split line #\tab)
83
            ((id translation)
84
             (cond
85
               ((and (string->number id)
86
                     (member-vect (string->number id) trans trans-len)
87
                     (string->number translation)
88
                     (member-vect (string->number translation) jpn jpn-len))
89
                (loop (cons (cons (string->number translation) (string->number id)) translations)
90
                      (get-line port)))
91
               ((and (string->number id)
92
                     (member-vect (string->number id) jpn jpn-len)
93
                     (string->number translation)
94
                     (member-vect (string->number translation) trans trans-len))
95
                (loop (cons (cons (string->number id) (string->number translation)) translations)
96
                      (get-line port)))
97
               (else
98
                 (loop translations (get-line port)))))))))))
99
100
(define (add-tags translations trans jpn)
101
  (define tags
102
    (call-with-input-file (tatoeba-file "tags.csv")
103
      (lambda (port)
104
        (let loop ((tags '()) (line (get-line port)))
105
          (if (eof-object? line)
106
            tags
107
            (match (string-split line #\tab)
108
              ((id tag)
109
               (assoc-set! tags (string->number id) (cons tag (or (assoc-ref tags (string->number id)) '()))))))))))
110
  (map
111
    (match-lambda
112
      ((jpn-id . trans-id)
113
       (let ((tags (or (assoc-ref tags jpn-id) '()))
114
             (trans (assoc-ref trans trans-id))
115
             (jpn (assoc-ref jpn jpn-id)))
116
         (make-sentence jpn trans tags #f))))
117
    translations))
118
119
(define (get-tatoeba-sentences lang)
120
  (define native-users (get-native-jpn-users))
121
  (let-values (((jpn trans) (get-sentences lang native-users)))
122
    (format #t "jpn: ~a sentences~%" (length jpn))
123
    (format #t "~a: ~a sentences~%" lang (length trans))
124
    (let ((translations
125
            (get-translations
126
              (list->vector (sort (map car jpn) <))
127
              (list->vector (sort (map car trans) <)))))
128
      (format #t "~a pairs~%" (length translations))
129
      (add-tags translations trans jpn))))
130
131
(match (command-line)
132
  ((_ lang output)
133
   (let ((sentences (get-tatoeba-sentences lang)))
134
     (format #t "Number of entries in ~a: ~a~%" output (length sentences))
135
     (call-with-output-file output
136
       (lambda (port)
137
         (put-bytevector port
138
           (serialize-sentence-dictionary sentences)))))))
139