tatoeba.scm
1 | ;;; Nani Project website |
2 | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> |
3 | ;;; |
4 | ;;; This file is part of the Nani Project website. |
5 | ;;; |
6 | ;;; The Nani Project website is free software; you can redistribute it and/or modify it |
7 | ;;; under the terms of the GNU Affero General Public License as published by |
8 | ;;; the Free Software Foundation; either version 3 of the License, or (at |
9 | ;;; your option) any later version. |
10 | ;;; |
11 | ;;; The Nani Project website is distributed in the hope that it will be useful, but |
12 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | ;;; GNU Affero General Public License for more details. |
15 | ;;; |
16 | ;;; You should have received a copy of the GNU Affero General Public License |
17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. |
18 | |
19 | (use-modules (srfi srfi-9)) |
20 | (use-modules (srfi srfi-11)) |
21 | (use-modules (ice-9 match)) |
22 | (use-modules (ice-9 binary-ports)) |
23 | (use-modules (ice-9 textual-ports)) |
24 | (use-modules (nani sentence sentence)) |
25 | |
26 | (define (tatoeba-file file) |
27 | (string-append (dirname (current-filename)) "/../dictionaries/tatoeba_" file)) |
28 | |
29 | (define (get-sentences lang native-users) |
30 | (call-with-input-file (tatoeba-file "sentences_detailed.csv") |
31 | (lambda (port) |
32 | (let loop ((jpn '()) (trans '()) (line (get-line port))) |
33 | (if (eof-object? line) |
34 | (values (reverse jpn) (reverse trans)) |
35 | (match (string-split line #\tab) |
36 | ((id slang text user _ _) |
37 | (cond |
38 | ((equal? lang slang) |
39 | (loop jpn (cons (cons (string->number id) text) trans) |
40 | (get-line port))) |
41 | ((and (equal? slang "jpn") (member user native-users)) |
42 | (loop (cons (cons (string->number id) text) jpn) trans (get-line port))) |
43 | (else |
44 | (loop jpn trans (get-line port))))))))))) |
45 | |
46 | (define (get-native-jpn-users) |
47 | (call-with-input-file (tatoeba-file "user_languages.csv") |
48 | (lambda (port) |
49 | (let loop ((jpn '()) (line (get-line port))) |
50 | (if (eof-object? line) |
51 | jpn |
52 | (match (string-split line #\tab) |
53 | ((ulang level user _) |
54 | ;; consume the comment if it's on multiple lines |
55 | (let loop2 ((line line)) |
56 | (when (string-suffix? "\\" line) |
57 | (loop2 (get-line port)))) |
58 | ;; Add native japanese user |
59 | (if (and (equal? ulang "jpn") (equal? level "5")) |
60 | (loop (cons user jpn) (get-line port)) |
61 | (loop jpn (get-line port)))))))))) |
62 | |
63 | (define (get-translations jpn trans) |
64 | (define jpn-len (vector-length jpn)) |
65 | (define trans-len (vector-length trans)) |
66 | (define (member-vect elem vect len) |
67 | (let loop ((min 0) (max (- len 1))) |
68 | (if (> min max) |
69 | #f |
70 | (let* ((mid-pos (floor (+ min (/ (- max min) 2)))) |
71 | (mid-elem (vector-ref vect mid-pos))) |
72 | (cond |
73 | ((equal? mid-elem elem) #t) |
74 | ((> mid-elem elem) (loop min (- mid-pos 1))) |
75 | ((< mid-elem elem) (loop (+ mid-pos 1) max))))))) |
76 | |
77 | (call-with-input-file (tatoeba-file "sentences_base.csv") |
78 | (lambda (port) |
79 | (let loop ((translations '()) (line (get-line port))) |
80 | (if (or (eof-object? line) (null? trans)) |
81 | translations |
82 | (match (string-split line #\tab) |
83 | ((id translation) |
84 | (cond |
85 | ((and (string->number id) |
86 | (member-vect (string->number id) trans trans-len) |
87 | (string->number translation) |
88 | (member-vect (string->number translation) jpn jpn-len)) |
89 | (loop (cons (cons (string->number translation) (string->number id)) translations) |
90 | (get-line port))) |
91 | ((and (string->number id) |
92 | (member-vect (string->number id) jpn jpn-len) |
93 | (string->number translation) |
94 | (member-vect (string->number translation) trans trans-len)) |
95 | (loop (cons (cons (string->number id) (string->number translation)) translations) |
96 | (get-line port))) |
97 | (else |
98 | (loop translations (get-line port))))))))))) |
99 | |
100 | (define (add-tags translations trans jpn) |
101 | (define tags |
102 | (call-with-input-file (tatoeba-file "tags.csv") |
103 | (lambda (port) |
104 | (let loop ((tags '()) (line (get-line port))) |
105 | (if (eof-object? line) |
106 | tags |
107 | (match (string-split line #\tab) |
108 | ((id tag) |
109 | (assoc-set! tags (string->number id) (cons tag (or (assoc-ref tags (string->number id)) '())))))))))) |
110 | (map |
111 | (match-lambda |
112 | ((jpn-id . trans-id) |
113 | (let ((tags (or (assoc-ref tags jpn-id) '())) |
114 | (trans (assoc-ref trans trans-id)) |
115 | (jpn (assoc-ref jpn jpn-id))) |
116 | (make-sentence jpn trans tags #f)))) |
117 | translations)) |
118 | |
119 | (define (get-tatoeba-sentences lang) |
120 | (define native-users (get-native-jpn-users)) |
121 | (let-values (((jpn trans) (get-sentences lang native-users))) |
122 | (format #t "jpn: ~a sentences~%" (length jpn)) |
123 | (format #t "~a: ~a sentences~%" lang (length trans)) |
124 | (let ((translations |
125 | (get-translations |
126 | (list->vector (sort (map car jpn) <)) |
127 | (list->vector (sort (map car trans) <))))) |
128 | (format #t "~a pairs~%" (length translations)) |
129 | (add-tags translations trans jpn)))) |
130 | |
131 | (match (command-line) |
132 | ((_ lang output) |
133 | (let ((sentences (get-tatoeba-sentences lang))) |
134 | (format #t "Number of entries in ~a: ~a~%" output (length sentences)) |
135 | (call-with-output-file output |
136 | (lambda (port) |
137 | (put-bytevector port |
138 | (serialize-sentence-dictionary sentences))))))) |
139 |