Add tatoeba

Julien LepillerSat Jul 09 21:45:24+0200 2022

3138fbd

Add tatoeba

Makefile

1414
include kanjidic.mk
1515
include kanjivg.mk
1616
include radicals.mk
17+
include tatoeba.mk
1718
include wadoku.mk
1819
1920
# Files that constitute the website

5455
	touch site
5556
5657
download:
57-
	@rm -f dictionaries/*
58+
	@rm -rf dictionaries/*
5859
	@$(MAKE) $(DOWNLOADS)
5960
6061
po/%/LC_MESSAGES/nani.mo: po/%.po

manifest.scm

66
77
    ; for all
88
    "gettext"))
9+

modules/nani/sentence/sentence.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(define-module (nani sentence sentence)
20+
  #:use-module (ice-9 binary-ports)
21+
  #:use-module (ice-9 match)
22+
  #:use-module (rnrs bytevectors)
23+
  #:use-module (srfi srfi-9)
24+
  #:use-module (nani encoding serialize)
25+
  #:use-module (nani encoding huffman)
26+
  #:use-module (nani encoding trie)
27+
  #:use-module (mecab mecab)
28+
  #:export (make-sentence
29+
            sentence?
30+
            sentence-jpn
31+
            sentence-trans
32+
            sentence-tags
33+
            sentence-audio
34+
35+
            serialize-sentence
36+
            serialize-sentence-dictionary
37+
            sentence-dictionary-entry-count))
38+
39+
;; An example sentence together with one translation.
;;
;; Fields:
;;   position  where the sentence was written in the output bytevector.  It is
;;             not a constructor argument; `serialize-sentence' sets it.
;;   jpn       the Japanese text.
;;   trans     the translated text.
;;   tags      list of tag strings.
;;   audio     file name of an audio recording, or #f when there is none.
(define-record-type <sentence>
  (make-sentence jpn trans tags audio)
  sentence?
  (position  sentence-position  sentence-position-set!)
  (jpn       sentence-jpn)
  (trans     sentence-trans)
  (tags      sentence-tags)
  (audio     sentence-audio))
47+
48+
;; Write the audio payload of a sentence into BV starting at POS and return
;; the position that follows it.
;;
;; FILENAME is the path of the audio file, or #f when the sentence has no
;; recording.  The payload is the file size (written with `serialize-u16')
;; followed by the raw bytes of the file.  A missing or empty file is written
;; as a size of 0 with no data.
(define (serialize-audio filename pos bv)
  (let* ((size (if filename (stat:size (stat filename)) 0))
         (data-pos (serialize-u16 size pos bv)))
    (cond
      ((not filename) data-pos)
      ;; `get-bytevector-all' returns the EOF object for an empty file, which
      ;; `bytevector-copy!' cannot handle: there is nothing to copy anyway.
      ((zero? size) data-pos)
      (else
        (let ((content (call-with-input-file filename get-bytevector-all
                                             #:binary #t)))
          (bytevector-copy! content 0 bv data-pos size)
          (+ data-pos size))))))
56+
;; Number of bytes `serialize-audio' needs for FILENAME: the encoded size
;; field plus the content of the file.  FILENAME may be #f, in which case the
;; content is counted as 0 bytes.
(define (audio-size filename)
  (define payload-bytes
    (if filename
        (stat:size (stat filename))
        0))
  (+ (u16-size payload-bytes) payload-bytes))
59+
60+
;; Return a serializer for <sentence> records.
;;
;; The returned procedure takes (SENTENCE POS BV), records POS in the
;; sentence (see `sentence-position'), writes the Japanese text, the
;; translation, the tags and the audio payload in that order, and returns the
;; position following the last byte written.  JPN-HUFFMAN encodes the Japanese
;; text; TRANS-HUFFMAN encodes both the translation and the tags.
;; Throws 'not-sentence when given anything else than a <sentence>.
(define (serialize-sentence jpn-huffman trans-huffman)
  (lambda (sentence pos bv)
    (unless (sentence? sentence)
      (throw 'not-sentence sentence))
    (sentence-position-set! sentence pos)
    (let* ((after-jpn
             ((serialize-huffman-string jpn-huffman)
              (sentence-jpn sentence) pos bv))
           (after-trans
             ((serialize-huffman-string trans-huffman)
              (sentence-trans sentence) after-jpn bv))
           (after-tags
             ((serialize-list (serialize-huffman-string trans-huffman))
              (sentence-tags sentence) after-trans bv)))
      (serialize-audio (sentence-audio sentence) after-tags bv))))
72+
;; Return a procedure computing how many bytes `serialize-sentence' writes
;; for a given <sentence> with the same two Huffman codes.
;; Throws 'not-sentence when given anything else than a <sentence>.
(define (sentence-size jpn-huffman trans-huffman)
  (lambda (sentence)
    (unless (sentence? sentence)
      (throw 'not-sentence sentence))
    (let ((jpn-bytes
            ((huffman-string-size jpn-huffman) (sentence-jpn sentence)))
          (trans-bytes
            ((huffman-string-size trans-huffman) (sentence-trans sentence)))
          (tags-bytes
            ((list-size (huffman-string-size trans-huffman))
             (sentence-tags sentence)))
          (audio-bytes
            (audio-size (sentence-audio sentence))))
      (+ jpn-bytes trans-bytes tags-bytes audio-bytes))))
79+
80+
;; Turn the string KEY into a trie key: the list of 4-bit halves (high half
;; first) of every byte of its UTF-8 encoding.
(define (make-key key)
  ;; Walk the bytes from last to first so the result can be built by consing
  ;; onto the front.
  (let loop ((bytes (reverse (bytevector->u8-list (string->utf8 key))))
             (nibbles '()))
    (if (null? bytes)
        nibbles
        (let ((byte (car bytes)))
          (loop (cdr bytes)
                (cons (quotient byte 16)
                      (cons (modulo byte 16) nibbles)))))))
86+
87+
;; Replace, in TRIE and recursively in all of its children, every value (an
;; index into the array SENTENCES) with the position recorded in the
;; corresponding sentence.
(define (update-trie-pos! trie sentences)
  (trie-vals-set!
    trie
    (map (lambda (index)
           (sentence-position (array-ref sentences index)))
         (trie-vals trie)))
  ;; Each transition is a (char . child) pair; only the child matters here.
  (for-each
    (lambda (transition)
      (update-trie-pos! (cdr transition) sentences))
    (trie-transitions trie)))
96+
97+
;; Serialize SENTENCES, a list of <sentence> records, and return the
;; resulting bytevector.
;;
;; Layout of the result, in order:
;;   - the 16-byte header "NANI_SENTENCE001";
;;   - a 4-byte big-endian pointer to the trie;
;;   - the Huffman tree for Japanese text;
;;   - the Huffman tree for translations and tags;
;;   - the sentences, one after the other (no length prefix);
;;   - the number of sentences, written with `serialize-int';
;;   - the trie.
;;
;; As a side effect, the position of every sentence is recorded in its record
;; and some statistics are printed on the current output port.
(define (serialize-sentence-dictionary sentences)
  ;; Huffman tree built from the Japanese text of every sentence.
  (define jpn-huffman
    (let ((jpn (map sentence-jpn sentences)))
      (create-huffman jpn)))
  (define jpn-huffman-code (huffman->code jpn-huffman))

  ;; Translations and tags share one Huffman tree.
  (define trans-huffman
    (let ((trans (map sentence-trans sentences))
          (tags (apply append (map sentence-tags sentences))))
      (create-huffman (append trans tags))))
  (define trans-huffman-code (huffman->code trans-huffman))

  ;; Build the search trie: every word returned by `mecab-words' for a
  ;; sentence becomes a key whose value is the index of that sentence in
  ;; SENTENCES.  Indices are turned into positions later on, once they are
  ;; known.
  (define (make-sentence-trie sentences)
    (let ((trie (make-empty-trie))
          (tagger (mecab-new-tagger '())))
      (let loop ((sentences sentences) (i 0))
        (if (null? sentences)
          (begin
            (mecab-destroy tagger)
            (compress-trie trie))
          (begin
            (for-each
              (lambda (key)
                (add-to-trie! trie (make-key key) i))
              (mecab-words tagger (sentence-jpn (car sentences))))
            (loop (cdr sentences) (+ i 1)))))))

  ;; Count the nodes of TRIE, itself included.
  (define (trie-node-size trie)
    (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))

  (let* ((header (string->utf8 "NANI_SENTENCE001"))
         (header-size (bytevector-length header))
         (pointers (make-bytevector 4 0))
         (jpn-huffman-bv (serialize-huffman jpn-huffman))
         (jpn-huffman-size (bytevector-length jpn-huffman-bv))
         (trans-huffman-bv (serialize-huffman trans-huffman))
         (trans-huffman-size (bytevector-length trans-huffman-bv))
         ;; From here on these two names refer to the versions specialized
         ;; for integer values.
         (serialize-trie (serialize-trie serialize-int int-size))
         (trie-size (trie-size int-size))
         (sentence-trie (make-sentence-trie sentences))
         (sentence-trie-size (trie-size sentence-trie))
         (sentences-size
           ((list-size (sentence-size jpn-huffman-code trans-huffman-code)
                       #:size? #f)
            sentences))
         (huffman-size (+ jpn-huffman-size trans-huffman-size))
         (pos-trie (+ header-size 4 jpn-huffman-size trans-huffman-size
                      sentences-size 4))
         (bv (make-bytevector (+ header-size 4 jpn-huffman-size
                                 trans-huffman-size sentences-size 4
                                 sentence-trie-size))))
    (format #t "Number of nodes in trie: ~a~%" (trie-node-size sentence-trie))
    ((serialize-list (serialize-sentence jpn-huffman-code trans-huffman-code)
                     #:size? #f)
     sentences (+ header-size 4 huffman-size) bv)
    ;; Serializing the sentences also recorded the position of each of them,
    ;; so the indices stored in the trie can now be replaced by positions.
    (update-trie-pos! sentence-trie (list->array 1 sentences))
    ;; number of entries
    (serialize-int (length sentences) (+ header-size 4 huffman-size sentences-size)
                   bv)
    (serialize-trie sentence-trie pos-trie bv)
    ;; point to the trie structure
    (bytevector-u32-set!
      pointers 0
      (+ header-size 4 huffman-size sentences-size (int-size 0))
      (endianness big))
    ;; copy to result bytevector
    (bytevector-copy! header 0 bv 0 header-size)
    (bytevector-copy! pointers 0 bv header-size 4)
    (bytevector-copy! jpn-huffman-bv 0 bv (+ header-size 4) jpn-huffman-size)
    (bytevector-copy! trans-huffman-bv 0 bv (+ header-size 4 jpn-huffman-size)
                      trans-huffman-size)
    ;; give some feedback on the size of the file's structures
    (format #t "huffmans are ~a bytes long~%" huffman-size)
    (format #t "sentences are ~a bytes long~%" sentences-size)
    (format #t "trie is ~a bytes long~%" sentence-trie-size)
    bv))
177+
178+
;; Return the number of sentences stored in the sentence dictionary FILE.
;;
;; The count is the 4-byte big-endian integer located just before the offset
;; given by the 4-byte pointer that follows the 16-byte header.
(define (sentence-dictionary-entry-count file)
  (define (read-u32 port)
    (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))
  (call-with-input-file file
    (lambda (port)
      ;; Read the header to move past it; its content is not checked.
      (utf8->string (get-bytevector-n port 16))
      (let ((end-pos (read-u32 port)))
        (seek port (- end-pos 4) SEEK_SET)
        (read-u32 port)))))

po/nani.pot

88
msgstr ""
99
"Project-Id-Version: PACKAGE VERSION\n"
1010
"Report-Msgid-Bugs-To: \n"
11-
"POT-Creation-Date: 2021-07-29 21:50+0200\n"
11+
"POT-Creation-Date: 2022-07-09 21:44+0200\n"
1212
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
1313
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
1414
"Language-Team: LANGUAGE <LL@li.org>\n"

102102
msgid "JMdict"
103103
msgstr ""
104104
105-
#: tools/list.scm:71
105+
#: tools/list.scm:85
106106
msgid ""
107107
"Japanese/Dutch dictionary from the Electronic Dictionary Research and "
108108
"Development Group."
109109
msgstr ""
110110
111-
#: tools/list.scm:70
111+
#: tools/list.scm:84
112112
msgid ""
113113
"Japanese/English dictionary from the Electronic Dictionary Research and "
114114
"Development Group."
115115
msgstr ""
116116
117117
#: tools/list.scm:72
118+
msgid "Japanese/French aligned sentences from the Tatoeba project."
119+
msgstr ""
120+
121+
#: tools/list.scm:86
118122
msgid ""
119123
"Japanese/French dictionary from the Electronic Dictionary Research and "
120124
"Development Group."
121125
msgstr ""
122126
123-
#: tools/list.scm:61
127+
#: tools/list.scm:62
124128
msgid "Japanese/French dictionary from the Jibiki project."
125129
msgstr ""
126130
127-
#: tools/list.scm:45
131+
#: tools/list.scm:46
128132
msgid "Japanese/German dictionary from Wadoku."
129133
msgstr ""
130134
131-
#: tools/list.scm:73
135+
#: tools/list.scm:87
132136
msgid ""
133137
"Japanese/German dictionary from the Electronic Dictionary Research and "
134138
"Development Group."
135139
msgstr ""
136140
137-
#: tools/list.scm:74
141+
#: tools/list.scm:88
138142
msgid ""
139143
"Japanese/Hungarian dictionary from the Electronic Dictionary Research and "
140144
"Development Group."
141145
msgstr ""
142146
143-
#: tools/list.scm:75
147+
#: tools/list.scm:73
148+
msgid "Japanese/Russian aligned sentences from the Tatoeba project."
149+
msgstr ""
150+
151+
#: tools/list.scm:89
144152
msgid ""
145153
"Japanese/Russian dictionary from the Electronic Dictionary Research and "
146154
"Development Group."
147155
msgstr ""
148156
149-
#: tools/list.scm:76
157+
#: tools/list.scm:90
150158
msgid ""
151159
"Japanese/Slovenian dictionary from the Electronic Dictionary Research and "
152160
"Development Group."
153161
msgstr ""
154162
155-
#: tools/list.scm:77
163+
#: tools/list.scm:74
164+
msgid "Japanese/Spanish aligned sentences from the Tatoeba project."
165+
msgstr ""
166+
167+
#: tools/list.scm:91
156168
msgid ""
157169
"Japanese/Spanish dictionary from the Electronic Dictionary Research and "
158170
"Development Group."
159171
msgstr ""
160172
161-
#: tools/list.scm:78
173+
#: tools/list.scm:92
162174
msgid ""
163175
"Japanese/Swedish dictionary from the Electronic Dictionary Research and "
164176
"Development Group."
165177
msgstr ""
166178
167-
#: tools/list.scm:87
179+
#: tools/list.scm:75
180+
msgid "Japanese/Ukrainian aligned sentences from the Tatoeba project."
181+
msgstr ""
182+
183+
#: tools/list.scm:71
184+
msgid "Japanese/english aligned sentences from the Tatoeba project."
185+
msgstr ""
186+
187+
#: tools/list.scm:101
168188
msgid "Kanji dictionary with English meanings."
169189
msgstr ""
170190
171-
#: tools/list.scm:89
191+
#: tools/list.scm:103
172192
msgid "Kanji dictionary with French meanings."
173193
msgstr ""
174194
175-
#: tools/list.scm:90
195+
#: tools/list.scm:104
176196
msgid "Kanji dictionary with Portuguese meanings."
177197
msgstr ""
178198
179-
#: tools/list.scm:88
199+
#: tools/list.scm:102
180200
msgid "Kanji dictionary with Spanish meanings."
181201
msgstr ""
182202
183-
#: tools/list.scm:39
203+
#: tools/list.scm:40
184204
msgid "Kanji writing visual help by the Kanjivg project."
185205
msgstr ""
186206

233253
msgid "Phone: "
234254
msgstr ""
235255
236-
#: tools/list.scm:53
256+
#: tools/list.scm:54
237257
msgid "Pitch accent dictionary from Wadoku."
238258
msgstr ""
239259
240-
#: tools/list.scm:32
260+
#: tools/list.scm:33
241261
msgid ""
242262
"Radical to Kanji dictionary from the Electronic Dictionary Research and "
243263
"Development Group."

281301
msgid "Tap on the search button and you'll see the results. Easy, right?"
282302
msgstr ""
283303
304+
#: tools/list.scm:77
305+
msgid ""
306+
"Tatoeba is a collection of sentences and translations. This\n"
307+
"        dictionary contains pairs of sentences that are direct translations "
308+
"of\n"
309+
"        one another, which allows you to see example sentences in search\n"
310+
"        results."
311+
msgstr ""
312+
284313
#: pages/e404.scm:27
285314
msgid "That's a 404 :/"
286315
msgstr ""

335364
"In the following sections we will see how to use them."
336365
msgstr ""
337366
338-
#: tools/list.scm:55
367+
#: tools/list.scm:56
339368
msgid ""
340369
"This dictionary allows you to augment search results on the main view\n"
341370
"         with pitch accent (pronunciation) information.  Japanese is not "

345374
"         words better, with a standard Japanese pitch accent."
346375
msgstr ""
347376
348-
#: tools/list.scm:63
377+
#: tools/list.scm:64
349378
msgid ""
350379
"This dictionary allows you to do searches on the main view of this app.\n"
351380
"\tFailing to download one of these dictionaries will make the app unusable\n"

353382
"\tby kanji, reading (kana) and by French translation."
354383
msgstr ""
355384
356-
#: tools/list.scm:47
385+
#: tools/list.scm:48
357386
msgid ""
358387
"This dictionary allows you to do searches on the main view of this app.\n"
359388
"        Failing to download one of these dictionaries will make the app "

363392
"        by kanji, reading (kana) and by German translation."
364393
msgstr ""
365394
366-
#: tools/list.scm:80
395+
#: tools/list.scm:94
367396
msgid ""
368397
"This dictionary allows you to do searches on the main view of this app.\n"
369398
"        Failing to download one of these dictionaries will make the app "

373402
"        kanji, reading (kana) and by meaning in the languages you selected."
374403
msgstr ""
375404
376-
#: tools/list.scm:34
405+
#: tools/list.scm:35
377406
msgid ""
378407
"This dictionary allows you to enter kanji by selecting some of its\n"
379408
"    components.  Tap the water component button on the bottom of the screen "

381410
"    access the kanji selection by component view"
382411
msgstr ""
383412
384-
#: tools/list.scm:92
413+
#: tools/list.scm:106
385414
msgid ""
386415
"This dictionary allows you to search for kanji and view kanji information\n"
387416
"        such as number of strokes, pronunciations and meanings."
388417
msgstr ""
389418
390-
#: tools/list.scm:41
419+
#: tools/list.scm:42
391420
msgid ""
392421
"This dictionary allows you to see how a kanji is written, what it is\n"
393422
"composed of, and the order in which strokes are written."

tatoeba.mk unknown status 1

1+
# Tatoeba example sentences: one dictionary per translation language, all
# built from the same set of CSV exports.
TATOEBA_LANGS=eng fra rus spa ukr
# Sources whose modification must trigger a rebuild of the dictionaries.
# (Assigned with ?= so that a definition made before this file is included
# still wins.)
TATOEBA_MODULES?=tools/tatoeba.scm modules/nani/sentence/sentence.scm
DICOS+=$(addprefix dicos/tatoeba_, $(addsuffix .nani, $(TATOEBA_LANGS)))
TATOEBA_DOWNLOADS+=$(addprefix dictionaries/tatoeba_, $(addsuffix .csv, sentences_detailed sentences_base tags user_languages))
DOWNLOADS+=$(TATOEBA_DOWNLOADS)

.PRECIOUS: dictionaries/tatoeba_%.csv

# The stem ($*) is the name of the export on the Tatoeba server, e.g.
# "sentences_detailed" for dictionaries/tatoeba_sentences_detailed.csv.
dictionaries/tatoeba_%.csv:
	wget https://downloads.tatoeba.org/exports/$*.tar.bz2 -O $@.tar.bz2 --continue
	tar xf $@.tar.bz2 -C dictionaries
	mv dictionaries/$*.csv $@

# The stem ($*) is the translation language, e.g. "eng".
# NOTE(review): $(RADK_MODULES) is kept from the original rule; presumably it
# brings in shared modules -- confirm it is needed here.
dicos/tatoeba_%.nani: $(TATOEBA_DOWNLOADS) $(TATOEBA_MODULES) $(RADK_MODULES)
	guile -L modules tools/tatoeba.scm $* $@

tools/list.scm

2222
(use-modules (nani kanji kanjivg))
2323
(use-modules (nani result result))
2424
(use-modules (nani pitch pitch))
25+
(use-modules (nani sentence sentence))
2526
(use-modules (gcrypt hash))
2627
(use-modules (ice-9 match))
2728
(use-modules (ice-9 format))

6566
	as you can't search for anything.  This dictionary can be searched for
6667
	by kanji, reading (kana) and by French translation."))
6768
69+
  ;; Translatable one-line synopsis of the Tatoeba dictionary for LANG, one of
  ;; the language codes "eng", "fra", "rus", "spa" or "ukr".  Any other value
  ;; makes `match' fail.
  (define (tatoeba-synopsis lang)
    (match lang
      ("eng" `(_ "Japanese/English aligned sentences from the Tatoeba project."))
      ("fra" `(_ "Japanese/French aligned sentences from the Tatoeba project."))
      ("rus" `(_ "Japanese/Russian aligned sentences from the Tatoeba project."))
      ("spa" `(_ "Japanese/Spanish aligned sentences from the Tatoeba project."))
      ("ukr" `(_ "Japanese/Ukrainian aligned sentences from the Tatoeba project."))))
  ;; Translatable long description of a Tatoeba dictionary.  The text is the
  ;; same for every language, so LANG is not used.
  (define (tatoeba-description lang)
    `(_ "Tatoeba is a collection of sentences and translations. This
        dictionary contains pairs of sentences that are direct translations of
        one another, which allows you to see example sentences in search
        results."))
81+
6882
  (define (jmdict-synopsis lang)
6983
    (match lang
7084
      ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group."))

123137
             (let ((dico-lang (substring dico 7)))
124138
               (if long?
125139
                   (jmdict-description dico-lang)
126-
                   (jmdict-synopsis dico-lang))))))
140+
                   (jmdict-synopsis dico-lang))))
141+
            ((equal? (dico-type dico) "tatoeba")
142+
             (let ((dico-lang (substring dico 8)))
143+
               (if long?
144+
                   (tatoeba-description dico-lang)
145+
                   (tatoeba-synopsis dico-lang))))))
127146
         (translated (translate english lang)))
128147
    (if (and (equal? english translated) (not (equal? lang "en")))
129148
        #f

149168
     "kanjidic")
150169
    ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict"))
151170
     "jmdict")
171+
    ((and (> (string-length file) 7) (equal? (substring file 0 7) "tatoeba"))
172+
     "tatoeba")
152173
    ((equal? file "jibiki_fre") "jibiki")
153174
    ((equal? file "wadoku_ger") "wadoku")
154175
    ((equal? file "wadoku_pitch") "wadoku_pitch")))

163184
     (kanjidic-entry-count file))
164185
    ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki"))
165186
     (dictionary-entry-count file))
187+
    ((equal? (dico-type (dico-name file)) "tatoeba")
188+
     (sentence-dictionary-entry-count file))
166189
    ((equal? (dico-type (dico-name file)) "wadoku_pitch")
167190
     (pitch-entry-count file))))
168191

tools/tatoeba.scm unknown status 1

1+
;;; Nani Project website
2+
;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu>
3+
;;;
4+
;;; This file is part of the Nani Project website.
5+
;;;
6+
;;; The Nani Project website is free software; you can redistribute it and/or modify it
7+
;;; under the terms of the GNU Affero General Public License as published by
8+
;;; the Free Software Foundation; either version 3 of the License, or (at
9+
;;; your option) any later version.
10+
;;;
11+
;;; The Nani Project website is distributed in the hope that it will be useful, but
12+
;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13+
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14+
;;; GNU Affero General Public License for more details.
15+
;;;
16+
;;; You should have received a copy of the GNU Affero General Public License
17+
;;; along with the Nani Project website.  If not, see <http://www.gnu.org/licenses/>.
18+
19+
(use-modules (srfi srfi-9))
20+
(use-modules (srfi srfi-11))
21+
(use-modules (ice-9 match))
22+
(use-modules (ice-9 binary-ports))
23+
(use-modules (ice-9 textual-ports))
24+
(use-modules (nani sentence sentence))
25+
26+
;; Path of the downloaded Tatoeba export FILE (e.g. "tags.csv"): the
;; "dictionaries" directory next to the directory containing this script,
;; with the "tatoeba_" prefix added to the file name.
(define (tatoeba-file file)
  (let ((script-dir (dirname (current-filename))))
    (string-append script-dir "/../dictionaries/tatoeba_" file)))
28+
29+
;; Read sentences_detailed.csv and return two values, both lists of
;; (id . text) pairs in file order:
;;   1. the Japanese sentences whose author is a member of NATIVE-USERS;
;;   2. the sentences whose language is LANG.
;; Every line must have exactly six tab-separated fields.
(define (get-sentences lang native-users)
  (define (entry id text)
    (cons (string->number id) text))
  (call-with-input-file (tatoeba-file "sentences_detailed.csv")
    (lambda (port)
      (let loop ((jpn '()) (trans '()))
        (let ((line (get-line port)))
          (if (eof-object? line)
              (values (reverse jpn) (reverse trans))
              (match (string-split line #\tab)
                ((id slang text user _ _)
                 (cond
                   ((equal? lang slang)
                    (loop jpn (cons (entry id text) trans)))
                   ((and (equal? slang "jpn") (member user native-users))
                    (loop (cons (entry id text) jpn) trans))
                   (else
                    (loop jpn trans)))))))))))
45+
46+
;; Read user_languages.csv and return the list of user names that declare
;; Japanese ("jpn") with level "5".
;;
;; Every record has four tab-separated fields: language, level, user name and
;; a free-form comment.  A comment may continue on following lines, in which
;; case each line but the last ends with a backslash.
(define (get-native-jpn-users)
  (call-with-input-file (tatoeba-file "user_languages.csv")
    (lambda (port)
      (let loop ((jpn '()) (line (get-line port)))
        (if (eof-object? line)
          jpn
          (match (string-split line #\tab)
            ((ulang level user _)
             ;; Consume the rest of the comment if it is on multiple lines.
             ;; Stop at end of file too: the EOF object is not a string and
             ;; must not be passed to `string-suffix?'.
             (let skip ((current line))
               (when (and (string? current)
                          (string-suffix? "\\" current))
                 (skip (get-line port))))
             ;; Add native Japanese user.
             (if (and (equal? ulang "jpn") (equal? level "5"))
               (loop (cons user jpn) (get-line port))
               (loop jpn (get-line port))))))))))
62+
63+
;; Read sentences_base.csv and return the list of (jpn-id . trans-id) pairs
;; linking a Japanese sentence to a translated sentence.
;;
;; JPN and TRANS are vectors of sentence ids sorted in increasing order.  A
;; line of the file has two tab-separated fields; it is kept when one field is
;; an id found in JPN and the other an id found in TRANS, whatever the
;; direction.  Fields that are not numbers never match.  Pairs are returned in
;; reverse file order.
(define (get-translations jpn trans)
  (define jpn-len (vector-length jpn))
  (define trans-len (vector-length trans))

  ;; Binary search for ELEM in the first LEN elements of the sorted VECT.
  (define (member-vect elem vect len)
    (let loop ((low 0) (high (- len 1)))
      (and (<= low high)
           (let* ((mid (quotient (+ low high) 2))
                  (candidate (vector-ref vect mid)))
             (cond
               ((= candidate elem) #t)
               ((> candidate elem) (loop low (- mid 1)))
               (else (loop (+ mid 1) high)))))))

  ;; N is the result of `string->number', hence possibly #f.
  (define (jpn-id? n)
    (and n (member-vect n jpn jpn-len)))
  (define (trans-id? n)
    (and n (member-vect n trans trans-len)))

  ;; Without ids on either side no pair can match: do not read the file.
  (if (or (zero? jpn-len) (zero? trans-len))
    '()
    (call-with-input-file (tatoeba-file "sentences_base.csv")
      (lambda (port)
        (let loop ((translations '()) (line (get-line port)))
          (if (eof-object? line)
            translations
            (match (string-split line #\tab)
              ((id base)
               (let ((id (string->number id))
                     (base (string->number base)))
                 (loop
                   (cond
                     ((and (trans-id? id) (jpn-id? base))
                      (cons (cons base id) translations))
                     ((and (jpn-id? id) (trans-id? base))
                      (cons (cons id base) translations))
                     (else translations))
                   (get-line port)))))))))))
99+
100+
;; Build the list of <sentence> records for TRANSLATIONS.
;;
;; TRANSLATIONS is a list of (jpn-id . trans-id) pairs; TRANS and JPN are
;; association lists from sentence id to text.  Every record gets the tags
;; listed in tags.csv for its Japanese sentence and no audio.
(define (add-tags translations trans jpn)
  ;; Hash table equivalent to ALIST.  When a key appears more than once the
  ;; first occurrence wins, as with `assoc-ref'.
  (define (alist->table alist)
    (let ((table (make-hash-table)))
      (for-each
        (match-lambda
          ((id . value)
           (unless (hash-get-handle table id)
             (hash-set! table id value))))
        alist)
      table))

  (define trans-table (alist->table trans))
  (define jpn-table (alist->table jpn))

  ;; Hash table from Japanese sentence id to its list of tags.  Every line of
  ;; tags.csv is read; lines that do not have exactly two tab-separated
  ;; fields, or whose id is not a number, are ignored.  Only tags of known
  ;; Japanese sentences are kept since no other tag is ever looked up.
  (define tags
    (call-with-input-file (tatoeba-file "tags.csv")
      (lambda (port)
        (let ((table (make-hash-table)))
          (let loop ((line (get-line port)))
            (if (eof-object? line)
              table
              (begin
                (match (string-split line #\tab)
                  ((id tag)
                   (let ((id (string->number id)))
                     (when (and id (hash-get-handle jpn-table id))
                       (hash-set! table id
                                  (cons tag (hash-ref table id '()))))))
                  (_ #f))
                (loop (get-line port)))))))))

  (map
    (match-lambda
      ((jpn-id . trans-id)
       (make-sentence (hash-ref jpn-table jpn-id)
                      (hash-ref trans-table trans-id)
                      (hash-ref tags jpn-id '())
                      #f)))
    translations))
118+
119+
;; Return the list of <sentence> records pairing Japanese sentences with
;; their translation in LANG, printing a few counts along the way.
(define (get-tatoeba-sentences lang)
  ;; Sorted vector of the ids of ENTRIES, a list of (id . text) pairs.
  (define (sorted-ids entries)
    (list->vector (sort (map car entries) <)))
  (call-with-values
    (lambda ()
      (get-sentences lang (get-native-jpn-users)))
    (lambda (jpn trans)
      (format #t "jpn: ~a sentences~%" (length jpn))
      (format #t "~a: ~a sentences~%" lang (length trans))
      (let ((translations
              (get-translations (sorted-ids jpn) (sorted-ids trans))))
        (format #t "~a pairs~%" (length translations))
        (add-tags translations trans jpn)))))
130+
131+
;; Build the sentence dictionary for LANG and write it to the file OUTPUT.
(define (write-dictionary lang output)
  (let ((sentences (get-tatoeba-sentences lang)))
    (format #t "Number of entries in ~a: ~a~%" output (length sentences))
    (call-with-output-file output
      (lambda (port)
        (put-bytevector port (serialize-sentence-dictionary sentences))))))

;; Entry point.  The script expects exactly two arguments: the language code
;; and the output file name.
(match (command-line)
  ((_ lang output)
   (write-dictionary lang output)))