Add tatoeba
Makefile
14 | 14 | include kanjidic.mk | |
15 | 15 | include kanjivg.mk | |
16 | 16 | include radicals.mk | |
17 | + | include tatoeba.mk | |
17 | 18 | include wadoku.mk | |
18 | 19 | ||
19 | 20 | # Files that constitute the website | |
… | |||
54 | 55 | touch site | |
55 | 56 | ||
56 | 57 | download: | |
57 | - | @rm -f dictionaries/* | |
58 | + | @rm -rf dictionaries/* | |
58 | 59 | @$(MAKE) $(DOWNLOADS) | |
59 | 60 | ||
60 | 61 | po/%/LC_MESSAGES/nani.mo: po/%.po |
manifest.scm
6 | 6 | ||
7 | 7 | ; for all | |
8 | 8 | "gettext")) | |
9 | + |
modules/nani/sentence/sentence.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (define-module (nani sentence sentence) | |
20 | + | #:use-module (ice-9 binary-ports) | |
21 | + | #:use-module (ice-9 match) | |
22 | + | #:use-module (rnrs bytevectors) | |
23 | + | #:use-module (srfi srfi-9) | |
24 | + | #:use-module (nani encoding serialize) | |
25 | + | #:use-module (nani encoding huffman) | |
26 | + | #:use-module (nani encoding trie) | |
27 | + | #:use-module (mecab mecab) | |
28 | + | #:export (make-sentence | |
29 | + | sentence? | |
30 | + | sentence-jpn | |
31 | + | sentence-trans | |
32 | + | sentence-tags | |
33 | + | sentence-audio | |
34 | + | ||
35 | + | serialize-sentence | |
36 | + | serialize-sentence-dictionary | |
37 | + | sentence-dictionary-entry-count)) | |
38 | + | ||
;; One Japanese sentence together with its translation.
;;
;;   position  where the sentence was written in the output bytevector;
;;             not a constructor argument, it is filled in by
;;             serialize-sentence
;;   jpn       the Japanese text
;;   trans     the translated text
;;   tags      list of tag strings
;;   audio     name of an audio file, or #f when there is none
(define-record-type <sentence>
  (make-sentence jpn trans tags audio)
  sentence?
  (position sentence-position sentence-position-set!)
  (jpn      sentence-jpn)
  (trans    sentence-trans)
  (tags     sentence-tags)
  (audio    sentence-audio))
47 | + | ||
;; Write the audio attached to a sentence into BV starting at POS: first
;; the size of the file (via serialize-u16), then the raw file content.
;; FILENAME is a file name, or #f when the sentence has no audio, in which
;; case only a size of 0 is written.
;; Returns the position that follows the written data.
;;
;; NOTE(review): the size goes through serialize-u16, so files larger than
;; 65535 bytes presumably cannot be represented -- confirm against
;; (nani encoding serialize).
(define (serialize-audio filename pos bv)
  (let* ((size (if filename (stat:size (stat filename)) 0))
         (pos (serialize-u16 size pos bv)))
    ;; get-bytevector-all returns the EOF object for an empty file, which
    ;; bytevector-copy! cannot handle: only read when there is content.
    (if (and filename (> size 0))
        (let ((fbv (call-with-input-file filename get-bytevector-all
                     #:binary #t)))
          (bytevector-copy! fbv 0 bv pos size)
          (+ pos size))
        pos)))
;; Number of bytes serialize-audio needs for FILENAME: the size prefix
;; plus the file content.  FILENAME may be #f (no audio), in which case
;; the content part is empty.
(define (audio-size filename)
  (let ((payload (if filename
                     (stat:size (stat filename))
                     0)))
    (+ (u16-size payload) payload)))
59 | + | ||
;; Return a procedure (sentence pos bv) that writes SENTENCE into BV at
;; POS and returns the position after it.  The Japanese text is encoded
;; with JPN-HUFFMAN; the translation and the tags with TRANS-HUFFMAN; the
;; audio comes last.
;; As a side effect the start position is stored in the sentence record.
;; Throws 'not-sentence when given something that is not a <sentence>.
(define (serialize-sentence jpn-huffman trans-huffman)
  (lambda (sentence pos bv)
    (unless (sentence? sentence)
      (throw 'not-sentence sentence))
    ;; Remember where this sentence starts in the output.
    (sentence-position-set! sentence pos)
    (let* ((after-jpn
            ((serialize-huffman-string jpn-huffman)
             (sentence-jpn sentence) pos bv))
           (after-trans
            ((serialize-huffman-string trans-huffman)
             (sentence-trans sentence) after-jpn bv))
           (after-tags
            ((serialize-list (serialize-huffman-string trans-huffman))
             (sentence-tags sentence) after-trans bv)))
      (serialize-audio (sentence-audio sentence) after-tags bv))))
;; Return a procedure that computes how many bytes serialize-sentence
;; writes for a sentence, using the same two Huffman codes.
;; Throws 'not-sentence when given something that is not a <sentence>.
(define (sentence-size jpn-huffman trans-huffman)
  (lambda (sentence)
    (unless (sentence? sentence)
      (throw 'not-sentence sentence))
    (let* ((jpn-bytes
            ((huffman-string-size jpn-huffman) (sentence-jpn sentence)))
           (trans-bytes
            ((huffman-string-size trans-huffman) (sentence-trans sentence)))
           (tags-bytes
            ((list-size (huffman-string-size trans-huffman))
             (sentence-tags sentence)))
           (audio-bytes
            (audio-size (sentence-audio sentence))))
      (+ jpn-bytes trans-bytes tags-bytes audio-bytes))))
79 | + | ||
;; Turn the string KEY into a trie key: the UTF-8 bytes of KEY, each one
;; split into its high and low 4-bit halves (values 0 to 15), high half
;; first.
(define (make-key key)
  ;; Walk the bytes from last to first so the result can be built by
  ;; consing onto the front.
  (let loop ((bytes (reverse (bytevector->u8-list (string->utf8 key))))
             (nibbles '()))
    (if (null? bytes)
        nibbles
        (let ((byte (car bytes)))
          (loop (cdr bytes)
                (cons (quotient byte 16)
                      (cons (modulo byte 16) nibbles)))))))
86 | + | ||
;; Replace, in TRIE and in every node below it, each stored value (an
;; index into the array SENTENCES) with the position recorded in the
;; corresponding sentence.
(define (update-trie-pos! trie sentences)
  (trie-vals-set!
   trie
   (map (lambda (index)
          (sentence-position (array-ref sentences index)))
        (trie-vals trie)))
  ;; Transitions are (char . child) pairs: recurse into each child.
  (for-each
   (lambda (transition)
     (update-trie-pos! (cdr transition) sentences))
   (trie-transitions trie)))
96 | + | ||
;; Serialize the list SENTENCES into a bytevector and return it.
;;
;; Layout of the result, in order:
;;   - the header "NANI_SENTENCE001"
;;   - a 4-byte big-endian pointer to the trie
;;   - the Huffman table for Japanese text
;;   - the Huffman table for translations and tags
;;   - the sentences
;;   - the number of sentences (written with serialize-int)
;;   - a trie whose keys come from the words mecab-words returns for each
;;     Japanese sentence and whose values are sentence positions
;;
;; Side effects: records the position of every sentence in its record and
;; prints some statistics on the current output port.
(define (serialize-sentence-dictionary sentences)
  (define jpn-huffman
    (create-huffman (map sentence-jpn sentences)))
  (define jpn-huffman-code (huffman->code jpn-huffman))

  ;; Translations and tags are encoded with the same Huffman table.
  (define trans-huffman
    (let ((trans (map sentence-trans sentences))
          (tags (apply append (map sentence-tags sentences))))
      (create-huffman (append trans tags))))
  (define trans-huffman-code (huffman->code trans-huffman))

  ;; Build a trie that maps every word of every sentence to the index of
  ;; that sentence in SENTENCES.
  (define (make-sentence-trie sentences)
    (let ((trie (make-empty-trie))
          (tagger (mecab-new-tagger '())))
      (let loop ((sentences sentences) (i 0))
        (if (null? sentences)
            (begin
              (mecab-destroy tagger)
              (compress-trie trie))
            (begin
              (for-each
               (lambda (key)
                 (add-to-trie! trie (make-key key) i))
               (mecab-words tagger (sentence-jpn (car sentences))))
              (loop (cdr sentences) (+ i 1)))))))

  ;; Number of nodes in TRIE, itself included.
  (define (trie-node-size trie)
    (apply + 1 (map trie-node-size (map cdr (trie-transitions trie)))))

  (let* ((header (string->utf8 "NANI_SENTENCE001"))
         (header-size (bytevector-length header))
         (pointers (make-bytevector 4 0))
         (jpn-huffman-bv (serialize-huffman jpn-huffman))
         (jpn-huffman-size (bytevector-length jpn-huffman-bv))
         (trans-huffman-bv (serialize-huffman trans-huffman))
         (trans-huffman-size (bytevector-length trans-huffman-bv))
         ;; From here on, serialize-trie and trie-size denote the versions
         ;; specialised for integer values.
         (serialize-trie (serialize-trie serialize-int int-size))
         (trie-size (trie-size int-size))
         (sentence-trie (make-sentence-trie sentences))
         (sentence-trie-size (trie-size sentence-trie))
         (sentences-size
          ((list-size (sentence-size jpn-huffman-code trans-huffman-code)
                      #:size? #f)
           sentences))
         (huffman-size (+ jpn-huffman-size trans-huffman-size))
         ;; Offsets of the successive sections of the file.
         (sentences-pos (+ header-size 4 huffman-size))
         (count-pos (+ sentences-pos sentences-size))
         (pos-trie (+ count-pos 4))
         (bv (make-bytevector (+ pos-trie sentence-trie-size))))
    (format #t "Number of nodes in trie: ~a~%" (trie-node-size sentence-trie))
    ((serialize-list (serialize-sentence jpn-huffman-code trans-huffman-code)
                     #:size? #f)
     sentences sentences-pos bv)
    ;; Serializing the sentences recorded the position of each of them:
    ;; the trie can now point to positions instead of list indices.
    (update-trie-pos! sentence-trie (list->array 1 sentences))
    ;; number of entries
    (serialize-int (length sentences) count-pos bv)
    (serialize-trie sentence-trie pos-trie bv)
    ;; point to the trie structure
    ;; NOTE(review): this uses (int-size 0) where pos-trie uses a literal 4;
    ;; both agree only if an int takes 4 bytes -- confirm.
    (bytevector-u32-set!
     pointers 0
     (+ count-pos (int-size 0))
     (endianness big))
    ;; copy to result bytevector
    (bytevector-copy! header 0 bv 0 header-size)
    (bytevector-copy! pointers 0 bv header-size 4)
    (bytevector-copy! jpn-huffman-bv 0 bv (+ header-size 4) jpn-huffman-size)
    (bytevector-copy! trans-huffman-bv 0 bv (+ header-size 4 jpn-huffman-size)
                      trans-huffman-size)
    ;; give some feedback on the size of the file's structures
    (format #t "huffmans are ~a bytes long~%" huffman-size)
    (format #t "sentences are ~a bytes long~%" sentences-size)
    (format #t "trie is ~a bytes long~%" sentence-trie-size)
    bv))
177 | + | ||
;; Return the number of sentences stored in the sentence dictionary FILE.
;;
;; The file starts with a 16-byte header followed by a 4-byte big-endian
;; pointer to the trie; the entry count is the 4-byte big-endian integer
;; located just before the position that pointer designates.
(define (sentence-dictionary-entry-count file)
  (call-with-input-file file
    (lambda (port)
      ;; Skip the header.  It is not decoded: its content is not needed
      ;; and decoding arbitrary bytes as UTF-8 could fail.
      (get-bytevector-n port 16)
      (let* ((pointers (get-bytevector-n port 4))
             (end-pos (bytevector-u32-ref pointers 0 (endianness big))))
        (seek port (- end-pos 4) SEEK_SET)
        (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big))))
    #:binary #t))
po/nani.pot
8 | 8 | msgstr "" | |
9 | 9 | "Project-Id-Version: PACKAGE VERSION\n" | |
10 | 10 | "Report-Msgid-Bugs-To: \n" | |
11 | - | "POT-Creation-Date: 2021-07-29 21:50+0200\n" | |
11 | + | "POT-Creation-Date: 2022-07-09 21:44+0200\n" | |
12 | 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" | |
13 | 13 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" | |
14 | 14 | "Language-Team: LANGUAGE <LL@li.org>\n" | |
… | |||
102 | 102 | msgid "JMdict" | |
103 | 103 | msgstr "" | |
104 | 104 | ||
105 | - | #: tools/list.scm:71 | |
105 | + | #: tools/list.scm:85 | |
106 | 106 | msgid "" | |
107 | 107 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
108 | 108 | "Development Group." | |
109 | 109 | msgstr "" | |
110 | 110 | ||
111 | - | #: tools/list.scm:70 | |
111 | + | #: tools/list.scm:84 | |
112 | 112 | msgid "" | |
113 | 113 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
114 | 114 | "Development Group." | |
115 | 115 | msgstr "" | |
116 | 116 | ||
117 | 117 | #: tools/list.scm:72 | |
118 | + | msgid "Japanese/French aligned sentences from the Tatoeba project." | |
119 | + | msgstr "" | |
120 | + | ||
121 | + | #: tools/list.scm:86 | |
118 | 122 | msgid "" | |
119 | 123 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
120 | 124 | "Development Group." | |
121 | 125 | msgstr "" | |
122 | 126 | ||
123 | - | #: tools/list.scm:61 | |
127 | + | #: tools/list.scm:62 | |
124 | 128 | msgid "Japanese/French dictionary from the Jibiki project." | |
125 | 129 | msgstr "" | |
126 | 130 | ||
127 | - | #: tools/list.scm:45 | |
131 | + | #: tools/list.scm:46 | |
128 | 132 | msgid "Japanese/German dictionary from Wadoku." | |
129 | 133 | msgstr "" | |
130 | 134 | ||
131 | - | #: tools/list.scm:73 | |
135 | + | #: tools/list.scm:87 | |
132 | 136 | msgid "" | |
133 | 137 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
134 | 138 | "Development Group." | |
135 | 139 | msgstr "" | |
136 | 140 | ||
137 | - | #: tools/list.scm:74 | |
141 | + | #: tools/list.scm:88 | |
138 | 142 | msgid "" | |
139 | 143 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
140 | 144 | "Development Group." | |
141 | 145 | msgstr "" | |
142 | 146 | ||
143 | - | #: tools/list.scm:75 | |
147 | + | #: tools/list.scm:73 | |
148 | + | msgid "Japanese/Russian aligned sentences from the Tatoeba project." | |
149 | + | msgstr "" | |
150 | + | ||
151 | + | #: tools/list.scm:89 | |
144 | 152 | msgid "" | |
145 | 153 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
146 | 154 | "Development Group." | |
147 | 155 | msgstr "" | |
148 | 156 | ||
149 | - | #: tools/list.scm:76 | |
157 | + | #: tools/list.scm:90 | |
150 | 158 | msgid "" | |
151 | 159 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
152 | 160 | "Development Group." | |
153 | 161 | msgstr "" | |
154 | 162 | ||
155 | - | #: tools/list.scm:77 | |
163 | + | #: tools/list.scm:74 | |
164 | + | msgid "Japanese/Spanish aligned sentences from the Tatoeba project." | |
165 | + | msgstr "" | |
166 | + | ||
167 | + | #: tools/list.scm:91 | |
156 | 168 | msgid "" | |
157 | 169 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
158 | 170 | "Development Group." | |
159 | 171 | msgstr "" | |
160 | 172 | ||
161 | - | #: tools/list.scm:78 | |
173 | + | #: tools/list.scm:92 | |
162 | 174 | msgid "" | |
163 | 175 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
164 | 176 | "Development Group." | |
165 | 177 | msgstr "" | |
166 | 178 | ||
167 | - | #: tools/list.scm:87 | |
179 | + | #: tools/list.scm:75 | |
180 | + | msgid "Japanese/Ukrainian aligned sentences from the Tatoeba project." | |
181 | + | msgstr "" | |
182 | + | ||
183 | + | #: tools/list.scm:71 | |
184 | + | msgid "Japanese/english aligned sentences from the Tatoeba project." | |
185 | + | msgstr "" | |
186 | + | ||
187 | + | #: tools/list.scm:101 | |
168 | 188 | msgid "Kanji dictionary with English meanings." | |
169 | 189 | msgstr "" | |
170 | 190 | ||
171 | - | #: tools/list.scm:89 | |
191 | + | #: tools/list.scm:103 | |
172 | 192 | msgid "Kanji dictionary with French meanings." | |
173 | 193 | msgstr "" | |
174 | 194 | ||
175 | - | #: tools/list.scm:90 | |
195 | + | #: tools/list.scm:104 | |
176 | 196 | msgid "Kanji dictionary with Portuguese meanings." | |
177 | 197 | msgstr "" | |
178 | 198 | ||
179 | - | #: tools/list.scm:88 | |
199 | + | #: tools/list.scm:102 | |
180 | 200 | msgid "Kanji dictionary with Spanish meanings." | |
181 | 201 | msgstr "" | |
182 | 202 | ||
183 | - | #: tools/list.scm:39 | |
203 | + | #: tools/list.scm:40 | |
184 | 204 | msgid "Kanji writing visual help by the Kanjivg project." | |
185 | 205 | msgstr "" | |
186 | 206 | ||
… | |||
233 | 253 | msgid "Phone: " | |
234 | 254 | msgstr "" | |
235 | 255 | ||
236 | - | #: tools/list.scm:53 | |
256 | + | #: tools/list.scm:54 | |
237 | 257 | msgid "Pitch accent dictionary from Wadoku." | |
238 | 258 | msgstr "" | |
239 | 259 | ||
240 | - | #: tools/list.scm:32 | |
260 | + | #: tools/list.scm:33 | |
241 | 261 | msgid "" | |
242 | 262 | "Radical to Kanji dictionary from the Electronic Dictionary Research and " | |
243 | 263 | "Development Group." | |
… | |||
281 | 301 | msgid "Tap on the search button and you'll see the results. Easy, right?" | |
282 | 302 | msgstr "" | |
283 | 303 | ||
304 | + | #: tools/list.scm:77 | |
305 | + | msgid "" | |
306 | + | "Tatoeba is a collection of sentences and translations. This\n" | |
307 | + | " dictionary contains pairs of sentences that are direct translations " | |
308 | + | "of\n" | |
309 | + | " one another, which allows you to see example sentences in search\n" | |
310 | + | " results." | |
311 | + | msgstr "" | |
312 | + | ||
284 | 313 | #: pages/e404.scm:27 | |
285 | 314 | msgid "That's a 404 :/" | |
286 | 315 | msgstr "" | |
… | |||
335 | 364 | "In the following sections we will see how to use them." | |
336 | 365 | msgstr "" | |
337 | 366 | ||
338 | - | #: tools/list.scm:55 | |
367 | + | #: tools/list.scm:56 | |
339 | 368 | msgid "" | |
340 | 369 | "This dictionary allows you to augment search results on the main view\n" | |
341 | 370 | " with pitch accent (pronunciation) information. Japanese is not " | |
… | |||
345 | 374 | " words better, with a standard Japanese pitch accent." | |
346 | 375 | msgstr "" | |
347 | 376 | ||
348 | - | #: tools/list.scm:63 | |
377 | + | #: tools/list.scm:64 | |
349 | 378 | msgid "" | |
350 | 379 | "This dictionary allows you to do searches on the main view of this app.\n" | |
351 | 380 | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
… | |||
353 | 382 | "\tby kanji, reading (kana) and by French translation." | |
354 | 383 | msgstr "" | |
355 | 384 | ||
356 | - | #: tools/list.scm:47 | |
385 | + | #: tools/list.scm:48 | |
357 | 386 | msgid "" | |
358 | 387 | "This dictionary allows you to do searches on the main view of this app.\n" | |
359 | 388 | " Failing to download one of these dictionaries will make the app " | |
… | |||
363 | 392 | " by kanji, reading (kana) and by German translation." | |
364 | 393 | msgstr "" | |
365 | 394 | ||
366 | - | #: tools/list.scm:80 | |
395 | + | #: tools/list.scm:94 | |
367 | 396 | msgid "" | |
368 | 397 | "This dictionary allows you to do searches on the main view of this app.\n" | |
369 | 398 | " Failing to download one of these dictionaries will make the app " | |
… | |||
373 | 402 | " kanji, reading (kana) and by meaning in the languages you selected." | |
374 | 403 | msgstr "" | |
375 | 404 | ||
376 | - | #: tools/list.scm:34 | |
405 | + | #: tools/list.scm:35 | |
377 | 406 | msgid "" | |
378 | 407 | "This dictionary allows you to enter kanji by selecting some of its\n" | |
379 | 408 | " components. Tap the water component button on the bottom of the screen " | |
… | |||
381 | 410 | " access the kanji selection by component view" | |
382 | 411 | msgstr "" | |
383 | 412 | ||
384 | - | #: tools/list.scm:92 | |
413 | + | #: tools/list.scm:106 | |
385 | 414 | msgid "" | |
386 | 415 | "This dictionary allows you to search for kanji and view kanji information\n" | |
387 | 416 | " such as number of strokes, pronunciations and meanings." | |
388 | 417 | msgstr "" | |
389 | 418 | ||
390 | - | #: tools/list.scm:41 | |
419 | + | #: tools/list.scm:42 | |
391 | 420 | msgid "" | |
392 | 421 | "This dictionary allows you to see how a kanji is written, what it is\n" | |
393 | 422 | "composed of, and the order in which strokes are written." |
tatoeba.mk unknown status 1
# Languages for which a Japanese/<language> sentence dictionary is built.
TATOEBA_LANGS=eng fra rus spa ukr
# Sources whose modification must trigger a rebuild of the dictionaries.
# This variable was used as a prerequisite below without being defined in
# this file; ?= keeps any definition made elsewhere.
# NOTE(review): the modules under modules/nani/encoding are also used by
# the generator -- add them if they are not already tracked elsewhere.
TATOEBA_MODULES?=tools/tatoeba.scm modules/nani/sentence/sentence.scm
DICOS+=$(addprefix dicos/tatoeba_, $(addsuffix .nani, $(TATOEBA_LANGS)))
TATOEBA_DOWNLOADS+=$(addprefix dictionaries/tatoeba_, $(addsuffix .csv, sentences_detailed sentences_base tags user_languages))
DOWNLOADS+=$(TATOEBA_DOWNLOADS)

.PRECIOUS: dictionaries/tatoeba%.csv

# The stem ($*) is "_<export>", e.g. "_tags" for dictionaries/tatoeba_tags.csv:
# strip the leading underscore to get the name of the export to download.
dictionaries/tatoeba%.csv:
	wget https://downloads.tatoeba.org/exports/$(patsubst _%,%,$*).tar.bz2 -O $@.tar.bz2 --continue
	tar xf $@.tar.bz2 -C dictionaries
	mv dictionaries/$(patsubst _%,%,$*).csv $@

# The stem ($*) is the language code given to the generator.
dicos/tatoeba_%.nani: $(TATOEBA_DOWNLOADS) $(TATOEBA_MODULES) $(RADK_MODULES)
	guile -L modules tools/tatoeba.scm $* $@
tools/list.scm
22 | 22 | (use-modules (nani kanji kanjivg)) | |
23 | 23 | (use-modules (nani result result)) | |
24 | 24 | (use-modules (nani pitch pitch)) | |
25 | + | (use-modules (nani sentence sentence)) | |
25 | 26 | (use-modules (gcrypt hash)) | |
26 | 27 | (use-modules (ice-9 match)) | |
27 | 28 | (use-modules (ice-9 format)) | |
… | |||
65 | 66 | as you can't search for anything. This dictionary can be searched for | |
66 | 67 | by kanji, reading (kana) and by French translation.")) | |
67 | 68 | ||
69 | + | (define (tatoeba-synopsis lang) | |
70 | + | (match lang | |
71 | + | ("eng" `(_ "Japanese/english aligned sentences from the Tatoeba project.")) | |
72 | + | ("fra" `(_ "Japanese/French aligned sentences from the Tatoeba project.")) | |
73 | + | ("rus" `(_ "Japanese/Russian aligned sentences from the Tatoeba project.")) | |
74 | + | ("spa" `(_ "Japanese/Spanish aligned sentences from the Tatoeba project.")) | |
75 | + | ("ukr" `(_ "Japanese/Ukrainian aligned sentences from the Tatoeba project.")))) | |
76 | + | (define (tatoeba-description lang) | |
77 | + | `(_ "Tatoeba is a collection of sentences and translations. This | |
78 | + | dictionary contains pairs of sentences that are direct translations of | |
79 | + | one another, which allows you to see example sentences in search | |
80 | + | results.")) | |
81 | + | ||
68 | 82 | (define (jmdict-synopsis lang) | |
69 | 83 | (match lang | |
70 | 84 | ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group.")) | |
… | |||
123 | 137 | (let ((dico-lang (substring dico 7))) | |
124 | 138 | (if long? | |
125 | 139 | (jmdict-description dico-lang) | |
126 | - | (jmdict-synopsis dico-lang)))))) | |
140 | + | (jmdict-synopsis dico-lang)))) | |
141 | + | ((equal? (dico-type dico) "tatoeba") | |
142 | + | (let ((dico-lang (substring dico 8))) | |
143 | + | (if long? | |
144 | + | (tatoeba-description dico-lang) | |
145 | + | (tatoeba-synopsis dico-lang)))))) | |
127 | 146 | (translated (translate english lang))) | |
128 | 147 | (if (and (equal? english translated) (not (equal? lang "en"))) | |
129 | 148 | #f | |
… | |||
149 | 168 | "kanjidic") | |
150 | 169 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
151 | 170 | "jmdict") | |
171 | + | ((and (> (string-length file) 7) (equal? (substring file 0 7) "tatoeba")) | |
172 | + | "tatoeba") | |
152 | 173 | ((equal? file "jibiki_fre") "jibiki") | |
153 | 174 | ((equal? file "wadoku_ger") "wadoku") | |
154 | 175 | ((equal? file "wadoku_pitch") "wadoku_pitch"))) | |
… | |||
163 | 184 | (kanjidic-entry-count file)) | |
164 | 185 | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
165 | 186 | (dictionary-entry-count file)) | |
187 | + | ((equal? (dico-type (dico-name file)) "tatoeba") | |
188 | + | (sentence-dictionary-entry-count file)) | |
166 | 189 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
167 | 190 | (pitch-entry-count file)))) | |
168 | 191 |
tools/tatoeba.scm unknown status 1
1 | + | ;;; Nani Project website | |
2 | + | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> | |
3 | + | ;;; | |
4 | + | ;;; This file is part of the Nani Project website. | |
5 | + | ;;; | |
6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
9 | + | ;;; your option) any later version. | |
10 | + | ;;; | |
11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + | ;;; GNU Affero General Public License for more details. | |
15 | + | ;;; | |
16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
18 | + | ||
19 | + | (use-modules (srfi srfi-9)) | |
20 | + | (use-modules (srfi srfi-11)) | |
21 | + | (use-modules (ice-9 match)) | |
22 | + | (use-modules (ice-9 binary-ports)) | |
23 | + | (use-modules (ice-9 textual-ports)) | |
24 | + | (use-modules (nani sentence sentence)) | |
25 | + | ||
;; Return the path of the downloaded Tatoeba export FILE (for instance
;; "tags.csv"): "tatoeba_" followed by FILE, in the dictionaries directory
;; next to the directory that contains this script.
(define (tatoeba-file file)
  (let ((script-dir (dirname (current-filename))))
    (string-append script-dir "/../dictionaries/tatoeba_" file)))
28 | + | ||
;; Read sentences_detailed.csv and return two values, both lists of
;; (id . text) pairs in file order:
;;   - the Japanese sentences written by one of NATIVE-USERS
;;   - the sentences whose language is LANG
;; Each row must have six tab-separated fields, of which the first four
;; are the id, the language, the text and the user.
(define (get-sentences lang native-users)
  (define (entry id text)
    (cons (string->number id) text))
  (call-with-input-file (tatoeba-file "sentences_detailed.csv")
    (lambda (port)
      ;; Both lists are accumulated in reverse and put back in order at
      ;; the end of the file.
      (let loop ((jpn '()) (trans '()))
        (let ((line (get-line port)))
          (if (eof-object? line)
              (values (reverse jpn) (reverse trans))
              (match (string-split line #\tab)
                ((id slang text user _ _)
                 (cond
                  ((equal? lang slang)
                   (loop jpn (cons (entry id text) trans)))
                  ((and (equal? slang "jpn") (member user native-users))
                   (loop (cons (entry id text) jpn) trans))
                  (else
                   (loop jpn trans)))))))))))
45 | + | ||
;; Read user_languages.csv and return the list of users who declare
;; Japanese ("jpn") with level "5".
;; Each row must have four tab-separated fields: language, level, user and
;; a comment.  A comment may continue over several lines, each continued
;; line ending with a backslash.
(define (get-native-jpn-users)
  ;; Consume the lines that continue the comment of LINE.  The string?
  ;; test stops at end of file: string-suffix? would otherwise be called
  ;; on the EOF object when the file ends inside a continued comment.
  (define (skip-continuation-lines! port line)
    (let skip ((line line))
      (when (and (string? line) (string-suffix? "\\" line))
        (skip (get-line port)))))

  (call-with-input-file (tatoeba-file "user_languages.csv")
    (lambda (port)
      (let loop ((jpn '()) (line (get-line port)))
        (if (eof-object? line)
            jpn
            (match (string-split line #\tab)
              ((ulang level user _)
               (skip-continuation-lines! port line)
               ;; Add native japanese user
               (let ((jpn (if (and (equal? ulang "jpn") (equal? level "5"))
                              (cons user jpn)
                              jpn)))
                 (loop jpn (get-line port))))))))))
62 | + | ||
;; Read sentences_base.csv and return a list of (jpn-id . trans-id) pairs,
;; one for each row that links a sentence whose id is in TRANS with a
;; sentence whose id is in JPN, whichever of the two columns each id is in.
;; JPN and TRANS are vectors of sentence ids sorted in increasing order.
;; Rows whose fields are not both numbers are ignored.
(define (get-translations jpn trans)
  (define jpn-len (vector-length jpn))
  (define trans-len (vector-length trans))

  ;; Binary search: is ELEM among the first LEN elements of the sorted
  ;; vector VECT?
  (define (member-vect elem vect len)
    (let loop ((low 0) (high (- len 1)))
      (and (<= low high)
           (let* ((mid-pos (quotient (+ low high) 2))
                  (mid-elem (vector-ref vect mid-pos)))
             (cond
              ((= mid-elem elem) #t)
              ((> mid-elem elem) (loop low (- mid-pos 1)))
              (else (loop (+ mid-pos 1) high)))))))

  (call-with-input-file (tatoeba-file "sentences_base.csv")
    (lambda (port)
      (let loop ((translations '()) (line (get-line port)))
        ;; Nothing can match once one side is empty.  (The vectors are
        ;; tested through their length: null? is never true of a vector.)
        (if (or (eof-object? line) (zero? jpn-len) (zero? trans-len))
            translations
            (match (string-split line #\tab)
              ((id translation)
               ;; Parse each field once; string->number returns #f for
               ;; anything that is not a number.
               (let ((id (string->number id))
                     (base (string->number translation)))
                 (loop
                  (cond
                   ((not (and id base))
                    translations)
                   ((and (member-vect id trans trans-len)
                         (member-vect base jpn jpn-len))
                    (cons (cons base id) translations))
                   ((and (member-vect id jpn jpn-len)
                         (member-vect base trans trans-len))
                    (cons (cons id base) translations))
                   (else
                    translations))
                  (get-line port))))))))))
99 | + | ||
;; Build the list of sentence records for TRANSLATIONS, a list of
;; (jpn-id . trans-id) pairs.  TRANS and JPN are association lists from
;; sentence id to text.  Each record receives the tags that tags.csv
;; associates with its Japanese sentence, and no audio.
(define (add-tags translations trans jpn)
  ;; Index ALIST in a hash table so that each lookup below takes constant
  ;; time.  As with assoc-ref, the first pair wins if a key is repeated.
  (define (alist->table alist)
    (let ((table (make-hash-table)))
      (for-each
       (match-lambda
         ((key . value)
          (unless (hash-get-handle table key)
            (hash-set! table key value))))
       alist)
      table))

  ;; Hash table from sentence id to the list of its tags.  Every row of
  ;; tags.csv is read: the loop continues with the next line after each
  ;; row.
  (define tags
    (call-with-input-file (tatoeba-file "tags.csv")
      (lambda (port)
        (let ((table (make-hash-table)))
          (let loop ((line (get-line port)))
            (if (eof-object? line)
                table
                (begin
                  (match (string-split line #\tab)
                    ((id tag)
                     (let ((id (string->number id)))
                       (hash-set! table id
                                  (cons tag (hash-ref table id '())))))
                    ;; Rows that do not have exactly two fields carry no
                    ;; usable tag: skip them instead of aborting.
                    (_ #f))
                  (loop (get-line port)))))))))

  (define trans-table (alist->table trans))
  (define jpn-table (alist->table jpn))

  (map
   (match-lambda
     ((jpn-id . trans-id)
      (make-sentence (hash-ref jpn-table jpn-id)
                     (hash-ref trans-table trans-id)
                     (hash-ref tags jpn-id '())
                     #f)))
   translations))
118 | + | ||
;; Return the list of sentence records pairing Japanese sentences with
;; their translation in LANG, printing a few counts along the way.
(define (get-tatoeba-sentences lang)
  ;; Sorted vector of the ids of ENTRIES, a list of (id . text) pairs.
  (define (sorted-ids entries)
    (list->vector (sort (map car entries) <)))

  (call-with-values
      (lambda ()
        (get-sentences lang (get-native-jpn-users)))
    (lambda (jpn trans)
      (format #t "jpn: ~a sentences~%" (length jpn))
      (format #t "~a: ~a sentences~%" lang (length trans))
      (let ((translations
             (get-translations (sorted-ids jpn) (sorted-ids trans))))
        (format #t "~a pairs~%" (length translations))
        (add-tags translations trans jpn)))))
130 | + | ||
;; Entry point: tatoeba.scm LANG OUTPUT
;; Builds the Japanese/LANG sentence dictionary and writes it to OUTPUT.
(match (command-line)
  ((_ lang output)
   (let ((sentences (get-tatoeba-sentences lang)))
     (format #t "Number of entries in ~a: ~a~%" output (length sentences))
     ;; Serialize before opening OUTPUT, so that a failure does not leave
     ;; behind an empty file that make would consider up to date.
     (let ((bv (serialize-sentence-dictionary sentences)))
       (call-with-output-file output
         (lambda (port)
           (put-bytevector port bv))
         #:binary #t))))
  ((program . _)
   (format (current-error-port) "Usage: ~a LANG OUTPUT~%" program)
   (exit 1)))