Add tatoeba
Makefile
| 14 | 14 | include kanjidic.mk | |
| 15 | 15 | include kanjivg.mk | |
| 16 | 16 | include radicals.mk | |
| 17 | + | include tatoeba.mk | |
| 17 | 18 | include wadoku.mk | |
| 18 | 19 | ||
| 19 | 20 | # Files that constitute the website | |
… | |||
| 54 | 55 | touch site | |
| 55 | 56 | ||
| 56 | 57 | download: | |
| 57 | - | @rm -f dictionaries/* | |
| 58 | + | @rm -rf dictionaries/* | |
| 58 | 59 | @$(MAKE) $(DOWNLOADS) | |
| 59 | 60 | ||
| 60 | 61 | po/%/LC_MESSAGES/nani.mo: po/%.po | |
manifest.scm
| 6 | 6 | ||
| 7 | 7 | ; for all | |
| 8 | 8 | "gettext")) | |
| 9 | + |
modules/nani/sentence/sentence.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (define-module (nani sentence sentence) | |
| 20 | + | #:use-module (ice-9 binary-ports) | |
| 21 | + | #:use-module (ice-9 match) | |
| 22 | + | #:use-module (rnrs bytevectors) | |
| 23 | + | #:use-module (srfi srfi-9) | |
| 24 | + | #:use-module (nani encoding serialize) | |
| 25 | + | #:use-module (nani encoding huffman) | |
| 26 | + | #:use-module (nani encoding trie) | |
| 27 | + | #:use-module (mecab mecab) | |
| 28 | + | #:export (make-sentence | |
| 29 | + | sentence? | |
| 30 | + | sentence-jpn | |
| 31 | + | sentence-trans | |
| 32 | + | sentence-tags | |
| 33 | + | sentence-audio | |
| 34 | + | ||
| 35 | + | serialize-sentence | |
| 36 | + | serialize-sentence-dictionary | |
| 37 | + | sentence-dictionary-entry-count)) | |
| 38 | + | ||
| 39 | + | (define-record-type <sentence> | |
| 40 | + | (make-sentence jpn trans tags audio) | |
| 41 | + | sentence? | |
| 42 | + | (position sentence-position sentence-position-set!) | |
| 43 | + | (jpn sentence-jpn) | |
| 44 | + | (trans sentence-trans) | |
| 45 | + | (tags sentence-tags) | |
| 46 | + | (audio sentence-audio)) | |
| 47 | + | ||
| 48 | + | (define (serialize-audio filename pos bv) | |
| 49 | + | (let ((size (if filename (stat:size (stat filename)) 0))) | |
| 50 | + | (let ((pos (serialize-u16 size pos bv))) | |
| 51 | + | (if filename | |
| 52 | + | (let ((fbv (call-with-input-file filename get-bytevector-all))) | |
| 53 | + | (bytevector-copy! fbv 0 bv pos size) | |
| 54 | + | (+ pos size)) | |
| 55 | + | pos)))) | |
| 56 | + | (define (audio-size filename) | |
| 57 | + | (let ((size (if filename (stat:size (stat filename)) 0))) | |
| 58 | + | (+ (u16-size size) size))) | |
| 59 | + | ||
| 60 | + | (define (serialize-sentence jpn-huffman trans-huffman) | |
| 61 | + | (lambda (sentence pos bv) | |
| 62 | + | (when (not (sentence? sentence)) (throw 'not-sentence sentence)) | |
| 63 | + | (sentence-position-set! sentence pos) | |
| 64 | + | (let* ((pos ((serialize-huffman-string jpn-huffman) | |
| 65 | + | (sentence-jpn sentence) pos bv)) | |
| 66 | + | (pos ((serialize-huffman-string trans-huffman) | |
| 67 | + | (sentence-trans sentence) pos bv)) | |
| 68 | + | (pos ((serialize-list (serialize-huffman-string trans-huffman)) | |
| 69 | + | (sentence-tags sentence) pos bv)) | |
| 70 | + | (pos (serialize-audio (sentence-audio sentence) pos bv))) | |
| 71 | + | pos))) | |
| 72 | + | (define (sentence-size jpn-huffman trans-huffman) | |
| 73 | + | (lambda (sentence) | |
| 74 | + | (when (not (sentence? sentence)) (throw 'not-sentence sentence)) | |
| 75 | + | (+ ((huffman-string-size jpn-huffman) (sentence-jpn sentence)) | |
| 76 | + | ((huffman-string-size trans-huffman) (sentence-trans sentence)) | |
| 77 | + | ((list-size (huffman-string-size trans-huffman)) (sentence-tags sentence)) | |
| 78 | + | (audio-size (sentence-audio sentence))))) | |
| 79 | + | ||
| 80 | + | (define (make-key key) | |
| 81 | + | (apply append | |
| 82 | + | (map | |
| 83 | + | (lambda (c) | |
| 84 | + | (list (quotient c 16) (modulo c 16))) | |
| 85 | + | (bytevector->u8-list (string->utf8 key))))) | |
| 86 | + | ||
| 87 | + | (define (update-trie-pos! trie sentences) | |
| 88 | + | (let* ((vals (trie-vals trie)) | |
| 89 | + | (vals (map (lambda (i) (sentence-position (array-ref sentences i))) vals))) | |
| 90 | + | (trie-vals-set! trie vals)) | |
| 91 | + | (for-each | |
| 92 | + | (match-lambda | |
| 93 | + | ((char . child) | |
| 94 | + | (update-trie-pos! child sentences))) | |
| 95 | + | (trie-transitions trie))) | |
| 96 | + | ||
| 97 | + | (define (serialize-sentence-dictionary sentences) | |
| 98 | + | (define jpn-huffman | |
| 99 | + | (let ((jpn (map sentence-jpn sentences))) | |
| 100 | + | (create-huffman jpn))) | |
| 101 | + | (define jpn-huffman-code (huffman->code jpn-huffman)) | |
| 102 | + | ||
| 103 | + | (define trans-huffman | |
| 104 | + | (let ((trans (map sentence-trans sentences)) | |
| 105 | + | (tags (apply append (map sentence-tags sentences)))) | |
| 106 | + | (create-huffman (append trans tags)))) | |
| 107 | + | (define trans-huffman-code (huffman->code trans-huffman)) | |
| 108 | + | ||
| 109 | + | (define (make-sentence-trie sentences) | |
| 110 | + | (let ((trie (make-empty-trie)) | |
| 111 | + | (tagger (mecab-new-tagger '()))) | |
| 112 | + | (let loop ((sentences sentences) (i 0)) | |
| 113 | + | (if (null? sentences) | |
| 114 | + | (begin | |
| 115 | + | (mecab-destroy tagger) | |
| 116 | + | (compress-trie trie)) | |
| 117 | + | (begin | |
| 118 | + | (for-each | |
| 119 | + | (lambda (key) | |
| 120 | + | (add-to-trie! trie (make-key key) i)) | |
| 121 | + | (mecab-words tagger (sentence-jpn (car sentences)))) | |
| 122 | + | (loop (cdr sentences) (+ i 1))))))) | |
| 123 | + | ||
| 124 | + | (define (trie-node-size trie) | |
| 125 | + | (apply + 1 (map trie-node-size (map cdr (trie-transitions trie))))) | |
| 126 | + | ||
| 127 | + | (let* ((header (string->utf8 "NANI_SENTENCE001")) | |
| 128 | + | (header-size (bytevector-length header)) | |
| 129 | + | (pointers (make-bytevector 4 0)) | |
| 130 | + | (jpn-huffman-bv (serialize-huffman jpn-huffman)) | |
| 131 | + | (jpn-huffman-size (bytevector-length jpn-huffman-bv)) | |
| 132 | + | (trans-huffman-bv (serialize-huffman trans-huffman)) | |
| 133 | + | (trans-huffman-size (bytevector-length trans-huffman-bv)) | |
| 134 | + | (serialize-trie (serialize-trie serialize-int int-size)) | |
| 135 | + | (trie-size (trie-size int-size)) | |
| 136 | + | (sentence-trie (make-sentence-trie sentences)) | |
| 137 | + | (sentence-trie-size (trie-size sentence-trie)) | |
| 138 | + | (sentences-size | |
| 139 | + | ((list-size (sentence-size jpn-huffman-code trans-huffman-code) | |
| 140 | + | #:size? #f) | |
| 141 | + | sentences)) | |
| 142 | + | (huffman-size (+ jpn-huffman-size trans-huffman-size)) | |
| 143 | + | (pos-trie (+ header-size 4 jpn-huffman-size trans-huffman-size | |
| 144 | + | sentences-size 4)) | |
| 145 | + | (bv (make-bytevector (+ header-size 4 jpn-huffman-size | |
| 146 | + | trans-huffman-size sentences-size 4 | |
| 147 | + | sentence-trie-size)))) | |
| 148 | + | (format #t "Number of nodes in trie: ~a~%" (trie-node-size sentence-trie)) | |
| 149 | + | ((serialize-list (serialize-sentence jpn-huffman-code trans-huffman-code) | |
| 150 | + | #:size? #f) | |
| 151 | + | sentences (+ header-size 4 huffman-size) bv) | |
| 152 | + | ;; Serializing sentences also updated sentence-pos for each of them | |
| 153 | + | (let ((sentences (list->array 1 sentences))) | |
| 154 | + | (update-trie-pos! sentence-trie sentences)) | |
| 155 | + | ;; number of entries | |
| 156 | + | (serialize-int (length sentences) (+ header-size 4 huffman-size sentences-size) | |
| 157 | + | bv) | |
| 158 | + | (let* ((sentences (list->array 1 sentences)) | |
| 159 | + | (pos pos-trie) | |
| 160 | + | (pos (serialize-trie sentence-trie pos bv))) | |
| 161 | + | ;; point to the trie structure | |
| 162 | + | (bytevector-u32-set! | |
| 163 | + | pointers 0 | |
| 164 | + | (+ header-size 4 huffman-size sentences-size (int-size 0)) | |
| 165 | + | (endianness big)) | |
| 166 | + | ;; copy to result bytevector | |
| 167 | + | (bytevector-copy! header 0 bv 0 header-size) | |
| 168 | + | (bytevector-copy! pointers 0 bv header-size 4) | |
| 169 | + | (bytevector-copy! jpn-huffman-bv 0 bv (+ header-size 4) jpn-huffman-size) | |
| 170 | + | (bytevector-copy! trans-huffman-bv 0 bv (+ header-size 4 jpn-huffman-size) | |
| 171 | + | trans-huffman-size) | |
| 172 | + | ;; give some feedback on the size of the file's structures | |
| 173 | + | (format #t "huffmans are ~a bytes long~%" huffman-size) | |
| 174 | + | (format #t "sentences are ~a bytes long~%" sentences-size) | |
| 175 | + | (format #t "trie is ~a bytes long~%" sentence-trie-size) | |
| 176 | + | bv))) | |
| 177 | + | ||
| 178 | + | (define (sentence-dictionary-entry-count file) | |
| 179 | + | (call-with-input-file file | |
| 180 | + | (lambda (port) | |
| 181 | + | (let* ((header (utf8->string (get-bytevector-n port 16))) | |
| 182 | + | (pointers (get-bytevector-n port 4)) | |
| 183 | + | (end-pos (bytevector-u32-ref pointers 0 (endianness big)))) | |
| 184 | + | (seek port (- end-pos 4) SEEK_SET) | |
| 185 | + | (bytevector-u32-ref (get-bytevector-n port 4) 0 (endianness big)))))) |
po/nani.pot
| 8 | 8 | msgstr "" | |
| 9 | 9 | "Project-Id-Version: PACKAGE VERSION\n" | |
| 10 | 10 | "Report-Msgid-Bugs-To: \n" | |
| 11 | - | "POT-Creation-Date: 2021-07-29 21:50+0200\n" | |
| 11 | + | "POT-Creation-Date: 2022-07-09 21:44+0200\n" | |
| 12 | 12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" | |
| 13 | 13 | "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" | |
| 14 | 14 | "Language-Team: LANGUAGE <LL@li.org>\n" | |
… | |||
| 102 | 102 | msgid "JMdict" | |
| 103 | 103 | msgstr "" | |
| 104 | 104 | ||
| 105 | - | #: tools/list.scm:71 | |
| 105 | + | #: tools/list.scm:85 | |
| 106 | 106 | msgid "" | |
| 107 | 107 | "Japanese/Dutch dictionary from the Electronic Dictionary Research and " | |
| 108 | 108 | "Development Group." | |
| 109 | 109 | msgstr "" | |
| 110 | 110 | ||
| 111 | - | #: tools/list.scm:70 | |
| 111 | + | #: tools/list.scm:84 | |
| 112 | 112 | msgid "" | |
| 113 | 113 | "Japanese/English dictionary from the Electronic Dictionary Research and " | |
| 114 | 114 | "Development Group." | |
| 115 | 115 | msgstr "" | |
| 116 | 116 | ||
| 117 | 117 | #: tools/list.scm:72 | |
| 118 | + | msgid "Japanese/French aligned sentences from the Tatoeba project." | |
| 119 | + | msgstr "" | |
| 120 | + | ||
| 121 | + | #: tools/list.scm:86 | |
| 118 | 122 | msgid "" | |
| 119 | 123 | "Japanese/French dictionary from the Electronic Dictionary Research and " | |
| 120 | 124 | "Development Group." | |
| 121 | 125 | msgstr "" | |
| 122 | 126 | ||
| 123 | - | #: tools/list.scm:61 | |
| 127 | + | #: tools/list.scm:62 | |
| 124 | 128 | msgid "Japanese/French dictionary from the Jibiki project." | |
| 125 | 129 | msgstr "" | |
| 126 | 130 | ||
| 127 | - | #: tools/list.scm:45 | |
| 131 | + | #: tools/list.scm:46 | |
| 128 | 132 | msgid "Japanese/German dictionary from Wadoku." | |
| 129 | 133 | msgstr "" | |
| 130 | 134 | ||
| 131 | - | #: tools/list.scm:73 | |
| 135 | + | #: tools/list.scm:87 | |
| 132 | 136 | msgid "" | |
| 133 | 137 | "Japanese/German dictionary from the Electronic Dictionary Research and " | |
| 134 | 138 | "Development Group." | |
| 135 | 139 | msgstr "" | |
| 136 | 140 | ||
| 137 | - | #: tools/list.scm:74 | |
| 141 | + | #: tools/list.scm:88 | |
| 138 | 142 | msgid "" | |
| 139 | 143 | "Japanese/Hungarian dictionary from the Electronic Dictionary Research and " | |
| 140 | 144 | "Development Group." | |
| 141 | 145 | msgstr "" | |
| 142 | 146 | ||
| 143 | - | #: tools/list.scm:75 | |
| 147 | + | #: tools/list.scm:73 | |
| 148 | + | msgid "Japanese/Russian aligned sentences from the Tatoeba project." | |
| 149 | + | msgstr "" | |
| 150 | + | ||
| 151 | + | #: tools/list.scm:89 | |
| 144 | 152 | msgid "" | |
| 145 | 153 | "Japanese/Russian dictionary from the Electronic Dictionary Research and " | |
| 146 | 154 | "Development Group." | |
| 147 | 155 | msgstr "" | |
| 148 | 156 | ||
| 149 | - | #: tools/list.scm:76 | |
| 157 | + | #: tools/list.scm:90 | |
| 150 | 158 | msgid "" | |
| 151 | 159 | "Japanese/Slovenian dictionary from the Electronic Dictionary Research and " | |
| 152 | 160 | "Development Group." | |
| 153 | 161 | msgstr "" | |
| 154 | 162 | ||
| 155 | - | #: tools/list.scm:77 | |
| 163 | + | #: tools/list.scm:74 | |
| 164 | + | msgid "Japanese/Spanish aligned sentences from the Tatoeba project." | |
| 165 | + | msgstr "" | |
| 166 | + | ||
| 167 | + | #: tools/list.scm:91 | |
| 156 | 168 | msgid "" | |
| 157 | 169 | "Japanese/Spanish dictionary from the Electronic Dictionary Research and " | |
| 158 | 170 | "Development Group." | |
| 159 | 171 | msgstr "" | |
| 160 | 172 | ||
| 161 | - | #: tools/list.scm:78 | |
| 173 | + | #: tools/list.scm:92 | |
| 162 | 174 | msgid "" | |
| 163 | 175 | "Japanese/Swedish dictionary from the Electronic Dictionary Research and " | |
| 164 | 176 | "Development Group." | |
| 165 | 177 | msgstr "" | |
| 166 | 178 | ||
| 167 | - | #: tools/list.scm:87 | |
| 179 | + | #: tools/list.scm:75 | |
| 180 | + | msgid "Japanese/Ukrainian aligned sentences from the Tatoeba project." | |
| 181 | + | msgstr "" | |
| 182 | + | ||
| 183 | + | #: tools/list.scm:71 | |
| 184 | + | msgid "Japanese/english aligned sentences from the Tatoeba project." | |
| 185 | + | msgstr "" | |
| 186 | + | ||
| 187 | + | #: tools/list.scm:101 | |
| 168 | 188 | msgid "Kanji dictionary with English meanings." | |
| 169 | 189 | msgstr "" | |
| 170 | 190 | ||
| 171 | - | #: tools/list.scm:89 | |
| 191 | + | #: tools/list.scm:103 | |
| 172 | 192 | msgid "Kanji dictionary with French meanings." | |
| 173 | 193 | msgstr "" | |
| 174 | 194 | ||
| 175 | - | #: tools/list.scm:90 | |
| 195 | + | #: tools/list.scm:104 | |
| 176 | 196 | msgid "Kanji dictionary with Portuguese meanings." | |
| 177 | 197 | msgstr "" | |
| 178 | 198 | ||
| 179 | - | #: tools/list.scm:88 | |
| 199 | + | #: tools/list.scm:102 | |
| 180 | 200 | msgid "Kanji dictionary with Spanish meanings." | |
| 181 | 201 | msgstr "" | |
| 182 | 202 | ||
| 183 | - | #: tools/list.scm:39 | |
| 203 | + | #: tools/list.scm:40 | |
| 184 | 204 | msgid "Kanji writing visual help by the Kanjivg project." | |
| 185 | 205 | msgstr "" | |
| 186 | 206 | ||
… | |||
| 233 | 253 | msgid "Phone: " | |
| 234 | 254 | msgstr "" | |
| 235 | 255 | ||
| 236 | - | #: tools/list.scm:53 | |
| 256 | + | #: tools/list.scm:54 | |
| 237 | 257 | msgid "Pitch accent dictionary from Wadoku." | |
| 238 | 258 | msgstr "" | |
| 239 | 259 | ||
| 240 | - | #: tools/list.scm:32 | |
| 260 | + | #: tools/list.scm:33 | |
| 241 | 261 | msgid "" | |
| 242 | 262 | "Radical to Kanji dictionary from the Electronic Dictionary Research and " | |
| 243 | 263 | "Development Group." | |
… | |||
| 281 | 301 | msgid "Tap on the search button and you'll see the results. Easy, right?" | |
| 282 | 302 | msgstr "" | |
| 283 | 303 | ||
| 304 | + | #: tools/list.scm:77 | |
| 305 | + | msgid "" | |
| 306 | + | "Tatoeba is a collection of sentences and translations. This\n" | |
| 307 | + | " dictionary contains pairs of sentences that are direct translations " | |
| 308 | + | "of\n" | |
| 309 | + | " one another, which allows you to see example sentences in search\n" | |
| 310 | + | " results." | |
| 311 | + | msgstr "" | |
| 312 | + | ||
| 284 | 313 | #: pages/e404.scm:27 | |
| 285 | 314 | msgid "That's a 404 :/" | |
| 286 | 315 | msgstr "" | |
… | |||
| 335 | 364 | "In the following sections we will see how to use them." | |
| 336 | 365 | msgstr "" | |
| 337 | 366 | ||
| 338 | - | #: tools/list.scm:55 | |
| 367 | + | #: tools/list.scm:56 | |
| 339 | 368 | msgid "" | |
| 340 | 369 | "This dictionary allows you to augment search results on the main view\n" | |
| 341 | 370 | " with pitch accent (pronunciation) information. Japanese is not " | |
… | |||
| 345 | 374 | " words better, with a standard Japanese pitch accent." | |
| 346 | 375 | msgstr "" | |
| 347 | 376 | ||
| 348 | - | #: tools/list.scm:63 | |
| 377 | + | #: tools/list.scm:64 | |
| 349 | 378 | msgid "" | |
| 350 | 379 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 351 | 380 | "\tFailing to download one of these dictionaries will make the app unusable\n" | |
… | |||
| 353 | 382 | "\tby kanji, reading (kana) and by French translation." | |
| 354 | 383 | msgstr "" | |
| 355 | 384 | ||
| 356 | - | #: tools/list.scm:47 | |
| 385 | + | #: tools/list.scm:48 | |
| 357 | 386 | msgid "" | |
| 358 | 387 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 359 | 388 | " Failing to download one of these dictionaries will make the app " | |
… | |||
| 363 | 392 | " by kanji, reading (kana) and by German translation." | |
| 364 | 393 | msgstr "" | |
| 365 | 394 | ||
| 366 | - | #: tools/list.scm:80 | |
| 395 | + | #: tools/list.scm:94 | |
| 367 | 396 | msgid "" | |
| 368 | 397 | "This dictionary allows you to do searches on the main view of this app.\n" | |
| 369 | 398 | " Failing to download one of these dictionaries will make the app " | |
… | |||
| 373 | 402 | " kanji, reading (kana) and by meaning in the languages you selected." | |
| 374 | 403 | msgstr "" | |
| 375 | 404 | ||
| 376 | - | #: tools/list.scm:34 | |
| 405 | + | #: tools/list.scm:35 | |
| 377 | 406 | msgid "" | |
| 378 | 407 | "This dictionary allows you to enter kanji by selecting some of its\n" | |
| 379 | 408 | " components. Tap the water component button on the bottom of the screen " | |
… | |||
| 381 | 410 | " access the kanji selection by component view" | |
| 382 | 411 | msgstr "" | |
| 383 | 412 | ||
| 384 | - | #: tools/list.scm:92 | |
| 413 | + | #: tools/list.scm:106 | |
| 385 | 414 | msgid "" | |
| 386 | 415 | "This dictionary allows you to search for kanji and view kanji information\n" | |
| 387 | 416 | " such as number of strokes, pronunciations and meanings." | |
| 388 | 417 | msgstr "" | |
| 389 | 418 | ||
| 390 | - | #: tools/list.scm:41 | |
| 419 | + | #: tools/list.scm:42 | |
| 391 | 420 | msgid "" | |
| 392 | 421 | "This dictionary allows you to see how a kanji is written, what it is\n" | |
| 393 | 422 | "composed of, and the order in which strokes are written." | |
tatoeba.mk unknown status 1
| 1 | + | TATOEBA_LANGS=eng fra rus spa ukr | |
| 2 | + | DICOS+=$(addprefix dicos/tatoeba_, $(addsuffix .nani, $(TATOEBA_LANGS))) | |
| 3 | + | TATOEBA_DOWNLOADS+=$(addprefix dictionaries/tatoeba_, $(addsuffix .csv, sentences_detailed sentences_base tags user_languages)) | |
| 4 | + | DOWNLOADS+=$(TATOEBA_DOWNLOADS) | |
| 5 | + | ||
| 6 | + | .PRECIOUS: dictionaries/tatoeba%.csv | |
| 7 | + | ||
| 8 | + | dictionaries/tatoeba%.csv: | |
| 9 | + | wget https://downloads.tatoeba.org/exports/$$(basename $@ .csv | cut -c9-).tar.bz2 -O $@.tar.bz2 --continue | |
| 10 | + | tar xf $@.tar.bz2 -C dictionaries | |
| 11 | + | mv dictionaries/$$(basename $@ | cut -c9-) $@ | |
| 12 | + | # $* is the pattern stem, i.e. the language code (eng, fra, ...) of the target. | |
| 13 | + | dicos/tatoeba_%.nani: $(TATOEBA_DOWNLOADS) $(TATOEBA_MODULES) $(RADK_MODULES) | |
| 14 | + | guile -L modules tools/tatoeba.scm $* $@ |
tools/list.scm
| 22 | 22 | (use-modules (nani kanji kanjivg)) | |
| 23 | 23 | (use-modules (nani result result)) | |
| 24 | 24 | (use-modules (nani pitch pitch)) | |
| 25 | + | (use-modules (nani sentence sentence)) | |
| 25 | 26 | (use-modules (gcrypt hash)) | |
| 26 | 27 | (use-modules (ice-9 match)) | |
| 27 | 28 | (use-modules (ice-9 format)) | |
… | |||
| 65 | 66 | as you can't search for anything. This dictionary can be searched for | |
| 66 | 67 | by kanji, reading (kana) and by French translation.")) | |
| 67 | 68 | ||
| 69 | + | (define (tatoeba-synopsis lang) | |
| 70 | + | (match lang | |
| 71 | + | ("eng" `(_ "Japanese/english aligned sentences from the Tatoeba project.")) | |
| 72 | + | ("fra" `(_ "Japanese/French aligned sentences from the Tatoeba project.")) | |
| 73 | + | ("rus" `(_ "Japanese/Russian aligned sentences from the Tatoeba project.")) | |
| 74 | + | ("spa" `(_ "Japanese/Spanish aligned sentences from the Tatoeba project.")) | |
| 75 | + | ("ukr" `(_ "Japanese/Ukrainian aligned sentences from the Tatoeba project.")))) | |
| 76 | + | (define (tatoeba-description lang) | |
| 77 | + | `(_ "Tatoeba is a collection of sentences and translations. This | |
| 78 | + | dictionary contains pairs of sentences that are direct translations of | |
| 79 | + | one another, which allows you to see example sentences in search | |
| 80 | + | results.")) | |
| 81 | + | ||
| 68 | 82 | (define (jmdict-synopsis lang) | |
| 69 | 83 | (match lang | |
| 70 | 84 | ("e" `(_ "Japanese/English dictionary from the Electronic Dictionary Research and Development Group.")) | |
… | |||
| 123 | 137 | (let ((dico-lang (substring dico 7))) | |
| 124 | 138 | (if long? | |
| 125 | 139 | (jmdict-description dico-lang) | |
| 126 | - | (jmdict-synopsis dico-lang)))))) | |
| 140 | + | (jmdict-synopsis dico-lang)))) | |
| 141 | + | ((equal? (dico-type dico) "tatoeba") | |
| 142 | + | (let ((dico-lang (substring dico 8))) | |
| 143 | + | (if long? | |
| 144 | + | (tatoeba-description dico-lang) | |
| 145 | + | (tatoeba-synopsis dico-lang)))))) | |
| 127 | 146 | (translated (translate english lang))) | |
| 128 | 147 | (if (and (equal? english translated) (not (equal? lang "en"))) | |
| 129 | 148 | #f | |
… | |||
| 149 | 168 | "kanjidic") | |
| 150 | 169 | ((and (> (string-length file) 6) (equal? (substring file 0 6) "JMdict")) | |
| 151 | 170 | "jmdict") | |
| 171 | + | ((and (> (string-length file) 7) (equal? (substring file 0 7) "tatoeba")) | |
| 172 | + | "tatoeba") | |
| 152 | 173 | ((equal? file "jibiki_fre") "jibiki") | |
| 153 | 174 | ((equal? file "wadoku_ger") "wadoku") | |
| 154 | 175 | ((equal? file "wadoku_pitch") "wadoku_pitch"))) | |
… | |||
| 163 | 184 | (kanjidic-entry-count file)) | |
| 164 | 185 | ((member (dico-type (dico-name file)) '("jmdict" "wadoku" "jibiki")) | |
| 165 | 186 | (dictionary-entry-count file)) | |
| 187 | + | ((equal? (dico-type (dico-name file)) "tatoeba") | |
| 188 | + | (sentence-dictionary-entry-count file)) | |
| 166 | 189 | ((equal? (dico-type (dico-name file)) "wadoku_pitch") | |
| 167 | 190 | (pitch-entry-count file)))) | |
| 168 | 191 | ||
tools/tatoeba.scm unknown status 1
| 1 | + | ;;; Nani Project website | |
| 2 | + | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> | |
| 3 | + | ;;; | |
| 4 | + | ;;; This file is part of the Nani Project website. | |
| 5 | + | ;;; | |
| 6 | + | ;;; The Nani Project website is free software; you can redistribute it and/or modify it | |
| 7 | + | ;;; under the terms of the GNU Affero General Public License as published by | |
| 8 | + | ;;; the Free Software Foundation; either version 3 of the License, or (at | |
| 9 | + | ;;; your option) any later version. | |
| 10 | + | ;;; | |
| 11 | + | ;;; The Nani Project website is distributed in the hope that it will be useful, but | |
| 12 | + | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 | + | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 14 | + | ;;; GNU Affero General Public License for more details. | |
| 15 | + | ;;; | |
| 16 | + | ;;; You should have received a copy of the GNU Affero General Public License | |
| 17 | + | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. | |
| 18 | + | ||
| 19 | + | (use-modules (srfi srfi-9)) | |
| 20 | + | (use-modules (srfi srfi-11)) | |
| 21 | + | (use-modules (ice-9 match)) | |
| 22 | + | (use-modules (ice-9 binary-ports)) | |
| 23 | + | (use-modules (ice-9 textual-ports)) | |
| 24 | + | (use-modules (nani sentence sentence)) | |
| 25 | + | ||
| 26 | + | (define (tatoeba-file file) | |
| 27 | + | (string-append (dirname (current-filename)) "/../dictionaries/tatoeba_" file)) | |
| 28 | + | ||
| 29 | + | (define (get-sentences lang native-users) | |
| 30 | + | (call-with-input-file (tatoeba-file "sentences_detailed.csv") | |
| 31 | + | (lambda (port) | |
| 32 | + | (let loop ((jpn '()) (trans '()) (line (get-line port))) | |
| 33 | + | (if (eof-object? line) | |
| 34 | + | (values (reverse jpn) (reverse trans)) | |
| 35 | + | (match (string-split line #\tab) | |
| 36 | + | ((id slang text user _ _) | |
| 37 | + | (cond | |
| 38 | + | ((equal? lang slang) | |
| 39 | + | (loop jpn (cons (cons (string->number id) text) trans) | |
| 40 | + | (get-line port))) | |
| 41 | + | ((and (equal? slang "jpn") (member user native-users)) | |
| 42 | + | (loop (cons (cons (string->number id) text) jpn) trans (get-line port))) | |
| 43 | + | (else | |
| 44 | + | (loop jpn trans (get-line port))))))))))) | |
| 45 | + | ||
| 46 | + | ;; Return the names of the users listed in user_languages.csv with language "jpn" at level "5". | |
| 47 | + | (define (get-native-jpn-users) | |
| 48 | + | (call-with-input-file (tatoeba-file "user_languages.csv") | |
| 49 | + | (lambda (port) | |
| 50 | + | (let loop ((jpn '()) (line (get-line port))) | |
| 51 | + | (if (eof-object? line) | |
| 52 | + | jpn | |
| 53 | + | (match (string-split line #\tab) | |
| 54 | + | ((ulang level user _) | |
| 55 | + | ;; Consume the comment if it spans multiple lines.  Also stop at end of file: | |
| 56 | + | ;; string-suffix? would raise a type error on the eof object. | |
| 57 | + | (let loop2 ((line line)) | |
| 58 | + | (when (and (string? line) (string-suffix? "\\" line)) | |
| 59 | + | (loop2 (get-line port)))) | |
| 60 | + | (loop (if (and (equal? ulang "jpn") (equal? level "5")) (cons user jpn) jpn) | |
| 61 | + | (get-line port))))))))) | |
| 62 | + | ||
| 63 | + | (define (get-translations jpn trans) | |
| 64 | + | ;; JPN and TRANS are sorted vectors of sentence ids.  Returns the list of | |
| 65 | + | ;; (jpn-id . trans-id) pairs linked by a line of sentences_base.csv. | |
| 66 | + | (define jpn-len (vector-length jpn)) | |
| 67 | + | (define trans-len (vector-length trans)) | |
| 68 | + | ;; Binary search: is ELEM among the first LEN cells of the sorted vector VECT? | |
| 69 | + | (define (member-vect elem vect len) | |
| 70 | + | (let loop ((min 0) (max (- len 1))) | |
| 71 | + | (if (> min max) | |
| 72 | + | #f | |
| 73 | + | (let* ((mid-pos (floor (+ min (/ (- max min) 2)))) | |
| 74 | + | (mid-elem (vector-ref vect mid-pos))) | |
| 75 | + | (cond | |
| 76 | + | ((equal? mid-elem elem) #t) | |
| 77 | + | ((> mid-elem elem) (loop min (- mid-pos 1))) | |
| 78 | + | ((< mid-elem elem) (loop (+ mid-pos 1) max))))))) | |
| 79 | + | (define (jpn? id) (and id (member-vect id jpn jpn-len))) | |
| 80 | + | (define (trans? id) (and id (member-vect id trans trans-len))) | |
| 81 | + | ||
| 82 | + | (call-with-input-file (tatoeba-file "sentences_base.csv") | |
| 83 | + | (lambda (port) | |
| 84 | + | (let loop ((translations '()) (line (get-line port))) | |
| 85 | + | ;; An empty id vector can never produce a pair, so stop right away.  The | |
| 86 | + | ;; previous test, (null? trans), is always false for a vector. | |
| 87 | + | (if (or (eof-object? line) (zero? jpn-len) (zero? trans-len)) | |
| 88 | + | translations | |
| 89 | + | (match (string-split line #\tab) | |
| 90 | + | ((id translation) | |
| 91 | + | ;; Parse each column once; string->number gives #f for non-numeric cells. | |
| 92 | + | (let ((id (string->number id)) | |
| 93 | + | (base (string->number translation))) | |
| 94 | + | (loop (cond | |
| 95 | + | ((and (trans? id) (jpn? base)) (cons (cons base id) translations)) | |
| 96 | + | ((and (jpn? id) (trans? base)) (cons (cons id base) translations)) | |
| 97 | + | (else translations)) | |
| 98 | + | (get-line port)))))))))) | |
| 99 | + | ||
| 100 | + | (define (add-tags translations trans jpn) | |
| 101 | + | ;; TRANSLATIONS is a list of (jpn-id . trans-id) pairs; TRANS and JPN are alists | |
| 102 | + | ;; from sentence id to text.  Returns one sentence per pair, without audio. | |
| 103 | + | (define tags (make-hash-table)) ; sentence id -> list of tags | |
| 104 | + | (call-with-input-file (tatoeba-file "tags.csv") | |
| 105 | + | (lambda (port) | |
| 106 | + | ;; Walk the whole file: the loop used to return after its first line. | |
| 107 | + | (let loop ((line (get-line port))) | |
| 108 | + | (unless (eof-object? line) | |
| 109 | + | (match (string-split line #\tab) | |
| 110 | + | ((id tag) (hash-set! tags (string->number id) (cons tag (hash-ref tags (string->number id) '())))) | |
| 111 | + | (_ #f)) ; not an id/tag pair: skip the line | |
| 112 | + | (loop (get-line port)))))) | |
| 113 | + | (map (match-lambda | |
| 114 | + | ((jpn-id . trans-id) | |
| 115 | + | (make-sentence (assoc-ref jpn jpn-id) (assoc-ref trans trans-id) | |
| 116 | + | (hash-ref tags jpn-id '()) #f))) | |
| 117 | + | translations)) | |
| 118 | + | ||
| 119 | + | (define (get-tatoeba-sentences lang) | |
| 120 | + | (define native-users (get-native-jpn-users)) | |
| 121 | + | (let-values (((jpn trans) (get-sentences lang native-users))) | |
| 122 | + | (format #t "jpn: ~a sentences~%" (length jpn)) | |
| 123 | + | (format #t "~a: ~a sentences~%" lang (length trans)) | |
| 124 | + | (let ((translations | |
| 125 | + | (get-translations | |
| 126 | + | (list->vector (sort (map car jpn) <)) | |
| 127 | + | (list->vector (sort (map car trans) <))))) | |
| 128 | + | (format #t "~a pairs~%" (length translations)) | |
| 129 | + | (add-tags translations trans jpn)))) | |
| 130 | + | ||
| 131 | + | (match (command-line) | |
| 132 | + | ((_ lang output) | |
| 133 | + | (let ((sentences (get-tatoeba-sentences lang))) | |
| 134 | + | (format #t "Number of entries in ~a: ~a~%" output (length sentences)) | |
| 135 | + | (call-with-output-file output | |
| 136 | + | (lambda (port) | |
| 137 | + | (put-bytevector port | |
| 138 | + | (serialize-sentence-dictionary sentences))))))) |