tatoeba.scm
| 1 | ;;; Nani Project website |
| 2 | ;;; Copyright © 2022 Julien Lepiller <julien@lepiller.eu> |
| 3 | ;;; |
| 4 | ;;; This file is part of the Nani Project website. |
| 5 | ;;; |
| 6 | ;;; The Nani Project website is free software; you can redistribute it and/or modify it |
| 7 | ;;; under the terms of the GNU Affero General Public License as published by |
| 8 | ;;; the Free Software Foundation; either version 3 of the License, or (at |
| 9 | ;;; your option) any later version. |
| 10 | ;;; |
| 11 | ;;; The Nani Project website is distributed in the hope that it will be useful, but |
| 12 | ;;; WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | ;;; GNU Affero General Public License for more details. |
| 15 | ;;; |
| 16 | ;;; You should have received a copy of the GNU Affero General Public License |
| 17 | ;;; along with the Nani Project website. If not, see <http://www.gnu.org/licenses/>. |
| 18 | |
| 19 | (use-modules (srfi srfi-9)) |
| 20 | (use-modules (srfi srfi-11)) |
| 21 | (use-modules (ice-9 match)) |
| 22 | (use-modules (ice-9 binary-ports)) |
| 23 | (use-modules (ice-9 textual-ports)) |
| 24 | (use-modules (nani sentence sentence)) |
| 25 | |
| 26 | (define (tatoeba-file file) |
| 27 | (string-append (dirname (current-filename)) "/../dictionaries/tatoeba_" file)) |
| 28 | |
| 29 | (define (get-sentences lang native-users) |
| 30 | (call-with-input-file (tatoeba-file "sentences_detailed.csv") |
| 31 | (lambda (port) |
| 32 | (let loop ((jpn '()) (trans '()) (line (get-line port))) |
| 33 | (if (eof-object? line) |
| 34 | (values (reverse jpn) (reverse trans)) |
| 35 | (match (string-split line #\tab) |
| 36 | ((id slang text user _ _) |
| 37 | (cond |
| 38 | ((equal? lang slang) |
| 39 | (loop jpn (cons (cons (string->number id) text) trans) |
| 40 | (get-line port))) |
| 41 | ((and (equal? slang "jpn") (member user native-users)) |
| 42 | (loop (cons (cons (string->number id) text) jpn) trans (get-line port))) |
| 43 | (else |
| 44 | (loop jpn trans (get-line port))))))))))) |
| 45 | |
| 46 | (define (get-native-jpn-users) |
| 47 | (call-with-input-file (tatoeba-file "user_languages.csv") |
| 48 | (lambda (port) |
| 49 | (let loop ((jpn '()) (line (get-line port))) |
| 50 | (if (eof-object? line) |
| 51 | jpn |
| 52 | (match (string-split line #\tab) |
| 53 | ((ulang level user _) |
| 54 | ;; consume the comment if it's on multiple lines |
| 55 | (let loop2 ((line line)) |
| 56 | (when (string-suffix? "\\" line) |
| 57 | (loop2 (get-line port)))) |
| 58 | ;; Add native japanese user |
| 59 | (if (and (equal? ulang "jpn") (equal? level "5")) |
| 60 | (loop (cons user jpn) (get-line port)) |
| 61 | (loop jpn (get-line port)))))))))) |
| 62 | |
| 63 | (define (get-translations jpn trans) |
| 64 | (define jpn-len (vector-length jpn)) |
| 65 | (define trans-len (vector-length trans)) |
| 66 | (define (member-vect elem vect len) |
| 67 | (let loop ((min 0) (max (- len 1))) |
| 68 | (if (> min max) |
| 69 | #f |
| 70 | (let* ((mid-pos (floor (+ min (/ (- max min) 2)))) |
| 71 | (mid-elem (vector-ref vect mid-pos))) |
| 72 | (cond |
| 73 | ((equal? mid-elem elem) #t) |
| 74 | ((> mid-elem elem) (loop min (- mid-pos 1))) |
| 75 | ((< mid-elem elem) (loop (+ mid-pos 1) max))))))) |
| 76 | |
| 77 | (call-with-input-file (tatoeba-file "sentences_base.csv") |
| 78 | (lambda (port) |
| 79 | (let loop ((translations '()) (line (get-line port))) |
| 80 | (if (or (eof-object? line) (null? trans)) |
| 81 | translations |
| 82 | (match (string-split line #\tab) |
| 83 | ((id translation) |
| 84 | (cond |
| 85 | ((and (string->number id) |
| 86 | (member-vect (string->number id) trans trans-len) |
| 87 | (string->number translation) |
| 88 | (member-vect (string->number translation) jpn jpn-len)) |
| 89 | (loop (cons (cons (string->number translation) (string->number id)) translations) |
| 90 | (get-line port))) |
| 91 | ((and (string->number id) |
| 92 | (member-vect (string->number id) jpn jpn-len) |
| 93 | (string->number translation) |
| 94 | (member-vect (string->number translation) trans trans-len)) |
| 95 | (loop (cons (cons (string->number id) (string->number translation)) translations) |
| 96 | (get-line port))) |
| 97 | (else |
| 98 | (loop translations (get-line port))))))))))) |
| 99 | |
| 100 | (define (add-tags translations trans jpn) |
| 101 | (define tags |
| 102 | (call-with-input-file (tatoeba-file "tags.csv") |
| 103 | (lambda (port) |
| 104 | (let loop ((tags '()) (line (get-line port))) |
| 105 | (if (eof-object? line) |
| 106 | tags |
| 107 | (match (string-split line #\tab) |
| 108 | ((id tag) |
| 109 | (assoc-set! tags (string->number id) (cons tag (or (assoc-ref tags (string->number id)) '())))))))))) |
| 110 | (map |
| 111 | (match-lambda |
| 112 | ((jpn-id . trans-id) |
| 113 | (let ((tags (or (assoc-ref tags jpn-id) '())) |
| 114 | (trans (assoc-ref trans trans-id)) |
| 115 | (jpn (assoc-ref jpn jpn-id))) |
| 116 | (make-sentence jpn trans tags #f)))) |
| 117 | translations)) |
| 118 | |
| 119 | (define (get-tatoeba-sentences lang) |
| 120 | (define native-users (get-native-jpn-users)) |
| 121 | (let-values (((jpn trans) (get-sentences lang native-users))) |
| 122 | (format #t "jpn: ~a sentences~%" (length jpn)) |
| 123 | (format #t "~a: ~a sentences~%" lang (length trans)) |
| 124 | (let ((translations |
| 125 | (get-translations |
| 126 | (list->vector (sort (map car jpn) <)) |
| 127 | (list->vector (sort (map car trans) <))))) |
| 128 | (format #t "~a pairs~%" (length translations)) |
| 129 | (add-tags translations trans jpn)))) |
| 130 | |
| 131 | (match (command-line) |
| 132 | ((_ lang output) |
| 133 | (let ((sentences (get-tatoeba-sentences lang))) |
| 134 | (format #t "Number of entries in ~a: ~a~%" output (length sentences)) |
| 135 | (call-with-output-file output |
| 136 | (lambda (port) |
| 137 | (put-bytevector port |
| 138 | (serialize-sentence-dictionary sentences))))))) |
| 139 |