Update data format

Julien Lepiller, Thu Apr 18 20:20:36 +0200 2019

c87b6c4

Update data format

app/src/main/java/eu/lepiller/nani/dictionary/DictionaryFactory.java

1313
1414
    private DictionaryFactory(Context context) {
1515
        dictionaries = new ArrayList<>();
16-
        dictionaries.add(new JMDict("example_jmdict",
17-
                context.getString(R.string.dico_jmdict_example),
18-
                context.getCacheDir(),
19-
                "https://xana.lepiller.eu/nani/dico/example.nani"));
2016
        dictionaries.add(new JMDict("JMdict_e",
2117
                context.getString(R.string.dico_jmdict_e),
2218
                context.getCacheDir(),
23-
                "https://xana.lepiller.eu/nani/dico/JMdict_e.nani"));
19+
                "https://nani.lepiller.eu/dicos/JMdict_e.nani"));
20+
        dictionaries.add(new JMDict("JMdict_dut",
21+
                context.getString(R.string.dico_jmdict_dut),
22+
                context.getCacheDir(),
23+
                "https://nani.lepiller.eu/dicos/JMdict_dut.nani"));
24+
        dictionaries.add(new JMDict("JMdict_fre",
25+
                context.getString(R.string.dico_jmdict_fre),
26+
                context.getCacheDir(),
27+
                "https://nani.lepiller.eu/dicos/JMdict_fre.nani"));
28+
        dictionaries.add(new JMDict("JMdict_ger",
29+
                context.getString(R.string.dico_jmdict_ger),
30+
                context.getCacheDir(),
31+
                "https://nani.lepiller.eu/dicos/JMdict_ger.nani"));
32+
        dictionaries.add(new JMDict("JMdict_hun",
33+
                context.getString(R.string.dico_jmdict_hun),
34+
                context.getCacheDir(),
35+
                "https://nani.lepiller.eu/dicos/JMdict_hun.nani"));
36+
        dictionaries.add(new JMDict("JMdict_rus",
37+
                context.getString(R.string.dico_jmdict_rus),
38+
                context.getCacheDir(),
39+
                "https://nani.lepiller.eu/dicos/JMdict_rus.nani"));
40+
        dictionaries.add(new JMDict("JMdict_slv",
41+
                context.getString(R.string.dico_jmdict_slv),
42+
                context.getCacheDir(),
43+
                "https://nani.lepiller.eu/dicos/JMdict_slv.nani"));
44+
        dictionaries.add(new JMDict("JMdict_spa",
45+
                context.getString(R.string.dico_jmdict_spa),
46+
                context.getCacheDir(),
47+
                "https://nani.lepiller.eu/dicos/JMdict_spa.nani"));
48+
        dictionaries.add(new JMDict("JMdict_swe",
49+
                context.getString(R.string.dico_jmdict_swe),
50+
                context.getCacheDir(),
51+
                "https://nani.lepiller.eu/dicos/JMdict_swe.nani"));
2452
    }
2553
2654
    public static ArrayList<Result> search(Context context, String text) {

app/src/main/java/eu/lepiller/nani/dictionary/JMDict.java

55
66
import java.io.File;
77
import java.io.FileNotFoundException;
8-
import java.io.FileReader;
98
import java.io.IOException;
109
import java.io.RandomAccessFile;
1110
import java.net.MalformedURLException;
1211
import java.net.URL;
13-
import java.nio.charset.StandardCharsets;
1412
import java.util.ArrayList;
1513
import java.util.Arrays;
14+
import java.util.Comparator;
1615
import java.util.HashMap;
16+
import java.util.List;
1717
import java.util.Map;
1818
1919
import eu.lepiller.nani.R;
2020
import eu.lepiller.nani.result.Result;
2121
2222
public class JMDict extends Dictionary {
23-
    final static String TAG = "JMDICT";
23+
    /**
     * Common type for the nodes of a Huffman decoding tree. It declares no
     * members; implementations are told apart with {@code instanceof}.
     */
    interface Huffman {}
25+
26+
    static class HuffmanTree implements Huffman {
27+
        Huffman left, right;
28+
        HuffmanTree(Huffman left, Huffman right) {
29+
            this.left = left;
30+
            this.right = right;
31+
        }
32+
    }
33+
    static class HuffmanValue implements Huffman {
34+
        String character;
35+
        HuffmanValue(String character) {
36+
            this.character = character;
37+
        }
38+
    }
39+
40+
    final private static String TAG = "JMDICT";
2441
    private String mUrl;
42+
    private Huffman kanjiHuffman, readingHuffman, meaningHuffman;
2543
2644
    JMDict(String name, String description, File cacheDir, String url) {
2745
        super(name, description, cacheDir);

6482
    public void remove() {
6583
        File file = getFile();
6684
        file.delete();
85+
        kanjiHuffman = null;
86+
        readingHuffman = null;
87+
        meaningHuffman = null;
6788
    }
6889
6990
    private ArrayList<String> getStringList(RandomAccessFile file) throws IOException {
7091
        ArrayList<String> results = new ArrayList<>();
71-
        int number = file.readInt();
92+
        int number = file.readShort();
93+
        for(int i=0; i<number; i++) {
94+
            results.add(file.readUTF());
95+
        }
96+
        return results;
97+
    }
98+
99+
    private String getHuffmanString(RandomAccessFile file, Huffman huffman) throws IOException {
100+
        StringBuilder b = new StringBuilder();
101+
        ArrayList<Boolean> bits = new ArrayList<>();
102+
        String c = null;
103+
        Huffman h = huffman;
104+
        while(c == null || !c.isEmpty()) {
105+
            if(h instanceof HuffmanValue) {
106+
                c = ((HuffmanValue) h).character;
107+
                Log.d(TAG, "Huffman read: " + c);
108+
                b.append(c);
109+
                h = huffman;
110+
            } else if(h instanceof HuffmanTree) {
111+
                if(bits.isEmpty()) {
112+
                    byte by = file.readByte();
113+
                    Log.d(TAG, "Read byte for huffman: " + by);
114+
                    short mod = (short)256;
115+
                    while(mod != 1) {
116+
                        mod /= 2;
117+
                        bits.add((by / mod) > 0);
118+
                        by = (byte)(by % mod);
119+
                    }
120+
                    Log.d(TAG, "Read byte for huffman: " + bits);
121+
                }
122+
123+
                Boolean bo = bits.get(0);
124+
                bits.remove(0);
125+
                h = bo? ((HuffmanTree) h).right: ((HuffmanTree) h).left;
126+
            }
127+
        }
128+
129+
        return b.toString();
130+
    }
131+
132+
    private void logHuffman(Huffman h, ArrayList<Boolean> addr) {
133+
        if (h instanceof HuffmanValue) {
134+
            Log.d(TAG, "HUFF: " + ((HuffmanValue) h).character + " -> " + addr.toString());
135+
        } else if(h instanceof HuffmanTree) {
136+
            ArrayList<Boolean> addr_l = new ArrayList<>(addr);
137+
            addr_l.add(false);
138+
            ArrayList<Boolean> addr_r = new ArrayList<>(addr);
139+
            addr_r.add(true);
140+
            logHuffman(((HuffmanTree) h).left, addr_l);
141+
            logHuffman(((HuffmanTree) h).right, addr_r);
142+
        }
143+
    }
144+
145+
    private ArrayList<String> getHuffmanStringList(RandomAccessFile file, Huffman huffman) throws IOException {
146+
        ArrayList<String> results = new ArrayList<>();
147+
        int number = file.readShort();
148+
        Log.d(TAG, "huffmanStrings: " + number);
72149
        for(int i=0; i<number; i++) {
73-
            int l = file.readInt();
74-
            byte[] str = new byte[l];
75-
            file.read(str);
76-
            results.add(new String(str, "UTF-8"));
150+
            results.add(getHuffmanString(file, huffman));
77151
        }
78152
        return results;
79153
    }
80154
81-
    private String getString(RandomAccessFile file) throws IOException {
82-
        int l = file.readInt();
83-
        byte[] str = new byte[l];
84-
        file.read(str);
85-
        return new String(str, "UTF-8");
155+
    private ArrayList<Integer> getIntList(RandomAccessFile file) throws IOException {
156+
        ArrayList<Integer> results = new ArrayList<>();
157+
        int number = file.readShort();
158+
        for(int i=0; i<number; i++) {
159+
            results.add(Integer.valueOf(file.readByte()));
160+
        }
161+
        return results;
86162
    }
87163
88164
    private Result getValue(RandomAccessFile file, long pos) throws IOException {
89165
        file.seek(pos);
90-
        ArrayList<String> kanjis = getStringList(file);
166+
        Log.d(TAG, "Getting value");
167+
        ArrayList<String> kanjis = getHuffmanStringList(file, kanjiHuffman);
91168
169+
        Log.d(TAG, "Getting readings");
92170
        ArrayList<Result.Reading> readings = new ArrayList<>();
93-
        int reading_number = file.readInt();
171+
        int reading_number = file.readShort();
172+
        Log.d(TAG, reading_number + " readings.");
94173
        for(int i=0; i<reading_number; i++) {
95174
            ArrayList<String> reading_kanjis = getStringList(file);
175+
            Log.d(TAG, "kanjis: " + reading_kanjis);
96176
            ArrayList<String> reading_infos = getStringList(file);
97-
            ArrayList<String> reading_readings = getStringList(file);
177+
            Log.d(TAG, "infos: " + reading_kanjis);
178+
            ArrayList<String> reading_readings = getHuffmanStringList(file, readingHuffman);
98179
            Result.Reading r = new Result.Reading(reading_kanjis, reading_infos, reading_readings);
99180
            readings.add(r);
100181
        }
101182
102183
        ArrayList<Result.Sense> senses = new ArrayList<>();
103-
        int meaning_number = file.readInt();
184+
        int meaning_number = file.readShort();
185+
        Log.d(TAG, meaning_number + " meanings.");
104186
        for(int i=0; i<meaning_number; i++) {
105187
            ArrayList<String> sense_references = getStringList(file);
106188
            ArrayList<String> sense_limits = getStringList(file);

110192
            for(int j=0; j<source_number; j++) {
111193
                ArrayList<String> source_content = getStringList(file);
112194
                boolean source_wasei = file.read() != 0;
113-
                String source_type = getString(file);
114-
                String source_language = getString(file);
195+
                String source_type = file.readUTF();
196+
                String source_language = file.readUTF();
115197
                sense_sources.add(new Result.Source(source_content, source_wasei, source_type, source_language));
116198
            }
117-
            ArrayList<String> sense_tags = getStringList(file);
118-
            ArrayList<String> sense_glosses = getStringList(file);
119-
            String sense_language = getString(file);
199+
            ArrayList<Integer> sense_tags = getIntList(file);
200+
            ArrayList<String> sense_glosses = getHuffmanStringList(file, meaningHuffman);
201+
            String sense_language = file.readUTF();
120202
            senses.add(new Result.Sense(sense_references, sense_limits, sense_infos, sense_sources,
121203
                    sense_tags, sense_glosses, sense_language));
122204
        }

125207
126208
    private ArrayList<Integer> getValues(RandomAccessFile file, long triePos) throws IOException {
127209
        file.seek(triePos);
128-
        Log.d(TAG, "Getting value");
129-
        int valuesLength = file.readInt();
210+
        Log.d(TAG, "Getting values");
211+
        int valuesLength = file.readShort();
130212
        ArrayList<Integer> results = new ArrayList<>();
131213
132214
        Log.d(TAG, "Number of values: " + valuesLength);

134216
            results.add(file.readInt());
135217
        }
136218
137-
        int transitionLength = file.readInt();
219+
        int transitionLength = file.readByte();
138220
        Log.d(TAG, "Number of transitions: " + transitionLength);
139221
        int[] others = new int[transitionLength];
140222
        for(int i=0; i<transitionLength; i++) {

158240
        if(txt.length == 0) {
159241
            return getValues(file, triePos);
160242
        } else {
161-
            int valuesLength = file.readInt();
243+
            int valuesLength = file.readShort();
162244
            Log.d(TAG, "number of values: " + valuesLength);
163-
            /*for(int i=0; i<valuesLength; i++) {
164-
                int strLen = file.readInt();
165-
                byte[] s = new byte[strLen];
166-
                file.read(s);
167-
                //Log.d(TAG, "Value of size " + strLen + ": " + new String(s, "UTF-8"));
168-
                file.skipBytes(strLen*4);
169-
            }*/
170245
            file.skipBytes(valuesLength * 4);
171246
172-
            int transitionLength = file.readInt();
247+
            int transitionLength = file.readByte();
173248
            Log.d(TAG, "number of transitions: " + transitionLength);
174249
175250
            for(int i = 0; i < transitionLength; i++) {

189264
        }
190265
    }
191266
267+
    private Huffman loadHuffman(RandomAccessFile file) throws IOException {
268+
        byte b = file.readByte();
269+
        if(b == 1) {
270+
            Huffman left = loadHuffman(file);
271+
            Huffman right = loadHuffman(file);
272+
273+
            return new HuffmanTree(left, right);
274+
        } else if (b == 0) {
275+
            file.skipBytes(1);
276+
            return new HuffmanValue("");
277+
        } else {
278+
            ArrayList<Byte> bs = new ArrayList<>();
279+
            bs.add(b);
280+
            while((b = file.readByte()) != 0) {
281+
                bs.add(b);
282+
            }
283+
            byte[] array = new byte[bs.size()];
284+
            for(int i=0; i<bs.size(); i++) {
285+
                array[i] = bs.get(i);
286+
            }
287+
            return new HuffmanValue(new String(array, "UTF-8"));
288+
        }
289+
    }
290+
192291
    ArrayList<Result> search(String text) {
193292
        if (isDownloaded()) {
194293
            try {

213312
                Log.d(TAG, "reading: " + readingTriePos);
214313
                Log.d(TAG, "meaning: " + meaningTriePos);
215314
315+
                kanjiHuffman = loadHuffman(file);
316+
                readingHuffman = loadHuffman(file);
317+
                meaningHuffman = loadHuffman(file);
318+
319+
                logHuffman(readingHuffman, new ArrayList<Boolean>());
320+
216321
                ArrayList<Integer> results = searchTrie(file, kanjiTriePos, search);
217322
                if(results == null || results.isEmpty())
218323
                    results = searchTrie(file, readingTriePos, search);

229334
                        uniqResults.add(i);
230335
                }
231336
232-
                for(Integer i: uniqResults) {
337+
                int[] uniqResultsArray = new int[uniqResults.size()];
338+
                Arrays.sort(uniqResultsArray);
339+
340+
                int num = 0;
341+
                for(Integer i: uniqResultsArray) {
342+
                    if(num > 10)
343+
                        break;
344+
                    num++;
233345
                    r.add(getValue(file, i));
234346
                }
235347
                return r;

app/src/main/java/eu/lepiller/nani/result/Result.java

2323
    }
2424
2525
    public static class Sense {
26-
        private ArrayList<String> references, limits, infos, tags, glosses;
26+
        private ArrayList<String> references, limits, infos, glosses;
27+
        private ArrayList<Integer> tags;
2728
        private String language;
2829
        private ArrayList<Source> sources;
2930
3031
        public Sense(ArrayList<String> references, ArrayList<String> limits, ArrayList<String> infos,
31-
                     ArrayList<Source> sources, ArrayList<String> tags, ArrayList<String> glosses,
32+
                     ArrayList<Source> sources, ArrayList<Integer> tags, ArrayList<String> glosses,
3233
                     String language) {
3334
            this.references = references;
3435
            this.limits = limits;

app/src/main/res/values/strings.xml

1414
    <string name="kanji_description">Writing</string>
1515
1616
    <!-- Dictionary descriptions -->
17-
    <string name="dico_jmdict_example">Japanese/English dictionary for test purposes. Do not use.</string>
1817
    <string name="dico_jmdict_e">Japanese/English dictionary from the Electronic Dictionary Research and Development Group.\n~80 MB, ~180,000 entries.</string>
18+
    <string name="dico_jmdict_dut">Japanese/Dutch dictionary from the Electronic Dictionary Research and Development Group.\n~30 MB, ~35,000 entries.</string>
19+
    <string name="dico_jmdict_fre">Japanese/French dictionary from the Electronic Dictionary Research and Development Group.\n~8 MB, ~15,000 entries.</string>
20+
    <string name="dico_jmdict_ger">Japanese/German dictionary from the Electronic Dictionary Research and Development Group.\n~70 MB, ~120,000 entries.</string>
21+
    <string name="dico_jmdict_hun">Japanese/Hungarian dictionary from the Electronic Dictionary Research and Development Group.\n~15 MB, ~40,000 entries.</string>
22+
    <string name="dico_jmdict_rus">Japanese/Russian dictionary from the Electronic Dictionary Research and Development Group.\n~80 MB, ~65,000 entries.</string>
23+
    <string name="dico_jmdict_slv">Japanese/Slovenian dictionary from the Electronic Dictionary Research and Development Group.\n~4 MB, ~9,000 entries.</string>
24+
    <string name="dico_jmdict_spa">Japanese/Spanish dictionary from the Electronic Dictionary Research and Development Group.\n~15 MB, ~35,000 entries.</string>
25+
    <string name="dico_jmdict_swe">Japanese/Swedish dictionary from the Electronic Dictionary Research and Development Group.\n~4 MB, ~15,000 entries.</string>
1926
2027
    <!-- Result view -->
2128
    <string name="sense_number">%d.</string>