Update data format
app/src/main/java/eu/lepiller/nani/dictionary/DictionaryFactory.java
13 | 13 | ||
14 | 14 | private DictionaryFactory(Context context) { /* Populates `dictionaries` with one JMDict entry per known dictionary file. */ | |
15 | 15 | dictionaries = new ArrayList<>(); | |
16 | - | dictionaries.add(new JMDict("example_jmdict", | |
17 | - | context.getString(R.string.dico_jmdict_example), | |
18 | - | context.getCacheDir(), | |
19 | - | "https://xana.lepiller.eu/nani/dico/example.nani")); | |
20 | 16 | dictionaries.add(new JMDict("JMdict_e", /* JMDict constructor arguments, in order: name, description, cacheDir, url */ | |
21 | 17 | context.getString(R.string.dico_jmdict_e), | |
22 | 18 | context.getCacheDir(), | |
23 | - | "https://xana.lepiller.eu/nani/dico/JMdict_e.nani")); | |
19 | + | "https://nani.lepiller.eu/dicos/JMdict_e.nani")); | |
20 | + | dictionaries.add(new JMDict("JMdict_dut", | |
21 | + | context.getString(R.string.dico_jmdict_dut), | |
22 | + | context.getCacheDir(), | |
23 | + | "https://nani.lepiller.eu/dicos/JMdict_dut.nani")); | |
24 | + | dictionaries.add(new JMDict("JMdict_fre", | |
25 | + | context.getString(R.string.dico_jmdict_fre), | |
26 | + | context.getCacheDir(), | |
27 | + | "https://nani.lepiller.eu/dicos/JMdict_fre.nani")); | |
28 | + | dictionaries.add(new JMDict("JMdict_ger", | |
29 | + | context.getString(R.string.dico_jmdict_ger), | |
30 | + | context.getCacheDir(), | |
31 | + | "https://nani.lepiller.eu/dicos/JMdict_ger.nani")); | |
32 | + | dictionaries.add(new JMDict("JMdict_hun", | |
33 | + | context.getString(R.string.dico_jmdict_hun), | |
34 | + | context.getCacheDir(), | |
35 | + | "https://nani.lepiller.eu/dicos/JMdict_hun.nani")); | |
36 | + | dictionaries.add(new JMDict("JMdict_rus", | |
37 | + | context.getString(R.string.dico_jmdict_rus), | |
38 | + | context.getCacheDir(), | |
39 | + | "https://nani.lepiller.eu/dicos/JMdict_rus.nani")); | |
40 | + | dictionaries.add(new JMDict("JMdict_slv", | |
41 | + | context.getString(R.string.dico_jmdict_slv), | |
42 | + | context.getCacheDir(), | |
43 | + | "https://nani.lepiller.eu/dicos/JMdict_slv.nani")); | |
44 | + | dictionaries.add(new JMDict("JMdict_spa", | |
45 | + | context.getString(R.string.dico_jmdict_spa), | |
46 | + | context.getCacheDir(), | |
47 | + | "https://nani.lepiller.eu/dicos/JMdict_spa.nani")); | |
48 | + | dictionaries.add(new JMDict("JMdict_swe", | |
49 | + | context.getString(R.string.dico_jmdict_swe), | |
50 | + | context.getCacheDir(), | |
51 | + | "https://nani.lepiller.eu/dicos/JMdict_swe.nani")); | |
24 | 52 | } | |
25 | 53 | ||
26 | 54 | public static ArrayList<Result> search(Context context, String text) { |
app/src/main/java/eu/lepiller/nani/dictionary/JMDict.java
5 | 5 | ||
6 | 6 | import java.io.File; | |
7 | 7 | import java.io.FileNotFoundException; | |
8 | - | import java.io.FileReader; | |
9 | 8 | import java.io.IOException; | |
10 | 9 | import java.io.RandomAccessFile; | |
11 | 10 | import java.net.MalformedURLException; | |
12 | 11 | import java.net.URL; | |
13 | - | import java.nio.charset.StandardCharsets; | |
14 | 12 | import java.util.ArrayList; | |
15 | 13 | import java.util.Arrays; | |
14 | + | import java.util.Comparator; | |
16 | 15 | import java.util.HashMap; | |
16 | + | import java.util.List; | |
17 | 17 | import java.util.Map; | |
18 | 18 | ||
19 | 19 | import eu.lepiller.nani.R; | |
20 | 20 | import eu.lepiller.nani.result.Result; | |
21 | 21 | ||
22 | 22 | public class JMDict extends Dictionary { | |
23 | - | final static String TAG = "JMDICT"; | |
23 | + | interface Huffman { /* Marker type for a node of a Huffman decoding tree: either a HuffmanTree (inner node) or a HuffmanValue (leaf). */ | |
24 | + | } | |
25 | + | ||
26 | + | static class HuffmanTree implements Huffman { /* Inner node of the decoding tree, holding two child nodes. */ | |
27 | + | Huffman left, right; /* getHuffmanString follows `right` on a 1 bit and `left` on a 0 bit */ | |
28 | + | HuffmanTree(Huffman left, Huffman right) { | |
29 | + | this.left = left; | |
30 | + | this.right = right; | |
31 | + | } | |
32 | + | } | |
33 | + | static class HuffmanValue implements Huffman { /* Leaf node carrying the text emitted when decoding reaches it. */ | |
34 | + | String character; /* loadHuffman stores "" for the leaf that ends a string, otherwise a UTF-8 decoded string */ | |
35 | + | HuffmanValue(String character) { | |
36 | + | this.character = character; | |
37 | + | } | |
38 | + | } | |
39 | + | ||
40 | + | final private static String TAG = "JMDICT"; | |
24 | 41 | private String mUrl; | |
42 | + | private Huffman kanjiHuffman, readingHuffman, meaningHuffman; | |
25 | 43 | ||
26 | 44 | JMDict(String name, String description, File cacheDir, String url) { | |
27 | 45 | super(name, description, cacheDir); | |
… | |||
64 | 82 | public void remove() { /* Deletes the file returned by getFile() and drops the three cached Huffman trees. */ | |
65 | 83 | File file = getFile(); | |
66 | 84 | file.delete(); /* NOTE(review): the boolean result of delete() is ignored, so a failed deletion goes unnoticed */ | |
85 | + | kanjiHuffman = null; | |
86 | + | readingHuffman = null; | |
87 | + | meaningHuffman = null; | |
67 | 88 | } | |
68 | 89 | ||
69 | 90 | private ArrayList<String> getStringList(RandomAccessFile file) throws IOException { /* Reads a 16-bit count followed by that many readUTF() strings, starting at the current file position. */ | |
70 | 91 | ArrayList<String> results = new ArrayList<>(); | |
71 | - | int number = file.readInt(); | |
92 | + | int number = file.readShort(); /* readShort() is signed: a stored count above 32767 comes back negative and the loop below adds nothing */ | |
93 | + | for(int i=0; i<number; i++) { | |
94 | + | results.add(file.readUTF()); | |
95 | + | } | |
96 | + | return results; | |
97 | + | } | |
98 | + | ||
99 | + | private String getHuffmanString(RandomAccessFile file, Huffman huffman) throws IOException { /* Decodes one Huffman-coded string from the current file position, walking `huffman` bit by bit until the empty-string leaf is reached. */ | |
100 | + | StringBuilder b = new StringBuilder(); | |
101 | + | ArrayList<Boolean> bits = new ArrayList<>(); /* bits of the current byte not yet consumed, most significant first */ | |
102 | + | String c = null; | |
103 | + | Huffman h = huffman; | |
104 | + | while(c == null || !c.isEmpty()) { /* a leaf holding "" ends the string */ | |
105 | + | if(h instanceof HuffmanValue) { | |
106 | + | c = ((HuffmanValue) h).character; | |
107 | + | Log.d(TAG, "Huffman read: " + c); | |
108 | + | b.append(c); | |
109 | + | h = huffman; /* leaf reached: restart from the root for the next symbol */ | |
110 | + | } else if(h instanceof HuffmanTree) { | |
111 | + | if(bits.isEmpty()) { | |
112 | + | int by = file.readUnsignedByte(); /* must be unsigned: with a signed byte every value >= 0x80 is negative and all eight bits decoded as 0 */ | |
113 | + | Log.d(TAG, "Read byte for huffman: " + by); | |
114 | + | int mod = 256; | |
115 | + | while(mod != 1) { | |
116 | + | mod /= 2; | |
117 | + | bits.add((by / mod) > 0); /* true when the bit with weight `mod` is set */ | |
118 | + | by = by % mod; | |
119 | + | } | |
120 | + | Log.d(TAG, "Read byte for huffman: " + bits); | |
121 | + | } | |
122 | + | ||
123 | + | Boolean bo = bits.get(0); | |
124 | + | bits.remove(0); | |
125 | + | h = bo? ((HuffmanTree) h).right: ((HuffmanTree) h).left; /* 1 bit goes right, 0 bit goes left */ | |
126 | + | } | |
127 | + | } | |
128 | + | ||
129 | + | return b.toString(); | |
130 | + | } | |
131 | + | ||
132 | + | private void logHuffman(Huffman h, ArrayList<Boolean> addr) { /* Debug helper: logs every leaf under `h` together with the bit path leading to it (false = left, true = right). */ | |
133 | + | if (h instanceof HuffmanValue) { | |
134 | + | Log.d(TAG, "HUFF: " + ((HuffmanValue) h).character + " -> " + addr.toString()); | |
135 | + | } else if(h instanceof HuffmanTree) { | |
136 | + | ArrayList<Boolean> addr_l = new ArrayList<>(addr); /* copies, so each branch extends its own path */ | |
137 | + | addr_l.add(false); | |
138 | + | ArrayList<Boolean> addr_r = new ArrayList<>(addr); | |
139 | + | addr_r.add(true); | |
140 | + | logHuffman(((HuffmanTree) h).left, addr_l); | |
141 | + | logHuffman(((HuffmanTree) h).right, addr_r); | |
142 | + | } | |
143 | + | } | |
144 | + | ||
145 | + | private ArrayList<String> getHuffmanStringList(RandomAccessFile file, Huffman huffman) throws IOException { /* Reads a 16-bit count, then decodes that many strings with getHuffmanString using the given tree. */ | |
146 | + | ArrayList<String> results = new ArrayList<>(); | |
147 | + | int number = file.readShort(); /* signed 16-bit read: a negative count leaves the list empty */ | |
148 | + | Log.d(TAG, "huffmanStrings: " + number); | |
72 | 149 | for(int i=0; i<number; i++) { | |
73 | - | int l = file.readInt(); | |
74 | - | byte[] str = new byte[l]; | |
75 | - | file.read(str); | |
76 | - | results.add(new String(str, "UTF-8")); | |
150 | + | results.add(getHuffmanString(file, huffman)); | |
77 | 151 | } | |
78 | 152 | return results; | |
79 | 153 | } | |
80 | 154 | ||
81 | - | private String getString(RandomAccessFile file) throws IOException { | |
82 | - | int l = file.readInt(); | |
83 | - | byte[] str = new byte[l]; | |
84 | - | file.read(str); | |
85 | - | return new String(str, "UTF-8"); | |
155 | + | private ArrayList<Integer> getIntList(RandomAccessFile file) throws IOException { /* Reads a 16-bit count followed by that many single bytes, each stored as an Integer. */ | |
156 | + | ArrayList<Integer> results = new ArrayList<>(); | |
157 | + | int number = file.readShort(); | |
158 | + | for(int i=0; i<number; i++) { | |
159 | + | results.add(Integer.valueOf(file.readByte())); /* readByte() is signed, so stored values above 127 come back negative */ | |
160 | + | } | |
161 | + | return results; | |
86 | 162 | } | |
87 | 163 | ||
88 | 164 | private Result getValue(RandomAccessFile file, long pos) throws IOException { | |
89 | 165 | file.seek(pos); | |
90 | - | ArrayList<String> kanjis = getStringList(file); | |
166 | + | Log.d(TAG, "Getting value"); | |
167 | + | ArrayList<String> kanjis = getHuffmanStringList(file, kanjiHuffman); | |
91 | 168 | ||
169 | + | Log.d(TAG, "Getting readings"); | |
92 | 170 | ArrayList<Result.Reading> readings = new ArrayList<>(); | |
93 | - | int reading_number = file.readInt(); | |
171 | + | int reading_number = file.readShort(); | |
172 | + | Log.d(TAG, reading_number + " readings."); | |
94 | 173 | for(int i=0; i<reading_number; i++) { | |
95 | 174 | ArrayList<String> reading_kanjis = getStringList(file); | |
175 | + | Log.d(TAG, "kanjis: " + reading_kanjis); | |
96 | 176 | ArrayList<String> reading_infos = getStringList(file); | |
97 | - | ArrayList<String> reading_readings = getStringList(file); | |
177 | + | Log.d(TAG, "infos: " + reading_kanjis); | |
178 | + | ArrayList<String> reading_readings = getHuffmanStringList(file, readingHuffman); | |
98 | 179 | Result.Reading r = new Result.Reading(reading_kanjis, reading_infos, reading_readings); | |
99 | 180 | readings.add(r); | |
100 | 181 | } | |
101 | 182 | ||
102 | 183 | ArrayList<Result.Sense> senses = new ArrayList<>(); | |
103 | - | int meaning_number = file.readInt(); | |
184 | + | int meaning_number = file.readShort(); | |
185 | + | Log.d(TAG, meaning_number + " meanings."); | |
104 | 186 | for(int i=0; i<meaning_number; i++) { | |
105 | 187 | ArrayList<String> sense_references = getStringList(file); | |
106 | 188 | ArrayList<String> sense_limits = getStringList(file); | |
… | |||
110 | 192 | for(int j=0; j<source_number; j++) { | |
111 | 193 | ArrayList<String> source_content = getStringList(file); | |
112 | 194 | boolean source_wasei = file.read() != 0; | |
113 | - | String source_type = getString(file); | |
114 | - | String source_language = getString(file); | |
195 | + | String source_type = file.readUTF(); | |
196 | + | String source_language = file.readUTF(); | |
115 | 197 | sense_sources.add(new Result.Source(source_content, source_wasei, source_type, source_language)); | |
116 | 198 | } | |
117 | - | ArrayList<String> sense_tags = getStringList(file); | |
118 | - | ArrayList<String> sense_glosses = getStringList(file); | |
119 | - | String sense_language = getString(file); | |
199 | + | ArrayList<Integer> sense_tags = getIntList(file); | |
200 | + | ArrayList<String> sense_glosses = getHuffmanStringList(file, meaningHuffman); | |
201 | + | String sense_language = file.readUTF(); | |
120 | 202 | senses.add(new Result.Sense(sense_references, sense_limits, sense_infos, sense_sources, | |
121 | 203 | sense_tags, sense_glosses, sense_language)); | |
122 | 204 | } | |
… | |||
125 | 207 | ||
126 | 208 | private ArrayList<Integer> getValues(RandomAccessFile file, long triePos) throws IOException { | |
127 | 209 | file.seek(triePos); | |
128 | - | Log.d(TAG, "Getting value"); | |
129 | - | int valuesLength = file.readInt(); | |
210 | + | Log.d(TAG, "Getting values"); | |
211 | + | int valuesLength = file.readShort(); | |
130 | 212 | ArrayList<Integer> results = new ArrayList<>(); | |
131 | 213 | ||
132 | 214 | Log.d(TAG, "Number of values: " + valuesLength); | |
… | |||
134 | 216 | results.add(file.readInt()); | |
135 | 217 | } | |
136 | 218 | ||
137 | - | int transitionLength = file.readInt(); | |
219 | + | int transitionLength = file.readByte(); | |
138 | 220 | Log.d(TAG, "Number of transitions: " + transitionLength); | |
139 | 221 | int[] others = new int[transitionLength]; | |
140 | 222 | for(int i=0; i<transitionLength; i++) { | |
… | |||
158 | 240 | if(txt.length == 0) { | |
159 | 241 | return getValues(file, triePos); | |
160 | 242 | } else { | |
161 | - | int valuesLength = file.readInt(); | |
243 | + | int valuesLength = file.readShort(); | |
162 | 244 | Log.d(TAG, "number of values: " + valuesLength); | |
163 | - | /*for(int i=0; i<valuesLength; i++) { | |
164 | - | int strLen = file.readInt(); | |
165 | - | byte[] s = new byte[strLen]; | |
166 | - | file.read(s); | |
167 | - | //Log.d(TAG, "Value of size " + strLen + ": " + new String(s, "UTF-8")); | |
168 | - | file.skipBytes(strLen*4); | |
169 | - | }*/ | |
170 | 245 | file.skipBytes(valuesLength * 4); | |
171 | 246 | ||
172 | - | int transitionLength = file.readInt(); | |
247 | + | int transitionLength = file.readByte(); | |
173 | 248 | Log.d(TAG, "number of transitions: " + transitionLength); | |
174 | 249 | ||
175 | 250 | for(int i = 0; i < transitionLength; i++) { | |
… | |||
189 | 264 | } | |
190 | 265 | } | |
191 | 266 | ||
267 | + | private Huffman loadHuffman(RandomAccessFile file) throws IOException { /* Recursively reads one serialized Huffman tree starting at the current file position and returns its root. */ | |
268 | + | byte b = file.readByte(); | |
269 | + | if(b == 1) { /* 1 marks an inner node: the left subtree follows, then the right */ | |
270 | + | Huffman left = loadHuffman(file); | |
271 | + | Huffman right = loadHuffman(file); | |
272 | + | ||
273 | + | return new HuffmanTree(left, right); | |
274 | + | } else if (b == 0) { /* 0 yields the empty-string leaf, which getHuffmanString treats as end of string */ | |
275 | + | file.skipBytes(1); /* NOTE(review): one extra byte is skipped here, presumably a terminator written by the encoder - confirm against the file format */ | |
276 | + | return new HuffmanValue(""); | |
277 | + | } else { /* any other byte starts a leaf: all bytes up to the next 0 byte form the leaf's UTF-8 text */ | |
278 | + | ArrayList<Byte> bs = new ArrayList<>(); | |
279 | + | bs.add(b); | |
280 | + | while((b = file.readByte()) != 0) { | |
281 | + | bs.add(b); | |
282 | + | } | |
283 | + | byte[] array = new byte[bs.size()]; /* unbox the collected bytes into a primitive array for String decoding */ | |
284 | + | for(int i=0; i<bs.size(); i++) { | |
285 | + | array[i] = bs.get(i); | |
286 | + | } | |
287 | + | return new HuffmanValue(new String(array, "UTF-8")); | |
288 | + | } | |
289 | + | } | |
290 | + | ||
192 | 291 | ArrayList<Result> search(String text) { | |
193 | 292 | if (isDownloaded()) { | |
194 | 293 | try { | |
… | |||
213 | 312 | Log.d(TAG, "reading: " + readingTriePos); | |
214 | 313 | Log.d(TAG, "meaning: " + meaningTriePos); | |
215 | 314 | ||
315 | + | kanjiHuffman = loadHuffman(file); | |
316 | + | readingHuffman = loadHuffman(file); | |
317 | + | meaningHuffman = loadHuffman(file); | |
318 | + | ||
319 | + | logHuffman(readingHuffman, new ArrayList<Boolean>()); | |
320 | + | ||
216 | 321 | ArrayList<Integer> results = searchTrie(file, kanjiTriePos, search); | |
217 | 322 | if(results == null || results.isEmpty()) | |
218 | 323 | results = searchTrie(file, readingTriePos, search); | |
… | |||
229 | 334 | uniqResults.add(i); | |
230 | 335 | } | |
231 | 336 | ||
232 | - | for(Integer i: uniqResults) { | |
337 | + | int[] uniqResultsArray = new int[uniqResults.size()]; | |
338 | + | Arrays.sort(uniqResultsArray); | |
339 | + | ||
340 | + | int num = 0; | |
341 | + | for(Integer i: uniqResultsArray) { | |
342 | + | if(num > 10) | |
343 | + | break; | |
344 | + | num++; | |
233 | 345 | r.add(getValue(file, i)); | |
234 | 346 | } | |
235 | 347 | return r; |
app/src/main/java/eu/lepiller/nani/result/Result.java
23 | 23 | } | |
24 | 24 | ||
25 | 25 | public static class Sense { | |
26 | - | private ArrayList<String> references, limits, infos, tags, glosses; | |
26 | + | private ArrayList<String> references, limits, infos, glosses; | |
27 | + | private ArrayList<Integer> tags; | |
27 | 28 | private String language; | |
28 | 29 | private ArrayList<Source> sources; | |
29 | 30 | ||
30 | 31 | public Sense(ArrayList<String> references, ArrayList<String> limits, ArrayList<String> infos, | |
31 | - | ArrayList<Source> sources, ArrayList<String> tags, ArrayList<String> glosses, | |
32 | + | ArrayList<Source> sources, ArrayList<Integer> tags, ArrayList<String> glosses, | |
32 | 33 | String language) { | |
33 | 34 | this.references = references; | |
34 | 35 | this.limits = limits; |
app/src/main/res/values/strings.xml
14 | 14 | <string name="kanji_description">Writing</string> | |
15 | 15 | ||
16 | 16 | <!-- Dictionary descriptions --> | |
17 | - | <string name="dico_jmdict_example">Japanese/English dictionary for test purposes. Do not use.</string> | |
18 | 17 | <string name="dico_jmdict_e">Japanese/English dictionary from the Electronic Dictionary Research and Development Group.\n~80 MB, ~180,000 entries.</string> | |
18 | + | <string name="dico_jmdict_dut">Japanese/Dutch dictionary from the Electronic Dictionary Research and Development Group.\n~30 MB, ~35,000 entries.</string> | |
19 | + | <string name="dico_jmdict_fre">Japanese/French dictionary from the Electronic Dictionary Research and Development Group.\n~8 MB, ~15,000 entries.</string> | |
20 | + | <string name="dico_jmdict_ger">Japanese/German dictionary from the Electronic Dictionary Research and Development Group.\n~70 MB, ~120,000 entries.</string> | |
21 | + | <string name="dico_jmdict_hun">Japanese/Hungarian dictionary from the Electronic Dictionary Research and Development Group.\n~15 MB, ~40,000 entries.</string> | |
22 | + | <string name="dico_jmdict_rus">Japanese/Russian dictionary from the Electronic Dictionary Research and Development Group.\n~80 MB, ~65,000 entries.</string> | |
23 | + | <string name="dico_jmdict_slv">Japanese/Slovenian dictionary from the Electronic Dictionary Research and Development Group.\n~4 MB, ~9,000 entries.</string> | |
24 | + | <string name="dico_jmdict_spa">Japanese/Spanish dictionary from the Electronic Dictionary Research and Development Group.\n~15 MB, ~35,000 entries.</string> | |
25 | + | <string name="dico_jmdict_swe">Japanese/Swedish dictionary from the Electronic Dictionary Research and Development Group.\n~4 MB, ~15,000 entries.</string> | |
19 | 26 | ||
20 | 27 | <!-- Result view --> | |
21 | 28 | <string name="sense_number">%d.</string> |