author | jenda |
Mon, 28 Nov 2011 09:40:57 +0100 | |
changeset 636 | e4ac77666de3 |
parent 0 | 542988ea726d |
permissions | -rw-r--r-- |
0 | 1 |
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2003, by Sun Microsystems, Inc.
 * All rights reserved.
 */
|
25 |
||
26 |
#ident "@(#)auto_ef_util.c 1.18 07/04/12 SMI" |
|
27 |
#include "auto_ef_lib.h" |
|
28 |
||
29 |
int IdentfyEncoding(int, size_t, int *, char *, char *, const char *, |
|
30 |
_auto_ef_t *, int *, char *); |
|
31 |
int IsSingleByte_buf(const char *, double *, char *, int, srd *, |
|
32 |
double *, double *, _auto_ef_t *); |
|
33 |
int IsHKSCSOrBIG5(char *, const char *, char *, size_t, _auto_ef_t *, double); |
|
34 |
int RegistBIG5(char *, size_t, char *, const char *, _auto_ef_t *, |
|
35 |
srd *, double *, double *); |
|
36 |
int RegistEUC(char *, int, char *, _auto_ef_t *, srd *, double *, double *); |
|
37 |
int RegistHashTable(unsigned char, unsigned char, srd *); |
|
38 |
int HashTableOpen(char *, srd *, double *, double *); |
|
39 |
void get_hash_name(char *, char *); |
|
40 |
int Regist_ASCII_ISO2022JP(int, char *, _auto_ef_t *); |
|
636
e4ac77666de3
7115607 cmd/auto_ef compilation fails on b175a - wrong iconv args (backport of s11u1:fb701ba7adfd)
jenda
parents:
0
diff
changeset
|
41 |
int IsAsciiOr2022_buf(char *, int, char *); |
0 | 42 |
int Is2022KROrCN(int, char *, char *, size_t, _auto_ef_t *); |
43 |
int Hash(unsigned char, unsigned char); |
|
44 |
int TotalScore_buf(const char *, double *, int, srd *, double *, double *); |
|
45 |
int FindKeyWord(unsigned char, unsigned char, srd *); |
|
46 |
int GetScore(unsigned char, unsigned char, srd *); |
|
47 |
double Calc_SD(int, double *, double *); |
|
48 |
char *chopbuf(char *); |
|
49 |
void FreeHashTable(srd *); |
|
50 |
int CheckISO2022CN(unsigned char, unsigned char, unsigned char, unsigned char); |
|
51 |
int CheckISO2022KR(unsigned char, unsigned char, unsigned char, unsigned char); |
|
52 |
void ThaiSpecificCheck(const char *, char *, size_t); |
|
53 |
||
54 |
/* |
|
55 |
Map from Single Byte encoding to Language |
|
56 |
*/ |
|
57 |
const char single_byte_langs[SINGLE_ENCODING_MAX][ICONV_LOCALE_MAX][ENCODING_LENGTH] = { |
|
58 |
/* ISO-8859-1 : 0 */ |
|
59 |
{"Germany", "Spain", "France", "Italy", "Sweden", "Denmark", "Finland", |
|
60 |
"Iceland", "Catalonia", "Netherland", "Norway", "Portugal"}, |
|
61 |
/* ISO-8859-2 : 1 */ |
|
62 |
{"Croatia", "Hungary", "Poland", "Serbia", "Slovakia", "Slovenia"}, |
|
63 |
/* ISO-8859-5 : 2 */ |
|
64 |
{"Bulgaria", "Russia"}, |
|
65 |
/* ISO-8859-6 : 3 */ |
|
66 |
{"Arabia"}, |
|
67 |
/* ISO-8859-7 : 4 */ |
|
68 |
{"Greece"}, |
|
69 |
/* ISO-8859-8 : 5 */ |
|
70 |
{"Hebrew"}, |
|
71 |
/* koi8_r : 6 */ |
|
72 |
{"Russia"}, |
|
73 |
/* CP1250 : 7 */ |
|
74 |
{"Croatia", "Hungary", "Poland", "Serbia", "Slovakia", "Slovenia"}, |
|
75 |
/* CP1251 : 8 */ |
|
76 |
{"Bulgaria", "Russia"}, |
|
77 |
/* CP1252 : 9 */ |
|
78 |
{"Germany", "Spain", "France", "Italy", "Sweden", "Denmark", "Finland", |
|
79 |
"Iceland", "Catalonia", "Netherland", "Norway", "Portugal"}, |
|
80 |
/* CP1253 : 10 */ |
|
81 |
{"Greece"}, |
|
82 |
/* CP1255 : 11 */ |
|
83 |
{"Hebrew"}, |
|
84 |
/* CP1256 : 12 */ |
|
85 |
{"Arabia"}, |
|
86 |
/* CP874 : 14 */ |
|
87 |
{"Thai"}, |
|
88 |
/* TIS620.2533 : 15 */ |
|
89 |
{"Thai"} |
|
90 |
}; |
|
91 |
||
92 |
/* Declared here only; the definition is not in the visible part of this file. */
extern const char *to_code;

/*
 * Common prefix of the hash table file names.  get_hash_name() appends
 * the encoding name to it.
 */
const char roothash[64] = "/usr/lib/auto_ef/hashtable.";
|
96 |
||
97 |
int IdentfyEncoding(int code_num, size_t buf_size, |
|
98 |
int *found_target, char *from_code, char *inputp, const char *to_code, |
|
99 |
_auto_ef_t *root_autoef, int *end_auto_ef, char *input_buf) { |
|
100 |
||
101 |
int i, utf_flag; |
|
102 |
char hashfilename[PATH_MAX]; |
|
103 |
srd hashtable[HASHSIZE]; |
|
104 |
double average = 0.0, SD = 0.0; |
|
105 |
||
106 |
for (i = 0; i < HASHSIZE; i++) |
|
107 |
hashtable[i] = NULL; |
|
108 |
||
109 |
for (i = 0; i < PATH_MAX; i++) |
|
110 |
hashfilename[i] = '\0'; |
|
111 |
||
112 |
switch (code_num) { |
|
113 |
case 1: |
|
114 |
/* UTF-8 */ |
|
115 |
*found_target = 1; |
|
116 |
utf_flag = 0; |
|
117 |
for (i = 0; i < (int)buf_size; i++) { |
|
118 |
if (inputp[i] == '\0') |
|
119 |
break; |
|
120 |
if ((unsigned char)inputp[i] > 127) { |
|
121 |
utf_flag = 1; |
|
122 |
break; |
|
123 |
} |
|
124 |
} |
|
125 |
||
126 |
if (Is2022KROrCN(utf_flag, from_code, input_buf, buf_size, root_autoef) != -1) |
|
127 |
*end_auto_ef = 1; |
|
128 |
||
129 |
||
130 |
break; |
|
131 |
||
132 |
case 2: /* ISO-2022-JP or ASCII */ |
|
133 |
*found_target = 1; |
|
134 |
||
135 |
i = IsAsciiOr2022_buf(inputp, buf_size, from_code); |
|
136 |
if (Regist_ASCII_ISO2022JP(i, from_code, root_autoef) == -1) |
|
137 |
return (-1); |
|
138 |
||
139 |
*end_auto_ef = 1; |
|
140 |
break; |
|
141 |
||
142 |
case 3: /* EUC series */ |
|
143 |
get_hash_name(hashfilename, from_code); |
|
144 |
if (HashTableOpen(hashfilename, hashtable, |
|
145 |
&average, &SD) >= 0) { |
|
146 |
||
147 |
*found_target = RegistEUC(from_code, buf_size, inputp, |
|
148 |
root_autoef, hashtable, &average, &SD); |
|
149 |
||
150 |
if (*found_target == -1) |
|
151 |
return (-1); |
|
152 |
} else { |
|
153 |
errno = EACCES; |
|
154 |
return (-1); |
|
155 |
} |
|
156 |
FreeHashTable(hashtable); |
|
157 |
break; |
|
158 |
||
159 |
case 7: /* PCK, zh_HK.hkscs, GB18030, ISO-2022-KR, zh_CN.iso2022-CN */ |
|
160 |
get_hash_name(hashfilename, from_code); |
|
161 |
if (HashTableOpen(hashfilename, hashtable, |
|
162 |
&average, &SD) >= 0) { |
|
163 |
||
164 |
if (RegistBIG5(from_code, buf_size, |
|
165 |
inputp, to_code, root_autoef, hashtable, |
|
166 |
&average, &SD) == -1) |
|
167 |
||
168 |
return (-1); |
|
169 |
} else { |
|
170 |
errno = EACCES; |
|
171 |
return (-1); |
|
172 |
} |
|
173 |
FreeHashTable(hashtable); |
|
174 |
break; |
|
175 |
||
176 |
case 8: /* 8859 or CP series */ |
|
177 |
if (!*found_target) { |
|
178 |
double total_score = 0.0; |
|
179 |
double single_byte_score = 0.0; |
|
180 |
double highest_score = -3.0; |
|
181 |
int i; |
|
182 |
||
183 |
if (IsSingleByte_buf(inputp, &total_score, |
|
184 |
from_code, buf_size, hashtable, |
|
185 |
&average, &SD, root_autoef) == -1) { |
|
186 |
||
187 |
return (-1); |
|
188 |
} |
|
189 |
} |
|
190 |
break; |
|
191 |
||
192 |
default: |
|
193 |
errno = EACCES; |
|
194 |
return (-1); |
|
195 |
||
196 |
} |
|
197 |
||
198 |
return (0); |
|
199 |
} |
|
200 |
||
201 |
/*
 * IsSingleByte_buf - score input_buf against every language table that
 * belongs to the single-byte encoding named by `encoding`.
 *
 * For each language listed in single_byte_langs[] for that encoding, the
 * table "<roothash><encoding>_<language>" is loaded into hashtable, the
 * buffer is scored, and a non-zero score is registered on root_autoef.
 * For the two Thai encodings ThaiSpecificCheck() may rewrite `encoding`
 * (to CP874 or TIS620) before registration.
 *
 * hashtable is used as scratch space and is emptied again before every
 * return.  It is presumably expected to be empty on entry (the visible
 * caller passes an all-NULL table).
 *
 * Returns 0 on success, -1 on failure with errno set:
 *   EINVAL  `encoding` is not one of the known single-byte encodings
 *   EACCES  a language table could not be loaded
 *   ENOMEM  registering a result failed
 */
int IsSingleByte_buf(const char *input_buf,
    double *total_score, char *encoding, int buf_size, srd *hashtable,
    double *average, double *SD, _auto_ef_t *root_autoef)
{
	int i;
	int sflag = -1;
	char tablename[PATH_MAX];

	if (strcmp(encoding, I8859_1) == 0) sflag = 0;
	if (strcmp(encoding, I8859_2) == 0) sflag = 1;
	if (strcmp(encoding, I8859_5) == 0) sflag = 2;
	if (strcmp(encoding, I8859_6) == 0) sflag = 3;
	if (strcmp(encoding, I8859_7) == 0) sflag = 4;
	if (strcmp(encoding, I8859_8) == 0) sflag = 5;
	if (strcmp(encoding, KOI8) == 0) sflag = 6;
	if (strcmp(encoding, CP1250) == 0) sflag = 7;
	if (strcmp(encoding, CP1251) == 0) sflag = 8;
	if (strcmp(encoding, CP1252) == 0) sflag = 9;
	if (strcmp(encoding, CP1253) == 0) sflag = 10;
	if (strcmp(encoding, CP1255) == 0) sflag = 11;
	if (strcmp(encoding, CP1256) == 0) sflag = 12;
	if (strcmp(encoding, CP874) == 0) sflag = 14;
	if (strcmp(encoding, TIS620) == 0) sflag = 15;

	/*
	 * An unknown encoding used to index single_byte_langs[-1]; reject
	 * it, and anything beyond the table, instead.
	 */
	if (sflag < 0 || sflag >= SINGLE_ENCODING_MAX) {
		errno = EINVAL;
		return (-1);
	}

	for (i = 0; i < ICONV_LOCALE_MAX; i++) {
		/* An empty name marks the end of this encoding's list. */
		if (single_byte_langs[sflag][i][0] == '\0')
			break;
		*total_score = 0.0;

		/* "<roothash><encoding>_<language>" */
		get_hash_name(tablename, encoding);
		strcat(tablename, "_");
		strcat(tablename, single_byte_langs[sflag][i]);

		if (HashTableOpen(tablename, hashtable, average, SD) < 0) {
			/* A failed load may still have added some nodes. */
			FreeHashTable(hashtable);
			errno = EACCES;
			return (-1);
		}

		if (TotalScore_buf(input_buf, total_score, buf_size,
		    hashtable, average, SD) >= 0) {

			/* encoding specific */
			if (sflag == 14 || sflag == 15) {
				ThaiSpecificCheck(input_buf, encoding,
				    buf_size);
			}

			if (*total_score != 0.0 &&
			    Regist_AUTOEF(encoding, *total_score,
			    (char *)single_byte_langs[sflag][i],
			    root_autoef) == -1) {
				FreeHashTable(hashtable);
				errno = ENOMEM;
				return (-1);
			}
		}

		FreeHashTable(hashtable);
	}
	return (0);
}
|
270 |
||
271 |
||
272 |
/*
 * IsHKSCSOrBIG5 - decide whether a buffer that scored as from_code can be
 * reported as plain BIG5.
 *
 * The NUL-terminated text in inputp is converted BIG5 -> to_code and then
 * back to_code -> BIG5.  If the round trip reproduces inputp exactly,
 * BIG5 is registered on root_autoef with total_score; otherwise from_code
 * is registered.  If either conversion fails nothing is registered and
 * the call still succeeds.
 *
 * buf_size is not used; the length is taken from the NUL terminator.
 *
 * Returns 0 on success, -1 on failure (iconv_open() failure with its
 * errno, ENOMEM on allocation failure, or a Regist_AUTOEF() failure).
 */
int IsHKSCSOrBIG5(char *from_code, const char *to_code,
    char *inputp, size_t buf_size, _auto_ef_t *root_autoef, double total_score)
{
	iconv_t cd;
	char *context;
	char *convert;
	char *tbuf = NULL;	/* inputp converted to to_code */
	char *retbuf = NULL;	/* tbuf converted back to BIG5 */
	char *reg_code;
	size_t fsize, tsize;
	int saved_errno;
	int rv = 0;

	if ((cd = iconv_open(to_code, BIG5)) == (iconv_t)-1) {
		/* Use iconv_open errno */
		return (-1);
	}

	errno = 0;
	context = inputp;
	fsize = strlen(inputp);
	tsize = fsize * 4;
	/* One extra byte so the terminator always fits, even for "". */
	if ((tbuf = malloc(tsize + 1)) == NULL) {
		iconv_close(cd);
		errno = ENOMEM;
		return (-1);
	}
	convert = tbuf;
	if (iconv(cd, &context, &fsize, &convert, &tsize) == (size_t)-1) {
		/* Not convertible: nothing to register, not an error. */
		iconv_close(cd);
		goto out;
	}
	*convert = '\0';
	iconv_close(cd);

	if ((cd = iconv_open(BIG5, to_code)) == (iconv_t)-1) {
		/* Use iconv_open errno */
		rv = -1;
		goto out;
	}

	errno = 0;
	context = tbuf;
	fsize = strlen(tbuf);
	tsize = fsize * 4;
	if ((retbuf = malloc(tsize + 1)) == NULL) {
		iconv_close(cd);
		errno = ENOMEM;
		rv = -1;
		goto out;
	}
	convert = retbuf;
	if (iconv(cd, &context, &fsize, &convert, &tsize) == (size_t)-1) {
		/* Not convertible: nothing to register, not an error. */
		iconv_close(cd);
		goto out;
	}
	*convert = '\0';
	iconv_close(cd);

	/* A lossless round trip means the text fits in plain BIG5. */
	reg_code = (strcmp(inputp, retbuf) == 0) ? BIG5 : from_code;
	if (Regist_AUTOEF(reg_code, total_score,
	    M_FromCodeToLang(from_code), root_autoef) == -1)
		rv = -1;

out:
	/* retbuf is NULL unless the second pass allocated it. */
	saved_errno = errno;
	free(tbuf);
	free(retbuf);
	errno = saved_errno;
	return (rv);
}
|
375 |
||
376 |
||
377 |
int RegistBIG5(char *from_code, size_t buf_size, |
|
378 |
char *inputp, const char *to_code, _auto_ef_t *root_autoef, |
|
379 |
srd *hashtable, double *average, double *SD) |
|
380 |
{ |
|
381 |
double total_score = 0.0; |
|
382 |
||
383 |
if (TotalScore_buf(inputp, &total_score, buf_size, |
|
384 |
hashtable, average, SD) >= 0) { |
|
385 |
if (total_score != 0.0) { |
|
386 |
/* If the encoding is zh_HK.hkscs, have to check */ |
|
387 |
/* the buf have extended code point from zh_TW.BIG5 */ |
|
388 |
if (strcmp(from_code, HKSCS) == 0) { |
|
389 |
if (IsHKSCSOrBIG5(from_code, to_code, |
|
390 |
inputp, buf_size, root_autoef, total_score) == -1) |
|
391 |
||
392 |
return (-1); |
|
393 |
} else { |
|
394 |
if (Regist_AUTOEF(from_code, total_score, |
|
395 |
M_FromCodeToLang(from_code), |
|
396 |
root_autoef) == -1) { |
|
397 |
||
398 |
return (-1); |
|
399 |
} |
|
400 |
} |
|
401 |
} |
|
402 |
} |
|
403 |
return (0); |
|
404 |
} |
|
405 |
||
406 |
||
407 |
int RegistEUC(char *from_code, int buf_size, char *inputp, |
|
408 |
_auto_ef_t *root_autoef, srd *hashtable, |
|
409 |
double *average, double *SD) { |
|
410 |
||
411 |
double total_score = 0.0; |
|
412 |
int found_target = 0; |
|
413 |
||
414 |
if (TotalScore_buf(inputp, &total_score, buf_size, |
|
415 |
hashtable, average, SD) >= 0) { |
|
416 |
if (total_score != 0.0) { |
|
417 |
found_target = 1; |
|
418 |
if (Regist_AUTOEF(from_code, total_score, |
|
419 |
M_FromCodeToLang(from_code), |
|
420 |
root_autoef) == -1) { |
|
421 |
||
422 |
return (-1); |
|
423 |
} |
|
424 |
} |
|
425 |
} |
|
426 |
return (found_target); |
|
427 |
} |
|
428 |
||
429 |
/*
 * RegistHashTable - append a new record for the byte pair (a, b) to the
 * end of its hash chain.  The record starts with a score of 1.
 *
 * No check is made for an existing record with the same pair.
 *
 * Returns 0 on success, -1 with errno = ENOMEM when the record cannot be
 * allocated.
 */
int RegistHashTable(unsigned char a, unsigned char b, srd *hashtable) {
	srd *link;
	srd node;

	node = (srd) malloc(sizeof (SRD));
	if (node == NULL) {
		errno = ENOMEM;
		return (-1);
	}

	node->keyword[0] = a;
	node->keyword[1] = b;
	node->score = 1;
	node->nextsrd = NULL;

	/* Walk to the NULL link that ends the chain and hang the node there. */
	link = &hashtable[Hash(a, b)];
	while (*link != NULL)
		link = &(*link)->nextsrd;
	*link = node;

	return (0);
}
|
459 |
||
460 |
||
461 |
/*
 * HashTableOpen - load the score table file `table` into hashtable and
 * compute the mean and standard deviation of its scores.
 *
 * The file is read as records of three lines each:
 *   line 1  ignored
 *   line 2  four hex digits giving the two keyword bytes
 *   line 3  the decimal score for that keyword
 *
 * *average is the mean of the scores read from the file.  *SD is the
 * sample standard deviation (n - 1 divisor) taken over every record in
 * hashtable.  With no scored records *average is 0, and with fewer than
 * two *SD is 0, rather than the result of a division by zero.
 *
 * Returns 0 on success, -1 with errno = EACCES when the file cannot be
 * opened or a record cannot be added.  On failure hashtable may hold the
 * records added so far; releasing them is left to the caller.
 */
int HashTableOpen(char *table, srd *hashtable, double *average,
    double *SD) {

	FILE *fp;
	char buf[LONG_BIT] = { 0 };
	int i;
	int tableline = 0;
	int hash_score = 0;
	int total_ent = 0;
	double sum_of_score = 0.0;
	double sum_of_deviation = 0.0;
	srd srdp;
	unsigned char point[3];
	/*
	 * The keyword is read on one line and used on the next, so it has
	 * to outlive a single loop iteration.
	 */
	unsigned char keyword_a = 0, keyword_b = 0;

	if ((fp = fopen(table, "r")) == NULL) {
		errno = EACCES;
		return (-1);
	}

	while (fgets(buf, LONG_BIT, fp) != NULL) {
		chopbuf(buf);

		if (tableline == 3)
			tableline = 0;

		switch (tableline) {
		case 0:
			break;
		case 1:
			point[0] = buf[0]; point[1] = buf[1]; point[2] = '\0';
			keyword_a =
			    (unsigned char)strtol((const char *)point,
			    (char **)NULL, 16);

			point[0] = buf[2]; point[1] = buf[3]; point[2] = '\0';
			keyword_b =
			    (unsigned char)strtol((const char *)point,
			    (char **)NULL, 16);

			if (RegistHashTable(keyword_a, keyword_b,
			    hashtable) == -1) {
				/* Do not leak the stream on this path. */
				(void) fclose(fp);
				errno = EACCES;
				return (-1);
			}
			break;
		case 2:
			hash_score = atoi(buf);
			for (srdp = hashtable[Hash(keyword_a, keyword_b)];
			    srdp != NULL; srdp = srdp->nextsrd) {
				if ((srdp->keyword[0] == keyword_a) &&
				    (srdp->keyword[1] == keyword_b)) {
					srdp->score = hash_score;
					total_ent++;
					sum_of_score += hash_score;
					break;
				}
			}
			break;
		}
		tableline++;
	}
	(void) fclose(fp);

	if (total_ent > 0)
		*average = sum_of_score / (double)total_ent;
	else
		*average = 0.0;

	for (i = 0; i < HASHSIZE; i++) {
		for (srdp = hashtable[i]; srdp != NULL; srdp = srdp->nextsrd) {
			sum_of_deviation +=
			    ((double)srdp->score - *average) *
			    ((double)srdp->score - *average);
		}
	}

	if (total_ent > 1)
		*SD = sqrt(sum_of_deviation / (total_ent - 1));
	else
		*SD = 0.0;

	return (0);
}
|
542 |
||
543 |
void get_hash_name(char *hashfile, char *encoding) { |
|
544 |
strcpy(hashfile, roothash); |
|
545 |
strcat(hashfile, "\0"); |
|
546 |
strcat(hashfile, encoding); |
|
547 |
strcat(hashfile, "\0"); |
|
548 |
} |
|
549 |
||
550 |
/*
 * Regist_ASCII_ISO2022JP - register the outcome of IsAsciiOr2022_buf().
 *
 *   i == 0   register ASCII
 *   i == 1   register from_code
 *   i == -1  the check itself failed; errno is left as that call set it
 * Any other value registers nothing.
 *
 * Returns 0 on success, -1 on failure.
 */
int Regist_ASCII_ISO2022JP(int i, char *from_code, _auto_ef_t *root_autoef) {
	int rc = 0;

	if (i == -1) {
		/* errno is from IsAsciiOr2022_buf */
		return (-1);
	}

	if (i == 0) {
		rc = Regist_AUTOEF(ASCII, FULL, ASCII, root_autoef);
	} else if (i == 1) {
		rc = Regist_AUTOEF(from_code, FULL,
		    M_FromCodeToLang(from_code), root_autoef);
	}

	return ((rc == -1) ? -1 : 0);
}
|
568 |
||
636
e4ac77666de3
7115607 cmd/auto_ef compilation fails on b175a - wrong iconv args (backport of s11u1:fb701ba7adfd)
jenda
parents:
0
diff
changeset
|
569 |
int IsAsciiOr2022_buf(char *input_buf, int buf_size, |
0 | 570 |
char *from_encoding) { |
571 |
||
572 |
char *tbuf; |
|
573 |
iconv_t cd; |
|
574 |
char *convert; |
|
636
e4ac77666de3
7115607 cmd/auto_ef compilation fails on b175a - wrong iconv args (backport of s11u1:fb701ba7adfd)
jenda
parents:
0
diff
changeset
|
575 |
char *context; |
e4ac77666de3
7115607 cmd/auto_ef compilation fails on b175a - wrong iconv args (backport of s11u1:fb701ba7adfd)
jenda
parents:
0
diff
changeset
|
576 |
const char *p; |
0 | 577 |
size_t fsize, tsize, ret; |
578 |
int i; |
|
579 |
int tmp_bufsize = 0; |
|
580 |
||
581 |
if ((cd = iconv_open(UTF8, from_encoding)) == (iconv_t)-1) { |
|
582 |
/* Use iconv_open errno */ |
|
583 |
iconv_close(cd); |
|
584 |
return (-1); |
|
585 |
} |
|
586 |
||
587 |
errno = 0; |
|
588 |
context = &input_buf[0]; |
|
589 |
tmp_bufsize = buf_size; |
|
590 |
p = lengthbuf(input_buf, &tmp_bufsize); |
|
591 |
fsize = p - &input_buf[0]; |
|
592 |
tsize = fsize * 4; |
|
593 |
if ((tbuf = (char *) malloc(tsize)) == NULL) { |
|
594 |
errno = ENOMEM; |
|
595 |
iconv_close(cd); |
|
596 |
return (-1); |
|
597 |
} |
|
598 |
||
599 |
tbuf[0] = '\0'; |
|
600 |
convert = &tbuf[0]; |
|
601 |
||
602 |
ret = iconv(cd, &context, &fsize, &convert, &tsize); |
|
603 |
if (ret == (size_t) -1) { |
|
604 |
iconv_close(cd); |
|
605 |
errno = EINVAL; |
|
606 |
free(tbuf); |
|
607 |
return (-1); |
|
608 |
} |
|
609 |
||
610 |
*convert = '\0'; |
|
611 |
for (i = 0; i < buf_size; i++) { |
|
612 |
if (tbuf[i] == '\0') break; |
|
613 |
if ((unsigned char)tbuf[i] > 127) { |
|
614 |
iconv_close(cd); |
|
615 |
free(tbuf); |
|
616 |
return (1); |
|
617 |
} |
|
618 |
} |
|
619 |
iconv_close(cd); |
|
620 |
free(tbuf); |
|
621 |
return (0); |
|
622 |
} |
|
623 |
||
624 |
/*
 * Is2022KROrCN - register ISO-2022-KR, ISO-2022-CN or from_code.
 *
 * utf_flag == 0: scan the first buf_size bytes of inputp for a four-byte
 * ISO-2022-KR or ISO-2022-CN designation sequence and register the first
 * one found (ISOKR or ISOCN).
 * utf_flag == 1: register from_code as both encoding and language.
 *
 * Only four-byte windows that lie entirely inside the buffer are
 * examined, so no byte at or beyond inputp[buf_size] is read.
 *
 * Returns 0 when something was registered, -1 otherwise (no sequence
 * found, Regist_AUTOEF() failed, or utf_flag is neither 0 nor 1).
 */
int Is2022KROrCN(int utf_flag, char *from_code, char *inputp,
    size_t buf_size, _auto_ef_t *root_autoef) {

	size_t i;

	switch (utf_flag) {
	case 0:
		/* For ISO-2022-KR, CN/CN-EXT encoding */
		for (i = 0; i + 3 < buf_size; i++) {
			unsigned char fst, snd, trd, fth;

			fst = (unsigned char)inputp[i];
			snd = (unsigned char)inputp[i + 1];
			trd = (unsigned char)inputp[i + 2];
			fth = (unsigned char)inputp[i + 3];

			if (CheckISO2022KR(fst, snd, trd, fth) == 1) {
				if (Regist_AUTOEF(ISOKR, FULL,
				    M_FromCodeToLang(ISOKR),
				    root_autoef) != -1) {
					return (0);
				}
				/* Registration failed: stop scanning. */
				break;
			} else if (CheckISO2022CN(fst, snd, trd, fth) == 1) {
				if (Regist_AUTOEF(ISOCN, FULL,
				    M_FromCodeToLang(ISOCN),
				    root_autoef) != -1) {
					return (0);
				}
				/* Registration failed: stop scanning. */
				break;
			}
		}
		break;

	case 1: /* Not ISO-2022-KR, CN/CN-EXT is UTF-8 */
		if (Regist_AUTOEF(from_code, FULL, from_code,
		    root_autoef) != -1) {
			return (0);
		}
		break;
	}
	return (-1);
}
|
666 |
||
667 |
int Hash(unsigned char a, unsigned char b) { |
|
668 |
unsigned int hashval = 0; |
|
669 |
hashval = (unsigned int)a + (unsigned int)b; |
|
670 |
return (hashval % HASHSIZE); |
|
671 |
} |
|
672 |
||
673 |
||
674 |
/*
 * AddPairScore - if the byte pair (a, b) is in hashtable and its score is
 * not -1, add the pair's z-score to *total_score.
 *
 * Returns 1 when a score was added, 0 otherwise.
 */
static int
AddPairScore(unsigned char a, unsigned char b, srd *hashtable,
    double *total_score, double *average, double *SD)
{
	int score;

	if (FindKeyWord(a, b, hashtable) != TRUE)
		return (0);

	score = GetScore(a, b, hashtable);
	if (score == -1)
		return (0);

	*total_score += Calc_SD(score, average, SD);
	return (1);
}

/*
 * TotalScore_buf - accumulate the z-scores of every known byte pair in
 * input_buf into *total_score.
 *
 * The scan stops at the first NUL or after buf_size - 1 bytes.  For each
 * byte that is not below MSBFLAG, both the pair it forms with the next
 * byte and the pair it forms with the previous byte are looked up.  The
 * first byte of the buffer has no previous byte, so only its forward
 * pair is considered.
 *
 * *total_score is reset to 0 on entry.
 *
 * Returns 1 when at least one pair contributed a score, 0 otherwise.
 */
int TotalScore_buf(const char *input_buf, double *total_score,
    int buf_size, srd *hashtable, double *average, double *SD) {

	int i;
	int found = 0;

	*total_score = 0.0;

	for (i = 0; i < buf_size - 1; i++) {
		if (input_buf[i] == '\0')
			break;

		if ((unsigned)input_buf[i] < MSBFLAG)
			continue;

		/* Pair with the following byte. */
		if (input_buf[i + 1] != '\0' &&
		    AddPairScore(input_buf[i], input_buf[i + 1], hashtable,
		    total_score, average, SD))
			found = 1;

		/* Pair with the preceding byte; never look before the buffer. */
		if (i > 0 &&
		    AddPairScore(input_buf[i - 1], input_buf[i], hashtable,
		    total_score, average, SD))
			found = 1;
	}
	return (found);
}
|
721 |
||
722 |
/*
 * FindKeyWord - report whether hashtable holds a record for the byte
 * pair (a, b).
 *
 * Returns TRUE when a matching record is on the pair's hash chain,
 * FALSE otherwise.
 */
int FindKeyWord(unsigned char a, unsigned char b, srd *hashtable) {
	srd cur = hashtable[Hash(a, b)];

	while (cur != NULL) {
		if (cur->keyword[0] == a && cur->keyword[1] == b)
			return (TRUE);
		cur = cur->nextsrd;
	}
	return (FALSE);
}
|
733 |
||
734 |
/*
 * GetScore - look up the score stored for the byte pair (a, b).
 *
 * Returns the score of the first matching record on the pair's hash
 * chain, or -1 when there is none.
 */
int GetScore(unsigned char a, unsigned char b, srd *hashtable) {
	srd cur = hashtable[Hash(a, b)];

	while (cur != NULL) {
		if (cur->keyword[0] == a && cur->keyword[1] == b)
			return (cur->score);
		cur = cur->nextsrd;
	}
	return (-1);
}
|
745 |
||
746 |
/*
 * Calc_SD - z-score of `score`: (score - *average) / *SD.
 *
 * When *SD is not a positive number (zero, or NaN from a degenerate
 * table) the z-score is undefined, and 0.0 is returned instead of
 * dividing by it.
 */
double Calc_SD(int score, double *average, double *SD) {
	/* Written as !(x > 0) so that a NaN deviation is caught as well. */
	if (!(*SD > 0.0))
		return (0.0);

	return (((double)score - *average) / (*SD));
}
|
753 |
||
754 |
/*
 * chopbuf - cut the string in buf at its first newline.
 *
 * The first '\n', if any, is overwritten with a NUL.
 *
 * Returns a pointer to the position where the string now ends: the
 * replaced newline, or the original terminator when there was none.
 */
char *chopbuf(char *buf) {
	char *cursor;

	for (cursor = buf; *cursor != '\0'; cursor++) {
		if (*cursor == '\n') {
			*cursor = '\0';
			break;
		}
	}

	return (cursor);
}
|
769 |
||
770 |
/*
 * FreeHashTable - free every record on every chain of hashtable and
 * reset all HASHSIZE slots to NULL.
 */
void FreeHashTable(srd *hashtable) {
	int slot;
	srd cur, next;

	for (slot = 0; slot < HASHSIZE; slot++) {
		cur = hashtable[slot];
		while (cur != NULL) {
			/* Pick up the successor before the node goes away. */
			next = cur->nextsrd;
			free(cur);
			cur = next;
		}
		hashtable[slot] = (srd)NULL;
	}
}
|
789 |
||
790 |
/*
 * CheckISO2022CN - test whether the four bytes a, b, c, d are one of the
 * escape sequences this detector treats as ISO-2022-CN:
 *
 *   1b 24 29 41 / 47 / 45
 *   1b 24 2a 48
 *   1b 24 2b 49 .. 4d
 *
 * Returns 1 on a match, 0 otherwise.
 */
int CheckISO2022CN(unsigned char a, unsigned char b,
    unsigned char c, unsigned char d)
{
	/* Every accepted sequence starts with 1b 24. */
	if (a != 0x1b || b != 0x24)
		return (0);

	switch (c) {
	case 0x29:
		return (d == 0x41 || d == 0x47 || d == 0x45);
	case 0x2a:
		return (d == 0x48);
	case 0x2b:
		return (d >= 0x49 && d <= 0x4d);
	default:
		return (0);
	}
}
|
808 |
||
809 |
/*
 * CheckISO2022KR - test whether the four bytes a, b, c, d are the
 * sequence 1b 24 29 43, which this detector treats as ISO-2022-KR.
 *
 * Returns 1 on a match, 0 otherwise.
 */
int CheckISO2022KR(unsigned char a, unsigned char b,
    unsigned char c, unsigned char d)
{
	if (a != 0x1b)
		return (0);
	if (b != 0x24)
		return (0);
	if (c != 0x29)
		return (0);

	return ((d == 0x43) ? 1 : 0);
}
|
818 |
||
819 |
void ThaiSpecificCheck(const char *input_buf, char *encoding, size_t buf_size) |
|
820 |
{ |
|
821 |
int i = 0; |
|
822 |
unsigned char a = 0; |
|
823 |
for (i=0; i < buf_size; i++) { |
|
824 |
if (input_buf[i] == '\0') break; |
|
825 |
a = (unsigned char) input_buf[i]; |
|
826 |
if (a == 0x80 || a == 0x85 || a == 0x91 || |
|
827 |
a == 0x92 || a == 0x93 || a == 0x94 || |
|
828 |
a == 0x95 || a == 0x96 || a == 0x97) { |
|
829 |
strlcpy(encoding, CP874, ENCODING_LENGTH); |
|
830 |
return; |
|
831 |
} |
|
832 |
} |
|
833 |
||
834 |
strlcpy(encoding, TIS620, ENCODING_LENGTH); |
|
835 |
} |