1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.commons.codec.language;
19
20 import org.apache.commons.codec.EncoderException;
21 import org.apache.commons.codec.StringEncoder;
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 public class Metaphone implements StringEncoder {
56
57
58
59
60 private static final String VOWELS = "AEIOU";
61
62
63
64
65 private static final String FRONTV = "EIY";
66
67
68
69
70 private static final String VARSON = "CSPTG";
71
72
73
74
75 private int maxCodeLen = 4;
76
77
78
79
80
81
82
83
84
85
86
87
88
89 @Override
90 public Object encode(final Object obj) throws EncoderException {
91 if (!(obj instanceof String)) {
92 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
93 }
94 return metaphone((String) obj);
95 }
96
97
98
99
100
101
102
103 @Override
104 public String encode(final String str) {
105 return metaphone(str);
106 }
107
108
109
110
111
112 public int getMaxCodeLen() { return this.maxCodeLen; }
113
114 private boolean isLastChar(final int wdsz, final int n) {
115 return n + 1 == wdsz;
116 }
117
118
119
120
121
122
123
124
125
126 public boolean isMetaphoneEqual(final String str1, final String str2) {
127 return metaphone(str1).equals(metaphone(str2));
128 }
129
130 private boolean isNextChar(final StringBuilder string, final int index, final char c) {
131 boolean matches = false;
132 if (index >= 0 && index < string.length() - 1) {
133 matches = string.charAt(index + 1) == c;
134 }
135 return matches;
136 }
137
138 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) {
139 boolean matches = false;
140 if (index > 0 && index < string.length()) {
141 matches = string.charAt(index - 1) == c;
142 }
143 return matches;
144 }
145
146 private boolean isVowel(final StringBuilder string, final int index) {
147 return VOWELS.indexOf(string.charAt(index)) >= 0;
148 }
149
150
151
152
153
154
155
156
157
158
159
160 public String metaphone(final String txt) {
161 boolean hard = false;
162 final int txtLength;
163 if (txt == null || (txtLength = txt.length()) == 0) {
164 return "";
165 }
166
167 if (txtLength == 1) {
168 return txt.toUpperCase(java.util.Locale.ENGLISH);
169 }
170
171 final char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
172
173 final StringBuilder local = new StringBuilder(40);
174 final StringBuilder code = new StringBuilder(10);
175
176 switch (inwd[0]) {
177 case 'K':
178 case 'G':
179 case 'P':
180 if (inwd[1] == 'N') {
181 local.append(inwd, 1, inwd.length - 1);
182 } else {
183 local.append(inwd);
184 }
185 break;
186 case 'A':
187 if (inwd[1] == 'E') {
188 local.append(inwd, 1, inwd.length - 1);
189 } else {
190 local.append(inwd);
191 }
192 break;
193 case 'W':
194 if (inwd[1] == 'R') {
195 local.append(inwd, 1, inwd.length - 1);
196 break;
197 }
198 if (inwd[1] == 'H') {
199 local.append(inwd, 1, inwd.length - 1);
200 local.setCharAt(0, 'W');
201 } else {
202 local.append(inwd);
203 }
204 break;
205 case 'X':
206 inwd[0] = 'S';
207 local.append(inwd);
208 break;
209 default:
210 local.append(inwd);
211 }
212
213 final int wdsz = local.length();
214 int n = 0;
215
216 while (code.length() < this.getMaxCodeLen() && n < wdsz) {
217 final char symb = local.charAt(n);
218
219 if (symb != 'C' && isPreviousChar(local, n, symb)) {
220 n++;
221 } else {
222 switch (symb) {
223 case 'A':
224 case 'E':
225 case 'I':
226 case 'O':
227 case 'U':
228 if (n == 0) {
229 code.append(symb);
230 }
231 break;
232 case 'B':
233 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) {
234 break;
235 }
236 code.append(symb);
237 break;
238 case 'C':
239
240 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
241 break;
242 }
243 if (regionMatch(local, n, "CIA")) {
244 code.append('X');
245 break;
246 }
247 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
248 code.append('S');
249 break;
250 }
251 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) {
252 code.append('K');
253 break;
254 }
255 if (isNextChar(local, n, 'H')) {
256 if (n == 0 && wdsz >= 3 && isVowel(local, 2)) {
257 code.append('K');
258 } else {
259 code.append('X');
260 }
261 } else {
262 code.append('K');
263 }
264 break;
265 case 'D':
266 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) {
267 code.append('J');
268 n += 2;
269 } else {
270 code.append('T');
271 }
272 break;
273 case 'G':
274 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) {
275 break;
276 }
277 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) {
278 break;
279 }
280 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) {
281 break;
282 }
283
284 hard = isPreviousChar(local, n, 'G');
285 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) {
286 code.append('J');
287 } else {
288 code.append('K');
289 }
290 break;
291 case 'H':
292 if (isLastChar(wdsz, n)) {
293 break;
294 }
295 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) {
296 break;
297 }
298 if (isVowel(local, n + 1)) {
299 code.append('H');
300 }
301 break;
302 case 'F':
303 case 'J':
304 case 'L':
305 case 'M':
306 case 'N':
307 case 'R':
308 code.append(symb);
309 break;
310 case 'K':
311 if (n > 0) {
312 if (!isPreviousChar(local, n, 'C')) {
313 code.append(symb);
314 }
315 } else {
316 code.append(symb);
317 }
318 break;
319 case 'P':
320 if (isNextChar(local, n, 'H')) {
321
322 code.append('F');
323 } else {
324 code.append(symb);
325 }
326 break;
327 case 'Q':
328 code.append('K');
329 break;
330 case 'S':
331 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) {
332 code.append('X');
333 } else {
334 code.append('S');
335 }
336 break;
337 case 'T':
338 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) {
339 code.append('X');
340 break;
341 }
342 if (regionMatch(local, n, "TCH")) {
343
344 break;
345 }
346
347 if (regionMatch(local, n, "TH")) {
348 code.append('0');
349 } else {
350 code.append('T');
351 }
352 break;
353 case 'V':
354 code.append('F');
355 break;
356 case 'W':
357 case 'Y':
358 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) {
359 code.append(symb);
360 }
361 break;
362 case 'X':
363 code.append('K');
364 code.append('S');
365 break;
366 case 'Z':
367 code.append('S');
368 break;
369 default:
370
371 break;
372 }
373 n++;
374 }
375 if (code.length() > this.getMaxCodeLen()) {
376 code.setLength(this.getMaxCodeLen());
377 }
378 }
379 return code.toString();
380 }
381
382 private boolean regionMatch(final StringBuilder string, final int index, final String test) {
383 boolean matches = false;
384 if (index >= 0 && index + test.length() - 1 < string.length()) {
385 final String substring = string.substring(index, index + test.length());
386 matches = substring.equals(test);
387 }
388 return matches;
389 }
390
391
392
393
394
395 public void setMaxCodeLen(final int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
396
397 }