From 3a569a2f00bf172cddfd567149774ee808a2a242 Mon Sep 17 00:00:00 2001 From: nash Date: Wed, 30 Mar 2011 19:50:51 +0000 Subject: Create branch for 1.6.2 git-svn-id: http://svn.us.apache.org/repos/asf/tuscany@1087059 13f79535-47bb-0310-9956-ffa450edef68 --- .../sca/domain/search/impl/NamingTokenizer.java | 149 +++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java (limited to 'sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java') diff --git a/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java b/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java new file mode 100644 index 0000000000..280039a67e --- /dev/null +++ b/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.tuscany.sca.domain.search.impl;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tokenizer that splits identifier-like text into lower-cased word tokens.
 * <p>
 * Splitting rules, as implemented by {@link #next(Token)}:
 * <ul>
 * <li>Any character that is neither a digit nor an ASCII letter is a
 * separator.</li>
 * <li>A run of digits forms its own token; a digit following letters, or a
 * letter following digits, starts a new token.</li>
 * <li>An upper-case letter following a lower-case letter starts a new token
 * ({@code camelCase} gives {@code camel}, {@code case}).</li>
 * <li>A lower-case letter following two or more upper-case letters makes the
 * last upper-case letter the start of the next token ({@code HTTPServer}
 * gives {@code http}, {@code server}).</li>
 * <li>Tokens are cut at {@code MAX_WORD_LEN} characters.</li>
 * </ul>
 *
 * @version $Rev$ $Date$
 */
public class NamingTokenizer extends Tokenizer {

    private static final int MAX_WORD_LEN = 255;
    private static final int IO_BUFFER_SIZE = 4096;

    /** Position in the character stream of {@code ioBuffer[0]}. */
    private int offset = 0;

    /** Index in {@code ioBuffer} of the next character to consume. */
    private int bufferIndex = 0;

    /** Number of valid characters currently held in {@code ioBuffer}. */
    private int dataLen = 0;

    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    public NamingTokenizer(Reader reader) {
        super(reader);
    }

    /**
     * Reads the next token from the input into {@code reusableToken}.
     *
     * @param reusableToken token instance to fill; must not be {@code null}
     * @return {@code reusableToken} holding the lower-cased term and its
     *         start/end offsets, or {@code null} when the input is exhausted
     * @throws IOException if reading from the underlying reader fails
     */
    @Override
    public Token next(Token reusableToken) throws IOException {
        assert reusableToken != null;
        reusableToken.clear();
        int length = 0;
        int start = bufferIndex; // always overwritten before a token is returned
        char[] buffer = reusableToken.termBuffer();

        boolean lowercaseCharFound = false;
        boolean digitFound = false;

        while (true) {

            if (bufferIndex >= dataLen) {
                // While a token is in progress and no lower-case letter has
                // been seen, a later lower-case letter may rewind by two
                // positions, so the previous character must stay available.
                final boolean keepLastChar = length > 0 && !lowercaseCharFound;

                if (!refill(keepLastChar)) {
                    if (length > 0) {
                        break; // emit the token collected before end of input
                    }
                    return null;
                }
            }

            final char c = ioBuffer[bufferIndex++];
            boolean breakChar = true;

            if (Character.isDigit(c)) {

                if (digitFound || length == 0) {
                    breakChar = false;
                    digitFound = true;

                } else {
                    // digit after letters: push it back for the next token
                    bufferIndex--;
                }

                // TODO: normalize accent, it does not index accents for now
            } else if (isAsciiLetter(c)) {

                if (digitFound) {
                    // letter after digits: push it back for the next token
                    bufferIndex--;

                } else if (Character.isLowerCase(c)) {

                    if (lowercaseCharFound || length <= 1) {
                        lowercaseCharFound = true;
                        breakChar = false;

                    } else {
                        // lower-case after several upper-case letters: the
                        // last upper-case letter begins the next token, so
                        // drop it from this token and rewind to it
                        length--;
                        bufferIndex -= 2;
                    }

                } else if (!lowercaseCharFound) {
                    // upper-case letter in an all-upper-case run
                    breakChar = false;

                } else {
                    // upper-case after lower-case: push it back
                    bufferIndex--;
                }

            }

            if (!breakChar) {

                if (length == 0) { // start of token
                    start = offset + bufferIndex - 1;
                } else if (length == buffer.length) {
                    buffer = reusableToken.resizeTermBuffer(1 + length);
                }

                buffer[length++] = Character.toLowerCase(c); // buffer it, normalized

                if (length == MAX_WORD_LEN) { // buffer overflow!
                    break;
                }

            } else if (length > 0) { // at a break char with chars collected
                break; // return 'em
            }

        }

        reusableToken.setTermLength(length);
        reusableToken.setStartOffset(start);
        reusableToken.setEndOffset(start + length);

        return reusableToken;

    }

    /**
     * Loads more characters from the reader into {@code ioBuffer}.
     * <p>
     * When {@code keepLastChar} is set, the most recently consumed character
     * is moved to index 0 and new data is read after it, so that the caller
     * can still rewind onto it. {@code offset} is kept equal to the stream
     * position of {@code ioBuffer[0]}. At end of input no field is changed.
     *
     * @param keepLastChar whether to preserve the last consumed character;
     *        callers only pass {@code true} after consuming at least one
     *        character from the current buffer
     * @return {@code false} if the reader reported end of input
     * @throws IOException if the underlying read fails
     */
    private boolean refill(boolean keepLastChar) throws IOException {
        final int carried = keepLastChar ? 1 : 0;

        if (keepLastChar) {
            ioBuffer[0] = ioBuffer[bufferIndex - 1];
        }

        final int read = input.read(ioBuffer, carried, ioBuffer.length - carried);
        if (read == -1) {
            return false;
        }

        offset += dataLen - carried;
        dataLen = carried + read;
        bufferIndex = carried;
        return true;
    }

    /** Tells whether {@code c} is one of the ASCII letters A-Z or a-z. */
    private static boolean isAsciiLetter(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }
}