diff options
Diffstat (limited to '')
-rw-r--r-- | sca-java-1.x/tags/1.6.2-RC1/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/DomainPathAnalyzer.java | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/sca-java-1.x/tags/1.6.2-RC1/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/DomainPathAnalyzer.java b/sca-java-1.x/tags/1.6.2-RC1/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/DomainPathAnalyzer.java new file mode 100644 index 0000000000..c0b5bcb218 --- /dev/null +++ b/sca-java-1.x/tags/1.6.2-RC1/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/DomainPathAnalyzer.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.tuscany.sca.domain.search.impl; + +import java.io.IOException; +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +/** + * + * @version $Rev$ $Date$ + */ +public class DomainPathAnalyzer extends Analyzer { + + final public static char PATH_START = '\u0001'; + + final public static char PATH_SEPARATOR = '\u0002'; + + final public static char TYPE_SEPARATOR = '\u0003'; + + final public static char URI_SEPARATOR = '\u0004'; + + final public static char ARCHIVE_SEPARATOR = '!'; + + static class DomainPathTokenizer extends Tokenizer { + + private int offset = 0, bufferIndex = 0, dataLen = 0; + private static final int MAX_WORD_LEN = 1024; + private static final int IO_BUFFER_SIZE = 4096; + private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; + private boolean typeCharFound = false; + private boolean uriCharFound = false; + + public DomainPathTokenizer(Reader reader) { + super(reader); + } + + @Override + public void reset() throws IOException { + super.reset(); + + typeCharFound = false; + uriCharFound = false; + + } + + @Override + public void reset(Reader input) throws IOException { + super.reset(input); + + uriCharFound = false; + typeCharFound = false; + + } + + @Override + public Token next(Token reusableToken) throws IOException { + assert reusableToken != null; + reusableToken.clear(); + int length = 0; + int start = bufferIndex; + char[] buffer = reusableToken.termBuffer(); + + boolean lowercaseCharFound = false; + boolean digitFound = false; + + while (true) { + + if (bufferIndex >= dataLen) { + offset += dataLen; + int incr; + + if (lowercaseCharFound || length == 0) { + incr = 0; + + } else { + incr = 2; + ioBuffer[0] = ioBuffer[bufferIndex - 1]; + ioBuffer[1] = ioBuffer[bufferIndex]; + + } + + dataLen = input.read(ioBuffer, incr, ioBuffer.length - incr); + if (dataLen == -1) { + if (length > 0) + break; + else + return null; + } + bufferIndex = incr; + dataLen += incr; + + } + + final char c = ioBuffer[bufferIndex++]; + boolean breakChar = true; + boolean includeChar = false; + + if (c == PATH_START || c == PATH_SEPARATOR) { + + if (length == 0) { + includeChar = true; + + } else { + bufferIndex--; + } + + typeCharFound = false; + uriCharFound = false; + + } else if (c == TYPE_SEPARATOR && !typeCharFound || c == URI_SEPARATOR && !uriCharFound) { + length = 0; + breakChar = false; + lowercaseCharFound = false; + digitFound = false; + + } else { + + if (Character.isDigit(c)) { + + if (digitFound || length == 0) { + breakChar = false; + digitFound = true; + + } else { + bufferIndex--; + } + + // TODO: normalize accent, it does not index accents for + // now + } else if (c >= 65 && c <= 90 || c >= 97 && c <= 122) { + + if (digitFound) { + bufferIndex--; + + } else if (Character.isLowerCase(c)) { + + if (!(lowercaseCharFound || length <= 1)) { + length--; + bufferIndex -= 2; + + } else { + lowercaseCharFound = true; + breakChar = false; + + } + + } else if (!lowercaseCharFound) { // && uppercase + breakChar = false; + + } else { + bufferIndex--; + } + + } + + } + + if (!breakChar || includeChar) { + + if (length == 0) // start of token + start = offset + bufferIndex - 1; + else if (length == buffer.length) + buffer = reusableToken.resizeTermBuffer(1 + length); + + if (c == TYPE_SEPARATOR && !typeCharFound) { + typeCharFound = true; + + } else if (c == URI_SEPARATOR && !uriCharFound) { + typeCharFound = true; + + } else { + buffer[length++] = Character.toLowerCase(c); // buffer + // it, + // normalized + } + + if (length == MAX_WORD_LEN || (breakChar && length > 0)) // buffer + // overflow! + break; + + } else if (length > 0) {// at non-Letter w/ chars + + break; // return 'em + + } + + } + + reusableToken.setTermLength(length); + reusableToken.setStartOffset(start); + reusableToken.setEndOffset(start + length); + + return reusableToken; + + } + } + + public TokenStream tokenStream(String fieldName, Reader reader) { + return new DomainPathTokenizer(reader); + } + + public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { + Tokenizer tokenizer = (Tokenizer)getPreviousTokenStream(); + if (tokenizer == null) { + tokenizer = new DomainPathTokenizer(reader); + setPreviousTokenStream(tokenizer); + } else + tokenizer.reset(reader); + return tokenizer; + } + +} |