From 3a569a2f00bf172cddfd567149774ee808a2a242 Mon Sep 17 00:00:00 2001
From: nash <nash@13f79535-47bb-0310-9956-ffa450edef68>
Date: Wed, 30 Mar 2011 19:50:51 +0000
Subject: Create branch for 1.6.2

git-svn-id: http://svn.us.apache.org/repos/asf/tuscany@1087059 13f79535-47bb-0310-9956-ffa450edef68
---
 .../sca/domain/search/impl/NamingTokenizer.java    | 149 +++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java

(limited to 'sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java')

diff --git a/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java b/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java
new file mode 100644
index 0000000000..280039a67e
--- /dev/null
+++ b/sca-java-1.x/branches/sca-java-1.6.2/modules/domain-search/src/main/java/org/apache/tuscany/sca/domain/search/impl/NamingTokenizer.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.    
+ */
+package org.apache.tuscany.sca.domain.search.impl;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * Lucene tokenizer that splits identifier-like names into lowercased tokens.
+ * Runs of ASCII letters break at camel-case boundaries ("XMLParser" yields
+ * "xml" and "parser"), runs of digits form their own tokens, and every other
+ * character acts as a separator.
+ *
+ * @version $Rev$ $Date$
+ */
+public class NamingTokenizer extends Tokenizer {
+
+    private int offset = 0, bufferIndex = 0, dataLen = 0;
+    private static final int MAX_WORD_LEN = 255;
+    private static final int IO_BUFFER_SIZE = 4096;
+    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+    public NamingTokenizer(Reader reader) {
+        super(reader);
+    }
+
+    /**
+     * Fills <code>reusableToken</code> with the next token of the input.
+     *
+     * @param reusableToken token to clear and fill; must not be null
+     * @return <code>reusableToken</code>, or null once the input is exhausted
+     * @throws IOException if reading from the underlying reader fails
+     */
+    @Override
+    public Token next(Token reusableToken) throws IOException {
+        assert reusableToken != null;
+        reusableToken.clear();
+        int length = 0;
+        int start = bufferIndex;
+        char[] buffer = reusableToken.termBuffer();
+
+        boolean lowercaseCharFound = false;
+        boolean digitFound = false;
+
+        while (true) {
+            if (bufferIndex >= dataLen) {
+                // A token that has no lowercase letter yet may still hand its
+                // last character to the next token (see the push back below),
+                // so that single character is carried into slot 0 of the
+                // refilled buffer.
+                final int carry = (lowercaseCharFound || length == 0) ? 0 : 1;
+                if (carry == 1) {
+                    ioBuffer[0] = ioBuffer[bufferIndex - 1];
+                }
+
+                // offset is the stream position of ioBuffer[0]; the carried
+                // character was already counted, so it is subtracted again.
+                offset += dataLen - carry;
+
+                dataLen = input.read(ioBuffer, carry, ioBuffer.length - carry);
+                if (dataLen == -1) {
+                    if (length > 0)
+                        break;
+                    else
+                        return null;
+                }
+                bufferIndex = carry;
+                dataLen += carry;
+            }
+
+            final char c = ioBuffer[bufferIndex++];
+            boolean breakChar = true;
+
+            if (Character.isDigit(c)) {
+                if (digitFound || length == 0) {
+                    breakChar = false;
+                    digitFound = true;
+                } else {
+                    // digit after letters: un-read it, it starts a new token
+                    bufferIndex--;
+                }
+
+                // TODO: normalize accent, it does not index accents for now
+            } else if (c >= 65 && c <= 90 || c >= 97 && c <= 122) {
+                if (digitFound) {
+                    // letter after digits: un-read it, it starts a new token
+                    bufferIndex--;
+                } else if (Character.isLowerCase(c)) {
+                    if (!(lowercaseCharFound || length <= 1)) {
+                        // lowercase after two or more uppercase letters: push
+                        // back this letter and the preceding uppercase one
+                        length--;
+                        bufferIndex -= 2;
+                    } else {
+                        lowercaseCharFound = true;
+                        breakChar = false;
+                    }
+
+                } else if (!lowercaseCharFound) { // && uppercase
+                    breakChar = false;
+
+                } else {
+                    // uppercase after lowercase: un-read, starts a new token
+                    bufferIndex--;
+                }
+            }
+
+            if (!breakChar) {
+                if (length == 0) // start of token
+                    start = offset + bufferIndex - 1;
+                else if (length == buffer.length)
+                    buffer = reusableToken.resizeTermBuffer(1 + length);
+
+                buffer[length++] = Character.toLowerCase(c); // buffer it,
+                                                             // normalized
+
+                if (length == MAX_WORD_LEN) // buffer overflow!
+                    break;
+            } else if (length > 0) {// at non-Letter w/ chars
+                break; // return 'em
+            }
+        }
+
+        reusableToken.setTermLength(length);
+        reusableToken.setStartOffset(start);
+        reusableToken.setEndOffset(start + length);
+
+        return reusableToken;
+    }
+}
-- 
cgit v1.2.3