lucene-2.9.4/0000755000175000017500000000000011554106561013457 5ustar janpascaljanpascallucene-2.9.4/lib/0000755000175000017500000000000011554106562014226 5ustar janpascaljanpascallucene-2.9.4/contrib/0000755000175000017500000000000011554106561015117 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/0000755000175000017500000000000011554106561015701 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/lib/0000755000175000017500000000000011554106562016450 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/build.xml0000644000175000017500000000264511474320237017530 0ustar janpascaljanpascal Lucene Ant integration lucene-2.9.4/contrib/ant/src/0000755000175000017500000000000011474320237016467 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/0000755000175000017500000000000011554106561017411 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/org/0000755000175000017500000000000011474320237020177 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/org/apache/0000755000175000017500000000000011474320237021420 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/org/apache/lucene/0000755000175000017500000000000011474320237022673 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/0000755000175000017500000000000011554106561023456 5ustar janpascaljanpascallucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/DocumentHandler.java0000644000175000017500000000241311474320237027374 0ustar janpascaljanpascalpackage org.apache.lucene.ant; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Document; import java.io.File; /** * Allows a class to act as a Lucene document handler * *@since October 27, 2001 */ public interface DocumentHandler { /** * Gets the document attribute of the DocumentHandler object * *@param file Description of Parameter *@return The document value */ Document getDocument(File file) throws DocumentHandlerException; } lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/IndexTask.java0000644000175000017500000002772511474320237026227 0ustar janpascaljanpascalpackage org.apache.lucene.ant; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.StopAnalyzer; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TermQuery; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.DirectoryScanner; import org.apache.tools.ant.DynamicConfigurator; import org.apache.tools.ant.Project; import org.apache.tools.ant.Task; import org.apache.tools.ant.types.FileSet; import org.apache.tools.ant.types.EnumeratedAttribute; import org.apache.tools.ant.types.Resource; import org.apache.tools.ant.types.ResourceCollection; import org.apache.tools.ant.types.resources.FileResource; import java.io.File; import java.io.IOException; import java.util.Date; import java.util.Iterator; import java.util.Properties; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.util.ArrayList; import java.util.Vector; import java.text.ParseException; /** * Ant task to index files with Lucene * */ public class IndexTask extends Task { /** * resources */ protected Vector rcs = new Vector(); /** * overwrite index? */ private boolean overwrite = false; /** * index path */ private File indexDir; /** * document handler classname */ private String handlerClassName = FileExtensionDocumentHandler.class.getName(); /** * document handler instance */ private DocumentHandler handler; /** * */ private String analyzerClassName = StandardAnalyzer.class.getName(); /** * analyzer instance */ private Analyzer analyzer; /** * Lucene merge factor */ private int mergeFactor = 20; private HandlerConfig handlerConfig; private boolean useCompoundIndex = true; /** * Creates new instance */ public IndexTask() { } /** * Specifies the directory where the index will be stored */ public void setIndex(File indexDir) { this.indexDir = indexDir; } /** * Sets the mergeFactor attribute of the IndexTask object * *@param mergeFactor The new mergeFactor value */ public void setMergeFactor(int mergeFactor) { this.mergeFactor = mergeFactor; } /** * Sets the overwrite attribute of the IndexTask object * *@param overwrite The new overwrite value */ public void setOverwrite(boolean overwrite) { this.overwrite = overwrite; } /** * If creating a new index and this is set to true, the * index will be created in compound format. */ public void setUseCompoundIndex(boolean useCompoundIndex) { this.useCompoundIndex = useCompoundIndex; } /** * Sets the documentHandler attribute of the IndexTask object * *@param classname The new documentHandler value */ public void setDocumentHandler(String classname) { handlerClassName = classname; } /** * Sets the analyzer based on the builtin Lucene analyzer types. * * TODO: Enforce analyzer and analyzerClassName to be mutually exclusive */ public void setAnalyzer(AnalyzerType type) { analyzerClassName = type.getClassname(); } public void setAnalyzerClassName(String classname) { analyzerClassName = classname; } /** * Adds a set of files (nested fileset attribute). 
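   * <p>A hypothetical build-file snippet (attribute names follow this class's
   * setters; the task element name <code>index</code> is an assumption, so check the
   * accompanying antlib.xml taskdef before relying on it):
   * <pre>
   *   &lt;index index="build/index" overwrite="true" analyzer="standard"&gt;
   *     &lt;fileset dir="docs" includes="*.txt,*.html"/&gt;
   *   &lt;/index&gt;
   * </pre>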
* *@param set FileSet to be added */ public void addFileset(FileSet set) { add(set); } /** * Add a collection of files to copy. * @param res a resource collection to copy. * @since Ant 1.7 */ public void add(ResourceCollection res) { rcs.add(res); } /** * Sets custom properties for a configurable document handler. */ public void addConfig(HandlerConfig config) throws BuildException { if (handlerConfig != null) { throw new BuildException("Only one config element allowed"); } handlerConfig = config; } /** * Begins the indexing * *@exception BuildException If an error occurs indexing the * fileset */ public void execute() throws BuildException { // construct handler and analyzer dynamically try { Class clazz = Class.forName(handlerClassName); handler = (DocumentHandler) clazz.newInstance(); clazz = Class.forName(analyzerClassName); analyzer = (Analyzer) clazz.newInstance(); } catch (ClassNotFoundException cnfe) { throw new BuildException(cnfe); } catch (InstantiationException ie) { throw new BuildException(ie); } catch (IllegalAccessException iae) { throw new BuildException(iae); } log("Document handler = " + handler.getClass(), Project.MSG_VERBOSE); log("Analyzer = " + analyzer.getClass(), Project.MSG_VERBOSE); if (handler instanceof ConfigurableDocumentHandler) { ((ConfigurableDocumentHandler) handler).configure(handlerConfig.getProperties()); } try { indexDocs(); } catch (IOException e) { throw new BuildException(e); } } /** * Index the fileset. * *@exception IOException if Lucene I/O exception *TODO: refactor!!!!! */ private void indexDocs() throws IOException { Date start = new Date(); boolean create = overwrite; // If the index directory doesn't exist, // create it and force create mode if (indexDir.mkdirs() && !overwrite) { create = true; } Searcher searcher = null; boolean checkLastModified = false; if (!create) { try { searcher = new IndexSearcher(indexDir.getAbsolutePath()); checkLastModified = true; } catch (IOException ioe) { log("IOException: " + ioe.getMessage()); // Empty - ignore, which indicates to index all // documents } } log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE); IndexWriter writer = new IndexWriter(indexDir, analyzer, create, IndexWriter.MaxFieldLength.LIMITED); writer.setUseCompoundFile(useCompoundIndex); int totalFiles = 0; int totalIndexed = 0; int totalIgnored = 0; try { writer.setMergeFactor(mergeFactor); for (int i = 0; i < rcs.size(); i++) { ResourceCollection rc = (ResourceCollection) rcs.elementAt(i); if (rc.isFilesystemOnly()) { Iterator resources = rc.iterator(); while (resources.hasNext()) { Resource r = (Resource) resources.next(); if (!r.isExists() || !(r instanceof FileResource)) { continue; } totalFiles++; File file = ((FileResource) r).getFile(); if (!file.exists() || !file.canRead()) { throw new BuildException("File \"" + file.getAbsolutePath() + "\" does not exist or is not readable."); } boolean indexIt = true; if (checkLastModified) { Term pathTerm = new Term("path", file.getPath()); TermQuery query = new TermQuery(pathTerm); Hits hits = searcher.search(query); // if document is found, compare the // indexed last modified time with the // current file // - don't index if up to date if (hits.length() > 0) { Document doc = hits.doc(0); String indexModified = doc.get("modified").trim(); if (indexModified != null) { long lastModified = 0; try { lastModified = DateTools.stringToTime(indexModified); } catch (ParseException e) { // if modified time is not parsable, skip } if (lastModified == file.lastModified()) { // TODO: remove 
existing document indexIt = false; } } } } if (indexIt) { try { log("Indexing " + file.getPath(), Project.MSG_VERBOSE); Document doc = handler.getDocument(file); if (doc == null) { totalIgnored++; } else { // Add the path of the file as a field named "path". Use a Keyword field, so // that the index stores the path, and so that the path is searchable doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // Add the last modified date of the file a field named "modified". Use a // Keyword field, so that it's searchable, but so that no attempt is made // to tokenize the field into words. doc.add(new Field("modified", DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); totalIndexed++; } } catch (DocumentHandlerException e) { throw new BuildException(e); } } } // for j } // if (fs != null) } // for i writer.optimize(); } //try finally { // always make sure everything gets closed, // no matter how we exit. writer.close(); if (searcher != null) { searcher.close(); } } Date end = new Date(); log(totalIndexed + " out of " + totalFiles + " indexed (" + totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) + " milliseconds"); } public static class HandlerConfig implements DynamicConfigurator { Properties props = new Properties(); public void setDynamicAttribute(String attributeName, String value) throws BuildException { props.setProperty(attributeName, value); } public Object createDynamicElement(String elementName) throws BuildException { throw new BuildException("Sub elements not supported"); } public Properties getProperties() { return props; } } public static class AnalyzerType extends EnumeratedAttribute { private static Map analyzerLookup = new HashMap(); static { analyzerLookup.put("simple", SimpleAnalyzer.class.getName()); analyzerLookup.put("standard", StandardAnalyzer.class.getName()); analyzerLookup.put("stop", StopAnalyzer.class.getName()); analyzerLookup.put("whitespace", WhitespaceAnalyzer.class.getName()); } /** * @see EnumeratedAttribute#getValues */ public String[] getValues() { Set keys = analyzerLookup.keySet(); return (String[]) keys.toArray(new String[0]); } public String getClassname() { return (String) analyzerLookup.get(getValue()); } } } lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/ConfigurableDocumentHandler.java0000644000175000017500000000172111474320237031716 0ustar janpascaljanpascalpackage org.apache.lucene.ant; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.util.Properties; public interface ConfigurableDocumentHandler extends DocumentHandler { void configure(Properties props); } lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/package.html0000644000175000017500000000162511474320237025742 0ustar janpascaljanpascal Ant task to create Lucene indexes. lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/antlib.xml0000644000175000017500000000163611474320237025456 0ustar janpascaljanpascal lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/FileExtensionDocumentHandler.java0000644000175000017500000000407011474320237032072 0ustar janpascaljanpascalpackage org.apache.lucene.ant; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Document; import java.io.File; /** * A DocumentHandler implementation to delegate responsibility to * based on a files extension. Currently only .html and .txt * files are handled, other extensions ignored. * *@since October 28, 2001 *TODO: Implement dynamic document type lookup */ public class FileExtensionDocumentHandler implements DocumentHandler { /** * Gets the document attribute of the * FileExtensionDocumentHandler object * *@param file Description of * Parameter *@return The document value *@exception DocumentHandlerException Description of * Exception */ public Document getDocument(File file) throws DocumentHandlerException { Document doc = null; String name = file.getName(); try { if (name.endsWith(".txt")) { doc = TextDocument.Document(file); } if (name.endsWith(".html")) { doc = HtmlDocument.Document(file); } } catch (java.io.IOException e) { throw new DocumentHandlerException(e); } return doc; } } lucene-2.9.4/contrib/ant/src/java/org/apache/lucene/ant/TextDocument.java0000644000175000017500000000562211474320237026750 0ustar janpascaljanpascalpackage org.apache.lucene.ant; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.StringWriter; /** * A utility for making Lucene Documents from a File. * *@since December 6, 2001 *TODO: Fix JavaDoc comments here */ public class TextDocument { private String contents; /** * Constructor for the TextDocument object * *@param file Description of Parameter *@exception IOException Description of Exception */ public TextDocument(File file) throws IOException { BufferedReader br = new BufferedReader(new FileReader(file)); StringWriter sw = new StringWriter(); String line = br.readLine(); while (line != null) { sw.write(line); line = br.readLine(); } br.close(); contents = sw.toString(); sw.close(); } /** * Makes a document for a File.

* * The document has a single field: *

*/ public class PerfRunData { private Points points; // objects used during performance test run // directory, analyzer, docMaker - created at startup. // reader, writer, searcher - maintained by basic tasks. private Directory directory; private Analyzer analyzer; private DocMaker docMaker; // we use separate (identical) instances for each "read" task type, so each can iterate the quries separately. private HashMap readTaskQueryMaker; private Class qmkrClass; private IndexReader indexReader; private IndexSearcher indexSearcher; private IndexWriter indexWriter; private Config config; private long startTimeMillis; // constructor public PerfRunData (Config config) throws Exception { this.config = config; // analyzer (default is standard analyzer) analyzer = (Analyzer) Class.forName(config.get("analyzer", "org.apache.lucene.analysis.standard.StandardAnalyzer")).newInstance(); // doc maker docMaker = (DocMaker) Class.forName(config.get("doc.maker", "org.apache.lucene.benchmark.byTask.feeds.DocMaker")).newInstance(); docMaker.setConfig(config); // query makers readTaskQueryMaker = new HashMap(); qmkrClass = Class.forName(config.get("query.maker","org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker")); // index stuff reinit(false); // statistic points points = new Points(config); if (Boolean.valueOf(config.get("log.queries","false")).booleanValue()) { System.out.println("------------> queries:"); System.out.println(getQueryMaker(new SearchTask(this)).printQueries()); } } // clean old stuff, reopen public void reinit(boolean eraseIndex) throws Exception { // cleanup index if (indexWriter!=null) { indexWriter.close(); indexWriter = null; } if (indexReader!=null) { indexReader.close(); indexReader = null; } if (directory!=null) { directory.close(); } // directory (default is ram-dir). if ("FSDirectory".equals(config.get("directory","RAMDirectory"))) { File workDir = new File(config.get("work.dir","work")); File indexDir = new File(workDir,"index"); if (eraseIndex && indexDir.exists()) { FileUtils.fullyDelete(indexDir); } indexDir.mkdirs(); directory = FSDirectory.open(indexDir); } else { directory = new RAMDirectory(); } // inputs resetInputs(); // release unused stuff System.runFinalization(); System.gc(); // Re-init clock setStartTimeMillis(); } public long setStartTimeMillis() { startTimeMillis = System.currentTimeMillis(); return startTimeMillis; } /** * @return Start time in milliseconds */ public long getStartTimeMillis() { return startTimeMillis; } /** * @return Returns the points. */ public Points getPoints() { return points; } /** * @return Returns the directory. */ public Directory getDirectory() { return directory; } /** * @param directory The directory to set. */ public void setDirectory(Directory directory) { this.directory = directory; } /** * @return Returns the indexReader. */ public IndexReader getIndexReader() { return indexReader; } /** * @return Returns the indexSearcher. */ public IndexSearcher getIndexSearcher() { return indexSearcher; } /** * @param indexReader The indexReader to set. */ public void setIndexReader(IndexReader indexReader) { this.indexReader = indexReader; if (indexReader != null) { indexSearcher = new IndexSearcher(indexReader); } else { indexSearcher = null; } } /** * @return Returns the indexWriter. */ public IndexWriter getIndexWriter() { return indexWriter; } /** * @param indexWriter The indexWriter to set. */ public void setIndexWriter(IndexWriter indexWriter) { this.indexWriter = indexWriter; } /** * @return Returns the anlyzer. 
*/ public Analyzer getAnalyzer() { return analyzer; } public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } /** Returns the docMaker. */ public DocMaker getDocMaker() { return docMaker; } /** * @return Returns the config. */ public Config getConfig() { return config; } public void resetInputs() throws IOException { docMaker.resetInputs(); Iterator it = readTaskQueryMaker.values().iterator(); while (it.hasNext()) { ((QueryMaker) it.next()).resetInputs(); } } /** * @return Returns the queryMaker by read task type (class) */ synchronized public QueryMaker getQueryMaker(ReadTask readTask) { // mapping the query maker by task class allows extending/adding new search/read tasks // without needing to modify this class. Class readTaskClass = readTask.getClass(); QueryMaker qm = (QueryMaker) readTaskQueryMaker.get(readTaskClass); if (qm == null) { try { qm = (QueryMaker) qmkrClass.newInstance(); qm.setConfig(config); } catch (Exception e) { throw new RuntimeException(e); } readTaskQueryMaker.put(readTaskClass,qm); } return qm; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/0000755000175000017500000000000011554106561030360 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenIndexTask.java0000644000175000017500000000452011474320251033733 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LogMergePolicy; import java.io.IOException; /** * Open an index writer. *
Other side effects: index writer object in perfRunData is set. *
Relevant properties: merge.factor, max.buffered, * max.field.length, ram.flush.mb [default 0], autocommit * [default true]. */ public class OpenIndexTask extends PerfTask { public static final int DEFAULT_MAX_BUFFERED = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; public static final int DEFAULT_MAX_FIELD_LENGTH = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; public static final int DEFAULT_MERGE_PFACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; public static final double DEFAULT_RAM_FLUSH_MB = (int) IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB; public static final boolean DEFAULT_AUTO_COMMIT = false; public OpenIndexTask(PerfRunData runData) { super(runData); } public int doLogic() throws IOException { PerfRunData runData = getRunData(); Config config = runData.getConfig(); IndexWriter writer = new IndexWriter(runData.getDirectory(), config.get("autocommit", DEFAULT_AUTO_COMMIT), runData.getAnalyzer(), false); CreateIndexTask.setIndexWriterConfig(writer, config); runData.setIndexWriter(writer); return 1; } } ././@LongLink0000000000000000000000000000015400000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByNameRoundTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByNameRoundTa0000644000175000017500000000540711474320251034110 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; /** * Report all statistics grouped/aggregated by name and round. *
Other side effects: None. */ public class RepSumByNameRoundTask extends ReportTask { public RepSumByNameRoundTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { Report rp = reportSumByNameRound(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report Sum By (any) Name and Round ("+ rp.getSize()+" about "+rp.getReported()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } /** * Report statistics as a string, aggregate for tasks named the same, and from the same round. * @return the report */ protected Report reportSumByNameRound(List taskStats) { // aggregate by task name and round LinkedHashMap p2 = new LinkedHashMap(); int reported = 0; for (Iterator it = taskStats.iterator(); it.hasNext();) { TaskStats stat1 = (TaskStats) it.next(); if (stat1.getElapsed()>=0) { // consider only tasks that ended reported++; String name = stat1.getTask().getName(); String rname = stat1.getRound()+"."+name; // group by round TaskStats stat2 = (TaskStats) p2.get(rname); if (stat2 == null) { try { stat2 = (TaskStats) stat1.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } p2.put(rname,stat2); } else { stat2.add(stat1); } } } // now generate report from secondary list p2 return genPartialReport(reported, p2, taskStats.size()); } } ././@LongLink0000000000000000000000000000015200000000000011563 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteByPercentTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteByPercentTask0000644000175000017500000000520411474320251034140 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Random; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermDocs; /** * Deletes a percentage of documents from an index randomly * over the number of documents. The parameter, X, is in * percent. EG 50 means 1/2 of all documents will be * deleted. * *

NOTE: the param is an absolute percentage of * maxDoc(). This means if you delete 50%, and then delete * 50% again, the 2nd delete will do nothing. */ public class DeleteByPercentTask extends PerfTask { double percent; int numDeleted = 0; Random random = new Random(System.currentTimeMillis()); public DeleteByPercentTask(PerfRunData runData) { super(runData); } public void setup() throws Exception { super.setup(); } public void setParams(String params) { super.setParams(params); percent = Double.parseDouble(params)/100; } public boolean supportsParams() { return true; } public int doLogic() throws Exception { IndexReader r = getRunData().getIndexReader(); int maxDoc = r.maxDoc(); int numDeleted = 0; // percent is an absolute target: int numToDelete = ((int) (maxDoc * percent)) - r.numDeletedDocs(); if (numToDelete < 0) { r.undeleteAll(); numToDelete = (int) (maxDoc * percent); } while (numDeleted < numToDelete) { double delRate = ((double) (numToDelete-numDeleted))/r.numDocs(); TermDocs termDocs = r.termDocs(null); while (termDocs.next() && numDeleted < numToDelete) { if (random.nextDouble() <= delRate) { r.deleteDocument(termDocs.doc()); numDeleted++; } } termDocs.close(); } System.out.println("--> processed (delete) " + numDeleted + " docs"); return numDeleted; } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseReaderTask.jav0000644000175000017500000000272111474320251034072 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexReader; /** * Close index reader. *
Other side effects: index reader in perfRunData is nullified. *
This would cause read related tasks to reopen their own reader. */ public class CloseReaderTask extends PerfTask { public CloseReaderTask(PerfRunData runData) { super(runData); } public int doLogic() throws IOException { IndexReader reader= getRunData().getIndexReader(); if (reader!=null) { reader.close(); } getRunData().setIndexReader(null); return 1; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/AddDocTask.java0000644000175000017500000000441311474320251033161 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; /** * Add a document, optionally with of a certain size. *
Other side effects: none. *
Takes optional param: document size. */ public class AddDocTask extends PerfTask { public AddDocTask(PerfRunData runData) { super(runData); } private int docSize = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); if (docSize > 0) { doc = docMaker.makeDocument(docSize); } else { doc = docMaker.makeDocument(); } } public void tearDown() throws Exception { doc = null; super.tearDown(); } protected String getLogMessage(int recsCount) { return "added " + recsCount + " docs"; } public int doLogic() throws Exception { getRunData().getIndexWriter().addDocument(doc); return 1; } /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { super.setParams(params); docSize = (int) Float.parseFloat(params); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PrintReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PrintReaderTask.jav0000644000175000017500000000337711474320251034131 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.Directory; public class PrintReaderTask extends PerfTask { private String userData = null; public PrintReaderTask(PerfRunData runData) { super(runData); } public void setParams(String params) { super.setParams(params); userData = params; } public boolean supportsParams() { return true; } public int doLogic() throws Exception { Directory dir = getRunData().getDirectory(); Config config = getRunData().getConfig(); IndexReader r = null; if (userData == null) r = IndexReader.open(dir); else r = OpenReaderTask.openCommitPoint(userData, dir, config, true); System.out.println("--> numDocs:"+r.numDocs()+" dels:"+r.numDeletedDocs()); r.close(); return 1; } } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OpenReaderTask.java0000644000175000017500000000644611474320251034077 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.Map; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; import org.apache.lucene.index.IndexReader; import org.apache.lucene.store.Directory; /** * Open an index reader. *
Other side effects: index reader object in perfRunData is set. *
Optional params readOnly,commitUserData eg. OpenReader(false,commit1) */ public class OpenReaderTask extends PerfTask { public static final String USER_DATA = "userData"; private boolean readOnly = true; private String commitUserData = null; public OpenReaderTask(PerfRunData runData) { super(runData); } public int doLogic() throws IOException { Directory dir = getRunData().getDirectory(); Config config = getRunData().getConfig(); IndexReader r = null; if (commitUserData != null) { r = openCommitPoint(commitUserData, dir, config, readOnly); } else { IndexDeletionPolicy indexDeletionPolicy = CreateIndexTask.getIndexDeletionPolicy(config); r = IndexReader.open(dir, indexDeletionPolicy, readOnly); } getRunData().setIndexReader(r); return 1; } public static IndexReader openCommitPoint(String userData, Directory dir, Config config, boolean readOnly) throws IOException { IndexReader r = null; Collection commits = IndexReader.listCommits(dir); Iterator i = commits.iterator(); while (i.hasNext()) { IndexCommit ic = (IndexCommit)i.next(); Map map = ic.getUserData(); String ud = null; if (map != null) { ud = (String)map.get(USER_DATA); } if (ud != null && ud.equals(userData)) { IndexDeletionPolicy indexDeletionPolicy = CreateIndexTask.getIndexDeletionPolicy(config); r = IndexReader.open(ic, indexDeletionPolicy, readOnly); break; } } if (r == null) throw new IOException("cannot find commitPoint userData:"+userData); return r; } public void setParams(String params) { super.setParams(params); if (params != null) { String[] split = params.split(","); if (split.length > 0) { readOnly = Boolean.valueOf(split[0]).booleanValue(); } if (split.length > 1) { commitUserData = split[1]; } } } public boolean supportsParams() { return true; } } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTokensTask.java0000644000175000017500000001031511474320251034100 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; /** * Simple task to test performance of tokenizers. It just * creates a token stream for each field of the document and * read all tokens out of that stream. 
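 * <p>A sketch of how this task might be driven from a benchmark algorithm file;
 * the property names below are the ones read by PerfRunData, while the task name
 * and repeat count are illustrative assumptions:
 * <pre>
 *   analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
 *   doc.maker=org.apache.lucene.benchmark.byTask.feeds.DocMaker
 *   { ReadTokens } : 10000
 * </pre>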
*/ public class ReadTokensTask extends PerfTask { public ReadTokensTask(PerfRunData runData) { super(runData); } private int totalTokenCount = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); doc = docMaker.makeDocument(); } protected String getLogMessage(int recsCount) { return "read " + recsCount + " docs; " + totalTokenCount + " tokens"; } public void tearDown() throws Exception { doc = null; super.tearDown(); } Token token = new Token(); public int doLogic() throws Exception { List fields = doc.getFields(); final int numField = fields.size(); Analyzer analyzer = getRunData().getAnalyzer(); int tokenCount = 0; for(int i=0;i len) { s.getChars(upto, upto+len, c, off); upto += len; left -= len; return len; } else if (0 == left) { return -1; } else { s.getChars(upto, upto+left, c, off); int r = left; left = 0; upto = s.length(); return r; } } public void close() {}; } } ././@LongLink0000000000000000000000000000015400000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByPrefRoundTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByPrefRoundTa0000644000175000017500000000533411474320251034123 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; /** * Report all prefix matching statistics grouped/aggregated by name and round. *
Other side effects: None. */ public class RepSumByPrefRoundTask extends RepSumByPrefTask { public RepSumByPrefRoundTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { Report rp = reportSumByPrefixRound(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report sum by Prefix ("+prefix+") and Round ("+ rp.getSize()+" about "+rp.getReported()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } protected Report reportSumByPrefixRound(List taskStats) { // aggregate by task name and by round int reported = 0; LinkedHashMap p2 = new LinkedHashMap(); for (Iterator it = taskStats.iterator(); it.hasNext();) { TaskStats stat1 = (TaskStats) it.next(); if (stat1.getElapsed()>=0 && stat1.getTask().getName().startsWith(prefix)) { // only ended tasks with proper name reported++; String name = stat1.getTask().getName(); String rname = stat1.getRound()+"."+name; // group by round TaskStats stat2 = (TaskStats) p2.get(rname); if (stat2 == null) { try { stat2 = (TaskStats) stat1.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } p2.put(rname,stat2); } else { stat2.add(stat1); } } } // now generate report from secondary list p2 return genPartialReport(reported, p2, taskStats.size()); } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.jav0000644000175000017500000001363111474320251034077 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.IndexDeletionPolicy; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.MergeScheduler; import org.apache.lucene.index.MergePolicy; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; /** * Create an index.
* Other side effects: index writer object in perfRunData is set.
 * Relevant properties: merge.factor, max.buffered, * max.field.length, ram.flush.mb [default 0], autocommit * [default false, matching OpenIndexTask.DEFAULT_AUTO_COMMIT]. *

* This task also supports a "writer.info.stream" property with the following * values: *

 * <ul>
 * <li>SystemOut - sets {@link IndexWriter#setInfoStream(java.io.PrintStream)}
 * to {@link System#out}.</li>
 * <li>SystemErr - sets {@link IndexWriter#setInfoStream(java.io.PrintStream)}
 * to {@link System#err}.</li>
 * <li>&lt;file_name&gt; - attempts to create a file given that name and sets
 * {@link IndexWriter#setInfoStream(java.io.PrintStream)} to that file. If this
 * denotes an invalid file name, or some error occurs, an exception will be
 * thrown.</li>
 * </ul>
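 * <p>A minimal, illustrative excerpt of an algorithm file exercising these
 * properties (the property names are the ones read below in
 * setIndexWriterConfig; the values and the CreateIndex invocation line are
 * examples/assumptions, not defaults):
 * <pre>
 *   merge.factor=10
 *   max.buffered=1000
 *   ram.flush.mb=0
 *   compound=true
 *   writer.info.stream=SystemOut
 *   CreateIndex
 * </pre>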
*/ public class CreateIndexTask extends PerfTask { public CreateIndexTask(PerfRunData runData) { super(runData); } public static void setIndexWriterConfig(IndexWriter writer, Config config) throws IOException { final String mergeScheduler = config.get("merge.scheduler", "org.apache.lucene.index.ConcurrentMergeScheduler"); try { writer.setMergeScheduler((MergeScheduler) Class.forName(mergeScheduler).newInstance()); } catch (Exception e) { throw new RuntimeException("unable to instantiate class '" + mergeScheduler + "' as merge scheduler", e); } final String mergePolicy = config.get("merge.policy", "org.apache.lucene.index.LogByteSizeMergePolicy"); try { writer.setMergePolicy((MergePolicy) Class.forName(mergePolicy).getConstructor(new Class[] { IndexWriter.class }).newInstance(new Object[] { writer })); } catch (Exception e) { throw new RuntimeException("unable to instantiate class '" + mergePolicy + "' as merge policy", e); } writer.setUseCompoundFile(config.get("compound",true)); writer.setMergeFactor(config.get("merge.factor",OpenIndexTask.DEFAULT_MERGE_PFACTOR)); writer.setMaxFieldLength(config.get("max.field.length",OpenIndexTask.DEFAULT_MAX_FIELD_LENGTH)); final double ramBuffer = config.get("ram.flush.mb",OpenIndexTask.DEFAULT_RAM_FLUSH_MB); final int maxBuffered = config.get("max.buffered",OpenIndexTask.DEFAULT_MAX_BUFFERED); if (maxBuffered == IndexWriter.DISABLE_AUTO_FLUSH) { writer.setRAMBufferSizeMB(ramBuffer); writer.setMaxBufferedDocs(maxBuffered); } else { writer.setMaxBufferedDocs(maxBuffered); writer.setRAMBufferSizeMB(ramBuffer); } String infoStreamVal = config.get("writer.info.stream", null); if (infoStreamVal != null) { if (infoStreamVal.equals("SystemOut")) { writer.setInfoStream(System.out); } else if (infoStreamVal.equals("SystemErr")) { writer.setInfoStream(System.err); } else { File f = new File(infoStreamVal).getAbsoluteFile(); writer.setInfoStream(new PrintStream(new BufferedOutputStream(new FileOutputStream(f)))); } } } public static IndexDeletionPolicy getIndexDeletionPolicy(Config config) { String deletionPolicyName = config.get("deletion.policy", "org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy"); IndexDeletionPolicy indexDeletionPolicy = null; RuntimeException err = null; try { indexDeletionPolicy = ((IndexDeletionPolicy) Class.forName(deletionPolicyName).newInstance()); } catch (IllegalAccessException iae) { err = new RuntimeException("unable to instantiate class '" + deletionPolicyName + "' as IndexDeletionPolicy"); err.initCause(iae); } catch (InstantiationException ie) { err = new RuntimeException("unable to instantiate class '" + deletionPolicyName + "' as IndexDeletionPolicy"); err.initCause(ie); } catch (ClassNotFoundException cnfe) { err = new RuntimeException("unable to load class '" + deletionPolicyName + "' as IndexDeletionPolicy"); err.initCause(cnfe); } if (err != null) throw err; return indexDeletionPolicy; } public int doLogic() throws IOException { PerfRunData runData = getRunData(); Config config = runData.getConfig(); IndexDeletionPolicy indexDeletionPolicy = getIndexDeletionPolicy(config); IndexWriter writer = new IndexWriter(runData.getDirectory(), runData.getConfig().get("autocommit", OpenIndexTask.DEFAULT_AUTO_COMMIT), runData.getAnalyzer(), true, indexDeletionPolicy); setIndexWriterConfig(writer, config); runData.setIndexWriter(writer); return 1; } } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar 
rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ClearStatsTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ClearStatsTask.java0000644000175000017500000000247111474320251034112 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Clear statistics data. *
Other side effects: None. */ public class ClearStatsTask extends PerfTask { public ClearStatsTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { getRunData().getPoints().clearData(); return 0; } /* (non-Javadoc) * @see PerfTask#shouldNotRecordStats() */ protected boolean shouldNotRecordStats() { return true; } } ././@LongLink0000000000000000000000000000015100000000000011562 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchWithSortTask.0000644000175000017500000001064011474320251034111 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; /** * Does sort search on specified field. * */ public class SearchWithSortTask extends ReadTask { private boolean doScore = true; private boolean doMaxScore = true; private Sort sort; public SearchWithSortTask(PerfRunData runData) { super(runData); } /** * SortFields: field:type,field:type[,noscore][,nomaxscore] * * If noscore is present, then we turn off score tracking * in {@link org.apache.lucene.search.TopFieldCollector}. * If nomaxscore is present, then we turn off maxScore tracking * in {@link org.apache.lucene.search.TopFieldCollector}. 
* * name:string,page:int,subject:string * */ public void setParams(String sortField) { super.setParams(sortField); String[] fields = sortField.split(","); SortField[] sortFields = new SortField[fields.length]; int upto = 0; for (int i = 0; i < fields.length; i++) { String field = fields[i]; SortField sortField0; if (field.equals("doc")) { sortField0 = SortField.FIELD_DOC; } if (field.equals("score")) { sortField0 = SortField.FIELD_SCORE; } else if (field.equals("noscore")) { doScore = false; continue; } else if (field.equals("nomaxscore")) { doMaxScore = false; continue; } else { int index = field.lastIndexOf(":"); String fieldName; String typeString; if (index != -1) { fieldName = field.substring(0, index); typeString = field.substring(1+index, field.length()); } else { throw new RuntimeException("You must specify the sort type ie page:int,subject:string"); } int type = getType(typeString); sortField0 = new SortField(fieldName, type); } sortFields[upto++] = sortField0; } if (upto < sortFields.length) { SortField[] newSortFields = new SortField[upto]; System.arraycopy(sortFields, 0, newSortFields, 0, upto); sortFields = newSortFields; } this.sort = new Sort(sortFields); } private int getType(String typeString) { int type; if (typeString.equals("float")) { type = SortField.FLOAT; } else if (typeString.equals("double")) { type = SortField.DOUBLE; } else if (typeString.equals("byte")) { type = SortField.BYTE; } else if (typeString.equals("short")) { type = SortField.SHORT; } else if (typeString.equals("int")) { type = SortField.INT; } else if (typeString.equals("long")) { type = SortField.LONG; } else if (typeString.equals("string")) { type = SortField.STRING; } else if (typeString.equals("string_val")) { type = SortField.STRING_VAL; } else { throw new RuntimeException("Unrecognized sort field type " + typeString); } return type; } public boolean supportsParams() { return true; } public QueryMaker getQueryMaker() { return getRunData().getQueryMaker(this); } public boolean withRetrieve() { return false; } public boolean withSearch() { return true; } public boolean withTraverse() { return false; } public boolean withWarm() { return false; } public boolean withScore() { return doScore; } public boolean withMaxScore() { return doMaxScore; } public Sort getSort() { if (sort == null) { throw new IllegalStateException("No sort field was set"); } return sort; } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetInputsTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetInputsTask.jav0000644000175000017500000000267111474320251034173 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ /** * Reset inputs so that the test run would behave, input wise, * as if it just started. This affects e.g. the generation of docs and queries. */ public class ResetInputsTask extends PerfTask { public ResetInputsTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { getRunData().resetInputs(); return 0; } /* * (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#shouldNotRecordStats() */ protected boolean shouldNotRecordStats() { return true; } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CommitIndexTask.jav0000644000175000017500000000320411474320251034117 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashMap; import java.util.Map; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexWriter; /** * Commits the IndexWriter. * */ public class CommitIndexTask extends PerfTask { String commitUserData = null; public CommitIndexTask(PerfRunData runData) { super(runData); } public boolean supportsParams() { return true; } public void setParams(String params) { commitUserData = params; } public int doLogic() throws Exception { IndexWriter iw = getRunData().getIndexWriter(); if (iw != null) { if (commitUserData == null) iw.commit(); else { Map map = new HashMap(); map.put(OpenReaderTask.USER_DATA, commitUserData); iw.commit(map); } } return 1; } } ././@LongLink0000000000000000000000000000015200000000000011563 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetSystemSoftTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetSystemSoftTask0000644000175000017500000000256511474320251034254 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Reset all index and input data and call gc, does NOT erase index/dir, does NOT clear statistics. * This contains ResetInputs. *
Other side effects: writers/readers nullified, closed. * Index is NOT erased. * Directory is NOT erased. */ public class ResetSystemSoftTask extends ResetInputsTask { public ResetSystemSoftTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { getRunData().reinit(false); return 0; } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReopenReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReopenReaderTask.ja0000644000175000017500000000261711474320251034073 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexReader; /** * Reopens IndexReader and closes old IndexReader. * */ public class ReopenReaderTask extends PerfTask { public ReopenReaderTask(PerfRunData runData) { super(runData); } public int doLogic() throws IOException { IndexReader ir = getRunData().getIndexReader(); IndexReader or = ir; IndexReader nr = ir.reopen(); if(nr != or) { getRunData().setIndexReader(nr); or.close(); } return 1; } } ././@LongLink0000000000000000000000000000015300000000000011564 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetSystemEraseTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ResetSystemEraseTas0000644000175000017500000000257411474320251034225 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Reset all index and input data and call gc, erase index and dir, does NOT clear statistics. *
This contains ResetInputs. *
Other side effects: writers/readers nullified, deleted, closed. * Index is erased. * Directory is erased. */ public class ResetSystemEraseTask extends ResetSystemSoftTask { public ResetSystemEraseTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { getRunData().reinit(true); return 0; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/PerfTask.java0000644000175000017500000002421711474320251032743 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.text.NumberFormat; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Points; import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.Format; /** * An abstract task to be tested for performance.
 * Every performance task extends this class, and provides its own
 * {@link #doLogic()} method, which performs the actual task.<br>
 * Tasks performing some work that should be measured for the task, can override
 * {@link #setup()} and/or {@link #tearDown()} and place that work there.<br>
 * Relevant properties: <code>task.max.depth.log</code>.<br>
 * Also supports the following logging attributes:
 * <ul>
 * <li>log.step - specifies how often to log messages about the current running
 * task. Default is 1000 {@link #doLogic()} invocations. Set to -1 to disable
 * logging.
 * <li>log.step.[class Task Name] - specifies the same as 'log.step', only for a
 * particular task name. For example, log.step.AddDoc will be applied only for
 * {@link AddDocTask}, but not for {@link DeleteDocTask}. It's a way to control
 * per task logging settings. If you want to omit logging for any other task,
 * include log.step=-1. The syntax is "log.step." together with the Task's
 * 'short' name (i.e., without the 'Task' part).
 * </ul>
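 * <p>
 * For example, a benchmark algorithm/properties file might contain (illustrative
 * values, not taken from this source):
 * <pre>
 * log.step=2000
 * log.step.AddDoc=5000
 * log.step.DeleteDoc=-1
 * </pre>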
*/ public abstract class PerfTask implements Cloneable { static final int DEFAULT_LOG_STEP = 1000; private PerfRunData runData; // propeties that all tasks have private String name; private int depth = 0; protected int logStep; private int logStepCount = 0; private int maxDepthLogStart = 0; private boolean disableCounting = false; protected String params = null; protected static final String NEW_LINE = System.getProperty("line.separator"); /** Should not be used externally */ private PerfTask() { name = Format.simpleName(getClass()); if (name.endsWith("Task")) { name = name.substring(0, name.length() - 4); } } /** * @deprecated will be removed in 3.0. checks if there are any obsolete * settings, like doc.add.log.step and doc.delete.log.step and * alerts the user. */ private void checkObsoleteSettings(Config config) { if (config.get("doc.add.log.step", null) != null) { throw new RuntimeException("doc.add.log.step is not supported anymore. " + "Use log.step.AddDoc and refer to CHANGES to read on the recent " + "API changes done to Benchmark's DocMaker and Task-based logging."); } if (config.get("doc.delete.log.step", null) != null) { throw new RuntimeException("doc.delete.log.step is not supported anymore. " + "Use log.step.DeleteDoc and refer to CHANGES to read on the recent " + "API changes done to Benchmark's DocMaker and Task-based logging."); } } public PerfTask(PerfRunData runData) { this(); this.runData = runData; Config config = runData.getConfig(); this.maxDepthLogStart = config.get("task.max.depth.log",0); String logStepAtt = "log.step"; // TODO (1.5): call getClass().getSimpleName() instead. String taskName = getClass().getName(); int idx = taskName.lastIndexOf('.'); // To support test internal classes. when we move to getSimpleName, this can be removed. int idx2 = taskName.indexOf('$', idx); if (idx2 != -1) idx = idx2; String taskLogStepAtt = "log.step." + taskName.substring(idx + 1, taskName.length() - 4 /* w/o the 'Task' part */); if (config.get(taskLogStepAtt, null) != null) { logStepAtt = taskLogStepAtt; } // It's important to read this from Config, to support vals-by-round. logStep = config.get(logStepAtt, DEFAULT_LOG_STEP); // To avoid the check 'if (logStep > 0)' in tearDown(). This effectively // turns logging off. if (logStep <= 0) { logStep = Integer.MAX_VALUE; } checkObsoleteSettings(config); } protected Object clone() throws CloneNotSupportedException { // tasks having non primitive data structures should override this. // otherwise parallel running of a task sequence might not run correctly. return super.clone(); } public void close() throws Exception { } /** * Run the task, record statistics. * @return number of work items done by this task. */ public final int runAndMaybeStats(boolean reportStats) throws Exception { if (reportStats && depth <= maxDepthLogStart && !shouldNeverLogAtStart()) { System.out.println("------------> starting task: " + getName()); } if (!reportStats || shouldNotRecordStats()) { setup(); int count = doLogic(); count = disableCounting ? 0 : count; tearDown(); return count; } setup(); Points pnts = runData.getPoints(); TaskStats ts = pnts.markTaskStart(this,runData.getConfig().getRoundNumber()); int count = doLogic(); count = disableCounting ? 0 : count; pnts.markTaskEnd(ts, count); tearDown(); return count; } /** * Perform the task once (ignoring repetitions specification) * Return number of work items done by this task. * For indexing that can be number of docs added. * For warming that can be number of scanned items, etc. 
* @return number of work items done by this task. */ public abstract int doLogic() throws Exception; /** * @return Returns the name. */ public String getName() { if (params==null) { return name; } return new StringBuffer(name).append('(').append(params).append(')').toString(); } /** * @param name The name to set. */ protected void setName(String name) { this.name = name; } /** * @return Returns the run data. */ public PerfRunData getRunData() { return runData; } /** * @return Returns the depth. */ public int getDepth() { return depth; } /** * @param depth The depth to set. */ public void setDepth(int depth) { this.depth = depth; } // compute a blank string padding for printing this task indented by its depth String getPadding () { char c[] = new char[4*getDepth()]; for (int i = 0; i < c.length; i++) c[i] = ' '; return new String(c); } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { String padd = getPadding(); StringBuffer sb = new StringBuffer(padd); if (disableCounting) { sb.append('-'); } sb.append(getName()); return sb.toString(); } /** * @return Returns the maxDepthLogStart. */ int getMaxDepthLogStart() { return maxDepthLogStart; } protected String getLogMessage(int recsCount) { return "processed " + recsCount + " records"; } /** * Tasks that should never log at start can override this. * @return true if this task should never log when it start. */ protected boolean shouldNeverLogAtStart () { return false; } /** * Tasks that should not record statistics can override this. * @return true if this task should never record its statistics. */ protected boolean shouldNotRecordStats () { return false; } /** * Task setup work that should not be measured for that specific task. * By default it does nothing, but tasks can implement this, moving work from * doLogic() to this method. Only the work done in doLogicis measured for this task. * Notice that higher level (sequence) tasks containing this task would then * measure larger time than the sum of their contained tasks. * @throws Exception */ public void setup () throws Exception { } /** * Task tearDown work that should not be measured for that specific task. * By default it does nothing, but tasks can implement this, moving work from * doLogic() to this method. Only the work done in doLogicis measured for this task. * Notice that higher level (sequence) tasks containing this task would then * measure larger time than the sum of their contained tasks. */ public void tearDown() throws Exception { if (++logStepCount % logStep == 0) { double time = (System.currentTimeMillis() - runData.getStartTimeMillis()) / 1000.0; NumberFormat nf = NumberFormat.getInstance(); nf.setMaximumFractionDigits(2); System.out.println(nf.format(time) + " sec --> " + Thread.currentThread().getName() + " " + getLogMessage(logStepCount)); } } /** * Sub classes that supports parameters must override this method to return true. * @return true iff this task supports command line params. */ public boolean supportsParams () { return false; } /** * Set the params of this task. * @exception UnsupportedOperationException for tasks supporting command line parameters. */ public void setParams(String params) { if (!supportsParams()) { throw new UnsupportedOperationException(getName()+" does not support command line parameters."); } this.params = params; } /** * @return Returns the Params. */ public String getParams() { return params; } /** * Return true if counting is disabled for this task. 
*/ public boolean isDisableCounting() { return disableCounting; } /** * See {@link #isDisableCounting()} */ public void setDisableCounting(boolean disableCounting) { this.disableCounting = disableCounting; } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/FlushReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/FlushReaderTask.jav0000644000175000017500000000315011474320251034103 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.HashMap; import java.util.Map; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexReader; public class FlushReaderTask extends PerfTask { String userData = null; public FlushReaderTask(PerfRunData runData) { super(runData); } public boolean supportsParams() { return true; } public void setParams(String params) { super.setParams(params); userData = params; } public int doLogic() throws IOException { IndexReader reader = getRunData().getIndexReader(); if (userData != null) { Map map = new HashMap(); map.put(OpenReaderTask.USER_DATA, userData); reader.flush(map); } else { reader.flush(); } return 1; } } ././@LongLink0000000000000000000000000000015700000000000011570 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourceTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ConsumeContentSourc0000644000175000017500000000430511474320251034260 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.ContentSource; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.utils.Config; /** * Consumes a {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}. * Supports the following parameters: *
 * <ul>
 * <li>content.source - the content source to use. (mandatory)
 * </ul>
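 * <p>
 * For example (illustrative; any {@link org.apache.lucene.benchmark.byTask.feeds.ContentSource}
 * implementation available on the classpath can be named):
 * <pre>
 * content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
 * </pre>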
*/ public class ConsumeContentSourceTask extends PerfTask { private ContentSource source; private DocData dd = new DocData(); public ConsumeContentSourceTask(PerfRunData runData) { super(runData); Config config = runData.getConfig(); String sourceClass = config.get("content.source", null); if (sourceClass == null) { throw new IllegalArgumentException("content.source must be defined"); } try { source = (ContentSource) Class.forName(sourceClass).newInstance(); source.setConfig(config); source.resetInputs(); } catch (Exception e) { throw new RuntimeException(e); } } protected String getLogMessage(int recsCount) { return "read " + recsCount + " documents from the content source"; } public void close() throws Exception { source.close(); super.close(); } public int doLogic() throws Exception { dd = source.getNextDocData(dd); return 1; } } ././@LongLink0000000000000000000000000000015500000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NearRealtimeReaderT0000644000175000017500000001011511474320251034113 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.index.Term; /** * Spawns a BG thread that periodically (defaults to 3.0 * seconds, but accepts param in seconds) wakes up and asks * IndexWriter for a near real-time reader. Then runs a * single query (body: 1) sorted by docdate, and prints * time to reopen and time to run the search. * * NOTE: this is very experimental at this point, and * subject to change. It's also not generally usable, eg * you cannot change which query is executed. 
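 * <p>
 * In an algorithm file this task would typically be invoked by its short name with the
 * pause given in seconds, e.g. NearRealtimeReader(0.5) (illustrative usage, assuming the
 * standard task-name(params) .alg syntax).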
*/ public class NearRealtimeReaderTask extends PerfTask { ReopenThread t; float pauseSec = 3.0f; private static class ReopenThread extends Thread { final IndexWriter writer; final int pauseMsec; public volatile boolean done; ReopenThread(IndexWriter writer, float pauseSec) { this.writer = writer; this.pauseMsec = (int) (1000*pauseSec); setDaemon(true); } public void run() { IndexReader reader = null; final Query query = new TermQuery(new Term("body", "1")); final SortField sf = new SortField("docdate", SortField.LONG); final Sort sort = new Sort(sf); try { while(!done) { final long t0 = System.currentTimeMillis(); if (reader == null) { reader = writer.getReader(); } else { final IndexReader newReader = reader.reopen(); if (reader != newReader) { reader.close(); reader = newReader; } } final long t1 = System.currentTimeMillis(); final TopFieldDocs hits = new IndexSearcher(reader).search(query, null, 10, sort); final long t2 = System.currentTimeMillis(); System.out.println("nrt: open " + (t1-t0) + " msec; search " + (t2-t1) + " msec, " + hits.totalHits + " results; " + reader.numDocs() + " docs"); final long t4 = System.currentTimeMillis(); final int delay = (int) (pauseMsec - (t4-t0)); if (delay > 0) { try { Thread.sleep(delay); } catch (InterruptedException ie) { throw new RuntimeException(ie); } } } } catch (Exception e) { throw new RuntimeException(e); } } } public NearRealtimeReaderTask(PerfRunData runData) { super(runData); } public int doLogic() throws IOException { if (t == null) { IndexWriter w = getRunData().getIndexWriter(); t = new ReopenThread(w, pauseSec); t.start(); } return 1; } public void setParams(String params) { super.setParams(params); pauseSec = Float.parseFloat(params); } public boolean supportsParams() { return true; } // Close the thread public void close() throws InterruptedException { if (t != null) { t.done = true; t.join(); } } } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CloseIndexTask.java0000644000175000017500000000370011474320251034076 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.PrintStream; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexWriter; /** * Close index writer. *
Other side effects: index writer object in perfRunData is nullified. *
Takes optional param "doWait": if false, then close(false) is called. */ public class CloseIndexTask extends PerfTask { public CloseIndexTask(PerfRunData runData) { super(runData); } boolean doWait = true; public int doLogic() throws IOException { IndexWriter iw = getRunData().getIndexWriter(); if (iw != null) { // If infoStream was set to output to a file, close it. PrintStream infoStream = iw.getInfoStream(); if (infoStream != null && infoStream != System.out && infoStream != System.err) { infoStream.close(); } iw.close(doWait); getRunData().setIndexWriter(null); } return 1; } public void setParams(String params) { super.setParams(params); doWait = Boolean.valueOf(params).booleanValue(); } public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReadTask.java0000644000175000017500000002255011474320251032720 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.store.Directory; /** * Read index (abstract) task. * Sub classes implement withSearch(), withWarm(), withTraverse() and withRetrieve() * methods to configure the actual action. *

 * <p>
 * Note: All ReadTasks reuse the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
 * <p>
 * The <code>search.num.hits</code> config parameter sets
 * the top number of hits to collect during searching.
 * <p>
Other side effects: none. */ public abstract class ReadTask extends PerfTask { public ReadTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { int res = 0; boolean closeReader = false; // open reader or use existing one IndexReader ir = getRunData().getIndexReader(); if (ir == null) { Directory dir = getRunData().getDirectory(); ir = IndexReader.open(dir); closeReader = true; //res++; //this is confusing, comment it out } // optionally warm and add num docs traversed to count if (withWarm()) { Document doc = null; for (int m = 0; m < ir.maxDoc(); m++) { if (!ir.isDeleted(m)) { doc = ir.document(m); res += (doc == null ? 0 : 1); } } } if (withSearch()) { res++; final IndexSearcher searcher; if (closeReader) { searcher = new IndexSearcher(ir); } else { searcher = getRunData().getIndexSearcher(); } QueryMaker queryMaker = getQueryMaker(); Query q = queryMaker.makeQuery(); Sort sort = getSort(); TopDocs hits; final int numHits = numHits(); if (numHits > 0) { if (sort != null) { // TODO: change the following to create TFC with in/out-of order // according to whether the query's Scorer. TopFieldCollector collector = TopFieldCollector.create(sort, numHits, true, withScore(), withMaxScore(), false); searcher.search(q, collector); hits = collector.topDocs(); } else { hits = searcher.search(q, numHits); } //System.out.println("q=" + q + ":" + hits.totalHits + " total hits"); if (withTraverse()) { final ScoreDoc[] scoreDocs = hits.scoreDocs; int traversalSize = Math.min(scoreDocs.length, traversalSize()); if (traversalSize > 0) { boolean retrieve = withRetrieve(); int numHighlight = Math.min(numToHighlight(), scoreDocs.length); Analyzer analyzer = getRunData().getAnalyzer(); BenchmarkHighlighter highlighter = null; if (numHighlight > 0) { highlighter = getBenchmarkHighlighter(q); } for (int m = 0; m < traversalSize; m++) { int id = scoreDocs[m].doc; res++; if (retrieve) { Document document = retrieveDoc(ir, id); res += document != null ? 1 : 0; if (numHighlight > 0 && m < numHighlight) { Collection/**/ fieldsToHighlight = getFieldsToHighlight(document); for (Iterator iterator = fieldsToHighlight.iterator(); iterator.hasNext();) { String field = (String) iterator.next(); String text = document.get(field); res += highlighter.doHighlight(ir, id, field, document, analyzer, text); } } } } } } } searcher.close(); } if (closeReader) { ir.close(); } return res; } protected Document retrieveDoc(IndexReader ir, int id) throws IOException { return ir.document(id); } /** * Return query maker used for this task. */ public abstract QueryMaker getQueryMaker(); /** * Return true if search should be performed. */ public abstract boolean withSearch(); /** * Return true if warming should be performed. */ public abstract boolean withWarm(); /** * Return true if, with search, results should be traversed. */ public abstract boolean withTraverse(); /** Whether scores should be computed (only useful with * field sort) */ public boolean withScore() { return true; } /** Whether maxScores should be computed (only useful with * field sort) */ public boolean withMaxScore() { return true; } /** * Specify the number of hits to traverse. Tasks should override this if they want to restrict the number * of hits that are traversed when {@link #withTraverse()} is true. Must be greater than 0. *

* Read task calculates the traversal as: Math.min(hits.length(), traversalSize()) * * @return Integer.MAX_VALUE */ public int traversalSize() { return Integer.MAX_VALUE; } static final int DEFAULT_SEARCH_NUM_HITS = 10; private int numHits; public void setup() throws Exception { super.setup(); numHits = getRunData().getConfig().get("search.num.hits", DEFAULT_SEARCH_NUM_HITS); } /** * Specify the number of hits to retrieve. Tasks should override this if they want to restrict the number * of hits that are collected during searching. Must be greater than 0. * * @return 10 by default, or search.num.hits config if set. */ public int numHits() { return numHits; } /** * Return true if, with search & results traversing, docs should be retrieved. */ public abstract boolean withRetrieve(); /** * Set to the number of documents to highlight. * * @return The number of the results to highlight. O means no docs will be highlighted. */ public int numToHighlight() { return 0; } /** * @deprecated Use {@link #getBenchmarkHighlighter(Query)} */ final Highlighter getHighlighter(Query q) { // not called return null; } /** * Return an appropriate highlighter to be used with * highlighting tasks */ protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ return null; } /** * @return the maximum number of highlighter fragments * @deprecated Please define getBenchmarkHighlighter instead */ final int maxNumFragments(){ // not called -- we switched this method to final to // force any external subclasses to cutover to // getBenchmarkHighlighter instead return 10; } /** * * @return true if the highlighter should merge contiguous fragments * @deprecated Please define getBenchmarkHighlighter instead */ final boolean isMergeContiguousFragments(){ // not called -- we switched this method to final to // force any external subclasses to cutover to // getBenchmarkHighlighter instead return false; } /** * @deprecated Please define getBenchmarkHighlighter instead */ final int doHighlight(TokenStream ts, String text, Highlighter highlighter, boolean mergeContiguous, int maxFragments) throws IOException, InvalidTokenOffsetsException { // not called -- we switched this method to final to // force any external subclasses to cutover to // getBenchmarkHighlighter instead return 0; } protected Sort getSort() { return null; } /** * Define the fields to highlight. Base implementation returns all fields * @param document The Document * @return A Collection of Field names (Strings) */ protected Collection/**/ getFieldsToHighlight(Document document) { List/**/ fieldables = document.getFields(); Set/**/ result = new HashSet(fieldables.size()); for (Iterator iterator = fieldables.iterator(); iterator.hasNext();) { Fieldable fieldable = (Fieldable) iterator.next(); result.add(fieldable.name()); } return result; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/package.html0000644000175000017500000000201211474320251032627 0ustar janpascaljanpascal Extendable benchmark tasks. lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/UpdateDocTask.java0000644000175000017500000000505211474320251033713 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; /** * Update a document, using IndexWriter.updateDocument, * optionally of a certain size. *
Other side effects: none. *
Takes optional param: document size. */ public class UpdateDocTask extends PerfTask { public UpdateDocTask(PerfRunData runData) { super(runData); } private int docSize = 0; // volatile data passed between setup(), doLogic(), tearDown(). private Document doc = null; public void setup() throws Exception { super.setup(); DocMaker docMaker = getRunData().getDocMaker(); if (docSize > 0) { doc = docMaker.makeDocument(docSize); } else { doc = docMaker.makeDocument(); } } public void tearDown() throws Exception { doc = null; super.tearDown(); } public int doLogic() throws Exception { final String docID = doc.get(DocMaker.ID_FIELD); if (docID == null) { throw new IllegalStateException("document must define the docid field"); } getRunData().getIndexWriter().updateDocument(new Term(DocMaker.ID_FIELD, docID), doc); return 1; } protected String getLogMessage(int recsCount) { return "updated " + recsCount + " docs"; } /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { super.setParams(params); docSize = (int) Float.parseFloat(params); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } ././@LongLink0000000000000000000000000000015000000000000011561 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetTask.j0000644000175000017500000000277011474320251034074 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Search and Traverse and Retrieve docs task. * *

 * <p>
 * Note: This task reuses the reader if it is already open.
 * Otherwise a reader is opened at start and closed at the end.
 * <p>
 * Takes optional param: traversal size (otherwise all results are traversed).
 * <p>
 * Other side effects: counts additional 1 (record) for each traversed hit,
 * and 1 more for each retrieved (non null) document.
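 * <p>
 * For example, in an algorithm file this task might appear as SearchTravRet(10) to
 * traverse and retrieve only the top 10 hits (illustrative usage, assuming the standard
 * task-name(params) .alg syntax).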

*/ public class SearchTravRetTask extends SearchTravTask { public SearchTravRetTask(PerfRunData runData) { super(runData); } public boolean withRetrieve() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/OptimizeTask.java0000644000175000017500000000301511474320251033640 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.index.IndexWriter; /** * Optimize the index. *
Other side effects: none. */ public class OptimizeTask extends PerfTask { public OptimizeTask(PerfRunData runData) { super(runData); } int maxNumSegments = 1; public int doLogic() throws Exception { IndexWriter iw = getRunData().getIndexWriter(); iw.optimize(maxNumSegments); //System.out.println("optimize called"); return 1; } public void setParams(String params) { super.setParams(params); maxNumSegments = (int) Double.valueOf(params).intValue(); } public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/DeleteDocTask.java0000644000175000017500000000500211474320251033666 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Delete a document by docid. If no docid param is supplied, deletes doc with * id = last-deleted-doc + doc.delete.step. */ public class DeleteDocTask extends PerfTask { /** * Gap between ids of deleted docs, applies when no docid param is provided. */ public static final int DEFAULT_DOC_DELETE_STEP = 8; public DeleteDocTask(PerfRunData runData) { super(runData); } private int deleteStep = -1; private static int lastDeleted = -1; private int docid = -1; private boolean byStep = true; public int doLogic() throws Exception { getRunData().getIndexReader().deleteDocument(docid); lastDeleted = docid; return 1; // one work item done here } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#setup() */ public void setup() throws Exception { super.setup(); if (deleteStep<0) { deleteStep = getRunData().getConfig().get("doc.delete.step",DEFAULT_DOC_DELETE_STEP); } // set the docid to be deleted docid = (byStep ? lastDeleted + deleteStep : docid); } protected String getLogMessage(int recsCount) { return "deleted " + recsCount + " docs, last deleted: " + lastDeleted; } /** * Set the params (docid only) * @param params docid to delete, or -1 for deleting by delete gap settings. */ public void setParams(String params) { super.setParams(params); docid = (int) Float.parseFloat(params); byStep = (docid < 0); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTask.java0000644000175000017500000000301711474320251033247 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; /** * Search task. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. */ public class SearchTask extends ReadTask { public SearchTask(PerfRunData runData) { super(runData); } public boolean withRetrieve() { return false; } public boolean withSearch() { return true; } public boolean withTraverse() { return false; } public boolean withWarm() { return false; } public QueryMaker getQueryMaker() { return getRunData().getQueryMaker(this); } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/TaskSequence.java0000644000175000017500000002546211474320251033622 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.Iterator; import java.text.NumberFormat; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; /** * Sequence of parallel or sequential tasks. */ public class TaskSequence extends PerfTask { public static int REPEAT_EXHAUST = -2; private ArrayList tasks; private int repetitions = 1; private boolean parallel; private TaskSequence parent; private boolean letChildReport = true; private int rate = 0; private boolean perMin = false; // rate, if set, is, by default, be sec. private String seqName; private boolean exhausted = false; private boolean resetExhausted = false; private PerfTask[] tasksArray; private boolean anyExhaustibleTasks; private boolean collapsable = false; // to not collapse external sequence named in alg. private boolean fixedTime; // true if we run for fixed time private double runTimeSec; // how long to run for public TaskSequence (PerfRunData runData, String name, TaskSequence parent, boolean parallel) { super(runData); collapsable = (name == null); name = (name!=null ? name : (parallel ? "Par" : "Seq")); setName(name); setSequenceName(); this.parent = parent; this.parallel = parallel; tasks = new ArrayList(); } public void close() throws Exception { initTasksArray(); for(int i=0;i 0) { return doSerialTasksWithRate(); } initTasksArray(); int count = 0; final long t0 = System.currentTimeMillis(); final long runTime = (long) (runTimeSec*1000); for (int k=0; fixedTime || (repetitions==REPEAT_EXHAUST && !exhausted) || k runTime) { repetitions = k+1; break; } } return count; } private int doSerialTasksWithRate() throws Exception { initTasksArray(); long delayStep = (perMin ? 
60000 : 1000) /rate; long nextStartTime = System.currentTimeMillis(); int count = 0; for (int k=0; (repetitions==REPEAT_EXHAUST && !exhausted) || k 0) { //System.out.println("wait: "+waitMore+" for rate: "+ratePerMin+" (delayStep="+delayStep+")"); Thread.sleep(waitMore); } nextStartTime += delayStep; // this aims at avarage rate. try { count += task.runAndMaybeStats(letChildReport); if (anyExhaustibleTasks) updateExhausted(task); } catch (NoMoreDataException e) { exhausted = true; } } } return count; } // update state regarding exhaustion. private void updateExhausted(PerfTask task) { if (task instanceof ResetInputsTask) { exhausted = false; resetExhausted = true; } else if (task instanceof TaskSequence) { TaskSequence t = (TaskSequence) task; if (t.resetExhausted) { exhausted = false; resetExhausted = true; t.resetExhausted = false; } else { exhausted |= t.exhausted; } } } private int doParallelTasks() throws Exception { initTasksArray(); final int count [] = {0}; Thread t[] = new Thread [repetitions * tasks.size()]; // prepare threads int indx = 0; for (int k=0; k 0) { startlThreadsWithRate(t); return; } for (int i = 0; i < t.length; i++) { t[i].start(); } } // run threads with rate private void startlThreadsWithRate(Thread[] t) throws InterruptedException { long delayStep = (perMin ? 60000 : 1000) /rate; long nextStartTime = System.currentTimeMillis(); for (int i = 0; i < t.length; i++) { long waitMore = nextStartTime - System.currentTimeMillis(); if (waitMore > 0) { //System.out.println("thread wait: "+waitMore+" for rate: "+ratePerMin+" (delayStep="+delayStep+")"); Thread.sleep(waitMore); } nextStartTime += delayStep; // this aims at average rate of starting threads. t[i].start(); } } public void addTask(PerfTask task) { tasks.add(task); task.setDepth(getDepth()+1); } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { String padd = getPadding(); StringBuffer sb = new StringBuffer(super.toString()); sb.append(parallel ? " [" : " {"); sb.append(NEW_LINE); for (Iterator it = tasks.iterator(); it.hasNext();) { PerfTask task = (PerfTask) it.next(); sb.append(task.toString()); sb.append(NEW_LINE); } sb.append(padd); sb.append(!letChildReport ? ">" : (parallel ? "]" : "}")); if (fixedTime) { sb.append(" " + NumberFormat.getNumberInstance().format(runTimeSec) + "s"); } else if (repetitions>1) { sb.append(" * " + repetitions); } else if (repetitions==REPEAT_EXHAUST) { sb.append(" * EXHAUST"); } if (rate>0) { sb.append(", rate: " + rate+"/"+(perMin?"min":"sec")); } return sb.toString(); } /** * Execute child tasks in a way that they do not report their time separately. */ public void setNoChildReport() { letChildReport = false; for (Iterator it = tasks.iterator(); it.hasNext();) { PerfTask task = (PerfTask) it.next(); if (task instanceof TaskSequence) { ((TaskSequence)task).setNoChildReport(); } } } /** * Returns the rate per minute: how many operations should be performed in a minute. * If 0 this has no effect. * @return the rate per min: how many operations should be performed in a minute. */ public int getRate() { return (perMin ? rate : 60*rate); } /** * @param rate The rate to set. 
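 * @param perMin if true, the rate is interpreted per minute; otherwise per second.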
*/ public void setRate(int rate, boolean perMin) { this.rate = rate; this.perMin = perMin; setSequenceName(); } private void setSequenceName() { seqName = super.getName(); if (repetitions==REPEAT_EXHAUST) { seqName += "_Exhaust"; } else if (repetitions>1) { seqName += "_"+repetitions; } if (rate>0) { seqName += "_" + rate + (perMin?"/min":"/sec"); } if (parallel && seqName.toLowerCase().indexOf("par")<0) { seqName += "_Par"; } } public String getName() { return seqName; // override to include more info } /** * @return Returns the tasks. */ public ArrayList getTasks() { return tasks; } /* (non-Javadoc) * @see java.lang.Object#clone() */ protected Object clone() throws CloneNotSupportedException { TaskSequence res = (TaskSequence) super.clone(); res.tasks = new ArrayList(); for (int i = 0; i < tasks.size(); i++) { res.tasks.add(((PerfTask)tasks.get(i)).clone()); } return res; } /** * Return true if can be collapsed in case it is outermost sequence */ public boolean isCollapsable() { return collapsable; } } ././@LongLink0000000000000000000000000000015200000000000011563 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSelectByPrefTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSelectByPrefTask0000644000175000017500000000515211474320251034122 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Iterator; import java.util.List; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; /** * Report by-name-prefix statistics with no aggregations. *
Other side effects: None. */ public class RepSelectByPrefTask extends RepSumByPrefTask { public RepSelectByPrefTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { Report rp = reportSelectByPrefix(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report Select By Prefix ("+prefix+") ("+ rp.getSize()+" about "+rp.getReported()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } protected Report reportSelectByPrefix(List taskStats) { String longestOp = longestOp(taskStats.iterator()); boolean first = true; StringBuffer sb = new StringBuffer(); sb.append(tableTitle(longestOp)); sb.append(newline); int reported = 0; for (Iterator it = taskStats.iterator(); it.hasNext();) { TaskStats stat = (TaskStats) it.next(); if (stat.getElapsed()>=0 && stat.getTask().getName().startsWith(prefix)) { // only ended tasks with proper name reported++; if (!first) { sb.append(newline); } first = false; String line = taskReportLine(longestOp,stat); if (taskStats.size()>2 && reported%2==0) { line = line.replaceAll(" "," - "); } sb.append(line); } } String reptxt = (reported==0 ? "No Matching Entries Were Found!" : sb.toString()); return new Report(reptxt,reported,reported, taskStats.size()); } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.ja0000644000175000017500000001213611474320251034045 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; /** * A task which writes documents, one line per document. Each line is in the * following format: title <TAB> date <TAB> body. The output of this * task can be consumed by * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocMaker} and is intended * to save the IO overhead of opening a file per document to be indexed.
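 * <p>
 * A single output line therefore looks like the following (tab characters shown as \t;
 * values are illustrative, not taken from this source):
 * <pre>
 * Some Title\t20091204120000\tThe body text of the document ...
 * </pre>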
* Supports the following parameters: *

    *
  • line.file.out - the name of the file to write the output to. That * parameter is mandatory. NOTE: the file is re-created. *
  • bzip.compression - whether the output should be bzip-compressed. This is * recommended when the output file is expected to be large. (optional, default: * false). *
* NOTE: this class is not thread-safe and if used by multiple threads the * output is unspecified (as all will write to the same output file in a * non-synchronized way). */ public class WriteLineDocTask extends PerfTask { public final static char SEP = '\t'; private static final Matcher NORMALIZER = Pattern.compile("[\t\r\n]+").matcher(""); private int docSize = 0; private BufferedWriter lineFileOut = null; private DocMaker docMaker; public WriteLineDocTask(PerfRunData runData) throws Exception { super(runData); Config config = runData.getConfig(); String fileName = config.get("line.file.out", null); if (fileName == null) { throw new IllegalArgumentException("line.file.out must be set"); } OutputStream out = new FileOutputStream(fileName); boolean doBzipCompression = false; String doBZCompress = config.get("bzip.compression", null); if (doBZCompress != null) { // Property was set, use the value. doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue(); } else { // Property was not set, attempt to detect based on file's extension doBzipCompression = fileName.endsWith("bz2"); } if (doBzipCompression) { // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int) // and does not use the write(byte[]) version. This proved to speed the // compression process by 70% ! out = new BufferedOutputStream(out, 1 << 16); out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out); } lineFileOut = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16); docMaker = runData.getDocMaker(); } protected String getLogMessage(int recsCount) { return "Wrote " + recsCount + " line docs"; } public int doLogic() throws Exception { Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Field f = doc.getField(DocMaker.BODY_FIELD); String body = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : ""; f = doc.getField(DocMaker.TITLE_FIELD); String title = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : ""; if (body.length() > 0 || title.length() > 0) { f = doc.getField(DocMaker.DATE_FIELD); String date = f != null ? NORMALIZER.reset(f.stringValue()).replaceAll(" ") : ""; lineFileOut.write(title, 0, title.length()); lineFileOut.write(SEP); lineFileOut.write(date, 0, date.length()); lineFileOut.write(SEP); lineFileOut.write(body, 0, body.length()); lineFileOut.newLine(); } return 1; } public void close() throws Exception { lineFileOut.close(); super.close(); } /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ public void setParams(String params) { if (super.supportsParams()) { super.setParams(params); } docSize = (int) Float.parseFloat(params); } public boolean supportsParams() { return true; } } ././@LongLink0000000000000000000000000000017100000000000011564 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFieldSelectorTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetLoadFi0000644000175000017500000000465411474320251034103 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.SetBasedFieldSelector; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import java.util.StringTokenizer; import java.util.Set; import java.util.HashSet; import java.util.Collections; import java.io.IOException; /** * Search and Traverse and Retrieve docs task using a SetBasedFieldSelector. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. * *

Takes optional param: comma separated list of Fields to load.

* *

Other side effects: counts additional 1 (record) for each traversed hit, * and 1 more for each retrieved (non null) document.

*/ public class SearchTravRetLoadFieldSelectorTask extends SearchTravTask { protected FieldSelector fieldSelector; public SearchTravRetLoadFieldSelectorTask(PerfRunData runData) { super(runData); } public boolean withRetrieve() { return true; } protected Document retrieveDoc(IndexReader ir, int id) throws IOException { return ir.document(id, fieldSelector); } public void setParams(String params) { this.params = params; // cannot just call super.setParams(), b/c it's params differ. Set fieldsToLoad = new HashSet(); for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) { String s = tokenizer.nextToken(); fieldsToLoad.add(s); } fieldSelector = new SetBasedFieldSelector(fieldsToLoad, Collections.EMPTY_SET); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/ReportTask.java0000644000175000017500000001307311474320251033320 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import java.util.Iterator; import java.util.LinkedHashMap; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.benchmark.byTask.utils.Format; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Report (abstract) task - all report tasks extend this task. */ public abstract class ReportTask extends PerfTask { public ReportTask(PerfRunData runData) { super(runData); } /* (non-Javadoc) * @see PerfTask#shouldNeverLogAtStart() */ protected boolean shouldNeverLogAtStart() { return true; } /* (non-Javadoc) * @see PerfTask#shouldNotRecordStats() */ protected boolean shouldNotRecordStats() { return true; } /* * From here start the code used to generate the reports. * Subclasses would use this part to generate reports. */ protected static final String newline = System.getProperty("line.separator"); /** * Get a textual summary of the benchmark results, average from all test runs. */ protected static final String OP = "Operation "; protected static final String ROUND = " round"; protected static final String RUNCNT = " runCnt"; protected static final String RECCNT = " recsPerRun"; protected static final String RECSEC = " rec/s"; protected static final String ELAPSED = " elapsedSec"; protected static final String USEDMEM = " avgUsedMem"; protected static final String TOTMEM = " avgTotalMem"; protected static final String COLS[] = { RUNCNT, RECCNT, RECSEC, ELAPSED, USEDMEM, TOTMEM }; /** * Compute a title line for a report table * @param longestOp size of longest op name in the table * @return the table title line. 
*/ protected String tableTitle (String longestOp) { StringBuffer sb = new StringBuffer(); sb.append(Format.format(OP,longestOp)); sb.append(ROUND); sb.append(getRunData().getConfig().getColsNamesForValsByRound()); for (int i = 0; i < COLS.length; i++) { sb.append(COLS[i]); } return sb.toString(); } /** * find the longest op name out of completed tasks. * @param taskStats completed tasks to be considered. * @return the longest op name out of completed tasks. */ protected String longestOp(Iterator taskStats) { String longest = OP; while (taskStats.hasNext()) { TaskStats stat = (TaskStats) taskStats.next(); if (stat.getElapsed()>=0) { // consider only tasks that ended String name = stat.getTask().getName(); if (name.length() > longest.length()) { longest = name; } } } return longest; } /** * Compute a report line for the given task stat. * @param longestOp size of longest op name in the table. * @param stat task stat to be printed. * @return the report line. */ protected String taskReportLine(String longestOp, TaskStats stat) { PerfTask task = stat.getTask(); StringBuffer sb = new StringBuffer(); sb.append(Format.format(task.getName(), longestOp)); String round = (stat.getRound()>=0 ? ""+stat.getRound() : "-"); sb.append(Format.formatPaddLeft(round, ROUND)); sb.append(getRunData().getConfig().getColsValuesForValsByRound(stat.getRound())); sb.append(Format.format(stat.getNumRuns(), RUNCNT)); sb.append(Format.format(stat.getCount() / stat.getNumRuns(), RECCNT)); long elapsed = (stat.getElapsed()>0 ? stat.getElapsed() : 1); // assume at least 1ms sb.append(Format.format(2, (float) (stat.getCount() * 1000.0 / elapsed), RECSEC)); sb.append(Format.format(2, (float) stat.getElapsed() / 1000, ELAPSED)); sb.append(Format.format(0, (float) stat.getMaxUsedMem() / stat.getNumRuns(), USEDMEM)); sb.append(Format.format(0, (float) stat.getMaxTotMem() / stat.getNumRuns(), TOTMEM)); return sb.toString(); } protected Report genPartialReport(int reported, LinkedHashMap partOfTasks, int totalSize) { String longetOp = longestOp(partOfTasks.values().iterator()); boolean first = true; StringBuffer sb = new StringBuffer(); sb.append(tableTitle(longetOp)); sb.append(newline); int lineNum = 0; for (Iterator it = partOfTasks.values().iterator(); it.hasNext();) { TaskStats stat = (TaskStats) it.next(); if (!first) { sb.append(newline); } first = false; String line = taskReportLine(longetOp,stat); lineNum++; if (partOfTasks.size()>2 && lineNum%2==0) { line = line.replaceAll(" "," - "); } sb.append(line); } String reptxt = (reported==0 ? "No Matching Entries Were Found!" : sb.toString()); return new Report(reptxt,partOfTasks.size(),reported,totalSize); } } ././@LongLink0000000000000000000000000000016100000000000011563 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighli0000644000175000017500000001423311474320251034143 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.highlight.TokenSources; import java.util.Set; import java.util.Collection; import java.util.HashSet; import java.util.Collections; /** * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents. * * Uses the {@link org.apache.lucene.search.highlight.SimpleHTMLFormatter} for formatting. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. *

* *

Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]

*
    *
  • traversal size - The number of hits to traverse, otherwise all will be traversed
  • highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())
  • maxFrags - The maximum number of fragments to score by the highlighter
  • mergeContiguous - true if contiguous fragments should be merged.
  • fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)
* Example: *
"SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000
 * 
* * Documents must be stored in order for this task to work. Additionally, term vector positions can be used as well. * *

Other side effects: counts additional 1 (record) for each traversed hit, * and 1 more for each retrieved (non null) document and 1 for each fragment returned.

*/ public class SearchTravRetHighlightTask extends SearchTravTask { protected int numToHighlight = Integer.MAX_VALUE; protected boolean mergeContiguous; protected int maxFrags = 2; protected Set paramFields = Collections.EMPTY_SET; protected Highlighter highlighter; protected int maxDocCharsToAnalyze; public SearchTravRetHighlightTask(PerfRunData runData) { super(runData); } public void setup() throws Exception { super.setup(); //check to make sure either the doc is being stored PerfRunData data = getRunData(); if (data.getConfig().get("doc.stored", false) == false){ throw new Exception("doc.stored must be set to true"); } maxDocCharsToAnalyze = data.getConfig().get("highlighter.maxDocCharsToAnalyze", Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE); } public boolean withRetrieve() { return true; } public int numToHighlight() { return numToHighlight; } protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); highlighter.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); return new BenchmarkHighlighter(){ public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception { TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer); TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); return frag != null ? frag.length : 0; } }; } protected Collection/**/ getFieldsToHighlight(Document document) { Collection result = super.getFieldsToHighlight(document); //if stored is false, then result will be empty, in which case just get all the param fields if (paramFields.isEmpty() == false && result.isEmpty() == false) { result.retainAll(paramFields); } else { result = paramFields; } return result; } public void setParams(String params) { String [] splits = params.split(","); for (int i = 0; i < splits.length; i++) { if (splits[i].startsWith("size[") == true){ traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("highlight[") == true){ numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("maxFrags[") == true){ maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("mergeContiguous[") == true){ mergeContiguous = Boolean.valueOf(splits[i].substring("mergeContiguous[".length(),splits[i].length() - 1)).booleanValue(); } else if (splits[i].startsWith("fields[") == true){ paramFields = new HashSet(); String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); String [] fieldSplits = fieldNames.split(";"); for (int j = 0; j < fieldSplits.length; j++) { paramFields.add(fieldSplits[j]); } } } } }././@LongLink0000000000000000000000000000016700000000000011571 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVectorHighlightTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetVector0000644000175000017500000001356411474320251034207 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter; import org.apache.lucene.search.vectorhighlight.FieldQuery; import java.util.Set; import java.util.Collection; import java.util.HashSet; import java.util.Collections; /** * Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents by using FastVectorHighlighter. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. *

* *

Takes optional multivalued, comma separated param string as: size[<traversal size>],highlight[<int>],maxFrags[<int>],mergeContiguous[<boolean>],fields[name1;name2;...]

*
    *
  • traversal size - The number of hits to traverse, otherwise all will be traversed
  • highlight - The number of the hits to highlight. Will always be less than or equal to traversal size. Default is Integer.MAX_VALUE (i.e. hits.length())
  • maxFrags - The maximum number of fragments to score by the highlighter
  • fragSize - The length of fragments
  • fields - The fields to highlight. If not specified all fields will be highlighted (or at least attempted)
* Example: *
"SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(size[10],highlight[10],maxFrags[3],fields[body]) > : 1000
 * 
* * Fields must be stored and term vector offsets and positions in order must be true for this task to work. * *

Other side effects: counts additional 1 (record) for each traversed hit, * and 1 more for each retrieved (non null) document and 1 for each fragment returned.

*/ public class SearchTravRetVectorHighlightTask extends SearchTravTask { protected int numToHighlight = Integer.MAX_VALUE; protected int maxFrags = 2; protected int fragSize = 100; protected Set paramFields = Collections.EMPTY_SET; protected FastVectorHighlighter highlighter; public SearchTravRetVectorHighlightTask(PerfRunData runData) { super(runData); } public void setup() throws Exception { super.setup(); //check to make sure either the doc is being stored PerfRunData data = getRunData(); if (data.getConfig().get("doc.stored", false) == false){ throw new Exception("doc.stored must be set to true"); } if (data.getConfig().get("doc.term.vector.offsets", false) == false){ throw new Exception("doc.term.vector.offsets must be set to true"); } if (data.getConfig().get("doc.term.vector.positions", false) == false){ throw new Exception("doc.term.vector.positions must be set to true"); } } public boolean withRetrieve() { return true; } public int numToHighlight() { return numToHighlight; } protected BenchmarkHighlighter getBenchmarkHighlighter(Query q){ highlighter = new FastVectorHighlighter( false, false ); final FieldQuery fq = highlighter.getFieldQuery( q ); return new BenchmarkHighlighter(){ public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception { String[] fragments = highlighter.getBestFragments(fq, reader, doc, field, fragSize, maxFrags); return fragments != null ? fragments.length : 0; } }; } protected Collection/**/ getFieldsToHighlight(Document document) { Collection result = super.getFieldsToHighlight(document); //if stored is false, then result will be empty, in which case just get all the param fields if (paramFields.isEmpty() == false && result.isEmpty() == false) { result.retainAll(paramFields); } else { result = paramFields; } return result; } public void setParams(String params) { String [] splits = params.split(","); for (int i = 0; i < splits.length; i++) { if (splits[i].startsWith("size[") == true){ traversalSize = (int)Float.parseFloat(splits[i].substring("size[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("highlight[") == true){ numToHighlight = (int)Float.parseFloat(splits[i].substring("highlight[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("maxFrags[") == true){ maxFrags = (int)Float.parseFloat(splits[i].substring("maxFrags[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("fragSize[") == true){ fragSize = (int)Float.parseFloat(splits[i].substring("fragSize[".length(),splits[i].length() - 1)); } else if (splits[i].startsWith("fields[") == true){ paramFields = new HashSet(); String fieldNames = splits[i].substring("fields[".length(), splits[i].length() - 1); String [] fieldSplits = fieldNames.split(";"); for (int j = 0; j < fieldSplits.length; j++) { paramFields.add(fieldSplits[j]); } } } } } ././@LongLink0000000000000000000000000000015300000000000011564 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighter.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/BenchmarkHighlighte0000644000175000017500000000224611474320251034171 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; public abstract class BenchmarkHighlighter { public abstract int doHighlight( IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text ) throws Exception ; } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravTask.java0000644000175000017500000000415511474320251034110 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; /** * Search and Traverse task. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. *

* *

Takes optional param: traversal size (otherwise all results are traversed).

* *

Other side effects: counts additional 1 (record) for each traversed hit.

*/ public class SearchTravTask extends ReadTask { protected int traversalSize = Integer.MAX_VALUE; public SearchTravTask(PerfRunData runData) { super(runData); } public boolean withRetrieve() { return false; } public boolean withSearch() { return true; } public boolean withTraverse() { return true; } public boolean withWarm() { return false; } public QueryMaker getQueryMaker() { return getRunData().getQueryMaker(this); } public int traversalSize() { return traversalSize; } public void setParams(String params) { super.setParams(params); traversalSize = (int)Float.parseFloat(params); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByPrefTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByPrefTask.ja0000644000175000017500000000546611474320251034050 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; /** * Report by-name-prefix statistics aggregated by name. *
Other side effects: None. */ public class RepSumByPrefTask extends ReportTask { public RepSumByPrefTask(PerfRunData runData) { super(runData); } protected String prefix; public int doLogic() throws Exception { Report rp = reportSumByPrefix(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report Sum By Prefix ("+prefix+") ("+ rp.getSize()+" about "+rp.getReported()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } protected Report reportSumByPrefix (List taskStats) { // aggregate by task name int reported = 0; LinkedHashMap p2 = new LinkedHashMap(); for (Iterator it = taskStats.iterator(); it.hasNext();) { TaskStats stat1 = (TaskStats) it.next(); if (stat1.getElapsed()>=0 && stat1.getTask().getName().startsWith(prefix)) { // only ended tasks with proper name reported++; String name = stat1.getTask().getName(); TaskStats stat2 = (TaskStats) p2.get(name); if (stat2 == null) { try { stat2 = (TaskStats) stat1.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } p2.put(name,stat2); } else { stat2.add(stat1); } } } // now generate report from secondary list p2 return genPartialReport(reported, p2, taskStats.size()); } public void setPrefix(String prefix) { this.prefix = prefix; } /* (non-Javadoc) * @see PerfTask#toString() */ public String toString() { return super.toString()+" "+prefix; } } ././@LongLink0000000000000000000000000000014700000000000011567 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByNameTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepSumByNameTask.ja0000644000175000017500000000515311474320251034025 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; /** * Report all statistics aggregated by name. *
Other side effects: None. */ public class RepSumByNameTask extends ReportTask { public RepSumByNameTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { Report rp = reportSumByName(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report Sum By (any) Name ("+ rp.getSize()+" about "+rp.getReported()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } /** * Report statistics as a string, aggregate for tasks named the same. * @return the report */ protected Report reportSumByName(List taskStats) { // aggregate by task name int reported = 0; LinkedHashMap p2 = new LinkedHashMap(); for (Iterator it = taskStats.iterator(); it.hasNext();) { TaskStats stat1 = (TaskStats) it.next(); if (stat1.getElapsed()>=0) { // consider only tasks that ended reported++; String name = stat1.getTask().getName(); TaskStats stat2 = (TaskStats) p2.get(name); if (stat2 == null) { try { stat2 = (TaskStats) stat1.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } p2.put(name,stat2); } else { stat2.add(stat1); } } } // now generate report from secondary list p2 return genPartialReport(reported, p2, taskStats.size()); } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/RepAllTask.java0000644000175000017500000000502611474320251033223 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Iterator; import java.util.List; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.stats.Report; import org.apache.lucene.benchmark.byTask.stats.TaskStats; /** * Report all statistics with no aggregations. *
Other side effects: None. */ public class RepAllTask extends ReportTask { public RepAllTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { Report rp = reportAll(getRunData().getPoints().taskStats()); System.out.println(); System.out.println("------------> Report All ("+rp.getSize()+" out of "+rp.getOutOf()+")"); System.out.println(rp.getText()); System.out.println(); return 0; } /** * Report detailed statistics as a string * @return the report */ protected Report reportAll(List taskStats) { String longestOp = longestOp(taskStats.iterator()); boolean first = true; StringBuffer sb = new StringBuffer(); sb.append(tableTitle(longestOp)); sb.append(newline); int reported = 0; Iterator it = taskStats.iterator(); while (it.hasNext()) { TaskStats stat = (TaskStats) it.next(); if (stat.getElapsed()>=0) { // consider only tasks that ended if (!first) { sb.append(newline); } first = false; String line = taskReportLine(longestOp, stat); reported++; if (taskStats.size()>2 && reported%2==0) { line = line.replaceAll(" "," - "); } sb.append(line); } } String reptxt = (reported==0 ? "No Matching Entries Were Found!" : sb.toString()); return new Report(reptxt,reported,reported,taskStats.size()); } } ././@LongLink0000000000000000000000000000014600000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.jav0000644000175000017500000000566511474320251034153 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.benchmark.byTask.PerfRunData; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; /** * Create a new {@link org.apache.lucene.analysis.Analyzer} and set it it in the getRunData() for use by all future tasks. * */ public class NewAnalyzerTask extends PerfTask { private List/**/ analyzerClassNames; private int current; public NewAnalyzerTask(PerfRunData runData) { super(runData); analyzerClassNames = new ArrayList(); } public int doLogic() throws IOException { String className = null; try { if (current >= analyzerClassNames.size()) { current = 0; } className = (String) analyzerClassNames.get(current++); if (className == null || className.equals("")) { className = "org.apache.lucene.analysis.standard.StandardAnalyzer"; } if (className.indexOf(".") == -1 || className.startsWith("standard."))//there is no package name, assume o.a.l.analysis { className = "org.apache.lucene.analysis." 
+ className; } getRunData().setAnalyzer((Analyzer) Class.forName(className).newInstance()); System.out.println("Changed Analyzer to: " + className); } catch (Exception e) { throw new RuntimeException("Error creating Analyzer: " + className, e); } return 1; } /** * Set the params (analyzerClassName only), Comma-separate list of Analyzer class names. If the Analyzer lives in * org.apache.lucene.analysis, the name can be shortened by dropping the o.a.l.a part of the Fully Qualified Class Name. *

* Example Declaration: {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) > * @param params analyzerClassName, or empty for the StandardAnalyzer */ public void setParams(String params) { super.setParams(params); for (StringTokenizer tokenizer = new StringTokenizer(params, ","); tokenizer.hasMoreTokens();) { String s = tokenizer.nextToken(); analyzerClassNames.add(s.trim()); } } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WarmTask.java0000644000175000017500000000324511474320251032753 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.QueryMaker; /** * Warm reader task: retrieve all reader documents. * *

Note: This task reuses the reader if it is already open. * Otherwise a reader is opened at start and closed at the end. *

* *

Other side effects: counts additional 1 (record) for each * retrieved (non null) document.

*/ public class WarmTask extends ReadTask { public WarmTask(PerfRunData runData) { super(runData); } public boolean withRetrieve() { return false; } public boolean withSearch() { return false; } public boolean withTraverse() { return false; } public boolean withWarm() { return true; } public QueryMaker getQueryMaker() { return null; // not required for this task. } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SetPropTask.java0000644000175000017500000000421611474320251033440 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Set a performance test configuration property. * A property may have a single value, or a sequence of values, separated by ":". * If a sequence of values is specified, each time a new round starts, * the next (cyclic) value is taken. *
Other side effects: none. *
Takes mandatory param: "name,value" pair. * @see org.apache.lucene.benchmark.byTask.tasks.NewRoundTask */ public class SetPropTask extends PerfTask { public SetPropTask(PerfRunData runData) { super(runData); } private String name; private String value; public int doLogic() throws Exception { if (name==null || value==null) { throw new Exception(getName()+" - undefined name or value: name="+name+" value="+value); } getRunData().getConfig().set(name,value); return 0; } /** * Set the params (property name and value). * @param params property name and value separated by ','. */ public void setParams(String params) { super.setParams(params); int k = params.indexOf(","); name = params.substring(0,k).trim(); value = params.substring(k+1).trim(); } /* (non-Javadoc) * @see org.apache.lucene.benchmark.byTask.tasks.PerfTask#supportsParams() */ public boolean supportsParams() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewRoundTask.java0000644000175000017500000000261611474320251033607 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Increment the counter for properties maintained by Round Number. *
Other side effects: if there are props by round number, log value change. */ public class NewRoundTask extends PerfTask { public NewRoundTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { getRunData().getConfig().newRound(); return 0; } /* (non-Javadoc) * @see PerfTask#shouldNotRecordStats() */ protected boolean shouldNotRecordStats() { return true; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/Benchmark.java0000644000175000017500000000767311474320253032002 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.FileReader; import java.io.Reader; import org.apache.lucene.benchmark.byTask.utils.Algorithm; import org.apache.lucene.benchmark.byTask.utils.Config; /** * Run the benchmark algorithm. *

Usage: java Benchmark algorithm-file *

    *
  1. Read algorithm.
  2. Run the algorithm.
* Things to be added/fixed in "Benchmarking by tasks": *
    *
  1. TODO - report into Excel and/or graphed view.
  2. TODO - perf comparison between Lucene releases over the years.
  3. TODO - perf report adequate to include in Lucene nightly build site? (so we can easily track performance changes.)
  4. TODO - add overall time control for repeated execution (vs. current by-count only).
  5. TODO - query maker that is based on index statistics.
*/ public class Benchmark { private PerfRunData runData; private Algorithm algorithm; private boolean executed; public Benchmark (Reader algReader) throws Exception { // prepare run data try { runData = new PerfRunData(new Config(algReader)); } catch (Exception e) { e.printStackTrace(); throw new Exception("Error: cannot init PerfRunData!",e); } // parse algorithm try { algorithm = new Algorithm(runData); } catch (Exception e) { throw new Exception("Error: cannot understand algorithm!",e); } } public synchronized void execute() throws Exception { if (executed) { throw new IllegalStateException("Benchmark was already executed"); } executed = true; runData.setStartTimeMillis(); algorithm.execute(); } /** * Run the benchmark algorithm. * @param args benchmark config and algorithm files */ public static void main(String[] args) { // verify command line args if (args.length < 1) { System.err.println("Usage: java Benchmark "); System.exit(1); } // verify input files File algFile = new File(args[0]); if (!algFile.exists() || !algFile.isFile() || !algFile.canRead()) { System.err.println("cannot find/read algorithm file: "+algFile.getAbsolutePath()); System.exit(1); } System.out.println("Running algorithm from: "+algFile.getAbsolutePath()); Benchmark benchmark = null; try { benchmark = new Benchmark(new FileReader(algFile)); } catch (Exception e) { e.printStackTrace(); System.exit(1); } System.out.println("------------> algorithm:"); System.out.println(benchmark.getAlgorithm().toString()); // execute try { benchmark.execute(); } catch (Exception e) { System.err.println("Error: cannot execute the algorithm! "+e.getMessage()); e.printStackTrace(); } System.out.println("####################"); System.out.println("### D O N E !!! ###"); System.out.println("####################"); } /** * @return Returns the algorithm. */ public Algorithm getAlgorithm() { return algorithm; } /** * @return Returns the runData. */ public PerfRunData getRunData() { return runData; } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/package.html0000644000175000017500000006371311474320253031523 0ustar janpascaljanpascal Benchmarking Lucene By Tasks
Benchmarking Lucene By Tasks.

This package provides "task based" performance benchmarking of Lucene. One can use the predefined benchmarks, or create new ones.

Contained packages:

Package        Description
stats          Statistics maintained when running benchmark tasks.
tasks          Benchmark tasks.
feeds          Sources for benchmark inputs: documents and queries.
utils          Utilities used for the benchmark, and for the reports.
programmatic   Sample performance test written programmatically.

Table Of Contents

  1. Benchmarking By Tasks
  2. How to use
  3. Benchmark "algorithm"
  4. Supported tasks/commands
  5. Benchmark properties
  6. Example input algorithm and the result benchmark report.
  7. Results record counting clarified

Benchmarking By Tasks

Benchmark Lucene using task primitives.

A benchmark is composed of some predefined tasks, allowing for creating an index, adding documents, optimizing, searching, generating reports, and more. A benchmark run takes an "algorithm" file that contains a description of the sequence of tasks making up the run, and some properties defining a few additional characteristics of the benchmark run.

How to use

The easiest way to run a benchmark is using the predefined ant task:

  • ant run-task
    - would run the micro-standard.alg "algorithm".
  • ant run-task -Dtask.alg=conf/compound-penalty.alg
    - would run the compound-penalty.alg "algorithm".
  • ant run-task -Dtask.alg=[full-path-to-your-alg-file]
    - would run your perf test "algorithm".
  • java org.apache.lucene.benchmark.byTask.programmatic.Sample
    - would run a performance test programmatically - without using an alg file. This is less readable, and less convenient, but possible.
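
    For reference, here is a minimal programmatic sketch. It simply mirrors what Benchmark's own main() does; the driver class name and the .alg file path are placeholders, not part of the package:

        import java.io.File;
        import java.io.FileReader;
        import org.apache.lucene.benchmark.byTask.Benchmark;

        public class RunMyAlg {                                        // hypothetical driver class
          public static void main(String[] args) throws Exception {
            File algFile = new File("conf/micro-standard.alg");        // placeholder: any .alg file you have
            Benchmark benchmark = new Benchmark(new FileReader(algFile));
            System.out.println(benchmark.getAlgorithm().toString());   // echo the parsed algorithm
            benchmark.execute();                                       // run it; execute() may only be called once
          }
        }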

You may find existing tasks sufficient for defining the benchmark you need; otherwise, you can extend the framework to meet your needs, as explained herein.

Each benchmark run has a DocMaker and a QueryMaker. These two should usually match, so that "meaningful" queries are used for a certain collection. Properties set at the header of the alg file define which "makers" should be used. You can also specify your own makers, extending DocMaker and implementing QueryMaker.

Note: since 2.9, DocMaker is a concrete class which accepts a ContentSource. In most cases, you can use the DocMaker class to create Documents, while providing your own ContentSource implementation. For example, the current Benchmark package includes ContentSource implementations for TREC, Enwiki and Reuters collections, as well as others like LineDocSource which reads a 'line' file produced by WriteLineDocTask.

The benchmark .alg file contains the benchmark "algorithm". The syntax is described below. Within the algorithm, you can specify groups of commands, assign them names, specify commands that should be repeated, do commands in serial or in parallel, and also control the speed of "firing" the commands.

This allows you, for instance, to specify that an index should be opened for update, documents should be added to it one by one but not faster than 20 docs a minute, and, in parallel with this, some N queries should be searched against that index, again, no more than 2 queries a second. You can have the searches all share an index reader, or have them each open its own reader and close it afterwards.

If the commands available for use in the algorithm do not meet your needs, you can add commands by adding a new task under org.apache.lucene.benchmark.byTask.tasks - you should extend the PerfTask abstract class. Make sure that your new task class name is suffixed by Task. Assume you added the class "WonderfulTask" - doing so also enables the command "Wonderful" to be used in the algorithm.
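
To illustrate, here is a minimal sketch of such a task. The class name and its body are hypothetical; only the PerfTask contract visible in the existing task sources (a constructor taking PerfRunData and a doLogic() method returning the number of records counted) is assumed:

    package org.apache.lucene.benchmark.byTask.tasks;

    import org.apache.lucene.benchmark.byTask.PerfRunData;

    /** Hypothetical example; because the class name ends with "Task" and it lives in
        the tasks package, the command "Wonderful" becomes usable in an .alg file. */
    public class WonderfulTask extends PerfTask {

      public WonderfulTask(PerfRunData runData) {
        super(runData);
      }

      public int doLogic() throws Exception {
        // do the measured work of the command here
        System.out.println("being wonderful...");
        return 1; // number of records this run contributes to the statistics
      }
    }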

External classes: It is sometimes useful to invoke the benchmark package with your external alg file that configures the use of your own doc/query maker and/or html parser. You can work this out without modifying the benchmark package code by passing your class path with the benchmark.ext.classpath property:

  • ant run-task -Dtask.alg=[full-path-to-your-alg-file] -Dbenchmark.ext.classpath=/mydir/classes -Dtask.mem=512M

Benchmark "algorithm"

The following is an informal description of the supported syntax.

  1. Measuring: When a command is executed, statistics for the elapsed execution time and memory consumption are collected. At any time, those statistics can be printed, using one of the available ReportTasks.
  2. Comments start with '#'.
  3. Serial sequences are enclosed within '{ }'.
  4. Parallel sequences are enclosed within '[ ]'.
  5. Sequence naming: To name a sequence, put '"name"' just after '{' or '['.
    Example - { "ManyAdds" AddDoc } : 1000000 - would name the sequence of 1M add docs "ManyAdds", and this name would later appear in statistic reports. If you don't specify a name for a sequence, it is given one: you can see it as the algorithm is printed just before benchmark execution starts.
  6. Repeating: To repeat sequence tasks N times, add ': N' just after the sequence closing tag - '}' or ']' or '>'.
    Example - [ AddDoc ] : 4 - would do 4 addDoc in parallel, spawning 4 threads at once.
    Example - [ AddDoc AddDoc ] : 4 - would do 8 addDoc in parallel, spawning 8 threads at once.
    Example - { AddDoc } : 30 - would do addDoc 30 times in a row.
    Example - { AddDoc AddDoc } : 30 - would do addDoc 60 times in a row.
    Exhaustive repeating: use * instead of a number to repeat exhaustively. This is sometimes useful for adding as many files as a doc maker can create, without iterating over the same file again, especially when the exact number of documents is not known in advance. For instance, TREC files extracted from a zip file. Note: when using this, you must also set doc.maker.forever to false.
    Example - { AddDoc } : * - would add docs until the doc maker is "exhausted".
  7. Command parameter: a command can optionally take a single parameter. If a certain command does not support a parameter, or if the parameter is of the wrong type, reading the algorithm will fail with an exception and the test will not start. Currently the following tasks take optional parameters:
    • AddDoc takes a numeric parameter, indicating the required size of the added document. Note: if the DocMaker implementation used in the test does not support makeDoc(size), an exception will be thrown and the test will fail.
    • DeleteDoc takes a numeric parameter, indicating the docid to be deleted. The latter is not very useful for loops, since the docid is fixed, so for deletion in loops it is better to use the doc.delete.step property.
    • SetProp takes a mandatory name,value param, with ',' used as a separator.
    • SearchTravRetTask and SearchTravTask take a numeric parameter, indicating the required traversal size.
    • SearchTravRetLoadFieldSelectorTask takes a string parameter: a comma separated list of Fields to load.
    • SearchTravRetHighlighterTask takes a string parameter: a comma separated list of parameters to define highlighting. See that task's javadocs for more information.

    Example - AddDoc(2000) - would add a document of size 2000 (~bytes).
    See conf/task-sample.alg for how this can be used, for instance, to check which is faster, adding many smaller documents or a few larger ones. Next candidates for supporting a parameter may be the Search tasks, for controlling the query size.
  8. Statistic recording elimination: a sequence can also end with '>', in which case child tasks would not store their statistics. This can be useful to avoid exploding stats data when adding, say, 1M docs.
    Example - { "ManyAdds" AddDoc > : 1000000 - would add a million docs, measure that total, but not save stats for each addDoc.
    Notice that the granularity of System.currentTimeMillis() (which is used here) is system dependent, and in some systems an operation that takes 5 ms to complete may show 0 ms latency in performance measurements. Therefore it is sometimes more accurate to look at the elapsed time of a larger sequence, as demonstrated here.
  9. Rate: To set a rate (ops/sec or ops/min) for a sequence, add ': N : R' just after the sequence closing tag. This specifies repetition of N with a rate of R operations/sec. Use 'R/sec' or 'R/min' to explicitly specify whether the rate is per second or per minute. The default is per second.
    Example - [ AddDoc ] : 400 : 3 - would do 400 addDoc in parallel, starting up to 3 threads per second.
    Example - { AddDoc } : 100 : 200/min - would do 100 addDoc serially, waiting before starting the next add if the rate would otherwise exceed 200 adds/min.
  10. Disable Counting: Each task executed contributes to the records count. This count is reflected in reports under recs/s and under recsPerRun. Most tasks count 1, some count 0, and some count more. (See Results record counting clarified for more details.) It is possible to disable counting for a task by preceding it with -.
    Example - -CreateIndex - would count 0 while the default behavior for CreateIndex is to count 1.
  11. Command names: Each class "AnyNameTask" in the package org.apache.lucene.benchmark.byTask.tasks that extends PerfTask is supported as the command "AnyName", which can be used in the benchmark "algorithm" description. This allows new commands to be added by just adding such classes.
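    For illustration, a minimal sketch of such a class is shown below (the class name "WonderfulTask" and its body are hypothetical; the constructor and the doLogic() signature follow the existing tasks in that package):

      package org.apache.lucene.benchmark.byTask.tasks;

      import org.apache.lucene.benchmark.byTask.PerfRunData;

      /** Hypothetical task, usable in an algorithm file as the command "Wonderful". */
      public class WonderfulTask extends PerfTask {

        public WonderfulTask(PerfRunData runData) {
          super(runData);
        }

        /** The actual work of the task; the returned value is its record count. */
        public int doLogic() throws Exception {
          // ... do something wonderful here ...
          return 1;
        }
      }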

Supported tasks/commands

Existing tasks can be divided into a few groups: regular index/search work tasks, report tasks, and control tasks.

  1. Report tasks: There are a few Report commands for generating reports. Only task runs that were completed are reported. (The 'Report tasks' themselves are not measured and not reported.)
    • RepAll - all (completed) task runs.
    • RepSumByName - all statistics, aggregated by name. So, if AddDoc was executed 2000 times, only 1 report line would be created for it, aggregating all those 2000 statistic records.
    • RepSelectByPref   prefixWord - all records for tasks whose name starts with prefixWord.
    • RepSumByPref   prefixWord - all records for tasks whose name starts with prefixWord, aggregated by their full task name.
    • RepSumByNameRound - all statistics, aggregated by name and by Round. So, if AddDoc was executed 2000 times in each of 3 rounds, 3 report lines would be created for it, aggregating all those 2000 statistic records in each round. See more about rounds in the NewRound command description below.
    • RepSumByPrefRound   prefixWord - similar to RepSumByNameRound, except that only tasks whose name starts with prefixWord are included.
    If needed, additional reports can be added by extending the abstract class ReportTask, and by manipulating the statistics data in Points and TaskStats.
  2. Control tasks: A few of the tasks control the overall benchmark algorithm:
    • ClearStats - clears all statistics. Further reports will only include task runs that start after this call.
    • NewRound - virtually starts a new round of the performance test. Although this command can be placed anywhere, it mostly makes sense at the end of an outermost sequence.
      This increments a global "round counter". All task runs that start from now on record the new, updated round counter as their round number, and this appears in reports. In particular, see RepSumByNameRound above.
      An additional effect of NewRound is that numeric and boolean properties defined (at the head of the .alg file) as a sequence of values, e.g. merge.factor=mrg:10:100:10:100, increment (cyclically) to the next value. Note: this is also reflected in the reports, in this case under a column named "mrg".
    • ResetInputs - the DocMaker and the various QueryMakers reset their counters to the start. The way these Maker interfaces work, each call to makeDocument() or makeQuery() creates the next document or query that it "knows" to create. If that pool is "exhausted", the "maker" starts over again. The ResetInputs command therefore makes the rounds comparable; it is useful to invoke ResetInputs together with NewRound.
    • ResetSystemErase - resets all index and input data and calls gc. Does NOT reset statistics. This includes ResetInputs. All writers/readers are nullified, deleted, and closed. The index is erased. The directory is erased. You have to call CreateIndex again after this task.
    • ResetSystemSoft - resets all index and input data and calls gc. Does NOT reset statistics. This includes ResetInputs. All writers/readers are nullified and closed. The index is NOT erased. The directory is NOT erased. This is useful for testing performance on an existing index, for instance if the construction of a large index took a very long time and you now want to test its search or update performance.
  3. Other existing tasks are quite straightforward and are only briefly described here.
    • CreateIndex and OpenIndex both leave the index open for later update operations. CloseIndex would close it.
    • OpenReader, similarly, leaves an index reader open for later search operations. But this has further semantics: if a read operation is performed and an open reader exists, it is used; otherwise, the read operation opens its own reader and closes it when the read operation is done. This allows testing various scenarios - sharing a reader, searching with a "cold" reader, with a "warmed" reader, etc.; see the sketch after this list. The read operations affected by this are: Warm, Search, SearchTrav (search and traverse), and SearchTravRet (search, traverse and retrieve). Notice that each of the 3 search task types maintains its own queryMaker instance.
    • CommitIndex and Optimize can be used to commit changes to the index and/or optimize the index created thus far.
    • WriteLineDoc prepares a 'line' file where each line holds a document with title, date and body elements, separated by [TAB]. A line file is useful if one wants to measure pure indexing performance, without the overhead of parsing the data.
      You can use LineDocSource as a ContentSource over a 'line' file.
    • ConsumeContentSource consumes a ContentSource. Useful, for example, for testing the performance of a ContentSource without the overhead of preparing a Document out of it.
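
For example, a fragment like the following (a sketch, not taken from any of the conf/*.alg files) runs 500 searches over a single shared, "warmed" reader; without the surrounding OpenReader/CloseReader, each search would open and close its own "cold" reader:

    OpenReader
    { "SrchSameRdr" Search > : 500
    CloseReader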

Benchmark properties

Properties are read from the header of the .alg file, and define several parameters of the performance test. As mentioned above for the NewRound task, numeric and boolean properties that are defined as a sequence of values, e.g. merge.factor=mrg:10:100:10:100, increment (cyclically) to the next value when NewRound is called, and also appear as a named column in the reports (the column name would be "mrg" in this example).

Some of the currently defined properties are:

  1. analyzer - full class name of the analyzer to use. The same analyzer is used for the entire test.
  2. directory - which directory implementation to use for the performance test; valid values are FSDirectory and RAMDirectory.
  3. Index work parameters: Multi int/boolean values are iterated with calls to NewRound. They are also added as columns in the reports; the first string in the sequence is the column name. (Make sure it is no shorter than any value in the sequence.)
    • max.buffered
      Example: max.buffered=buf:10:10:100:100 - this would define using maxBufferedDocs of 10 in iterations 0 and 1, and 100 in iterations 2 and 3.
    • merge.factor - which merge factor to use.
    • compound - whether the index is using the compound format or not. Valid values are "true" and "false".

Here is a list of currently defined properties:

  1. Root directory for data and indexes:
    • work.dir (default is System property "benchmark.work.dir" or "work".)
  2. Docs and queries creation:
    • analyzer
    • doc.maker
    • doc.maker.forever
    • html.parser
    • doc.stored
    • doc.tokenized
    • doc.term.vector
    • doc.term.vector.positions
    • doc.term.vector.offsets
    • doc.store.body.bytes
    • docs.dir
    • query.maker
    • file.query.maker.file
    • file.query.maker.default.field
    • search.num.hits
  3. Logging:
    • log.step
    • log.step.[task name], i.e. the task class name without the "Task" suffix - e.g. log.step.DeleteDoc for DeleteDocTask, or log.step.Wonderful for the WonderfulTask example above; see the header fragment after this list.
    • log.queries
    • task.max.depth.log
  4. Index writing:
    • compound
    • merge.factor
    • max.buffered
    • directory
    • ram.flush.mb
    • autocommit
  5. Doc deletion:
    • doc.delete.step

For sample use of these properties see the *.alg files under conf.

Example input algorithm and the resulting benchmark report

The following example is in conf/sample.alg:

# --------------------------------------------------------
#
# Sample: what is the effect of doc size on indexing time?
#
# There are two parts in this test:
# - PopulateShort adds 2N documents of length  L
# - PopulateLong  adds  N documents of length 2L
# Which one would be faster?
# The comparison is done twice.
#
# --------------------------------------------------------

# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
merge.factor=mrg:10:20
max.buffered=buf:100:1000
compound=true

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=false
doc.add.log.step=500

docs.dir=reuters-out

doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker

query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker

# task at this depth or less would print when they start
task.max.depth.log=2

log.queries=false
# -------------------------------------------------------------------------------------
{

    { "PopulateShort"
        CreateIndex
        { AddDoc(4000) > : 20000
        Optimize
        CloseIndex
    >

    ResetSystemErase

    { "PopulateLong"
        CreateIndex
        { AddDoc(8000) > : 10000
        Optimize
        CloseIndex
    >

    ResetSystemErase

    NewRound

} : 2

RepSumByName
RepSelectByPref Populate

The command line for running this sample:
ant run-task -Dtask.alg=conf/sample.alg

The output report from running this test contains the following:

Operation     round mrg  buf   runCnt   recsPerRun        rec/s  elapsedSec    avgUsedMem    avgTotalMem
PopulateShort     0  10  100        1        20003        119.6      167.26    12,959,120     14,241,792
PopulateLong -  - 0  10  100 -  -   1 -  -   10003 -  -  - 74.3 -  - 134.57 -  17,085,208 -   20,635,648
PopulateShort     1  20 1000        1        20003        143.5      139.39    63,982,040     94,756,864
PopulateLong -  - 1  20 1000 -  -   1 -  -   10003 -  -  - 77.0 -  - 129.92 -  87,309,608 -  100,831,232

Results record counting clarified

Two columns in the results table indicate record counts: records-per-run and records-per-second. What do they mean?

Almost every task gets 1 in this count just for being executed. Task sequences aggregate the counts of their child tasks, plus their own count of 1. So, a task sequence containing 5 other task sequences, each running a single other task 10 times, would have a count of 1 + 5 * (1 + 10) = 56.

The traverse and retrieve tasks "count" more: a traverse task adds 1 for each traversed result (hit), and a retrieve task additionally adds 1 for each retrieved doc. So, a regular Search would count 1, a SearchTrav that traverses 10 hits would count 11, and a SearchTravRet task that retrieves (and traverses) 10 would count 21.

Confusing? This might help: always examine the elapsedSec column, and always compare "apples to apples", i.e. it is interesting to check how the rec/s changed for the same task (or sequence) between two different runs, but it is not very useful to know how the rec/s differs between Search and SearchTrav tasks. For the latter, elapsedSec brings more insight.
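
For example, by these rules a single SearchTravRet(10) execution contributes 21 records (1 for the search, 10 for traversing, 10 for retrieving), so a hypothetical fragment such as

    { "TravRet10" SearchTravRet(10) > : 100

would contribute roughly 100 * 21 = 2100 records to the recsPerRun of its enclosing sequence.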

 
lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/0000755000175000017500000000000011554106561031720 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/Sample.java0000644000175000017500000000726511474320251034011 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.programmatic; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Properties; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; import org.apache.lucene.benchmark.byTask.tasks.RepSumByNameTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.utils.Config; /** * Sample performance test written programmatically - no algorithm file is needed here. */ public class Sample { /** * @param args * @throws Exception * @throws IOException */ public static void main(String[] args) throws Exception { Properties p = initProps(); Config conf = new Config(p); PerfRunData runData = new PerfRunData(conf); // 1. top sequence TaskSequence top = new TaskSequence(runData,null,null,false); // top level, not parallel // 2. task to create the index CreateIndexTask create = new CreateIndexTask(runData); top.addTask(create); // 3. task seq to add 500 docs (order matters - top to bottom - add seq to top, only then add to seq) TaskSequence seq1 = new TaskSequence(runData,"AddDocs",top,false); seq1.setRepetitions(500); seq1.setNoChildReport(); top.addTask(seq1); // 4. task to add the doc AddDocTask addDoc = new AddDocTask(runData); //addDoc.setParams("1200"); // doc size limit if supported seq1.addTask(addDoc); // order matters 9see comment above) // 5. task to close the index CloseIndexTask close = new CloseIndexTask(runData); top.addTask(close); // task to report RepSumByNameTask rep = new RepSumByNameTask(runData); top.addTask(rep); // print algorithm System.out.println(top.toString()); // execute top.doLogic(); } // Sample programmatic settings. Could also read from file. 
private static Properties initProps() { Properties p = new Properties(); p.setProperty ( "task.max.depth.log" , "3" ); p.setProperty ( "max.buffered" , "buf:10:10:100:100:10:10:100:100" ); p.setProperty ( "doc.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource" ); p.setProperty ( "log.step" , "2000" ); p.setProperty ( "doc.delete.step" , "8" ); p.setProperty ( "analyzer" , "org.apache.lucene.analysis.standard.StandardAnalyzer" ); p.setProperty ( "doc.term.vector" , "false" ); p.setProperty ( "directory" , "FSDirectory" ); p.setProperty ( "query.maker" , "org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker" ); p.setProperty ( "doc.stored" , "true" ); p.setProperty ( "docs.dir" , "reuters-out" ); p.setProperty ( "compound" , "cmpnd:true:true:true:true:false:false:false:false" ); p.setProperty ( "doc.tokenized" , "true" ); p.setProperty ( "merge.factor" , "mrg:10:100:10:100:10:100:10:100" ); return p; } } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/package.htmllucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/programmatic/package.html0000644000175000017500000000167111474320251034201 0ustar janpascaljanpascal Sample performance test written programmatically - no algorithm file is needed here. lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/0000755000175000017500000000000011554106561030373 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/FileUtils.java0000644000175000017500000000321411474320251033131 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; /** * File utilities. */ public class FileUtils { /** * Delete files and directories, even if non-empty. 
* * @param dir file or directory * @return true on success, false if no or part of files have been deleted * @throws java.io.IOException */ public static boolean fullyDelete(File dir) throws IOException { if (dir == null || !dir.exists()) return false; File contents[] = dir.listFiles(); if (contents != null) { for (int i = 0; i < contents.length; i++) { if (contents[i].isFile()) { if (!contents[i].delete()) { return false; } } else { if (!fullyDelete(contents[i])) { return false; } } } } return dir.delete(); } } ././@LongLink0000000000000000000000000000015100000000000011562 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.javalucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/StringBufferReader.0000644000175000017500000001166311474320251034121 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; /** * Implements a {@link Reader} over a {@link StringBuffer} instance. Although * one can use {@link java.io.StringReader} by passing it * {@link StringBuffer#toString()}, it is better to use this class, as it * doesn't mark the passed-in {@link StringBuffer} as shared (which will cause * inner char[] allocations at the next append() attempt).
* Notes: *
    *
  • This implementation assumes the underlying {@link StringBuffer} is not * changed during the use of this {@link Reader} implementation. *
  • This implementation is thread-safe. *
  • The implementation looks very much like {@link java.io.StringReader} (for * the right reasons). *
  • If one wants to reuse that instance, then the following needs to be done: *
     * StringBuffer sb = new StringBuffer("some text");
     * Reader reader = new StringBufferReader(sb);
     * ... read from reader - don't close it ! ...
     * sb.setLength(0);
     * sb.append("some new text");
     * reader.reset();
     * ... read the new string from the reader ...
     * 
    *
*/ public class StringBufferReader extends Reader { // TODO (3.0): change to StringBuffer (including the name of the class) // The StringBuffer to read from. private StringBuffer sb; // The length of 'sb'. private int length; // The next position to read from the StringBuffer. private int next = 0; // The mark position. The default value 0 means the start of the text. private int mark = 0; public StringBufferReader(StringBuffer sb) { set(sb); } /** Check to make sure that the stream has not been closed. */ private void ensureOpen() throws IOException { if (sb == null) { throw new IOException("Stream has already been closed"); } } public void close() { synchronized (lock) { sb = null; } } /** * Mark the present position in the stream. Subsequent calls to reset() will * reposition the stream to this point. * * @param readAheadLimit Limit on the number of characters that may be read * while still preserving the mark. Because the stream's input comes * from a StringBuffer, there is no actual limit, so this argument * must not be negative, but is otherwise ignored. * @exception IllegalArgumentException If readAheadLimit is < 0 * @exception IOException If an I/O error occurs */ public void mark(int readAheadLimit) throws IOException { if (readAheadLimit < 0){ throw new IllegalArgumentException("Read-ahead limit cannpt be negative: " + readAheadLimit); } synchronized (lock) { ensureOpen(); mark = next; } } public boolean markSupported() { return true; } public int read() throws IOException { synchronized (lock) { ensureOpen(); return next >= length ? -1 : sb.charAt(next++); } } public int read(char cbuf[], int off, int len) throws IOException { synchronized (lock) { ensureOpen(); // Validate parameters if (off < 0 || off > cbuf.length || len < 0 || off + len > cbuf.length) { throw new IndexOutOfBoundsException("off=" + off + " len=" + len + " cbuf.length=" + cbuf.length); } if (len == 0) { return 0; } if (next >= length) { return -1; } int n = Math.min(length - next, len); sb.getChars(next, next + n, cbuf, off); next += n; return n; } } public boolean ready() throws IOException { synchronized (lock) { ensureOpen(); return true; } } public void reset() throws IOException { synchronized (lock) { ensureOpen(); next = mark; length = sb.length(); } } public void set(StringBuffer sb) { synchronized (lock) { this.sb = sb; length = sb.length(); } } public long skip(long ns) throws IOException { synchronized (lock) { ensureOpen(); if (next >= length) { return 0; } // Bound skip by beginning and end of the source long n = Math.min(length - next, ns); n = Math.max(-next, n); next += n; return n; } } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Format.java0000644000175000017500000000775611474320251032500 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.text.NumberFormat; /** * Formatting utilities (for reports). */ public class Format { private static NumberFormat numFormat [] = { NumberFormat.getInstance(), NumberFormat.getInstance(), NumberFormat.getInstance(), }; private static final String padd = " "; static { numFormat[0].setMaximumFractionDigits(0); numFormat[0].setMinimumFractionDigits(0); numFormat[1].setMaximumFractionDigits(1); numFormat[1].setMinimumFractionDigits(1); numFormat[2].setMaximumFractionDigits(2); numFormat[2].setMinimumFractionDigits(2); } /** * Padd a number from left. * @param numFracDigits number of digits in fraction part - must be 0 or 1 or 2. * @param f number to be formatted. * @param col column name (used for deciding on length). * @return formatted string. */ public static String format(int numFracDigits, float f, String col) { String res = padd + numFormat[numFracDigits].format(f); return res.substring(res.length() - col.length()); } public static String format(int numFracDigits, double f, String col) { String res = padd + numFormat[numFracDigits].format(f); return res.substring(res.length() - col.length()); } /** * Pad a number from right. * @param numFracDigits number of digits in fraction part - must be 0 or 1 or 2. * @param f number to be formatted. * @param col column name (used for deciding on length). * @return formatted string. */ public static String formatPaddRight(int numFracDigits, float f, String col) { String res = numFormat[numFracDigits].format(f) + padd; return res.substring(0, col.length()); } public static String formatPaddRight(int numFracDigits, double f, String col) { String res = numFormat[numFracDigits].format(f) + padd; return res.substring(0, col.length()); } /** * Pad a number from left. * @param n number to be formatted. * @param col column name (used for deciding on length). * @return formatted string. */ public static String format(int n, String col) { String res = padd + n; return res.substring(res.length() - col.length()); } /** * Pad a string from right. * @param s string to be formatted. * @param col column name (used for deciding on length). * @return formatted string. */ public static String format(String s, String col) { String s1 = (s + padd); return s1.substring(0, Math.min(col.length(), s1.length())); } /** * Pad a string from left. * @param s string to be formatted. * @param col column name (used for deciding on length). * @return formatted string. */ public static String formatPaddLeft(String s, String col) { String res = padd + s; return res.substring(res.length() - col.length()); } /** * Extract simple class name * @param cls class whose simple name is required * @return simple class name */ public static String simpleName (Class cls) { String c = cls.getName(); String p = cls.getPackage().getName(); int k = c.lastIndexOf(p+"."); if (k<0) { return c; } return c.substring(k+1+p.length()); } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/package.html0000644000175000017500000000163411474320251032653 0ustar janpascaljanpascal Utilities used for the benchmark, and for the reports. lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Algorithm.java0000644000175000017500000002312111474505321033161 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.StreamTokenizer; import java.io.StringReader; import java.lang.reflect.Constructor; import java.util.ArrayList; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.tasks.PerfTask; import org.apache.lucene.benchmark.byTask.tasks.RepSumByPrefTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; /** * Test algorithm, as read from file */ public class Algorithm { private TaskSequence sequence; /** * Read algorithm from file * @param runData perf-run-data used at running the tasks. * @throws Exception if errors while parsing the algorithm */ public Algorithm (PerfRunData runData) throws Exception { String algTxt = runData.getConfig().getAlgorithmText(); sequence = new TaskSequence(runData,null,null,false); TaskSequence currSequence = sequence; PerfTask prevTask = null; StreamTokenizer stok = new StreamTokenizer(new StringReader(algTxt)); stok.commentChar('#'); stok.eolIsSignificant(false); stok.ordinaryChar('"'); stok.ordinaryChar('/'); stok.ordinaryChar('('); stok.ordinaryChar(')'); stok.ordinaryChar('-'); boolean colonOk = false; boolean isDisableCountNextTask = false; // only for primitive tasks currSequence.setDepth(0); String taskPackage = PerfTask.class.getPackage().getName() + "."; Class paramClass[] = {PerfRunData.class}; Object paramObj[] = {runData}; while (stok.nextToken() != StreamTokenizer.TT_EOF) { switch(stok.ttype) { case StreamTokenizer.TT_WORD: String s = stok.sval; Constructor cnstr = Class.forName(taskPackage+s+"Task").getConstructor(paramClass); PerfTask task = (PerfTask) cnstr.newInstance(paramObj); task.setDisableCounting(isDisableCountNextTask); isDisableCountNextTask = false; currSequence.addTask(task); if (task instanceof RepSumByPrefTask) { stok.nextToken(); String prefix = stok.sval; if (prefix==null || prefix.length()==0) { throw new Exception("named report prefix problem - "+stok.toString()); } ((RepSumByPrefTask) task).setPrefix(prefix); } // check for task param: '(' someParam ')' stok.nextToken(); if (stok.ttype!='(') { stok.pushBack(); } else { // get params, for tasks that supports them, - anything until next ')' StringBuffer params = new StringBuffer(); stok.nextToken(); while (stok.ttype!=')') { switch (stok.ttype) { case StreamTokenizer.TT_NUMBER: params.append(stok.nval); break; case StreamTokenizer.TT_WORD: params.append(stok.sval); break; case StreamTokenizer.TT_EOF: throw new Exception("unexpexted EOF: - "+stok.toString()); default: params.append((char)stok.ttype); } stok.nextToken(); } String prm = params.toString().trim(); if (prm.length()>0) { task.setParams(prm); } } // --------------------------------------- colonOk = false; prevTask = task; break; default: char c = (char)stok.ttype; switch(c) { case ':' : if (!colonOk) throw new Exception("colon unexpexted: - "+stok.toString()); colonOk = false; // get 
repetitions number stok.nextToken(); if ((char)stok.ttype == '*') { ((TaskSequence)prevTask).setRepetitions(TaskSequence.REPEAT_EXHAUST); } else { if (stok.ttype!=StreamTokenizer.TT_NUMBER) { throw new Exception("expected repetitions number or XXXs: - "+stok.toString()); } else { double num = stok.nval; stok.nextToken(); if (stok.ttype == StreamTokenizer.TT_WORD && stok.sval.equals("s")) { ((TaskSequence) prevTask).setRunTime(num); } else { stok.pushBack(); ((TaskSequence) prevTask).setRepetitions((int) num); } } } // check for rate specification (ops/min) stok.nextToken(); if (stok.ttype!=':') { stok.pushBack(); } else { // get rate number stok.nextToken(); if (stok.ttype!=StreamTokenizer.TT_NUMBER) throw new Exception("expected rate number: - "+stok.toString()); // check for unit - min or sec, sec is default stok.nextToken(); if (stok.ttype!='/') { stok.pushBack(); ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec } else { stok.nextToken(); if (stok.ttype!=StreamTokenizer.TT_WORD) throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); String unit = stok.sval.toLowerCase(); if ("min".equals(unit)) { ((TaskSequence)prevTask).setRate((int)stok.nval,true); // set rate per min } else if ("sec".equals(unit)) { ((TaskSequence)prevTask).setRate((int)stok.nval,false); // set rate per sec } else { throw new Exception("expected rate unit: 'min' or 'sec' - "+stok.toString()); } } } colonOk = false; break; case '{' : case '[' : // a sequence // check for sequence name String name = null; stok.nextToken(); if (stok.ttype!='"') { stok.pushBack(); } else { stok.nextToken(); name = stok.sval; stok.nextToken(); if (stok.ttype!='"' || name==null || name.length()==0) { throw new Exception("sequence name problem - "+stok.toString()); } } // start the sequence TaskSequence seq2 = new TaskSequence(runData, name, currSequence, c=='['); currSequence.addTask(seq2); currSequence = seq2; colonOk = false; break; case '>' : currSequence.setNoChildReport(); case '}' : case ']' : // end sequence colonOk = true; prevTask = currSequence; currSequence = currSequence.getParent(); break; case '-' : isDisableCountNextTask = true; break; } //switch(c) break; } //switch(stok.ttype) } if (sequence != currSequence) { throw new Exception("Unmatched sequences"); } // remove redundant top level enclosing sequences while (sequence.isCollapsable() && sequence.getRepetitions()==1 && sequence.getRate()==0) { ArrayList t = sequence.getTasks(); if (t!=null && t.size()==1) { PerfTask p = (PerfTask) t.get(0); if (p instanceof TaskSequence) { sequence = (TaskSequence) p; continue; } } break; } } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { String newline = System.getProperty("line.separator"); StringBuffer sb = new StringBuffer(); sb.append(sequence.toString()); sb.append(newline); return sb.toString(); } /** * Execute this algorithm * @throws Exception */ public void execute() throws Exception { try { sequence.runAndMaybeStats(true); } finally { sequence.close(); } } /** * Expert: for test purposes, return all tasks participating in this algorithm. * @return all tasks participating in this algorithm. 
*/ public ArrayList extractTasks() { ArrayList res = new ArrayList(); extractTasks(res, sequence); return res; } private void extractTasks (ArrayList extrct, TaskSequence seq) { if (seq==null) return; extrct.add(seq); ArrayList t = sequence.getTasks(); if (t==null) return; for (int i = 0; i < t.size(); i++) { PerfTask p = (PerfTask) t.get(0); if (p instanceof TaskSequence) { extractTasks(extrct, (TaskSequence)p); } else { extrct.add(p); } } } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/utils/Config.java0000644000175000017500000003141711474320251032444 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Properties; import java.util.StringTokenizer; /** * Perf run configuration properties. *

* Numeric property containing ":", e.g. "10:100:5" is interpreted * as array of numeric values. It is extracted once, on first use, and * maintain a round number to return the appropriate value. *

* The config property "work.dir" tells where is the root of * docs data dirs and indexes dirs. It is set to either of:

    *
  • value supplied for it in the alg file;
  • *
  • otherwise, value of System property "benchmark.work.dir";
  • *
  • otherwise, "work".
  • *
*/ public class Config { private static final String NEW_LINE = System.getProperty("line.separator"); private int roundNumber = 0; private Properties props; private HashMap valByRound = new HashMap(); private HashMap colForValByRound = new HashMap(); private String algorithmText; /** * Read both algorithm and config properties. * @param algReader from where to read algorithm and config properties. * @throws IOException */ public Config (Reader algReader) throws IOException { // read alg file to array of lines ArrayList lines = new ArrayList(); BufferedReader r = new BufferedReader(algReader); int lastConfigLine=0; for (String line = r.readLine(); line!=null; line=r.readLine()) { lines.add(line); if (line.indexOf('=')>0) { lastConfigLine = lines.size(); } } r.close(); // copy props lines to string StringBuffer sb = new StringBuffer(); for (int i=0; i config properties:"); List propKeys = new ArrayList(props.keySet()); Collections.sort(propKeys); for (Iterator it = propKeys.iterator(); it.hasNext();) { String propName = (String) it.next(); System.out.println(propName + " = " + props.getProperty(propName)); } System.out.println("-------------------------------"); } /** * Return a string property. * @param name name of property. * @param dflt default value. * @return a string property. */ public String get (String name, String dflt) { return props.getProperty(name,dflt); } /** * Set a property. * Note: once a multiple values property is set, it can no longer be modified. * @param name name of property. * @param value either single or multiple property value (multiple values are separated by ":") * @throws Exception */ public void set (String name, String value) throws Exception { if (valByRound.get(name) != null) { throw new Exception("Cannot modify a multi value property!"); } props.setProperty(name,value); } /** * Return an int property. * If the property contain ":", e.g. "10:100:5", it is interpreted * as array of ints. It is extracted once, on first call * to get() it, and a by-round-value is returned. * @param name name of property * @param dflt default value * @return a int property. */ public int get (String name, int dflt) { // use value by round if already parsed int vals[] = (int[]) valByRound.get(name); if (vals != null) { return vals[roundNumber % vals.length]; } // done if not by round String sval = props.getProperty(name,""+dflt); if (sval.indexOf(":")<0) { return Integer.parseInt(sval); } // first time this prop is extracted by round int k = sval.indexOf(":"); String colName = sval.substring(0,k); sval = sval.substring(k+1); colForValByRound.put(name,colName); vals = propToIntArray(sval); valByRound.put(name,vals); return vals[roundNumber % vals.length]; } /** * Return a double property. * If the property contain ":", e.g. "10:100:5", it is interpreted * as array of doubles. It is extracted once, on first call * to get() it, and a by-round-value is returned. * @param name name of property * @param dflt default value * @return a double property. 
*/ public double get (String name, double dflt) { // use value by round if already parsed double vals[] = (double[]) valByRound.get(name); if (vals != null) { return vals[roundNumber % vals.length]; } // done if not by round String sval = props.getProperty(name,""+dflt); if (sval.indexOf(":")<0) { return Double.parseDouble(sval); } // first time this prop is extracted by round int k = sval.indexOf(":"); String colName = sval.substring(0,k); sval = sval.substring(k+1); colForValByRound.put(name,colName); vals = propToDoubleArray(sval); valByRound.put(name,vals); return vals[roundNumber % vals.length]; } /** * Return a boolean property. * If the property contain ":", e.g. "true.true.false", it is interpreted * as array of booleans. It is extracted once, on first call * to get() it, and a by-round-value is returned. * @param name name of property * @param dflt default value * @return a int property. */ public boolean get (String name, boolean dflt) { // use value by round if already parsed boolean vals[] = (boolean[]) valByRound.get(name); if (vals != null) { return vals[roundNumber % vals.length]; } // done if not by round String sval = props.getProperty(name,""+dflt); if (sval.indexOf(":")<0) { return Boolean.valueOf(sval).booleanValue(); } // first time this prop is extracted by round int k = sval.indexOf(":"); String colName = sval.substring(0,k); sval = sval.substring(k+1); colForValByRound.put(name,colName); vals = propToBooleanArray(sval); valByRound.put(name,vals); return vals[roundNumber % vals.length]; } /** * Increment the round number, for config values that are extracted by round number. * @return the new round number. */ public int newRound () { roundNumber++; StringBuffer sb = new StringBuffer("--> Round ").append(roundNumber-1).append("-->").append(roundNumber); // log changes in values if (valByRound.size()>0) { sb.append(": "); for (Iterator iter = valByRound.keySet().iterator(); iter.hasNext();) { String name = (String) iter.next(); Object a = valByRound.get(name); if (a instanceof int[]) { int ai[] = (int[]) a; int n1 = (roundNumber-1)%ai.length; int n2 = roundNumber%ai.length; sb.append(" ").append(name).append(":").append(ai[n1]).append("-->").append(ai[n2]); } else if (a instanceof double[]){ double ad[] = (double[]) a; int n1 = (roundNumber-1)%ad.length; int n2 = roundNumber%ad.length; sb.append(" ").append(name).append(":").append(ad[n1]).append("-->").append(ad[n2]); } else { boolean ab[] = (boolean[]) a; int n1 = (roundNumber-1)%ab.length; int n2 = roundNumber%ab.length; sb.append(" ").append(name).append(":").append(ab[n1]).append("-->").append(ab[n2]); } } } System.out.println(); System.out.println(sb.toString()); System.out.println(); return roundNumber; } // extract properties to array, e.g. for "10:100:5" return int[]{10,100,5}. private int[] propToIntArray (String s) { if (s.indexOf(":")<0) { return new int [] { Integer.parseInt(s) }; } ArrayList a = new ArrayList(); StringTokenizer st = new StringTokenizer(s,":"); while (st.hasMoreTokens()) { String t = st.nextToken(); a.add(new Integer(t)); } int res[] = new int[a.size()]; for (int i=0; i * Expects this topic format - *
 *   <top>
 *   <num> Number: nnn
 *     
 *   <title> title of the topic
 *     
 *   <desc> Description:
 *   description of the topic
 *     
 *   <narr> Narrative:
 *   "story" composed by assessors.
 *    
 *   </top>
 * 
* Comment lines starting with '#' are ignored. */ public class TrecTopicsReader { private static final String newline = System.getProperty("line.separator"); /** * Constructor for Trec's TopicsReader */ public TrecTopicsReader() { super(); } /** * Read quality queries from trec format topics file. * @param reader where queries are read from. * @return the result quality queries. * @throws IOException if cannot read the queries. */ public QualityQuery[] readQueries(BufferedReader reader) throws IOException { ArrayList res = new ArrayList(); StringBuffer sb; try { while (null!=(sb=read(reader,"",null,false,false))) { HashMap fields = new HashMap(); // id sb = read(reader,"",null,true,false); int k = sb.indexOf(":"); String id = sb.substring(k+1).trim(); // title sb = read(reader,"",null,true,false); k = sb.indexOf(">"); String title = sb.substring(k+1).trim(); // description sb = read(reader,"<desc>",null,false,false); sb = read(reader,"<narr>",null,false,true); String descripion = sb.toString().trim(); // we got a topic! fields.put("title",title); fields.put("description",descripion); QualityQuery topic = new QualityQuery(id,fields); res.add(topic); // skip narrative, get to end of doc read(reader,"</top>",null,false,false); } } finally { reader.close(); } // sort result array (by ID) QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]); Arrays.sort(qq); return qq; } // read until finding a line that starts with the specified prefix private StringBuffer read (BufferedReader reader, String prefix, StringBuffer sb, boolean collectMatchLine, boolean collectAll) throws IOException { sb = (sb==null ? new StringBuffer() : sb); String sep = ""; while (true) { String line = reader.readLine(); if (line==null) { return null; } if (line.startsWith(prefix)) { if (collectMatchLine) { sb.append(sep+line); sep = newline; } break; } if (collectAll) { sb.append(sep+line); sep = newline; } } //System.out.println("read: "+sb); return sb; } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/Trec1MQReader.java�0000644�0001750�0001750�00000005134�11474320251�033623� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.trec; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import org.apache.lucene.benchmark.quality.QualityQuery; /** * Read topics of TREC 1MQ track. * <p> * Expects this topic format - * <pre> * qnum:qtext * </pre> * Comment lines starting with '#' are ignored. * <p> * All topics will have a single name value pair. */ public class Trec1MQReader { private String name; /** * Constructor for Trec's 1MQ TopicsReader * @param name name of name-value pair to set for all queries. */ public Trec1MQReader(String name) { super(); this.name = name; } /** * Read quality queries from trec 1MQ format topics file. * @param reader where queries are read from. * @return the result quality queries. * @throws IOException if cannot read the queries. */ public QualityQuery[] readQueries(BufferedReader reader) throws IOException { ArrayList res = new ArrayList(); String line; try { while (null!=(line=reader.readLine())) { line = line.trim(); if (line.startsWith("#")) { continue; } // id int k = line.indexOf(":"); String id = line.substring(0,k).trim(); // qtext String qtext = line.substring(k+1).trim(); // we got a topic! HashMap fields = new HashMap(); fields.put(name,qtext); //System.out.println("id: "+id+" qtext: "+qtext+" line: "+line); QualityQuery topic = new QualityQuery(id,fields); res.add(topic); } } finally { reader.close(); } // sort result array (by ID) QualityQuery qq[] = (QualityQuery[]) res.toArray(new QualityQuery[0]); Arrays.sort(qq); return qq; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/QueryDriver.java���0000644�0001750�0001750�00000005235�11474320251�033547� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.benchmark.quality.trec; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.benchmark.quality.trec.TrecJudge; import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader; import org.apache.lucene.benchmark.quality.utils.SimpleQQParser; import org.apache.lucene.benchmark.quality.utils.SubmissionReport; import org.apache.lucene.benchmark.quality.*; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Searcher; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.PrintWriter; /** * * **/ public class QueryDriver { public static void main(String[] args) throws Exception { File topicsFile = new File(args[0]); File qrelsFile = new File(args[1]); Searcher searcher = new IndexSearcher(args[3]); int maxResults = 1000; String docNameField = "docname"; PrintWriter logger = new PrintWriter(System.out, true); // use trec utilities to read trec topics into quality queries TrecTopicsReader qReader = new TrecTopicsReader(); QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); // prepare judge, with trec utilities that read from a QRels file Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); // validate topics & judgments match each other judge.validateData(qqs, logger); // set the parsing of quality queries into Lucene queries. QualityQueryParser qqParser = new SimpleQQParser("title", "body"); // run the benchmark QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField); qrun.setMaxResults(maxResults); SubmissionReport submitLog = null; QualityStats stats[] = qrun.execute(judge, submitLog, logger); // print an avarage sum of the results QualityStats avg = QualityStats.average(stats); avg.log("SUMMARY", 2, logger, " "); } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/package.html�������0000644�0001750�0001750�00000001702�11474320251�032677� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <html> <body> Utilities for Trec related quality benchmarking, feeding from Trec Topics and QRels inputs. 
</body> </html> ��������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/trec/TrecJudge.java�����0000644�0001750�0001750�00000011435�11474320251�033141� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.trec; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.StringTokenizer; import org.apache.lucene.benchmark.quality.Judge; import org.apache.lucene.benchmark.quality.QualityQuery; /** * Judge if given document is relevant to given quality query, based on Trec format for judgements. */ public class TrecJudge implements Judge { HashMap judgements; /** * Constructor from a reader. * <p> * Expected input format: * <pre> * qnum 0 doc-name is-relevant * </pre> * Two sample lines: * <pre> * 19 0 doc303 1 * 19 0 doc7295 0 * </pre> * @param reader where judgments are read from. 
* @throws IOException */ public TrecJudge (BufferedReader reader) throws IOException { judgements = new HashMap(); QRelJudgement curr = null; String zero = "0"; String line; try { while (null!=(line=reader.readLine())) { line = line.trim(); if (line.length()==0 || '#'==line.charAt(0)) { continue; } StringTokenizer st = new StringTokenizer(line); String queryID = st.nextToken(); st.nextToken(); String docName = st.nextToken(); boolean relevant = !zero.equals(st.nextToken()); assert !st.hasMoreTokens() : "wrong format: "+line+" next: "+st.nextToken(); if (relevant) { // only keep relevant docs if (curr==null || !curr.queryID.equals(queryID)) { curr = (QRelJudgement)judgements.get(queryID); if (curr==null) { curr = new QRelJudgement(queryID); judgements.put(queryID,curr); } } curr.addRelevandDoc(docName); } } } finally { reader.close(); } } // inherit javadocs public boolean isRelevant(String docName, QualityQuery query) { QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); return qrj!=null && qrj.isRelevant(docName); } /** single Judgement of a trec quality query */ private static class QRelJudgement { private String queryID; private HashMap relevantDocs; QRelJudgement(String queryID) { this.queryID = queryID; relevantDocs = new HashMap(); } public void addRelevandDoc(String docName) { relevantDocs.put(docName,docName); } boolean isRelevant(String docName) { return relevantDocs.containsKey(docName); } public int maxRecall() { return relevantDocs.size(); } } // inherit javadocs public boolean validateData(QualityQuery[] qq, PrintWriter logger) { HashMap missingQueries = (HashMap) judgements.clone(); ArrayList missingJudgements = new ArrayList(); for (int i=0; i<qq.length; i++) { String id = qq[i].getQueryID(); if (missingQueries.containsKey(id)) { missingQueries.remove(id); } else { missingJudgements.add(id); } } boolean isValid = true; if (missingJudgements.size()>0) { isValid = false; if (logger!=null) { logger.println("WARNING: "+missingJudgements.size()+" queries have no judgments! - "); for (int i=0; i<missingJudgements.size(); i++) { logger.println(" "+(String)missingJudgements.get(i)); } } } if (missingQueries.size()>0) { isValid = false; if (logger!=null) { logger.println("WARNING: "+missingQueries.size()+" judgments match no query! - "); for (Iterator it = missingQueries.keySet().iterator(); it.hasNext();) { String id = (String) it.next(); logger.println(" "+id); } } } return isValid; } // inherit javadocs public int maxRecall(QualityQuery query) { QRelJudgement qrj = (QRelJudgement) judgements.get(query.getQueryID()); if (qrj!=null) { return qrj.maxRecall(); } return 0; } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/Judge.java��������������0000644�0001750�0001750�00000004125�11474320251�031364� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality; import java.io.PrintWriter; /** * Judge if a document is relevant for a quality query. */ public interface Judge { /** * Judge if document <code>docName</code> is relevant for the given quality query. * @param docName name of doc tested for relevancy. * @param query tested quality query. * @return true if relevant, false if not. */ public boolean isRelevant(String docName, QualityQuery query); /** * Validate that queries and this Judge match each other. * To be perfectly valid, this Judge must have some data for each and every * input quality query, and must not have any data on any other quality query. * <b>Note</b>: the quality benchmark run would not fail in case of imperfect * validity, just a warning message would be logged. * @param qq quality queries to be validated. * @param logger if not null, validation issues are logged. * @return true if perfectly valid, false if not. */ public boolean validateData (QualityQuery qq[], PrintWriter logger); /** * Return the maximal recall for the input quality query. * It is the number of relevant docs this Judge "knows" for the query. * @param query the query whose maximal recall is needed. */ public int maxRecall (QualityQuery query); } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityBenchmark.java���0000644�0001750�0001750�00000014075�11474320251�033576� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality; import java.io.IOException; import java.io.PrintWriter; import org.apache.lucene.benchmark.quality.utils.DocNameExtractor; import org.apache.lucene.benchmark.quality.utils.SubmissionReport; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopDocs; /** * Main entry point for running a quality benchmark. * <p> * There are two main configurations for running a quality benchmark: <ul> * <li>Against existing judgements.</li> * <li>For submission (e.g. for a contest).</li> * </ul> * The first configuration requires a non null * {@link org.apache.lucene.benchmark.quality.Judge Judge}. * The second configuration requires a non null * {@link org.apache.lucene.benchmark.quality.utils.SubmissionReport SubmissionLogger}. */ public class QualityBenchmark { /** Quality Queries that this quality benchmark would execute. */ protected QualityQuery qualityQueries[]; /** Parser for turning QualityQueries into Lucene Queries. */ protected QualityQueryParser qqParser; /** Index to be searched. */ protected Searcher searcher; /** index field to extract doc name for each search result; used for judging the results. */ protected String docNameField; /** maximal number of queries that this quality benchmark runs. Default: maxint. Useful for debugging. */ private int maxQueries = Integer.MAX_VALUE; /** maximal number of results to collect for each query. Default: 1000. */ private int maxResults = 1000; /** * Create a QualityBenchmark. * @param qqs quality queries to run. * @param qqParser parser for turning QualityQueries into Lucene Queries. * @param searcher index to be searched. * @param docNameField name of field containing the document name. * This allows to extract the doc name for search results, * and is important for judging the results. */ public QualityBenchmark(QualityQuery qqs[], QualityQueryParser qqParser, Searcher searcher, String docNameField) { this.qualityQueries = qqs; this.qqParser = qqParser; this.searcher = searcher; this.docNameField = docNameField; } /** * Run the quality benchmark. * @param judge the judge that can tell if a certain result doc is relevant for a certain quality query. * If null, no judgements would be made. Usually null for a submission run. * @param submitRep submission report is created if non null. * @param qualityLog If not null, quality run data would be printed for each query. * @return QualityStats of each quality query that was executed. * @throws Exception if quality benchmark failed to run. 
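* <p>
* A typical judging run, sketched with illustrative variable names (no submission report):
* <pre>
*   QualityStats stats[] = qrun.execute(judge, null, new PrintWriter(System.out, true));
* </pre>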
*/ public QualityStats [] execute(Judge judge, SubmissionReport submitRep, PrintWriter qualityLog) throws Exception { int nQueries = Math.min(maxQueries, qualityQueries.length); QualityStats stats[] = new QualityStats[nQueries]; for (int i=0; i<nQueries; i++) { QualityQuery qq = qualityQueries[i]; // generate query Query q = qqParser.parse(qq); // search with this query long t1 = System.currentTimeMillis(); TopDocs td = searcher.search(q,null,maxResults); long searchTime = System.currentTimeMillis()-t1; //most likely we either submit or judge, but check both if (judge!=null) { stats[i] = analyzeQueryResults(qq, q, td, judge, qualityLog, searchTime); } if (submitRep!=null) { submitRep.report(qq,td,docNameField,searcher); } } if (submitRep!=null) { submitRep.flush(); } return stats; } /* Analyze/judge results for a single quality query; optionally log them. */ private QualityStats analyzeQueryResults(QualityQuery qq, Query q, TopDocs td, Judge judge, PrintWriter logger, long searchTime) throws IOException { QualityStats stts = new QualityStats(judge.maxRecall(qq),searchTime); ScoreDoc sd[] = td.scoreDocs; long t1 = System.currentTimeMillis(); // extraction of first doc name we measure also construction of doc name extractor, just in case. DocNameExtractor xt = new DocNameExtractor(docNameField); for (int i=0; i<sd.length; i++) { String docName = xt.docName(searcher,sd[i].doc); long docNameExtractTime = System.currentTimeMillis() - t1; t1 = System.currentTimeMillis(); boolean isRelevant = judge.isRelevant(docName,qq); stts.addResult(i+1,isRelevant, docNameExtractTime); } if (logger!=null) { logger.println(qq.getQueryID()+" - "+q); stts.log(qq.getQueryID()+" Stats:",1,logger," "); } return stts; } /** * @return the maximum number of quality queries to run. Useful at debugging. */ public int getMaxQueries() { return maxQueries; } /** * Set the maximum number of quality queries to run. Useful at debugging. */ public void setMaxQueries(int maxQueries) { this.maxQueries = maxQueries; } /** * @return the maximum number of results to collect for each quality query. */ public int getMaxResults() { return maxResults; } /** * set the maximum number of results to collect for each quality query. */ public void setMaxResults(int maxResults) { this.maxResults = maxResults; } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/package.html������������0000644�0001750�0001750�00000006174�11474320251�031752� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. 
The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <html> <body> <h2>Search Quality Benchmarking.</h2> <p> This package allows to benchmark search quality of a Lucene application. <p> In order to use this package you should provide: <ul> <li>A <a href="../../search/Searcher.html">searcher</a>.</li> <li><a href="QualityQuery.html">Quality queries</a>.</li> <li><a href="Judge.html">Judging object</a>.</li> <li><a href="utils/SubmissionReport.html">Reporting object</a>.</li> </ul> <p> For benchmarking TREC collections with TREC QRels, take a look at the <a href="trec/package-summary.html">trec package</a>. <p> Here is a sample code used to run the TREC 2006 queries 701-850 on the .Gov2 collection: <pre> File topicsFile = new File("topics-701-850.txt"); File qrelsFile = new File("qrels-701-850.txt"); Searcher searcher = new IndexSearcher("index"); int maxResults = 1000; String docNameField = "docname"; PrintWriter logger = new PrintWriter(System.out,true); // use trec utilities to read trec topics into quality queries TrecTopicsReader qReader = new TrecTopicsReader(); QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); // prepare judge, with trec utilities that read from a QRels file Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); // validate topics & judgments match each other judge.validateData(qqs, logger); // set the parsing of quality queries into Lucene queries. 
QualityQueryParser qqParser = new SimpleQQParser("title", "body");

      // run the benchmark
      QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField);
      qrun.setMaxResults(maxResults);
      SubmissionReport submitLog = null;
      QualityStats stats[] = qrun.execute(judge, submitLog, logger);

      // print an average summary of the results
      QualityStats avg = QualityStats.average(stats);
      avg.log("SUMMARY",2,logger, " ");
</pre>
<p>
Some immediate ways to modify this program to your needs are:
<ul>
  <li>To run on different formats of queries and judgements provide your own
      <a href="Judge.html">Judge</a> and <a href="QualityQuery.html">Quality queries</a>.</li>
  <li>Create sophisticated Lucene queries by supplying a different
      <a href="QualityQueryParser.html">Quality query parser</a>.</li>
</ul>
</body>
</html>

lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQueryParser.java

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.benchmark.quality;

import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;

/**
 * Parse a QualityQuery into a Lucene query.
 */
public interface QualityQueryParser {

  /**
   * Parse a given QualityQuery into a Lucene query.
   * @param qq the quality query to be parsed.
   * @throws ParseException if parsing failed.
*/ public Query parse(QualityQuery qq) throws ParseException; } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityStats.java�������0000644�0001750�0001750�00000022407�11474320251�033000� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality; import java.io.PrintWriter; import java.text.NumberFormat; import java.util.ArrayList; /** * Results of quality benchmark run for a single query or for a set of queries. */ public class QualityStats { /** Number of points for which precision is computed. */ public static final int MAX_POINTS = 20; private double maxGoodPoints; private double recall; private double pAt[]; private double pReleventSum = 0; private double numPoints = 0; private double numGoodPoints = 0; private double mrr = 0; private long searchTime; private long docNamesExtractTime; /** * A certain rank in which a relevant doc was found. */ public static class RecallPoint { private int rank; private double recall; private RecallPoint(int rank, double recall) { this.rank = rank; this.recall = recall; } /** Returns the rank: where on the list of returned docs this relevant doc appeared. */ public int getRank() { return rank; } /** Returns the recall: how many relevant docs were returned up to this point, inclusive. */ public double getRecall() { return recall; } } private ArrayList recallPoints; /** * Construct a QualityStats object with anticipated maximal number of relevant hits. * @param maxGoodPoints maximal possible relevant hits. */ public QualityStats(double maxGoodPoints, long searchTime) { this.maxGoodPoints = maxGoodPoints; this.searchTime = searchTime; this.recallPoints = new ArrayList(); pAt = new double[MAX_POINTS+1]; // pAt[0] unused. } /** * Add a (possibly relevant) doc. * @param n rank of the added doc (its ordinal position within the query results). * @param isRelevant true if the added doc is relevant, false otherwise. 
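* @param docNameExtractTime time (in milliseconds) spent extracting the doc name for this result;
*        it is accumulated into the value returned by {@link #getDocNamesExtractTime()}.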
*/ public void addResult(int n, boolean isRelevant, long docNameExtractTime) { if (Math.abs(numPoints+1 - n) > 1E-6) { throw new IllegalArgumentException("point "+n+" illegal after "+numPoints+" points!"); } if (isRelevant) { numGoodPoints+=1; recallPoints.add(new RecallPoint(n,numGoodPoints)); if (recallPoints.size()==1 && n<=5) { // first point, but only within 5 top scores. mrr = 1.0 / n; } } numPoints = n; double p = numGoodPoints / numPoints; if (isRelevant) { pReleventSum += p; } if (n<pAt.length) { pAt[n] = p; } recall = maxGoodPoints<=0 ? p : numGoodPoints/maxGoodPoints; docNamesExtractTime += docNameExtractTime; } /** * Return the precision at rank n: * |{relevant hits within first <code>n</code> hits}| / <code>n</code>. * @param n requested precision point, must be at least 1 and at most {@link #MAX_POINTS}. */ public double getPrecisionAt(int n) { if (n<1 || n>MAX_POINTS) { throw new IllegalArgumentException("n="+n+" - but it must be in [1,"+MAX_POINTS+"] range!"); } if (n>numPoints) { return (numPoints * pAt[(int)numPoints])/n; } return pAt[n]; } /** * Return the average precision at recall points. */ public double getAvp() { return maxGoodPoints==0 ? 0 : pReleventSum/maxGoodPoints; } /** * Return the recall: |{relevant hits found}| / |{relevant hits existing}|. */ public double getRecall() { return recall; } /** * Log information on this QualityStats object. * @param logger Logger. * @param prefix prefix before each log line. */ public void log(String title, int paddLines, PrintWriter logger, String prefix) { for (int i=0; i<paddLines; i++) { logger.println(); } if (title!=null && title.trim().length()>0) { logger.println(title); } prefix = prefix==null ? "" : prefix; NumberFormat nf = NumberFormat.getInstance(); nf.setMaximumFractionDigits(3); nf.setMinimumFractionDigits(3); nf.setGroupingUsed(true); int M = 19; logger.println(prefix+format("Search Seconds: ",M)+ fracFormat(nf.format((double)searchTime/1000))); logger.println(prefix+format("DocName Seconds: ",M)+ fracFormat(nf.format((double)docNamesExtractTime/1000))); logger.println(prefix+format("Num Points: ",M)+ fracFormat(nf.format(numPoints))); logger.println(prefix+format("Num Good Points: ",M)+ fracFormat(nf.format(numGoodPoints))); logger.println(prefix+format("Max Good Points: ",M)+ fracFormat(nf.format(maxGoodPoints))); logger.println(prefix+format("Average Precision: ",M)+ fracFormat(nf.format(getAvp()))); logger.println(prefix+format("MRR: ",M)+ fracFormat(nf.format(getMRR()))); logger.println(prefix+format("Recall: ",M)+ fracFormat(nf.format(getRecall()))); for (int i=1; i<(int)numPoints && i<pAt.length; i++) { logger.println(prefix+format("Precision At "+i+": ",M)+ fracFormat(nf.format(getPrecisionAt(i)))); } for (int i=0; i<paddLines; i++) { logger.println(); } } private static String padd = " "; private String format(String s, int minLen) { s = (s==null ? "" : s); int n = Math.max(minLen,s.length()); return (s+padd).substring(0,n); } private String fracFormat(String frac) { int k = frac.indexOf('.'); String s1 = padd+frac.substring(0,k); int n = Math.max(k,6); s1 = s1.substring(s1.length()-n); return s1 + frac.substring(k); } /** * Create a QualityStats object that is the average of the input QualityStats objects. * @param stats array of input stats to be averaged. * @return an average over the input stats. */ public static QualityStats average(QualityStats[] stats) { QualityStats avg = new QualityStats(0,0); if (stats.length==0) { // weired, no stats to average! 
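// return the zeroed stats object rather than failing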
return avg; } int m = 0; // queries with positive judgements // aggregate for (int i=0; i<stats.length; i++) { avg.searchTime += stats[i].searchTime; avg.docNamesExtractTime += stats[i].docNamesExtractTime; if (stats[i].maxGoodPoints>0) { m++; avg.numGoodPoints += stats[i].numGoodPoints; avg.numPoints += stats[i].numPoints; avg.pReleventSum += stats[i].getAvp(); avg.recall += stats[i].recall; avg.mrr += stats[i].getMRR(); avg.maxGoodPoints += stats[i].maxGoodPoints; for (int j=1; j<avg.pAt.length; j++) { avg.pAt[j] += stats[i].getPrecisionAt(j); } } } assert m>0 : "Fishy: no \"good\" queries!"; // take average: times go by all queries, other measures go by "good" queries only. avg.searchTime /= stats.length; avg.docNamesExtractTime /= stats.length; avg.numGoodPoints /= m; avg.numPoints /= m; avg.recall /= m; avg.mrr /= m; avg.maxGoodPoints /= m; for (int j=1; j<avg.pAt.length; j++) { avg.pAt[j] /= m; } avg.pReleventSum /= m; // this is actually avgp now avg.pReleventSum *= avg.maxGoodPoints; // so that getAvgP() would be correct return avg; } /** * Returns the time it took to extract doc names for judging the measured query, in milliseconds. */ public long getDocNamesExtractTime() { return docNamesExtractTime; } /** * Returns the maximal number of good points. * This is the number of relevant docs known by the judge for the measured query. */ public double getMaxGoodPoints() { return maxGoodPoints; } /** * Returns the number of good points (only relevant points). */ public double getNumGoodPoints() { return numGoodPoints; } /** * Returns the number of points (both relevant and irrelevant points). */ public double getNumPoints() { return numPoints; } /** * Returns the recallPoints. */ public RecallPoint [] getRecallPoints() { return (RecallPoint[]) recallPoints.toArray(new RecallPoint[0]); } /** * Returns the Mean reciprocal rank over the queries or RR for a single query. * <p> * Reciprocal rank is defined as <code>1/r</code> where <code>r</code> is the * rank of the first correct result, or <code>0</code> if there are no correct * results within the top 5 results. * <p> * This follows the definition in * <a href="http://www.cnlp.org/publications/02cnlptrec10.pdf"> * Question Answering - CNLP at the TREC-10 Question Answering Track</a>. */ public double getMRR() { return mrr; } /** * Returns the search time in milliseconds for the measured query. 
*/ public long getSearchTime() { return searchTime; } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/������������������0000755�0001750�0001750�00000000000�11554106561�030626� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000154�00000000000�011565� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFinder.java���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/QualityQueriesFind0000644�0001750�0001750�00000010550�11474320251�034334� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.utils; import java.io.File; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.PriorityQueue; /** * Suggest Quality queries based on an index contents. 
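* <p>
* Typically invoked from the command line, e.g. (the index path is illustrative):
* <pre>
*   java org.apache.lucene.benchmark.quality.utils.QualityQueriesFinder /path/to/index
* </pre>
* It prints 20 suggested queries, formatted as TREC topics, built from terms of the <code>body</code> field.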
* Utility class, used for making quality test benchmarks. */ public class QualityQueriesFinder { private static final String newline = System.getProperty("line.separator"); private Directory dir; /** * Constructor over a directory containing the index. * @param dir directory containing the index we search for the quality test. */ private QualityQueriesFinder(Directory dir) { this.dir = dir; } /** * @param args {index-dir} * @throws IOException if cannot access the index. */ public static void main(String[] args) throws IOException { if (args.length<1) { System.err.println("Usage: java QualityQueriesFinder <index-dir>"); System.exit(1); } QualityQueriesFinder qqf = new QualityQueriesFinder(FSDirectory.open(new File(args[0]))); String q[] = qqf.bestQueries("body",20); for (int i=0; i<q.length; i++) { System.out.println(newline+formatQueryAsTrecTopic(i,q[i],null,null)); } } private String [] bestQueries(String field,int numQueries) throws IOException { String words[] = bestTerms("body",4*numQueries); int n = words.length; int m = n/4; String res[] = new String[m]; for (int i=0; i<res.length; i++) { res[i] = words[i] + " " + words[m+i]+ " " + words[n-1-m-i] + " " + words[n-1-i]; //System.out.println("query["+i+"]: "+res[i]); } return res; } private static String formatQueryAsTrecTopic (int qnum, String title, String description, String narrative) { return "<top>" + newline + "<num> Number: " + qnum + newline + newline + "<title> " + (title==null?"":title) + newline + newline + "<desc> Description:" + newline + (description==null?"":description) + newline + newline + "<narr> Narrative:" + newline + (narrative==null?"":narrative) + newline + newline + "</top>"; } private String [] bestTerms(String field,int numTerms) throws IOException { PriorityQueue pq = new TermsDfQueue(numTerms); IndexReader ir = IndexReader.open(dir); try { int threshold = ir.maxDoc() / 10; // ignore words too common. TermEnum terms = ir.terms(new Term(field,"")); while (terms.next()) { if (!field.equals(terms.term().field())) { break; } int df = terms.docFreq(); if (df<threshold) { String ttxt = terms.term().text(); pq.insert(new TermDf(ttxt,df)); } } } finally { ir.close(); } String res[] = new String[pq.size()]; int i = 0; while (pq.size()>0) { TermDf tdf = (TermDf) pq.pop(); res[i++] = tdf.word; System.out.println(i+". 
word: "+tdf.df+" "+tdf.word); } return res; } private static class TermDf { String word; int df; TermDf (String word, int freq) { this.word = word; this.df = freq; } } private static class TermsDfQueue extends PriorityQueue { TermsDfQueue (int maxSize) { initialize(maxSize); } protected boolean lessThan(Object a, Object b) { TermDf tf1 = (TermDf) a; TermDf tf2 = (TermDf) b; return tf1.df < tf2.df; } } } ��������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000150�00000000000�011561� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/DocNameExtractor.j0000644�0001750�0001750�00000004065�11474320251�034203� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.utils; import java.io.IOException; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.search.Searcher; /** * Utility: extract doc names from an index */ public class DocNameExtractor { private FieldSelector fldSel; private String docNameField; /** * Constructor for DocNameExtractor. * @param docNameField name of the stored field containing the doc name. 
*/ public DocNameExtractor (final String docNameField) { this.docNameField = docNameField; fldSel = new FieldSelector() { public FieldSelectorResult accept(String fieldName) { return fieldName.equals(docNameField) ? FieldSelectorResult.LOAD_AND_BREAK : FieldSelectorResult.NO_LOAD; } }; } /** * Extract the name of the input doc from the index. * @param searcher access to the index. * @param docid ID of doc whose name is needed. * @return the name of the input doc as extracted from the index. * @throws IOException if cannot extract the doc name from the index. */ public String docName(Searcher searcher, int docid) throws IOException { return searcher.doc(docid,fldSel).get(docNameField); } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/package.html������0000644�0001750�0001750�00000001702�11474320251�033102� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <html> <body> Miscellaneous utilities for search quality benchmarking: query parsing, submission reports. 
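<p>
For example, a quality run is typically wired up with these utilities
(a sketch; the field names, file name and run name are illustrative):
<pre>
  QualityQueryParser qqParser = new SimpleQQParser("title", "body");
  SubmissionReport submitLog =
      new SubmissionReport(new PrintWriter(new FileWriter("submission.txt")), "myRun");
</pre>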
</body> </html> ��������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000150�00000000000�011561� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SubmissionReport.j0000644�0001750�0001750�00000006026�11474320251�034327� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.utils; import java.io.IOException; import java.io.PrintWriter; import java.text.NumberFormat; import org.apache.lucene.benchmark.quality.QualityQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.TopDocs; /** * Create a log ready for submission. * Extend this class and override * {@link #report(QualityQuery, TopDocs, String, Searcher)} * to create different reports. */ public class SubmissionReport { private NumberFormat nf; private PrintWriter logger; private String name; /** * Constructor for SubmissionReport. * @param logger if null, no submission data is created. * @param name name of this run. */ public SubmissionReport (PrintWriter logger, String name) { this.logger = logger; this.name = name; nf = NumberFormat.getInstance(); nf.setMaximumFractionDigits(4); nf.setMinimumFractionDigits(4); } /** * Report a search result for a certain quality query. * @param qq quality query for which the results are reported. * @param td search results for the query. * @param docNameField stored field used for fetching the result doc name. 
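*        (typically the same doc name field passed to the QualityBenchmark constructor;
*        QualityBenchmark.execute() invokes this method as
*        <code>submitRep.report(qq, td, docNameField, searcher)</code>).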
* @param searcher index access for fetching doc name. * @throws IOException in case of a problem. */ public void report(QualityQuery qq, TopDocs td, String docNameField, Searcher searcher) throws IOException { if (logger==null) { return; } ScoreDoc sd[] = td.scoreDocs; String sep = " \t "; DocNameExtractor xt = new DocNameExtractor(docNameField); for (int i=0; i<sd.length; i++) { String docName = xt.docName(searcher,sd[i].doc); logger.println( qq.getQueryID() + sep + "Q0" + sep + format(docName,20) + sep + format(""+i,7) + sep + nf.format(sd[i].score) + sep + name ); } } public void flush() { if (logger!=null) { logger.flush(); } } private static String padd = " "; private String format(String s, int minLen) { s = (s==null ? "" : s); int n = Math.max(minLen,s.length()); return (s+padd).substring(0,n); } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000146�00000000000�011566� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.java���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/utils/SimpleQQParser.jav0000644�0001750�0001750�00000004344�11474320251�034200� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality.utils; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.benchmark.quality.QualityQuery; import org.apache.lucene.benchmark.quality.QualityQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; /** * Simplistic quality query parser. A Lucene query is created by passing * the value of the specified QualityQuery name-value pair into * a Lucene's QueryParser using StandardAnalyzer. */ public class SimpleQQParser implements QualityQueryParser { private String qqName; private String indexField; ThreadLocal queryParser = new ThreadLocal(); /** * Constructor of a simple qq parser. * @param qqName name-value pair of quality query to use for creating the query * @param indexField corresponding index field */ public SimpleQQParser(String qqName, String indexField) { this.qqName = qqName; this.indexField = indexField; } /* (non-Javadoc) * @see org.apache.lucene.benchmark.quality.QualityQueryParser#parse(org.apache.lucene.benchmark.quality.QualityQuery) */ public Query parse(QualityQuery qq) throws ParseException { QueryParser qp = (QueryParser) queryParser.get(); if (qp==null) { qp = new QueryParser(indexField, new StandardAnalyzer()); queryParser.set(qp); } return qp.parse(qq.getValue(qqName)); } } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/quality/QualityQuery.java�������0000644�0001750�0001750�00000005640�11474320251�033007� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.quality; import java.util.Map; /** * A QualityQuery has an ID and some name-value pairs. * <p> * The ID allows to map the quality query with its judgements. * <p> * The name-value pairs are used by a * {@link org.apache.lucene.benchmark.quality.QualityQueryParser} * to create a Lucene {@link org.apache.lucene.search.Query}. 
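* <p>
* A minimal construction sketch (the ID, name and value are illustrative):
* <pre>
*   Map nameValPairs = new HashMap();
*   nameValPairs.put("title", "some query text");
*   QualityQuery qq = new QualityQuery("301", nameValPairs);
* </pre>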
* <p> * It is very likely that name-value-pairs would be mapped into fields in a Lucene query, * but it is up to the QualityQueryParser how to map - e.g. all values in a single field, * or each pair as its own field, etc., - and this of course must match the way the * searched index was constructed. */ public class QualityQuery implements Comparable { private String queryID; private Map nameValPairs; /** * Create a QualityQuery with given ID and name-value pairs. * @param queryID ID of this quality query. * @param nameValPairs the contents of this quality query. */ public QualityQuery(String queryID, Map nameValPairs) { this.queryID = queryID; this.nameValPairs = nameValPairs; } /** * Return all the names of name-value-pairs in this QualityQuery. */ public String[] getNames() { return (String[]) nameValPairs.keySet().toArray(new String[0]); } /** * Return the value of a certain name-value pair. * @param name the name whose value should be returned. */ public String getValue(String name) { return (String) nameValPairs.get(name); } /** * Return the ID of this query. * The ID allows to map the quality query with its judgements. */ public String getQueryID() { return queryID; } /* for a nicer sort of input queries before running them. * Try first as ints, fall back to string if not int. */ public int compareTo(Object o) { QualityQuery other = (QualityQuery) o; try { // compare as ints when ids ints int n = Integer.parseInt(queryID); int nOther = Integer.parseInt(other.queryID); return n - nOther; } catch (NumberFormatException e) { // fall back to string comparison return queryID.compareTo(other.queryID); } } } ������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/��������������������������0000755�0001750�0001750�00000000000�11554106561�027136� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractReuters.java�������0000644�0001750�0001750�00000012662�11474320251�032767� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.benchmark.utils; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Split the Reuters SGML documents into Simple Text files containing: Title, Date, Dateline, Body */ public class ExtractReuters { private File reutersDir; private File outputDir; private static final String LINE_SEPARATOR = System.getProperty("line.separator"); public ExtractReuters(File reutersDir, File outputDir) { this.reutersDir = reutersDir; this.outputDir = outputDir; System.out.println("Deleting all files in " + outputDir); File [] files = outputDir.listFiles(); for (int i = 0; i < files.length; i++) { files[i].delete(); } } public void extract() { File [] sgmFiles = reutersDir.listFiles(new FileFilter() { public boolean accept(File file) { return file.getName().endsWith(".sgm"); } }); if (sgmFiles != null && sgmFiles.length > 0) { for (int i = 0; i < sgmFiles.length; i++) { File sgmFile = sgmFiles[i]; extractFile(sgmFile); } } else { System.err.println("No .sgm files in " + reutersDir); } } Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)|(.*?)|(.*?)"); private static String[] META_CHARS = {"&", "<", ">", "\"", "'"}; private static String[] META_CHARS_SERIALIZATIONS = {"&", "<", ">", """, "'"}; /** * Override if you wish to change what is extracted * * @param sgmFile */ protected void extractFile(File sgmFile) { try { BufferedReader reader = new BufferedReader(new FileReader(sgmFile)); StringBuffer buffer = new StringBuffer(1024); StringBuffer outBuffer = new StringBuffer(1024); String line = null; int index = -1; int docNumber = 0; while ((line = reader.readLine()) != null) { //when we see a closing reuters tag, flush the file if ((index = line.indexOf(" org.apache.lucene.benchmark.utils.ExtractReuters "); } } lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java0000644000175000017500000001211411474320251033234 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.Properties; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; /** * Extract the downloaded Wikipedia dump into separate files for indexing. 
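* <p>
* Typically invoked from the command line, e.g. (the dump file name is illustrative):
* <pre>
*   java org.apache.lucene.benchmark.utils.ExtractWikipedia -i enwiki-pages-articles.xml -o ./enwiki
* </pre>
* Pass <code>--discardImageOnlyDocs</code> (or <code>-d</code>) to skip Wiki docs that contain only images.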
*/ public class ExtractWikipedia { private File outputDir; static public int count = 0; static final int BASE = 10; protected DocMaker docMaker; public ExtractWikipedia(DocMaker docMaker, File outputDir) { this.outputDir = outputDir; this.docMaker = docMaker; System.out.println("Deleting all files in " + outputDir); File[] files = outputDir.listFiles(); for (int i = 0; i < files.length; i++) { files[i].delete(); } } public File directory(int count, File directory) { if (directory == null) { directory = outputDir; } int base = BASE; while (base <= count) { base *= BASE; } if (count < BASE) { return directory; } directory = new File(directory, (Integer.toString(base / BASE))); directory = new File(directory, (Integer.toString(count / (base / BASE)))); return directory(count % (base / BASE), directory); } public void create(String id, String title, String time, String body) { File d = directory(count++, null); d.mkdirs(); File f = new File(d, id + ".txt"); StringBuffer contents = new StringBuffer(); contents.append(time); contents.append("\n\n"); contents.append(title); contents.append("\n\n"); contents.append(body); contents.append("\n"); try { FileWriter writer = new FileWriter(f); writer.write(contents.toString()); writer.close(); } catch (IOException ioe) { throw new RuntimeException(ioe); } } public void extract() throws Exception { Document doc = null; System.out.println("Starting Extraction"); long start = System.currentTimeMillis(); try { while ((doc = docMaker.makeDocument()) != null) { create(doc.get(DocMaker.ID_FIELD), doc.get(DocMaker.TITLE_FIELD), doc .get(DocMaker.DATE_FIELD), doc.get(DocMaker.BODY_FIELD)); } } catch (NoMoreDataException e) { //continue } long finish = System.currentTimeMillis(); System.out.println("Extraction took " + (finish - start) + " ms"); } public static void main(String[] args) throws Exception { File wikipedia = null; File outputDir = new File("./enwiki"); boolean keepImageOnlyDocs = true; for (int i = 0; i < args.length; i++) { String arg = args[i]; if (arg.equals("--input") || arg.equals("-i")) { wikipedia = new File(args[i + 1]); i++; } else if (arg.equals("--output") || arg.equals("-o")) { outputDir = new File(args[i + 1]); i++; } else if (arg.equals("--discardImageOnlyDocs") || arg.equals("-d")) { keepImageOnlyDocs = false; } } DocMaker docMaker = new EnwikiDocMaker(); Properties properties = new Properties(); properties.setProperty("docs.file", wikipedia.getAbsolutePath()); properties.setProperty("content.source.forever", "false"); properties.setProperty("keep.image.only.docs", String.valueOf(keepImageOnlyDocs)); docMaker.setConfig(new Config(properties)); docMaker.resetInputs(); if (wikipedia != null && wikipedia.exists()) { System.out.println("Extracting Wikipedia to: " + outputDir + " using EnwikiDocMaker"); outputDir.mkdirs(); ExtractWikipedia extractor = new ExtractWikipedia(docMaker, outputDir); extractor.extract(); } else { printUsage(); } } private static void printUsage() { System.err.println("Usage: java -cp <...> org.apache.lucene.benchmark.utils.ExtractWikipedia --input|-i " + "[--output|-o ] [--discardImageOnlyDocs|-d] [--useLineDocMaker|-l]"); System.err.println("--discardImageOnlyDocs tells the extractor to skip Wiki docs that contain only images"); System.err.println("--useLineDocMaker uses the LineDocMaker. 
Default is EnwikiDocMaker"); } }lucene-2.9.4/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/NoDeletionPolicy.java0000644000175000017500000000217511474320251033221 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.utils; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.List; import org.apache.lucene.index.IndexDeletionPolicy; public class NoDeletionPolicy implements IndexDeletionPolicy { public void onCommit(List commits) throws IOException { } public void onInit(List commits) throws IOException { } } lucene-2.9.4/contrib/benchmark/src/java/overview.html0000644000175000017500000000156111474320253023315 0ustar janpascaljanpascal benchmark benchmark lucene-2.9.4/contrib/benchmark/src/test/0000755000175000017500000000000011474320251020612 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/0000755000175000017500000000000011474320251021401 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/0000755000175000017500000000000011474320251022622 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/0000755000175000017500000000000011474320251024075 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/0000755000175000017500000000000011554106561026034 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/BenchmarkTestCase.java0000644000175000017500000000232311474320251032220 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import junit.framework.TestCase; /** Base class for all Benchmark unit tests. 
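*  The working directory is resolved from the <code>benchmark.work.dir</code> system property
*  (defaulting to <code>test/benchmark</code>) and is created if it does not exist.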
*/ public class BenchmarkTestCase extends TestCase { private static final File workDir; static { workDir = new File(System.getProperty("benchmark.work.dir", "test/benchmark")).getAbsoluteFile(); workDir.mkdirs(); } public File getWorkDir() { return workDir; } } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/0000755000175000017500000000000011554106561027271 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/0000755000175000017500000000000011554106561030357 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/DocMakerTest.java0000644000175000017500000001343511474320251033550 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.feeds; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Properties; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; /** Tests the functionality of {@link DocMaker}. */ public class DocMakerTest extends BenchmarkTestCase { static final class OneDocSource extends ContentSource { private boolean finish = false; public void close() throws IOException { } public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { if (finish) { throw new NoMoreDataException(); } docData.setBody("body"); docData.setDate("date"); docData.setTitle("title"); Properties props = new Properties(); props.setProperty("key", "value"); docData.setProps(props); finish = true; return docData; } } private void doTestIndexProperties(boolean setIndexProps, boolean indexPropsVal, int numExpectedResults) throws Exception { Properties props = new Properties(); // Indexing configuration. 
props.setProperty("analyzer", SimpleAnalyzer.class.getName()); props.setProperty("content.source", OneDocSource.class.getName()); props.setProperty("directory", "RAMDirectory"); if (setIndexProps) { props.setProperty("doc.index.props", Boolean.toString(indexPropsVal)); } // Create PerfRunData Config config = new Config(props); PerfRunData runData = new PerfRunData(config); TaskSequence tasks = new TaskSequence(runData, getName(), null, false); tasks.addTask(new CreateIndexTask(runData)); tasks.addTask(new AddDocTask(runData)); tasks.addTask(new CloseIndexTask(runData)); tasks.doLogic(); IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true); TopDocs td = searcher.search(new TermQuery(new Term("key", "value")), 10); assertEquals(numExpectedResults, td.totalHits); searcher.close(); } private Document createTestNormsDocument(boolean setNormsProp, boolean normsPropVal, boolean setBodyNormsProp, boolean bodyNormsVal) throws Exception { Properties props = new Properties(); // Indexing configuration. props.setProperty("analyzer", SimpleAnalyzer.class.getName()); props.setProperty("content.source", OneDocSource.class.getName()); props.setProperty("directory", "RAMDirectory"); if (setNormsProp) { props.setProperty("doc.tokenized.norms", Boolean.toString(normsPropVal)); } if (setBodyNormsProp) { props.setProperty("doc.body.tokenized.norms", Boolean.toString(bodyNormsVal)); } // Create PerfRunData Config config = new Config(props); DocMaker dm = new DocMaker(); dm.setConfig(config); return dm.makeDocument(); } /* Tests doc.index.props property. */ public void testIndexProperties() throws Exception { // default is to not index properties. doTestIndexProperties(false, false, 0); // set doc.index.props to false. doTestIndexProperties(true, false, 0); // set doc.index.props to true. doTestIndexProperties(true, true, 1); } /* Tests doc.tokenized.norms and doc.body.tokenized.norms properties. 
*/ public void testNorms() throws Exception { Document doc; // Don't set anything, use the defaults doc = createTestNormsDocument(false, false, false, false); assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms()); assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms()); // Set norms to false doc = createTestNormsDocument(true, false, false, false); assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms()); assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms()); // Set norms to true doc = createTestNormsDocument(true, true, false, false); assertFalse(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms()); assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms()); // Set body norms to false doc = createTestNormsDocument(false, false, true, false); assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms()); assertTrue(doc.getField(DocMaker.BODY_FIELD).getOmitNorms()); // Set body norms to true doc = createTestNormsDocument(false, false, true, true); assertTrue(doc.getField(DocMaker.TITLE_FIELD).getOmitNorms()); assertFalse(doc.getField(DocMaker.BODY_FIELD).getOmitNorms()); } } ././@LongLink0000000000000000000000000000015000000000000011561 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/LineDocSourceTest.j0000644000175000017500000001305511474320251034067 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.feeds; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.Properties; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.tasks.AddDocTask; import org.apache.lucene.benchmark.byTask.tasks.CloseIndexTask; import org.apache.lucene.benchmark.byTask.tasks.CreateIndexTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; /** Tests the functionality of {@link LineDocSource}. 
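 * Most tests write a single line consisting of title, date and body joined by
 * WriteLineDocTask.SEP (optionally bzip2-compressed), index it via
 * LineDocSource, and search for the body term; testInvalidFormat additionally
 * checks that malformed lines are rejected.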
*/ public class LineDocSourceTest extends BenchmarkTestCase { private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); private void createBZ2LineFile(File file) throws Exception { OutputStream out = new FileOutputStream(file); out = csFactory.createCompressorOutputStream("bzip2", out); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); StringBuffer doc = new StringBuffer(); doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); writer.write(doc.toString()); writer.newLine(); writer.close(); } private void createRegularLineFile(File file) throws Exception { OutputStream out = new FileOutputStream(file); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out, "utf-8")); StringBuffer doc = new StringBuffer(); doc.append("title").append(WriteLineDocTask.SEP).append("date").append(WriteLineDocTask.SEP).append("body"); writer.write(doc.toString()); writer.newLine(); writer.close(); } private void doIndexAndSearchTest(File file, boolean setBZCompress, String bz2CompressVal) throws Exception { Properties props = new Properties(); // LineDocSource specific settings. props.setProperty("docs.file", file.getAbsolutePath()); if (setBZCompress) { props.setProperty("bzip.compression", bz2CompressVal); } // Indexing configuration. props.setProperty("analyzer", SimpleAnalyzer.class.getName()); props.setProperty("content.source", LineDocSource.class.getName()); props.setProperty("directory", "RAMDirectory"); // Create PerfRunData Config config = new Config(props); PerfRunData runData = new PerfRunData(config); TaskSequence tasks = new TaskSequence(runData, "testBzip2", null, false); tasks.addTask(new CreateIndexTask(runData)); tasks.addTask(new AddDocTask(runData)); tasks.addTask(new CloseIndexTask(runData)); tasks.doLogic(); IndexSearcher searcher = new IndexSearcher(runData.getDirectory(), true); TopDocs td = searcher.search(new TermQuery(new Term("body", "body")), 10); assertEquals(1, td.totalHits); assertNotNull(td.scoreDocs[0]); searcher.close(); } /* Tests LineDocSource with a bzip2 input stream. */ public void testBZip2() throws Exception { File file = new File(getWorkDir(), "one-line.bz2"); createBZ2LineFile(file); doIndexAndSearchTest(file, true, "true"); } public void testBZip2AutoDetect() throws Exception { File file = new File(getWorkDir(), "one-line.bz2"); createBZ2LineFile(file); doIndexAndSearchTest(file, false, null); } public void testRegularFile() throws Exception { File file = new File(getWorkDir(), "one-line"); createRegularLineFile(file); doIndexAndSearchTest(file, false, null); } public void testInvalidFormat() throws Exception { String[] testCases = new String[] { "", // empty line "title", // just title "title" + WriteLineDocTask.SEP, // title + SEP "title" + WriteLineDocTask.SEP + "body", // title + SEP + body // note that title + SEP + body + SEP is a valid line, which results in an // empty body }; for (int i = 0; i < testCases.length; i++) { File file = new File(getWorkDir(), "one-line"); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8")); writer.write(testCases[i]); writer.newLine(); writer.close(); try { doIndexAndSearchTest(file, false, null); fail("Some exception should have been thrown for: [" + testCases[i] + "]"); } catch (Exception e) { // expected. 
} } } } ././@LongLink0000000000000000000000000000015400000000000011565 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTest.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/feeds/TrecContentSourceTe0000644000175000017500000003065111474320251034204 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.feeds; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; import java.text.ParseException; import java.util.Date; import junit.framework.TestCase; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.DateTools; public class TrecContentSourceTest extends TestCase { /** A TrecDocMaker which works on a String and not files. */ private static class StringableTrecSource extends TrecContentSource { private String docs = null; public StringableTrecSource(String docs, boolean forever) { this.docs = docs; this.forever = forever; } void openNextFile() throws NoMoreDataException, IOException { if (reader != null) { if (!forever) { throw new NoMoreDataException(); } ++iteration; } reader = new BufferedReader(new StringReader(docs)); } public void setConfig(Config config) { htmlParser = new DemoHTMLParser(); } } private void assertDocData(DocData dd, String expName, String expTitle, String expBody, Date expDate) throws ParseException { assertNotNull(dd); assertEquals(expName, dd.getName()); assertEquals(expTitle, dd.getTitle()); assertTrue(dd.getBody().indexOf(expBody) != -1); Date date = dd.getDate() != null ? 
DateTools.stringToDate(dd.getDate()) : null; assertEquals(expDate, date); } private void assertNoMoreDataException(StringableTrecSource stdm) throws Exception { boolean thrown = false; try { stdm.getNextDocData(null); } catch (NoMoreDataException e) { thrown = true; } assertTrue("Expecting NoMoreDataException", thrown); } public void testOneDocument() throws Exception { String docs = "\r\n" + "TEST-000\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 text\r\n" + "\r\n" + "\r\n" + "\r\n" + ""; StringableTrecSource source = new StringableTrecSource(docs, false); source.setConfig(null); DocData dd = source.getNextDocData(new DocData()); assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); assertNoMoreDataException(source); } public void testTwoDocuments() throws Exception { String docs = "\r\n" + "TEST-000\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 text\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2008 08:01:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001 text\r\n" + "\r\n" + "\r\n" + "\r\n" + ""; StringableTrecSource source = new StringableTrecSource(docs, false); source.setConfig(null); DocData dd = source.getNextDocData(new DocData()); assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); dd = source.getNextDocData(dd); assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source .parseDate("Sun, 11 Jan 2009 08:01:00 GMT")); assertNoMoreDataException(source); } // If a Date: attribute is missing, make sure the document is not skipped, but // rather that null Data is assigned. 
public void testMissingDate() throws Exception { String docs = "\r\n" + "TEST-000\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 text\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:01:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-001 text\r\n" + "\r\n" + "\r\n" + "\r\n" + ""; StringableTrecSource source = new StringableTrecSource(docs, false); source.setConfig(null); DocData dd = source.getNextDocData(new DocData()); assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null); dd = source.getNextDocData(dd); assertDocData(dd, "TEST-001_0", "TEST-001 title", "TEST-001 text", source .parseDate("Sun, 11 Jan 2009 08:01:00 GMT")); assertNoMoreDataException(source); } // When a 'bad date' is input (unparsable date), make sure the DocData date is // assigned null. public void testBadDate() throws Exception { String docs = "\r\n" + "TEST-000\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Bad Date\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 text\r\n" + "\r\n" + "\r\n" + "\r\n" + ""; StringableTrecSource source = new StringableTrecSource(docs, false); source.setConfig(null); DocData dd = source.getNextDocData(new DocData()); assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", null); assertNoMoreDataException(source); } public void testForever() throws Exception { String docs = "\r\n" + "TEST-000\r\n" + "\r\n" + "http://lucene.apache.org.trecdocmaker.test\r\n" + "HTTP/1.1 200 OK\r\n" + "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Server: Apache/1.3.27 (Unix)\r\n" + "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" + "Content-Length: 614\r\n" + "Connection: close\r\n" + "Content-Type: text/html\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 title\r\n" + "\r\n" + "\r\n" + "\r\n" + "\r\n" + "TEST-000 text\r\n" + "\r\n" + "\r\n" + "\r\n" + ""; StringableTrecSource source = new StringableTrecSource(docs, true); source.setConfig(null); DocData dd = source.getNextDocData(new DocData()); assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); // same document, but the second iteration changes the name. dd = source.getNextDocData(dd); assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source .parseDate("Sun, 11 Jan 2009 08:00:00 GMT")); // Don't test that NoMoreDataException is thrown, since the forever flag is // turned on. 
} } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/0000755000175000017500000000000011554106561030416 5ustar janpascaljanpascal././@LongLink0000000000000000000000000000015200000000000011563 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTaskTest.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTaskTest0000644000175000017500000000543711474320251034223 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.PrintStream; import java.util.Properties; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; /** Tests the functionality of {@link CreateIndexTask}. */ public class CreateIndexTaskTest extends BenchmarkTestCase { private PerfRunData createPerfRunData(String infoStreamValue) throws Exception { Properties props = new Properties(); props.setProperty("print.props", "false"); // don't print anything props.setProperty("directory", "RAMDirectory"); props.setProperty("writer.info.stream", infoStreamValue); Config config = new Config(props); return new PerfRunData(config); } public void testInfoStream_SystemOutErr() throws Exception { PrintStream curOut = System.out; ByteArrayOutputStream baos = new ByteArrayOutputStream(); System.setOut(new PrintStream(baos)); try { PerfRunData runData = createPerfRunData("SystemOut"); CreateIndexTask cit = new CreateIndexTask(runData); cit.doLogic(); new CloseIndexTask(runData).doLogic(); assertTrue(baos.size() > 0); } finally { System.setOut(curOut); } PrintStream curErr = System.err; baos.reset(); System.setErr(new PrintStream(baos)); try { PerfRunData runData = createPerfRunData("SystemErr"); CreateIndexTask cit = new CreateIndexTask(runData); cit.doLogic(); new CloseIndexTask(runData).doLogic(); assertTrue(baos.size() > 0); } finally { System.setErr(curErr); } } public void testInfoStream_File() throws Exception { File outFile = new File(getWorkDir(), "infoStreamTest"); PerfRunData runData = createPerfRunData(outFile.getAbsolutePath()); new CreateIndexTask(runData).doLogic(); new CloseIndexTask(runData).doLogic(); assertTrue(outFile.length() > 0); } } ././@LongLink0000000000000000000000000000016200000000000011564 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighter0000644000175000017500000000507711474320251034312 0ustar janpascaljanpascal/** * Licensed to the Apache Software 
Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.TextFragment; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.search.Query; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import java.io.IOException; /** * Test Search task which counts number of searches. */ public class CountingHighlighterTestTask extends SearchTravRetHighlightTask { public static int numHighlightedResults = 0; public static int numDocsRetrieved = 0; public CountingHighlighterTestTask(PerfRunData runData) { super(runData); } protected Document retrieveDoc(IndexReader ir, int id) throws IOException { Document document = ir.document(id); if (document != null) { numDocsRetrieved++; } return document; } public BenchmarkHighlighter getBenchmarkHighlighter(Query q) { highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(q)); return new BenchmarkHighlighter() { public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception { TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer); TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags); numHighlightedResults += frag != null ? frag.length : 0; return frag != null ? frag.length : 0; } }; } } ././@LongLink0000000000000000000000000000015300000000000011564 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTest.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTaskTes0000644000175000017500000002111111474320251034157 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Properties; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; /** Tests the functionality of {@link WriteLineDocTask}. */ public class WriteLineDocTaskTest extends BenchmarkTestCase { // class has to be public so that Class.forName.newInstance() will work public static final class WriteLineDocMaker extends DocMaker { public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); return doc; } } // class has to be public so that Class.forName.newInstance() will work public static final class NewLinesDocMaker extends DocMaker { public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(BODY_FIELD, "body\r\ntext\ttwo", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(TITLE_FIELD, "title\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(DATE_FIELD, "date\r\ntext", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); return doc; } } // class has to be public so that Class.forName.newInstance() will work public static final class NoBodyDocMaker extends DocMaker { public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(TITLE_FIELD, "title", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); return doc; } } // class has to be public so that Class.forName.newInstance() will work public static final class NoTitleDocMaker extends DocMaker { public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(BODY_FIELD, "body", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); return doc; } } // class has to be public so that Class.forName.newInstance() will work public static final class JustDateDocMaker extends DocMaker { public Document makeDocument() throws Exception { Document doc = new Document(); doc.add(new Field(DATE_FIELD, "date", Store.NO, Index.NOT_ANALYZED_NO_NORMS)); return doc; } } private static final CompressorStreamFactory csFactory = new CompressorStreamFactory(); private PerfRunData createPerfRunData(File file, boolean setBZCompress, String bz2CompressVal, String docMakerName) throws Exception { Properties props = new Properties(); props.setProperty("doc.maker", docMakerName); props.setProperty("line.file.out", file.getAbsolutePath()); if (setBZCompress) { props.setProperty("bzip.compression", bz2CompressVal); } props.setProperty("directory", "RAMDirectory"); // no accidental FS dir. 
Config config = new Config(props); return new PerfRunData(config); } private void doReadTest(File file, boolean bz2File, String expTitle, String expDate, String expBody) throws Exception { InputStream in = new FileInputStream(file); if (bz2File) { in = csFactory.createCompressorInputStream("bzip2", in); } BufferedReader br = new BufferedReader(new InputStreamReader(in, "utf-8")); try { String line = br.readLine(); assertNotNull(line); String[] parts = line.split(Character.toString(WriteLineDocTask.SEP)); int numExpParts = expBody == null ? 2 : 3; assertEquals(numExpParts, parts.length); assertEquals(expTitle, parts[0]); assertEquals(expDate, parts[1]); if (expBody != null) { assertEquals(expBody, parts[2]); } assertNull(br.readLine()); } finally { br.close(); } } /* Tests WriteLineDocTask with a bzip2 format. */ public void testBZip2() throws Exception { // Create a document in bz2 format. File file = new File(getWorkDir(), "one-line.bz2"); PerfRunData runData = createPerfRunData(file, true, "true", WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, true, "title", "date", "body"); } public void testBZip2AutoDetect() throws Exception { // Create a document in bz2 format. File file = new File(getWorkDir(), "one-line.bz2"); PerfRunData runData = createPerfRunData(file, false, null, WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, true, "title", "date", "body"); } public void testRegularFile() throws Exception { // Create a document in regular format. File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, true, "false", WriteLineDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, false, "title", "date", "body"); } public void testCharsReplace() throws Exception { // WriteLineDocTask replaced only \t characters w/ a space, since that's its // separator char. However, it didn't replace newline characters, which // resulted in errors in LineDocSource. File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, false, null, NewLinesDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, false, "title text", "date text", "body text two"); } public void testEmptyBody() throws Exception { // WriteLineDocTask threw away documents w/ no BODY element, even if they // had a TITLE element (LUCENE-1755). 
It should throw away documents if they // don't have BODY nor TITLE File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, false, null, NoBodyDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, false, "title", "date", null); } public void testEmptyTitle() throws Exception { File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, false, null, NoTitleDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); doReadTest(file, false, "", "date", "body"); } public void testJustDate() throws Exception { File file = new File(getWorkDir(), "one-line"); PerfRunData runData = createPerfRunData(file, false, null, JustDateDocMaker.class.getName()); WriteLineDocTask wldt = new WriteLineDocTask(runData); wldt.doLogic(); wldt.close(); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8")); try { String line = br.readLine(); assertNull(line); } finally { br.close(); } } } ././@LongLink0000000000000000000000000000015500000000000011566 Lustar rootrootlucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingSearchTestTask.javalucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingSearchTestT0000644000175000017500000000323011474320251034232 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.byTask.tasks; import org.apache.lucene.benchmark.byTask.PerfRunData; /** * Test Search task which counts number of searches. */ public class CountingSearchTestTask extends SearchTask { public static int numSearches = 0; public static long startMillis; public static long lastMillis; public static long prevLastMillis; public CountingSearchTestTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { int res = super.doLogic(); incrNumSearches(); return res; } private static synchronized void incrNumSearches() { prevLastMillis = lastMillis; lastMillis = System.currentTimeMillis(); if (0 == numSearches) { startMillis = prevLastMillis = lastMillis; } numSearches++; } public long getElapsedMillis() { return lastMillis - startMillis; } } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/PerfTaskTest.java0000644000175000017500000000516611474320251033643 0ustar janpascaljanpascalpackage org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Properties; import org.apache.lucene.benchmark.BenchmarkTestCase; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.Config; /** Tests the functionality of the abstract {@link PerfTask}. */ public class PerfTaskTest extends BenchmarkTestCase { private static final class MyPerfTask extends PerfTask { public MyPerfTask(PerfRunData runData) { super(runData); } public int doLogic() throws Exception { return 0; } public int getLogStep() { return logStep; } } private PerfRunData createPerfRunData(boolean setLogStep, int logStepVal, boolean setTaskLogStep, int taskLogStepVal) throws Exception { Properties props = new Properties(); if (setLogStep) { props.setProperty("log.step", Integer.toString(logStepVal)); } if (setTaskLogStep) { props.setProperty("log.step.MyPerf", Integer.toString(taskLogStepVal)); } props.setProperty("directory", "RAMDirectory"); // no accidental FS dir. Config config = new Config(props); return new PerfRunData(config); } private void doLogStepTest(boolean setLogStep, int logStepVal, boolean setTaskLogStep, int taskLogStepVal, int expLogStepValue) throws Exception { PerfRunData runData = createPerfRunData(setLogStep, logStepVal, setTaskLogStep, taskLogStepVal); MyPerfTask mpt = new MyPerfTask(runData); assertEquals(expLogStepValue, mpt.getLogStep()); } public void testLogStep() throws Exception { doLogStepTest(false, -1, false, -1, PerfTask.DEFAULT_LOG_STEP); doLogStepTest(true, -1, false, -1, Integer.MAX_VALUE); doLogStepTest(true, 100, false, -1, 100); doLogStepTest(false, -1, true, -1, Integer.MAX_VALUE); doLogStepTest(false, -1, true, 100, 100); } } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksParse.java0000644000175000017500000000677211474320251033700 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.lucene.benchmark.byTask; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import junit.framework.TestCase; import org.apache.lucene.benchmark.byTask.tasks.PerfTask; import org.apache.lucene.benchmark.byTask.tasks.TaskSequence; import org.apache.lucene.benchmark.byTask.utils.Algorithm; /** Test very simply that perf tasks are parses as expected. */ public class TestPerfTasksParse extends TestCase { static final String NEW_LINE = System.getProperty("line.separator"); static final String INDENT = " "; // properties in effect in all tests here static final String propPart = INDENT + "directory=RAMDirectory" + NEW_LINE + INDENT + "print.props=false" + NEW_LINE ; public TestPerfTasksParse(String name) { super(name); } /** Test the repetiotion parsing for parallel tasks */ public void testParseParallelTaskSequenceRepetition() throws Exception { String taskStr = "AddDoc"; String parsedTasks = "[ "+taskStr+" ] : 1000"; Benchmark benchmark = new Benchmark(new StringReader(propPart+parsedTasks)); Algorithm alg = benchmark.getAlgorithm(); ArrayList algTasks = alg.extractTasks(); boolean foundAdd = false; for (Iterator iter = algTasks.iterator(); iter.hasNext();) { PerfTask task = (PerfTask) iter.next(); if (task.toString().indexOf(taskStr)>=0) { foundAdd = true; } if (task instanceof TaskSequence) { assertEquals("repetions should be 1000 for "+parsedTasks, 1000, ((TaskSequence) task).getRepetitions()); assertTrue("sequence for "+parsedTasks+" should be parallel!", ((TaskSequence) task).isParallel()); } assertTrue("Task "+taskStr+" was not found in "+alg.toString(),foundAdd); } } /** Test the repetiotion parsing for sequential tasks */ public void testParseTaskSequenceRepetition() throws Exception { String taskStr = "AddDoc"; String parsedTasks = "{ "+taskStr+" } : 1000"; Benchmark benchmark = new Benchmark(new StringReader(propPart+parsedTasks)); Algorithm alg = benchmark.getAlgorithm(); ArrayList algTasks = alg.extractTasks(); boolean foundAdd = false; for (Iterator iter = algTasks.iterator(); iter.hasNext();) { PerfTask task = (PerfTask) iter.next(); if (task.toString().indexOf(taskStr)>=0) { foundAdd = true; } if (task instanceof TaskSequence) { assertEquals("repetions should be 1000 for "+parsedTasks, 1000, ((TaskSequence) task).getRepetitions()); assertFalse("sequence for "+parsedTasks+" should be sequential!", ((TaskSequence) task).isParallel()); } assertTrue("Task "+taskStr+" was not found in "+alg.toString(),foundAdd); } } } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java0000644000175000017500000007417011474320251033660 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.lucene.benchmark.byTask; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.StringReader; import java.util.Iterator; import java.util.List; import junit.framework.TestCase; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource; import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker; import org.apache.lucene.benchmark.byTask.stats.TaskStats; import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask; import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LogDocMergePolicy; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.index.SerialMergeScheduler; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.search.FieldCache.StringIndex; import org.apache.lucene.search.FieldCache; import org.apache.lucene.store.Directory; /** * Test very simply that perf tasks - simple algorithms - are doing what they should. */ public class TestPerfTasksLogic extends TestCase { private static final boolean DEBUG = false; static final String NEW_LINE = System.getProperty("line.separator"); // properties in effect in all tests here static final String propLines [] = { "directory=RAMDirectory", "print.props=false", }; /** * @param name test name */ public TestPerfTasksLogic(String name) { super(name); } /** * Test index creation logic */ public void testIndexAndSearchTasks() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "Optimize", "CloseIndex", "OpenReader", "{ CountingSearchTest } : 200", "CloseReader", "[ CountingSearchTest > : 70", "[ CountingSearchTest > : 9", }; // 2. we test this value later CountingSearchTestTask.numSearches = 0; // 3. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 4. test specific checks after the benchmark run completed. assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches); assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false, IndexWriter.MaxFieldLength.LIMITED); iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); ir.close(); } /** * Test timed sequence task. */ public void testTimedSearchTask() throws Exception { String algLines[] = { "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "Optimize", "CloseIndex", "OpenReader", "{ CountingSearchTest } : 1.5s", "CloseReader", }; CountingSearchTestTask.numSearches = 0; execBenchmark(algLines); assertTrue(CountingSearchTestTask.numSearches > 0); long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis; assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500); } public void testHighlighting() throws Exception { // 1. 
alg definition (required in every "logic" test) String algLines[] = { "doc.stored=true", "content.source="+Reuters20ContentSource.class.getName(), "query.maker=" + ReutersQueryMaker.class.getName(), "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "Optimize", "CloseIndex", "OpenReader(true)", "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", "CloseReader", }; // 2. we test this value later CountingHighlighterTestTask.numHighlightedResults = 0; CountingHighlighterTestTask.numDocsRetrieved = 0; // 3. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 4. test specific checks after the benchmark run completed. assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved); //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs //we probably should use a different doc/query maker, but... assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false, IndexWriter.MaxFieldLength.LIMITED); iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); ir.close(); } public void testHighlightingTV() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "doc.stored=true",//doc storage is required in order to have text to highlight "doc.term.vector.offsets=true", "content.source="+Reuters20ContentSource.class.getName(), "query.maker=" + ReutersQueryMaker.class.getName(), "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "Optimize", "CloseIndex", "OpenReader(false)", "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", "CloseReader", }; // 2. we test this value later CountingHighlighterTestTask.numHighlightedResults = 0; CountingHighlighterTestTask.numDocsRetrieved = 0; // 3. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 4. test specific checks after the benchmark run completed. assertEquals("TestSearchTask was supposed to be called!",147,CountingHighlighterTestTask.numDocsRetrieved); //pretty hard to figure out a priori how many docs are going to have highlighted fragments returned, but we can never have more than the number of docs //we probably should use a different doc/query maker, but... assertTrue("TestSearchTask was supposed to be called!", CountingHighlighterTestTask.numDocsRetrieved >= CountingHighlighterTestTask.numHighlightedResults && CountingHighlighterTestTask.numHighlightedResults > 0); assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. 
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false,IndexWriter.MaxFieldLength.UNLIMITED); iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); ir.close(); } public void testHighlightingNoTvNoStore() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "doc.stored=false", "content.source="+Reuters20ContentSource.class.getName(), "query.maker=" + ReutersQueryMaker.class.getName(), "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "Optimize", "CloseIndex", "OpenReader", "{ CountingHighlighterTest(size[1],highlight[1],mergeContiguous[true],maxFrags[1],fields[body]) } : 200", "CloseReader", }; // 2. we test this value later CountingHighlighterTestTask.numHighlightedResults = 0; CountingHighlighterTestTask.numDocsRetrieved = 0; // 3. execute the algorithm (required in every "logic" test) try { Benchmark benchmark = execBenchmark(algLines); assertTrue("CountingHighlighterTest should have thrown an exception", false); assertNotNull(benchmark); // (avoid compile warning on unused variable) } catch (Exception e) { assertTrue(true); } } /** * Test Exhasting Doc Maker logic */ public void testExhaustContentSource() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource", "content.source.log.step=1", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "# ----- alg ", "CreateIndex", "{ AddDoc } : * ", "Optimize", "CloseIndex", "OpenReader", "{ CountingSearchTest } : 100", "CloseReader", "[ CountingSearchTest > : 30", "[ CountingSearchTest > : 9", }; // 2. we test this value later CountingSearchTestTask.numSearches = 0; // 3. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 4. test specific checks after the benchmark run completed. assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches); assertTrue("Index does not exist?...!", IndexReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false,IndexWriter.MaxFieldLength.UNLIMITED); iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs()); ir.close(); } // LUCENE-1994: test thread safety of SortableSingleDocMaker public void testDocMakerThreadSafety() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource", "doc.term.vector=false", "log.step.AddDoc=10000", "content.source.forever=true", "directory=RAMDirectory", "doc.reuse.fields=false", "doc.stored=false", "doc.tokenized=false", "doc.index.props=true", "# ----- alg ", "CreateIndex", "[ { AddDoc > : 2500 ] : 4", "CloseIndex", }; // 2. we test this value later CountingSearchTestTask.numSearches = 0; // 3. 
execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); IndexReader r = IndexReader.open(benchmark.getRunData().getDirectory(), true); StringIndex idx = FieldCache.DEFAULT.getStringIndex(r, "country"); final int maxDoc = r.maxDoc(); assertEquals(10000, maxDoc); for(int i=0;i<10000;i++) { assertNotNull("doc " + i + " has null country", idx.lookup[idx.order[i]]); } r.close(); } /** * Test Parallel Doc Maker logic (for LUCENE-940) */ public void testParallelDocMaker() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "doc.term.vector=false", "content.source.forever=false", "directory=FSDirectory", "doc.stored=false", "doc.tokenized=false", "# ----- alg ", "CreateIndex", "[ { AddDoc } : * ] : 4 ", "CloseIndex", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } /** * Test WriteLineDoc and LineDocSource. */ public void testLineDocFile() throws Exception { File lineFile = new File(System.getProperty("tempDir"), "test.reuters.lines.txt"); // We will call WriteLineDocs this many times final int NUM_TRY_DOCS = 500; // Creates a line file with first 500 docs from reuters String algLines1[] = { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource", "content.source.forever=false", "line.file.out=" + lineFile.getAbsolutePath().replace('\\', '/'), "# ----- alg ", "{WriteLineDoc()}:" + NUM_TRY_DOCS, }; // Run algo Benchmark benchmark = execBenchmark(algLines1); // Verify we got somewhere between 1-500 lines (some // Reuters docs have no body, which WriteLineDoc task // skips). BufferedReader r = new BufferedReader(new FileReader(lineFile)); int numLines = 0; while(r.readLine() != null) numLines++; r.close(); assertTrue("did not see the right number of docs; should be > 0 and <= " + NUM_TRY_DOCS + " but was " + numLines, numLines > 0 && numLines <= NUM_TRY_DOCS); // Index the line docs String algLines2[] = { "# ----- properties ", "analyzer=org.apache.lucene.analysis.SimpleAnalyzer", "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + lineFile.getAbsolutePath().replace('\\', '/'), "content.source.forever=false", "doc.reuse.fields=false", "autocommit=false", "ram.flush.mb=4", "# ----- alg ", "ResetSystemErase", "CreateIndex", "{AddDoc}: *", "CloseIndex", }; // Run algo benchmark = execBenchmark(algLines2); // now we should be able to open the index for write. 
IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),null,false,IndexWriter.MaxFieldLength.UNLIMITED); iw.close(); IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index", numLines, ir.numDocs()); ir.close(); lineFile.delete(); } /** * Test ReadTokensTask */ public void testReadTokens() throws Exception { // We will call ReadTokens on this many docs final int NUM_DOCS = 100; // Read tokens from first NUM_DOCS docs from Reuters and // then build index from the same docs String algLines1[] = { "# ----- properties ", "analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer", "content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource", "# ----- alg ", "{ReadTokens}: " + NUM_DOCS, "ResetSystemErase", "CreateIndex", "{AddDoc}: " + NUM_DOCS, "CloseIndex", }; // Run algo Benchmark benchmark = execBenchmark(algLines1); List stats = benchmark.getRunData().getPoints().taskStats(); // Count how many tokens all ReadTokens saw int totalTokenCount1 = 0; for (Iterator it = stats.iterator(); it.hasNext();) { TaskStats stat = (TaskStats) it.next(); if (stat.getTask().getName().equals("ReadTokens")) { totalTokenCount1 += stat.getCount(); } } // Separately count how many tokens are actually in the index: IndexReader reader = IndexReader.open(benchmark.getRunData().getDirectory()); assertEquals(NUM_DOCS, reader.numDocs()); TermEnum terms = reader.terms(); TermDocs termDocs = reader.termDocs(); int totalTokenCount2 = 0; while(terms.next()) { termDocs.seek(terms.term()); while(termDocs.next()) totalTokenCount2 += termDocs.freq(); } reader.close(); // Make sure they are the same assertEquals(totalTokenCount1, totalTokenCount2); } /** * Test that " {[AddDoc(4000)]: 4} : * " works corrcetly (for LUCENE-941) */ public void testParallelExhausted() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "task.max.depth.log=1", "# ----- alg ", "CreateIndex", "{ [ AddDoc]: 4} : * ", "ResetInputs ", "{ [ AddDoc]: 4} : * ", "CloseIndex", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 2 * 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } // create the benchmark and execute it. 
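  // For illustration only (this sketch is not part of the original source):
  // execBenchmark prepends the shared propLines to the given algLines, so for
  // testIndexAndSearchTasks the text handed to Benchmark looks roughly like:
  //
  //   directory=RAMDirectory
  //   print.props=false
  //   ResetSystemErase
  //   CreateIndex
  //   { AddDoc } : 1000
  //   Optimize
  //   CloseIndex
  //   OpenReader
  //   { CountingSearchTest } : 200
  //   CloseReader
  //   [ CountingSearchTest > : 70
  //   [ CountingSearchTest > : 9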
public static Benchmark execBenchmark(String[] algLines) throws Exception { String algText = algLinesToText(algLines); logTstLogic(algText); Benchmark benchmark = new Benchmark(new StringReader(algText)); benchmark.execute(); return benchmark; } // catenate alg lines to make the alg text private static String algLinesToText(String[] algLines) { String indent = " "; StringBuffer sb = new StringBuffer(); for (int i = 0; i < propLines.length; i++) { sb.append(indent).append(propLines[i]).append(NEW_LINE); } for (int i = 0; i < algLines.length; i++) { sb.append(indent).append(algLines[i]).append(NEW_LINE); } return sb.toString(); } private static void logTstLogic (String txt) { if (!DEBUG) return; System.out.println("Test logic of:"); System.out.println(txt); } /** use reuters and the exhaust mechanism, but to be faster, add 20 docs only... */ public static class Reuters20ContentSource extends ReutersContentSource { private int nDocs = 0; public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { if (nDocs >= 20 && !forever) { throw new NoMoreDataException(); } nDocs++; return super.getNextDocData(docData); } public synchronized void resetInputs() throws IOException { super.resetInputs(); nDocs = 0; } } /** * Test that exhaust in loop works as expected (LUCENE-1115). */ public void testExhaustedLooped() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "task.max.depth.log=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", " CloseIndex", "} : 2", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } /** * Test that we can close IndexWriter with argument "false". */ public void testCloseIndexFalse() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "ram.flush.mb=-1", "max.buffered=2", "content.source.log.step=3", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "debug.level=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", " CloseIndex(false)", "} : 2", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } public static class MyMergeScheduler extends SerialMergeScheduler { boolean called; public MyMergeScheduler() { super(); called = true; } } /** * Test that we can set merge scheduler". */ public void testMergeScheduler() throws Exception { // 1. 
alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "merge.scheduler=" + MyMergeScheduler.class.getName(), "doc.stored=false", "doc.tokenized=false", "debug.level=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", "} : 2", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); assertTrue("did not use the specified MergeScheduler", ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getMergeScheduler()).called); benchmark.getRunData().getIndexWriter().close(); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } public static class MyMergePolicy extends LogDocMergePolicy { boolean called; public MyMergePolicy(IndexWriter writer) { super(writer); called = true; } } /** * Test that we can set merge policy". */ public void testMergePolicy() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "ram.flush.mb=-1", "max.buffered=2", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "merge.policy=" + MyMergePolicy.class.getName(), "doc.stored=false", "doc.tokenized=false", "debug.level=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", "} : 2", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); assertTrue("did not use the specified MergeScheduler", ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getMergePolicy()).called); benchmark.getRunData().getIndexWriter().close(); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); } /** * Test that IndexWriter settings stick. */ public void testIndexWriterSettings() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "ram.flush.mb=-1", "max.buffered=2", "compound=cmpnd:true:false", "doc.term.vector=vector:false:true", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "merge.factor=3", "doc.tokenized=false", "debug.level=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", " NewRound", "} : 2", }; // 2. 
execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); final IndexWriter writer = benchmark.getRunData().getIndexWriter(); assertEquals(2, writer.getMaxBufferedDocs()); assertEquals(IndexWriter.DISABLE_AUTO_FLUSH, (int) writer.getRAMBufferSizeMB()); assertEquals(3, writer.getMergeFactor()); assertFalse(writer.getUseCompoundFile()); writer.close(); Directory dir = benchmark.getRunData().getDirectory(); IndexReader reader = IndexReader.open(dir); TermFreqVector [] tfv = reader.getTermFreqVectors(0); assertNotNull(tfv); assertTrue(tfv.length > 0); reader.close(); } /** * Test that we can call optimize(maxNumSegments). */ public void testOptimizeMaxNumSegments() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=3", "ram.flush.mb=-1", "max.buffered=3", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "merge.policy=org.apache.lucene.index.LogDocMergePolicy", "doc.stored=false", "doc.tokenized=false", "debug.level=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", " Optimize(3)", " CloseIndex()", "} : 2", }; // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test number of docs in the index IndexReader ir = IndexReader.open(benchmark.getRunData().getDirectory()); int ndocsExpected = 20; // Reuters20ContentSource exhausts after 20 docs. assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs()); ir.close(); // Make sure we have 3 segments: SegmentInfos infos = new SegmentInfos(); infos.read(benchmark.getRunData().getDirectory()); assertEquals(3, infos.size()); } /** * Test disabling task count (LUCENE-1136). */ public void testDisableCounting() throws Exception { doTestDisableCounting(true); doTestDisableCounting(false); } private void doTestDisableCounting(boolean disable) throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = disableCountingLines(disable); // 2. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 3. test counters int n = disable ? 0 : 1; int nChecked = 0; for (Iterator ts = benchmark.getRunData().getPoints().taskStats().iterator(); ts.hasNext();) { TaskStats stats = (TaskStats) ts.next(); String taskName = stats.getTask().getName(); if (taskName.equals("Rounds")) { assertEquals("Wrong total count!",20+2*n,stats.getCount()); nChecked++; } else if (taskName.equals("CreateIndex")) { assertEquals("Wrong count for CreateIndex!",n,stats.getCount()); nChecked++; } else if (taskName.equals("CloseIndex")) { assertEquals("Wrong count for CloseIndex!",n,stats.getCount()); nChecked++; } } assertEquals("Missing some tasks to check!",3,nChecked); } private static String[] disableCountingLines (boolean disable) { String dis = disable ? 
"-" : ""; return new String[] { "# ----- properties ", "content.source="+Reuters20ContentSource.class.getName(), "content.source.log.step=30", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "task.max.depth.log=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " "+dis+"CreateIndex", // optionally disable counting here " { \"AddDocs\" AddDoc > : * ", " "+dis+" CloseIndex", // optionally disable counting here (with extra blanks) "}", "RepSumByName", }; } } lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/0000755000175000017500000000000011554106561027524 5ustar janpascaljanpascallucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecQRels.txt0000644000175000017500000005505011474320251032171 0ustar janpascaljanpascal# ----------------------------------------------------------------------- # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ----------------------------------------------------------------------- # ------------------------------------------------------------ # Format: # # qnum 0 doc-name is-relevant # # # The origin of this file was created using # utils.QualityQueriesFinder, so all queries # would have perfect 1.0 for all meassures. # # To make it suitable for testing it was modified # for some queries, depending on m = qnum % 8 # m==0: avg_precision and recall are hurt, by marking fake docs as relevant # m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs # m==2: all precision, precision_at_n and recall are hurt. # m>=3: these queries remain perfect # ------------------------------------------------------------ # --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant 0 0 fakedoc1 1 0 0 fakedoc2 1 0 0 fakedoc3 1 0 0 fakedoc4 1 0 0 doc18211 1 0 0 doc20192 1 0 0 doc7401 1 0 0 doc11285 1 0 0 doc20647 1 0 0 doc3057 1 0 0 doc12431 1 0 0 doc4989 1 0 0 doc17324 1 0 0 doc4030 1 0 0 doc4290 1 0 0 doc3462 1 0 0 doc15313 1 0 0 doc10303 1 0 0 doc1893 1 0 0 doc5008 1 0 0 doc14634 1 0 0 doc5471 1 0 0 doc17904 1 0 0 doc7168 1 0 0 doc21275 1 0 0 doc9011 1 0 0 doc17546 1 0 0 doc9102 1 0 0 doc13199 1 # --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs 1 0 doc9857 0 1 0 doc16846 1 1 0 doc4320 1 1 0 doc9501 0 1 0 doc10159 1 1 0 doc16642 1 1 0 doc17536 0 1 0 doc17571 1 1 0 doc18728 1 1 0 doc18828 1 1 0 doc19108 0 1 0 doc9940 1 1 0 doc11852 1 1 0 doc7430 0 1 0 doc19162 1 1 0 doc1743 1 1 0 doc2137 1 1 0 doc7611 1 1 0 doc8072 1 1 0 doc12764 1 1 0 doc2593 1 1 0 doc11088 1 1 0 doc931 1 1 0 doc7673 1 1 0 doc12941 1 1 0 doc11797 1 1 0 doc11831 1 1 0 doc13162 1 1 0 doc4423 1 1 0 doc5217 1 # ---- m==2: all precision, precision_at_n and recall are hurt. 
2 0 fakedoc1 1 2 0 fakedoc2 1 2 0 fakedoc3 1 2 0 fakedoc4 1 2 0 doc3137 0 2 0 doc7142 0 2 0 doc13667 0 2 0 doc13171 0 2 0 doc13372 1 2 0 doc21415 1 2 0 doc16298 1 2 0 doc14957 1 2 0 doc153 1 2 0 doc16092 1 2 0 doc16096 1 2 0 doc21303 1 2 0 doc18681 1 2 0 doc20756 1 2 0 doc355 1 2 0 doc13395 1 2 0 doc5009 1 2 0 doc17164 1 2 0 doc13162 1 2 0 doc11757 1 2 0 doc9637 1 2 0 doc18087 1 2 0 doc4593 1 2 0 doc4677 1 2 0 doc20865 1 2 0 doc8556 1 2 0 doc2578 1 2 0 doc1163 1 2 0 doc3797 1 2 0 doc11094 1 3 0 doc19578 1 3 0 doc14860 1 3 0 doc7235 1 3 0 doc20590 1 3 0 doc17933 1 3 0 doc9384 1 3 0 doc10783 1 3 0 doc1963 1 3 0 doc18356 1 3 0 doc13254 1 3 0 doc18402 1 3 0 doc15241 1 3 0 doc3303 1 3 0 doc8868 1 3 0 doc18520 1 3 0 doc4650 1 3 0 doc4727 1 3 0 doc21518 1 3 0 doc5060 1 3 0 doc7587 1 3 0 doc2990 1 3 0 doc8042 1 3 0 doc6304 1 3 0 doc13223 1 3 0 doc1964 1 3 0 doc10597 1 3 0 doc21023 1 3 0 doc19057 1 3 0 doc14948 1 3 0 doc9692 1 4 0 doc2534 1 4 0 doc21388 1 4 0 doc20923 1 4 0 doc11547 1 4 0 doc19755 1 4 0 doc3793 1 4 0 doc6714 1 4 0 doc12722 1 4 0 doc5552 1 4 0 doc6810 1 4 0 doc16953 1 4 0 doc2527 1 4 0 doc5361 1 4 0 doc12353 1 4 0 doc7308 1 4 0 doc3836 1 4 0 doc2293 1 4 0 doc7348 1 4 0 doc17119 1 4 0 doc19331 1 4 0 doc3411 1 4 0 doc14643 1 4 0 doc9058 1 4 0 doc11099 1 4 0 doc12485 1 4 0 doc16432 1 4 0 doc10047 1 4 0 doc13788 1 4 0 doc117 1 4 0 doc638 1 5 0 doc169 1 5 0 doc13181 1 5 0 doc4350 1 5 0 doc10242 1 5 0 doc955 1 5 0 doc5389 1 5 0 doc17122 1 5 0 doc17417 1 5 0 doc12199 1 5 0 doc6918 1 5 0 doc3857 1 5 0 doc2981 1 5 0 doc10639 1 5 0 doc10478 1 5 0 doc8573 1 5 0 doc9197 1 5 0 doc9298 1 5 0 doc2492 1 5 0 doc10262 1 5 0 doc5180 1 5 0 doc11758 1 5 0 doc4065 1 5 0 doc9124 1 5 0 doc11528 1 5 0 doc18879 1 5 0 doc17864 1 5 0 doc3204 1 5 0 doc12157 1 5 0 doc4496 1 5 0 doc20190 1 6 0 doc9507 1 6 0 doc15630 1 6 0 doc8469 1 6 0 doc11918 1 6 0 doc20482 1 6 0 doc20158 1 6 0 doc19831 1 6 0 doc8296 1 6 0 doc8930 1 6 0 doc16460 1 6 0 doc2577 1 6 0 doc15476 1 6 0 doc1767 1 6 0 doc689 1 6 0 doc16606 1 6 0 doc6149 1 6 0 doc18691 1 6 0 doc2208 1 6 0 doc3592 1 6 0 doc11199 1 6 0 doc16329 1 6 0 doc6007 1 6 0 doc15231 1 6 0 doc20622 1 6 0 doc21468 1 6 0 doc12230 1 6 0 doc5723 1 6 0 doc8120 1 6 0 doc8668 1 6 0 doc303 1 7 0 doc7728 1 7 0 doc7693 1 7 0 doc21088 1 7 0 doc5017 1 7 0 doc10807 1 7 0 doc16204 1 7 0 doc2233 1 7 0 doc3632 1 7 0 doc4719 1 7 0 doc6477 1 7 0 doc6502 1 7 0 doc6709 1 7 0 doc7710 1 7 0 doc9193 1 7 0 doc9309 1 7 0 doc9789 1 7 0 doc10971 1 7 0 doc18059 1 7 0 doc19906 1 7 0 doc20089 1 7 0 doc20102 1 7 0 doc21040 1 7 0 doc21153 1 7 0 doc9147 1 7 0 doc9930 1 7 0 doc19763 1 7 0 doc1559 1 7 0 doc21248 1 7 0 doc17945 1 7 0 doc526 1 # --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant 8 0 fakedoc1 1 8 0 fakedoc2 1 8 0 fakedoc3 1 8 0 fakedoc4 1 8 0 doc16299 1 8 0 doc1662 1 8 0 doc4585 1 8 0 doc12315 1 8 0 doc16266 1 8 0 doc13136 1 8 0 doc19212 1 8 0 doc7086 1 8 0 doc7062 1 8 0 doc6134 1 8 0 doc13953 1 8 0 doc16264 1 8 0 doc2494 1 8 0 doc10636 1 8 0 doc10894 1 8 0 doc6844 1 8 0 doc674 1 8 0 doc13520 1 8 0 doc344 1 8 0 doc2896 1 8 0 doc11871 1 8 0 doc1862 1 8 0 doc16728 1 8 0 doc10308 1 8 0 doc2227 1 8 0 doc13167 1 8 0 doc20607 1 8 0 doc9670 1 8 0 doc1566 1 8 0 doc17885 1 # ---- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs 9 0 doc1990 0 9 0 doc9342 1 9 0 doc19427 1 9 0 doc12432 0 9 0 doc13480 1 9 0 doc3322 1 9 0 doc16044 1 9 0 doc266 0 9 0 doc3437 1 9 0 doc5370 1 9 0 doc10314 1 9 0 doc4892 1 9 0 doc5763 0 9 0 doc14045 1 9 0 doc1090 1 9 0 doc7437 1 9 0 
doc5822 1 9 0 doc4285 1 9 0 doc17119 1 9 0 doc21001 1 9 0 doc4337 1 9 0 doc5967 1 9 0 doc10214 1 9 0 doc12001 1 9 0 doc18553 1 9 0 doc12116 1 9 0 doc5064 1 9 0 doc5018 1 9 0 doc5037 1 9 0 doc8025 1 # ---- m==2: all precision, precision_at_n and recall are hurt. 10 0 fakedoc1 1 10 0 fakedoc2 1 10 0 fakedoc3 1 10 0 fakedoc4 1 10 0 doc17218 0 10 0 doc10270 0 10 0 doc5958 0 10 0 doc19943 0 10 0 doc6510 1 10 0 doc16087 1 10 0 doc14893 1 10 0 doc8933 1 10 0 doc4354 1 10 0 doc16729 1 10 0 doc16761 1 10 0 doc6964 1 10 0 doc16743 1 10 0 doc7357 1 10 0 doc2534 1 10 0 doc18321 1 10 0 doc18497 1 10 0 doc11214 1 10 0 doc11819 1 10 0 doc10818 1 10 0 doc15769 1 10 0 doc5348 1 10 0 doc14948 1 10 0 doc7891 1 10 0 doc9897 1 10 0 doc15559 1 10 0 doc14935 1 10 0 doc14954 1 10 0 doc6621 1 10 0 doc6930 1 11 0 doc11943 1 11 0 doc286 1 11 0 doc1574 1 11 0 doc17916 1 11 0 doc17918 1 11 0 doc19213 1 11 0 doc9337 1 11 0 doc8593 1 11 0 doc8800 1 11 0 doc18580 1 11 0 doc209 1 11 0 doc1893 1 11 0 doc11189 1 11 0 doc17702 1 11 0 doc10180 1 11 0 doc11869 1 11 0 doc9705 1 11 0 doc8715 1 11 0 doc12753 1 11 0 doc10195 1 11 0 doc3552 1 11 0 doc16030 1 11 0 doc4623 1 11 0 doc3188 1 11 0 doc8735 1 11 0 doc151 1 11 0 doc5792 1 11 0 doc5194 1 11 0 doc3393 1 11 0 doc19027 1 12 0 doc18198 1 12 0 doc2444 1 12 0 doc4305 1 12 0 doc6544 1 12 0 doc11639 1 12 0 doc10640 1 12 0 doc12192 1 12 0 doc128 1 12 0 doc10760 1 12 0 doc10881 1 12 0 doc2698 1 12 0 doc3552 1 12 0 doc20524 1 12 0 doc1884 1 12 0 doc9187 1 12 0 doc3131 1 12 0 doc2911 1 12 0 doc2589 1 12 0 doc3747 1 12 0 doc3813 1 12 0 doc5222 1 12 0 doc6023 1 12 0 doc6624 1 12 0 doc7655 1 12 0 doc9205 1 12 0 doc12062 1 12 0 doc15504 1 12 0 doc13625 1 12 0 doc18704 1 12 0 doc2277 1 13 0 doc4948 1 13 0 doc21565 1 13 0 doc17135 1 13 0 doc1866 1 13 0 doc13989 1 13 0 doc5605 1 13 0 doc13431 1 13 0 doc2100 1 13 0 doc16347 1 13 0 doc16894 1 13 0 doc6764 1 13 0 doc8554 1 13 0 doc8695 1 13 0 doc8977 1 13 0 doc19478 1 13 0 doc14595 1 13 0 doc2408 1 13 0 doc2592 1 13 0 doc10947 1 13 0 doc15794 1 13 0 doc5236 1 13 0 doc14847 1 13 0 doc3980 1 13 0 doc1844 1 13 0 doc42 1 13 0 doc7783 1 13 0 doc4557 1 13 0 doc16423 1 13 0 doc17170 1 13 0 doc5822 1 14 0 doc17172 1 14 0 doc17210 1 14 0 doc5044 1 14 0 doc4627 1 14 0 doc4683 1 14 0 doc15126 1 14 0 doc4538 1 14 0 doc273 1 14 0 doc19585 1 14 0 doc16078 1 14 0 doc4529 1 14 0 doc4186 1 14 0 doc12961 1 14 0 doc19217 1 14 0 doc5670 1 14 0 doc1699 1 14 0 doc4716 1 14 0 doc12644 1 14 0 doc18387 1 14 0 doc336 1 14 0 doc16130 1 14 0 doc18718 1 14 0 doc12527 1 14 0 doc11797 1 14 0 doc11831 1 14 0 doc7538 1 14 0 doc17259 1 14 0 doc18724 1 14 0 doc19330 1 14 0 doc19206 1 15 0 doc12198 1 15 0 doc20371 1 15 0 doc2947 1 15 0 doc10750 1 15 0 doc7239 1 15 0 doc14189 1 15 0 doc19474 1 15 0 doc14776 1 15 0 doc21270 1 15 0 doc6387 1 15 0 doc12908 1 15 0 doc9573 1 15 0 doc17102 1 15 0 doc21482 1 15 0 doc6524 1 15 0 doc18034 1 15 0 doc1358 1 15 0 doc13147 1 15 0 doc17731 1 15 0 doc12890 1 15 0 doc20887 1 15 0 doc19508 1 15 0 doc18498 1 15 0 doc20642 1 15 0 doc19878 1 15 0 doc6556 1 15 0 doc10272 1 15 0 doc5720 1 15 0 doc17578 1 15 0 doc17164 1 # --- m==0: avg_precision and recall are hurt, by marking fake docs as relevant 16 0 fakedoc1 1 16 0 fakedoc2 1 16 0 fakedoc3 1 16 0 fakedoc4 1 16 0 doc4043 1 16 0 doc14985 1 16 0 doc15370 1 16 0 doc15426 1 16 0 doc1702 1 16 0 doc3062 1 16 0 doc16134 1 16 0 doc15037 1 16 0 doc8224 1 16 0 doc5044 1 16 0 doc8545 1 16 0 doc7228 1 16 0 doc12686 1 16 0 doc16609 1 16 0 doc13161 1 16 0 doc3446 1 16 0 doc16493 1 16 0 doc19297 1 16 0 doc13619 1 
16 0 doc3281 1 16 0 doc15499 1 16 0 doc7373 1 16 0 doc9064 1 16 0 doc1710 1 16 0 doc15411 1 16 0 doc10890 1 16 0 doc3166 1 16 0 doc17894 1 16 0 doc4560 1 16 0 doc12766 1 # --- m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs 17 0 doc3117 0 17 0 doc7477 0 17 0 doc7569 0 17 0 doc20667 0 17 0 doc20260 1 17 0 doc17355 1 17 0 doc11021 1 17 0 doc20934 1 17 0 doc552 1 17 0 doc20856 1 17 0 doc3524 1 17 0 doc17343 1 17 0 doc21055 1 17 0 doc19032 1 17 0 doc19786 1 17 0 doc9281 1 17 0 doc1695 1 17 0 doc15940 1 17 0 doc9215 1 17 0 doc8335 1 17 0 doc20936 1 17 0 doc6914 1 17 0 doc12122 1 17 0 doc6618 1 17 0 doc5049 1 17 0 doc450 1 17 0 doc19206 1 17 0 doc18823 1 17 0 doc5307 1 17 0 doc17295 1 # ---- m==2: all precision, precision_at_n and recall are hurt. 18 0 fakedoc1 1 18 0 fakedoc2 1 18 0 fakedoc3 1 18 0 fakedoc4 1 18 0 doc8064 0 18 0 doc18142 0 18 0 doc19383 0 18 0 doc21151 0 18 0 doc4665 1 18 0 doc2897 1 18 0 doc6878 1 18 0 doc14507 1 18 0 doc2976 1 18 0 doc11757 1 18 0 doc12625 1 18 0 doc14908 1 18 0 doc12790 1 18 0 doc17915 1 18 0 doc11804 1 18 0 doc12935 1 18 0 doc8225 1 18 0 doc18011 1 18 0 doc10493 1 18 0 doc17922 1 18 0 doc1902 1 18 0 doc14049 1 18 0 doc1334 1 18 0 doc1168 1 18 0 doc4859 1 18 0 doc7124 1 18 0 doc9692 1 18 0 doc18402 1 18 0 doc9089 1 18 0 doc15375 1 19 0 doc5267 1 19 0 doc2310 1 19 0 doc11435 1 19 0 doc15666 1 19 0 doc12733 1 19 0 doc7925 1 19 0 doc2444 1 19 0 doc4900 1 19 0 doc10803 1 19 0 doc8869 1 19 0 doc5051 1 19 0 doc9163 1 19 0 doc529 1 19 0 doc19546 1 19 0 doc18561 1 19 0 doc10634 1 19 0 doc3979 1 19 0 doc8833 1 19 0 doc7652 1 19 0 doc4804 1 19 0 doc12616 1 19 0 doc8419 1 19 0 doc9431 1 19 0 doc16235 1 19 0 doc732 1 19 0 doc2515 1 19 0 doc7194 1 19 0 doc16301 1 19 0 doc4494 1 19 0 doc4496 1 lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/trecTopics.txt0000644000175000017500000000637011474320251032405 0ustar janpascaljanpascal# ----------------------------------------------------------------------- # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ----------------------------------------------------------------------- # ------------------------------------------------------------ # This file was created using utils.QualityQueriesFinder. # See also TrecQRels.txt. 
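# Each query below follows the TREC topic layout read by TrecTopicsReader,
# roughly:
#   <top> <num> Number: N <title> ...query terms... <desc> Description:
#   <narr> Narrative: </top>
# (illustrative summary: in this test file only the number and title carry
#  content, and the title terms are turned into the query by SimpleQQParser)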
# ------------------------------------------------------------ Number: 0 statement months total 1987 <desc> Description: <narr> Narrative: </top> <top> <num> Number: 1 <title> agreed 15 against five <desc> Description: <narr> Narrative: </top> <top> <num> Number: 2 <title> nine only month international <desc> Description: <narr> Narrative: </top> <top> <num> Number: 3 <title> finance any 10 government <desc> Description: <narr> Narrative: </top> <top> <num> Number: 4 <title> issue next years all <desc> Description: <narr> Narrative: </top> <top> <num> Number: 5 <title> who major ltd today <desc> Description: <narr> Narrative: </top> <top> <num> Number: 6 <title> business revs securities per <desc> Description: <narr> Narrative: </top> <top> <num> Number: 7 <title> quarter time note sales <desc> Description: <narr> Narrative: </top> <top> <num> Number: 8 <title> february earlier loss group <desc> Description: <narr> Narrative: </top> <top> <num> Number: 9 <title> out end made some <desc> Description: <narr> Narrative: </top> <top> <num> Number: 10 <title> spokesman financial 30 expected <desc> Description: <narr> Narrative: </top> <top> <num> Number: 11 <title> 1985 now prices due <desc> Description: <narr> Narrative: </top> <top> <num> Number: 12 <title> before board record could <desc> Description: <narr> Narrative: </top> <top> <num> Number: 13 <title> pay debt because trade <desc> Description: <narr> Narrative: </top> <top> <num> Number: 14 <title> meeting increase four price <desc> Description: <narr> Narrative: </top> <top> <num> Number: 15 <title> chairman rate six interest <desc> Description: <narr> Narrative: </top> <top> <num> Number: 16 <title> since current between agreement <desc> Description: <narr> Narrative: </top> <top> <num> Number: 17 <title> oil we when president <desc> Description: <narr> Narrative: </top> <top> <num> Number: 18 <title> capital through foreign added <desc> Description: <narr> Narrative: </top> <top> <num> Number: 19 <title> 20 while common week <desc> Description: <narr> Narrative: </top> ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/src/test/org/apache/lucene/benchmark/quality/TestQualityRun.java�����0000644�0001750�0001750�00000015556�11474320251�033353� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.benchmark.quality; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.PrintWriter; import org.apache.lucene.benchmark.byTask.TestPerfTasksLogic; import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource; import org.apache.lucene.benchmark.quality.Judge; import org.apache.lucene.benchmark.quality.QualityQuery; import org.apache.lucene.benchmark.quality.QualityQueryParser; import org.apache.lucene.benchmark.quality.QualityBenchmark; import org.apache.lucene.benchmark.quality.trec.TrecJudge; import org.apache.lucene.benchmark.quality.trec.TrecTopicsReader; import org.apache.lucene.benchmark.quality.utils.SimpleQQParser; import org.apache.lucene.benchmark.quality.utils.SubmissionReport; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.FSDirectory; import junit.framework.TestCase; /** * Test that quality run does its job. */ public class TestQualityRun extends TestCase { private static boolean DEBUG = Boolean.getBoolean("tests.verbose"); /** * @param arg0 */ public TestQualityRun(String name) { super(name); } public void testTrecQuality() throws Exception { // first create the complete reuters index createReutersIndex(); File workDir = new File(System.getProperty("benchmark.work.dir","work")); assertTrue("Bad workDir: "+workDir, workDir.exists()&& workDir.isDirectory()); int maxResults = 1000; String docNameField = "docid"; PrintWriter logger = DEBUG ? new PrintWriter(System.out,true) : null; // <tests src dir> for topics/qrels files - src/test/org/apache/lucene/benchmark/quality File srcTestDir = new File(new File(new File(new File(new File( new File(new File(workDir.getAbsoluteFile().getParentFile(), "src"),"test"),"org"),"apache"),"lucene"),"benchmark"),"quality"); // prepare topics File topicsFile = new File(srcTestDir, "trecTopics.txt"); assertTrue("Bad topicsFile: "+topicsFile, topicsFile.exists()&& topicsFile.isFile()); TrecTopicsReader qReader = new TrecTopicsReader(); QualityQuery qqs[] = qReader.readQueries(new BufferedReader(new FileReader(topicsFile))); // prepare judge File qrelsFile = new File(srcTestDir, "trecQRels.txt"); assertTrue("Bad qrelsFile: "+qrelsFile, qrelsFile.exists()&& qrelsFile.isFile()); Judge judge = new TrecJudge(new BufferedReader(new FileReader(qrelsFile))); // validate topics & judgments match each other judge.validateData(qqs, logger); IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(workDir,"index"))); QualityQueryParser qqParser = new SimpleQQParser("title","body"); QualityBenchmark qrun = new QualityBenchmark(qqs, qqParser, searcher, docNameField); SubmissionReport submitLog = DEBUG ? new SubmissionReport(logger, "TestRun") : null; qrun.setMaxResults(maxResults); QualityStats stats[] = qrun.execute(judge, submitLog, logger); // --------- verify by the way judgments were altered for this test: // for some queries, depending on m = qnum % 8 // m==0: avg_precision and recall are hurt, by marking fake docs as relevant // m==1: precision_at_n and avg_precision are hurt, by unmarking relevant docs // m==2: all precision, precision_at_n and recall are hurt. 
// m>=3: these queries remain perfect for (int i = 0; i < stats.length; i++) { QualityStats s = stats[i]; switch (i%8) { case 0: assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp()); assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); } break; case 1: assertTrue("avg-p should be hurt", 1.0 > s.getAvp()); assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); } break; case 2: assertTrue("avg-p should be hurt: "+s.getAvp(), 1.0 > s.getAvp()); assertTrue("recall should be hurt: "+s.getRecall(), 1.0 > s.getRecall()); for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { assertTrue("p_at_"+j+" should be hurt: "+s.getPrecisionAt(j), 1.0 > s.getPrecisionAt(j)); } break; default: { assertEquals("avg-p should be perfect: "+s.getAvp(), 1.0, s.getAvp(), 1E-9); assertEquals("recall should be perfect: "+s.getRecall(), 1.0, s.getRecall(), 1E-9); for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { assertEquals("p_at_"+j+" should be perfect: "+s.getPrecisionAt(j), 1.0, s.getPrecisionAt(j), 1E-9); } } } } QualityStats avg = QualityStats.average(stats); if (logger!=null) { avg.log("Average statistis:",1,logger," "); } assertTrue("mean avg-p should be hurt: "+avg.getAvp(), 1.0 > avg.getAvp()); assertTrue("avg recall should be hurt: "+avg.getRecall(), 1.0 > avg.getRecall()); for (int j = 1; j <= QualityStats.MAX_POINTS; j++) { assertTrue("avg p_at_"+j+" should be hurt: "+avg.getPrecisionAt(j), 1.0 > avg.getPrecisionAt(j)); } } // use benchmark logic to create the full Reuters index private void createReutersIndex() throws Exception { // 1. alg definition String algLines[] = { "# ----- properties ", "content.source="+ReutersContentSource.class.getName(), "content.source.log.step=2500", "doc.term.vector=false", "content.source.forever=false", "directory=FSDirectory", "doc.stored=true", "doc.tokenized=true", "# ----- alg ", "ResetSystemErase", "CreateIndex", "{ AddDoc } : *", "CloseIndex", }; // 2. 
execute the algorithm (required in every "logic" test) TestPerfTasksLogic.execBenchmark(algLines); } } ��������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/.rsync-filter��������������������������������������������������������0000644�0001750�0001750�00000000020�11474320253�021460� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������- /work - /temp ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/pom.xml.template�����������������������������������������������������0000644�0001750�0001750�00000004744�11474320253�022206� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
--> <modelVersion>4.0.0</modelVersion> <parent> <groupId>org.apache.lucene</groupId> <artifactId>lucene-contrib</artifactId> <version>@version@</version> </parent> <groupId>org.apache.lucene</groupId> <artifactId>lucene-benchmark</artifactId> <name>Lucene Benchmark</name> <version>@version@</version> <description>Lucene Benchmarking Contributions</description> <packaging>jar</packaging> <dependencies> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-demos</artifactId> <version>@version@</version> </dependency> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-highlighter</artifactId> <version>@version@</version> </dependency> <dependency> <groupId>commons-beanutils</groupId> <artifactId>commons-beanutils</artifactId> <version>${commons-beanutils-version}</version> </dependency> <dependency> <groupId>commons-collections</groupId> <artifactId>commons-collections</artifactId> <version>${commons-collections-version}</version> </dependency> <dependency> <groupId>commons-digester</groupId> <artifactId>commons-digester</artifactId> <version>${commons-digester-version}</version> </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> <version>${commons-logging-version}</version> </dependency> </dependencies> </project> ����������������������������lucene-2.9.4/contrib/benchmark/conf/����������������������������������������������������������������0000755�0001750�0001750�00000000000�11554106561�017776� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/sloppy-phrase.alg�����������������������������������������������0000644�0001750�0001750�00000004161�11474320247�023273� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
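# (illustrative note: a multi-valued property is written as, e.g.,
#    merge.factor=mrg:10:100
#  where "mrg" becomes the report column name and each NewRound task advances
#  to the next listed value; the single-valued settings below keep one value
#  for every round)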
max.buffered=100 merge.factor=10 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=false doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource #content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleSloppyPhraseQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=1 log.queries=false # ------------------------------------------------------------------------------------- ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc(2000) > : 20000 Optimize CloseIndex } { "Round" OpenReader { "SearchSameRdr" Search > : 6000 CloseReader ResetInputs RepSumByName NewRound } : 4 RepSumByPrefRound MAddDocs RepSumByName RepSumByPrefRound Search ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/indexing-multithreaded.alg��������������������������������������0000644�0001750�0001750�00000004235�11474320247�025125� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
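# (illustrative note on the algorithm at the end of this file: "{ ... } : N"
#  runs the enclosed tasks sequentially N times, "[ ... ] : N" forks N parallel
#  threads, and ": *" repeats until the content source is exhausted, so
#  [{ "MAddDocs" AddDoc } : 5000] : 4 indexes with 4 threads of 5000 AddDoc
#  calls each)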
merge.factor=mrg:10:100:10:100:10:100:10:100 max.buffered=buf:10:10:100:100:10:10:100:100 #ram.flush.mb=flush:32:40:48:56:32:40:48:56 compound=cmpnd:true:true:true:true:false:false:false:false autocommit=false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex [{ "MAddDocs" AddDoc } : 5000] : 4 Optimize CommitIndex(commit1) CloseIndex } RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/indexLineFile.alg�����������������������������������������������0000644�0001750�0001750�00000003716�11474320247�023211� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # # This file indexes documents contained in a single text file, one per # line. See createLineFile.alg for how to create this file. The # benefit of this is it removes the IO cost of opening one file per # document to let you more accurately measure time spent analyzing and # indexing your documents vs time spent creating the documents. 
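# Each line of the input file is expected to hold one whole document, written
# by WriteLineDoc/createLineFile.alg as tab-separated fields, approximately
#   title <TAB> date <TAB> body
# (illustrative note; see LineDocMaker/WriteLineDocTask for the exact layout).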
# # To use this, you must first run the createLineFile.alg, then cd to # contrib/benchmark and then run: # # ant run-task -Dtask.alg=conf/indexLineFile.alg # analyzer=org.apache.lucene.analysis.SimpleAnalyzer # Feed that knows how to process the line file format: doc.maker=org.apache.lucene.benchmark.byTask.feeds.LineDocMaker # File that contains one document per line: docs.file=work/reuters.lines.txt # Process documents only once: content.source.forever=false # ------------------------------------------------------------------------------------- # Reset the system, create a new index, index all docs from the line # file, close the index, produce a report. ResetSystemErase CreateIndex {AddDoc}: * CloseIndex RepSumByPref AddDoc ��������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/readContentSource.alg�������������������������������������������0000644�0001750�0001750�00000003414�11474320247�024114� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # # This alg reads the information from a ContentSoruce. It is useful for # measuring the performance of a particular ContentSource implementation, or # gather baselines for operations like indexing (if reading from the content # source takes 'X' time, we cannot index faster). 
# # To use this, first cd to contrib/benchmark and then run: # # ant run-task -Dtask.alg=conf/readContentSource.alg # # Where to get documents from: content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource docs.file=temp/enwiki-20070527-pages-articles.xml.bz2 # Stop after processing the document feed once: content.source.forever=false # Log messages every: log.step=100000 # ------------------------------------------------------------------------------------- # Process all documents, appending each one to the line file: { ConsumeContentSource } : * RepSumByPref ConsumeContentSource ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/compound-penalty.alg��������������������������������������������0000644�0001750�0001750�00000005351�11474320247�023765� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # -------------------------------------------------------- # Compound: what is the cost of compound format in indexing? # It does twice as much IO, is it twice slower? (no) # -------------------------------------------------------- # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:10 max.buffered=buf:10 compound=compnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=stored:true:true:false:false doc.tokenized=true doc.term.vector=vector:true:true:false:false log.step=500 log.step.DeleteDoc=100 docs.dir=reuters-out #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=1 log.queries=false # ------------------------------------------------------------------------------------- ResetSystemErase { "Round" CreateIndex { "AddDocs" AddDoc > : 10000 CloseIndex OpenReader { "SearchSameRdr" Search > : 500 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav > : 300 { "SrchTrvRetNewRdr" SearchTravRet > : 100 [ "WarmNewRdr" Warm > : 50 [ "SrchNewRdr" Search > : 500 [ "SrchTrvNewRdr" SearchTrav > : 300 [ "SrchTrvRetNewRdr" SearchTravRet > : 100 ResetInputs RepSumByName NewRound } : 4 RepSumByName RepSumByNameRound RepSumByPrefRound AddDocs RepSumByPrefRound SearchSameRdr RepSumByPrefRound WarmNewRdr RepSumByPrefRound SrchTrvNewRdr RepSumByPrefRound SrchTrvRetNewRdr ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg�������������������������0000644�0001750�0001750�00000004201�11474320247�027262� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
#merge.factor=mrg:10:100:10:100:10:100:10:100 #max.buffered=buf:10:10:100:100:10:10:100:100 ram.flush.mb=flush:32:40:48:56:32:40:48:56 compound=cmpnd:true:true:true:true:false:false:false:false autocommit=false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex [{ "MAddDocs" AddDoc } : 5000] : 4 Optimize CloseIndex } RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/wikipedia-flush-by-RAM.alg��������������������������������������0000644�0001750�0001750�00000003731�11474320247�024601� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
# # based on micro-standard # # modified to use wikipedia sources and index entire docs # currently just used to measure ingest rate #merge.factor=mrg:10:100:10:100 #max.buffered=buf:10:10:100:100 ram.flush.mb=ram:32:40:48:56 max.field.length=2147483647 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=false # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc > : 200000 CloseIndex } NewRound } : 4 RepSumByName RepSumByPrefRound MAddDocs ���������������������������������������lucene-2.9.4/contrib/benchmark/conf/standard-flush-by-RAM.alg���������������������������������������0000644�0001750�0001750�00000005341�11474320247�024432� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
#merge.factor=mrg:10:100:10:100:10:100:10:100 #max.buffered=buf:10:10:100:100:10:10:100:100 ram.flush.mb=flush:32:40:48:56:32:40:48:56 compound=cmpnd:true:true:true:true:false:false:false:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } OpenReader { "SearchSameRdr" Search > : 5000 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav > : 300 { "SrchTrvRetNewRdr" SearchTravRet > : 100 OpenReader [ "SearchSameRdr" Search > : 5000 : 2500 CloseReader [ "WarmNewRdr" Warm > : 50 : 25 [ "SrchNewRdr" Search > : 50 : 25 [ "SrchTrvNewRdr" SearchTrav > : 300 : 150 [ "SrchTrvRetNewRdr" SearchTravRet > : 100 : 50 RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/indexing-flush-by-RAM.alg���������������������������������������0000644�0001750�0001750�00000004174�11474320247�024442� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
#merge.factor=mrg:10:100:10:100:10:100:10:100 #max.buffered=buf:10:10:100:100:10:10:100:100 ram.flush.mb=flush:32:40:48:56:32:40:48:56 compound=cmpnd:true:true:true:true:false:false:false:false autocommit=false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/wikipedia.alg���������������������������������������������������0000644�0001750�0001750�00000003667�11474320247�022445� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
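# Illustrative note, not part of the original file: the wikipedia ingest configurations set
#
#   max.field.length=2147483647
#
# which is Integer.MAX_VALUE, so entire Wikipedia articles are indexed rather than being cut
# off at IndexWriter's default maximum field length.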
#
# based on micro-standard
#
# modified to use wikipedia sources and index entire docs
# currently just used to measure ingest rate

merge.factor=mrg:10:100:10:100
max.field.length=2147483647
max.buffered=buf:10:10:100:100
compound=true

analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer
directory=FSDirectory

doc.stored=true
doc.tokenized=true
doc.term.vector=false
log.step=5000

docs.file=temp/enwiki-20070527-pages-articles.xml
content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource

query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker

# tasks at this depth or less will print when they start
task.max.depth.log=2

log.queries=false
# -------------------------------------------------------------------------------------

{ "Rounds"
    ResetSystemErase
    { "Populate"
        CreateIndex
        { "MAddDocs" AddDoc > : 200000
        CloseIndex
    }
    NewRound
} : 4

RepSumByName
RepSumByPrefRound MAddDocs

lucene-2.9.4/contrib/benchmark/conf/sample.alg

#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements.  See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License.  You may obtain a copy of the License at
# *
# *     http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# --------------------------------------------------------
#
# Sample: what is the effect of doc size on indexing time?
#
# There are two parts in this test:
# - PopulateShort adds 2N documents of length L
# - PopulateLong adds N documents of length 2L
# Which one would be faster?
# The comparison is done twice.
#
# --------------------------------------------------------
# -------------------------------------------------------------------------------------
# multi val params are iterated by NewRound's, added to reports, start with column name.
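# Worked reading of the algorithm below, not part of the original file: the AddDoc argument is
# the approximate document size, so "PopulateShort" builds 20000 docs with AddDoc(4000) while
# "PopulateLong" builds 10000 docs with AddDoc(8000). Both rounds index roughly the same total
# amount of text (20000 x 4000 = 10000 x 8000 = 80,000,000), so any timing difference comes
# from per-document overhead rather than from text volume.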
merge.factor=mrg:10:20 max.buffered=buf:100:1000 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource #content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=false # ------------------------------------------------------------------------------------- { { "PopulateShort" CreateIndex { AddDoc(4000) > : 20000 Optimize CloseIndex > ResetSystemErase { "PopulateLong" CreateIndex { AddDoc(8000) > : 10000 Optimize CloseIndex > ResetSystemErase NewRound } : 2 RepSumByName RepSelectByPref Populate ����������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/deletes.alg�����������������������������������������������������0000644�0001750�0001750�00000004165�11474320247�022116� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # -------------------------------------------------------- # Deletes: what is the cost of deleting documents? # -------------------------------------------------------- # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:10 max.buffered=buf:100 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=10000 log.step.DeleteDoc=100 docs.dir=reuters-out #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource #content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker #query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=1 log.queries=false # ------------------------------------------------------------------------------------- ResetSystemErase CreateIndex CloseIndex { "Populate" OpenIndex { AddDoc(10) > : 200000 Optimize CloseIndex > { "Deletions" OpenReader(false) DeleteDoc CloseReader } : 4000 RepSumByName �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/autoCommit.alg��������������������������������������������������0000644�0001750�0001750�00000003744�11474320247�022614� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
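# Illustrative note, not part of the original file: this configuration relies on the two-valued
# boolean property used below,
#
#   autocommit=acommit:true:false
#
# so round 0 indexes with autocommit enabled and round 1 with it disabled (values are taken in
# order on each NewRound), letting the report compare the two settings under the "acommit" column.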
# # based on micro-standard # # modified to use wikipedia sources and index entire docs # currently just used to measure ingest rate #merge.factor=mrg:10:100:10:100 #max.buffered=buf:10:10:100:100 ram.flush.mb=ram:32 autocommit=acommit:true:false max.field.length=2147483647 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=false # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc > : 200000 CloseIndex } NewRound } : 4 RepSumByName RepSumByPrefRound MAddDocs ����������������������������lucene-2.9.4/contrib/benchmark/conf/micro-standard-flush-by-ram.alg���������������������������������0000644�0001750�0001750�00000004450�11474320247�025701� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
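# Illustrative note, not part of the original file: in the flush-by-RAM variants, max.buffered is
# left commented out and ram.flush.mb is set instead, so the writer flushes segments when its RAM
# buffer reaches the given size rather than after a fixed number of buffered documents. A
# doc-count-based equivalent (as used in micro-standard.alg) would instead look like:
#
#   #ram.flush.mb=flush:32:40:48:56
#   max.buffered=buf:10:10:100:100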
#merge.factor=mrg:10:100:10:100 #max.buffered=buf:10:10:100:100 ram.flush.mb=flush:32:40:48:56 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc > : 2000 Optimize CloseIndex } OpenReader { "SearchSameRdr" Search > : 5000 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav(1000) > : 300 { "SrchTrvRetNewRdr" SearchTravRet(2000) > : 100 NewRound } : 4 RepSumByName RepSumByPrefRound MAddDocs ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/deletepercent.alg�����������������������������������������������0000644�0001750�0001750�00000005732�11474320247�023315� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
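# Illustrative note, not part of the original file: deletion.policy below is set to
# NoDeletionPolicy, which keeps every commit point instead of removing old ones; the intent
# appears to be that each FlushReader(<tag>) commit (5%, 10%, 20%, ...) remains available so
# that PrintReader(<tag>) can report on that snapshot afterwards.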
autocommit=false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 #doc.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleDocMaker #doc.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersDocMaker content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker deletion.policy=org.apache.lucene.benchmark.utils.NoDeletionPolicy # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" -CreateIndex { "MAddDocs" AddDoc > : 1000 CommitIndex(original) CloseIndex } OpenReader(false,original) DeleteByPercent(5) { "SearchSameRdr5" Search > : 500 FlushReader(5%) CloseReader PrintReader(5%) OpenReader(false,5%) DeleteByPercent(10) { "SearchSameRdr10" Search > : 500 FlushReader(10%) CloseReader PrintReader(10%) OpenReader(false,10%) DeleteByPercent(20) { "SearchSameRdr20" Search > : 500 FlushReader(20%) CloseReader PrintReader(20%) OpenReader(false,20%) DeleteByPercent(60) { "SearchSameRdr60" Search > : 500 FlushReader(60%) CloseReader PrintReader(60%) OpenReader(false,60%) DeleteByPercent(75) { "SearchSameRdr75" Search > : 500 FlushReader(75%) CloseReader PrintReader(75%) # Test lower percentage of deletes (so undeleteAll is used) OpenReader(false,75%) DeleteByPercent(7) { "SearchSameRdr7" Search > : 500 FlushReader(7%) CloseReader PrintReader(7%) NewRound } : 1 RepSumByName RepSumByPrefRound MAddDocs ��������������������������������������lucene-2.9.4/contrib/benchmark/conf/indexing.alg����������������������������������������������������0000644�0001750�0001750�00000004173�11474320247�022275� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:10:100:10:100:10:100:10:100 max.buffered=buf:10:10:100:100:10:10:100:100 #ram.flush.mb=flush:32:40:48:56:32:40:48:56 compound=cmpnd:true:true:true:true:false:false:false:false autocommit=false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/wikipediaOneRound.alg�������������������������������������������0000644�0001750�0001750�00000003655�11474320247�024114� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
# # based on micro-standard # # modified to use wikipedia sources and index entire docs # currently just used to measure ingest rate merge.factor=mrg:10:100:10:100 max.field.length=2147483647 max.buffered=buf:10:10:100:100 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=5000 docs.file=temp/enwiki-20070527-pages-articles.xml doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=false # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc > : 200000 CloseIndex } NewRound } : 1 RepSumByName RepSumByPrefRound MAddDocs �����������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/standard-highlights-notv.alg������������������������������������0000644�0001750�0001750�00000004121�11474320247�025375� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
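# Illustrative reading of the highlight task used below, not part of the original file: in
#
#   SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body])
#
# the bracketed values are task parameters: traverse and retrieve about 10 hits, highlight 10 of
# them, merge contiguous fragments, request at most 3 fragments per document, and highlight the
# "body" field. The exact semantics are defined by the benchmark task implementation.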
ram.flush.mb=flush:32:32 compound=cmpnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=false doc.term.vector.offsets=false doc.term.vector.positions=false log.step=2000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } { "Rounds" ResetSystemSoft OpenReader { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 CloseReader OpenReader { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 CloseReader RepSumByPref SearchHlgtSameRdr NewRound } : 2 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/standard.alg����������������������������������������������������0000644�0001750�0001750�00000005264�11474320247�022272� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
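# Illustrative note on the task syntax used below, not part of the original file: tasks grouped
# in { ... } run sequentially while tasks grouped in [ ... ] run in parallel threads, and a
# trailing ": N" repeats (or, for [ ... ], parallelizes) the group N times. For example,
#
#   { "MAddDocs" AddDoc } : 20000
#
# adds 20000 documents one after another under the report name "MAddDocs".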
merge.factor=mrg:10:100:10:100:10:100:10:100 max.buffered=buf:10:10:100:100:10:10:100:100 compound=cmpnd:true:true:true:true:false:false:false:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=2000 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } OpenReader { "SearchSameRdr" Search > : 5000 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav > : 300 { "SrchTrvRetNewRdr" SearchTravRet > : 100 OpenReader [ "SearchSameRdr" Search > : 5000 : 2500 CloseReader [ "WarmNewRdr" Warm > : 50 : 25 [ "SrchNewRdr" Search > : 50 : 25 [ "SrchTrvNewRdr" SearchTrav > : 300 : 150 [ "SrchTrvRetNewRdr" SearchTravRet > : 100 : 50 RepSumByPref MAddDocs NewRound } : 8 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/sort-standard.alg�����������������������������������������������0000644�0001750�0001750�00000004014�11474320247�023247� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:50 compound=false sort.rng=20000:10000:20000:10000 analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=100000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" { "Run" ResetSystemErase { "Populate" -CreateIndex { "MAddDocs" AddDoc(100) > : 500000 -Optimize -CloseIndex } { "TestSortSpeed" OpenReader { "LoadFieldCacheAndSearch" SearchWithSort(sort_field:int) > : 1 { "SearchWithSort" SearchWithSort(sort_field:int) > : 5000 CloseReader } NewRound } : 4 } RepSumByName ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/highlight-profile.alg�������������������������������������������0000644�0001750�0001750�00000003712�11474320247�024073� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
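# Illustrative note, not part of the original file: unlike the "notv" highlight configuration,
# this profile stores term vectors with positions and offsets (the doc.term.vector* properties
# below), which is the data the term-vector based highlighting paths consume; without stored
# vectors the standard highlighter has to fall back to re-analyzing the stored body text.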
ram.flush.mb=flush:32:32 compound=cmpnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true log.step=2000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } { "Rounds" ResetSystemSoft OpenReader { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[10],fields[body]) > : 1000 CloseReader RepSumByPref MAddDocs NewRound } : 4 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs ������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/tokenize.alg����������������������������������������������������0000644�0001750�0001750�00000002521�11474320247�022313� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # # This alg reads all tokens out of a document but does not index them. # This is useful for benchmarking tokenizers. 
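# Illustrative example, not part of the original file: to compare tokenizers, the analyzer
# property can be pointed at a different implementation before running, for example
#
#   analyzer=org.apache.lucene.analysis.WhitespaceAnalyzer
#
# instead of the analyzer otherwise in effect; the ReadTokens loop below then measures pure
# tokenization cost for that analyzer, since nothing is indexed.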
#
# To use this, cd to contrib/benchmark and then run:
#
#   ant run-task -Dtask.alg=conf/tokenize.alg
#

content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource
content.source.forever=false

# -------------------------------------------------------------------------------------

{ReadTokens > : *

RepSumByName

lucene-2.9.4/contrib/benchmark/conf/createLineFile.alg

#/**
# * Licensed to the Apache Software Foundation (ASF) under one or more
# * contributor license agreements.  See the NOTICE file distributed with
# * this work for additional information regarding copyright ownership.
# * The ASF licenses this file to You under the Apache License, Version 2.0
# * (the "License"); you may not use this file except in compliance with
# * the License.  You may obtain a copy of the License at
# *
# *     http://www.apache.org/licenses/LICENSE-2.0
# *
# * Unless required by applicable law or agreed to in writing, software
# * distributed under the License is distributed on an "AS IS" BASIS,
# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# * See the License for the specific language governing permissions and
# * limitations under the License.
# */
# -------------------------------------------------------------------------------------
#
# This alg will process the Reuters documents feed to produce a
# single file that contains all documents, one per line.
#
# To use this, first cd to contrib/benchmark and then run:
#
#   ant run-task -Dtask.alg=conf/createLineFile.alg
#
# Then, to index the documents in the line file, see
# indexLineFile.alg.
# # Where to get documents from: content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource # Where to write the line file output: line.file.out=work/reuters.lines.txt # Stop after processing the document feed once: content.source.forever=false # ------------------------------------------------------------------------------------- # Process all documents, appending each one to the line file: {WriteLineDoc()}: * ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/vector-highlight-profile.alg������������������������������������0000644�0001750�0001750�00000003723�11474320247�025375� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
ram.flush.mb=flush:32:32 compound=cmpnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true log.step=2000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } { "Rounds" ResetSystemSoft OpenReader { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[10],fields[body]) > : 1000 CloseReader RepSumByPref MAddDocs NewRound } : 4 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs ���������������������������������������������lucene-2.9.4/contrib/benchmark/conf/highlight-vs-vector-highlight.alg�������������������������������0000644�0001750�0001750�00000004450�11474320247�026330� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. 
# */ # ------------------------------------------------------------------------------------- ram.flush.mb=flush:32:32 compound=cmpnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true log.step=2000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource docs.file=temp/enwiki-20070527-pages-articles.xml query.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiQueryMaker enwikiQueryMaker.disableSpanQueries=true max.field.length=2147483647 highlighter.maxDocCharsToAnalyze=2147483647 # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } { OpenReader { "WarmTV" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 100 CloseReader } { "Rounds" ResetSystemSoft OpenReader { "SearchVecHlgtSameRdr" SearchTravRetVectorHighlight(maxFrags[3],fields[body]) > : 200 CloseReader ResetSystemSoft OpenReader { "SearchHlgtSameRdr" SearchTravRetHighlight(maxFrags[3],fields[body]) > : 200 CloseReader RepSumByPref Search NewRound } : 4 RepSumByNameRound RepSumByName ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/standard-highlights-tv.alg��������������������������������������0000644�0001750�0001750�00000004116�11474320247�025044� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
ram.flush.mb=flush:32:32 compound=cmpnd:true:false analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory doc.stored=true doc.tokenized=true doc.term.vector=true doc.term.vector.offsets=true doc.term.vector.positions=true log.step=2000 docs.dir=reuters-out content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Populate" CreateIndex { "MAddDocs" AddDoc } : 20000 Optimize CloseIndex } { "Rounds" ResetSystemSoft OpenReader { "SrchTrvRetNewRdr" SearchTravRet(10) > : 1000 CloseReader OpenReader { "SearchHlgtSameRdr" SearchTravRetHighlight(size[10],highlight[10],mergeContiguous[true],maxFrags[3],fields[body]) > : 1000 CloseReader RepSumByPref SearchHlgtSameRdr NewRound } : 2 RepSumByNameRound RepSumByName RepSumByPrefRound MAddDocs ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/micro-standard.alg����������������������������������������������0000644�0001750�0001750�00000004412�11474320247�023373� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:10:100:10:100 max.buffered=buf:10:10:100:100 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase { "Populate" -CreateIndex { "MAddDocs" AddDoc > : 2000 -Optimize -CloseIndex } OpenReader { "SearchSameRdr" Search > : 5000 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav(1000) > : 300 { "SrchTrvRetNewRdr" SearchTravRet(2000) > : 100 NewRound } : 4 RepSumByName RepSumByPrefRound MAddDocs ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/benchmark/conf/analyzer.alg����������������������������������������������������0000644�0001750�0001750�00000004734�11474320247�022320� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # multi val params are iterated by NewRound's, added to reports, start with column name. 
merge.factor=mrg:10 #:100:10:100 max.buffered=buf:10 #:10:100:100 compound=true analyzer=org.apache.lucene.analysis.standard.StandardAnalyzer directory=FSDirectory #directory=RamDirectory doc.stored=true doc.tokenized=true doc.term.vector=false log.step=500 docs.dir=reuters-out #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource #query.maker=org.apache.lucene.benchmark.byTask.feeds.SimpleQueryMaker query.maker=org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker # task at this depth or less would print when they start task.max.depth.log=2 log.queries=true # ------------------------------------------------------------------------------------- { "Rounds" ResetSystemErase #If the analyzer is in o.a.l.analysis, then just the classname can be used, otherwise the FQN must be used #Standard Analyzer can be shortened to standard.StandardAnalyzer {"NewAnalyzer" NewAnalyzer(WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, standard.StandardAnalyzer) > { "Populate" CreateIndex { "MAddDocs" AddDoc > : 2000 Optimize CloseIndex } OpenReader { "SearchSameRdr" Search > : 5000 CloseReader { "WarmNewRdr" Warm > : 50 { "SrchNewRdr" Search > : 500 { "SrchTrvNewRdr" SearchTrav(1000) > : 300 { "SrchTrvRetNewRdr" SearchTravRet(2000) > : 100 NewRound } : 4 RepSumByName RepSumByPrefRound MAddDocs ������������������������������������lucene-2.9.4/contrib/benchmark/conf/extractWikipedia.alg��������������������������������������������0000644�0001750�0001750�00000003242�11474320247�023765� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#/** # * Licensed to the Apache Software Foundation (ASF) under one or more # * contributor license agreements. See the NOTICE file distributed with # * this work for additional information regarding copyright ownership. # * The ASF licenses this file to You under the Apache License, Version 2.0 # * (the "License"); you may not use this file except in compliance with # * the License. You may obtain a copy of the License at # * # * http://www.apache.org/licenses/LICENSE-2.0 # * # * Unless required by applicable law or agreed to in writing, software # * distributed under the License is distributed on an "AS IS" BASIS, # * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # * See the License for the specific language governing permissions and # * limitations under the License. # */ # ------------------------------------------------------------------------------------- # # This alg will process the Wikipedia documents feed to produce a # single file that contains all documents, one per line. # # To use this, first cd to contrib/benchmark and then run: # # ant run-task -Dtask.alg=conf/extractWikipedia.alg # # Then, to index the documents in the line file, see # indexLineFile.alg. 
# # Where to get documents from: doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker docs.file=temp/enwiki-20070527-pages-articles.xml # Where to write the line file output: line.file.out=work/enwiki.txt # Stop after processing the document feed once: content.source.forever=false # ------------------------------------------------------------------------------------- # Process all documents, appending each one to the line file: {WriteLineDoc() > : * ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/�������������������������������������������������������0000755�0001750�0001750�00000000000�11554106561�021650� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/build.xml����������������������������������������������0000644�0001750�0001750�00000003444�11474320246�023475� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<?xml version="1.0"?> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
--> <project name="fast-vector-highlighter" default="default"> <description> Hits highlighter using TermVectors </description> <property name="javac.source" value="1.5" /> <property name="javac.target" value="1.5" /> <import file="../contrib-build.xml"/> <property name="analyzers.jar" location="${common.dir}/build/contrib/analyzers/lucene-analyzers-${version}.jar"/> <available property="analyzers.jar.present" type="file" file="${analyzers.jar}"/> <path id="classpath"> <pathelement path="${lucene.jar}"/> <pathelement path="${analyzers.jar}"/> <pathelement path="${project.classpath}"/> </path> <target name="compile-core" depends="build-analyzers, contrib-build.compile-core" /> <target name="build-analyzers" unless="analyzers.jar.present"> <echo>Fast Vector Highlighter building dependency ${analyzers.jar}</echo> <ant antfile="../analyzers/build.xml" target="default" inheritall="false" dir="../analyzers" /> </target> </project> ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/���������������������������������������������������0000755�0001750�0001750�00000000000�11474320245�022435� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/����������������������������������������������0000755�0001750�0001750�00000000000�11554106561�023360� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/������������������������������������������0000755�0001750�0001750�00000000000�11474320245�024145� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/�����������������������������������0000755�0001750�0001750�00000000000�11474320245�025366� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/����������������������������0000755�0001750�0001750�00000000000�11474320245�026641� 5����������������������������������������������������������������������������������������������������ustar 
�janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/���������������������0000755�0001750�0001750�00000000000�11474320245�030106� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/�����0000755�0001750�0001750�00000000000�11554106561�033302� 5����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000165�00000000000�011567� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragmentsBuilder.java������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Fragm0000644�0001750�0001750�00000004242�11474320245�034261� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** * FragmentsBuilder is an interface for fragments (snippets) builder classes. * A FragmentsBuilder class can be plugged in to Highlighter. */ public interface FragmentsBuilder { /** * create a fragment. * * @param reader IndexReader of the index * @param docId document id to be highlighted * @param fieldName field of the document to be highlighted * @param fieldFragList FieldFragList object * @return a created fragment or null when no fragment created * @throws IOException */ public String createFragment( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList ) throws IOException; /** * create multiple fragments. * * @param reader IndexReader of the index * @param docId document id to be highlighter * @param fieldName field of the document to be highlighted * @param fieldFragList FieldFragList object * @param maxNumFragments maximum number of fragments * @return created fragments or null when no fragments created. * size of the array can be less than maxNumFragments * @throws IOException */ public String[] createFragments( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList, int maxNumFragments ) throws IOException; } ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000157�00000000000�011570� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldQuery.java������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Field0000644�0001750�0001750�00000031665�11474320245�034261� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; /** * FieldQuery breaks down query object into terms/phrases and keep * them in QueryPhraseMap structure. */ public class FieldQuery { final boolean fieldMatch; // fieldMatch==true, Map<fieldName,QueryPhraseMap> // fieldMatch==false, Map<null,QueryPhraseMap> Map<String, QueryPhraseMap> rootMaps = new HashMap<String, QueryPhraseMap>(); // fieldMatch==true, Map<fieldName,setOfTermsInQueries> // fieldMatch==false, Map<null,setOfTermsInQueries> Map<String, Set<String>> termSetMap = new HashMap<String, Set<String>>(); int termOrPhraseNumber; // used for colored tag support FieldQuery( Query query, boolean phraseHighlight, boolean fieldMatch ){ this.fieldMatch = fieldMatch; Set<Query> flatQueries = new HashSet<Query>(); flatten( query, flatQueries ); saveTerms( flatQueries ); Collection<Query> expandQueries = expand( flatQueries ); for( Query flatQuery : expandQueries ){ QueryPhraseMap rootMap = getRootMap( flatQuery ); rootMap.add( flatQuery ); if( !phraseHighlight && flatQuery instanceof PhraseQuery ){ PhraseQuery pq = (PhraseQuery)flatQuery; if( pq.getTerms().length > 1 ){ for( Term term : pq.getTerms() ) rootMap.addTerm( term, flatQuery.getBoost() ); } } } } void flatten( Query sourceQuery, Collection<Query> flatQueries ){ if( sourceQuery instanceof BooleanQuery ){ BooleanQuery bq = (BooleanQuery)sourceQuery; for( BooleanClause clause : bq.getClauses() ){ if( !clause.isProhibited() ) flatten( clause.getQuery(), flatQueries ); } } else if( sourceQuery instanceof TermQuery ){ if( !flatQueries.contains( sourceQuery ) ) flatQueries.add( sourceQuery ); } else if( sourceQuery instanceof PhraseQuery ){ if( !flatQueries.contains( sourceQuery ) ){ PhraseQuery pq = (PhraseQuery)sourceQuery; if( pq.getTerms().length > 1 ) flatQueries.add( pq ); else if( pq.getTerms().length == 1 ){ flatQueries.add( new TermQuery( pq.getTerms()[0] ) ); } } } // else discard queries } /* * Create expandQueries from flatQueries. 
* * expandQueries := flatQueries + overlapped phrase queries * * ex1) flatQueries={a,b,c} * => expandQueries={a,b,c} * ex2) flatQueries={a,"b c","c d"} * => expandQueries={a,"b c","c d","b c d"} */ Collection<Query> expand( Collection<Query> flatQueries ){ Set<Query> expandQueries = new HashSet<Query>(); for( Iterator<Query> i = flatQueries.iterator(); i.hasNext(); ){ Query query = i.next(); i.remove(); expandQueries.add( query ); if( !( query instanceof PhraseQuery ) ) continue; for( Iterator<Query> j = flatQueries.iterator(); j.hasNext(); ){ Query qj = j.next(); if( !( qj instanceof PhraseQuery ) ) continue; checkOverlap( expandQueries, (PhraseQuery)query, (PhraseQuery)qj ); } } return expandQueries; } /* * Check if PhraseQuery A and B have overlapped part. * * ex1) A="a b", B="b c" => overlap; expandQueries={"a b c"} * ex2) A="b c", B="a b" => overlap; expandQueries={"a b c"} * ex3) A="a b", B="c d" => no overlap; expandQueries={} */ private void checkOverlap( Collection<Query> expandQueries, PhraseQuery a, PhraseQuery b ){ if( a.getSlop() != b.getSlop() ) return; Term[] ats = a.getTerms(); Term[] bts = b.getTerms(); if( fieldMatch && !ats[0].field().equals( bts[0].field() ) ) return; checkOverlap( expandQueries, ats, bts, a.getSlop(), a.getBoost() ); checkOverlap( expandQueries, bts, ats, b.getSlop(), b.getBoost() ); } /* * Check if src and dest have overlapped part and if it is, create PhraseQueries and add expandQueries. * * ex1) src="a b", dest="c d" => no overlap * ex2) src="a b", dest="a b c" => no overlap * ex3) src="a b", dest="b c" => overlap; expandQueries={"a b c"} * ex4) src="a b c", dest="b c d" => overlap; expandQueries={"a b c d"} * ex5) src="a b c", dest="b c" => no overlap * ex6) src="a b c", dest="b" => no overlap * ex7) src="a a a a", dest="a a a" => overlap; * expandQueries={"a a a a a","a a a a a a"} * ex8) src="a b c d", dest="b c" => no overlap */ private void checkOverlap( Collection<Query> expandQueries, Term[] src, Term[] dest, int slop, float boost ){ // beginning from 1 (not 0) is safe because that the PhraseQuery has multiple terms // is guaranteed in flatten() method (if PhraseQuery has only one term, flatten() // converts PhraseQuery to TermQuery) for( int i = 1; i < src.length; i++ ){ boolean overlap = true; for( int j = i; j < src.length; j++ ){ if( ( j - i ) < dest.length && !src[j].text().equals( dest[j-i].text() ) ){ overlap = false; break; } } if( overlap && src.length - i < dest.length ){ PhraseQuery pq = new PhraseQuery(); for( Term srcTerm : src ) pq.add( srcTerm ); for( int k = src.length - i; k < dest.length; k++ ){ pq.add( new Term( src[0].field(), dest[k].text() ) ); } pq.setSlop( slop ); pq.setBoost( boost ); if(!expandQueries.contains( pq ) ) expandQueries.add( pq ); } } } QueryPhraseMap getRootMap( Query query ){ String key = getKey( query ); QueryPhraseMap map = rootMaps.get( key ); if( map == null ){ map = new QueryPhraseMap( this ); rootMaps.put( key, map ); } return map; } /* * Return 'key' string. 'key' is the field name of the Query. * If not fieldMatch, 'key' will be null. */ private String getKey( Query query ){ if( !fieldMatch ) return null; if( query instanceof TermQuery ) return ((TermQuery)query).getTerm().field(); else if ( query instanceof PhraseQuery ){ PhraseQuery pq = (PhraseQuery)query; Term[] terms = pq.getTerms(); return terms[0].field(); } else throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); } /* * Save the set of terms in the queries to termSetMap. 
* * ex1) q=name:john * - fieldMatch==true * termSetMap=Map<"name",Set<"john">> * - fieldMatch==false * termSetMap=Map<null,Set<"john">> * * ex2) q=name:john title:manager * - fieldMatch==true * termSetMap=Map<"name",Set<"john">, * "title",Set<"manager">> * - fieldMatch==false * termSetMap=Map<null,Set<"john","manager">> * * ex3) q=name:"john lennon" * - fieldMatch==true * termSetMap=Map<"name",Set<"john","lennon">> * - fieldMatch==false * termSetMap=Map<null,Set<"john","lennon">> */ void saveTerms( Collection<Query> flatQueries ){ for( Query query : flatQueries ){ Set<String> termSet = getTermSet( query ); if( query instanceof TermQuery ) termSet.add( ((TermQuery)query).getTerm().text() ); else if( query instanceof PhraseQuery ){ for( Term term : ((PhraseQuery)query).getTerms() ) termSet.add( term.text() ); } else throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." ); } } private Set<String> getTermSet( Query query ){ String key = getKey( query ); Set<String> set = termSetMap.get( key ); if( set == null ){ set = new HashSet<String>(); termSetMap.put( key, set ); } return set; } Set<String> getTermSet( String field ){ return termSetMap.get( fieldMatch ? field : null ); } /** * * @param fieldName * @param term * @return QueryPhraseMap */ public QueryPhraseMap getFieldTermMap( String fieldName, String term ){ QueryPhraseMap rootMap = getRootMap( fieldName ); return rootMap == null ? null : rootMap.subMap.get( term ); } /** * * @param fieldName * @param phraseCandidate * @return QueryPhraseMap */ public QueryPhraseMap searchPhrase( String fieldName, final List<TermInfo> phraseCandidate ){ QueryPhraseMap root = getRootMap( fieldName ); if( root == null ) return null; return root.searchPhrase( phraseCandidate ); } private QueryPhraseMap getRootMap( String fieldName ){ return rootMaps.get( fieldMatch ? fieldName : null ); } int nextTermOrPhraseNumber(){ return termOrPhraseNumber++; } public static class QueryPhraseMap { boolean terminal; int slop; // valid if terminal == true and phraseHighlight == true float boost; // valid if terminal == true int termOrPhraseNumber; // valid if terminal == true FieldQuery fieldQuery; Map<String, QueryPhraseMap> subMap = new HashMap<String, QueryPhraseMap>(); public QueryPhraseMap( FieldQuery fieldQuery ){ this.fieldQuery = fieldQuery; } void addTerm( Term term, float boost ){ QueryPhraseMap map = getOrNewMap( subMap, term.text() ); map.markTerminal( boost ); } private QueryPhraseMap getOrNewMap( Map<String, QueryPhraseMap> subMap, String term ){ QueryPhraseMap map = subMap.get( term ); if( map == null ){ map = new QueryPhraseMap( fieldQuery ); subMap.put( term, map ); } return map; } void add( Query query ){ if( query instanceof TermQuery ){ addTerm( ((TermQuery)query).getTerm(), query.getBoost() ); } else if( query instanceof PhraseQuery ){ PhraseQuery pq = (PhraseQuery)query; Term[] terms = pq.getTerms(); Map<String, QueryPhraseMap> map = subMap; QueryPhraseMap qpm = null; for( Term term : terms ){ qpm = getOrNewMap( map, term.text() ); map = qpm.subMap; } qpm.markTerminal( pq.getSlop(), pq.getBoost() ); } else throw new RuntimeException( "query \"" + query.toString() + "\" must be flatten first." 
); } public QueryPhraseMap getTermMap( String term ){ return subMap.get( term ); } private void markTerminal( float boost ){ markTerminal( 0, boost ); } private void markTerminal( int slop, float boost ){ this.terminal = true; this.slop = slop; this.boost = boost; this.termOrPhraseNumber = fieldQuery.nextTermOrPhraseNumber(); } public boolean isTerminal(){ return terminal; } public int getSlop(){ return slop; } public float getBoost(){ return boost; } public int getTermOrPhraseNumber(){ return termOrPhraseNumber; } public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){ QueryPhraseMap currMap = this; for( TermInfo ti : phraseCandidate ){ currMap = currMap.subMap.get( ti.getText() ); if( currMap == null ) return null; } return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null; } public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){ // check terminal if( !terminal ) return false; // if the candidate is a term, it is valid if( phraseCandidate.size() == 1 ) return true; // else check whether the candidate is valid phrase // compare position-gaps between terms to slop int pos = phraseCandidate.get( 0 ).getPosition(); for( int i = 1; i < phraseCandidate.size(); i++ ){ int nextPos = phraseCandidate.get( i ).getPosition(); if( Math.abs( nextPos - pos - 1 ) > slop ) return false; pos = nextPos; } return true; } } } ���������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000173�00000000000�011566� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilder.java������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Simpl0000644�0001750�0001750�00000003076�11474320245�034315� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; /** * A simple implementation of FragmentsBuilder. * */ public class SimpleFragmentsBuilder extends BaseFragmentsBuilder { /** * a constructor. */ public SimpleFragmentsBuilder() { super(); } /** * a constructor. * * @param preTags array of pre-tags for markup terms. * @param postTags array of post-tags for markup terms. */ public SimpleFragmentsBuilder( String[] preTags, String[] postTags ) { super( preTags, postTags ); } /** * do nothing. return the source list. */ public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) { return src; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000171�00000000000�011564� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseF0000644�0001750�0001750�00000016356�11474320245�034216� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.MapFieldSelector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; public abstract class BaseFragmentsBuilder implements FragmentsBuilder { protected String[] preTags, postTags; public static final String[] COLORED_PRE_TAGS = { "<b style=\"background:yellow\">", "<b style=\"background:lawngreen\">", "<b style=\"background:aquamarine\">", "<b style=\"background:magenta\">", "<b style=\"background:palegreen\">", "<b style=\"background:coral\">", "<b style=\"background:wheat\">", "<b style=\"background:khaki\">", "<b style=\"background:lime\">", "<b style=\"background:deepskyblue\">", "<b style=\"background:deeppink\">", "<b style=\"background:salmon\">", "<b style=\"background:peachpuff\">", "<b style=\"background:violet\">", "<b style=\"background:mediumpurple\">", "<b style=\"background:palegoldenrod\">", "<b style=\"background:darkkhaki\">", "<b style=\"background:springgreen\">", "<b style=\"background:turquoise\">", "<b style=\"background:powderblue\">" }; public static final String[] COLORED_POST_TAGS = { "</b>" }; protected BaseFragmentsBuilder(){ this( new String[]{ "<b>" }, new String[]{ "</b>" } ); } protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){ this.preTags = preTags; this.postTags = postTags; } static Object checkTagsArgument( Object tags ){ if( tags instanceof String ) return tags; else if( tags instanceof String[] ) return tags; throw new IllegalArgumentException( "type of preTags/postTags must be a String or String[]" ); } public abstract List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ); public String createFragment( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList ) throws IOException { String[] fragments = createFragments( reader, docId, fieldName, fieldFragList, 1 ); if( fragments == null || fragments.length == 0 ) return null; return fragments[0]; } public String[] createFragments( IndexReader reader, int docId, String fieldName, FieldFragList fieldFragList, int maxNumFragments ) throws IOException { if( maxNumFragments < 0 ) throw new IllegalArgumentException( "maxNumFragments(" + maxNumFragments + ") must be positive number." 
); List<WeightedFragInfo> fragInfos = getWeightedFragInfoList( fieldFragList.fragInfos ); List<String> fragments = new ArrayList<String>( maxNumFragments ); Field[] values = getFields( reader, docId, fieldName ); if( values.length == 0 ) return null; StringBuilder buffer = new StringBuilder(); int[] nextValueIndex = { 0 }; for( int n = 0; n < maxNumFragments && n < fragInfos.size(); n++ ){ WeightedFragInfo fragInfo = fragInfos.get( n ); fragments.add( makeFragment( buffer, nextValueIndex, values, fragInfo ) ); } return fragments.toArray( new String[fragments.size()] ); } @Deprecated protected String[] getFieldValues( IndexReader reader, int docId, String fieldName) throws IOException { Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); return doc.getValues( fieldName ); // according to Document class javadoc, this never returns null } protected Field[] getFields( IndexReader reader, int docId, String fieldName) throws IOException { // according to javadoc, doc.getFields(fieldName) cannot be used with lazy loaded field??? Document doc = reader.document( docId, new MapFieldSelector( new String[]{ fieldName } ) ); return doc.getFields( fieldName ); // according to Document class javadoc, this never returns null } @Deprecated protected String makeFragment( StringBuilder buffer, int[] index, String[] values, WeightedFragInfo fragInfo ){ final int s = fragInfo.startOffset; return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); } protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo ){ final int s = fragInfo.startOffset; return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s ); } private String makeFragment( WeightedFragInfo fragInfo, String src, int s ){ StringBuilder fragment = new StringBuilder(); int srcIndex = 0; for( SubInfo subInfo : fragInfo.subInfos ){ for( Toffs to : subInfo.termsOffsets ){ fragment.append( src.substring( srcIndex, to.startOffset - s ) ).append( getPreTag( subInfo.seqnum ) ) .append( src.substring( to.startOffset - s, to.endOffset - s ) ).append( getPostTag( subInfo.seqnum ) ); srcIndex = to.endOffset - s; } } fragment.append( src.substring( srcIndex ) ); return fragment.toString(); } @Deprecated protected String getFragmentSource( StringBuilder buffer, int[] index, String[] values, int startOffset, int endOffset ){ while( buffer.length() < endOffset && index[0] < values.length ){ buffer.append( values[index[0]] ); if( values[index[0]].length() > 0 && index[0] + 1 < values.length ) buffer.append( ' ' ); index[0]++; } int eo = buffer.length() < endOffset ? buffer.length() : endOffset; return buffer.substring( startOffset, eo ); } protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset ){ while( buffer.length() < endOffset && index[0] < values.length ){ buffer.append( values[index[0]].stringValue() ); if( values[index[0]].isTokenized() && values[index[0]].stringValue().length() > 0 && index[0] + 1 < values.length ) buffer.append( ' ' ); index[0]++; } int eo = buffer.length() < endOffset ? 
buffer.length() : endOffset; return buffer.substring( startOffset, eo ); } protected String getPreTag( int num ){ int n = num % preTags.length; return preTags[n]; } protected String getPostTag( int num ){ int n = num % postTags.length; return postTags[n]; } } ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000172�00000000000�011565� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastVectorHighlighter.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FastV0000644�0001750�0001750�00000012205�11474320245�034246� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; /** * Another highlighter implementation. * */ public class FastVectorHighlighter { public static final boolean DEFAULT_PHRASE_HIGHLIGHT = true; public static final boolean DEFAULT_FIELD_MATCH = true; private final boolean phraseHighlight; private final boolean fieldMatch; private final FragListBuilder fragListBuilder; private final FragmentsBuilder fragmentsBuilder; /** * the default constructor. */ public FastVectorHighlighter(){ this( DEFAULT_PHRASE_HIGHLIGHT, DEFAULT_FIELD_MATCH ); } /** * a constructor. 
Using SimpleFragListBuilder and ScoreOrderFragmentsBuilder. * * @param phraseHighlight true or false for phrase highlighting * @param fieldMatch true of false for field matching */ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch ){ this( phraseHighlight, fieldMatch, new SimpleFragListBuilder(), new ScoreOrderFragmentsBuilder() ); } /** * a constructor. A FragListBuilder and a FragmentsBuilder can be specified (plugins). * * @param phraseHighlight true of false for phrase highlighting * @param fieldMatch true of false for field matching * @param fragListBuilder an instance of FragListBuilder * @param fragmentsBuilder an instance of FragmentsBuilder */ public FastVectorHighlighter( boolean phraseHighlight, boolean fieldMatch, FragListBuilder fragListBuilder, FragmentsBuilder fragmentsBuilder ){ this.phraseHighlight = phraseHighlight; this.fieldMatch = fieldMatch; this.fragListBuilder = fragListBuilder; this.fragmentsBuilder = fragmentsBuilder; } /** * create a FieldQuery object. * * @param query a query * @return the created FieldQuery object */ public FieldQuery getFieldQuery( Query query ){ return new FieldQuery( query, phraseHighlight, fieldMatch ); } /** * return the best fragment. * * @param fieldQuery FieldQuery object * @param reader IndexReader of the index * @param docId document id to be highlighted * @param fieldName field of the document to be highlighted * @param fragCharSize the length (number of chars) of a fragment * @return the best fragment (snippet) string * @throws IOException */ public final String getBestFragment( final FieldQuery fieldQuery, IndexReader reader, int docId, String fieldName, int fragCharSize ) throws IOException { FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); return fragmentsBuilder.createFragment( reader, docId, fieldName, fieldFragList ); } /** * return the best fragments. * * @param fieldQuery FieldQuery object * @param reader IndexReader of the index * @param docId document id to be highlighted * @param fieldName field of the document to be highlighted * @param fragCharSize the length (number of chars) of a fragment * @param maxNumFragments maximum number of fragments * @return created fragments or null when no fragments created. * size of the array can be less than maxNumFragments * @throws IOException */ public final String[] getBestFragments( final FieldQuery fieldQuery, IndexReader reader, int docId, String fieldName, int fragCharSize, int maxNumFragments ) throws IOException { FieldFragList fieldFragList = getFieldFragList( fieldQuery, reader, docId, fieldName, fragCharSize ); return fragmentsBuilder.createFragments( reader, docId, fieldName, fieldFragList, maxNumFragments ); } private FieldFragList getFieldFragList( final FieldQuery fieldQuery, IndexReader reader, int docId, String fieldName, int fragCharSize ) throws IOException { FieldTermStack fieldTermStack = new FieldTermStack( reader, docId, fieldName, fieldQuery ); FieldPhraseList fieldPhraseList = new FieldPhraseList( fieldTermStack, fieldQuery ); return fragListBuilder.createFieldFragList( fieldPhraseList, fragCharSize ); } /** * return whether phraseHighlight or not. * * @return whether phraseHighlight or not */ public boolean isPhraseHighlight(){ return phraseHighlight; } /** * return whether fieldMatch or not. 
* * @return whether fieldMatch or not */ public boolean isFieldMatch(){ return fieldMatch; } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000164�00000000000�011566� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragListBuilder.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FragL0000644�0001750�0001750�00000002475�11474320245�034226� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * FragListBuilder is an interface for FieldFragList builder classes. * A FragListBuilder class can be plugged in to Highlighter. */ public interface FragListBuilder { /** * create a FieldFragList. 
* * @param fieldPhraseList FieldPhraseList object * @param fragCharSize the length (number of chars) of a fragment * @return the created FieldFragList object */ public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ); } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000154�00000000000�011565� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/package.html���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/packa0000644�0001750�0001750�00000012723�11474320245�034307� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<!doctype html public "-//w3c//dtd html 4.0 transitional//en"> <!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> <html> <body> This is an another highlighter implementation. 
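<p>A minimal usage sketch (illustrative only; it assumes the field <code>"f"</code> was
indexed with <code>TermVector.WITH_POSITIONS_OFFSETS</code> and that <code>query</code>,
<code>reader</code> and <code>docId</code> are supplied by the caller):</p>
<pre>
FastVectorHighlighter highlighter = new FastVectorHighlighter();
FieldQuery fieldQuery = highlighter.getFieldQuery( query );
// 100 is the requested fragment length in characters; null is returned
// when no fragment can be created for this document/field.
String snippet = highlighter.getBestFragment( fieldQuery, reader, docId, "f", 100 );
</pre>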
<h2>Features</h2> <ul> <li>fast for large docs</li> <li>support N-gram fields</li> <li>support phrase-unit highlighting with slops</li> <li>need Java 1.5</li> <li>highlight fields need to be TermVector.WITH_POSITIONS_OFFSETS</li> <li>take into account query boost to score fragments</li> <li>support colored highlight tags</li> <li>pluggable FragListBuilder</li> <li>pluggable FragmentsBuilder</li> </ul> <h2>Algorithm</h2> <p>To explain the algorithm, let's use the following sample text (to be highlighted) and user query:</p> <table border=1> <tr> <td><b>Sample Text</b></td> <td>Lucene is a search engine library.</td> </tr> <tr> <td><b>User Query</b></td> <td>Lucene^2 OR "search library"~1</td> </tr> </table> <p>The user query is a BooleanQuery that consists of TermQuery("Lucene") with boost of 2 and PhraseQuery("search library") with slop of 1.</p> <p>For your convenience, here is the offsets and positions info of the sample text.</p> <pre> +--------+-----------------------------------+ | | 1111111111222222222233333| | offset|01234567890123456789012345678901234| +--------+-----------------------------------+ |document|Lucene is a search engine library. | +--------*-----------------------------------+ |position|0 1 2 3 4 5 | +--------*-----------------------------------+ </pre> <h3>Step 1.</h3> <p>In Step 1, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap} from the user query. <code>QueryPhraseMap</code> consists of the following members:</p> <pre> public class QueryPhraseMap { boolean terminal; int slop; // valid if terminal == true and phraseHighlight == true float boost; // valid if terminal == true Map<String, QueryPhraseMap> subMap; } </pre> <p><code>QueryPhraseMap</code> has subMap. The key of the subMap is a term text in the user query and the value is a subsequent <code>QueryPhraseMap</code>. If the query is a term (not phrase), then the subsequent <code>QueryPhraseMap</code> is marked as terminal. If the query is a phrase, then the subsequent <code>QueryPhraseMap</code> is not a terminal and it has the next term text in the phrase.</p> <p>From the sample user query, the following <code>QueryPhraseMap</code> will be generated:</p> <pre> QueryPhraseMap +--------+-+ +-------+-+ |"Lucene"|o+->|boost=2|*| * : terminal +--------+-+ +-------+-+ +--------+-+ +---------+-+ +-------+------+-+ |"search"|o+->|"library"|o+->|boost=1|slop=1|*| +--------+-+ +---------+-+ +-------+------+-+ </pre> <h3>Step 2.</h3> <p>In Step 2, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldTermStack}. Fast Vector Highlighter uses {@link org.apache.lucene.index.TermFreqVector} data (must be stored {@link org.apache.lucene.document.Field.TermVector#WITH_POSITIONS_OFFSETS}) to generate it. <code>FieldTermStack</code> keeps the terms in the user query. 
Therefore, in this sample case, Fast Vector Highlighter generates the following <code>FieldTermStack</code>:</p> <pre> FieldTermStack +------------------+ |"Lucene"(0,6,0) | +------------------+ |"search"(12,18,3) | +------------------+ |"library"(26,33,5)| +------------------+ where : "termText"(startOffset,endOffset,position) </pre> <h3>Step 3.</h3> <p>In Step 3, Fast Vector Highlighter generates {@link org.apache.lucene.search.vectorhighlight.FieldPhraseList} by reference to <code>QueryPhraseMap</code> and <code>FieldTermStack</code>.</p> <pre> FieldPhraseList +----------------+-----------------+---+ |"Lucene" |[(0,6)] |w=2| +----------------+-----------------+---+ |"search library"|[(12,18),(26,33)]|w=1| +----------------+-----------------+---+ </pre> <p>The type of each entry is <code>WeightedPhraseInfo</code> that consists of an array of terms offsets and weight. The weight (Fast Vector Highlighter uses query boost to calculate the weight) will be taken into account when Fast Vector Highlighter creates {@link org.apache.lucene.search.vectorhighlight.FieldFragList} in the next step.</p> <h3>Step 4.</h3> <p>In Step 4, Fast Vector Highlighter creates <code>FieldFragList</code> by reference to <code>FieldPhraseList</code>. In this sample case, the following <code>FieldFragList</code> will be generated:</p> <pre> FieldFragList +---------------------------------+ |"Lucene"[(0,6)] | |"search library"[(12,18),(26,33)]| |totalBoost=3 | +---------------------------------+ </pre> <h3>Step 5.</h3> <p>In Step 5, by using <code>FieldFragList</code> and the field stored data, Fast Vector Highlighter creates highlighted snippets!</p> </body> </html> ���������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000163�00000000000�011565� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Field0000644�0001750�0001750�00000013624�11474320245�034254� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collections; import java.util.LinkedList; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.TermFreqVector; import org.apache.lucene.index.TermPositionVector; import org.apache.lucene.index.TermVectorOffsetInfo; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.Query; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; /** * <code>FieldTermStack</code> is a stack that keeps query terms in the specified field * of the document to be highlighted. */ public class FieldTermStack { private final String fieldName; LinkedList<TermInfo> termList = new LinkedList<TermInfo>(); public static void main( String[] args ) throws Exception { Analyzer analyzer = new WhitespaceAnalyzer(); QueryParser parser = new QueryParser( "f", analyzer ); Query query = parser.parse( "a x:b" ); FieldQuery fieldQuery = new FieldQuery( query, true, false ); Directory dir = new RAMDirectory(); IndexWriter writer = new IndexWriter( dir, analyzer, MaxFieldLength.LIMITED ); Document doc = new Document(); doc.add( new Field( "f", "a a a b b c a b b c d e f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); doc.add( new Field( "f", "b a b a f", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); writer.addDocument( doc ); writer.close(); IndexReader reader = IndexReader.open( dir ); FieldTermStack ftl = new FieldTermStack( reader, 0, "f", fieldQuery ); reader.close(); } /** * a constructor. 
* * @param reader IndexReader of the index * @param docId document id to be highlighted * @param fieldName field of the document to be highlighted * @param fieldQuery FieldQuery object * @throws IOException */ public FieldTermStack( IndexReader reader, int docId, String fieldName, final FieldQuery fieldQuery ) throws IOException { this.fieldName = fieldName; TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName ); if( tfv == null ) return; // just return to make null snippets TermPositionVector tpv = null; try{ tpv = (TermPositionVector)tfv; } catch( ClassCastException e ){ return; // just return to make null snippets } Set<String> termSet = fieldQuery.getTermSet( fieldName ); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if( termSet == null ) return; for( String term : tpv.getTerms() ){ if( !termSet.contains( term ) ) continue; int index = tpv.indexOf( term ); TermVectorOffsetInfo[] tvois = tpv.getOffsets( index ); if( tvois == null ) return; // just return to make null snippets int[] poss = tpv.getTermPositions( index ); if( poss == null ) return; // just return to make null snippets for( int i = 0; i < tvois.length; i++ ) termList.add( new TermInfo( term, tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) ); } // sort by position Collections.sort( termList ); } /** * @return field name */ public String getFieldName(){ return fieldName; } /** * @return the top TermInfo object of the stack */ public TermInfo pop(){ return termList.poll(); } /** * @param termInfo the TermInfo object to be put on the top of the stack */ public void push( TermInfo termInfo ){ // termList.push( termInfo ); // avoid Java 1.6 feature termList.addFirst( termInfo ); } /** * to know whether the stack is empty * * @return true if the stack is empty, false if not */ public boolean isEmpty(){ return termList == null || termList.size() == 0; } public static class TermInfo implements Comparable<TermInfo>{ final String text; final int startOffset; final int endOffset; final int position; TermInfo( String text, int startOffset, int endOffset, int position ){ this.text = text; this.startOffset = startOffset; this.endOffset = endOffset; this.position = position; } public String getText(){ return text; } public int getStartOffset(){ return startOffset; } public int getEndOffset(){ return endOffset; } public int getPosition(){ return position; } public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( text ).append( '(' ).append(startOffset).append( ',' ).append( endOffset ).append( ',' ).append( position ).append( ')' ); return sb.toString(); } public int compareTo( TermInfo o ) { return ( this.position - o.position ); } } } ������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000162�00000000000�011564� L����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldFragList.java���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Field0000644�0001750�0001750�00000007211�11474320245�034247� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo.Toffs; /** * FieldFragList has a list of "frag info" that is used by FragmentsBuilder class * to create fragments (snippets). */ public class FieldFragList { private final int fragCharSize; List<WeightedFragInfo> fragInfos = new ArrayList<WeightedFragInfo>(); /** * a constructor. 
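 * <p>A <code>FieldFragList</code> is normally created and filled by a <code>FragListBuilder</code>
 * implementation, which calls <code>add()</code> once per fragment; each call becomes a
 * <code>WeightedFragInfo</code> whose <code>totalBoost</code> is the sum of the boosts of the
 * phrases it contains. A minimal sketch (the phrase list is illustrative):</p>
 * <pre>
 *   FieldFragList ffl = new FieldFragList( 100 );
 *   ffl.add( 0, 100, phraseInfoList );   // one fragment covering chars [0,100)
 * </pre>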
* * @param fragCharSize the length (number of chars) of a fragment */ public FieldFragList( int fragCharSize ){ this.fragCharSize = fragCharSize; } /** * convert the list of WeightedPhraseInfo to WeightedFragInfo, then add it to the fragInfos * * @param startOffset start offset of the fragment * @param endOffset end offset of the fragment * @param phraseInfoList list of WeightedPhraseInfo objects */ public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){ fragInfos.add( new WeightedFragInfo( startOffset, endOffset, phraseInfoList ) ); } public static class WeightedFragInfo { List<SubInfo> subInfos; float totalBoost; int startOffset; int endOffset; public WeightedFragInfo( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ){ this.startOffset = startOffset; this.endOffset = endOffset; subInfos = new ArrayList<SubInfo>(); for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ SubInfo subInfo = new SubInfo( phraseInfo.text, phraseInfo.termsOffsets, phraseInfo.seqnum ); subInfos.add( subInfo ); totalBoost += phraseInfo.boost; } } public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( "subInfos=(" ); for( SubInfo si : subInfos ) sb.append( si.toString() ); sb.append( ")/" ).append( totalBoost ).append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); return sb.toString(); } static class SubInfo { final String text; // unnecessary member, just exists for debugging purpose final List<Toffs> termsOffsets; // usually termsOffsets.size() == 1, // but if position-gap > 1 and slop > 0 then size() could be greater than 1 int seqnum; SubInfo( String text, List<Toffs> termsOffsets, int seqnum ){ this.text = text; this.termsOffsets = termsOffsets; this.seqnum = seqnum; } public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( text ).append( '(' ); for( Toffs to : termsOffsets ) sb.append( to.toString() ); sb.append( ')' ); return sb.toString(); } } } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000177�00000000000�011572� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilder.java��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Score0000644�0001750�0001750�00000004257�11474320245�034306� 
0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collections; import java.util.Comparator; import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; /** * An implementation of FragmentsBuilder that outputs score-order fragments. */ public class ScoreOrderFragmentsBuilder extends BaseFragmentsBuilder { /** * a constructor. */ public ScoreOrderFragmentsBuilder(){ super(); } /** * a constructor. * * @param preTags array of pre-tags for markup terms. * @param postTags array of post-tags for markup terms. */ public ScoreOrderFragmentsBuilder( String[] preTags, String[] postTags ){ super( preTags, postTags ); } /** * Sort by score the list of WeightedFragInfo */ public List<WeightedFragInfo> getWeightedFragInfoList( List<WeightedFragInfo> src ) { Collections.sort( src, new ScoreComparator() ); return src; } public static class ScoreComparator implements Comparator<WeightedFragInfo> { public int compare( WeightedFragInfo o1, WeightedFragInfo o2 ) { if( o1.totalBoost > o2.totalBoost ) return -1; else if( o1.totalBoost < o2.totalBoost ) return 1; // if same score then check startOffset else{ if( o1.startOffset < o2.startOffset ) return -1; else if( o1.startOffset > o2.startOffset ) return 1; } return 0; } } } �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000164�00000000000�011566� L����������������������������������������������������������������������������������������������������ustar 
�root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Field0000644�0001750�0001750�00000014143�11474320245�034251� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; /** * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder * to create a FieldFragList object. */ public class FieldPhraseList { LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<WeightedPhraseInfo>(); /** * a constructor. 
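 * <p>The constructor consumes the whole <code>FieldTermStack</code>, greedily matching the
 * longest term sequence registered in the <code>FieldQuery</code> (backtracking to shorter
 * candidates when necessary) and keeping only non-overlapping phrases, in position order.
 * For the sample query and document used in the package documentation the resulting list
 * is roughly:</p>
 * <pre>
 *   "Lucene"[(0,6)]                    weight=2
 *   "search library"[(12,18),(26,33)]  weight=1
 * </pre>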
* * @param fieldTermStack FieldTermStack object * @param fieldQuery FieldQuery object */ public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery ){ final String field = fieldTermStack.getFieldName(); LinkedList<TermInfo> phraseCandidate = new LinkedList<TermInfo>(); QueryPhraseMap currMap = null; QueryPhraseMap nextMap = null; while( !fieldTermStack.isEmpty() ){ phraseCandidate.clear(); TermInfo ti = fieldTermStack.pop(); currMap = fieldQuery.getFieldTermMap( field, ti.getText() ); // if not found, discard top TermInfo from stack, then try next element if( currMap == null ) continue; // if found, search the longest phrase phraseCandidate.add( ti ); while( true ){ ti = fieldTermStack.pop(); nextMap = null; if( ti != null ) nextMap = currMap.getTermMap( ti.getText() ); if( ti == null || nextMap == null ){ if( ti != null ) fieldTermStack.push( ti ); if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); } else{ while( phraseCandidate.size() > 1 ){ fieldTermStack.push( phraseCandidate.removeLast() ); currMap = fieldQuery.searchPhrase( field, phraseCandidate ); if( currMap != null ){ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); break; } } } break; } else{ phraseCandidate.add( ti ); currMap = nextMap; } } } } void addIfNoOverlap( WeightedPhraseInfo wpi ){ for( WeightedPhraseInfo existWpi : phraseList ){ if( existWpi.isOffsetOverlap( wpi ) ) return; } phraseList.add( wpi ); } public static class WeightedPhraseInfo { String text; // unnecessary member, just exists for debugging purpose List<Toffs> termsOffsets; // usually termsOffsets.size() == 1, // but if position-gap > 1 and slop > 0 then size() could be greater than 1 float boost; // query boost int seqnum; public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){ this( terms, boost, 0 ); } public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int number ){ this.boost = boost; this.seqnum = number; termsOffsets = new ArrayList<Toffs>( terms.size() ); TermInfo ti = terms.get( 0 ); termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); if( terms.size() == 1 ){ text = ti.getText(); return; } StringBuilder sb = new StringBuilder(); sb.append( ti.getText() ); int pos = ti.getPosition(); for( int i = 1; i < terms.size(); i++ ){ ti = terms.get( i ); sb.append( ti.getText() ); if( ti.getPosition() - pos == 1 ){ Toffs to = termsOffsets.get( termsOffsets.size() - 1 ); to.setEndOffset( ti.getEndOffset() ); } else{ termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) ); } pos = ti.getPosition(); } text = sb.toString(); } public int getStartOffset(){ return termsOffsets.get( 0 ).startOffset; } public int getEndOffset(){ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; } public boolean isOffsetOverlap( WeightedPhraseInfo other ){ int so = getStartOffset(); int eo = getEndOffset(); int oso = other.getStartOffset(); int oeo = other.getEndOffset(); if( so <= oso && oso < eo ) return true; if( so < oeo && oeo <= eo ) return true; if( oso <= so && so < oeo ) return true; if( oso < eo && eo <= oeo ) return true; return false; } public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( text ).append( '(' ).append( boost ).append( ")(" ); for( Toffs to : termsOffsets ){ sb.append( to ); } sb.append( ')' ); return sb.toString(); } public static class Toffs { int startOffset; 
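      // end offset of this term run; extended via setEndOffset() when consecutive
      // phrase terms are merged into a single highlighted span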
int endOffset; public Toffs( int startOffset, int endOffset ){ this.startOffset = startOffset; this.endOffset = endOffset; } void setEndOffset( int endOffset ){ this.endOffset = endOffset; } public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); return sb.toString(); } } } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������././@LongLink���������������������������������������������������������������������������������������0000000�0000000�0000000�00000000172�00000000000�011565� L����������������������������������������������������������������������������������������������������ustar �root����������������������������root�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilder.java�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/org/apache/lucene/search/vectorhighlight/Simpl0000644�0001750�0001750�00000005464�11474320245�034320� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.lucene.search.vectorhighlight.FieldPhraseList.WeightedPhraseInfo; /** * A simple implementation of FragListBuilder. 
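 * Fragments are built in document order: a phrase that starts at or after the end of the
 * previous fragment opens a new fragment beginning MARGIN characters before it (clipped so
 * it does not overlap the previous fragment), later phrases that still end within the
 * fragment boundary join the same fragment, and phrases overlapping the previous fragment
 * are skipped. fragCharSize must be at least MIN_FRAG_CHAR_SIZE. A minimal usage sketch,
 * assuming a FieldPhraseList built as shown in FieldTermStack:
 * <pre>
 *   FieldFragList ffl = new SimpleFragListBuilder().createFieldFragList( fieldPhraseList, 100 );
 * </pre>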
*/ public class SimpleFragListBuilder implements FragListBuilder { public static final int MARGIN = 6; public static final int MIN_FRAG_CHAR_SIZE = MARGIN * 3; public FieldFragList createFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) { if( fragCharSize < MIN_FRAG_CHAR_SIZE ) throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + MIN_FRAG_CHAR_SIZE + " or higher." ); FieldFragList ffl = new FieldFragList( fragCharSize ); List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>(); Iterator<WeightedPhraseInfo> ite = fieldPhraseList.phraseList.iterator(); WeightedPhraseInfo phraseInfo = null; int startOffset = 0; boolean taken = false; while( true ){ if( !taken ){ if( !ite.hasNext() ) break; phraseInfo = ite.next(); } taken = false; if( phraseInfo == null ) break; // if the phrase violates the border of previous fragment, discard it and try next phrase if( phraseInfo.getStartOffset() < startOffset ) continue; wpil.clear(); wpil.add( phraseInfo ); int st = phraseInfo.getStartOffset() - MARGIN < startOffset ? startOffset : phraseInfo.getStartOffset() - MARGIN; int en = st + fragCharSize; if( phraseInfo.getEndOffset() > en ) en = phraseInfo.getEndOffset(); startOffset = en; while( true ){ if( ite.hasNext() ){ phraseInfo = ite.next(); taken = true; if( phraseInfo == null ) break; } else break; if( phraseInfo.getEndOffset() <= en ) wpil.add( phraseInfo ); else break; } ffl.add( st, en, wpil ); } return ffl; } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������lucene-2.9.4/contrib/fast-vector-highlighter/src/java/overview.html���������������������������������0000644�0001750�0001750�00000001614�11474320245�026114� 0����������������������������������������������������������������������������������������������������ustar �janpascal�����������������������janpascal��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
--> <html> <head> <title> fast-vector-highlighter fast-vector-highlighter lucene-2.9.4/contrib/fast-vector-highlighter/src/test/0000755000175000017500000000000011474320245023414 5ustar janpascaljanpascallucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/0000755000175000017500000000000011474320245024203 5ustar janpascaljanpascallucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/0000755000175000017500000000000011474320245025424 5ustar janpascaljanpascallucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/0000755000175000017500000000000011474320245026677 5ustar janpascaljanpascallucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/0000755000175000017500000000000011474320245030144 5ustar janpascaljanpascallucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/0000755000175000017500000000000011554106561033340 5ustar janpascaljanpascal././@LongLink0000000000000000000000000000020300000000000011560 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Score0000644000175000017500000000356511474320245034345 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.search.Query; public class ScoreOrderFragmentsBuilderTest extends AbstractTestCase { public void test3Frags() throws Exception { FieldFragList ffl = ffl( "a c", "a b b b b b b b b b b b a b a b b b b b c a a b b" ); ScoreOrderFragmentsBuilder sofb = new ScoreOrderFragmentsBuilder(); String[] f = sofb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); // check score order assertEquals( "c a a b b", f[0] ); assertEquals( "b b a b a b b b b b ", f[1] ); assertEquals( "a b b b b b b b b b ", f[2] ); } private FieldFragList ffl( String queryValue, String indexValue ) throws Exception { make1d1fIndex( indexValue ); Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); return new SimpleFragListBuilder().createFieldFragList( fpl, 20 ); } } ././@LongLink0000000000000000000000000000016500000000000011567 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/AbstractTestCase.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Abstr0000644000175000017500000003225711474320245034345 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.io.Reader; import java.util.Collection; import junit.framework.TestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; public abstract class AbstractTestCase extends TestCase { protected final String F = "f"; protected final String F1 = "f1"; protected final String F2 = "f2"; protected Directory dir; protected Analyzer analyzerW; protected Analyzer analyzerB; protected Analyzer analyzerK; protected IndexReader reader; protected QueryParser paW; protected QueryParser paB; protected static final String[] shortMVValues = { "", "", "a b c", "", // empty data in multi valued field "d e" }; protected static final String[] longMVValues = { "Followings are the examples of customizable parameters and actual examples of customization:", "The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically" }; // test data for LUCENE-1448 bug protected static final String[] biMVValues = { "\nLucene/Solr does not require such additional hardware.", "\nWhen you talk about processing speed, the" }; protected static final String[] strMVValues = { "abc", "defg", "hijkl" }; protected void setUp() throws Exception { analyzerW = new WhitespaceAnalyzer(); analyzerB = new BigramAnalyzer(); analyzerK = new KeywordAnalyzer(); paW = new QueryParser( F, analyzerW ); paB = new QueryParser( F, analyzerB ); dir = new RAMDirectory(); } protected void tearDown() throws Exception { if( reader != null ){ reader.close(); reader = null; } } protected Query tq( String text ){ return tq( 1F, text ); } protected Query tq( float boost, String text ){ return tq( boost, F, text ); } protected Query tq( String field, String text ){ return tq( 1F, field, text ); } protected Query tq( float boost, String field, String text ){ Query query = new TermQuery( new Term( field, text ) ); query.setBoost( boost ); return query; } protected Query pqF( String... texts ){ return pqF( 1F, texts ); } protected Query pqF( float boost, String... texts ){ return pqF( boost, 0, texts ); } protected Query pqF( float boost, int slop, String... texts ){ return pq( boost, slop, F, texts ); } protected Query pq( String field, String... texts ){ return pq( 1F, 0, field, texts ); } protected Query pq( float boost, String field, String... texts ){ return pq( boost, 0, field, texts ); } protected Query pq( float boost, int slop, String field, String... 
texts ){ PhraseQuery query = new PhraseQuery(); for( String text : texts ){ query.add( new Term( field, text ) ); } query.setBoost( boost ); query.setSlop( slop ); return query; } protected void assertCollectionQueries( Collection actual, Query... expected ){ assertEquals( expected.length, actual.size() ); for( Query query : expected ){ assertTrue( actual.contains( query ) ); } } static class BigramAnalyzer extends Analyzer { public TokenStream tokenStream(String fieldName, Reader reader) { return new BasicNGramTokenizer( reader ); } } static class BasicNGramTokenizer extends Tokenizer { public static final int DEFAULT_N_SIZE = 2; public static final String DEFAULT_DELIMITERS = " \t\n.,"; private final int n; private final String delimiters; private int startTerm; private int lenTerm; private int startOffset; private int nextStartOffset; private int ch; private String snippet; private StringBuilder snippetBuffer; private static final int BUFFER_SIZE = 4096; private char[] charBuffer; private int charBufferIndex; private int charBufferLen; public BasicNGramTokenizer( Reader in ){ this( in, DEFAULT_N_SIZE ); } public BasicNGramTokenizer( Reader in, int n ){ this( in, n, DEFAULT_DELIMITERS ); } public BasicNGramTokenizer( Reader in, String delimiters ){ this( in, DEFAULT_N_SIZE, delimiters ); } public BasicNGramTokenizer( Reader in, int n, String delimiters ){ super(in); this.n = n; this.delimiters = delimiters; startTerm = 0; nextStartOffset = 0; snippet = null; snippetBuffer = new StringBuilder(); charBuffer = new char[BUFFER_SIZE]; charBufferIndex = BUFFER_SIZE; charBufferLen = 0; ch = 0; } TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); public boolean incrementToken() throws IOException { if( !getNextPartialSnippet() ) return false; clearAttributes(); termAtt.setTermBuffer(snippet, startTerm, lenTerm); offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm)); return true; } private int getFinalOffset() { return nextStartOffset; } public final void end(){ offsetAtt.setOffset(getFinalOffset(),getFinalOffset()); } protected boolean getNextPartialSnippet() throws IOException { if( snippet != null && snippet.length() >= startTerm + 1 + n ){ startTerm++; startOffset++; lenTerm = n; return true; } return getNextSnippet(); } protected boolean getNextSnippet() throws IOException { startTerm = 0; startOffset = nextStartOffset; snippetBuffer.delete( 0, snippetBuffer.length() ); while( true ){ if( ch != -1 ) ch = readCharFromBuffer(); if( ch == -1 ) break; else if( !isDelimiter( ch ) ) snippetBuffer.append( (char)ch ); else if( snippetBuffer.length() > 0 ) break; else startOffset++; } if( snippetBuffer.length() == 0 ) return false; snippet = snippetBuffer.toString(); lenTerm = snippet.length() >= n ? n : snippet.length(); return true; } protected int readCharFromBuffer() throws IOException { if( charBufferIndex >= charBufferLen ){ charBufferLen = input.read( charBuffer ); if( charBufferLen == -1 ){ return -1; } charBufferIndex = 0; } int c = (int)charBuffer[charBufferIndex++]; nextStartOffset++; return c; } protected boolean isDelimiter( int c ){ return delimiters.indexOf( c ) >= 0; } } protected void make1d1fIndex( String value ) throws Exception { make1dmfIndex( value ); } protected void make1d1fIndexB( String value ) throws Exception { make1dmfIndexB( value ); } protected void make1dmfIndex( String... 
values ) throws Exception { make1dmfIndex( analyzerW, values ); } protected void make1dmfIndexB( String... values ) throws Exception { make1dmfIndex( analyzerB, values ); } // make 1 doc with multi valued field protected void make1dmfIndex( Analyzer analyzer, String... values ) throws Exception { IndexWriter writer = new IndexWriter( dir, analyzer, true, MaxFieldLength.LIMITED ); Document doc = new Document(); for( String value: values ) doc.add( new Field( F, value, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); writer.addDocument( doc ); writer.close(); reader = IndexReader.open( dir ); } // make 1 doc with multi valued & not analyzed field protected void make1dmfIndexNA( String... values ) throws Exception { IndexWriter writer = new IndexWriter( dir, analyzerK, true, MaxFieldLength.LIMITED ); Document doc = new Document(); for( String value: values ) doc.add( new Field( F, value, Store.YES, Index.NOT_ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); writer.addDocument( doc ); writer.close(); reader = IndexReader.open( dir, true ); } protected void makeIndexShortMV() throws Exception { // "" // "" // 012345 // "a b c" // 0 1 2 // "" // 6789 // "d e" // 3 4 make1dmfIndex( shortMVValues ); } protected void makeIndexLongMV() throws Exception { // 11111111112222222222333333333344444444445555555555666666666677777777778888888888999 // 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012 // Followings are the examples of customizable parameters and actual examples of customization: // 0 1 2 3 4 5 6 7 8 9 10 11 // 1 2 // 999999900000000001111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000111111111122 // 345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901 // The most search engines use only one of these methods. Even the search engines that says they can use the both methods basically // 12 13 (14) (15) 16 17 18 19 20 21 22 23 (24) (25) 26 27 28 29 30 31 32 33 34 make1dmfIndex( longMVValues ); } protected void makeIndexLongMVB() throws Exception { // "*" ... LF // 1111111111222222222233333333334444444444555555 // 01234567890123456789012345678901234567890123456789012345 // *Lucene/Solr does not require such additional hardware. 
// Lu 0 do 10 re 15 su 21 na 31 // uc 1 oe 11 eq 16 uc 22 al 32 // ce 2 es 12 qu 17 ch 23 ha 33 // en 3 no 13 ui 18 ad 24 ar 34 // ne 4 ot 14 ir 19 dd 25 rd 35 // e/ 5 re 20 di 26 dw 36 // /S 6 it 27 wa 37 // So 7 ti 28 ar 38 // ol 8 io 29 re 39 // lr 9 on 30 // 5555666666666677777777778888888888999999999 // 6789012345678901234567890123456789012345678 // *When you talk about processing speed, the // Wh 40 ab 48 es 56 th 65 // he 41 bo 49 ss 57 he 66 // en 42 ou 50 si 58 // yo 43 ut 51 in 59 // ou 44 pr 52 ng 60 // ta 45 ro 53 sp 61 // al 46 oc 54 pe 62 // lk 47 ce 55 ee 63 // ed 64 make1dmfIndexB( biMVValues ); } protected void makeIndexStrMV() throws Exception { // 0123 // "abc" // 34567 // "defg" // 111 // 789012 // "hijkl" make1dmfIndexNA( strMVValues ); } } ././@LongLink0000000000000000000000000000017600000000000011571 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Simpl0000644000175000017500000002017311474320245034350 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.search.Query; public class SimpleFragListBuilderTest extends AbstractTestCase { public void testNullFieldFragList() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "b c d" ), 100 ); assertEquals( 0, ffl.fragInfos.size() ); } public void testTooSmallFragSize() throws Exception { try{ SimpleFragListBuilder sflb = new SimpleFragListBuilder(); sflb.createFieldFragList( fpl( "a", "b c d" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE - 1 ); fail( "IllegalArgumentException must be thrown" ); } catch ( IllegalArgumentException expected ) { } } public void testSmallerFragSizeThanTermQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "abcdefghijklmnopqrs", "abcdefghijklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,19)", ffl.fragInfos.get( 0 ).toString() ); } public void testSmallerFragSizeThanPhraseQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "\"abcdefgh jklmnopqrs\"", "abcdefgh jklmnopqrs" ), SimpleFragListBuilder.MIN_FRAG_CHAR_SIZE ); assertEquals( 1, ffl.fragInfos.size() ); System.out.println( ffl.fragInfos.get( 0 ).toString() ); assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(0,21)", ffl.fragInfos.get( 0 ).toString() ); } public void test1TermIndex() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a" ), 100 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,100)", ffl.fragInfos.get( 0 ).toString() ); } public void test2TermsIndex1Frag() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a a" ), 100 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1))a((2,3)))/2.0(0,100)", ffl.fragInfos.get( 0 ).toString() ); ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b a" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1))a((18,19)))/2.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); ffl = sflb.createFieldFragList( fpl( "a", "b b b b a b b b b a" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((8,9))a((18,19)))/2.0(2,22)", ffl.fragInfos.get( 0 ).toString() ); } public void test2TermsIndex2Frags() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); assertEquals( "subInfos=(a((28,29)))/1.0(22,42)", ffl.fragInfos.get( 1 ).toString() ); ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); assertEquals( "subInfos=(a((26,27)))/1.0(20,40)", ffl.fragInfos.get( 1 ).toString() ); ffl = sflb.createFieldFragList( fpl( "a", "a b b b b b b b b b a" ), 20 ); assertEquals( 2, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); assertEquals( "subInfos=(a((20,21)))/1.0(20,40)", 
ffl.fragInfos.get( 1 ).toString() ); } public void test2TermsQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "a b", "c d e" ), 20 ); assertEquals( 0, ffl.fragInfos.size() ); ffl = sflb.createFieldFragList( fpl( "a b", "d b c" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(b((2,3)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); ffl = sflb.createFieldFragList( fpl( "a b", "a b c" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(a((0,1))b((2,3)))/2.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); } public void testPhraseQuery() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "\"a b\"", "c d e" ), 20 ); assertEquals( 0, ffl.fragInfos.size() ); ffl = sflb.createFieldFragList( fpl( "\"a b\"", "a c b" ), 20 ); assertEquals( 0, ffl.fragInfos.size() ); ffl = sflb.createFieldFragList( fpl( "\"a b\"", "a b c" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(ab((0,3)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); } public void testPhraseQuerySlop() throws Exception { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl( "\"a b\"~1", "a c b" ), 20 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(ab((0,1)(4,5)))/1.0(0,20)", ffl.fragInfos.get( 0 ).toString() ); } private FieldPhraseList fpl( String queryValue, String indexValue ) throws Exception { make1d1fIndex( indexValue ); Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); return new FieldPhraseList( stack, fq ); } public void test1PhraseShortMV() throws Exception { makeIndexShortMV(); FieldQuery fq = new FieldQuery( tq( "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(d((6,7)))/1.0(0,100)", ffl.fragInfos.get( 0 ).toString() ); } public void test1PhraseLongMV() throws Exception { makeIndexLongMV(); FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(searchengines((102,116))searchengines((157,171)))/2.0(96,196)", ffl.fragInfos.get( 0 ).toString() ); } public void test1PhraseLongMVB() throws Exception { makeIndexLongMVB(); FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed" FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); assertEquals( 1, ffl.fragInfos.size() ); assertEquals( "subInfos=(sppeeeed((88,93)))/1.0(82,182)", ffl.fragInfos.get( 0 ).toString() ); } } ././@LongLink0000000000000000000000000000017100000000000011564 Lustar 
rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/IndexTimeSynonymTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Index0000644000175000017500000003041611474320245034334 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import java.util.HashSet; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; public class IndexTimeSynonymTest extends AbstractTestCase { public void testFieldTermStackIndex1wSearch1term() throws Exception { makeIndex1w(); FieldQuery fq = new FieldQuery( tq( "Mac" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "Mac(11,20,3)", stack.pop().toString() ); } public void testFieldTermStackIndex1wSearch2terms() throws Exception { makeIndex1w(); BooleanQuery bq = new BooleanQuery(); bq.add( tq( "Mac" ), Occur.SHOULD ); bq.add( tq( "MacBook" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 2, stack.termList.size() ); Set expectedSet = new HashSet(); expectedSet.add( "Mac(11,20,3)" ); expectedSet.add( "MacBook(11,20,3)" ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); } public void testFieldTermStackIndex1w2wSearch1term() throws Exception { makeIndex1w2w(); FieldQuery fq = new FieldQuery( tq( "pc" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "pc(3,5,1)", stack.pop().toString() ); } public void testFieldTermStackIndex1w2wSearch1phrase() throws Exception { makeIndex1w2w(); FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 2, stack.termList.size() ); assertEquals( "personal(3,5,1)", stack.pop().toString() ); assertEquals( "computer(3,5,2)", stack.pop().toString() ); } public void testFieldTermStackIndex1w2wSearch1partial() throws Exception { makeIndex1w2w(); FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "computer(3,5,2)", stack.pop().toString() ); } public void testFieldTermStackIndex1w2wSearch1term1phrase() throws Exception { makeIndex1w2w(); BooleanQuery 
bq = new BooleanQuery(); bq.add( tq( "pc" ), Occur.SHOULD ); bq.add( pqF( "personal", "computer" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 3, stack.termList.size() ); Set expectedSet = new HashSet(); expectedSet.add( "pc(3,5,1)" ); expectedSet.add( "personal(3,5,1)" ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); assertEquals( "computer(3,5,2)", stack.pop().toString() ); } public void testFieldTermStackIndex2w1wSearch1term() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( tq( "pc" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "pc(3,20,1)", stack.pop().toString() ); } public void testFieldTermStackIndex2w1wSearch1phrase() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 2, stack.termList.size() ); assertEquals( "personal(3,20,1)", stack.pop().toString() ); assertEquals( "computer(3,20,2)", stack.pop().toString() ); } public void testFieldTermStackIndex2w1wSearch1partial() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "computer(3,20,2)", stack.pop().toString() ); } public void testFieldTermStackIndex2w1wSearch1term1phrase() throws Exception { makeIndex2w1w(); BooleanQuery bq = new BooleanQuery(); bq.add( tq( "pc" ), Occur.SHOULD ); bq.add( pqF( "personal", "computer" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 3, stack.termList.size() ); Set expectedSet = new HashSet(); expectedSet.add( "pc(3,20,1)" ); expectedSet.add( "personal(3,20,1)" ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); assertTrue( expectedSet.contains( stack.pop().toString() ) ); assertEquals( "computer(3,20,2)", stack.pop().toString() ); } public void testFieldPhraseListIndex1w2wSearch1phrase() throws Exception { makeIndex1w2w(); FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "personalcomputer(1.0)((3,5))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex1w2wSearch1partial() throws Exception { makeIndex1w2w(); FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "computer(1.0)((3,5))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex1w2wSearch1term1phrase() throws Exception { makeIndex1w2w(); BooleanQuery bq = new BooleanQuery(); bq.add( tq( "pc" ), Occur.SHOULD ); bq.add( pqF( "personal", "computer" ), Occur.SHOULD ); FieldQuery fq = new 
FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertTrue( fpl.phraseList.get( 0 ).toString().indexOf( "(1.0)((3,5))" ) > 0 ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 5, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1term() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( tq( "pc" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "pc(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1phrase() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( pqF( "personal", "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "personalcomputer(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1partial() throws Exception { makeIndex2w1w(); FieldQuery fq = new FieldQuery( tq( "computer" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "computer(1.0)((3,20))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); } public void testFieldPhraseListIndex2w1wSearch1term1phrase() throws Exception { makeIndex2w1w(); BooleanQuery bq = new BooleanQuery(); bq.add( tq( "pc" ), Occur.SHOULD ); bq.add( pqF( "personal", "computer" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( bq, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertTrue( fpl.phraseList.get( 0 ).toString().indexOf( "(1.0)((3,20))" ) > 0 ); assertEquals( 3, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 20, fpl.phraseList.get( 0 ).getEndOffset() ); } private void makeIndex1w() throws Exception { // 11111111112 // 012345678901234567890 // I'll buy a Macintosh // Mac // MacBook // 0 1 2 3 makeSynonymIndex( "I'll buy a Macintosh", t("I'll",0,4), t("buy",5,8), t("a",9,10), t("Macintosh",11,20),t("Mac",11,20,0),t("MacBook",11,20,0)); } private void makeIndex1w2w() throws Exception { // 1111111 // 01234567890123456 // My pc was broken // personal computer // 0 1 2 3 makeSynonymIndex( "My pc was broken", t("My",0,2), t("pc",3,5),t("personal",3,5,0),t("computer",3,5), t("was",6,9), t("broken",10,16)); } private void makeIndex2w1w() throws Exception { // 1111111111222222222233 // 01234567890123456789012345678901 // My personal computer was broken // pc // 0 1 2 3 4 makeSynonymIndex( "My personal computer was broken", t("My",0,2), t("personal",3,20),t("pc",3,20,0),t("computer",3,20), t("was",21,24), t("broken",25,31)); } void makeSynonymIndex( String value, Token... 
tokens ) throws Exception { Analyzer analyzer = new TokenArrayAnalyzer( tokens ); make1dmfIndex( analyzer, value ); } public static Token t( String text, int startOffset, int endOffset ){ return t( text, startOffset, endOffset, 1 ); } public static Token t( String text, int startOffset, int endOffset, int positionIncrement ){ Token token = new Token( text, startOffset, endOffset ); token.setPositionIncrement( positionIncrement ); return token; } public static class TokenArrayAnalyzer extends Analyzer { Token[] tokens; public TokenArrayAnalyzer( Token... tokens ){ this.tokens = tokens; } public TokenStream tokenStream(String fieldName, Reader reader) { final Token reusableToken = new Token(); TokenStream.setOnlyUseNewAPI(true); TokenStream ts = new TokenStream(){ int p = 0; public boolean incrementToken() throws IOException { if( p >= tokens.length ) return false; tokens[p++].copyTo(reusableToken); return true; } }; ts.addAttributeImpl(reusableToken); return ts; } } } ././@LongLink0000000000000000000000000000017000000000000011563 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldPhraseListTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Field0000644000175000017500000001771711474320245034321 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; public class FieldPhraseListTest extends AbstractTestCase { public void test1TermIndex() throws Exception { make1d1fIndex( "a" ); FieldQuery fq = new FieldQuery( tq( "a" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); fq = new FieldQuery( tq( "b" ), true, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); assertEquals( 0, fpl.phraseList.size() ); } public void test2TermsIndex() throws Exception { make1d1fIndex( "a a" ); FieldQuery fq = new FieldQuery( tq( "a" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 2, fpl.phraseList.size() ); assertEquals( "a(1.0)((0,1))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "a(1.0)((2,3))", fpl.phraseList.get( 1 ).toString() ); } public void test1PhraseIndex() throws Exception { make1d1fIndex( "a b" ); FieldQuery fq = new FieldQuery( pqF( "a", "b" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "ab(1.0)((0,3))", fpl.phraseList.get( 0 ).toString() ); fq = new FieldQuery( tq( "b" ), true, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "b(1.0)((2,3))", fpl.phraseList.get( 0 ).toString() ); } public void test1PhraseIndexB() throws Exception { // 01 12 23 34 45 56 67 78 (offsets) // bb|bb|ba|ac|cb|ba|ab|bc // 0 1 2 3 4 5 6 7 (positions) make1d1fIndexB( "bbbacbabc" ); FieldQuery fq = new FieldQuery( pqF( "ba", "ac" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "baac(1.0)((2,5))", fpl.phraseList.get( 0 ).toString() ); } public void test2ConcatTermsIndexB() throws Exception { // 01 12 23 (offsets) // ab|ba|ab // 0 1 2 (positions) make1d1fIndexB( "abab" ); FieldQuery fq = new FieldQuery( tq( "ab" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 2, fpl.phraseList.size() ); assertEquals( "ab(1.0)((0,2))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "ab(1.0)((2,4))", fpl.phraseList.get( 1 ).toString() ); } public void test2Terms1PhraseIndex() throws Exception { make1d1fIndex( "c a a b" ); // phraseHighlight = true FieldQuery fq = new FieldQuery( pqF( "a", "b" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "ab(1.0)((4,7))", fpl.phraseList.get( 0 ).toString() ); // phraseHighlight = false fq = new FieldQuery( pqF( "a", "b" ), false, true ); stack = new FieldTermStack( reader, 0, F, fq ); fpl = new FieldPhraseList( stack, fq ); assertEquals( 2, fpl.phraseList.size() ); assertEquals( "a(1.0)((2,3))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "ab(1.0)((4,7))", fpl.phraseList.get( 1 ).toString() ); } public void testPhraseSlop() throws Exception { make1d1fIndex( "c a a b c" ); 
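// Index text "c a a b c" -> term offsets c(0,1) a(2,3) a(4,5) b(6,7) c(8,9), positions 0..4.
// The helper call that follows, pqF( 2F, 1, "a", "c" ), is assumed (from its use here and the
// assertions below) to build a PhraseQuery on the test field F with boost 2.0 and slop 1, so
// "a" at position 2 and "c" at position 4 can match one extra position apart. The expected
// string "ac(2.0)((4,5)(8,9))" then reads: concatenated matched terms, boost, and the matched
// term offset ranges; getStartOffset()/getEndOffset() span the whole match, 4..9.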
FieldQuery fq = new FieldQuery( pqF( 2F, 1, "a", "c" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "ac(2.0)((4,5)(8,9))", fpl.phraseList.get( 0 ).toString() ); assertEquals( 4, fpl.phraseList.get( 0 ).getStartOffset() ); assertEquals( 9, fpl.phraseList.get( 0 ).getEndOffset() ); } public void test2PhrasesOverlap() throws Exception { make1d1fIndex( "d a b c d" ); BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "b" ), Occur.SHOULD ); query.add( pqF( "b", "c" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "abc(1.0)((2,7))", fpl.phraseList.get( 0 ).toString() ); } public void test3TermsPhrase() throws Exception { make1d1fIndex( "d a b a b c d" ); FieldQuery fq = new FieldQuery( pqF( "a", "b", "c" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "abc(1.0)((6,11))", fpl.phraseList.get( 0 ).toString() ); } public void testSearchLongestPhrase() throws Exception { make1d1fIndex( "d a b d c a b c" ); BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "b" ), Occur.SHOULD ); query.add( pqF( "a", "b", "c" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 2, fpl.phraseList.size() ); assertEquals( "ab(1.0)((2,5))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "abc(1.0)((10,15))", fpl.phraseList.get( 1 ).toString() ); } public void test1PhraseShortMV() throws Exception { makeIndexShortMV(); FieldQuery fq = new FieldQuery( tq( "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "d(1.0)((6,7))", fpl.phraseList.get( 0 ).toString() ); } public void test1PhraseLongMV() throws Exception { makeIndexLongMV(); FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 2, fpl.phraseList.size() ); assertEquals( "searchengines(1.0)((102,116))", fpl.phraseList.get( 0 ).toString() ); assertEquals( "searchengines(1.0)((157,171))", fpl.phraseList.get( 1 ).toString() ); } public void test1PhraseLongMVB() throws Exception { makeIndexLongMVB(); FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed" FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); assertEquals( 1, fpl.phraseList.size() ); assertEquals( "sppeeeed(1.0)((88,93))", fpl.phraseList.get( 0 ).toString() ); } } ././@LongLink0000000000000000000000000000016700000000000011571 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldTermStackTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Field0000644000175000017500000001445111474320245034311 0ustar janpascaljanpascalpackage 
org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause.Occur; public class FieldTermStackTest extends AbstractTestCase { public void test1Term() throws Exception { makeIndex(); FieldQuery fq = new FieldQuery( tq( "a" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 6, stack.termList.size() ); assertEquals( "a(0,1,0)", stack.pop().toString() ); assertEquals( "a(2,3,1)", stack.pop().toString() ); assertEquals( "a(4,5,2)", stack.pop().toString() ); assertEquals( "a(12,13,6)", stack.pop().toString() ); assertEquals( "a(28,29,14)", stack.pop().toString() ); assertEquals( "a(32,33,16)", stack.pop().toString() ); } public void test2Terms() throws Exception { makeIndex(); BooleanQuery query = new BooleanQuery(); query.add( tq( "b" ), Occur.SHOULD ); query.add( tq( "c" ), Occur.SHOULD ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 8, stack.termList.size() ); assertEquals( "b(6,7,3)", stack.pop().toString() ); assertEquals( "b(8,9,4)", stack.pop().toString() ); assertEquals( "c(10,11,5)", stack.pop().toString() ); assertEquals( "b(14,15,7)", stack.pop().toString() ); assertEquals( "b(16,17,8)", stack.pop().toString() ); assertEquals( "c(18,19,9)", stack.pop().toString() ); assertEquals( "b(26,27,13)", stack.pop().toString() ); assertEquals( "b(30,31,15)", stack.pop().toString() ); } public void test1Phrase() throws Exception { makeIndex(); FieldQuery fq = new FieldQuery( pqF( "c", "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 3, stack.termList.size() ); assertEquals( "c(10,11,5)", stack.pop().toString() ); assertEquals( "c(18,19,9)", stack.pop().toString() ); assertEquals( "d(20,21,10)", stack.pop().toString() ); } private void makeIndex() throws Exception { // 111111111122222 // 0123456789012345678901234 (offsets) // a a a b b c a b b c d e f // 0 1 2 3 4 5 6 7 8 9101112 (position) String value1 = "a a a b b c a b b c d e f"; // 222233333 // 678901234 (offsets) // b a b a f //1314151617 (position) String value2 = "b a b a f"; make1dmfIndex( value1, value2 ); } public void test1TermB() throws Exception { makeIndexB(); FieldQuery fq = new FieldQuery( tq( "ab" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 2, stack.termList.size() ); assertEquals( "ab(2,4,2)", stack.pop().toString() ); assertEquals( "ab(6,8,6)", stack.pop().toString() ); } public void test2TermsB() throws Exception { makeIndexB(); BooleanQuery query = new BooleanQuery(); query.add( tq( "bc" ), Occur.SHOULD ); query.add( tq( "ef" ), Occur.SHOULD ); FieldQuery fq = 
new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 3, stack.termList.size() ); assertEquals( "bc(4,6,4)", stack.pop().toString() ); assertEquals( "bc(8,10,8)", stack.pop().toString() ); assertEquals( "ef(11,13,11)", stack.pop().toString() ); } public void test1PhraseB() throws Exception { makeIndexB(); FieldQuery fq = new FieldQuery( pqF( "ab", "bb" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 4, stack.termList.size() ); assertEquals( "ab(2,4,2)", stack.pop().toString() ); assertEquals( "bb(3,5,3)", stack.pop().toString() ); assertEquals( "ab(6,8,6)", stack.pop().toString() ); assertEquals( "bb(7,9,7)", stack.pop().toString() ); } private void makeIndexB() throws Exception { // 1 11 11 // 01 12 23 34 45 56 67 78 89 90 01 12 (offsets) // aa|aa|ab|bb|bc|ca|ab|bb|bc|cd|de|ef // 0 1 2 3 4 5 6 7 8 9 10 11 (position) String value = "aaabbcabbcdef"; make1dmfIndexB( value ); } public void test1PhraseShortMV() throws Exception { makeIndexShortMV(); FieldQuery fq = new FieldQuery( tq( "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 1, stack.termList.size() ); assertEquals( "d(6,7,3)", stack.pop().toString() ); } public void test1PhraseLongMV() throws Exception { makeIndexLongMV(); FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 4, stack.termList.size() ); assertEquals( "search(102,108,14)", stack.pop().toString() ); assertEquals( "engines(109,116,15)", stack.pop().toString() ); assertEquals( "search(157,163,24)", stack.pop().toString() ); assertEquals( "engines(164,171,25)", stack.pop().toString() ); } public void test1PhraseMVB() throws Exception { makeIndexLongMVB(); FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed" FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); assertEquals( 4, stack.termList.size() ); assertEquals( "sp(88,90,61)", stack.pop().toString() ); assertEquals( "pe(89,91,62)", stack.pop().toString() ); assertEquals( "ee(90,92,63)", stack.pop().toString() ); assertEquals( "ed(91,93,64)", stack.pop().toString() ); } } ././@LongLink0000000000000000000000000000017700000000000011572 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Simpl0000644000175000017500000001436211474320245034353 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Index; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.TermVector; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.search.Query; public class SimpleFragmentsBuilderTest extends AbstractTestCase { public void test1TermIndex() throws Exception { FieldFragList ffl = ffl( "a", "a" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( "a", sfb.createFragment( reader, 0, F, ffl ) ); // change tags sfb = new SimpleFragmentsBuilder( new String[]{ "[" }, new String[]{ "]" } ); assertEquals( "[a]", sfb.createFragment( reader, 0, F, ffl ) ); } public void test2Frags() throws Exception { FieldFragList ffl = ffl( "a", "a b b b b b b b b b b b a b a b" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); // 3 snippets requested, but should be 2 assertEquals( 2, f.length ); assertEquals( "a b b b b b b b b b ", f[0] ); assertEquals( "b b a b a b", f[1] ); } public void test3Frags() throws Exception { FieldFragList ffl = ffl( "a c", "a b b b b b b b b b b b a b a b b b b b c a a b b" ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); assertEquals( "a b b b b b b b b b ", f[0] ); assertEquals( "b b a b a b b b b b ", f[1] ); assertEquals( "c a a b b", f[2] ); } private FieldFragList ffl( String queryValue, String indexValue ) throws Exception { make1d1fIndex( indexValue ); Query query = paW.parse( queryValue ); FieldQuery fq = new FieldQuery( query, true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); return new SimpleFragListBuilder().createFieldFragList( fpl, 20 ); } public void test1PhraseShortMV() throws Exception { makeIndexShortMV(); FieldQuery fq = new FieldQuery( tq( "d" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( "a b c d e", sfb.createFragment( reader, 0, F, ffl ) ); } public void test1PhraseLongMV() throws Exception { makeIndexLongMV(); FieldQuery fq = new FieldQuery( pqF( "search", "engines" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( " most search engines use only one of these methods. 
Even the search engines that says they can use t", sfb.createFragment( reader, 0, F, ffl ) ); } public void test1PhraseLongMVB() throws Exception { makeIndexLongMVB(); FieldQuery fq = new FieldQuery( pqF( "sp", "pe", "ee", "ed" ), true, true ); // "speed" -(2gram)-> "sp","pe","ee","ed" FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( "ssing speed, the", sfb.createFragment( reader, 0, F, ffl ) ); } public void testUnstoredField() throws Exception { makeUnstoredIndex(); FieldQuery fq = new FieldQuery( tq( "aaa" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertNull( sfb.createFragment( reader, 0, F, ffl ) ); } protected void makeUnstoredIndex() throws Exception { IndexWriter writer = new IndexWriter( dir, analyzerW, true, MaxFieldLength.LIMITED ); Document doc = new Document(); doc.add( new Field( F, "aaa", Store.NO, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS ) ); writer.addDocument( doc ); writer.close(); reader = IndexReader.open( dir ); } public void test1StrMV() throws Exception { makeIndexStrMV(); FieldQuery fq = new FieldQuery( tq( "defg" ), true, true ); FieldTermStack stack = new FieldTermStack( reader, 0, F, fq ); FieldPhraseList fpl = new FieldPhraseList( stack, fq ); SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); assertEquals( "abcdefghijkl", sfb.createFragment( reader, 0, F, ffl ) ); } } ././@LongLink0000000000000000000000000000016300000000000011565 Lustar rootrootlucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/FieldQueryTest.javalucene-2.9.4/contrib/fast-vector-highlighter/src/test/org/apache/lucene/search/vectorhighlight/Field0000644000175000017500000007206011474320245034311 0ustar janpascaljanpascalpackage org.apache.lucene.search.vectorhighlight; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap; import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo; public class FieldQueryTest extends AbstractTestCase { public void testFlattenBoolean() throws Exception { Query query = paW.parse( "A AND B OR C NOT (D AND E)" ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); fq.flatten( query, flatQueries ); assertCollectionQueries( flatQueries, tq( "A" ), tq( "B" ), tq( "C" ) ); } public void testFlattenTermAndPhrase() throws Exception { Query query = paW.parse( "A AND \"B C\"" ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); fq.flatten( query, flatQueries ); assertCollectionQueries( flatQueries, tq( "A" ), pqF( "B", "C" ) ); } public void testFlattenTermAndPhrase2gram() throws Exception { Query query = paB.parse( "AA AND BCD OR EFGH" ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); fq.flatten( query, flatQueries ); assertCollectionQueries( flatQueries, tq( "AA" ), pqF( "BC", "CD" ), pqF( "EF", "FG", "GH" ) ); } public void testFlatten1TermPhrase() throws Exception { Query query = pqF( "A" ); FieldQuery fq = new FieldQuery( query, true, true ); Set flatQueries = new HashSet(); fq.flatten( query, flatQueries ); assertCollectionQueries( flatQueries, tq( "A" ) ); } public void testExpand() throws Exception { Query dummy = pqF( "DUMMY" ); FieldQuery fq = new FieldQuery( dummy, true, true ); // "a b","b c" => "a b","b c","a b c" Set flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "b", "c" ), pqF( "a", "b", "c" ) ); // "a b","b c d" => "a b","b c d","a b c d" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "b", "c", "d" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "b", "c", "d" ), pqF( "a", "b", "c", "d" ) ); // "a b c","b c d" => "a b c","b c d","a b c d" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "c" ) ); flatQueries.add( pqF( "b", "c", "d" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "c" ), pqF( "b", "c", "d" ), pqF( "a", "b", "c", "d" ) ); // "a b c","c d e" => "a b c","c d e","a b c d e" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "c" ) ); flatQueries.add( pqF( "c", "d", "e" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "c" ), pqF( "c", "d", "e" ), pqF( "a", "b", "c", "d", "e" ) ); // "a b c d","b c" => "a b c d","b c" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "c", "d" ) ); flatQueries.add( pqF( "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "c", "d" ), pqF( "b", "c" ) ); // "a b b","b c" => "a b b","b c","a b b c" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "b" ) ); flatQueries.add( pqF( "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "b" ), pqF( "b", "c" ), pqF( "a", "b", "b", "c" ) ); // "a b","b a" => "a b","b a","a b a", "b a b" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "b", "a" ) ); assertCollectionQueries( 
fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "b", "a" ), pqF( "a", "b", "a" ), pqF( "b", "a", "b" ) ); // "a b","a b c" => "a b","a b c" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "a", "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "a", "b", "c" ) ); } public void testNoExpand() throws Exception { Query dummy = pqF( "DUMMY" ); FieldQuery fq = new FieldQuery( dummy, true, true ); // "a b","c d" => "a b","c d" Set flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "c", "d" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "c", "d" ) ); // "a","a b" => "a", "a b" flatQueries = new HashSet(); flatQueries.add( tq( "a" ) ); flatQueries.add( pqF( "a", "b" ) ); assertCollectionQueries( fq.expand( flatQueries ), tq( "a" ), pqF( "a", "b" ) ); // "a b","b" => "a b", "b" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( tq( "b" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), tq( "b" ) ); // "a b c","b c" => "a b c","b c" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "c" ) ); flatQueries.add( pqF( "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "c" ), pqF( "b", "c" ) ); // "a b","a b c" => "a b","a b c" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b" ) ); flatQueries.add( pqF( "a", "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b" ), pqF( "a", "b", "c" ) ); // "a b c","b d e" => "a b c","b d e" flatQueries = new HashSet(); flatQueries.add( pqF( "a", "b", "c" ) ); flatQueries.add( pqF( "b", "d", "e" ) ); assertCollectionQueries( fq.expand( flatQueries ), pqF( "a", "b", "c" ), pqF( "b", "d", "e" ) ); } public void testExpandNotFieldMatch() throws Exception { Query dummy = pqF( "DUMMY" ); FieldQuery fq = new FieldQuery( dummy, true, false ); // f1:"a b",f2:"b c" => f1:"a b",f2:"b c",f1:"a b c" Set flatQueries = new HashSet(); flatQueries.add( pq( F1, "a", "b" ) ); flatQueries.add( pq( F2, "b", "c" ) ); assertCollectionQueries( fq.expand( flatQueries ), pq( F1, "a", "b" ), pq( F2, "b", "c" ), pq( F1, "a", "b", "c" ) ); } public void testGetFieldTermMap() throws Exception { Query query = tq( "a" ); FieldQuery fq = new FieldQuery( query, true, true ); QueryPhraseMap pqm = fq.getFieldTermMap( F, "a" ); assertNotNull( pqm ); assertTrue( pqm.isTerminal() ); pqm = fq.getFieldTermMap( F, "b" ); assertNull( pqm ); pqm = fq.getFieldTermMap( F1, "a" ); assertNull( pqm ); } public void testGetRootMap() throws Exception { Query dummy = pqF( "DUMMY" ); FieldQuery fq = new FieldQuery( dummy, true, true ); QueryPhraseMap rootMap1 = fq.getRootMap( tq( "a" ) ); QueryPhraseMap rootMap2 = fq.getRootMap( tq( "a" ) ); assertTrue( rootMap1 == rootMap2 ); QueryPhraseMap rootMap3 = fq.getRootMap( tq( "b" ) ); assertTrue( rootMap1 == rootMap3 ); QueryPhraseMap rootMap4 = fq.getRootMap( tq( F1, "b" ) ); assertFalse( rootMap4 == rootMap3 ); } public void testGetRootMapNotFieldMatch() throws Exception { Query dummy = pqF( "DUMMY" ); FieldQuery fq = new FieldQuery( dummy, true, false ); QueryPhraseMap rootMap1 = fq.getRootMap( tq( "a" ) ); QueryPhraseMap rootMap2 = fq.getRootMap( tq( "a" ) ); assertTrue( rootMap1 == rootMap2 ); QueryPhraseMap rootMap3 = fq.getRootMap( tq( "b" ) ); assertTrue( rootMap1 == rootMap3 ); QueryPhraseMap rootMap4 = fq.getRootMap( tq( F1, "b" ) ); assertTrue( rootMap4 == rootMap3 ); } public void testGetTermSet() 
throws Exception { Query query = paW.parse( "A AND B OR x:C NOT (D AND E)" ); FieldQuery fq = new FieldQuery( query, true, true ); assertEquals( 2, fq.termSetMap.size() ); Set termSet = fq.getTermSet( F ); assertEquals( 2, termSet.size() ); assertTrue( termSet.contains( "A" ) ); assertTrue( termSet.contains( "B" ) ); termSet = fq.getTermSet( "x" ); assertEquals( 1, termSet.size() ); assertTrue( termSet.contains( "C" ) ); termSet = fq.getTermSet( "y" ); assertNull( termSet ); } public void testQueryPhraseMap1Term() throws Exception { Query query = tq( "a" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); assertTrue( qpm.subMap.get( "a" ) != null ); assertTrue( qpm.subMap.get( "a" ).terminal ); assertEquals( 1F, qpm.subMap.get( "a" ).boost ); // phraseHighlight = true, fieldMatch = false fq = new FieldQuery( query, true, false ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( F ) ); assertNotNull( map.get( null ) ); qpm = map.get( null ); assertEquals( 1, qpm.subMap.size() ); assertTrue( qpm.subMap.get( "a" ) != null ); assertTrue( qpm.subMap.get( "a" ).terminal ); assertEquals( 1F, qpm.subMap.get( "a" ).boost ); // phraseHighlight = false, fieldMatch = true fq = new FieldQuery( query, false, true ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); assertTrue( qpm.subMap.get( "a" ) != null ); assertTrue( qpm.subMap.get( "a" ).terminal ); assertEquals( 1F, qpm.subMap.get( "a" ).boost ); // phraseHighlight = false, fieldMatch = false fq = new FieldQuery( query, false, false ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( F ) ); assertNotNull( map.get( null ) ); qpm = map.get( null ); assertEquals( 1, qpm.subMap.size() ); assertTrue( qpm.subMap.get( "a" ) != null ); assertTrue( qpm.subMap.get( "a" ).terminal ); assertEquals( 1F, qpm.subMap.get( "a" ).boost ); // boost != 1 query = tq( 2, "a" ); fq = new FieldQuery( query, true, true ); map = fq.rootMaps; qpm = map.get( F ); assertEquals( 2F, qpm.subMap.get( "a" ).boost ); } public void testQueryPhraseMap1Phrase() throws Exception { Query query = pqF( "a", "b" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // phraseHighlight = true, fieldMatch = false fq = new FieldQuery( query, true, false ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( F ) ); assertNotNull( map.get( null ) ); qpm = map.get( null ); assertEquals( 1, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "a" ) ); qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); 
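// Reading these assertions: FieldQuery.rootMaps is a per-field trie of QueryPhraseMap nodes.
// With fieldMatch=true the root map is keyed by the field name; with fieldMatch=false all
// queries collapse under the null key. Each subMap level holds the next term of a phrase, and
// a node is marked terminal (carrying that query's boost) where a complete flattened query ends.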
assertEquals( 1F, qpm3.boost ); // phraseHighlight = false, fieldMatch = true fq = new FieldQuery( query, false, true ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); qpm = map.get( F ); assertEquals( 2, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "a" ) ); qpm2 = qpm.subMap.get( "a" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); assertNotNull( qpm.subMap.get( "b" ) ); qpm2 = qpm.subMap.get( "b" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); // phraseHighlight = false, fieldMatch = false fq = new FieldQuery( query, false, false ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( F ) ); assertNotNull( map.get( null ) ); qpm = map.get( null ); assertEquals( 2, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "a" ) ); qpm2 = qpm.subMap.get( "a" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); assertNotNull( qpm.subMap.get( "b" ) ); qpm2 = qpm.subMap.get( "b" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); // boost != 1 query = pqF( 2, "a", "b" ); // phraseHighlight = false, fieldMatch = false fq = new FieldQuery( query, false, false ); map = fq.rootMaps; qpm = map.get( null ); qpm2 = qpm.subMap.get( "a" ); assertEquals( 2F, qpm2.boost ); qpm3 = qpm2.subMap.get( "b" ); assertEquals( 2F, qpm3.boost ); qpm2 = qpm.subMap.get( "b" ); assertEquals( 2F, qpm2.boost ); } public void testQueryPhraseMap1PhraseAnother() throws Exception { Query query = pqF( "search", "engines" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "search" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "search" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "engines" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "engines" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); } public void testQueryPhraseMap2Phrases() throws Exception { BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "b" ), Occur.SHOULD ); query.add( pqF( 2, "c", "d" ), Occur.SHOULD ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 2, qpm.subMap.size() ); // "a b" assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "c d"^2 assertNotNull( qpm.subMap.get( "c" ) ); qpm2 = qpm.subMap.get( "c" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "d" ) ); qpm3 = qpm2.subMap.get( "d" ); assertTrue( qpm3.terminal ); 
assertEquals( 2F, qpm3.boost ); } public void testQueryPhraseMap2PhrasesFields() throws Exception { BooleanQuery query = new BooleanQuery(); query.add( pq( F1, "a", "b" ), Occur.SHOULD ); query.add( pq( 2F, F2, "c", "d" ), Occur.SHOULD ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 2, map.size() ); assertNull( map.get( null ) ); // "a b" assertNotNull( map.get( F1 ) ); QueryPhraseMap qpm = map.get( F1 ); assertEquals( 1, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "c d"^2 assertNotNull( map.get( F2 ) ); qpm = map.get( F2 ); assertEquals( 1, qpm.subMap.size() ); assertNotNull( qpm.subMap.get( "c" ) ); qpm2 = qpm.subMap.get( "c" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "d" ) ); qpm3 = qpm2.subMap.get( "d" ); assertTrue( qpm3.terminal ); assertEquals( 2F, qpm3.boost ); // phraseHighlight = true, fieldMatch = false fq = new FieldQuery( query, true, false ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( F1 ) ); assertNull( map.get( F2 ) ); assertNotNull( map.get( null ) ); qpm = map.get( null ); assertEquals( 2, qpm.subMap.size() ); // "a b" assertNotNull( qpm.subMap.get( "a" ) ); qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "c d"^2 assertNotNull( qpm.subMap.get( "c" ) ); qpm2 = qpm.subMap.get( "c" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "d" ) ); qpm3 = qpm2.subMap.get( "d" ); assertTrue( qpm3.terminal ); assertEquals( 2F, qpm3.boost ); } /* * ...terminal * * a-b-c- * +-d- * b-c-d- * +-d- */ public void testQueryPhraseMapOverlapPhrases() throws Exception { BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "b", "c" ), Occur.SHOULD ); query.add( pqF( 2, "b", "c", "d" ), Occur.SHOULD ); query.add( pqF( 3, "b", "d" ), Occur.SHOULD ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 2, qpm.subMap.size() ); // "a b c" assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "b" ); assertFalse( qpm3.terminal ); assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "c" ) ); QueryPhraseMap qpm4 = qpm3.subMap.get( "c" ); assertTrue( qpm4.terminal ); assertEquals( 1F, qpm4.boost ); assertNotNull( qpm4.subMap.get( "d" ) ); QueryPhraseMap qpm5 = qpm4.subMap.get( "d" ); assertTrue( qpm5.terminal ); assertEquals( 1F, qpm5.boost ); // "b c d"^2, "b d"^3 assertNotNull( qpm.subMap.get( "b" ) ); qpm2 = qpm.subMap.get( "b" ); assertFalse( qpm2.terminal ); assertEquals( 2, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "c" ) ); qpm3 = qpm2.subMap.get( "c" ); assertFalse( qpm3.terminal ); 
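// The checks below walk the shared "b" branch of the trie: "b c d"^2 and "b d"^3 reuse the same
// "b" root node, and each phrase records its boost only on its own terminal node ("d" under "c"
// carries 2.0, "d" directly under "b" carries 3.0), so overlapping phrases remain distinguishable.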
assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "d" ) ); qpm4 = qpm3.subMap.get( "d" ); assertTrue( qpm4.terminal ); assertEquals( 2F, qpm4.boost ); assertNotNull( qpm2.subMap.get( "d" ) ); qpm3 = qpm2.subMap.get( "d" ); assertTrue( qpm3.terminal ); assertEquals( 3F, qpm3.boost ); } /* * ...terminal * * a-b- * +-c- */ public void testQueryPhraseMapOverlapPhrases2() throws Exception { BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "b" ), Occur.SHOULD ); query.add( pqF( 2, "a", "b", "c" ), Occur.SHOULD ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); // "a b" assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "b" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "b" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "a b c"^2 assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "c" ) ); QueryPhraseMap qpm4 = qpm3.subMap.get( "c" ); assertTrue( qpm4.terminal ); assertEquals( 2F, qpm4.boost ); } /* * ...terminal * * a-a-a- * +-a- * +-a- * +-a- */ public void testQueryPhraseMapOverlapPhrases3() throws Exception { BooleanQuery query = new BooleanQuery(); query.add( pqF( "a", "a", "a", "a" ), Occur.SHOULD ); query.add( pqF( 2, "a", "a", "a" ), Occur.SHOULD ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 1, qpm.subMap.size() ); // "a a a" assertNotNull( qpm.subMap.get( "a" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "a" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "a" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "a" ); assertFalse( qpm3.terminal ); assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "a" ) ); QueryPhraseMap qpm4 = qpm3.subMap.get( "a" ); assertTrue( qpm4.terminal ); // "a a a a" assertEquals( 1, qpm4.subMap.size() ); assertNotNull( qpm4.subMap.get( "a" ) ); QueryPhraseMap qpm5 = qpm4.subMap.get( "a" ); assertTrue( qpm5.terminal ); // "a a a a a" assertEquals( 1, qpm5.subMap.size() ); assertNotNull( qpm5.subMap.get( "a" ) ); QueryPhraseMap qpm6 = qpm5.subMap.get( "a" ); assertTrue( qpm6.terminal ); // "a a a a a a" assertEquals( 1, qpm6.subMap.size() ); assertNotNull( qpm6.subMap.get( "a" ) ); QueryPhraseMap qpm7 = qpm6.subMap.get( "a" ); assertTrue( qpm7.terminal ); } public void testQueryPhraseMapOverlap2gram() throws Exception { Query query = paB.parse( "abc AND bcd" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); Map map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); QueryPhraseMap qpm = map.get( F ); assertEquals( 2, qpm.subMap.size() ); // "ab bc" assertNotNull( qpm.subMap.get( "ab" ) ); QueryPhraseMap qpm2 = qpm.subMap.get( "ab" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "bc" ) ); QueryPhraseMap qpm3 = qpm2.subMap.get( "bc" ); assertTrue( qpm3.terminal ); assertEquals( 1F, 
qpm3.boost ); // "ab bc cd" assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "cd" ) ); QueryPhraseMap qpm4 = qpm3.subMap.get( "cd" ); assertTrue( qpm4.terminal ); assertEquals( 1F, qpm4.boost ); // "bc cd" assertNotNull( qpm.subMap.get( "bc" ) ); qpm2 = qpm.subMap.get( "bc" ); assertFalse( qpm2.terminal ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "cd" ) ); qpm3 = qpm2.subMap.get( "cd" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // phraseHighlight = false, fieldMatch = true fq = new FieldQuery( query, false, true ); map = fq.rootMaps; assertEquals( 1, map.size() ); assertNull( map.get( null ) ); assertNotNull( map.get( F ) ); qpm = map.get( F ); assertEquals( 3, qpm.subMap.size() ); // "ab bc" assertNotNull( qpm.subMap.get( "ab" ) ); qpm2 = qpm.subMap.get( "ab" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "bc" ) ); qpm3 = qpm2.subMap.get( "bc" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "ab bc cd" assertEquals( 1, qpm3.subMap.size() ); assertNotNull( qpm3.subMap.get( "cd" ) ); qpm4 = qpm3.subMap.get( "cd" ); assertTrue( qpm4.terminal ); assertEquals( 1F, qpm4.boost ); // "bc cd" assertNotNull( qpm.subMap.get( "bc" ) ); qpm2 = qpm.subMap.get( "bc" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); assertEquals( 1, qpm2.subMap.size() ); assertNotNull( qpm2.subMap.get( "cd" ) ); qpm3 = qpm2.subMap.get( "cd" ); assertTrue( qpm3.terminal ); assertEquals( 1F, qpm3.boost ); // "cd" assertNotNull( qpm.subMap.get( "cd" ) ); qpm2 = qpm.subMap.get( "cd" ); assertTrue( qpm2.terminal ); assertEquals( 1F, qpm2.boost ); assertEquals( 0, qpm2.subMap.size() ); } public void testSearchPhrase() throws Exception { Query query = pqF( "a", "b", "c" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); // "a" List phraseCandidate = new ArrayList(); phraseCandidate.add( new TermInfo( "a", 0, 1, 0 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b" phraseCandidate.add( new TermInfo( "b", 2, 3, 1 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b c" phraseCandidate.add( new TermInfo( "c", 4, 5, 2 ) ); assertNotNull( fq.searchPhrase( F, phraseCandidate ) ); assertNull( fq.searchPhrase( "x", phraseCandidate ) ); // phraseHighlight = true, fieldMatch = false fq = new FieldQuery( query, true, false ); // "a b c" assertNotNull( fq.searchPhrase( F, phraseCandidate ) ); assertNotNull( fq.searchPhrase( "x", phraseCandidate ) ); // phraseHighlight = false, fieldMatch = true fq = new FieldQuery( query, false, true ); // "a" phraseCandidate.clear(); phraseCandidate.add( new TermInfo( "a", 0, 1, 0 ) ); assertNotNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b" phraseCandidate.add( new TermInfo( "b", 2, 3, 1 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b c" phraseCandidate.add( new TermInfo( "c", 4, 5, 2 ) ); assertNotNull( fq.searchPhrase( F, phraseCandidate ) ); assertNull( fq.searchPhrase( "x", phraseCandidate ) ); } public void testSearchPhraseSlop() throws Exception { // "a b c"~0 Query query = pqF( "a", "b", "c" ); // phraseHighlight = true, fieldMatch = true FieldQuery fq = new FieldQuery( query, true, true ); // "a b c" w/ position-gap = 2 List phraseCandidate = new ArrayList(); phraseCandidate.add( new TermInfo( "a", 0, 1, 0 ) ); phraseCandidate.add( new TermInfo( "b", 2, 3, 2 ) ); phraseCandidate.add( new TermInfo( "c", 4, 
5, 4 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b c"~1 query = pqF( 1F, 1, "a", "b", "c" ); // phraseHighlight = true, fieldMatch = true fq = new FieldQuery( query, true, true ); // "a b c" w/ position-gap = 2 assertNotNull( fq.searchPhrase( F, phraseCandidate ) ); // "a b c" w/ position-gap = 3 phraseCandidate.clear(); phraseCandidate.add( new TermInfo( "a", 0, 1, 0 ) ); phraseCandidate.add( new TermInfo( "b", 2, 3, 3 ) ); phraseCandidate.add( new TermInfo( "c", 4, 5, 6 ) ); assertNull( fq.searchPhrase( F, phraseCandidate ) ); } } lucene-2.9.4/contrib/fast-vector-highlighter/pom.xml.template0000644000175000017500000000326611474320246025005 0ustar janpascaljanpascal 4.0.0 org.apache.lucene lucene-contrib @version@ org.apache.lucene lucene-fast-vector-highlighter Lucene Fast-Vector-Highlighter @version@ This is a Term-Vector based highlighter for Apache Lucene Java jar org.apache.lucene lucene-analyzers @version@ lucene-2.9.4/contrib/contrib-build.xml0000644000175000017500000001325711474320266020407 0ustar janpascaljanpascal Building ${ant.project.name}... ${name} ]]> lucene-2.9.4/contrib/lucli/0000755000175000017500000000000011554106561016227 5ustar janpascaljanpascallucene-2.9.4/contrib/lucli/lib/0000755000175000017500000000000011554106562016776 5ustar janpascaljanpascallucene-2.9.4/contrib/lucli/lib/jline.LICENSE0000644000175000017500000000273611474320257021113 0ustar janpascaljanpascalCopyright (c) 2002, 2003, 2004, 2005, Marc Prud'hommeaux All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of JLine nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lucene-2.9.4/contrib/lucli/README0000644000175000017500000000123211474320257017106 0ustar janpascaljanpascallucli (pronounced Luckily) is the Lucene Command Line Interface. INSTALLATION Call "ant", then call the run.sh shell script. If it doesn't work right away: Edit JAVA_HOME to point to your java directory. Edit LUCLI to point to where you installed lucli. Edit LUCLI_MEMORY and set it to the maximum amount of memory you want to allocate to lucli You can also replace the Lucene jar file that came with lucli with your own. 
ENABLING READLINE Readline support should automatically work thanks to JLine, see http://jline.sourceforge.net/ Documentation There is none :-). Type help at the command line or read the code. Enjoy Dror Matalon dror@zapatec.com. lucene-2.9.4/contrib/lucli/build.xml0000644000175000017500000000356511474320257020062 0ustar janpascaljanpascal Lucene Command Line Interface lucene-2.9.4/contrib/lucli/src/0000755000175000017500000000000011474320257017017 5ustar janpascaljanpascallucene-2.9.4/contrib/lucli/src/java/0000755000175000017500000000000011554106561017737 5ustar janpascaljanpascallucene-2.9.4/contrib/lucli/src/java/overview.html0000644000175000017500000000155111474320257022476 0ustar janpascaljanpascal lucli lucli lucene-2.9.4/contrib/lucli/src/java/lucli/0000755000175000017500000000000011554106561021047 5ustar janpascaljanpascallucene-2.9.4/contrib/lucli/src/java/lucli/LuceneMethods.java0000644000175000017500000003141511474320257024456 0ustar janpascaljanpascalpackage lucli; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.Enumeration; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.Map.Entry; import jline.ConsoleReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Hits; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher; /** * Various methods that interact with Lucene and provide info about the * index, search, etc. Parts adapted from Lucene demo. 
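 *
 * A minimal usage sketch (the index path, field name, and the jline ConsoleReader instance are
 * illustrative placeholders; the methods are those defined below):
 * <pre>
 *   LuceneMethods lm = new LuceneMethods("/path/to/index"); // open an existing index directory
 *   lm.info();                                   // print document count and field names
 *   lm.search("title:lucene", false, false, consoleReader); // run a query, no explain/tokens
 *   lm.terms("title");                           // list top terms for the given field
 * </pre>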
*/ class LuceneMethods { private int numDocs; private String indexName; //directory of this index private java.util.Iterator fieldIterator; private List fields; //Fields as a vector private List indexedFields; //Fields as a vector private String fieldsArray[]; //Fields as an array private Searcher searcher; private Query query; //current query string private String analyzerClassFQN = null; // Analyzer class, if NULL, use default Analyzer public LuceneMethods(String index) { indexName = index; message("Lucene CLI. Using directory '" + indexName + "'. Type 'help' for instructions."); } private Analyzer createAnalyzer() { if (analyzerClassFQN == null) return new StandardAnalyzer(); try { Class aClass = Class.forName(analyzerClassFQN); Object obj = aClass.newInstance(); if (!(obj instanceof Analyzer)) { message("Given class is not an Analyzer: " + analyzerClassFQN); return new StandardAnalyzer(); } return (Analyzer)obj; } catch (Exception e) { message("Unable to use Analyzer " + analyzerClassFQN); return new StandardAnalyzer(); } } public void info() throws java.io.IOException { IndexReader indexReader = IndexReader.open(indexName); getFieldInfo(); numDocs = indexReader.numDocs(); message("Index has " + numDocs + " documents "); message("All Fields:" + fields.toString()); message("Indexed Fields:" + indexedFields.toString()); if (IndexReader.isLocked(indexName)) { message("Index is locked"); } //IndexReader.getCurrentVersion(indexName); //System.out.println("Version:" + version); indexReader.close(); } public void search(String queryString, boolean explain, boolean showTokens, ConsoleReader cr) throws java.io.IOException, org.apache.lucene.queryParser.ParseException { Hits hits = initSearch(queryString); System.out.println(hits.length() + " total matching documents"); if (explain) { query = explainQuery(queryString); } final int HITS_PER_PAGE = 10; message("--------------------------------------"); for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) { int end = Math.min(hits.length(), start + HITS_PER_PAGE); for (int ii = start; ii < end; ii++) { Document doc = hits.doc(ii); message("---------------- " + (ii + 1) + " score:" + hits.score(ii) + "---------------------"); printHit(doc); if (showTokens) { invertDocument(doc); } if (explain) { Explanation exp = searcher.explain(query, hits.id(ii)); message("Explanation:" + exp.toString()); } } message("#################################################"); if (hits.length() > end) { // TODO: don't let the input end up in the command line history queryString = cr.readLine("more (y/n) ? "); if (queryString.length() == 0 || queryString.charAt(0) == 'n') break; } } searcher.close(); } /** * TODO: Allow user to specify what field(s) to display */ private void printHit(Document doc) { for (int ii = 0; ii < fieldsArray.length; ii++) { String currField = fieldsArray[ii]; String[] result = doc.getValues(currField); if (result != null) { for (int i = 0; i < result.length; i++) { message(currField + ":" + result[i]); } } else { message(currField + ": "); } } //another option is to just do message(doc); } public void optimize() throws IOException { //open the index writer. False: don't create a new one IndexWriter indexWriter = new IndexWriter(indexName, createAnalyzer(), false); message("Starting to optimize index."); long start = System.currentTimeMillis(); indexWriter.optimize(); message("Done optimizing index. 
Took " + (System.currentTimeMillis() - start) + " msecs"); indexWriter.close(); } private Query explainQuery(String queryString) throws IOException, ParseException { searcher = new IndexSearcher(indexName); Analyzer analyzer = createAnalyzer(); getFieldInfo(); int arraySize = indexedFields.size(); String indexedArray[] = new String[arraySize]; for (int ii = 0; ii < arraySize; ii++) { indexedArray[ii] = (String) indexedFields.get(ii); } MultiFieldQueryParser parser = new MultiFieldQueryParser(indexedArray, analyzer); query = parser.parse(queryString); System.out.println("Searching for: " + query.toString()); return (query); } /** * TODO: Allow user to specify analyzer */ private Hits initSearch(String queryString) throws IOException, ParseException { searcher = new IndexSearcher(indexName); Analyzer analyzer = createAnalyzer(); getFieldInfo(); int arraySize = fields.size(); fieldsArray = new String[arraySize]; for (int ii = 0; ii < arraySize; ii++) { fieldsArray[ii] = (String) fields.get(ii); } MultiFieldQueryParser parser = new MultiFieldQueryParser(fieldsArray, analyzer); query = parser.parse(queryString); System.out.println("Searching for: " + query.toString()); Hits hits = searcher.search(query); return (hits); } public void count(String queryString) throws java.io.IOException, ParseException { Hits hits = initSearch(queryString); System.out.println(hits.length() + " total documents"); searcher.close(); } static public void message(String s) { System.out.println(s); } private void getFieldInfo() throws IOException { IndexReader indexReader = IndexReader.open(indexName); fields = new ArrayList(); indexedFields = new ArrayList(); //get the list of all field names fieldIterator = indexReader.getFieldNames(FieldOption.ALL).iterator(); while (fieldIterator.hasNext()) { Object field = fieldIterator.next(); if (field != null && !field.equals("")) fields.add(field.toString()); } // //get the list of indexed field names fieldIterator = indexReader.getFieldNames(FieldOption.INDEXED).iterator(); while (fieldIterator.hasNext()) { Object field = fieldIterator.next(); if (field != null && !field.equals("")) indexedFields.add(field.toString()); } indexReader.close(); } // Copied from DocumentWriter // Tokenizes the fields of a document into Postings. 
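  // For each indexed, tokenized field the value is re-analyzed with the configured
  // Analyzer; term frequencies are accumulated in a map (positions are tracked through
  // the PositionIncrementAttribute and analysis stops after maxFieldLength positions),
  // and the ten most frequent terms of the document are printed.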
private void invertDocument(Document doc) throws IOException { Map tokenMap = new HashMap(); final int maxFieldLength = 10000; Analyzer analyzer = createAnalyzer(); Iterator fields = doc.getFields().iterator(); final Token reusableToken = new Token(); while (fields.hasNext()) { Field field = (Field) fields.next(); String fieldName = field.name(); if (field.isIndexed()) { if (field.isTokenized()) { // un-tokenized field Reader reader; // find or make Reader if (field.readerValue() != null) reader = field.readerValue(); else if (field.stringValue() != null) reader = new StringReader(field.stringValue()); else throw new IllegalArgumentException ("field must have either String or Reader value"); int position = 0; // Tokenize field and add to postingTable TokenStream stream = analyzer.tokenStream(fieldName, reader); TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class); try { while (stream.incrementToken()) { position += (posIncrAtt.getPositionIncrement() - 1); position++; String name = termAtt.term(); Integer Count = (Integer) tokenMap.get(name); if (Count == null) { // not in there yet tokenMap.put(name, new Integer(1)); //first one } else { int count = Count.intValue(); tokenMap.put(name, new Integer(count + 1)); } if (position > maxFieldLength) break; } } finally { stream.close(); } } } } Entry[] sortedHash = getSortedMapEntries(tokenMap); for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) { Entry currentEntry = sortedHash[ii]; message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue()); } } /** Provides a list of the top terms of the index. * * @param field - the name of the command or null for all of them. */ public void terms(String field) throws IOException { TreeMap termMap = new TreeMap(); IndexReader indexReader = IndexReader.open(indexName); TermEnum terms = indexReader.terms(); while (terms.next()) { Term term = terms.term(); //message(term.field() + ":" + term.text() + " freq:" + terms.docFreq()); //if we're either not looking by field or we're matching the specific field if ((field == null) || field.equals(term.field())) termMap.put(term.field() + ":" + term.text(), new Integer((terms.docFreq()))); } Iterator termIterator = termMap.keySet().iterator(); for (int ii = 0; termIterator.hasNext() && ii < 100; ii++) { String termDetails = (String) termIterator.next(); Integer termFreq = (Integer) termMap.get(termDetails); message(termDetails + ": " + termFreq); } indexReader.close(); } /** Sort Map values * @param m the map we're sorting * from http://developer.java.sun.com/developer/qow/archive/170/index.jsp */ public static Entry[] getSortedMapEntries(Map m) { Set set = m.entrySet(); Entry[] entries = (Entry[]) set.toArray( new Entry[set.size()]); Arrays.sort(entries, new Comparator() { public int compare(Object o1, Object o2) { Object v1 = ((Entry) o1).getValue(); Object v2 = ((Entry) o2).getValue(); return ((Comparable) v2).compareTo(v1); //descending order } }); return entries; } public void analyzer(String word) { if ("current".equals(word)) { String current = analyzerClassFQN == null ? 
"StandardAnalyzer" : analyzerClassFQN; message("The currently used Analyzer class is: " + current); return; } analyzerClassFQN = word; message("Switched to Analyzer class " + analyzerClassFQN); } } lucene-2.9.4/contrib/lucli/src/java/lucli/Lucli.java0000644000175000017500000002272411474320257022772 0ustar janpascaljanpascalpackage lucli; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.Iterator; import java.util.Set; import java.util.StringTokenizer; import java.util.TreeMap; import jline.ArgumentCompletor; import jline.Completor; import jline.ConsoleReader; import jline.FileNameCompletor; import jline.History; import jline.SimpleCompletor; import org.apache.lucene.queryParser.ParseException; /** * Main class for lucli: the Lucene Command Line Interface. * This class handles mostly the actual CLI part, command names, help, etc. */ public class Lucli { final static String DEFAULT_INDEX = "index"; //directory "index" under the current directory final static String HISTORYFILE = ".lucli"; //history file in user's home directory public final static int MAX_TERMS = 100; //Maximum number of terms we're going to show // List of commands // To add another command, add it in here, in the list of addcomand(), and in the switch statement final static int NOCOMMAND = -2; final static int UNKOWN = -1; final static int INFO = 0; final static int SEARCH = 1; final static int OPTIMIZE = 2; final static int QUIT = 3; final static int HELP = 4; final static int COUNT = 5; final static int TERMS = 6; final static int INDEX = 7; final static int TOKENS = 8; final static int EXPLAIN = 9; final static int ANALYZER = 10; String historyFile; TreeMap commandMap = new TreeMap(); LuceneMethods luceneMethods; //current cli class we're using boolean enableReadline; //false: use plain java. True: shared library readline /** Main entry point. The first argument can be a filename with an application initialization file. */ public Lucli(String[] args) throws IOException { String line; historyFile = System.getProperty("user.home") + File.separator + HISTORYFILE; /* * Initialize the list of commands */ addCommand("info", INFO, "Display info about the current Lucene index. Example: info"); addCommand("search", SEARCH, "Search the current index. Example: search foo", 1); addCommand("count", COUNT, "Return the number of hits for a search. Example: count foo", 1); addCommand("optimize", OPTIMIZE, "Optimize the current index"); addCommand("quit", QUIT, "Quit/exit the program"); addCommand("help", HELP, "Display help about commands"); addCommand("terms", TERMS, "Show the first " + MAX_TERMS + " terms in this index. Supply a field name to only show terms in a specific field. 
Example: terms"); addCommand("index", INDEX, "Choose a different lucene index. Example index my_index", 1); addCommand("tokens", TOKENS, "Does a search and shows the top 10 tokens for each document. Verbose! Example: tokens foo", 1); addCommand("explain", EXPLAIN, "Explanation that describes how the document scored against query. Example: explain foo", 1); addCommand("analyzer", ANALYZER, "Specifies the Analyzer class to be used. Example: analyzer org.apache.lucene.analysis.SimpleAnalyzer", 1); //parse command line arguments parseArgs(args); ConsoleReader cr = new ConsoleReader(); //Readline.readHistoryFile(fullPath); cr.setHistory(new History(new File(historyFile))); // set completer with list of words Completor[] comp = new Completor[]{ new SimpleCompletor(getCommandsAsArray()), new FileNameCompletor() }; cr.addCompletor (new ArgumentCompletor(comp)); // main input loop luceneMethods = new LuceneMethods(DEFAULT_INDEX); while (true) { try { line = cr.readLine("lucli> "); if (line != null) { handleCommand(line, cr); } } catch (java.io.EOFException eof) { System.out.println("");//new line exit(); } catch (UnsupportedEncodingException enc) { enc.printStackTrace(System.err); } catch (ParseException pe) { pe.printStackTrace(System.err); } catch (IOException ioe) { ioe.printStackTrace(System.err); } } } private String[] getCommandsAsArray() { Set commandSet = commandMap.keySet(); String[] commands = new String[commandMap.size()]; int i = 0; for (Iterator iter = commandSet.iterator(); iter.hasNext();) { String cmd = (String) iter.next(); commands[i++] = cmd; } return commands; } public static void main(String[] args) throws IOException { new Lucli(args); } private void handleCommand(String line, ConsoleReader cr) throws IOException, ParseException { String [] words = tokenizeCommand(line); if (words.length == 0) return; //white space String query = ""; if (line.trim().startsWith("#")) // # = comment return; //Command name and number of arguments switch (getCommandId(words[0], words.length - 1)) { case INFO: luceneMethods.info(); break; case SEARCH: for (int ii = 1; ii < words.length; ii++) { query += words[ii] + " "; } luceneMethods.search(query, false, false, cr); break; case COUNT: for (int ii = 1; ii < words.length; ii++) { query += words[ii] + " "; } luceneMethods.count(query); break; case QUIT: exit(); break; case TERMS: if(words.length > 1) luceneMethods.terms(words[1]); else luceneMethods.terms(null); break; case INDEX: LuceneMethods newLm = new LuceneMethods(words[1]); try { newLm.info(); //will fail if can't open the index luceneMethods = newLm; //OK, so we'll use the new one } catch (IOException ioe) { //problem we'll keep using the old one error(ioe.toString()); } break; case OPTIMIZE: luceneMethods.optimize(); break; case TOKENS: for (int ii = 1; ii < words.length; ii++) { query += words[ii] + " "; } luceneMethods.search(query, false, true, cr); break; case EXPLAIN: for (int ii = 1; ii < words.length; ii++) { query += words[ii] + " "; } luceneMethods.search(query, true, false, cr); break; case ANALYZER: luceneMethods.analyzer(words[1]); break; case HELP: help(); break; case NOCOMMAND: //do nothing break; case UNKOWN: System.out.println("Unknown command: " + words[0] + ". 
Type help to get a list of commands."); break; } } private String [] tokenizeCommand(String line) { StringTokenizer tokenizer = new StringTokenizer(line, " \t"); int size = tokenizer.countTokens(); String [] tokens = new String[size]; for (int ii = 0; tokenizer.hasMoreTokens(); ii++) { tokens[ii] = tokenizer.nextToken(); } return tokens; } private void exit() { System.exit(0); } /** * Add a command to the list of commands for the interpreter for a * command that doesn't take any parameters. * @param name - the name of the command * @param id - the unique id of the command * @param help - the help message for this command */ private void addCommand(String name, int id, String help) { addCommand(name, id, help, 0); } /** * Add a command to the list of commands for the interpreter. * @param name - the name of the command * @param id - the unique id of the command * @param help - the help message for this command * @param params - the minimum number of required params if any */ private void addCommand(String name, int id, String help, int params) { Command command = new Command(name, id, help, params); commandMap.put(name, command); } private int getCommandId(String name, int params) { name = name.toLowerCase(); //treat uppercase and lower case commands the same Command command = (Command) commandMap.get(name); if (command == null) { return(UNKOWN); } else { if(command.params > params) { error(command.name + " needs at least " + command.params + " arguments."); return (NOCOMMAND); } return (command.id); } } private void help() { Iterator commands = commandMap.keySet().iterator(); while (commands.hasNext()) { Command command = (Command) commandMap.get(commands.next()); System.out.println("\t" + command.name + ": " + command.help); } } private void error(String message) { System.err.println("Error:" + message); } private void message(String text) { System.out.println(text); } /* * Parse command line arguments (currently none) */ private void parseArgs(String[] args) { if (args.length > 0) { usage(); System.exit(1); } } private void usage() { message("Usage: lucli.Lucli"); message("(currently, no parameters are supported)"); } private class Command { String name; int id; int numberArgs; String help; int params; Command(String name, int id, String help, int params) { this.name = name; this.id = id; this.help = help; this.params = params; } /** * Prints out a usage message for this command. */ public String commandUsage() { return (name + ":" + help + ". Command takes " + params + " params"); } } } lucene-2.9.4/contrib/lucli/src/java/lucli/package.html0000644000175000017500000000162011474320257023330 0ustar janpascaljanpascal Lucene Command Line Interface lucene-2.9.4/contrib/lucli/pom.xml.template0000644000175000017500000000313711474320257021363 0ustar janpascaljanpascal 4.0.0 org.apache.lucene lucene-contrib @version@ org.apache.lucene lucene-lucli jar @version@ lucene-contrib-lucli Lucene Command Line Interface jline jline 0.9.91 lucene-2.9.4/contrib/lucli/run.sh0000644000175000017500000000202411474320257017366 0ustar janpascaljanpascal # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # the "License"); you may not use this file except in compliance with # the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. LUCLI=. LUCLI_MEMORY=128M #JAVA_HOME=/home/dror/j2sdk1.4.1_03/ CLASSPATH=${CLASSPATH}:$LUCLI/lib/jline.jar:$LUCLI/lib/lucene.jar:$LUCLI/dist/lucli-dev.jar export CLASSPATH $JAVA_HOME/bin/java -Xmx${LUCLI_MEMORY} lucli.Lucli lucene-2.9.4/contrib/snowball/0000755000175000017500000000000011554106561016740 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/snowball.cvspass0000644000175000017500000000006711474320235022165 0ustar janpascaljanpascal:pserver:cvsuser@cvs.tartarus.org:/home/cvs Ay=0=a%0bZ lucene-2.9.4/contrib/snowball/docs/0000755000175000017500000000000011554106561017670 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/docs/index.html0000644000175000017500000001172211474320235021665 0ustar janpascaljanpascal Snowball Stemmers for Lucene - Overview - Snowball Stemmers for Lucene

This project provides pre-compiled versions of the Snowball stemmers together with classes integrating them with the Lucene search engine.

Download: releases of the stemmers are available here.

Copyright © 1999-2004, The Apache Software Foundation
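A minimal sketch of the Lucene integration described above, assuming the contrib snowball jar and Lucene core are on the classpath; the class name, field name and sample text are illustrative and not part of the distribution:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

// Hypothetical demo class, not part of the Lucene distribution.
public class SnowballUsageSketch {
  public static void main(String[] args) throws Exception {
    // "English" selects org.tartarus.snowball.ext.EnglishStemmer, following the
    // naming convention used by SnowballFilter and SnowballAnalyzer.
    SnowballAnalyzer analyzer = new SnowballAnalyzer(Version.LUCENE_29, "English");
    TokenStream stream = analyzer.tokenStream("body", new StringReader("stemming stemmed stems"));
    TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(term.term()); // prints the stemmed form of each token, e.g. "stem"
    }
    stream.close();
  }
}

In practice the same analyzer would be passed to both IndexWriter and QueryParser so that indexed terms and query terms are stemmed consistently.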
lucene-2.9.4/contrib/snowball/SNOWBALL-LICENSE.txt0000644000175000017500000000303011474320235021773 0ustar janpascaljanpascalCopyright (c) 2001, Dr Martin Porter Copyright (c) 2002, Richard Boulton All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * Neither the name of the copyright holders nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lucene-2.9.4/contrib/snowball/build.xml0000644000175000017500000000767011474320235020570 0ustar janpascaljanpascal Snowball Analyzers lucene-2.9.4/contrib/snowball/LICENSE.txt0000644000175000017500000000144311474320235020562 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ lucene-2.9.4/contrib/snowball/src/0000755000175000017500000000000011474320235017524 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/0000755000175000017500000000000011554106561020450 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/0000755000175000017500000000000011474320235021234 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/apache/0000755000175000017500000000000011474320235022455 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/0000755000175000017500000000000011474320235023730 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/analysis/0000755000175000017500000000000011474320235025553 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/0000755000175000017500000000000011554106561027377 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballFilter.java0000644000175000017500000000650111474320235033170 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.tartarus.snowball.SnowballProgram; /** * A filter that stems words using a Snowball-generated stemmer. * * Available stemmers are listed in {@link org.tartarus.snowball.ext}. */ public class SnowballFilter extends TokenFilter { private SnowballProgram stemmer; private TermAttribute termAtt; public SnowballFilter(TokenStream input, SnowballProgram stemmer) { super(input); this.stemmer = stemmer; termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** * Construct the named stemming filter. * * Available stemmers are listed in {@link org.tartarus.snowball.ext}. * The name of a stemmer is the part of the class name before "Stemmer", * e.g., the stemmer in {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English". * * @param in the input tokens to stem * @param name the name of a stemmer */ public SnowballFilter(TokenStream in, String name) { super(in); try { Class stemClass = Class.forName("org.tartarus.snowball.ext." 
+ name + "Stemmer"); stemmer = (SnowballProgram) stemClass.newInstance(); } catch (Exception e) { throw new RuntimeException(e.toString()); } termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** Returns the next input Token, after being stemmed */ public final boolean incrementToken() throws IOException { if (input.incrementToken()) { String originalTerm = termAtt.term(); stemmer.setCurrent(originalTerm); stemmer.stem(); String finalTerm = stemmer.getCurrent(); // Don't bother updating, if it is unchanged. if (!originalTerm.equals(finalTerm)) termAtt.setTermBuffer(finalTerm); return true; } else { return false; } } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws java.io.IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next() throws java.io.IOException { return super.next(); } } lucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/package.html0000644000175000017500000000176011474320235031661 0ustar janpascaljanpascal {@link org.apache.lucene.analysis.TokenFilter} and {@link org.apache.lucene.analysis.Analyzer} implementations that use Snowball stemmers. lucene-2.9.4/contrib/snowball/src/java/org/apache/lucene/analysis/snowball/SnowballAnalyzer.java0000644000175000017500000001141111474320235033524 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.standard.*; import org.apache.lucene.util.Version; import java.io.IOException; import java.io.Reader; import java.util.Set; /** Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link * LowerCaseFilter}, {@link StopFilter} and {@link SnowballFilter}. * * Available stemmers are listed in org.tartarus.snowball.ext. The name of a * stemmer is the part of the class name before "Stemmer", e.g., the stemmer in * {@link org.tartarus.snowball.ext.EnglishStemmer} is named "English". * *

NOTE: This class uses the same {@link Version} dependent settings as {@link StandardAnalyzer}.

*/ public class SnowballAnalyzer extends Analyzer { private String name; private Set stopSet; private final Version matchVersion; /** Builds the named analyzer with no stop words. * * @deprecated Use {@link #SnowballAnalyzer(Version, String)} instead*/ public SnowballAnalyzer(String name) { this(Version.LUCENE_23, name); } /** Builds the named analyzer with no stop words. */ public SnowballAnalyzer(Version matchVersion, String name) { this.name = name; setOverridesTokenStreamMethod(SnowballAnalyzer.class); this.matchVersion = matchVersion; } /** Builds the named analyzer with the given stop words. * * @deprecated Use {@link #SnowballAnalyzer(Version, String, String[])} instead*/ public SnowballAnalyzer(String name, String[] stopWords) { this(Version.LUCENE_23, name, stopWords); } /** Builds the named analyzer with the given stop words. */ public SnowballAnalyzer(Version matchVersion, String name, String[] stopWords) { this(matchVersion, name); stopSet = StopFilter.makeStopSet(stopWords); } /** Builds the named analyzer with the given stop words. */ public SnowballAnalyzer(Version matchVersion, String name, Set stopWords) { this(matchVersion, name); this.stopSet = CharArraySet.unmodifiableSet(new CharArraySet(stopWords, false)); } /** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter} */ public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(matchVersion, reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); if (stopSet != null) result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet); result = new SnowballFilter(result, name); return result; } private class SavedStreams { Tokenizer source; TokenStream result; }; /** Returns a (possibly reused) {@link StandardTokenizer} filtered by a * {@link StandardFilter}, a {@link LowerCaseFilter}, * a {@link StopFilter}, and a {@link SnowballFilter} */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { if (overridesTokenStreamMethod) { // LUCENE-1678: force fallback to tokenStream() if we // have been subclassed and that subclass overrides // tokenStream but not reusableTokenStream return tokenStream(fieldName, reader); } SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); streams.source = new StandardTokenizer(matchVersion, reader); streams.result = new StandardFilter(streams.source); streams.result = new LowerCaseFilter(streams.result); if (stopSet != null) streams.result = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion), streams.result, stopSet); streams.result = new SnowballFilter(streams.result, name); setPreviousTokenStream(streams); } else { streams.source.reset(reader); } return streams.result; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/0000755000175000017500000000000011474320235023101 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/0000755000175000017500000000000011554106561024725 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/TestApp.java0000644000175000017500000000545011474320235027151 0ustar janpascaljanpascalpackage org.tartarus.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.lang.reflect.Method; import java.io.Reader; import java.io.Writer; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.OutputStream; import java.io.FileOutputStream; public class TestApp { private static void usage() { System.err.println("Usage: TestApp [-o ]"); } public static void main(String [] args) throws Throwable { if (args.length < 2) { usage(); return; } Class stemClass = Class.forName("org.tartarus.snowball.ext." + args[0] + "Stemmer"); SnowballProgram stemmer = (SnowballProgram) stemClass.newInstance(); Method stemMethod = stemClass.getMethod("stem", new Class[0]); Reader reader; reader = new InputStreamReader(new FileInputStream(args[1])); reader = new BufferedReader(reader); StringBuffer input = new StringBuffer(); OutputStream outstream; if (args.length > 2) { if (args.length == 4 && args[2].equals("-o")) { outstream = new FileOutputStream(args[3]); } else { usage(); return; } } else { outstream = System.out; } Writer output = new OutputStreamWriter(outstream); output = new BufferedWriter(output); int repeat = 1; if (args.length > 4) { repeat = Integer.parseInt(args[4]); } Object [] emptyArgs = new Object[0]; int character; while ((character = reader.read()) != -1) { char ch = (char) character; if (Character.isWhitespace((char) ch)) { if (input.length() > 0) { stemmer.setCurrent(input.toString()); for (int i = repeat; i != 0; i--) { stemMethod.invoke(stemmer, emptyArgs); } output.write(stemmer.getCurrent()); output.write('\n'); input.delete(0, input.length()); } } else { input.append(Character.toLowerCase(ch)); } } output.flush(); } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/Among.java0000644000175000017500000000337211474320235026633 0ustar janpascaljanpascalpackage org.tartarus.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.lang.reflect.Method; public class Among { public Among (String s, int substring_i, int result, String methodname, SnowballProgram methodobject) { this.s_size = s.length(); this.s = s; this.substring_i = substring_i; this.result = result; this.methodobject = methodobject; if (methodname.length() == 0) { this.method = null; } else { try { this.method = methodobject.getClass(). getDeclaredMethod(methodname, new Class[0]); } catch (NoSuchMethodException e) { // FIXME - debug message this.method = null; } } } public int s_size; /* search string */ public String s; /* search string */ public int substring_i; /* index to longest matching substring */ public int result; /* result of the lookup */ public Method method; /* method to use if substring matches */ public SnowballProgram methodobject; /* object to invoke method on */ }; lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/0000755000175000017500000000000011554106561025525 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/RussianStemmer.java0000644000175000017500000006224111474320235031353 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. */ public class RussianStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "\u0432", -1, 1, "", this), new Among ( "\u0438\u0432", 0, 2, "", this), new Among ( "\u044B\u0432", 0, 2, "", this), new Among ( "\u0432\u0448\u0438", -1, 1, "", this), new Among ( "\u0438\u0432\u0448\u0438", 3, 2, "", this), new Among ( "\u044B\u0432\u0448\u0438", 3, 2, "", this), new Among ( "\u0432\u0448\u0438\u0441\u044C", -1, 1, "", this), new Among ( "\u0438\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this), new Among ( "\u044B\u0432\u0448\u0438\u0441\u044C", 6, 2, "", this) }; private Among a_1[] = { new Among ( "\u0435\u0435", -1, 1, "", this), new Among ( "\u0438\u0435", -1, 1, "", this), new Among ( "\u043E\u0435", -1, 1, "", this), new Among ( "\u044B\u0435", -1, 1, "", this), new Among ( "\u0438\u043C\u0438", -1, 1, "", this), new Among ( "\u044B\u043C\u0438", -1, 1, "", this), new Among ( "\u0435\u0439", -1, 1, "", this), new Among ( "\u0438\u0439", -1, 1, "", this), new Among ( "\u043E\u0439", -1, 1, "", this), new Among ( "\u044B\u0439", -1, 1, "", this), new Among ( "\u0435\u043C", -1, 1, "", this), new Among ( "\u0438\u043C", -1, 1, "", this), new Among ( "\u043E\u043C", -1, 1, "", this), new Among ( "\u044B\u043C", -1, 1, "", this), new Among ( "\u0435\u0433\u043E", -1, 1, "", this), new Among ( "\u043E\u0433\u043E", -1, 1, "", this), new Among ( "\u0435\u043C\u0443", -1, 1, "", this), new Among ( "\u043E\u043C\u0443", -1, 1, "", this), new Among ( "\u0438\u0445", -1, 1, "", this), new Among ( "\u044B\u0445", -1, 1, "", this), new Among ( "\u0435\u044E", -1, 1, "", this), new Among ( "\u043E\u044E", -1, 1, "", this), new Among ( "\u0443\u044E", -1, 1, "", this), new Among ( "\u044E\u044E", -1, 1, "", this), new Among ( "\u0430\u044F", -1, 1, "", this), new Among ( "\u044F\u044F", -1, 1, "", this) }; private Among a_2[] = { new Among ( "\u0435\u043C", -1, 1, "", this), new Among ( "\u043D\u043D", -1, 1, "", this), new Among ( "\u0432\u0448", -1, 1, "", this), new Among ( "\u0438\u0432\u0448", 2, 2, "", this), new Among ( "\u044B\u0432\u0448", 2, 2, "", this), new Among ( "\u0449", -1, 1, "", this), 
new Among ( "\u044E\u0449", 5, 1, "", this), new Among ( "\u0443\u044E\u0449", 6, 2, "", this) }; private Among a_3[] = { new Among ( "\u0441\u044C", -1, 1, "", this), new Among ( "\u0441\u044F", -1, 1, "", this) }; private Among a_4[] = { new Among ( "\u043B\u0430", -1, 1, "", this), new Among ( "\u0438\u043B\u0430", 0, 2, "", this), new Among ( "\u044B\u043B\u0430", 0, 2, "", this), new Among ( "\u043D\u0430", -1, 1, "", this), new Among ( "\u0435\u043D\u0430", 3, 2, "", this), new Among ( "\u0435\u0442\u0435", -1, 1, "", this), new Among ( "\u0438\u0442\u0435", -1, 2, "", this), new Among ( "\u0439\u0442\u0435", -1, 1, "", this), new Among ( "\u0435\u0439\u0442\u0435", 7, 2, "", this), new Among ( "\u0443\u0439\u0442\u0435", 7, 2, "", this), new Among ( "\u043B\u0438", -1, 1, "", this), new Among ( "\u0438\u043B\u0438", 10, 2, "", this), new Among ( "\u044B\u043B\u0438", 10, 2, "", this), new Among ( "\u0439", -1, 1, "", this), new Among ( "\u0435\u0439", 13, 2, "", this), new Among ( "\u0443\u0439", 13, 2, "", this), new Among ( "\u043B", -1, 1, "", this), new Among ( "\u0438\u043B", 16, 2, "", this), new Among ( "\u044B\u043B", 16, 2, "", this), new Among ( "\u0435\u043C", -1, 1, "", this), new Among ( "\u0438\u043C", -1, 2, "", this), new Among ( "\u044B\u043C", -1, 2, "", this), new Among ( "\u043D", -1, 1, "", this), new Among ( "\u0435\u043D", 22, 2, "", this), new Among ( "\u043B\u043E", -1, 1, "", this), new Among ( "\u0438\u043B\u043E", 24, 2, "", this), new Among ( "\u044B\u043B\u043E", 24, 2, "", this), new Among ( "\u043D\u043E", -1, 1, "", this), new Among ( "\u0435\u043D\u043E", 27, 2, "", this), new Among ( "\u043D\u043D\u043E", 27, 1, "", this), new Among ( "\u0435\u0442", -1, 1, "", this), new Among ( "\u0443\u0435\u0442", 30, 2, "", this), new Among ( "\u0438\u0442", -1, 2, "", this), new Among ( "\u044B\u0442", -1, 2, "", this), new Among ( "\u044E\u0442", -1, 1, "", this), new Among ( "\u0443\u044E\u0442", 34, 2, "", this), new Among ( "\u044F\u0442", -1, 2, "", this), new Among ( "\u043D\u044B", -1, 1, "", this), new Among ( "\u0435\u043D\u044B", 37, 2, "", this), new Among ( "\u0442\u044C", -1, 1, "", this), new Among ( "\u0438\u0442\u044C", 39, 2, "", this), new Among ( "\u044B\u0442\u044C", 39, 2, "", this), new Among ( "\u0435\u0448\u044C", -1, 1, "", this), new Among ( "\u0438\u0448\u044C", -1, 2, "", this), new Among ( "\u044E", -1, 2, "", this), new Among ( "\u0443\u044E", 44, 2, "", this) }; private Among a_5[] = { new Among ( "\u0430", -1, 1, "", this), new Among ( "\u0435\u0432", -1, 1, "", this), new Among ( "\u043E\u0432", -1, 1, "", this), new Among ( "\u0435", -1, 1, "", this), new Among ( "\u0438\u0435", 3, 1, "", this), new Among ( "\u044C\u0435", 3, 1, "", this), new Among ( "\u0438", -1, 1, "", this), new Among ( "\u0435\u0438", 6, 1, "", this), new Among ( "\u0438\u0438", 6, 1, "", this), new Among ( "\u0430\u043C\u0438", 6, 1, "", this), new Among ( "\u044F\u043C\u0438", 6, 1, "", this), new Among ( "\u0438\u044F\u043C\u0438", 10, 1, "", this), new Among ( "\u0439", -1, 1, "", this), new Among ( "\u0435\u0439", 12, 1, "", this), new Among ( "\u0438\u0435\u0439", 13, 1, "", this), new Among ( "\u0438\u0439", 12, 1, "", this), new Among ( "\u043E\u0439", 12, 1, "", this), new Among ( "\u0430\u043C", -1, 1, "", this), new Among ( "\u0435\u043C", -1, 1, "", this), new Among ( "\u0438\u0435\u043C", 18, 1, "", this), new Among ( "\u043E\u043C", -1, 1, "", this), new Among ( "\u044F\u043C", -1, 1, "", this), new Among ( "\u0438\u044F\u043C", 21, 1, "", 
this), new Among ( "\u043E", -1, 1, "", this), new Among ( "\u0443", -1, 1, "", this), new Among ( "\u0430\u0445", -1, 1, "", this), new Among ( "\u044F\u0445", -1, 1, "", this), new Among ( "\u0438\u044F\u0445", 26, 1, "", this), new Among ( "\u044B", -1, 1, "", this), new Among ( "\u044C", -1, 1, "", this), new Among ( "\u044E", -1, 1, "", this), new Among ( "\u0438\u044E", 30, 1, "", this), new Among ( "\u044C\u044E", 30, 1, "", this), new Among ( "\u044F", -1, 1, "", this), new Among ( "\u0438\u044F", 33, 1, "", this), new Among ( "\u044C\u044F", 33, 1, "", this) }; private Among a_6[] = { new Among ( "\u043E\u0441\u0442", -1, 1, "", this), new Among ( "\u043E\u0441\u0442\u044C", -1, 1, "", this) }; private Among a_7[] = { new Among ( "\u0435\u0439\u0448\u0435", -1, 1, "", this), new Among ( "\u043D", -1, 2, "", this), new Among ( "\u0435\u0439\u0448", -1, 1, "", this), new Among ( "\u044C", -1, 3, "", this) }; private static final char g_v[] = {33, 65, 8, 232 }; private int I_p2; private int I_pV; private void copy_from(RussianStemmer other) { I_p2 = other.I_p2; I_pV = other.I_pV; super.copy_from(other); } private boolean r_mark_regions() { int v_1; // (, line 57 I_pV = limit; I_p2 = limit; // do, line 61 v_1 = cursor; lab0: do { // (, line 61 // gopast, line 62 golab1: while(true) { lab2: do { if (!(in_grouping(g_v, 1072, 1103))) { break lab2; } break golab1; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // setmark pV, line 62 I_pV = cursor; // gopast, line 62 golab3: while(true) { lab4: do { if (!(out_grouping(g_v, 1072, 1103))) { break lab4; } break golab3; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // gopast, line 63 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 1072, 1103))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // gopast, line 63 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 1072, 1103))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // setmark p2, line 63 I_p2 = cursor; } while (false); cursor = v_1; return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_perfective_gerund() { int among_var; int v_1; // (, line 71 // [, line 72 ket = cursor; // substring, line 72 among_var = find_among_b(a_0, 9); if (among_var == 0) { return false; } // ], line 72 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 76 // or, line 76 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 76 if (!(eq_s_b(1, "\u0430"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 76 if (!(eq_s_b(1, "\u044F"))) { return false; } } while (false); // delete, line 76 slice_del(); break; case 2: // (, line 83 // delete, line 83 slice_del(); break; } return true; } private boolean r_adjective() { int among_var; // (, line 87 // [, line 88 ket = cursor; // substring, line 88 among_var = find_among_b(a_1, 26); if (among_var == 0) { return false; } // ], line 88 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 97 // delete, line 97 slice_del(); break; } return true; } private boolean r_adjectival() { int among_var; int v_1; int v_2; // (, line 101 // call adjective, line 102 if (!r_adjective()) { return false; } // try, line 109 v_1 = limit - cursor; lab0: do { // (, line 109 // [, line 110 ket = cursor; // substring, line 110 among_var = find_among_b(a_2, 8); if (among_var == 0) { cursor = limit - v_1; 
break lab0; } // ], line 110 bra = cursor; switch(among_var) { case 0: cursor = limit - v_1; break lab0; case 1: // (, line 115 // or, line 115 lab1: do { v_2 = limit - cursor; lab2: do { // literal, line 115 if (!(eq_s_b(1, "\u0430"))) { break lab2; } break lab1; } while (false); cursor = limit - v_2; // literal, line 115 if (!(eq_s_b(1, "\u044F"))) { cursor = limit - v_1; break lab0; } } while (false); // delete, line 115 slice_del(); break; case 2: // (, line 122 // delete, line 122 slice_del(); break; } } while (false); return true; } private boolean r_reflexive() { int among_var; // (, line 128 // [, line 129 ket = cursor; // substring, line 129 among_var = find_among_b(a_3, 2); if (among_var == 0) { return false; } // ], line 129 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 132 // delete, line 132 slice_del(); break; } return true; } private boolean r_verb() { int among_var; int v_1; // (, line 136 // [, line 137 ket = cursor; // substring, line 137 among_var = find_among_b(a_4, 46); if (among_var == 0) { return false; } // ], line 137 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 143 // or, line 143 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 143 if (!(eq_s_b(1, "\u0430"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 143 if (!(eq_s_b(1, "\u044F"))) { return false; } } while (false); // delete, line 143 slice_del(); break; case 2: // (, line 151 // delete, line 151 slice_del(); break; } return true; } private boolean r_noun() { int among_var; // (, line 159 // [, line 160 ket = cursor; // substring, line 160 among_var = find_among_b(a_5, 36); if (among_var == 0) { return false; } // ], line 160 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 167 // delete, line 167 slice_del(); break; } return true; } private boolean r_derivational() { int among_var; // (, line 175 // [, line 176 ket = cursor; // substring, line 176 among_var = find_among_b(a_6, 2); if (among_var == 0) { return false; } // ], line 176 bra = cursor; // call R2, line 176 if (!r_R2()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 179 // delete, line 179 slice_del(); break; } return true; } private boolean r_tidy_up() { int among_var; // (, line 183 // [, line 184 ket = cursor; // substring, line 184 among_var = find_among_b(a_7, 4); if (among_var == 0) { return false; } // ], line 184 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 188 // delete, line 188 slice_del(); // [, line 189 ket = cursor; // literal, line 189 if (!(eq_s_b(1, "\u043D"))) { return false; } // ], line 189 bra = cursor; // literal, line 189 if (!(eq_s_b(1, "\u043D"))) { return false; } // delete, line 189 slice_del(); break; case 2: // (, line 192 // literal, line 192 if (!(eq_s_b(1, "\u043D"))) { return false; } // delete, line 192 slice_del(); break; case 3: // (, line 194 // delete, line 194 slice_del(); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; // (, line 199 // do, line 201 v_1 = cursor; lab0: do { // call mark_regions, line 201 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 202 limit_backward = cursor; cursor = limit; // setlimit, line 202 v_2 = limit - cursor; // tomark, line 202 if (cursor < I_pV) { return false; } cursor = I_pV; v_3 = limit_backward; limit_backward = cursor; cursor = limit - v_2; // (, 
line 202 // do, line 203 v_4 = limit - cursor; lab1: do { // (, line 203 // or, line 204 lab2: do { v_5 = limit - cursor; lab3: do { // call perfective_gerund, line 204 if (!r_perfective_gerund()) { break lab3; } break lab2; } while (false); cursor = limit - v_5; // (, line 205 // try, line 205 v_6 = limit - cursor; lab4: do { // call reflexive, line 205 if (!r_reflexive()) { cursor = limit - v_6; break lab4; } } while (false); // or, line 206 lab5: do { v_7 = limit - cursor; lab6: do { // call adjectival, line 206 if (!r_adjectival()) { break lab6; } break lab5; } while (false); cursor = limit - v_7; lab7: do { // call verb, line 206 if (!r_verb()) { break lab7; } break lab5; } while (false); cursor = limit - v_7; // call noun, line 206 if (!r_noun()) { break lab1; } } while (false); } while (false); } while (false); cursor = limit - v_4; // try, line 209 v_8 = limit - cursor; lab8: do { // (, line 209 // [, line 209 ket = cursor; // literal, line 209 if (!(eq_s_b(1, "\u0438"))) { cursor = limit - v_8; break lab8; } // ], line 209 bra = cursor; // delete, line 209 slice_del(); } while (false); // do, line 212 v_9 = limit - cursor; lab9: do { // call derivational, line 212 if (!r_derivational()) { break lab9; } } while (false); cursor = limit - v_9; // do, line 213 v_10 = limit - cursor; lab10: do { // call tidy_up, line 213 if (!r_tidy_up()) { break lab10; } } while (false); cursor = limit - v_10; limit_backward = v_3; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/KpStemmer.java0000644000175000017500000021721111474320235030300 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class KpStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "nde", -1, 7, "", this), new Among ( "en", -1, 6, "", this), new Among ( "s", -1, 2, "", this), new Among ( "'s", 2, 1, "", this), new Among ( "es", 2, 4, "", this), new Among ( "ies", 4, 3, "", this), new Among ( "aus", 2, 5, "", this) }; private Among a_1[] = { new Among ( "de", -1, 5, "", this), new Among ( "ge", -1, 2, "", this), new Among ( "ische", -1, 4, "", this), new Among ( "je", -1, 1, "", this), new Among ( "lijke", -1, 3, "", this), new Among ( "le", -1, 9, "", this), new Among ( "ene", -1, 10, "", this), new Among ( "re", -1, 8, "", this), new Among ( "se", -1, 7, "", this), new Among ( "te", -1, 6, "", this), new Among ( "ieve", -1, 11, "", this) }; private Among a_2[] = { new Among ( "heid", -1, 3, "", this), new Among ( "fie", -1, 7, "", this), new Among ( "gie", -1, 8, "", this), new Among ( "atie", -1, 1, "", this), new Among ( "isme", -1, 5, "", this), new Among ( "ing", -1, 5, "", this), new Among ( "arij", -1, 6, "", this), new Among ( "erij", -1, 5, "", this), new Among ( "sel", -1, 3, "", this), new Among ( "rder", -1, 4, "", this), new Among ( "ster", -1, 3, "", this), new Among ( "iteit", -1, 2, "", this), new Among ( "dst", -1, 10, "", this), new Among ( "tst", -1, 9, "", this) }; private Among a_3[] = { new Among ( "end", -1, 10, "", this), new Among ( "atief", -1, 2, "", this), new Among ( "erig", -1, 10, "", this), new Among ( "achtig", -1, 9, "", this), new Among ( "ioneel", -1, 1, "", this), new Among ( "baar", -1, 3, "", this), new Among ( "laar", -1, 5, "", this), new Among ( "naar", -1, 4, "", this), new Among ( "raar", -1, 6, "", this), new Among ( "eriger", -1, 10, "", this), new Among ( "achtiger", -1, 9, "", this), new Among ( "lijker", -1, 8, "", this), new Among ( "tant", -1, 7, "", this), new Among ( "erigst", -1, 10, "", this), new Among ( "achtigst", -1, 9, "", this), new Among ( "lijkst", -1, 8, "", this) }; private Among a_4[] = { new Among ( "ig", -1, 1, "", this), new Among ( "iger", -1, 1, "", this), new Among ( "igst", -1, 1, "", this) }; private Among a_5[] = { new Among ( "ft", -1, 2, "", this), new Among ( "kt", -1, 1, "", this), new Among ( "pt", -1, 3, "", this) }; private Among a_6[] = { new Among ( "bb", -1, 1, "", this), new Among ( "cc", -1, 2, "", this), new Among ( "dd", -1, 3, "", this), new Among ( "ff", -1, 4, "", this), new Among ( "gg", -1, 5, "", this), new Among ( "hh", -1, 6, "", this), new Among ( "jj", -1, 7, "", this), new Among ( "kk", -1, 8, "", this), new Among ( "ll", -1, 9, "", this), new Among ( "mm", -1, 10, "", this), new Among ( "nn", -1, 11, "", this), new Among ( "pp", -1, 12, "", this), new Among ( "qq", -1, 13, "", this), new Among ( "rr", -1, 14, "", this), new Among ( "ss", -1, 15, "", this), new Among ( "tt", -1, 16, "", this), new Among ( "v", -1, 21, "", this), new Among ( "vv", 16, 17, "", this), new Among ( "ww", -1, 18, "", this), new Among ( "xx", -1, 19, "", this), new Among ( "z", -1, 22, "", this), new Among ( "zz", 20, 20, "", this) }; private Among a_7[] = { new Among ( "d", -1, 1, "", this), new Among ( "t", -1, 2, "", this) }; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WX[] = {17, 65, 208, 1 }; private static final char g_AOU[] = {1, 64, 16 }; private static final char g_AIOU[] = {1, 65, 16 }; private boolean B_GE_removed; private boolean B_stemmed; private boolean B_Y_found; private int I_p2; private int I_p1; private int I_x; private StringBuffer S_ch = new 
StringBuffer(); private void copy_from(KpStemmer other) { B_GE_removed = other.B_GE_removed; B_stemmed = other.B_stemmed; B_Y_found = other.B_Y_found; I_p2 = other.I_p2; I_p1 = other.I_p1; I_x = other.I_x; S_ch = other.S_ch; super.copy_from(other); } private boolean r_R1() { // (, line 32 // setmark x, line 32 I_x = cursor; if (!(I_x >= I_p1)) { return false; } return true; } private boolean r_R2() { // (, line 33 // setmark x, line 33 I_x = cursor; if (!(I_x >= I_p2)) { return false; } return true; } private boolean r_V() { int v_1; int v_2; // test, line 35 v_1 = limit - cursor; // (, line 35 // or, line 35 lab0: do { v_2 = limit - cursor; lab1: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 35 if (!(eq_s_b(2, "ij"))) { return false; } } while (false); cursor = limit - v_1; return true; } private boolean r_VX() { int v_1; int v_2; // test, line 36 v_1 = limit - cursor; // (, line 36 // next, line 36 if (cursor <= limit_backward) { return false; } cursor--; // or, line 36 lab0: do { v_2 = limit - cursor; lab1: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 36 if (!(eq_s_b(2, "ij"))) { return false; } } while (false); cursor = limit - v_1; return true; } private boolean r_C() { int v_1; int v_2; // test, line 37 v_1 = limit - cursor; // (, line 37 // not, line 37 { v_2 = limit - cursor; lab0: do { // literal, line 37 if (!(eq_s_b(2, "ij"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } if (!(out_grouping_b(g_v, 97, 121))) { return false; } cursor = limit - v_1; return true; } private boolean r_lengthen_V() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; // do, line 39 v_1 = limit - cursor; lab0: do { // (, line 39 if (!(out_grouping_b(g_v_WX, 97, 121))) { break lab0; } // [, line 40 ket = cursor; // or, line 40 lab1: do { v_2 = limit - cursor; lab2: do { // (, line 40 if (!(in_grouping_b(g_AOU, 97, 117))) { break lab2; } // ], line 40 bra = cursor; // test, line 40 v_3 = limit - cursor; // (, line 40 // or, line 40 lab3: do { v_4 = limit - cursor; lab4: do { if (!(out_grouping_b(g_v, 97, 121))) { break lab4; } break lab3; } while (false); cursor = limit - v_4; // atlimit, line 40 if (cursor > limit_backward) { break lab2; } } while (false); cursor = limit - v_3; break lab1; } while (false); cursor = limit - v_2; // (, line 41 // literal, line 41 if (!(eq_s_b(1, "e"))) { break lab0; } // ], line 41 bra = cursor; // test, line 41 v_5 = limit - cursor; // (, line 41 // or, line 41 lab5: do { v_6 = limit - cursor; lab6: do { if (!(out_grouping_b(g_v, 97, 121))) { break lab6; } break lab5; } while (false); cursor = limit - v_6; // atlimit, line 41 if (cursor > limit_backward) { break lab0; } } while (false); // not, line 42 { v_7 = limit - cursor; lab7: do { if (!(in_grouping_b(g_AIOU, 97, 117))) { break lab7; } break lab0; } while (false); cursor = limit - v_7; } // not, line 43 { v_8 = limit - cursor; lab8: do { // (, line 43 // next, line 43 if (cursor <= limit_backward) { break lab8; } cursor--; if (!(in_grouping_b(g_AIOU, 97, 117))) { break lab8; } if (!(out_grouping_b(g_v, 97, 121))) { break lab8; } break lab0; } while (false); cursor = limit - v_8; } cursor = limit - v_5; } while (false); // -> ch, line 44 S_ch = slice_to(S_ch); // <+ ch, line 44 { int c = cursor; insert(cursor, cursor, S_ch); cursor = c; } } while (false); cursor = limit - v_1; return true; } private boolean 
r_Step_1() { int among_var; int v_1; int v_2; int v_3; int v_4; // (, line 48 // [, line 49 ket = cursor; // among, line 49 among_var = find_among_b(a_0, 7); if (among_var == 0) { return false; } // (, line 49 // ], line 49 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 51 // delete, line 51 slice_del(); break; case 2: // (, line 52 // call R1, line 52 if (!r_R1()) { return false; } // not, line 52 { v_1 = limit - cursor; lab0: do { // (, line 52 // literal, line 52 if (!(eq_s_b(1, "t"))) { break lab0; } // call R1, line 52 if (!r_R1()) { break lab0; } return false; } while (false); cursor = limit - v_1; } // call C, line 52 if (!r_C()) { return false; } // delete, line 52 slice_del(); break; case 3: // (, line 53 // call R1, line 53 if (!r_R1()) { return false; } // <-, line 53 slice_from("ie"); break; case 4: // (, line 55 // or, line 55 lab1: do { v_2 = limit - cursor; lab2: do { // (, line 55 // literal, line 55 if (!(eq_s_b(2, "ar"))) { break lab2; } // call R1, line 55 if (!r_R1()) { break lab2; } // call C, line 55 if (!r_C()) { break lab2; } // ], line 55 bra = cursor; // delete, line 55 slice_del(); // call lengthen_V, line 55 if (!r_lengthen_V()) { break lab2; } break lab1; } while (false); cursor = limit - v_2; lab3: do { // (, line 56 // literal, line 56 if (!(eq_s_b(2, "er"))) { break lab3; } // call R1, line 56 if (!r_R1()) { break lab3; } // call C, line 56 if (!r_C()) { break lab3; } // ], line 56 bra = cursor; // delete, line 56 slice_del(); break lab1; } while (false); cursor = limit - v_2; // (, line 57 // call R1, line 57 if (!r_R1()) { return false; } // call C, line 57 if (!r_C()) { return false; } // <-, line 57 slice_from("e"); } while (false); break; case 5: // (, line 59 // call R1, line 59 if (!r_R1()) { return false; } // call V, line 59 if (!r_V()) { return false; } // <-, line 59 slice_from("au"); break; case 6: // (, line 60 // or, line 60 lab4: do { v_3 = limit - cursor; lab5: do { // (, line 60 // literal, line 60 if (!(eq_s_b(3, "hed"))) { break lab5; } // call R1, line 60 if (!r_R1()) { break lab5; } // ], line 60 bra = cursor; // <-, line 60 slice_from("heid"); break lab4; } while (false); cursor = limit - v_3; lab6: do { // (, line 61 // literal, line 61 if (!(eq_s_b(2, "nd"))) { break lab6; } // delete, line 61 slice_del(); break lab4; } while (false); cursor = limit - v_3; lab7: do { // (, line 62 // literal, line 62 if (!(eq_s_b(1, "d"))) { break lab7; } // call R1, line 62 if (!r_R1()) { break lab7; } // call C, line 62 if (!r_C()) { break lab7; } // ], line 62 bra = cursor; // delete, line 62 slice_del(); break lab4; } while (false); cursor = limit - v_3; lab8: do { // (, line 63 // or, line 63 lab9: do { v_4 = limit - cursor; lab10: do { // literal, line 63 if (!(eq_s_b(1, "i"))) { break lab10; } break lab9; } while (false); cursor = limit - v_4; // literal, line 63 if (!(eq_s_b(1, "j"))) { break lab8; } } while (false); // call V, line 63 if (!r_V()) { break lab8; } // delete, line 63 slice_del(); break lab4; } while (false); cursor = limit - v_3; // (, line 64 // call R1, line 64 if (!r_R1()) { return false; } // call C, line 64 if (!r_C()) { return false; } // delete, line 64 slice_del(); // call lengthen_V, line 64 if (!r_lengthen_V()) { return false; } } while (false); break; case 7: // (, line 65 // <-, line 65 slice_from("nd"); break; } return true; } private boolean r_Step_2() { int among_var; int v_1; // (, line 70 // [, line 71 ket = cursor; // among, line 71 among_var = find_among_b(a_1, 11); if (among_var == 
0) { return false; } // (, line 71 // ], line 71 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 72 // or, line 72 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 72 // literal, line 72 if (!(eq_s_b(2, "'t"))) { break lab1; } // ], line 72 bra = cursor; // delete, line 72 slice_del(); break lab0; } while (false); cursor = limit - v_1; lab2: do { // (, line 73 // literal, line 73 if (!(eq_s_b(2, "et"))) { break lab2; } // ], line 73 bra = cursor; // call R1, line 73 if (!r_R1()) { break lab2; } // call C, line 73 if (!r_C()) { break lab2; } // delete, line 73 slice_del(); break lab0; } while (false); cursor = limit - v_1; lab3: do { // (, line 74 // literal, line 74 if (!(eq_s_b(3, "rnt"))) { break lab3; } // ], line 74 bra = cursor; // <-, line 74 slice_from("rn"); break lab0; } while (false); cursor = limit - v_1; lab4: do { // (, line 75 // literal, line 75 if (!(eq_s_b(1, "t"))) { break lab4; } // ], line 75 bra = cursor; // call R1, line 75 if (!r_R1()) { break lab4; } // call VX, line 75 if (!r_VX()) { break lab4; } // delete, line 75 slice_del(); break lab0; } while (false); cursor = limit - v_1; lab5: do { // (, line 76 // literal, line 76 if (!(eq_s_b(3, "ink"))) { break lab5; } // ], line 76 bra = cursor; // <-, line 76 slice_from("ing"); break lab0; } while (false); cursor = limit - v_1; lab6: do { // (, line 77 // literal, line 77 if (!(eq_s_b(2, "mp"))) { break lab6; } // ], line 77 bra = cursor; // <-, line 77 slice_from("m"); break lab0; } while (false); cursor = limit - v_1; lab7: do { // (, line 78 // literal, line 78 if (!(eq_s_b(1, "'"))) { break lab7; } // ], line 78 bra = cursor; // call R1, line 78 if (!r_R1()) { break lab7; } // delete, line 78 slice_del(); break lab0; } while (false); cursor = limit - v_1; // (, line 79 // ], line 79 bra = cursor; // call R1, line 79 if (!r_R1()) { return false; } // call C, line 79 if (!r_C()) { return false; } // delete, line 79 slice_del(); } while (false); break; case 2: // (, line 80 // call R1, line 80 if (!r_R1()) { return false; } // <-, line 80 slice_from("g"); break; case 3: // (, line 81 // call R1, line 81 if (!r_R1()) { return false; } // <-, line 81 slice_from("lijk"); break; case 4: // (, line 82 // call R1, line 82 if (!r_R1()) { return false; } // <-, line 82 slice_from("isch"); break; case 5: // (, line 83 // call R1, line 83 if (!r_R1()) { return false; } // call C, line 83 if (!r_C()) { return false; } // delete, line 83 slice_del(); break; case 6: // (, line 84 // call R1, line 84 if (!r_R1()) { return false; } // <-, line 84 slice_from("t"); break; case 7: // (, line 85 // call R1, line 85 if (!r_R1()) { return false; } // <-, line 85 slice_from("s"); break; case 8: // (, line 86 // call R1, line 86 if (!r_R1()) { return false; } // <-, line 86 slice_from("r"); break; case 9: // (, line 87 // call R1, line 87 if (!r_R1()) { return false; } // delete, line 87 slice_del(); // attach, line 87 insert(cursor, cursor, "l"); // call lengthen_V, line 87 if (!r_lengthen_V()) { return false; } break; case 10: // (, line 88 // call R1, line 88 if (!r_R1()) { return false; } // call C, line 88 if (!r_C()) { return false; } // delete, line 88 slice_del(); // attach, line 88 insert(cursor, cursor, "en"); // call lengthen_V, line 88 if (!r_lengthen_V()) { return false; } break; case 11: // (, line 89 // call R1, line 89 if (!r_R1()) { return false; } // call C, line 89 if (!r_C()) { return false; } // <-, line 89 slice_from("ief"); break; } return true; } private boolean r_Step_3() { int 
among_var; // (, line 94 // [, line 95 ket = cursor; // among, line 95 among_var = find_among_b(a_2, 14); if (among_var == 0) { return false; } // (, line 95 // ], line 95 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 96 // call R1, line 96 if (!r_R1()) { return false; } // <-, line 96 slice_from("eer"); break; case 2: // (, line 97 // call R1, line 97 if (!r_R1()) { return false; } // delete, line 97 slice_del(); // call lengthen_V, line 97 if (!r_lengthen_V()) { return false; } break; case 3: // (, line 100 // call R1, line 100 if (!r_R1()) { return false; } // delete, line 100 slice_del(); break; case 4: // (, line 101 // <-, line 101 slice_from("r"); break; case 5: // (, line 104 // call R1, line 104 if (!r_R1()) { return false; } // delete, line 104 slice_del(); // call lengthen_V, line 104 if (!r_lengthen_V()) { return false; } break; case 6: // (, line 105 // call R1, line 105 if (!r_R1()) { return false; } // call C, line 105 if (!r_C()) { return false; } // <-, line 105 slice_from("aar"); break; case 7: // (, line 106 // call R2, line 106 if (!r_R2()) { return false; } // delete, line 106 slice_del(); // attach, line 106 insert(cursor, cursor, "f"); // call lengthen_V, line 106 if (!r_lengthen_V()) { return false; } break; case 8: // (, line 107 // call R2, line 107 if (!r_R2()) { return false; } // delete, line 107 slice_del(); // attach, line 107 insert(cursor, cursor, "g"); // call lengthen_V, line 107 if (!r_lengthen_V()) { return false; } break; case 9: // (, line 108 // call R1, line 108 if (!r_R1()) { return false; } // call C, line 108 if (!r_C()) { return false; } // <-, line 108 slice_from("t"); break; case 10: // (, line 109 // call R1, line 109 if (!r_R1()) { return false; } // call C, line 109 if (!r_C()) { return false; } // <-, line 109 slice_from("d"); break; } return true; } private boolean r_Step_4() { int among_var; int v_1; // (, line 114 // or, line 134 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 115 // [, line 115 ket = cursor; // among, line 115 among_var = find_among_b(a_3, 16); if (among_var == 0) { break lab1; } // (, line 115 // ], line 115 bra = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 116 // call R1, line 116 if (!r_R1()) { break lab1; } // <-, line 116 slice_from("ie"); break; case 2: // (, line 117 // call R1, line 117 if (!r_R1()) { break lab1; } // <-, line 117 slice_from("eer"); break; case 3: // (, line 118 // call R1, line 118 if (!r_R1()) { break lab1; } // delete, line 118 slice_del(); break; case 4: // (, line 119 // call R1, line 119 if (!r_R1()) { break lab1; } // call V, line 119 if (!r_V()) { break lab1; } // <-, line 119 slice_from("n"); break; case 5: // (, line 120 // call R1, line 120 if (!r_R1()) { break lab1; } // call V, line 120 if (!r_V()) { break lab1; } // <-, line 120 slice_from("l"); break; case 6: // (, line 121 // call R1, line 121 if (!r_R1()) { break lab1; } // call V, line 121 if (!r_V()) { break lab1; } // <-, line 121 slice_from("r"); break; case 7: // (, line 122 // call R1, line 122 if (!r_R1()) { break lab1; } // <-, line 122 slice_from("teer"); break; case 8: // (, line 124 // call R1, line 124 if (!r_R1()) { break lab1; } // <-, line 124 slice_from("lijk"); break; case 9: // (, line 127 // call R1, line 127 if (!r_R1()) { break lab1; } // delete, line 127 slice_del(); break; case 10: // (, line 131 // call R1, line 131 if (!r_R1()) { break lab1; } // call C, line 131 if (!r_C()) { break lab1; } // delete, line 131 slice_del(); // call lengthen_V, 
line 131 if (!r_lengthen_V()) { break lab1; } break; } break lab0; } while (false); cursor = limit - v_1; // (, line 135 // [, line 135 ket = cursor; // among, line 135 among_var = find_among_b(a_4, 3); if (among_var == 0) { return false; } // (, line 135 // ], line 135 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 138 // call R1, line 138 if (!r_R1()) { return false; } // call C, line 138 if (!r_C()) { return false; } // delete, line 138 slice_del(); // call lengthen_V, line 138 if (!r_lengthen_V()) { return false; } break; } } while (false); return true; } private boolean r_Step_7() { int among_var; // (, line 144 // [, line 145 ket = cursor; // among, line 145 among_var = find_among_b(a_5, 3); if (among_var == 0) { return false; } // (, line 145 // ], line 145 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 146 // <-, line 146 slice_from("k"); break; case 2: // (, line 147 // <-, line 147 slice_from("f"); break; case 3: // (, line 148 // <-, line 148 slice_from("p"); break; } return true; } private boolean r_Step_6() { int among_var; // (, line 153 // [, line 154 ket = cursor; // among, line 154 among_var = find_among_b(a_6, 22); if (among_var == 0) { return false; } // (, line 154 // ], line 154 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 155 // <-, line 155 slice_from("b"); break; case 2: // (, line 156 // <-, line 156 slice_from("c"); break; case 3: // (, line 157 // <-, line 157 slice_from("d"); break; case 4: // (, line 158 // <-, line 158 slice_from("f"); break; case 5: // (, line 159 // <-, line 159 slice_from("g"); break; case 6: // (, line 160 // <-, line 160 slice_from("h"); break; case 7: // (, line 161 // <-, line 161 slice_from("j"); break; case 8: // (, line 162 // <-, line 162 slice_from("k"); break; case 9: // (, line 163 // <-, line 163 slice_from("l"); break; case 10: // (, line 164 // <-, line 164 slice_from("m"); break; case 11: // (, line 165 // <-, line 165 slice_from("n"); break; case 12: // (, line 166 // <-, line 166 slice_from("p"); break; case 13: // (, line 167 // <-, line 167 slice_from("q"); break; case 14: // (, line 168 // <-, line 168 slice_from("r"); break; case 15: // (, line 169 // <-, line 169 slice_from("s"); break; case 16: // (, line 170 // <-, line 170 slice_from("t"); break; case 17: // (, line 171 // <-, line 171 slice_from("v"); break; case 18: // (, line 172 // <-, line 172 slice_from("w"); break; case 19: // (, line 173 // <-, line 173 slice_from("x"); break; case 20: // (, line 174 // <-, line 174 slice_from("z"); break; case 21: // (, line 175 // <-, line 175 slice_from("f"); break; case 22: // (, line 176 // <-, line 176 slice_from("s"); break; } return true; } private boolean r_Step_1c() { int among_var; int v_1; int v_2; // (, line 181 // [, line 182 ket = cursor; // among, line 182 among_var = find_among_b(a_7, 2); if (among_var == 0) { return false; } // (, line 182 // ], line 182 bra = cursor; // call R1, line 182 if (!r_R1()) { return false; } // call C, line 182 if (!r_C()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 183 // not, line 183 { v_1 = limit - cursor; lab0: do { // (, line 183 // literal, line 183 if (!(eq_s_b(1, "n"))) { break lab0; } // call R1, line 183 if (!r_R1()) { break lab0; } return false; } while (false); cursor = limit - v_1; } // delete, line 183 slice_del(); break; case 2: // (, line 184 // not, line 184 { v_2 = limit - cursor; lab1: do { // (, line 184 // literal, line 184 if (!(eq_s_b(1, 
"h"))) { break lab1; } // call R1, line 184 if (!r_R1()) { break lab1; } return false; } while (false); cursor = limit - v_2; } // delete, line 184 slice_del(); break; } return true; } private boolean r_Lose_prefix() { int v_1; int v_2; int v_3; // (, line 189 // [, line 190 bra = cursor; // literal, line 190 if (!(eq_s(2, "ge"))) { return false; } // ], line 190 ket = cursor; // test, line 190 v_1 = cursor; // hop, line 190 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } cursor = v_1; // (, line 190 // goto, line 190 golab0: while(true) { v_2 = cursor; lab1: do { if (!(in_grouping(g_v, 97, 121))) { break lab1; } cursor = v_2; break golab0; } while (false); cursor = v_2; if (cursor >= limit) { return false; } cursor++; } // goto, line 190 golab2: while(true) { v_3 = cursor; lab3: do { if (!(out_grouping(g_v, 97, 121))) { break lab3; } cursor = v_3; break golab2; } while (false); cursor = v_3; if (cursor >= limit) { return false; } cursor++; } // set GE_removed, line 191 B_GE_removed = true; // delete, line 192 slice_del(); return true; } private boolean r_Lose_infix() { int v_2; int v_3; int v_4; // (, line 195 // next, line 196 if (cursor >= limit) { return false; } cursor++; // gopast, line 197 golab0: while(true) { lab1: do { // (, line 197 // [, line 197 bra = cursor; // literal, line 197 if (!(eq_s(2, "ge"))) { break lab1; } // ], line 197 ket = cursor; break golab0; } while (false); if (cursor >= limit) { return false; } cursor++; } // test, line 197 v_2 = cursor; // hop, line 197 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } cursor = v_2; // (, line 197 // goto, line 197 golab2: while(true) { v_3 = cursor; lab3: do { if (!(in_grouping(g_v, 97, 121))) { break lab3; } cursor = v_3; break golab2; } while (false); cursor = v_3; if (cursor >= limit) { return false; } cursor++; } // goto, line 197 golab4: while(true) { v_4 = cursor; lab5: do { if (!(out_grouping(g_v, 97, 121))) { break lab5; } cursor = v_4; break golab4; } while (false); cursor = v_4; if (cursor >= limit) { return false; } cursor++; } // set GE_removed, line 198 B_GE_removed = true; // delete, line 199 slice_del(); return true; } private boolean r_measure() { int v_1; int v_2; int v_5; int v_6; int v_9; int v_10; // (, line 202 // do, line 203 v_1 = cursor; lab0: do { // (, line 203 // tolimit, line 204 cursor = limit; // setmark p1, line 205 I_p1 = cursor; // setmark p2, line 206 I_p2 = cursor; } while (false); cursor = v_1; // do, line 208 v_2 = cursor; lab1: do { // (, line 208 // repeat, line 209 replab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 121))) { break lab3; } continue replab2; } while (false); break replab2; } // atleast, line 209 { int v_4 = 1; // atleast, line 209 replab4: while(true) { v_5 = cursor; lab5: do { // (, line 209 // or, line 209 lab6: do { v_6 = cursor; lab7: do { // literal, line 209 if (!(eq_s(2, "ij"))) { break lab7; } break lab6; } while (false); cursor = v_6; if (!(in_grouping(g_v, 97, 121))) { break lab5; } } while (false); v_4--; continue replab4; } while (false); cursor = v_5; break replab4; } if (v_4 > 0) { break lab1; } } if (!(out_grouping(g_v, 97, 121))) { break lab1; } // setmark p1, line 209 I_p1 = cursor; // repeat, line 210 replab8: while(true) { lab9: do { if (!(out_grouping(g_v, 97, 121))) { break lab9; } continue replab8; } while (false); break replab8; } // atleast, line 210 { int v_8 = 1; // atleast, line 210 replab10: while(true) { v_9 = cursor; lab11: do { // (, line 210 // or, line 210 lab12: do { 
v_10 = cursor; lab13: do { // literal, line 210 if (!(eq_s(2, "ij"))) { break lab13; } break lab12; } while (false); cursor = v_10; if (!(in_grouping(g_v, 97, 121))) { break lab11; } } while (false); v_8--; continue replab10; } while (false); cursor = v_9; break replab10; } if (v_8 > 0) { break lab1; } } if (!(out_grouping(g_v, 97, 121))) { break lab1; } // setmark p2, line 210 I_p2 = cursor; } while (false); cursor = v_2; return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; int v_12; int v_13; int v_14; int v_15; int v_16; int v_18; int v_19; int v_20; // (, line 214 // unset Y_found, line 216 B_Y_found = false; // unset stemmed, line 217 B_stemmed = false; // do, line 218 v_1 = cursor; lab0: do { // (, line 218 // [, line 218 bra = cursor; // literal, line 218 if (!(eq_s(1, "y"))) { break lab0; } // ], line 218 ket = cursor; // <-, line 218 slice_from("Y"); // set Y_found, line 218 B_Y_found = true; } while (false); cursor = v_1; // do, line 219 v_2 = cursor; lab1: do { // repeat, line 219 replab2: while(true) { v_3 = cursor; lab3: do { // (, line 219 // goto, line 219 golab4: while(true) { v_4 = cursor; lab5: do { // (, line 219 if (!(in_grouping(g_v, 97, 121))) { break lab5; } // [, line 219 bra = cursor; // literal, line 219 if (!(eq_s(1, "y"))) { break lab5; } // ], line 219 ket = cursor; cursor = v_4; break golab4; } while (false); cursor = v_4; if (cursor >= limit) { break lab3; } cursor++; } // <-, line 219 slice_from("Y"); // set Y_found, line 219 B_Y_found = true; continue replab2; } while (false); cursor = v_3; break replab2; } } while (false); cursor = v_2; // call measure, line 221 if (!r_measure()) { return false; } // backwards, line 223 limit_backward = cursor; cursor = limit; // (, line 223 // do, line 224 v_5 = limit - cursor; lab6: do { // (, line 224 // call Step_1, line 224 if (!r_Step_1()) { break lab6; } // set stemmed, line 224 B_stemmed = true; } while (false); cursor = limit - v_5; // do, line 225 v_6 = limit - cursor; lab7: do { // (, line 225 // call Step_2, line 225 if (!r_Step_2()) { break lab7; } // set stemmed, line 225 B_stemmed = true; } while (false); cursor = limit - v_6; // do, line 226 v_7 = limit - cursor; lab8: do { // (, line 226 // call Step_3, line 226 if (!r_Step_3()) { break lab8; } // set stemmed, line 226 B_stemmed = true; } while (false); cursor = limit - v_7; // do, line 227 v_8 = limit - cursor; lab9: do { // (, line 227 // call Step_4, line 227 if (!r_Step_4()) { break lab9; } // set stemmed, line 227 B_stemmed = true; } while (false); cursor = limit - v_8; cursor = limit_backward; // unset GE_removed, line 229 B_GE_removed = false; // do, line 230 v_9 = cursor; lab10: do { // (, line 230 // and, line 230 v_10 = cursor; // call Lose_prefix, line 230 if (!r_Lose_prefix()) { break lab10; } cursor = v_10; // call measure, line 230 if (!r_measure()) { break lab10; } } while (false); cursor = v_9; // backwards, line 231 limit_backward = cursor; cursor = limit; // (, line 231 // do, line 232 v_11 = limit - cursor; lab11: do { // (, line 232 // Boolean test GE_removed, line 232 if (!(B_GE_removed)) { break lab11; } // call Step_1c, line 232 if (!r_Step_1c()) { break lab11; } } while (false); cursor = limit - v_11; cursor = limit_backward; // unset GE_removed, line 234 B_GE_removed = false; // do, line 235 v_12 = cursor; lab12: do { // (, line 235 // and, line 235 v_13 = cursor; // call Lose_infix, line 235 if (!r_Lose_infix()) { break lab12; } cursor = v_13; 
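// Note: Lose_prefix/Lose_infix remove a leading or embedded "ge" and set the GE_removed flag;
// the word is then re-measured so p1/p2 reflect the shortened form before Step_1c is applied
// under that flag in the backwards sections that follow.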
// call measure, line 235 if (!r_measure()) { break lab12; } } while (false); cursor = v_12; // backwards, line 236 limit_backward = cursor; cursor = limit; // (, line 236 // do, line 237 v_14 = limit - cursor; lab13: do { // (, line 237 // Boolean test GE_removed, line 237 if (!(B_GE_removed)) { break lab13; } // call Step_1c, line 237 if (!r_Step_1c()) { break lab13; } } while (false); cursor = limit - v_14; cursor = limit_backward; // backwards, line 239 limit_backward = cursor; cursor = limit; // (, line 239 // do, line 240 v_15 = limit - cursor; lab14: do { // (, line 240 // call Step_7, line 240 if (!r_Step_7()) { break lab14; } // set stemmed, line 240 B_stemmed = true; } while (false); cursor = limit - v_15; // do, line 241 v_16 = limit - cursor; lab15: do { // (, line 241 // or, line 241 lab16: do { lab17: do { // Boolean test stemmed, line 241 if (!(B_stemmed)) { break lab17; } break lab16; } while (false); // Boolean test GE_removed, line 241 if (!(B_GE_removed)) { break lab15; } } while (false); // call Step_6, line 241 if (!r_Step_6()) { break lab15; } } while (false); cursor = limit - v_16; cursor = limit_backward; // do, line 243 v_18 = cursor; lab18: do { // (, line 243 // Boolean test Y_found, line 243 if (!(B_Y_found)) { break lab18; } // repeat, line 243 replab19: while(true) { v_19 = cursor; lab20: do { // (, line 243 // goto, line 243 golab21: while(true) { v_20 = cursor; lab22: do { // (, line 243 // [, line 243 bra = cursor; // literal, line 243 if (!(eq_s(1, "Y"))) { break lab22; } // ], line 243 ket = cursor; cursor = v_20; break golab21; } while (false); cursor = v_20; if (cursor >= limit) { break lab20; } cursor++; } // <-, line 243 slice_from("y"); continue replab19; } while (false); cursor = v_19; break replab19; } } while (false); cursor = v_18; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/FrenchStemmer.java0000644000175000017500000015373011474320235031140 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
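 * <p>
 * A minimal usage sketch, assuming the <code>setCurrent</code>/<code>getCurrent</code>
 * accessors inherited from {@link org.tartarus.snowball.SnowballProgram}; within Lucene the
 * class is normally driven indirectly through the contrib
 * <code>org.apache.lucene.analysis.snowball</code> filter rather than called directly:
 * </p>
 * <pre>
 *   FrenchStemmer stemmer = new FrenchStemmer();
 *   stemmer.setCurrent("chanteraient");      // word to stem (illustrative input)
 *   stemmer.stem();                          // prelude, region marking, suffix steps, postlude
 *   String stemmed = stemmer.getCurrent();   // stemmed form
 * </pre>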
*/ public class FrenchStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "col", -1, -1, "", this), new Among ( "par", -1, -1, "", this), new Among ( "tap", -1, -1, "", this) }; private Among a_1[] = { new Among ( "", -1, 4, "", this), new Among ( "I", 0, 1, "", this), new Among ( "U", 0, 2, "", this), new Among ( "Y", 0, 3, "", this) }; private Among a_2[] = { new Among ( "iqU", -1, 3, "", this), new Among ( "abl", -1, 3, "", this), new Among ( "I\u00E8r", -1, 4, "", this), new Among ( "i\u00E8r", -1, 4, "", this), new Among ( "eus", -1, 2, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_3[] = { new Among ( "ic", -1, 2, "", this), new Among ( "abil", -1, 1, "", this), new Among ( "iv", -1, 3, "", this) }; private Among a_4[] = { new Among ( "iqUe", -1, 1, "", this), new Among ( "atrice", -1, 2, "", this), new Among ( "ance", -1, 1, "", this), new Among ( "ence", -1, 5, "", this), new Among ( "logie", -1, 3, "", this), new Among ( "able", -1, 1, "", this), new Among ( "isme", -1, 1, "", this), new Among ( "euse", -1, 11, "", this), new Among ( "iste", -1, 1, "", this), new Among ( "ive", -1, 8, "", this), new Among ( "if", -1, 8, "", this), new Among ( "usion", -1, 4, "", this), new Among ( "ation", -1, 2, "", this), new Among ( "ution", -1, 4, "", this), new Among ( "ateur", -1, 2, "", this), new Among ( "iqUes", -1, 1, "", this), new Among ( "atrices", -1, 2, "", this), new Among ( "ances", -1, 1, "", this), new Among ( "ences", -1, 5, "", this), new Among ( "logies", -1, 3, "", this), new Among ( "ables", -1, 1, "", this), new Among ( "ismes", -1, 1, "", this), new Among ( "euses", -1, 11, "", this), new Among ( "istes", -1, 1, "", this), new Among ( "ives", -1, 8, "", this), new Among ( "ifs", -1, 8, "", this), new Among ( "usions", -1, 4, "", this), new Among ( "ations", -1, 2, "", this), new Among ( "utions", -1, 4, "", this), new Among ( "ateurs", -1, 2, "", this), new Among ( "ments", -1, 15, "", this), new Among ( "ements", 30, 6, "", this), new Among ( "issements", 31, 12, "", this), new Among ( "it\u00E9s", -1, 7, "", this), new Among ( "ment", -1, 15, "", this), new Among ( "ement", 34, 6, "", this), new Among ( "issement", 35, 12, "", this), new Among ( "amment", 34, 13, "", this), new Among ( "emment", 34, 14, "", this), new Among ( "aux", -1, 10, "", this), new Among ( "eaux", 39, 9, "", this), new Among ( "eux", -1, 1, "", this), new Among ( "it\u00E9", -1, 7, "", this) }; private Among a_5[] = { new Among ( "ira", -1, 1, "", this), new Among ( "ie", -1, 1, "", this), new Among ( "isse", -1, 1, "", this), new Among ( "issante", -1, 1, "", this), new Among ( "i", -1, 1, "", this), new Among ( "irai", 4, 1, "", this), new Among ( "ir", -1, 1, "", this), new Among ( "iras", -1, 1, "", this), new Among ( "ies", -1, 1, "", this), new Among ( "\u00EEmes", -1, 1, "", this), new Among ( "isses", -1, 1, "", this), new Among ( "issantes", -1, 1, "", this), new Among ( "\u00EEtes", -1, 1, "", this), new Among ( "is", -1, 1, "", this), new Among ( "irais", 13, 1, "", this), new Among ( "issais", 13, 1, "", this), new Among ( "irions", -1, 1, "", this), new Among ( "issions", -1, 1, "", this), new Among ( "irons", -1, 1, "", this), new Among ( "issons", -1, 1, "", this), new Among ( "issants", -1, 1, "", this), new Among ( "it", -1, 1, "", this), new Among ( "irait", 21, 1, "", this), new Among ( "issait", 21, 1, "", this), new Among ( "issant", -1, 1, "", this), new Among ( "iraIent", -1, 1, "", this), new Among ( "issaIent", -1, 1, "", this), new 
Among ( "irent", -1, 1, "", this), new Among ( "issent", -1, 1, "", this), new Among ( "iront", -1, 1, "", this), new Among ( "\u00EEt", -1, 1, "", this), new Among ( "iriez", -1, 1, "", this), new Among ( "issiez", -1, 1, "", this), new Among ( "irez", -1, 1, "", this), new Among ( "issez", -1, 1, "", this) }; private Among a_6[] = { new Among ( "a", -1, 3, "", this), new Among ( "era", 0, 2, "", this), new Among ( "asse", -1, 3, "", this), new Among ( "ante", -1, 3, "", this), new Among ( "\u00E9e", -1, 2, "", this), new Among ( "ai", -1, 3, "", this), new Among ( "erai", 5, 2, "", this), new Among ( "er", -1, 2, "", this), new Among ( "as", -1, 3, "", this), new Among ( "eras", 8, 2, "", this), new Among ( "\u00E2mes", -1, 3, "", this), new Among ( "asses", -1, 3, "", this), new Among ( "antes", -1, 3, "", this), new Among ( "\u00E2tes", -1, 3, "", this), new Among ( "\u00E9es", -1, 2, "", this), new Among ( "ais", -1, 3, "", this), new Among ( "erais", 15, 2, "", this), new Among ( "ions", -1, 1, "", this), new Among ( "erions", 17, 2, "", this), new Among ( "assions", 17, 3, "", this), new Among ( "erons", -1, 2, "", this), new Among ( "ants", -1, 3, "", this), new Among ( "\u00E9s", -1, 2, "", this), new Among ( "ait", -1, 3, "", this), new Among ( "erait", 23, 2, "", this), new Among ( "ant", -1, 3, "", this), new Among ( "aIent", -1, 3, "", this), new Among ( "eraIent", 26, 2, "", this), new Among ( "\u00E8rent", -1, 2, "", this), new Among ( "assent", -1, 3, "", this), new Among ( "eront", -1, 2, "", this), new Among ( "\u00E2t", -1, 3, "", this), new Among ( "ez", -1, 2, "", this), new Among ( "iez", 32, 2, "", this), new Among ( "eriez", 33, 2, "", this), new Among ( "assiez", 33, 3, "", this), new Among ( "erez", 32, 2, "", this), new Among ( "\u00E9", -1, 2, "", this) }; private Among a_7[] = { new Among ( "e", -1, 3, "", this), new Among ( "I\u00E8re", 0, 2, "", this), new Among ( "i\u00E8re", 0, 2, "", this), new Among ( "ion", -1, 1, "", this), new Among ( "Ier", -1, 2, "", this), new Among ( "ier", -1, 2, "", this), new Among ( "\u00EB", -1, 4, "", this) }; private Among a_8[] = { new Among ( "ell", -1, -1, "", this), new Among ( "eill", -1, -1, "", this), new Among ( "enn", -1, -1, "", this), new Among ( "onn", -1, -1, "", this), new Among ( "ett", -1, -1, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 130, 103, 8, 5 }; private static final char g_keep_with_s[] = {1, 65, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 }; private int I_p2; private int I_p1; private int I_pV; private void copy_from(FrenchStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; super.copy_from(other); } private boolean r_prelude() { int v_1; int v_2; int v_3; int v_4; // repeat, line 38 replab0: while(true) { v_1 = cursor; lab1: do { // goto, line 38 golab2: while(true) { v_2 = cursor; lab3: do { // (, line 38 // or, line 44 lab4: do { v_3 = cursor; lab5: do { // (, line 40 if (!(in_grouping(g_v, 97, 251))) { break lab5; } // [, line 40 bra = cursor; // or, line 40 lab6: do { v_4 = cursor; lab7: do { // (, line 40 // literal, line 40 if (!(eq_s(1, "u"))) { break lab7; } // ], line 40 ket = cursor; if (!(in_grouping(g_v, 97, 251))) { break lab7; } // <-, line 40 slice_from("U"); break lab6; } while (false); cursor = v_4; lab8: do { // (, line 41 // literal, line 41 if (!(eq_s(1, "i"))) { break lab8; } // ], line 41 ket = cursor; if (!(in_grouping(g_v, 97, 251))) { break lab8; } // <-, line 41 slice_from("I"); break lab6; } 
while (false); cursor = v_4; // (, line 42 // literal, line 42 if (!(eq_s(1, "y"))) { break lab5; } // ], line 42 ket = cursor; // <-, line 42 slice_from("Y"); } while (false); break lab4; } while (false); cursor = v_3; lab9: do { // (, line 45 // [, line 45 bra = cursor; // literal, line 45 if (!(eq_s(1, "y"))) { break lab9; } // ], line 45 ket = cursor; if (!(in_grouping(g_v, 97, 251))) { break lab9; } // <-, line 45 slice_from("Y"); break lab4; } while (false); cursor = v_3; // (, line 47 // literal, line 47 if (!(eq_s(1, "q"))) { break lab3; } // [, line 47 bra = cursor; // literal, line 47 if (!(eq_s(1, "u"))) { break lab3; } // ], line 47 ket = cursor; // <-, line 47 slice_from("U"); } while (false); cursor = v_2; break golab2; } while (false); cursor = v_2; if (cursor >= limit) { break lab1; } cursor++; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_mark_regions() { int v_1; int v_2; int v_4; // (, line 50 I_pV = limit; I_p1 = limit; I_p2 = limit; // do, line 56 v_1 = cursor; lab0: do { // (, line 56 // or, line 58 lab1: do { v_2 = cursor; lab2: do { // (, line 57 if (!(in_grouping(g_v, 97, 251))) { break lab2; } if (!(in_grouping(g_v, 97, 251))) { break lab2; } // next, line 57 if (cursor >= limit) { break lab2; } cursor++; break lab1; } while (false); cursor = v_2; lab3: do { // among, line 59 if (find_among(a_0, 3) == 0) { break lab3; } break lab1; } while (false); cursor = v_2; // (, line 66 // next, line 66 if (cursor >= limit) { break lab0; } cursor++; // gopast, line 66 golab4: while(true) { lab5: do { if (!(in_grouping(g_v, 97, 251))) { break lab5; } break golab4; } while (false); if (cursor >= limit) { break lab0; } cursor++; } } while (false); // setmark pV, line 67 I_pV = cursor; } while (false); cursor = v_1; // do, line 69 v_4 = cursor; lab6: do { // (, line 69 // gopast, line 70 golab7: while(true) { lab8: do { if (!(in_grouping(g_v, 97, 251))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // gopast, line 70 golab9: while(true) { lab10: do { if (!(out_grouping(g_v, 97, 251))) { break lab10; } break golab9; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // setmark p1, line 70 I_p1 = cursor; // gopast, line 71 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 251))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // gopast, line 71 golab13: while(true) { lab14: do { if (!(out_grouping(g_v, 97, 251))) { break lab14; } break golab13; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // setmark p2, line 71 I_p2 = cursor; } while (false); cursor = v_4; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 75 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 75 // [, line 77 bra = cursor; // substring, line 77 among_var = find_among(a_1, 4); if (among_var == 0) { break lab1; } // ], line 77 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 78 // <-, line 78 slice_from("i"); break; case 2: // (, line 79 // <-, line 79 slice_from("u"); break; case 3: // (, line 80 // <-, line 80 slice_from("y"); break; case 4: // (, line 81 // next, line 81 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_RV() { if (!(I_pV <= cursor)) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; 
} return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; // (, line 91 // [, line 92 ket = cursor; // substring, line 92 among_var = find_among_b(a_4, 43); if (among_var == 0) { return false; } // ], line 92 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 96 // call R2, line 96 if (!r_R2()) { return false; } // delete, line 96 slice_del(); break; case 2: // (, line 99 // call R2, line 99 if (!r_R2()) { return false; } // delete, line 99 slice_del(); // try, line 100 v_1 = limit - cursor; lab0: do { // (, line 100 // [, line 100 ket = cursor; // literal, line 100 if (!(eq_s_b(2, "ic"))) { cursor = limit - v_1; break lab0; } // ], line 100 bra = cursor; // or, line 100 lab1: do { v_2 = limit - cursor; lab2: do { // (, line 100 // call R2, line 100 if (!r_R2()) { break lab2; } // delete, line 100 slice_del(); break lab1; } while (false); cursor = limit - v_2; // <-, line 100 slice_from("iqU"); } while (false); } while (false); break; case 3: // (, line 104 // call R2, line 104 if (!r_R2()) { return false; } // <-, line 104 slice_from("log"); break; case 4: // (, line 107 // call R2, line 107 if (!r_R2()) { return false; } // <-, line 107 slice_from("u"); break; case 5: // (, line 110 // call R2, line 110 if (!r_R2()) { return false; } // <-, line 110 slice_from("ent"); break; case 6: // (, line 113 // call RV, line 114 if (!r_RV()) { return false; } // delete, line 114 slice_del(); // try, line 115 v_3 = limit - cursor; lab3: do { // (, line 115 // [, line 116 ket = cursor; // substring, line 116 among_var = find_among_b(a_2, 6); if (among_var == 0) { cursor = limit - v_3; break lab3; } // ], line 116 bra = cursor; switch(among_var) { case 0: cursor = limit - v_3; break lab3; case 1: // (, line 117 // call R2, line 117 if (!r_R2()) { cursor = limit - v_3; break lab3; } // delete, line 117 slice_del(); // [, line 117 ket = cursor; // literal, line 117 if (!(eq_s_b(2, "at"))) { cursor = limit - v_3; break lab3; } // ], line 117 bra = cursor; // call R2, line 117 if (!r_R2()) { cursor = limit - v_3; break lab3; } // delete, line 117 slice_del(); break; case 2: // (, line 118 // or, line 118 lab4: do { v_4 = limit - cursor; lab5: do { // (, line 118 // call R2, line 118 if (!r_R2()) { break lab5; } // delete, line 118 slice_del(); break lab4; } while (false); cursor = limit - v_4; // (, line 118 // call R1, line 118 if (!r_R1()) { cursor = limit - v_3; break lab3; } // <-, line 118 slice_from("eux"); } while (false); break; case 3: // (, line 120 // call R2, line 120 if (!r_R2()) { cursor = limit - v_3; break lab3; } // delete, line 120 slice_del(); break; case 4: // (, line 122 // call RV, line 122 if (!r_RV()) { cursor = limit - v_3; break lab3; } // <-, line 122 slice_from("i"); break; } } while (false); break; case 7: // (, line 128 // call R2, line 129 if (!r_R2()) { return false; } // delete, line 129 slice_del(); // try, line 130 v_5 = limit - cursor; lab6: do { // (, line 130 // [, line 131 ket = cursor; // substring, line 131 among_var = find_among_b(a_3, 3); if (among_var == 0) { cursor = limit - v_5; break lab6; } // ], line 131 bra = cursor; switch(among_var) { case 0: cursor = limit - v_5; break lab6; case 1: // (, line 132 // or, line 132 lab7: do { v_6 = limit - cursor; lab8: do { // (, line 132 // call R2, line 132 if (!r_R2()) { break lab8; } // delete, line 132 
slice_del(); break lab7; } while (false); cursor = limit - v_6; // <-, line 132 slice_from("abl"); } while (false); break; case 2: // (, line 133 // or, line 133 lab9: do { v_7 = limit - cursor; lab10: do { // (, line 133 // call R2, line 133 if (!r_R2()) { break lab10; } // delete, line 133 slice_del(); break lab9; } while (false); cursor = limit - v_7; // <-, line 133 slice_from("iqU"); } while (false); break; case 3: // (, line 134 // call R2, line 134 if (!r_R2()) { cursor = limit - v_5; break lab6; } // delete, line 134 slice_del(); break; } } while (false); break; case 8: // (, line 140 // call R2, line 141 if (!r_R2()) { return false; } // delete, line 141 slice_del(); // try, line 142 v_8 = limit - cursor; lab11: do { // (, line 142 // [, line 142 ket = cursor; // literal, line 142 if (!(eq_s_b(2, "at"))) { cursor = limit - v_8; break lab11; } // ], line 142 bra = cursor; // call R2, line 142 if (!r_R2()) { cursor = limit - v_8; break lab11; } // delete, line 142 slice_del(); // [, line 142 ket = cursor; // literal, line 142 if (!(eq_s_b(2, "ic"))) { cursor = limit - v_8; break lab11; } // ], line 142 bra = cursor; // or, line 142 lab12: do { v_9 = limit - cursor; lab13: do { // (, line 142 // call R2, line 142 if (!r_R2()) { break lab13; } // delete, line 142 slice_del(); break lab12; } while (false); cursor = limit - v_9; // <-, line 142 slice_from("iqU"); } while (false); } while (false); break; case 9: // (, line 144 // <-, line 144 slice_from("eau"); break; case 10: // (, line 145 // call R1, line 145 if (!r_R1()) { return false; } // <-, line 145 slice_from("al"); break; case 11: // (, line 147 // or, line 147 lab14: do { v_10 = limit - cursor; lab15: do { // (, line 147 // call R2, line 147 if (!r_R2()) { break lab15; } // delete, line 147 slice_del(); break lab14; } while (false); cursor = limit - v_10; // (, line 147 // call R1, line 147 if (!r_R1()) { return false; } // <-, line 147 slice_from("eux"); } while (false); break; case 12: // (, line 150 // call R1, line 150 if (!r_R1()) { return false; } if (!(out_grouping_b(g_v, 97, 251))) { return false; } // delete, line 150 slice_del(); break; case 13: // (, line 155 // call RV, line 155 if (!r_RV()) { return false; } // fail, line 155 // (, line 155 // <-, line 155 slice_from("ant"); return false; case 14: // (, line 156 // call RV, line 156 if (!r_RV()) { return false; } // fail, line 156 // (, line 156 // <-, line 156 slice_from("ent"); return false; case 15: // (, line 158 // test, line 158 v_11 = limit - cursor; // (, line 158 if (!(in_grouping_b(g_v, 97, 251))) { return false; } // call RV, line 158 if (!r_RV()) { return false; } cursor = limit - v_11; // fail, line 158 // (, line 158 // delete, line 158 slice_del(); return false; } return true; } private boolean r_i_verb_suffix() { int among_var; int v_1; int v_2; // setlimit, line 163 v_1 = limit - cursor; // tomark, line 163 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 163 // [, line 164 ket = cursor; // substring, line 164 among_var = find_among_b(a_5, 35); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 164 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 170 if (!(out_grouping_b(g_v, 97, 251))) { limit_backward = v_2; return false; } // delete, line 170 slice_del(); break; } limit_backward = v_2; return true; } private boolean r_verb_suffix() { int among_var; int v_1; int v_2; int v_3; // setlimit, line 
174 v_1 = limit - cursor; // tomark, line 174 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 174 // [, line 175 ket = cursor; // substring, line 175 among_var = find_among_b(a_6, 38); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 175 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 177 // call R2, line 177 if (!r_R2()) { limit_backward = v_2; return false; } // delete, line 177 slice_del(); break; case 2: // (, line 185 // delete, line 185 slice_del(); break; case 3: // (, line 190 // delete, line 190 slice_del(); // try, line 191 v_3 = limit - cursor; lab0: do { // (, line 191 // [, line 191 ket = cursor; // literal, line 191 if (!(eq_s_b(1, "e"))) { cursor = limit - v_3; break lab0; } // ], line 191 bra = cursor; // delete, line 191 slice_del(); } while (false); break; } limit_backward = v_2; return true; } private boolean r_residual_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 198 // try, line 199 v_1 = limit - cursor; lab0: do { // (, line 199 // [, line 199 ket = cursor; // literal, line 199 if (!(eq_s_b(1, "s"))) { cursor = limit - v_1; break lab0; } // ], line 199 bra = cursor; // test, line 199 v_2 = limit - cursor; if (!(out_grouping_b(g_keep_with_s, 97, 232))) { cursor = limit - v_1; break lab0; } cursor = limit - v_2; // delete, line 199 slice_del(); } while (false); // setlimit, line 200 v_3 = limit - cursor; // tomark, line 200 if (cursor < I_pV) { return false; } cursor = I_pV; v_4 = limit_backward; limit_backward = cursor; cursor = limit - v_3; // (, line 200 // [, line 201 ket = cursor; // substring, line 201 among_var = find_among_b(a_7, 7); if (among_var == 0) { limit_backward = v_4; return false; } // ], line 201 bra = cursor; switch(among_var) { case 0: limit_backward = v_4; return false; case 1: // (, line 202 // call R2, line 202 if (!r_R2()) { limit_backward = v_4; return false; } // or, line 202 lab1: do { v_5 = limit - cursor; lab2: do { // literal, line 202 if (!(eq_s_b(1, "s"))) { break lab2; } break lab1; } while (false); cursor = limit - v_5; // literal, line 202 if (!(eq_s_b(1, "t"))) { limit_backward = v_4; return false; } } while (false); // delete, line 202 slice_del(); break; case 2: // (, line 204 // <-, line 204 slice_from("i"); break; case 3: // (, line 205 // delete, line 205 slice_del(); break; case 4: // (, line 206 // literal, line 206 if (!(eq_s_b(2, "gu"))) { limit_backward = v_4; return false; } // delete, line 206 slice_del(); break; } limit_backward = v_4; return true; } private boolean r_un_double() { int v_1; // (, line 211 // test, line 212 v_1 = limit - cursor; // among, line 212 if (find_among_b(a_8, 5) == 0) { return false; } cursor = limit - v_1; // [, line 212 ket = cursor; // next, line 212 if (cursor <= limit_backward) { return false; } cursor--; // ], line 212 bra = cursor; // delete, line 212 slice_del(); return true; } private boolean r_un_accent() { int v_3; // (, line 215 // atleast, line 216 { int v_1 = 1; // atleast, line 216 replab0: while(true) { lab1: do { if (!(out_grouping_b(g_v, 97, 251))) { break lab1; } v_1--; continue replab0; } while (false); break replab0; } if (v_1 > 0) { return false; } } // [, line 217 ket = cursor; // or, line 217 lab2: do { v_3 = limit - cursor; lab3: do { // literal, line 217 if (!(eq_s_b(1, "\u00E9"))) { break lab3; } break lab2; } while (false); cursor = limit - v_3; // literal, line 217 if 
(!(eq_s_b(1, "\u00E8"))) { return false; } } while (false); // ], line 217 bra = cursor; // <-, line 217 slice_from("e"); return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; // (, line 221 // do, line 223 v_1 = cursor; lab0: do { // call prelude, line 223 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 224 v_2 = cursor; lab1: do { // call mark_regions, line 224 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 225 limit_backward = cursor; cursor = limit; // (, line 225 // do, line 227 v_3 = limit - cursor; lab2: do { // (, line 227 // or, line 237 lab3: do { v_4 = limit - cursor; lab4: do { // (, line 228 // and, line 233 v_5 = limit - cursor; // (, line 229 // or, line 229 lab5: do { v_6 = limit - cursor; lab6: do { // call standard_suffix, line 229 if (!r_standard_suffix()) { break lab6; } break lab5; } while (false); cursor = limit - v_6; lab7: do { // call i_verb_suffix, line 230 if (!r_i_verb_suffix()) { break lab7; } break lab5; } while (false); cursor = limit - v_6; // call verb_suffix, line 231 if (!r_verb_suffix()) { break lab4; } } while (false); cursor = limit - v_5; // try, line 234 v_7 = limit - cursor; lab8: do { // (, line 234 // [, line 234 ket = cursor; // or, line 234 lab9: do { v_8 = limit - cursor; lab10: do { // (, line 234 // literal, line 234 if (!(eq_s_b(1, "Y"))) { break lab10; } // ], line 234 bra = cursor; // <-, line 234 slice_from("i"); break lab9; } while (false); cursor = limit - v_8; // (, line 235 // literal, line 235 if (!(eq_s_b(1, "\u00E7"))) { cursor = limit - v_7; break lab8; } // ], line 235 bra = cursor; // <-, line 235 slice_from("c"); } while (false); } while (false); break lab3; } while (false); cursor = limit - v_4; // call residual_suffix, line 238 if (!r_residual_suffix()) { break lab2; } } while (false); } while (false); cursor = limit - v_3; // do, line 243 v_9 = limit - cursor; lab11: do { // call un_double, line 243 if (!r_un_double()) { break lab11; } } while (false); cursor = limit - v_9; // do, line 244 v_10 = limit - cursor; lab12: do { // call un_accent, line 244 if (!r_un_accent()) { break lab12; } } while (false); cursor = limit - v_10; cursor = limit_backward; // do, line 246 v_11 = cursor; lab13: do { // call postlude, line 246 if (!r_postlude()) { break lab13; } } while (false); cursor = v_11; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/German2Stemmer.java0000644000175000017500000006040411474320235031221 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
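 * <p>
 * The "German2" variant additionally folds the digraphs <code>ae</code>, <code>oe</code> and
 * <code>ue</code> (except after <code>q</code>) to umlaut forms and <code>&szlig;</code> to
 * <code>ss</code> in the prelude, then maps the umlauts back to plain vowels in the postlude.
 * A sketch of wiring it into an analysis chain, assuming the
 * <code>SnowballFilter(TokenStream, String)</code> constructor from
 * <code>org.apache.lucene.analysis.snowball</code>:
 * </p>
 * <pre>
 *   TokenStream tokens = new WhitespaceTokenizer(reader);         // 'reader' supplies the text
 *   TokenStream stemmed = new SnowballFilter(tokens, "German2");  // resolves to this class by name
 * </pre>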
*/ public class German2Stemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 6, "", this), new Among ( "ae", 0, 2, "", this), new Among ( "oe", 0, 3, "", this), new Among ( "qu", 0, 5, "", this), new Among ( "ue", 0, 4, "", this), new Among ( "\u00DF", 0, 1, "", this) }; private Among a_1[] = { new Among ( "", -1, 6, "", this), new Among ( "U", 0, 2, "", this), new Among ( "Y", 0, 1, "", this), new Among ( "\u00E4", 0, 3, "", this), new Among ( "\u00F6", 0, 4, "", this), new Among ( "\u00FC", 0, 5, "", this) }; private Among a_2[] = { new Among ( "e", -1, 1, "", this), new Among ( "em", -1, 1, "", this), new Among ( "en", -1, 1, "", this), new Among ( "ern", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "s", -1, 2, "", this), new Among ( "es", 5, 1, "", this) }; private Among a_3[] = { new Among ( "en", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "st", -1, 2, "", this), new Among ( "est", 2, 1, "", this) }; private Among a_4[] = { new Among ( "ig", -1, 1, "", this), new Among ( "lich", -1, 1, "", this) }; private Among a_5[] = { new Among ( "end", -1, 1, "", this), new Among ( "ig", -1, 2, "", this), new Among ( "ung", -1, 1, "", this), new Among ( "lich", -1, 3, "", this), new Among ( "isch", -1, 2, "", this), new Among ( "ik", -1, 2, "", this), new Among ( "heit", -1, 3, "", this), new Among ( "keit", -1, 4, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 }; private static final char g_s_ending[] = {117, 30, 5 }; private static final char g_st_ending[] = {117, 30, 4 }; private int I_x; private int I_p2; private int I_p1; private void copy_from(German2Stemmer other) { I_x = other.I_x; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_prelude() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 28 // test, line 30 v_1 = cursor; // repeat, line 30 replab0: while(true) { v_2 = cursor; lab1: do { // goto, line 30 golab2: while(true) { v_3 = cursor; lab3: do { // (, line 30 if (!(in_grouping(g_v, 97, 252))) { break lab3; } // [, line 31 bra = cursor; // or, line 31 lab4: do { v_4 = cursor; lab5: do { // (, line 31 // literal, line 31 if (!(eq_s(1, "u"))) { break lab5; } // ], line 31 ket = cursor; if (!(in_grouping(g_v, 97, 252))) { break lab5; } // <-, line 31 slice_from("U"); break lab4; } while (false); cursor = v_4; // (, line 32 // literal, line 32 if (!(eq_s(1, "y"))) { break lab3; } // ], line 32 ket = cursor; if (!(in_grouping(g_v, 97, 252))) { break lab3; } // <-, line 32 slice_from("Y"); } while (false); cursor = v_3; break golab2; } while (false); cursor = v_3; if (cursor >= limit) { break lab1; } cursor++; } continue replab0; } while (false); cursor = v_2; break replab0; } cursor = v_1; // repeat, line 35 replab6: while(true) { v_5 = cursor; lab7: do { // (, line 35 // [, line 36 bra = cursor; // substring, line 36 among_var = find_among(a_0, 6); if (among_var == 0) { break lab7; } // ], line 36 ket = cursor; switch(among_var) { case 0: break lab7; case 1: // (, line 37 // <-, line 37 slice_from("ss"); break; case 2: // (, line 38 // <-, line 38 slice_from("\u00E4"); break; case 3: // (, line 39 // <-, line 39 slice_from("\u00F6"); break; case 4: // (, line 40 // <-, line 40 slice_from("\u00FC"); break; case 5: // (, line 41 // hop, line 41 { int c = cursor + 2; if (0 > c || c > limit) { break lab7; } cursor = c; } break; case 6: // (, line 42 // next, line 42 if (cursor >= limit) { break lab7; } 
cursor++; break; } continue replab6; } while (false); cursor = v_5; break replab6; } return true; } private boolean r_mark_regions() { int v_1; // (, line 48 I_p1 = limit; I_p2 = limit; // test, line 53 v_1 = cursor; // (, line 53 // hop, line 53 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } // setmark x, line 53 I_x = cursor; cursor = v_1; // gopast, line 55 golab0: while(true) { lab1: do { if (!(in_grouping(g_v, 97, 252))) { break lab1; } break golab0; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 55 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 252))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 55 I_p1 = cursor; // try, line 56 lab4: do { // (, line 56 if (!(I_p1 < I_x)) { break lab4; } I_p1 = I_x; } while (false); // gopast, line 57 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 252))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 57 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 252))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p2, line 57 I_p2 = cursor; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 61 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 61 // [, line 63 bra = cursor; // substring, line 63 among_var = find_among(a_1, 6); if (among_var == 0) { break lab1; } // ], line 63 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 64 // <-, line 64 slice_from("y"); break; case 2: // (, line 65 // <-, line 65 slice_from("u"); break; case 3: // (, line 66 // <-, line 66 slice_from("a"); break; case 4: // (, line 67 // <-, line 67 slice_from("o"); break; case 5: // (, line 68 // <-, line 68 slice_from("u"); break; case 6: // (, line 69 // next, line 69 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; // (, line 79 // do, line 80 v_1 = limit - cursor; lab0: do { // (, line 80 // [, line 81 ket = cursor; // substring, line 81 among_var = find_among_b(a_2, 7); if (among_var == 0) { break lab0; } // ], line 81 bra = cursor; // call R1, line 81 if (!r_R1()) { break lab0; } switch(among_var) { case 0: break lab0; case 1: // (, line 83 // delete, line 83 slice_del(); break; case 2: // (, line 86 if (!(in_grouping_b(g_s_ending, 98, 116))) { break lab0; } // delete, line 86 slice_del(); break; } } while (false); cursor = limit - v_1; // do, line 90 v_2 = limit - cursor; lab1: do { // (, line 90 // [, line 91 ket = cursor; // substring, line 91 among_var = find_among_b(a_3, 4); if (among_var == 0) { break lab1; } // ], line 91 bra = cursor; // call R1, line 91 if (!r_R1()) { break lab1; } switch(among_var) { case 0: break lab1; case 1: // (, line 93 // delete, line 93 slice_del(); break; case 2: // (, line 96 if (!(in_grouping_b(g_st_ending, 98, 116))) { break lab1; } // hop, line 96 { int c = cursor - 3; if (limit_backward > c || c > limit) { break lab1; } cursor = c; } // delete, line 96 slice_del(); break; } } while (false); cursor 
= limit - v_2; // do, line 100 v_3 = limit - cursor; lab2: do { // (, line 100 // [, line 101 ket = cursor; // substring, line 101 among_var = find_among_b(a_5, 8); if (among_var == 0) { break lab2; } // ], line 101 bra = cursor; // call R2, line 101 if (!r_R2()) { break lab2; } switch(among_var) { case 0: break lab2; case 1: // (, line 103 // delete, line 103 slice_del(); // try, line 104 v_4 = limit - cursor; lab3: do { // (, line 104 // [, line 104 ket = cursor; // literal, line 104 if (!(eq_s_b(2, "ig"))) { cursor = limit - v_4; break lab3; } // ], line 104 bra = cursor; // not, line 104 { v_5 = limit - cursor; lab4: do { // literal, line 104 if (!(eq_s_b(1, "e"))) { break lab4; } cursor = limit - v_4; break lab3; } while (false); cursor = limit - v_5; } // call R2, line 104 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 104 slice_del(); } while (false); break; case 2: // (, line 107 // not, line 107 { v_6 = limit - cursor; lab5: do { // literal, line 107 if (!(eq_s_b(1, "e"))) { break lab5; } break lab2; } while (false); cursor = limit - v_6; } // delete, line 107 slice_del(); break; case 3: // (, line 110 // delete, line 110 slice_del(); // try, line 111 v_7 = limit - cursor; lab6: do { // (, line 111 // [, line 112 ket = cursor; // or, line 112 lab7: do { v_8 = limit - cursor; lab8: do { // literal, line 112 if (!(eq_s_b(2, "er"))) { break lab8; } break lab7; } while (false); cursor = limit - v_8; // literal, line 112 if (!(eq_s_b(2, "en"))) { cursor = limit - v_7; break lab6; } } while (false); // ], line 112 bra = cursor; // call R1, line 112 if (!r_R1()) { cursor = limit - v_7; break lab6; } // delete, line 112 slice_del(); } while (false); break; case 4: // (, line 116 // delete, line 116 slice_del(); // try, line 117 v_9 = limit - cursor; lab9: do { // (, line 117 // [, line 118 ket = cursor; // substring, line 118 among_var = find_among_b(a_4, 2); if (among_var == 0) { cursor = limit - v_9; break lab9; } // ], line 118 bra = cursor; // call R2, line 118 if (!r_R2()) { cursor = limit - v_9; break lab9; } switch(among_var) { case 0: cursor = limit - v_9; break lab9; case 1: // (, line 120 // delete, line 120 slice_del(); break; } } while (false); break; } } while (false); cursor = limit - v_3; return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; // (, line 130 // do, line 131 v_1 = cursor; lab0: do { // call prelude, line 131 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 132 v_2 = cursor; lab1: do { // call mark_regions, line 132 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 133 limit_backward = cursor; cursor = limit; // do, line 134 v_3 = limit - cursor; lab2: do { // call standard_suffix, line 134 if (!r_standard_suffix()) { break lab2; } } while (false); cursor = limit - v_3; cursor = limit_backward; // do, line 135 v_4 = cursor; lab3: do { // call postlude, line 135 if (!r_postlude()) { break lab3; } } while (false); cursor = v_4; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/FinnishStemmer.java0000644000175000017500000007720311474320235031331 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
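 * <p>
 * Unlike most of the generated stemmers, several of the Finnish case-ending entries carry a
 * routine name as the fourth <code>Among</code> argument; the <code>SnowballProgram</code>
 * runtime invokes that routine (by reflection) as an additional test, so the ending is only
 * accepted when the routine also succeeds. For example, from the <code>a_6</code> table below:
 * </p>
 * <pre>
 *   new Among ( "seen", 11, -1, "r_LONG", this)  // "seen" also requires a long vowel (r_LONG())
 *   new Among ( "den",  11, -1, "r_VI",   this)  // "den" also requires the r_VI() test
 * </pre>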
*/ public class FinnishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "pa", -1, 1, "", this), new Among ( "sti", -1, 2, "", this), new Among ( "kaan", -1, 1, "", this), new Among ( "han", -1, 1, "", this), new Among ( "kin", -1, 1, "", this), new Among ( "h\u00E4n", -1, 1, "", this), new Among ( "k\u00E4\u00E4n", -1, 1, "", this), new Among ( "ko", -1, 1, "", this), new Among ( "p\u00E4", -1, 1, "", this), new Among ( "k\u00F6", -1, 1, "", this) }; private Among a_1[] = { new Among ( "lla", -1, -1, "", this), new Among ( "na", -1, -1, "", this), new Among ( "ssa", -1, -1, "", this), new Among ( "ta", -1, -1, "", this), new Among ( "lta", 3, -1, "", this), new Among ( "sta", 3, -1, "", this) }; private Among a_2[] = { new Among ( "ll\u00E4", -1, -1, "", this), new Among ( "n\u00E4", -1, -1, "", this), new Among ( "ss\u00E4", -1, -1, "", this), new Among ( "t\u00E4", -1, -1, "", this), new Among ( "lt\u00E4", 3, -1, "", this), new Among ( "st\u00E4", 3, -1, "", this) }; private Among a_3[] = { new Among ( "lle", -1, -1, "", this), new Among ( "ine", -1, -1, "", this) }; private Among a_4[] = { new Among ( "nsa", -1, 3, "", this), new Among ( "mme", -1, 3, "", this), new Among ( "nne", -1, 3, "", this), new Among ( "ni", -1, 2, "", this), new Among ( "si", -1, 1, "", this), new Among ( "an", -1, 4, "", this), new Among ( "en", -1, 6, "", this), new Among ( "\u00E4n", -1, 5, "", this), new Among ( "ns\u00E4", -1, 3, "", this) }; private Among a_5[] = { new Among ( "aa", -1, -1, "", this), new Among ( "ee", -1, -1, "", this), new Among ( "ii", -1, -1, "", this), new Among ( "oo", -1, -1, "", this), new Among ( "uu", -1, -1, "", this), new Among ( "\u00E4\u00E4", -1, -1, "", this), new Among ( "\u00F6\u00F6", -1, -1, "", this) }; private Among a_6[] = { new Among ( "a", -1, 8, "", this), new Among ( "lla", 0, -1, "", this), new Among ( "na", 0, -1, "", this), new Among ( "ssa", 0, -1, "", this), new Among ( "ta", 0, -1, "", this), new Among ( "lta", 4, -1, "", this), new Among ( "sta", 4, -1, "", this), new Among ( "tta", 4, 9, "", this), new Among ( "lle", -1, -1, "", this), new Among ( "ine", -1, -1, "", this), new Among ( "ksi", -1, -1, "", this), new Among ( "n", -1, 7, "", this), new Among ( "han", 11, 1, "", this), new Among ( "den", 11, -1, "r_VI", this), new Among ( "seen", 11, -1, "r_LONG", this), new Among ( "hen", 11, 2, "", this), new Among ( "tten", 11, -1, "r_VI", this), new Among ( "hin", 11, 3, "", this), new Among ( "siin", 11, -1, "r_VI", this), new Among ( "hon", 11, 4, "", this), new Among ( "h\u00E4n", 11, 5, "", this), new Among ( "h\u00F6n", 11, 6, "", this), new Among ( "\u00E4", -1, 8, "", this), new Among ( "ll\u00E4", 22, -1, "", this), new Among ( "n\u00E4", 22, -1, "", this), new Among ( "ss\u00E4", 22, -1, "", this), new Among ( "t\u00E4", 22, -1, "", this), new Among ( "lt\u00E4", 26, -1, "", this), new Among ( "st\u00E4", 26, -1, "", this), new Among ( "tt\u00E4", 26, 9, "", this) }; private Among a_7[] = { new Among ( "eja", -1, -1, "", this), new Among ( "mma", -1, 1, "", this), new Among ( "imma", 1, -1, "", this), new Among ( "mpa", -1, 1, "", this), new Among ( "impa", 3, -1, "", this), new Among ( "mmi", -1, 1, "", this), new Among ( "immi", 5, -1, "", this), new Among ( "mpi", -1, 1, "", this), new Among ( "impi", 7, -1, "", this), new Among ( "ej\u00E4", -1, -1, "", this), new Among ( "mm\u00E4", -1, 1, "", this), new Among ( "imm\u00E4", 10, -1, "", this), new Among ( "mp\u00E4", -1, 1, "", this), new Among ( "imp\u00E4", 12, -1, 
"", this) }; private Among a_8[] = { new Among ( "i", -1, -1, "", this), new Among ( "j", -1, -1, "", this) }; private Among a_9[] = { new Among ( "mma", -1, 1, "", this), new Among ( "imma", 0, -1, "", this) }; private static final char g_AEI[] = {17, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8 }; private static final char g_V1[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32 }; private static final char g_V2[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32 }; private static final char g_particle_end[] = {17, 97, 24, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32 }; private boolean B_ending_removed; private StringBuffer S_x = new StringBuffer(); private int I_p2; private int I_p1; private void copy_from(FinnishStemmer other) { B_ending_removed = other.B_ending_removed; S_x = other.S_x; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_3; // (, line 41 I_p1 = limit; I_p2 = limit; // goto, line 46 golab0: while(true) { v_1 = cursor; lab1: do { if (!(in_grouping(g_V1, 97, 246))) { break lab1; } cursor = v_1; break golab0; } while (false); cursor = v_1; if (cursor >= limit) { return false; } cursor++; } // gopast, line 46 golab2: while(true) { lab3: do { if (!(out_grouping(g_V1, 97, 246))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 46 I_p1 = cursor; // goto, line 47 golab4: while(true) { v_3 = cursor; lab5: do { if (!(in_grouping(g_V1, 97, 246))) { break lab5; } cursor = v_3; break golab4; } while (false); cursor = v_3; if (cursor >= limit) { return false; } cursor++; } // gopast, line 47 golab6: while(true) { lab7: do { if (!(out_grouping(g_V1, 97, 246))) { break lab7; } break golab6; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p2, line 47 I_p2 = cursor; return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_particle_etc() { int among_var; int v_1; int v_2; // (, line 54 // setlimit, line 55 v_1 = limit - cursor; // tomark, line 55 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 55 // [, line 55 ket = cursor; // substring, line 55 among_var = find_among_b(a_0, 10); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 55 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 62 if (!(in_grouping_b(g_particle_end, 97, 246))) { return false; } break; case 2: // (, line 64 // call R2, line 64 if (!r_R2()) { return false; } break; } // delete, line 66 slice_del(); return true; } private boolean r_possessive() { int among_var; int v_1; int v_2; int v_3; // (, line 68 // setlimit, line 69 v_1 = limit - cursor; // tomark, line 69 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 69 // [, line 69 ket = cursor; // substring, line 69 among_var = find_among_b(a_4, 9); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 69 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 72 // not, line 72 { v_3 = limit - cursor; lab0: do { // literal, line 72 if (!(eq_s_b(1, "k"))) { break lab0; } return false; } while (false); cursor = limit - v_3; } // delete, line 72 slice_del(); break; case 2: // (, line 74 // delete, line 74 slice_del(); // [, line 74 ket = 
cursor; // literal, line 74 if (!(eq_s_b(3, "kse"))) { return false; } // ], line 74 bra = cursor; // <-, line 74 slice_from("ksi"); break; case 3: // (, line 78 // delete, line 78 slice_del(); break; case 4: // (, line 81 // among, line 81 if (find_among_b(a_1, 6) == 0) { return false; } // delete, line 81 slice_del(); break; case 5: // (, line 83 // among, line 83 if (find_among_b(a_2, 6) == 0) { return false; } // delete, line 84 slice_del(); break; case 6: // (, line 86 // among, line 86 if (find_among_b(a_3, 2) == 0) { return false; } // delete, line 86 slice_del(); break; } return true; } private boolean r_LONG() { // among, line 91 if (find_among_b(a_5, 7) == 0) { return false; } return true; } private boolean r_VI() { // (, line 93 // literal, line 93 if (!(eq_s_b(1, "i"))) { return false; } if (!(in_grouping_b(g_V2, 97, 246))) { return false; } return true; } private boolean r_case_ending() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 95 // setlimit, line 96 v_1 = limit - cursor; // tomark, line 96 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 96 // [, line 96 ket = cursor; // substring, line 96 among_var = find_among_b(a_6, 30); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 96 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 98 // literal, line 98 if (!(eq_s_b(1, "a"))) { return false; } break; case 2: // (, line 99 // literal, line 99 if (!(eq_s_b(1, "e"))) { return false; } break; case 3: // (, line 100 // literal, line 100 if (!(eq_s_b(1, "i"))) { return false; } break; case 4: // (, line 101 // literal, line 101 if (!(eq_s_b(1, "o"))) { return false; } break; case 5: // (, line 102 // literal, line 102 if (!(eq_s_b(1, "\u00E4"))) { return false; } break; case 6: // (, line 103 // literal, line 103 if (!(eq_s_b(1, "\u00F6"))) { return false; } break; case 7: // (, line 111 // try, line 111 v_3 = limit - cursor; lab0: do { // (, line 111 // and, line 113 v_4 = limit - cursor; // or, line 112 lab1: do { v_5 = limit - cursor; lab2: do { // call LONG, line 111 if (!r_LONG()) { break lab2; } break lab1; } while (false); cursor = limit - v_5; // literal, line 112 if (!(eq_s_b(2, "ie"))) { cursor = limit - v_3; break lab0; } } while (false); cursor = limit - v_4; // next, line 113 if (cursor <= limit_backward) { cursor = limit - v_3; break lab0; } cursor--; // ], line 113 bra = cursor; } while (false); break; case 8: // (, line 119 if (!(in_grouping_b(g_V1, 97, 246))) { return false; } if (!(out_grouping_b(g_V1, 97, 246))) { return false; } break; case 9: // (, line 121 // literal, line 121 if (!(eq_s_b(1, "e"))) { return false; } break; } // delete, line 138 slice_del(); // set ending_removed, line 139 B_ending_removed = true; return true; } private boolean r_other_endings() { int among_var; int v_1; int v_2; int v_3; // (, line 141 // setlimit, line 142 v_1 = limit - cursor; // tomark, line 142 if (cursor < I_p2) { return false; } cursor = I_p2; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 142 // [, line 142 ket = cursor; // substring, line 142 among_var = find_among_b(a_7, 14); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 142 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 146 // not, line 146 { v_3 = limit - cursor; lab0: do { // literal, line 146 if (!(eq_s_b(2, "po"))) { break lab0; } 
return false; } while (false); cursor = limit - v_3; } break; } // delete, line 151 slice_del(); return true; } private boolean r_i_plural() { int v_1; int v_2; // (, line 153 // setlimit, line 154 v_1 = limit - cursor; // tomark, line 154 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 154 // [, line 154 ket = cursor; // substring, line 154 if (find_among_b(a_8, 2) == 0) { limit_backward = v_2; return false; } // ], line 154 bra = cursor; limit_backward = v_2; // delete, line 158 slice_del(); return true; } private boolean r_t_plural() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; // (, line 160 // setlimit, line 161 v_1 = limit - cursor; // tomark, line 161 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 161 // [, line 162 ket = cursor; // literal, line 162 if (!(eq_s_b(1, "t"))) { limit_backward = v_2; return false; } // ], line 162 bra = cursor; // test, line 162 v_3 = limit - cursor; if (!(in_grouping_b(g_V1, 97, 246))) { limit_backward = v_2; return false; } cursor = limit - v_3; // delete, line 163 slice_del(); limit_backward = v_2; // setlimit, line 165 v_4 = limit - cursor; // tomark, line 165 if (cursor < I_p2) { return false; } cursor = I_p2; v_5 = limit_backward; limit_backward = cursor; cursor = limit - v_4; // (, line 165 // [, line 165 ket = cursor; // substring, line 165 among_var = find_among_b(a_9, 2); if (among_var == 0) { limit_backward = v_5; return false; } // ], line 165 bra = cursor; limit_backward = v_5; switch(among_var) { case 0: return false; case 1: // (, line 167 // not, line 167 { v_6 = limit - cursor; lab0: do { // literal, line 167 if (!(eq_s_b(2, "po"))) { break lab0; } return false; } while (false); cursor = limit - v_6; } break; } // delete, line 170 slice_del(); return true; } private boolean r_tidy() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; // (, line 172 // setlimit, line 173 v_1 = limit - cursor; // tomark, line 173 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 173 // do, line 174 v_3 = limit - cursor; lab0: do { // (, line 174 // and, line 174 v_4 = limit - cursor; // call LONG, line 174 if (!r_LONG()) { break lab0; } cursor = limit - v_4; // (, line 174 // [, line 174 ket = cursor; // next, line 174 if (cursor <= limit_backward) { break lab0; } cursor--; // ], line 174 bra = cursor; // delete, line 174 slice_del(); } while (false); cursor = limit - v_3; // do, line 175 v_5 = limit - cursor; lab1: do { // (, line 175 // [, line 175 ket = cursor; if (!(in_grouping_b(g_AEI, 97, 228))) { break lab1; } // ], line 175 bra = cursor; if (!(out_grouping_b(g_V1, 97, 246))) { break lab1; } // delete, line 175 slice_del(); } while (false); cursor = limit - v_5; // do, line 176 v_6 = limit - cursor; lab2: do { // (, line 176 // [, line 176 ket = cursor; // literal, line 176 if (!(eq_s_b(1, "j"))) { break lab2; } // ], line 176 bra = cursor; // or, line 176 lab3: do { v_7 = limit - cursor; lab4: do { // literal, line 176 if (!(eq_s_b(1, "o"))) { break lab4; } break lab3; } while (false); cursor = limit - v_7; // literal, line 176 if (!(eq_s_b(1, "u"))) { break lab2; } } while (false); // delete, line 176 slice_del(); } while (false); cursor = limit - v_6; // do, line 177 v_8 = limit - cursor; lab5: do { // (, line 177 // [, line 177 ket = cursor; 
// literal, line 177 if (!(eq_s_b(1, "o"))) { break lab5; } // ], line 177 bra = cursor; // literal, line 177 if (!(eq_s_b(1, "j"))) { break lab5; } // delete, line 177 slice_del(); } while (false); cursor = limit - v_8; limit_backward = v_2; // goto, line 179 golab6: while(true) { v_9 = limit - cursor; lab7: do { if (!(out_grouping_b(g_V1, 97, 246))) { break lab7; } cursor = limit - v_9; break golab6; } while (false); cursor = limit - v_9; if (cursor <= limit_backward) { return false; } cursor--; } // [, line 179 ket = cursor; // next, line 179 if (cursor <= limit_backward) { return false; } cursor--; // ], line 179 bra = cursor; // -> x, line 179 S_x = slice_to(S_x); // name x, line 179 if (!(eq_v_b(S_x))) { return false; } // delete, line 179 slice_del(); return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; // (, line 183 // do, line 185 v_1 = cursor; lab0: do { // call mark_regions, line 185 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // unset ending_removed, line 186 B_ending_removed = false; // backwards, line 187 limit_backward = cursor; cursor = limit; // (, line 187 // do, line 188 v_2 = limit - cursor; lab1: do { // call particle_etc, line 188 if (!r_particle_etc()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 189 v_3 = limit - cursor; lab2: do { // call possessive, line 189 if (!r_possessive()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 190 v_4 = limit - cursor; lab3: do { // call case_ending, line 190 if (!r_case_ending()) { break lab3; } } while (false); cursor = limit - v_4; // do, line 191 v_5 = limit - cursor; lab4: do { // call other_endings, line 191 if (!r_other_endings()) { break lab4; } } while (false); cursor = limit - v_5; // or, line 192 lab5: do { v_6 = limit - cursor; lab6: do { // (, line 192 // Boolean test ending_removed, line 192 if (!(B_ending_removed)) { break lab6; } // do, line 192 v_7 = limit - cursor; lab7: do { // call i_plural, line 192 if (!r_i_plural()) { break lab7; } } while (false); cursor = limit - v_7; break lab5; } while (false); cursor = limit - v_6; // do, line 192 v_8 = limit - cursor; lab8: do { // call t_plural, line 192 if (!r_t_plural()) { break lab8; } } while (false); cursor = limit - v_8; } while (false); // do, line 193 v_9 = limit - cursor; lab9: do { // call tidy, line 193 if (!r_tidy()) { break lab9; } } while (false); cursor = limit - v_9; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/EnglishStemmer.java0000644000175000017500000012454311474320235031324 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
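 * <p>Not part of the generated output: a sketch showing that a single instance can be
 * reused across tokens by resetting the buffer before each call (method names assumed
 * from {@link org.tartarus.snowball.SnowballProgram}).</p>
 * <pre>
 *   EnglishStemmer stemmer = new EnglishStemmer();
 *   String[] words = { "running", "cities", "generously" };
 *   for (int i = 0; i < words.length; i++) {
 *     stemmer.setCurrent(words[i]);   // load the next token
 *     stemmer.stem();                 // applies Step_1a .. Step_5 in place
 *     System.out.println(words[i] + " -> " + stemmer.getCurrent());
 *   }
 * </pre>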
*/ public class EnglishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "arsen", -1, -1, "", this), new Among ( "commun", -1, -1, "", this), new Among ( "gener", -1, -1, "", this) }; private Among a_1[] = { new Among ( "'", -1, 1, "", this), new Among ( "'s'", 0, 1, "", this), new Among ( "'s", -1, 1, "", this) }; private Among a_2[] = { new Among ( "ied", -1, 2, "", this), new Among ( "s", -1, 3, "", this), new Among ( "ies", 1, 2, "", this), new Among ( "sses", 1, 1, "", this), new Among ( "ss", 1, -1, "", this), new Among ( "us", 1, -1, "", this) }; private Among a_3[] = { new Among ( "", -1, 3, "", this), new Among ( "bb", 0, 2, "", this), new Among ( "dd", 0, 2, "", this), new Among ( "ff", 0, 2, "", this), new Among ( "gg", 0, 2, "", this), new Among ( "bl", 0, 1, "", this), new Among ( "mm", 0, 2, "", this), new Among ( "nn", 0, 2, "", this), new Among ( "pp", 0, 2, "", this), new Among ( "rr", 0, 2, "", this), new Among ( "at", 0, 1, "", this), new Among ( "tt", 0, 2, "", this), new Among ( "iz", 0, 1, "", this) }; private Among a_4[] = { new Among ( "ed", -1, 2, "", this), new Among ( "eed", 0, 1, "", this), new Among ( "ing", -1, 2, "", this), new Among ( "edly", -1, 2, "", this), new Among ( "eedly", 3, 1, "", this), new Among ( "ingly", -1, 2, "", this) }; private Among a_5[] = { new Among ( "anci", -1, 3, "", this), new Among ( "enci", -1, 2, "", this), new Among ( "ogi", -1, 13, "", this), new Among ( "li", -1, 16, "", this), new Among ( "bli", 3, 12, "", this), new Among ( "abli", 4, 4, "", this), new Among ( "alli", 3, 8, "", this), new Among ( "fulli", 3, 14, "", this), new Among ( "lessli", 3, 15, "", this), new Among ( "ousli", 3, 10, "", this), new Among ( "entli", 3, 5, "", this), new Among ( "aliti", -1, 8, "", this), new Among ( "biliti", -1, 12, "", this), new Among ( "iviti", -1, 11, "", this), new Among ( "tional", -1, 1, "", this), new Among ( "ational", 14, 7, "", this), new Among ( "alism", -1, 8, "", this), new Among ( "ation", -1, 7, "", this), new Among ( "ization", 17, 6, "", this), new Among ( "izer", -1, 6, "", this), new Among ( "ator", -1, 7, "", this), new Among ( "iveness", -1, 11, "", this), new Among ( "fulness", -1, 9, "", this), new Among ( "ousness", -1, 10, "", this) }; private Among a_6[] = { new Among ( "icate", -1, 4, "", this), new Among ( "ative", -1, 6, "", this), new Among ( "alize", -1, 3, "", this), new Among ( "iciti", -1, 4, "", this), new Among ( "ical", -1, 4, "", this), new Among ( "tional", -1, 1, "", this), new Among ( "ational", 5, 2, "", this), new Among ( "ful", -1, 5, "", this), new Among ( "ness", -1, 5, "", this) }; private Among a_7[] = { new Among ( "ic", -1, 1, "", this), new Among ( "ance", -1, 1, "", this), new Among ( "ence", -1, 1, "", this), new Among ( "able", -1, 1, "", this), new Among ( "ible", -1, 1, "", this), new Among ( "ate", -1, 1, "", this), new Among ( "ive", -1, 1, "", this), new Among ( "ize", -1, 1, "", this), new Among ( "iti", -1, 1, "", this), new Among ( "al", -1, 1, "", this), new Among ( "ism", -1, 1, "", this), new Among ( "ion", -1, 2, "", this), new Among ( "er", -1, 1, "", this), new Among ( "ous", -1, 1, "", this), new Among ( "ant", -1, 1, "", this), new Among ( "ent", -1, 1, "", this), new Among ( "ment", 15, 1, "", this), new Among ( "ement", 16, 1, "", this) }; private Among a_8[] = { new Among ( "e", -1, 1, "", this), new Among ( "l", -1, 2, "", this) }; private Among a_9[] = { new Among ( "succeed", -1, -1, "", this), new Among ( "proceed", -1, -1, "", this), new 
Among ( "exceed", -1, -1, "", this), new Among ( "canning", -1, -1, "", this), new Among ( "inning", -1, -1, "", this), new Among ( "earring", -1, -1, "", this), new Among ( "herring", -1, -1, "", this), new Among ( "outing", -1, -1, "", this) }; private Among a_10[] = { new Among ( "andes", -1, -1, "", this), new Among ( "atlas", -1, -1, "", this), new Among ( "bias", -1, -1, "", this), new Among ( "cosmos", -1, -1, "", this), new Among ( "dying", -1, 3, "", this), new Among ( "early", -1, 9, "", this), new Among ( "gently", -1, 7, "", this), new Among ( "howe", -1, -1, "", this), new Among ( "idly", -1, 6, "", this), new Among ( "lying", -1, 4, "", this), new Among ( "news", -1, -1, "", this), new Among ( "only", -1, 10, "", this), new Among ( "singly", -1, 11, "", this), new Among ( "skies", -1, 2, "", this), new Among ( "skis", -1, 1, "", this), new Among ( "sky", -1, -1, "", this), new Among ( "tying", -1, 5, "", this), new Among ( "ugly", -1, 8, "", this) }; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WXY[] = {1, 17, 65, 208, 1 }; private static final char g_valid_LI[] = {55, 141, 2 }; private boolean B_Y_found; private int I_p2; private int I_p1; private void copy_from(EnglishStemmer other) { B_Y_found = other.B_Y_found; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_prelude() { int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 25 // unset Y_found, line 26 B_Y_found = false; // do, line 27 v_1 = cursor; lab0: do { // (, line 27 // [, line 27 bra = cursor; // literal, line 27 if (!(eq_s(1, "'"))) { break lab0; } // ], line 27 ket = cursor; // delete, line 27 slice_del(); } while (false); cursor = v_1; // do, line 28 v_2 = cursor; lab1: do { // (, line 28 // [, line 28 bra = cursor; // literal, line 28 if (!(eq_s(1, "y"))) { break lab1; } // ], line 28 ket = cursor; // <-, line 28 slice_from("Y"); // set Y_found, line 28 B_Y_found = true; } while (false); cursor = v_2; // do, line 29 v_3 = cursor; lab2: do { // repeat, line 29 replab3: while(true) { v_4 = cursor; lab4: do { // (, line 29 // goto, line 29 golab5: while(true) { v_5 = cursor; lab6: do { // (, line 29 if (!(in_grouping(g_v, 97, 121))) { break lab6; } // [, line 29 bra = cursor; // literal, line 29 if (!(eq_s(1, "y"))) { break lab6; } // ], line 29 ket = cursor; cursor = v_5; break golab5; } while (false); cursor = v_5; if (cursor >= limit) { break lab4; } cursor++; } // <-, line 29 slice_from("Y"); // set Y_found, line 29 B_Y_found = true; continue replab3; } while (false); cursor = v_4; break replab3; } } while (false); cursor = v_3; return true; } private boolean r_mark_regions() { int v_1; int v_2; // (, line 32 I_p1 = limit; I_p2 = limit; // do, line 35 v_1 = cursor; lab0: do { // (, line 35 // or, line 41 lab1: do { v_2 = cursor; lab2: do { // among, line 36 if (find_among(a_0, 3) == 0) { break lab2; } break lab1; } while (false); cursor = v_2; // (, line 41 // gopast, line 41 golab3: while(true) { lab4: do { if (!(in_grouping(g_v, 97, 121))) { break lab4; } break golab3; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // gopast, line 41 golab5: while(true) { lab6: do { if (!(out_grouping(g_v, 97, 121))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab0; } cursor++; } } while (false); // setmark p1, line 42 I_p1 = cursor; // gopast, line 43 golab7: while(true) { lab8: do { if (!(in_grouping(g_v, 97, 121))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab0; } 
cursor++; } // gopast, line 43 golab9: while(true) { lab10: do { if (!(out_grouping(g_v, 97, 121))) { break lab10; } break golab9; } while (false); if (cursor >= limit) { break lab0; } cursor++; } // setmark p2, line 43 I_p2 = cursor; } while (false); cursor = v_1; return true; } private boolean r_shortv() { int v_1; // (, line 49 // or, line 51 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 50 if (!(out_grouping_b(g_v_WXY, 89, 121))) { break lab1; } if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } if (!(out_grouping_b(g_v, 97, 121))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // (, line 52 if (!(out_grouping_b(g_v, 97, 121))) { return false; } if (!(in_grouping_b(g_v, 97, 121))) { return false; } // atlimit, line 52 if (cursor > limit_backward) { return false; } } while (false); return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_Step_1a() { int among_var; int v_1; int v_2; // (, line 58 // try, line 59 v_1 = limit - cursor; lab0: do { // (, line 59 // [, line 60 ket = cursor; // substring, line 60 among_var = find_among_b(a_1, 3); if (among_var == 0) { cursor = limit - v_1; break lab0; } // ], line 60 bra = cursor; switch(among_var) { case 0: cursor = limit - v_1; break lab0; case 1: // (, line 62 // delete, line 62 slice_del(); break; } } while (false); // [, line 65 ket = cursor; // substring, line 65 among_var = find_among_b(a_2, 6); if (among_var == 0) { return false; } // ], line 65 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 66 // <-, line 66 slice_from("ss"); break; case 2: // (, line 68 // or, line 68 lab1: do { v_2 = limit - cursor; lab2: do { // (, line 68 // hop, line 68 { int c = cursor - 2; if (limit_backward > c || c > limit) { break lab2; } cursor = c; } // <-, line 68 slice_from("i"); break lab1; } while (false); cursor = limit - v_2; // <-, line 68 slice_from("ie"); } while (false); break; case 3: // (, line 69 // next, line 69 if (cursor <= limit_backward) { return false; } cursor--; // gopast, line 69 golab3: while(true) { lab4: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab4; } break golab3; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } // delete, line 69 slice_del(); break; } return true; } private boolean r_Step_1b() { int among_var; int v_1; int v_3; int v_4; // (, line 74 // [, line 75 ket = cursor; // substring, line 75 among_var = find_among_b(a_4, 6); if (among_var == 0) { return false; } // ], line 75 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 77 // call R1, line 77 if (!r_R1()) { return false; } // <-, line 77 slice_from("ee"); break; case 2: // (, line 79 // test, line 80 v_1 = limit - cursor; // gopast, line 80 golab0: while(true) { lab1: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } break golab0; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } cursor = limit - v_1; // delete, line 80 slice_del(); // test, line 81 v_3 = limit - cursor; // substring, line 81 among_var = find_among_b(a_3, 13); if (among_var == 0) { return false; } cursor = limit - v_3; switch(among_var) { case 0: return false; case 1: // (, line 83 // <+, line 83 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; case 2: // (, line 86 // [, line 86 ket = cursor; // next, line 86 if (cursor <= limit_backward) { return false; } cursor--; // ], line 86 bra = cursor; 
// delete, line 86 slice_del(); break; case 3: // (, line 87 // atmark, line 87 if (cursor != I_p1) { return false; } // test, line 87 v_4 = limit - cursor; // call shortv, line 87 if (!r_shortv()) { return false; } cursor = limit - v_4; // <+, line 87 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; } break; } return true; } private boolean r_Step_1c() { int v_1; int v_2; // (, line 93 // [, line 94 ket = cursor; // or, line 94 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 94 if (!(eq_s_b(1, "y"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 94 if (!(eq_s_b(1, "Y"))) { return false; } } while (false); // ], line 94 bra = cursor; if (!(out_grouping_b(g_v, 97, 121))) { return false; } // not, line 95 { v_2 = limit - cursor; lab2: do { // atlimit, line 95 if (cursor > limit_backward) { break lab2; } return false; } while (false); cursor = limit - v_2; } // <-, line 96 slice_from("i"); return true; } private boolean r_Step_2() { int among_var; // (, line 99 // [, line 100 ket = cursor; // substring, line 100 among_var = find_among_b(a_5, 24); if (among_var == 0) { return false; } // ], line 100 bra = cursor; // call R1, line 100 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 101 // <-, line 101 slice_from("tion"); break; case 2: // (, line 102 // <-, line 102 slice_from("ence"); break; case 3: // (, line 103 // <-, line 103 slice_from("ance"); break; case 4: // (, line 104 // <-, line 104 slice_from("able"); break; case 5: // (, line 105 // <-, line 105 slice_from("ent"); break; case 6: // (, line 107 // <-, line 107 slice_from("ize"); break; case 7: // (, line 109 // <-, line 109 slice_from("ate"); break; case 8: // (, line 111 // <-, line 111 slice_from("al"); break; case 9: // (, line 112 // <-, line 112 slice_from("ful"); break; case 10: // (, line 114 // <-, line 114 slice_from("ous"); break; case 11: // (, line 116 // <-, line 116 slice_from("ive"); break; case 12: // (, line 118 // <-, line 118 slice_from("ble"); break; case 13: // (, line 119 // literal, line 119 if (!(eq_s_b(1, "l"))) { return false; } // <-, line 119 slice_from("og"); break; case 14: // (, line 120 // <-, line 120 slice_from("ful"); break; case 15: // (, line 121 // <-, line 121 slice_from("less"); break; case 16: // (, line 122 if (!(in_grouping_b(g_valid_LI, 99, 116))) { return false; } // delete, line 122 slice_del(); break; } return true; } private boolean r_Step_3() { int among_var; // (, line 126 // [, line 127 ket = cursor; // substring, line 127 among_var = find_among_b(a_6, 9); if (among_var == 0) { return false; } // ], line 127 bra = cursor; // call R1, line 127 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 128 // <-, line 128 slice_from("tion"); break; case 2: // (, line 129 // <-, line 129 slice_from("ate"); break; case 3: // (, line 130 // <-, line 130 slice_from("al"); break; case 4: // (, line 132 // <-, line 132 slice_from("ic"); break; case 5: // (, line 134 // delete, line 134 slice_del(); break; case 6: // (, line 136 // call R2, line 136 if (!r_R2()) { return false; } // delete, line 136 slice_del(); break; } return true; } private boolean r_Step_4() { int among_var; int v_1; // (, line 140 // [, line 141 ket = cursor; // substring, line 141 among_var = find_among_b(a_7, 18); if (among_var == 0) { return false; } // ], line 141 bra = cursor; // call R2, line 141 if (!r_R2()) { return false; } switch(among_var) { case 0: return false; 
case 1: // (, line 144 // delete, line 144 slice_del(); break; case 2: // (, line 145 // or, line 145 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 145 if (!(eq_s_b(1, "s"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 145 if (!(eq_s_b(1, "t"))) { return false; } } while (false); // delete, line 145 slice_del(); break; } return true; } private boolean r_Step_5() { int among_var; int v_1; int v_2; // (, line 149 // [, line 150 ket = cursor; // substring, line 150 among_var = find_among_b(a_8, 2); if (among_var == 0) { return false; } // ], line 150 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 151 // or, line 151 lab0: do { v_1 = limit - cursor; lab1: do { // call R2, line 151 if (!r_R2()) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // (, line 151 // call R1, line 151 if (!r_R1()) { return false; } // not, line 151 { v_2 = limit - cursor; lab2: do { // call shortv, line 151 if (!r_shortv()) { break lab2; } return false; } while (false); cursor = limit - v_2; } } while (false); // delete, line 151 slice_del(); break; case 2: // (, line 152 // call R2, line 152 if (!r_R2()) { return false; } // literal, line 152 if (!(eq_s_b(1, "l"))) { return false; } // delete, line 152 slice_del(); break; } return true; } private boolean r_exception2() { // (, line 156 // [, line 158 ket = cursor; // substring, line 158 if (find_among_b(a_9, 8) == 0) { return false; } // ], line 158 bra = cursor; // atlimit, line 158 if (cursor > limit_backward) { return false; } return true; } private boolean r_exception1() { int among_var; // (, line 168 // [, line 170 bra = cursor; // substring, line 170 among_var = find_among(a_10, 18); if (among_var == 0) { return false; } // ], line 170 ket = cursor; // atlimit, line 170 if (cursor < limit) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 174 // <-, line 174 slice_from("ski"); break; case 2: // (, line 175 // <-, line 175 slice_from("sky"); break; case 3: // (, line 176 // <-, line 176 slice_from("die"); break; case 4: // (, line 177 // <-, line 177 slice_from("lie"); break; case 5: // (, line 178 // <-, line 178 slice_from("tie"); break; case 6: // (, line 182 // <-, line 182 slice_from("idl"); break; case 7: // (, line 183 // <-, line 183 slice_from("gentl"); break; case 8: // (, line 184 // <-, line 184 slice_from("ugli"); break; case 9: // (, line 185 // <-, line 185 slice_from("earli"); break; case 10: // (, line 186 // <-, line 186 slice_from("onli"); break; case 11: // (, line 187 // <-, line 187 slice_from("singl"); break; } return true; } private boolean r_postlude() { int v_1; int v_2; // (, line 203 // Boolean test Y_found, line 203 if (!(B_Y_found)) { return false; } // repeat, line 203 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 203 // goto, line 203 golab2: while(true) { v_2 = cursor; lab3: do { // (, line 203 // [, line 203 bra = cursor; // literal, line 203 if (!(eq_s(1, "Y"))) { break lab3; } // ], line 203 ket = cursor; cursor = v_2; break golab2; } while (false); cursor = v_2; if (cursor >= limit) { break lab1; } cursor++; } // <-, line 203 slice_from("y"); continue replab0; } while (false); cursor = v_1; break replab0; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; int v_12; int v_13; // (, line 205 // or, line 207 lab0: do { v_1 = cursor; lab1: do { // call exception1, line 207 if (!r_exception1()) { 
break lab1; } break lab0; } while (false); cursor = v_1; lab2: do { // not, line 208 { v_2 = cursor; lab3: do { // hop, line 208 { int c = cursor + 3; if (0 > c || c > limit) { break lab3; } cursor = c; } break lab2; } while (false); cursor = v_2; } break lab0; } while (false); cursor = v_1; // (, line 208 // do, line 209 v_3 = cursor; lab4: do { // call prelude, line 209 if (!r_prelude()) { break lab4; } } while (false); cursor = v_3; // do, line 210 v_4 = cursor; lab5: do { // call mark_regions, line 210 if (!r_mark_regions()) { break lab5; } } while (false); cursor = v_4; // backwards, line 211 limit_backward = cursor; cursor = limit; // (, line 211 // do, line 213 v_5 = limit - cursor; lab6: do { // call Step_1a, line 213 if (!r_Step_1a()) { break lab6; } } while (false); cursor = limit - v_5; // or, line 215 lab7: do { v_6 = limit - cursor; lab8: do { // call exception2, line 215 if (!r_exception2()) { break lab8; } break lab7; } while (false); cursor = limit - v_6; // (, line 215 // do, line 217 v_7 = limit - cursor; lab9: do { // call Step_1b, line 217 if (!r_Step_1b()) { break lab9; } } while (false); cursor = limit - v_7; // do, line 218 v_8 = limit - cursor; lab10: do { // call Step_1c, line 218 if (!r_Step_1c()) { break lab10; } } while (false); cursor = limit - v_8; // do, line 220 v_9 = limit - cursor; lab11: do { // call Step_2, line 220 if (!r_Step_2()) { break lab11; } } while (false); cursor = limit - v_9; // do, line 221 v_10 = limit - cursor; lab12: do { // call Step_3, line 221 if (!r_Step_3()) { break lab12; } } while (false); cursor = limit - v_10; // do, line 222 v_11 = limit - cursor; lab13: do { // call Step_4, line 222 if (!r_Step_4()) { break lab13; } } while (false); cursor = limit - v_11; // do, line 224 v_12 = limit - cursor; lab14: do { // call Step_5, line 224 if (!r_Step_5()) { break lab14; } } while (false); cursor = limit - v_12; } while (false); cursor = limit_backward; // do, line 227 v_13 = cursor; lab15: do { // call postlude, line 227 if (!r_postlude()) { break lab15; } } while (false); cursor = v_13; } while (false); return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/GermanStemmer.java0000644000175000017500000005552511474320235031147 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
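 * <p>Not part of the generated output: within a Lucene analysis chain the stemmer is
 * normally wrapped by the contrib SnowballFilter rather than called directly; the
 * filter/tokenizer constructors used below are assumed from lucene-core and
 * contrib/snowball of this release.</p>
 * <pre>
 *   java.io.Reader reader = new java.io.StringReader("h\u00E4user b\u00FCcher");
 *   org.apache.lucene.analysis.TokenStream stemmed =
 *       new org.apache.lucene.analysis.snowball.SnowballFilter(
 *           new org.apache.lucene.analysis.WhitespaceTokenizer(reader),
 *           new GermanStemmer());   // stems each whitespace-delimited token
 * </pre>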
*/ public class GermanStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 6, "", this), new Among ( "U", 0, 2, "", this), new Among ( "Y", 0, 1, "", this), new Among ( "\u00E4", 0, 3, "", this), new Among ( "\u00F6", 0, 4, "", this), new Among ( "\u00FC", 0, 5, "", this) }; private Among a_1[] = { new Among ( "e", -1, 1, "", this), new Among ( "em", -1, 1, "", this), new Among ( "en", -1, 1, "", this), new Among ( "ern", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "s", -1, 2, "", this), new Among ( "es", 5, 1, "", this) }; private Among a_2[] = { new Among ( "en", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "st", -1, 2, "", this), new Among ( "est", 2, 1, "", this) }; private Among a_3[] = { new Among ( "ig", -1, 1, "", this), new Among ( "lich", -1, 1, "", this) }; private Among a_4[] = { new Among ( "end", -1, 1, "", this), new Among ( "ig", -1, 2, "", this), new Among ( "ung", -1, 1, "", this), new Among ( "lich", -1, 3, "", this), new Among ( "isch", -1, 2, "", this), new Among ( "ik", -1, 2, "", this), new Among ( "heit", -1, 3, "", this), new Among ( "keit", -1, 4, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 32, 8 }; private static final char g_s_ending[] = {117, 30, 5 }; private static final char g_st_ending[] = {117, 30, 4 }; private int I_x; private int I_p2; private int I_p1; private void copy_from(GermanStemmer other) { I_x = other.I_x; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_prelude() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; // (, line 28 // test, line 30 v_1 = cursor; // repeat, line 30 replab0: while(true) { v_2 = cursor; lab1: do { // (, line 30 // or, line 33 lab2: do { v_3 = cursor; lab3: do { // (, line 31 // [, line 32 bra = cursor; // literal, line 32 if (!(eq_s(1, "\u00DF"))) { break lab3; } // ], line 32 ket = cursor; // <-, line 32 slice_from("ss"); break lab2; } while (false); cursor = v_3; // next, line 33 if (cursor >= limit) { break lab1; } cursor++; } while (false); continue replab0; } while (false); cursor = v_2; break replab0; } cursor = v_1; // repeat, line 36 replab4: while(true) { v_4 = cursor; lab5: do { // goto, line 36 golab6: while(true) { v_5 = cursor; lab7: do { // (, line 36 if (!(in_grouping(g_v, 97, 252))) { break lab7; } // [, line 37 bra = cursor; // or, line 37 lab8: do { v_6 = cursor; lab9: do { // (, line 37 // literal, line 37 if (!(eq_s(1, "u"))) { break lab9; } // ], line 37 ket = cursor; if (!(in_grouping(g_v, 97, 252))) { break lab9; } // <-, line 37 slice_from("U"); break lab8; } while (false); cursor = v_6; // (, line 38 // literal, line 38 if (!(eq_s(1, "y"))) { break lab7; } // ], line 38 ket = cursor; if (!(in_grouping(g_v, 97, 252))) { break lab7; } // <-, line 38 slice_from("Y"); } while (false); cursor = v_5; break golab6; } while (false); cursor = v_5; if (cursor >= limit) { break lab5; } cursor++; } continue replab4; } while (false); cursor = v_4; break replab4; } return true; } private boolean r_mark_regions() { int v_1; // (, line 42 I_p1 = limit; I_p2 = limit; // test, line 47 v_1 = cursor; // (, line 47 // hop, line 47 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } // setmark x, line 47 I_x = cursor; cursor = v_1; // gopast, line 49 golab0: while(true) { lab1: do { if (!(in_grouping(g_v, 97, 252))) { break lab1; } break golab0; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 
49 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 252))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 49 I_p1 = cursor; // try, line 50 lab4: do { // (, line 50 if (!(I_p1 < I_x)) { break lab4; } I_p1 = I_x; } while (false); // gopast, line 51 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 252))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 51 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 252))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p2, line 51 I_p2 = cursor; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 55 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 55 // [, line 57 bra = cursor; // substring, line 57 among_var = find_among(a_0, 6); if (among_var == 0) { break lab1; } // ], line 57 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 58 // <-, line 58 slice_from("y"); break; case 2: // (, line 59 // <-, line 59 slice_from("u"); break; case 3: // (, line 60 // <-, line 60 slice_from("a"); break; case 4: // (, line 61 // <-, line 61 slice_from("o"); break; case 5: // (, line 62 // <-, line 62 slice_from("u"); break; case 6: // (, line 63 // next, line 63 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; // (, line 73 // do, line 74 v_1 = limit - cursor; lab0: do { // (, line 74 // [, line 75 ket = cursor; // substring, line 75 among_var = find_among_b(a_1, 7); if (among_var == 0) { break lab0; } // ], line 75 bra = cursor; // call R1, line 75 if (!r_R1()) { break lab0; } switch(among_var) { case 0: break lab0; case 1: // (, line 77 // delete, line 77 slice_del(); break; case 2: // (, line 80 if (!(in_grouping_b(g_s_ending, 98, 116))) { break lab0; } // delete, line 80 slice_del(); break; } } while (false); cursor = limit - v_1; // do, line 84 v_2 = limit - cursor; lab1: do { // (, line 84 // [, line 85 ket = cursor; // substring, line 85 among_var = find_among_b(a_2, 4); if (among_var == 0) { break lab1; } // ], line 85 bra = cursor; // call R1, line 85 if (!r_R1()) { break lab1; } switch(among_var) { case 0: break lab1; case 1: // (, line 87 // delete, line 87 slice_del(); break; case 2: // (, line 90 if (!(in_grouping_b(g_st_ending, 98, 116))) { break lab1; } // hop, line 90 { int c = cursor - 3; if (limit_backward > c || c > limit) { break lab1; } cursor = c; } // delete, line 90 slice_del(); break; } } while (false); cursor = limit - v_2; // do, line 94 v_3 = limit - cursor; lab2: do { // (, line 94 // [, line 95 ket = cursor; // substring, line 95 among_var = find_among_b(a_4, 8); if (among_var == 0) { break lab2; } // ], line 95 bra = cursor; // call R2, line 95 if (!r_R2()) { break lab2; } switch(among_var) { case 0: break lab2; case 1: // (, line 97 // delete, line 97 slice_del(); // try, line 98 v_4 = limit - cursor; lab3: do { // (, line 98 // [, line 98 ket = cursor; // literal, line 98 if (!(eq_s_b(2, "ig"))) { cursor = limit - v_4; break lab3; } // ], line 98 bra = cursor; // 
not, line 98 { v_5 = limit - cursor; lab4: do { // literal, line 98 if (!(eq_s_b(1, "e"))) { break lab4; } cursor = limit - v_4; break lab3; } while (false); cursor = limit - v_5; } // call R2, line 98 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 98 slice_del(); } while (false); break; case 2: // (, line 101 // not, line 101 { v_6 = limit - cursor; lab5: do { // literal, line 101 if (!(eq_s_b(1, "e"))) { break lab5; } break lab2; } while (false); cursor = limit - v_6; } // delete, line 101 slice_del(); break; case 3: // (, line 104 // delete, line 104 slice_del(); // try, line 105 v_7 = limit - cursor; lab6: do { // (, line 105 // [, line 106 ket = cursor; // or, line 106 lab7: do { v_8 = limit - cursor; lab8: do { // literal, line 106 if (!(eq_s_b(2, "er"))) { break lab8; } break lab7; } while (false); cursor = limit - v_8; // literal, line 106 if (!(eq_s_b(2, "en"))) { cursor = limit - v_7; break lab6; } } while (false); // ], line 106 bra = cursor; // call R1, line 106 if (!r_R1()) { cursor = limit - v_7; break lab6; } // delete, line 106 slice_del(); } while (false); break; case 4: // (, line 110 // delete, line 110 slice_del(); // try, line 111 v_9 = limit - cursor; lab9: do { // (, line 111 // [, line 112 ket = cursor; // substring, line 112 among_var = find_among_b(a_3, 2); if (among_var == 0) { cursor = limit - v_9; break lab9; } // ], line 112 bra = cursor; // call R2, line 112 if (!r_R2()) { cursor = limit - v_9; break lab9; } switch(among_var) { case 0: cursor = limit - v_9; break lab9; case 1: // (, line 114 // delete, line 114 slice_del(); break; } } while (false); break; } } while (false); cursor = limit - v_3; return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; // (, line 124 // do, line 125 v_1 = cursor; lab0: do { // call prelude, line 125 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 126 v_2 = cursor; lab1: do { // call mark_regions, line 126 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 127 limit_backward = cursor; cursor = limit; // do, line 128 v_3 = limit - cursor; lab2: do { // call standard_suffix, line 128 if (!r_standard_suffix()) { break lab2; } } while (false); cursor = limit - v_3; cursor = limit_backward; // do, line 129 v_4 = cursor; lab3: do { // call postlude, line 129 if (!r_postlude()) { break lab3; } } while (false); cursor = v_4; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/LovinsStemmer.java0000644000175000017500000017424711474320235031213 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class LovinsStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "d", -1, -1, "", this), new Among ( "f", -1, -1, "", this), new Among ( "ph", -1, -1, "", this), new Among ( "th", -1, -1, "", this), new Among ( "l", -1, -1, "", this), new Among ( "er", -1, -1, "", this), new Among ( "or", -1, -1, "", this), new Among ( "es", -1, -1, "", this), new Among ( "t", -1, -1, "", this) }; private Among a_1[] = { new Among ( "s'", -1, 1, "r_A", this), new Among ( "a", -1, 1, "r_A", this), new Among ( "ia", 1, 1, "r_A", this), new Among ( "ata", 1, 1, "r_A", this), new Among ( "ic", -1, 1, "r_A", this), new Among ( "aic", 4, 1, "r_A", this), new Among ( "allic", 4, 1, "r_BB", this), new Among ( "aric", 4, 1, "r_A", this), new Among ( "atic", 4, 1, "r_B", this), new Among ( "itic", 4, 1, "r_H", this), new Among ( "antic", 4, 1, "r_C", this), new Among ( "istic", 4, 1, "r_A", this), new Among ( "alistic", 11, 1, "r_B", this), new Among ( "aristic", 11, 1, "r_A", this), new Among ( "ivistic", 11, 1, "r_A", this), new Among ( "ed", -1, 1, "r_E", this), new Among ( "anced", 15, 1, "r_B", this), new Among ( "enced", 15, 1, "r_A", this), new Among ( "ished", 15, 1, "r_A", this), new Among ( "ied", 15, 1, "r_A", this), new Among ( "ened", 15, 1, "r_E", this), new Among ( "ioned", 15, 1, "r_A", this), new Among ( "ated", 15, 1, "r_I", this), new Among ( "ented", 15, 1, "r_C", this), new Among ( "ized", 15, 1, "r_F", this), new Among ( "arized", 24, 1, "r_A", this), new Among ( "oid", -1, 1, "r_A", this), new Among ( "aroid", 26, 1, "r_A", this), new Among ( "hood", -1, 1, "r_A", this), new Among ( "ehood", 28, 1, "r_A", this), new Among ( "ihood", 28, 1, "r_A", this), new Among ( "elihood", 30, 1, "r_E", this), new Among ( "ward", -1, 1, "r_A", this), new Among ( "e", -1, 1, "r_A", this), new Among ( "ae", 33, 1, "r_A", this), new Among ( "ance", 33, 1, "r_B", this), new Among ( "icance", 35, 1, "r_A", this), new Among ( "ence", 33, 1, "r_A", this), new Among ( "ide", 33, 1, "r_L", this), new Among ( "icide", 38, 1, "r_A", this), new Among ( "otide", 38, 1, "r_A", this), new Among ( "age", 33, 1, "r_B", this), new Among ( "able", 33, 1, "r_A", this), new Among ( "atable", 42, 1, "r_A", this), new Among ( "izable", 42, 1, "r_E", this), new Among ( "arizable", 44, 1, "r_A", this), new Among ( "ible", 33, 1, "r_A", this), new Among ( "encible", 46, 1, "r_A", this), new Among ( "ene", 33, 1, "r_E", this), new Among ( "ine", 33, 1, "r_M", this), new Among ( "idine", 49, 1, "r_I", this), new Among ( "one", 33, 1, "r_R", this), new Among ( "ature", 33, 1, "r_E", this), new Among ( "eature", 52, 1, "r_Z", this), new Among ( "ese", 33, 1, "r_A", this), new Among ( "wise", 33, 1, "r_A", this), new Among ( "ate", 33, 1, "r_A", this), new Among ( "entiate", 56, 1, "r_A", this), new Among ( "inate", 56, 1, "r_A", this), new Among ( "ionate", 56, 1, "r_D", this), new Among ( "ite", 33, 1, "r_AA", this), new Among ( "ive", 33, 1, "r_A", this), new Among ( "ative", 61, 1, "r_A", this), new Among ( "ize", 33, 1, "r_F", this), new Among ( "alize", 63, 1, "r_A", this), new Among ( "icalize", 64, 1, "r_A", this), new Among ( "ialize", 64, 1, "r_A", this), new Among ( "entialize", 66, 1, "r_A", this), new Among ( "ionalize", 64, 1, "r_A", this), new Among ( "arize", 63, 1, "r_A", this), new Among ( "ing", -1, 1, "r_N", this), new Among ( "ancing", 70, 1, "r_B", this), new Among ( "encing", 70, 1, "r_A", this), new Among ( "aging", 70, 1, "r_B", this), new Among ( "ening", 70, 1, "r_E", this), new Among 
( "ioning", 70, 1, "r_A", this), new Among ( "ating", 70, 1, "r_I", this), new Among ( "enting", 70, 1, "r_C", this), new Among ( "ying", 70, 1, "r_B", this), new Among ( "izing", 70, 1, "r_F", this), new Among ( "arizing", 79, 1, "r_A", this), new Among ( "ish", -1, 1, "r_C", this), new Among ( "yish", 81, 1, "r_A", this), new Among ( "i", -1, 1, "r_A", this), new Among ( "al", -1, 1, "r_BB", this), new Among ( "ical", 84, 1, "r_A", this), new Among ( "aical", 85, 1, "r_A", this), new Among ( "istical", 85, 1, "r_A", this), new Among ( "oidal", 84, 1, "r_A", this), new Among ( "eal", 84, 1, "r_Y", this), new Among ( "ial", 84, 1, "r_A", this), new Among ( "ancial", 90, 1, "r_A", this), new Among ( "arial", 90, 1, "r_A", this), new Among ( "ential", 90, 1, "r_A", this), new Among ( "ional", 84, 1, "r_A", this), new Among ( "ational", 94, 1, "r_B", this), new Among ( "izational", 95, 1, "r_A", this), new Among ( "ental", 84, 1, "r_A", this), new Among ( "ful", -1, 1, "r_A", this), new Among ( "eful", 98, 1, "r_A", this), new Among ( "iful", 98, 1, "r_A", this), new Among ( "yl", -1, 1, "r_R", this), new Among ( "ism", -1, 1, "r_B", this), new Among ( "icism", 102, 1, "r_A", this), new Among ( "oidism", 102, 1, "r_A", this), new Among ( "alism", 102, 1, "r_B", this), new Among ( "icalism", 105, 1, "r_A", this), new Among ( "ionalism", 105, 1, "r_A", this), new Among ( "inism", 102, 1, "r_J", this), new Among ( "ativism", 102, 1, "r_A", this), new Among ( "um", -1, 1, "r_U", this), new Among ( "ium", 110, 1, "r_A", this), new Among ( "ian", -1, 1, "r_A", this), new Among ( "ician", 112, 1, "r_A", this), new Among ( "en", -1, 1, "r_F", this), new Among ( "ogen", 114, 1, "r_A", this), new Among ( "on", -1, 1, "r_S", this), new Among ( "ion", 116, 1, "r_Q", this), new Among ( "ation", 117, 1, "r_B", this), new Among ( "ication", 118, 1, "r_G", this), new Among ( "entiation", 118, 1, "r_A", this), new Among ( "ination", 118, 1, "r_A", this), new Among ( "isation", 118, 1, "r_A", this), new Among ( "arisation", 122, 1, "r_A", this), new Among ( "entation", 118, 1, "r_A", this), new Among ( "ization", 118, 1, "r_F", this), new Among ( "arization", 125, 1, "r_A", this), new Among ( "action", 117, 1, "r_G", this), new Among ( "o", -1, 1, "r_A", this), new Among ( "ar", -1, 1, "r_X", this), new Among ( "ear", 129, 1, "r_Y", this), new Among ( "ier", -1, 1, "r_A", this), new Among ( "ariser", -1, 1, "r_A", this), new Among ( "izer", -1, 1, "r_F", this), new Among ( "arizer", 133, 1, "r_A", this), new Among ( "or", -1, 1, "r_T", this), new Among ( "ator", 135, 1, "r_A", this), new Among ( "s", -1, 1, "r_W", this), new Among ( "'s", 137, 1, "r_A", this), new Among ( "as", 137, 1, "r_B", this), new Among ( "ics", 137, 1, "r_A", this), new Among ( "istics", 140, 1, "r_A", this), new Among ( "es", 137, 1, "r_E", this), new Among ( "ances", 142, 1, "r_B", this), new Among ( "ences", 142, 1, "r_A", this), new Among ( "ides", 142, 1, "r_L", this), new Among ( "oides", 145, 1, "r_A", this), new Among ( "ages", 142, 1, "r_B", this), new Among ( "ies", 142, 1, "r_P", this), new Among ( "acies", 148, 1, "r_A", this), new Among ( "ancies", 148, 1, "r_A", this), new Among ( "encies", 148, 1, "r_A", this), new Among ( "aries", 148, 1, "r_A", this), new Among ( "ities", 148, 1, "r_A", this), new Among ( "alities", 153, 1, "r_A", this), new Among ( "ivities", 153, 1, "r_A", this), new Among ( "ines", 142, 1, "r_M", this), new Among ( "nesses", 142, 1, "r_A", this), new Among ( "ates", 142, 1, "r_A", this), new Among ( 
"atives", 142, 1, "r_A", this), new Among ( "ings", 137, 1, "r_N", this), new Among ( "is", 137, 1, "r_A", this), new Among ( "als", 137, 1, "r_BB", this), new Among ( "ials", 162, 1, "r_A", this), new Among ( "entials", 163, 1, "r_A", this), new Among ( "ionals", 162, 1, "r_A", this), new Among ( "isms", 137, 1, "r_B", this), new Among ( "ians", 137, 1, "r_A", this), new Among ( "icians", 167, 1, "r_A", this), new Among ( "ions", 137, 1, "r_B", this), new Among ( "ations", 169, 1, "r_B", this), new Among ( "arisations", 170, 1, "r_A", this), new Among ( "entations", 170, 1, "r_A", this), new Among ( "izations", 170, 1, "r_A", this), new Among ( "arizations", 173, 1, "r_A", this), new Among ( "ars", 137, 1, "r_O", this), new Among ( "iers", 137, 1, "r_A", this), new Among ( "izers", 137, 1, "r_F", this), new Among ( "ators", 137, 1, "r_A", this), new Among ( "less", 137, 1, "r_A", this), new Among ( "eless", 179, 1, "r_A", this), new Among ( "ness", 137, 1, "r_A", this), new Among ( "eness", 181, 1, "r_E", this), new Among ( "ableness", 182, 1, "r_A", this), new Among ( "eableness", 183, 1, "r_E", this), new Among ( "ibleness", 182, 1, "r_A", this), new Among ( "ateness", 182, 1, "r_A", this), new Among ( "iteness", 182, 1, "r_A", this), new Among ( "iveness", 182, 1, "r_A", this), new Among ( "ativeness", 188, 1, "r_A", this), new Among ( "ingness", 181, 1, "r_A", this), new Among ( "ishness", 181, 1, "r_A", this), new Among ( "iness", 181, 1, "r_A", this), new Among ( "ariness", 192, 1, "r_E", this), new Among ( "alness", 181, 1, "r_A", this), new Among ( "icalness", 194, 1, "r_A", this), new Among ( "antialness", 194, 1, "r_A", this), new Among ( "entialness", 194, 1, "r_A", this), new Among ( "ionalness", 194, 1, "r_A", this), new Among ( "fulness", 181, 1, "r_A", this), new Among ( "lessness", 181, 1, "r_A", this), new Among ( "ousness", 181, 1, "r_A", this), new Among ( "eousness", 201, 1, "r_A", this), new Among ( "iousness", 201, 1, "r_A", this), new Among ( "itousness", 201, 1, "r_A", this), new Among ( "entness", 181, 1, "r_A", this), new Among ( "ants", 137, 1, "r_B", this), new Among ( "ists", 137, 1, "r_A", this), new Among ( "icists", 207, 1, "r_A", this), new Among ( "us", 137, 1, "r_V", this), new Among ( "ous", 209, 1, "r_A", this), new Among ( "eous", 210, 1, "r_A", this), new Among ( "aceous", 211, 1, "r_A", this), new Among ( "antaneous", 211, 1, "r_A", this), new Among ( "ious", 210, 1, "r_A", this), new Among ( "acious", 214, 1, "r_B", this), new Among ( "itous", 210, 1, "r_A", this), new Among ( "ant", -1, 1, "r_B", this), new Among ( "icant", 217, 1, "r_A", this), new Among ( "ent", -1, 1, "r_C", this), new Among ( "ement", 219, 1, "r_A", this), new Among ( "izement", 220, 1, "r_A", this), new Among ( "ist", -1, 1, "r_A", this), new Among ( "icist", 222, 1, "r_A", this), new Among ( "alist", 222, 1, "r_A", this), new Among ( "icalist", 224, 1, "r_A", this), new Among ( "ialist", 224, 1, "r_A", this), new Among ( "ionist", 222, 1, "r_A", this), new Among ( "entist", 222, 1, "r_A", this), new Among ( "y", -1, 1, "r_B", this), new Among ( "acy", 229, 1, "r_A", this), new Among ( "ancy", 229, 1, "r_B", this), new Among ( "ency", 229, 1, "r_A", this), new Among ( "ly", 229, 1, "r_B", this), new Among ( "ealy", 233, 1, "r_Y", this), new Among ( "ably", 233, 1, "r_A", this), new Among ( "ibly", 233, 1, "r_A", this), new Among ( "edly", 233, 1, "r_E", this), new Among ( "iedly", 237, 1, "r_A", this), new Among ( "ely", 233, 1, "r_E", this), new Among ( "ately", 239, 1, 
"r_A", this), new Among ( "ively", 239, 1, "r_A", this), new Among ( "atively", 241, 1, "r_A", this), new Among ( "ingly", 233, 1, "r_B", this), new Among ( "atingly", 243, 1, "r_A", this), new Among ( "ily", 233, 1, "r_A", this), new Among ( "lily", 245, 1, "r_A", this), new Among ( "arily", 245, 1, "r_A", this), new Among ( "ally", 233, 1, "r_B", this), new Among ( "ically", 248, 1, "r_A", this), new Among ( "aically", 249, 1, "r_A", this), new Among ( "allically", 249, 1, "r_C", this), new Among ( "istically", 249, 1, "r_A", this), new Among ( "alistically", 252, 1, "r_B", this), new Among ( "oidally", 248, 1, "r_A", this), new Among ( "ially", 248, 1, "r_A", this), new Among ( "entially", 255, 1, "r_A", this), new Among ( "ionally", 248, 1, "r_A", this), new Among ( "ationally", 257, 1, "r_B", this), new Among ( "izationally", 258, 1, "r_B", this), new Among ( "entally", 248, 1, "r_A", this), new Among ( "fully", 233, 1, "r_A", this), new Among ( "efully", 261, 1, "r_A", this), new Among ( "ifully", 261, 1, "r_A", this), new Among ( "enly", 233, 1, "r_E", this), new Among ( "arly", 233, 1, "r_K", this), new Among ( "early", 265, 1, "r_Y", this), new Among ( "lessly", 233, 1, "r_A", this), new Among ( "ously", 233, 1, "r_A", this), new Among ( "eously", 268, 1, "r_A", this), new Among ( "iously", 268, 1, "r_A", this), new Among ( "ently", 233, 1, "r_A", this), new Among ( "ary", 229, 1, "r_F", this), new Among ( "ery", 229, 1, "r_E", this), new Among ( "icianry", 229, 1, "r_A", this), new Among ( "atory", 229, 1, "r_A", this), new Among ( "ity", 229, 1, "r_A", this), new Among ( "acity", 276, 1, "r_A", this), new Among ( "icity", 276, 1, "r_A", this), new Among ( "eity", 276, 1, "r_A", this), new Among ( "ality", 276, 1, "r_A", this), new Among ( "icality", 280, 1, "r_A", this), new Among ( "iality", 280, 1, "r_A", this), new Among ( "antiality", 282, 1, "r_A", this), new Among ( "entiality", 282, 1, "r_A", this), new Among ( "ionality", 280, 1, "r_A", this), new Among ( "elity", 276, 1, "r_A", this), new Among ( "ability", 276, 1, "r_A", this), new Among ( "izability", 287, 1, "r_A", this), new Among ( "arizability", 288, 1, "r_A", this), new Among ( "ibility", 276, 1, "r_A", this), new Among ( "inity", 276, 1, "r_CC", this), new Among ( "arity", 276, 1, "r_B", this), new Among ( "ivity", 276, 1, "r_A", this) }; private Among a_2[] = { new Among ( "bb", -1, -1, "", this), new Among ( "dd", -1, -1, "", this), new Among ( "gg", -1, -1, "", this), new Among ( "ll", -1, -1, "", this), new Among ( "mm", -1, -1, "", this), new Among ( "nn", -1, -1, "", this), new Among ( "pp", -1, -1, "", this), new Among ( "rr", -1, -1, "", this), new Among ( "ss", -1, -1, "", this), new Among ( "tt", -1, -1, "", this) }; private Among a_3[] = { new Among ( "uad", -1, 18, "", this), new Among ( "vad", -1, 19, "", this), new Among ( "cid", -1, 20, "", this), new Among ( "lid", -1, 21, "", this), new Among ( "erid", -1, 22, "", this), new Among ( "pand", -1, 23, "", this), new Among ( "end", -1, 24, "", this), new Among ( "ond", -1, 25, "", this), new Among ( "lud", -1, 26, "", this), new Among ( "rud", -1, 27, "", this), new Among ( "ul", -1, 9, "", this), new Among ( "her", -1, 28, "", this), new Among ( "metr", -1, 7, "", this), new Among ( "istr", -1, 6, "", this), new Among ( "urs", -1, 5, "", this), new Among ( "uct", -1, 2, "", this), new Among ( "et", -1, 32, "", this), new Among ( "mit", -1, 29, "", this), new Among ( "ent", -1, 30, "", this), new Among ( "umpt", -1, 3, "", this), new Among ( "rpt", 
-1, 4, "", this), new Among ( "ert", -1, 31, "", this), new Among ( "yt", -1, 33, "", this), new Among ( "iev", -1, 1, "", this), new Among ( "olv", -1, 8, "", this), new Among ( "ax", -1, 14, "", this), new Among ( "ex", -1, 15, "", this), new Among ( "bex", 26, 10, "", this), new Among ( "dex", 26, 11, "", this), new Among ( "pex", 26, 12, "", this), new Among ( "tex", 26, 13, "", this), new Among ( "ix", -1, 16, "", this), new Among ( "lux", -1, 17, "", this), new Among ( "yz", -1, 34, "", this) }; private void copy_from(LovinsStemmer other) { super.copy_from(other); } private boolean r_A() { // (, line 21 // hop, line 21 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } return true; } private boolean r_B() { // (, line 22 // hop, line 22 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } return true; } private boolean r_C() { // (, line 23 // hop, line 23 { int c = cursor - 4; if (limit_backward > c || c > limit) { return false; } cursor = c; } return true; } private boolean r_D() { // (, line 24 // hop, line 24 { int c = cursor - 5; if (limit_backward > c || c > limit) { return false; } cursor = c; } return true; } private boolean r_E() { int v_1; int v_2; // (, line 25 // test, line 25 v_1 = limit - cursor; // hop, line 25 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 25 { v_2 = limit - cursor; lab0: do { // literal, line 25 if (!(eq_s_b(1, "e"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } return true; } private boolean r_F() { int v_1; int v_2; // (, line 26 // test, line 26 v_1 = limit - cursor; // hop, line 26 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 26 { v_2 = limit - cursor; lab0: do { // literal, line 26 if (!(eq_s_b(1, "e"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } return true; } private boolean r_G() { int v_1; // (, line 27 // test, line 27 v_1 = limit - cursor; // hop, line 27 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // literal, line 27 if (!(eq_s_b(1, "f"))) { return false; } return true; } private boolean r_H() { int v_1; int v_2; // (, line 28 // test, line 28 v_1 = limit - cursor; // hop, line 28 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 28 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 28 if (!(eq_s_b(1, "t"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 28 if (!(eq_s_b(2, "ll"))) { return false; } } while (false); return true; } private boolean r_I() { int v_1; int v_2; int v_3; // (, line 29 // test, line 29 v_1 = limit - cursor; // hop, line 29 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 29 { v_2 = limit - cursor; lab0: do { // literal, line 29 if (!(eq_s_b(1, "o"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 29 { v_3 = limit - cursor; lab1: do { // literal, line 29 if (!(eq_s_b(1, "e"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } return true; } private boolean r_J() { int v_1; int v_2; int v_3; // (, line 30 // test, line 30 v_1 = limit - cursor; // hop, line 30 { int c = cursor - 2; if (limit_backward > c || c > 
limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 30 { v_2 = limit - cursor; lab0: do { // literal, line 30 if (!(eq_s_b(1, "a"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 30 { v_3 = limit - cursor; lab1: do { // literal, line 30 if (!(eq_s_b(1, "e"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } return true; } private boolean r_K() { int v_1; int v_2; // (, line 31 // test, line 31 v_1 = limit - cursor; // hop, line 31 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 31 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 31 if (!(eq_s_b(1, "l"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; lab2: do { // literal, line 31 if (!(eq_s_b(1, "i"))) { break lab2; } break lab0; } while (false); cursor = limit - v_2; // (, line 31 // literal, line 31 if (!(eq_s_b(1, "e"))) { return false; } // next, line 31 if (cursor <= limit_backward) { return false; } cursor--; // literal, line 31 if (!(eq_s_b(1, "u"))) { return false; } } while (false); return true; } private boolean r_L() { int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 32 // test, line 32 v_1 = limit - cursor; // hop, line 32 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 32 { v_2 = limit - cursor; lab0: do { // literal, line 32 if (!(eq_s_b(1, "u"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 32 { v_3 = limit - cursor; lab1: do { // literal, line 32 if (!(eq_s_b(1, "x"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } // not, line 32 { v_4 = limit - cursor; lab2: do { // (, line 32 // literal, line 32 if (!(eq_s_b(1, "s"))) { break lab2; } // not, line 32 { v_5 = limit - cursor; lab3: do { // literal, line 32 if (!(eq_s_b(1, "o"))) { break lab3; } break lab2; } while (false); cursor = limit - v_5; } return false; } while (false); cursor = limit - v_4; } return true; } private boolean r_M() { int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 33 // test, line 33 v_1 = limit - cursor; // hop, line 33 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 33 { v_2 = limit - cursor; lab0: do { // literal, line 33 if (!(eq_s_b(1, "a"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 33 { v_3 = limit - cursor; lab1: do { // literal, line 33 if (!(eq_s_b(1, "c"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } // not, line 33 { v_4 = limit - cursor; lab2: do { // literal, line 33 if (!(eq_s_b(1, "e"))) { break lab2; } return false; } while (false); cursor = limit - v_4; } // not, line 33 { v_5 = limit - cursor; lab3: do { // literal, line 33 if (!(eq_s_b(1, "m"))) { break lab3; } return false; } while (false); cursor = limit - v_5; } return true; } private boolean r_N() { int v_1; int v_2; int v_3; // (, line 34 // test, line 34 v_1 = limit - cursor; // hop, line 34 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // (, line 34 // hop, line 34 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } // or, line 34 lab0: do { v_2 = limit - cursor; lab1: do { // not, line 34 { v_3 = limit - cursor; lab2: do { // literal, line 34 if (!(eq_s_b(1, "s"))) { break lab2; } 
break lab1; } while (false); cursor = limit - v_3; } break lab0; } while (false); cursor = limit - v_2; // hop, line 34 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } } while (false); return true; } private boolean r_O() { int v_1; int v_2; // (, line 35 // test, line 35 v_1 = limit - cursor; // hop, line 35 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 35 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 35 if (!(eq_s_b(1, "l"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 35 if (!(eq_s_b(1, "i"))) { return false; } } while (false); return true; } private boolean r_P() { int v_1; int v_2; // (, line 36 // test, line 36 v_1 = limit - cursor; // hop, line 36 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 36 { v_2 = limit - cursor; lab0: do { // literal, line 36 if (!(eq_s_b(1, "c"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } return true; } private boolean r_Q() { int v_1; int v_2; int v_3; int v_4; // (, line 37 // test, line 37 v_1 = limit - cursor; // hop, line 37 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // test, line 37 v_2 = limit - cursor; // hop, line 37 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_2; // not, line 37 { v_3 = limit - cursor; lab0: do { // literal, line 37 if (!(eq_s_b(1, "l"))) { break lab0; } return false; } while (false); cursor = limit - v_3; } // not, line 37 { v_4 = limit - cursor; lab1: do { // literal, line 37 if (!(eq_s_b(1, "n"))) { break lab1; } return false; } while (false); cursor = limit - v_4; } return true; } private boolean r_R() { int v_1; int v_2; // (, line 38 // test, line 38 v_1 = limit - cursor; // hop, line 38 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 38 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 38 if (!(eq_s_b(1, "n"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 38 if (!(eq_s_b(1, "r"))) { return false; } } while (false); return true; } private boolean r_S() { int v_1; int v_2; int v_3; // (, line 39 // test, line 39 v_1 = limit - cursor; // hop, line 39 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 39 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 39 if (!(eq_s_b(2, "dr"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // (, line 39 // literal, line 39 if (!(eq_s_b(1, "t"))) { return false; } // not, line 39 { v_3 = limit - cursor; lab2: do { // literal, line 39 if (!(eq_s_b(1, "t"))) { break lab2; } return false; } while (false); cursor = limit - v_3; } } while (false); return true; } private boolean r_T() { int v_1; int v_2; int v_3; // (, line 40 // test, line 40 v_1 = limit - cursor; // hop, line 40 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 40 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 40 if (!(eq_s_b(1, "s"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // (, line 40 // literal, line 40 if (!(eq_s_b(1, "t"))) { return false; } // not, line 40 { v_3 = 
limit - cursor; lab2: do { // literal, line 40 if (!(eq_s_b(1, "o"))) { break lab2; } return false; } while (false); cursor = limit - v_3; } } while (false); return true; } private boolean r_U() { int v_1; int v_2; // (, line 41 // test, line 41 v_1 = limit - cursor; // hop, line 41 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 41 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 41 if (!(eq_s_b(1, "l"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; lab2: do { // literal, line 41 if (!(eq_s_b(1, "m"))) { break lab2; } break lab0; } while (false); cursor = limit - v_2; lab3: do { // literal, line 41 if (!(eq_s_b(1, "n"))) { break lab3; } break lab0; } while (false); cursor = limit - v_2; // literal, line 41 if (!(eq_s_b(1, "r"))) { return false; } } while (false); return true; } private boolean r_V() { int v_1; // (, line 42 // test, line 42 v_1 = limit - cursor; // hop, line 42 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // literal, line 42 if (!(eq_s_b(1, "c"))) { return false; } return true; } private boolean r_W() { int v_1; int v_2; int v_3; // (, line 43 // test, line 43 v_1 = limit - cursor; // hop, line 43 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 43 { v_2 = limit - cursor; lab0: do { // literal, line 43 if (!(eq_s_b(1, "s"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 43 { v_3 = limit - cursor; lab1: do { // literal, line 43 if (!(eq_s_b(1, "u"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } return true; } private boolean r_X() { int v_1; int v_2; // (, line 44 // test, line 44 v_1 = limit - cursor; // hop, line 44 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // or, line 44 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 44 if (!(eq_s_b(1, "l"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; lab2: do { // literal, line 44 if (!(eq_s_b(1, "i"))) { break lab2; } break lab0; } while (false); cursor = limit - v_2; // (, line 44 // literal, line 44 if (!(eq_s_b(1, "e"))) { return false; } // next, line 44 if (cursor <= limit_backward) { return false; } cursor--; // literal, line 44 if (!(eq_s_b(1, "u"))) { return false; } } while (false); return true; } private boolean r_Y() { int v_1; // (, line 45 // test, line 45 v_1 = limit - cursor; // hop, line 45 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // literal, line 45 if (!(eq_s_b(2, "in"))) { return false; } return true; } private boolean r_Z() { int v_1; int v_2; // (, line 46 // test, line 46 v_1 = limit - cursor; // hop, line 46 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 46 { v_2 = limit - cursor; lab0: do { // literal, line 46 if (!(eq_s_b(1, "f"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } return true; } private boolean r_AA() { int v_1; // (, line 47 // test, line 47 v_1 = limit - cursor; // hop, line 47 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // among, line 47 if (find_among_b(a_0, 9) == 0) { return false; } return true; } private boolean r_BB() { int v_1; 
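// condition BB: the candidate stem must be at least 3 characters long and must
// not end in "met" or "ryst" (checked by the hop and eq_s_b calls below)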
int v_2; int v_3; // (, line 49 // test, line 49 v_1 = limit - cursor; // hop, line 49 { int c = cursor - 3; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // not, line 49 { v_2 = limit - cursor; lab0: do { // literal, line 49 if (!(eq_s_b(3, "met"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // not, line 49 { v_3 = limit - cursor; lab1: do { // literal, line 49 if (!(eq_s_b(4, "ryst"))) { break lab1; } return false; } while (false); cursor = limit - v_3; } return true; } private boolean r_CC() { int v_1; // (, line 50 // test, line 50 v_1 = limit - cursor; // hop, line 50 { int c = cursor - 2; if (limit_backward > c || c > limit) { return false; } cursor = c; } cursor = limit - v_1; // literal, line 50 if (!(eq_s_b(1, "l"))) { return false; } return true; } private boolean r_endings() { int among_var; // (, line 55 // [, line 56 ket = cursor; // substring, line 56 among_var = find_among_b(a_1, 294); if (among_var == 0) { return false; } // ], line 56 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 145 // delete, line 145 slice_del(); break; } return true; } private boolean r_undouble() { int v_1; // (, line 151 // test, line 152 v_1 = limit - cursor; // substring, line 152 if (find_among_b(a_2, 10) == 0) { return false; } cursor = limit - v_1; // [, line 154 ket = cursor; // next, line 154 if (cursor <= limit_backward) { return false; } cursor--; // ], line 154 bra = cursor; // delete, line 154 slice_del(); return true; } private boolean r_respell() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; // (, line 159 // [, line 160 ket = cursor; // substring, line 160 among_var = find_among_b(a_3, 34); if (among_var == 0) { return false; } // ], line 160 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 161 // <-, line 161 slice_from("ief"); break; case 2: // (, line 162 // <-, line 162 slice_from("uc"); break; case 3: // (, line 163 // <-, line 163 slice_from("um"); break; case 4: // (, line 164 // <-, line 164 slice_from("rb"); break; case 5: // (, line 165 // <-, line 165 slice_from("ur"); break; case 6: // (, line 166 // <-, line 166 slice_from("ister"); break; case 7: // (, line 167 // <-, line 167 slice_from("meter"); break; case 8: // (, line 168 // <-, line 168 slice_from("olut"); break; case 9: // (, line 169 // not, line 169 { v_1 = limit - cursor; lab0: do { // literal, line 169 if (!(eq_s_b(1, "a"))) { break lab0; } return false; } while (false); cursor = limit - v_1; } // not, line 169 { v_2 = limit - cursor; lab1: do { // literal, line 169 if (!(eq_s_b(1, "i"))) { break lab1; } return false; } while (false); cursor = limit - v_2; } // not, line 169 { v_3 = limit - cursor; lab2: do { // literal, line 169 if (!(eq_s_b(1, "o"))) { break lab2; } return false; } while (false); cursor = limit - v_3; } // <-, line 169 slice_from("l"); break; case 10: // (, line 170 // <-, line 170 slice_from("bic"); break; case 11: // (, line 171 // <-, line 171 slice_from("dic"); break; case 12: // (, line 172 // <-, line 172 slice_from("pic"); break; case 13: // (, line 173 // <-, line 173 slice_from("tic"); break; case 14: // (, line 174 // <-, line 174 slice_from("ac"); break; case 15: // (, line 175 // <-, line 175 slice_from("ec"); break; case 16: // (, line 176 // <-, line 176 slice_from("ic"); break; case 17: // (, line 177 // <-, line 177 slice_from("luc"); break; case 18: // (, line 178 // <-, line 178 slice_from("uas"); break; 
case 19: // (, line 179 // <-, line 179 slice_from("vas"); break; case 20: // (, line 180 // <-, line 180 slice_from("cis"); break; case 21: // (, line 181 // <-, line 181 slice_from("lis"); break; case 22: // (, line 182 // <-, line 182 slice_from("eris"); break; case 23: // (, line 183 // <-, line 183 slice_from("pans"); break; case 24: // (, line 184 // not, line 184 { v_4 = limit - cursor; lab3: do { // literal, line 184 if (!(eq_s_b(1, "s"))) { break lab3; } return false; } while (false); cursor = limit - v_4; } // <-, line 184 slice_from("ens"); break; case 25: // (, line 185 // <-, line 185 slice_from("ons"); break; case 26: // (, line 186 // <-, line 186 slice_from("lus"); break; case 27: // (, line 187 // <-, line 187 slice_from("rus"); break; case 28: // (, line 188 // not, line 188 { v_5 = limit - cursor; lab4: do { // literal, line 188 if (!(eq_s_b(1, "p"))) { break lab4; } return false; } while (false); cursor = limit - v_5; } // not, line 188 { v_6 = limit - cursor; lab5: do { // literal, line 188 if (!(eq_s_b(1, "t"))) { break lab5; } return false; } while (false); cursor = limit - v_6; } // <-, line 188 slice_from("hes"); break; case 29: // (, line 189 // <-, line 189 slice_from("mis"); break; case 30: // (, line 190 // not, line 190 { v_7 = limit - cursor; lab6: do { // literal, line 190 if (!(eq_s_b(1, "m"))) { break lab6; } return false; } while (false); cursor = limit - v_7; } // <-, line 190 slice_from("ens"); break; case 31: // (, line 192 // <-, line 192 slice_from("ers"); break; case 32: // (, line 193 // not, line 193 { v_8 = limit - cursor; lab7: do { // literal, line 193 if (!(eq_s_b(1, "n"))) { break lab7; } return false; } while (false); cursor = limit - v_8; } // <-, line 193 slice_from("es"); break; case 33: // (, line 194 // <-, line 194 slice_from("ys"); break; case 34: // (, line 195 // <-, line 195 slice_from("ys"); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; // (, line 200 // backwards, line 202 limit_backward = cursor; cursor = limit; // (, line 202 // do, line 203 v_1 = limit - cursor; lab0: do { // call endings, line 203 if (!r_endings()) { break lab0; } } while (false); cursor = limit - v_1; // do, line 204 v_2 = limit - cursor; lab1: do { // call undouble, line 204 if (!r_undouble()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 205 v_3 = limit - cursor; lab2: do { // call respell, line 205 if (!r_respell()) { break lab2; } } while (false); cursor = limit - v_3; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/NorwegianStemmer.java0000644000175000017500000002531411474320235031660 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
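 * <p>
 * The generated {@code stem()} method first marks the R1 region
 * ({@code r_mark_regions}) and then runs three backward passes over the
 * word: {@code r_main_suffix}, {@code r_consonant_pair} and
 * {@code r_other_suffix}.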
*/ public class NorwegianStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "a", -1, 1, "", this), new Among ( "e", -1, 1, "", this), new Among ( "ede", 1, 1, "", this), new Among ( "ande", 1, 1, "", this), new Among ( "ende", 1, 1, "", this), new Among ( "ane", 1, 1, "", this), new Among ( "ene", 1, 1, "", this), new Among ( "hetene", 6, 1, "", this), new Among ( "erte", 1, 3, "", this), new Among ( "en", -1, 1, "", this), new Among ( "heten", 9, 1, "", this), new Among ( "ar", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "heter", 12, 1, "", this), new Among ( "s", -1, 2, "", this), new Among ( "as", 14, 1, "", this), new Among ( "es", 14, 1, "", this), new Among ( "edes", 16, 1, "", this), new Among ( "endes", 16, 1, "", this), new Among ( "enes", 16, 1, "", this), new Among ( "hetenes", 19, 1, "", this), new Among ( "ens", 14, 1, "", this), new Among ( "hetens", 21, 1, "", this), new Among ( "ers", 14, 1, "", this), new Among ( "ets", 14, 1, "", this), new Among ( "et", -1, 1, "", this), new Among ( "het", 25, 1, "", this), new Among ( "ert", -1, 3, "", this), new Among ( "ast", -1, 1, "", this) }; private Among a_1[] = { new Among ( "dt", -1, -1, "", this), new Among ( "vt", -1, -1, "", this) }; private Among a_2[] = { new Among ( "leg", -1, 1, "", this), new Among ( "eleg", 0, 1, "", this), new Among ( "ig", -1, 1, "", this), new Among ( "eig", 2, 1, "", this), new Among ( "lig", 2, 1, "", this), new Among ( "elig", 4, 1, "", this), new Among ( "els", -1, 1, "", this), new Among ( "lov", -1, 1, "", this), new Among ( "elov", 7, 1, "", this), new Among ( "slov", 7, 1, "", this), new Among ( "hetslov", 9, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; private static final char g_s_ending[] = {119, 125, 149, 1 }; private int I_x; private int I_p1; private void copy_from(NorwegianStemmer other) { I_x = other.I_x; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_2; // (, line 26 I_p1 = limit; // test, line 30 v_1 = cursor; // (, line 30 // hop, line 30 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } // setmark x, line 30 I_x = cursor; cursor = v_1; // goto, line 31 golab0: while(true) { v_2 = cursor; lab1: do { if (!(in_grouping(g_v, 97, 248))) { break lab1; } cursor = v_2; break golab0; } while (false); cursor = v_2; if (cursor >= limit) { return false; } cursor++; } // gopast, line 31 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 248))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 31 I_p1 = cursor; // try, line 32 lab4: do { // (, line 32 if (!(I_p1 < I_x)) { break lab4; } I_p1 = I_x; } while (false); return true; } private boolean r_main_suffix() { int among_var; int v_1; int v_2; int v_3; // (, line 37 // setlimit, line 38 v_1 = limit - cursor; // tomark, line 38 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 38 // [, line 38 ket = cursor; // substring, line 38 among_var = find_among_b(a_0, 29); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 38 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 44 // delete, line 44 slice_del(); break; case 2: // (, line 46 // or, line 46 lab0: do { v_3 = limit - cursor; lab1: do { if (!(in_grouping_b(g_s_ending, 98, 122))) { break 
lab1; } break lab0; } while (false); cursor = limit - v_3; // (, line 46 // literal, line 46 if (!(eq_s_b(1, "k"))) { return false; } if (!(out_grouping_b(g_v, 97, 248))) { return false; } } while (false); // delete, line 46 slice_del(); break; case 3: // (, line 48 // <-, line 48 slice_from("er"); break; } return true; } private boolean r_consonant_pair() { int v_1; int v_2; int v_3; // (, line 52 // test, line 53 v_1 = limit - cursor; // (, line 53 // setlimit, line 54 v_2 = limit - cursor; // tomark, line 54 if (cursor < I_p1) { return false; } cursor = I_p1; v_3 = limit_backward; limit_backward = cursor; cursor = limit - v_2; // (, line 54 // [, line 54 ket = cursor; // substring, line 54 if (find_among_b(a_1, 2) == 0) { limit_backward = v_3; return false; } // ], line 54 bra = cursor; limit_backward = v_3; cursor = limit - v_1; // next, line 59 if (cursor <= limit_backward) { return false; } cursor--; // ], line 59 bra = cursor; // delete, line 59 slice_del(); return true; } private boolean r_other_suffix() { int among_var; int v_1; int v_2; // (, line 62 // setlimit, line 63 v_1 = limit - cursor; // tomark, line 63 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 63 // [, line 63 ket = cursor; // substring, line 63 among_var = find_among_b(a_2, 11); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 63 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 67 // delete, line 67 slice_del(); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; // (, line 72 // do, line 74 v_1 = cursor; lab0: do { // call mark_regions, line 74 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 75 limit_backward = cursor; cursor = limit; // (, line 75 // do, line 76 v_2 = limit - cursor; lab1: do { // call main_suffix, line 76 if (!r_main_suffix()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 77 v_3 = limit - cursor; lab2: do { // call consonant_pair, line 77 if (!r_consonant_pair()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 78 v_4 = limit - cursor; lab3: do { // call other_suffix, line 78 if (!r_other_suffix()) { break lab3; } } while (false); cursor = limit - v_4; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/SwedishStemmer.java0000644000175000017500000002471711474320235031343 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
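 * <p>
 * In a Lucene analysis chain this class is usually not called directly but
 * wrapped by the contrib Snowball filter. A minimal sketch, assuming the
 * {@code SnowballFilter(TokenStream, String)} constructor of
 * {@code org.apache.lucene.analysis.snowball.SnowballFilter}, which resolves
 * the stemmer class by name:
 * <pre>
 *   Reader reader = new StringReader("kyrkorna kyrkan kyrka");
 *   TokenStream ts = new StandardTokenizer(Version.LUCENE_29, reader);
 *   ts = new SnowballFilter(ts, "Swedish");   // resolves to SwedishStemmer
 * </pre>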
*/ public class SwedishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "a", -1, 1, "", this), new Among ( "arna", 0, 1, "", this), new Among ( "erna", 0, 1, "", this), new Among ( "heterna", 2, 1, "", this), new Among ( "orna", 0, 1, "", this), new Among ( "ad", -1, 1, "", this), new Among ( "e", -1, 1, "", this), new Among ( "ade", 6, 1, "", this), new Among ( "ande", 6, 1, "", this), new Among ( "arne", 6, 1, "", this), new Among ( "are", 6, 1, "", this), new Among ( "aste", 6, 1, "", this), new Among ( "en", -1, 1, "", this), new Among ( "anden", 12, 1, "", this), new Among ( "aren", 12, 1, "", this), new Among ( "heten", 12, 1, "", this), new Among ( "ern", -1, 1, "", this), new Among ( "ar", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "heter", 18, 1, "", this), new Among ( "or", -1, 1, "", this), new Among ( "s", -1, 2, "", this), new Among ( "as", 21, 1, "", this), new Among ( "arnas", 22, 1, "", this), new Among ( "ernas", 22, 1, "", this), new Among ( "ornas", 22, 1, "", this), new Among ( "es", 21, 1, "", this), new Among ( "ades", 26, 1, "", this), new Among ( "andes", 26, 1, "", this), new Among ( "ens", 21, 1, "", this), new Among ( "arens", 29, 1, "", this), new Among ( "hetens", 29, 1, "", this), new Among ( "erns", 21, 1, "", this), new Among ( "at", -1, 1, "", this), new Among ( "andet", -1, 1, "", this), new Among ( "het", -1, 1, "", this), new Among ( "ast", -1, 1, "", this) }; private Among a_1[] = { new Among ( "dd", -1, -1, "", this), new Among ( "gd", -1, -1, "", this), new Among ( "nn", -1, -1, "", this), new Among ( "dt", -1, -1, "", this), new Among ( "gt", -1, -1, "", this), new Among ( "kt", -1, -1, "", this), new Among ( "tt", -1, -1, "", this) }; private Among a_2[] = { new Among ( "ig", -1, 1, "", this), new Among ( "lig", 0, 1, "", this), new Among ( "els", -1, 1, "", this), new Among ( "fullt", -1, 3, "", this), new Among ( "l\u00F6st", -1, 2, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 32 }; private static final char g_s_ending[] = {119, 127, 149 }; private int I_x; private int I_p1; private void copy_from(SwedishStemmer other) { I_x = other.I_x; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_2; // (, line 26 I_p1 = limit; // test, line 29 v_1 = cursor; // (, line 29 // hop, line 29 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } // setmark x, line 29 I_x = cursor; cursor = v_1; // goto, line 30 golab0: while(true) { v_2 = cursor; lab1: do { if (!(in_grouping(g_v, 97, 246))) { break lab1; } cursor = v_2; break golab0; } while (false); cursor = v_2; if (cursor >= limit) { return false; } cursor++; } // gopast, line 30 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 246))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 30 I_p1 = cursor; // try, line 31 lab4: do { // (, line 31 if (!(I_p1 < I_x)) { break lab4; } I_p1 = I_x; } while (false); return true; } private boolean r_main_suffix() { int among_var; int v_1; int v_2; // (, line 36 // setlimit, line 37 v_1 = limit - cursor; // tomark, line 37 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 37 // [, line 37 ket = cursor; // substring, line 37 among_var = find_among_b(a_0, 37); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 37 bra = cursor; 
limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 44 // delete, line 44 slice_del(); break; case 2: // (, line 46 if (!(in_grouping_b(g_s_ending, 98, 121))) { return false; } // delete, line 46 slice_del(); break; } return true; } private boolean r_consonant_pair() { int v_1; int v_2; int v_3; // setlimit, line 50 v_1 = limit - cursor; // tomark, line 50 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 50 // and, line 52 v_3 = limit - cursor; // among, line 51 if (find_among_b(a_1, 7) == 0) { limit_backward = v_2; return false; } cursor = limit - v_3; // (, line 52 // [, line 52 ket = cursor; // next, line 52 if (cursor <= limit_backward) { limit_backward = v_2; return false; } cursor--; // ], line 52 bra = cursor; // delete, line 52 slice_del(); limit_backward = v_2; return true; } private boolean r_other_suffix() { int among_var; int v_1; int v_2; // setlimit, line 55 v_1 = limit - cursor; // tomark, line 55 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 55 // [, line 56 ket = cursor; // substring, line 56 among_var = find_among_b(a_2, 5); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 56 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 57 // delete, line 57 slice_del(); break; case 2: // (, line 58 // <-, line 58 slice_from("l\u00F6s"); break; case 3: // (, line 59 // <-, line 59 slice_from("full"); break; } limit_backward = v_2; return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; // (, line 64 // do, line 66 v_1 = cursor; lab0: do { // call mark_regions, line 66 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 67 limit_backward = cursor; cursor = limit; // (, line 67 // do, line 68 v_2 = limit - cursor; lab1: do { // call main_suffix, line 68 if (!r_main_suffix()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 69 v_3 = limit - cursor; lab2: do { // call consonant_pair, line 69 if (!r_consonant_pair()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 70 v_4 = limit - cursor; lab3: do { // call other_suffix, line 70 if (!r_other_suffix()) { break lab3; } } while (false); cursor = limit - v_4; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/HungarianStemmer.java0000644000175000017500000011063611474320235031645 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
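 * <p>
 * The generated {@code stem()} method marks R1 and then applies the suffix
 * passes in order: {@code r_instrum}, {@code r_case}, {@code r_case_special},
 * {@code r_case_other}, {@code r_factive}, {@code r_owned},
 * {@code r_sing_owner}, {@code r_plur_owner} and finally {@code r_plural}.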
*/ public class HungarianStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "cs", -1, -1, "", this), new Among ( "dzs", -1, -1, "", this), new Among ( "gy", -1, -1, "", this), new Among ( "ly", -1, -1, "", this), new Among ( "ny", -1, -1, "", this), new Among ( "sz", -1, -1, "", this), new Among ( "ty", -1, -1, "", this), new Among ( "zs", -1, -1, "", this) }; private Among a_1[] = { new Among ( "\u00E1", -1, 1, "", this), new Among ( "\u00E9", -1, 2, "", this) }; private Among a_2[] = { new Among ( "bb", -1, -1, "", this), new Among ( "cc", -1, -1, "", this), new Among ( "dd", -1, -1, "", this), new Among ( "ff", -1, -1, "", this), new Among ( "gg", -1, -1, "", this), new Among ( "jj", -1, -1, "", this), new Among ( "kk", -1, -1, "", this), new Among ( "ll", -1, -1, "", this), new Among ( "mm", -1, -1, "", this), new Among ( "nn", -1, -1, "", this), new Among ( "pp", -1, -1, "", this), new Among ( "rr", -1, -1, "", this), new Among ( "ccs", -1, -1, "", this), new Among ( "ss", -1, -1, "", this), new Among ( "zzs", -1, -1, "", this), new Among ( "tt", -1, -1, "", this), new Among ( "vv", -1, -1, "", this), new Among ( "ggy", -1, -1, "", this), new Among ( "lly", -1, -1, "", this), new Among ( "nny", -1, -1, "", this), new Among ( "tty", -1, -1, "", this), new Among ( "ssz", -1, -1, "", this), new Among ( "zz", -1, -1, "", this) }; private Among a_3[] = { new Among ( "al", -1, 1, "", this), new Among ( "el", -1, 2, "", this) }; private Among a_4[] = { new Among ( "ba", -1, -1, "", this), new Among ( "ra", -1, -1, "", this), new Among ( "be", -1, -1, "", this), new Among ( "re", -1, -1, "", this), new Among ( "ig", -1, -1, "", this), new Among ( "nak", -1, -1, "", this), new Among ( "nek", -1, -1, "", this), new Among ( "val", -1, -1, "", this), new Among ( "vel", -1, -1, "", this), new Among ( "ul", -1, -1, "", this), new Among ( "n\u00E1l", -1, -1, "", this), new Among ( "n\u00E9l", -1, -1, "", this), new Among ( "b\u00F3l", -1, -1, "", this), new Among ( "r\u00F3l", -1, -1, "", this), new Among ( "t\u00F3l", -1, -1, "", this), new Among ( "b\u00F5l", -1, -1, "", this), new Among ( "r\u00F5l", -1, -1, "", this), new Among ( "t\u00F5l", -1, -1, "", this), new Among ( "\u00FCl", -1, -1, "", this), new Among ( "n", -1, -1, "", this), new Among ( "an", 19, -1, "", this), new Among ( "ban", 20, -1, "", this), new Among ( "en", 19, -1, "", this), new Among ( "ben", 22, -1, "", this), new Among ( "k\u00E9ppen", 22, -1, "", this), new Among ( "on", 19, -1, "", this), new Among ( "\u00F6n", 19, -1, "", this), new Among ( "k\u00E9pp", -1, -1, "", this), new Among ( "kor", -1, -1, "", this), new Among ( "t", -1, -1, "", this), new Among ( "at", 29, -1, "", this), new Among ( "et", 29, -1, "", this), new Among ( "k\u00E9nt", 29, -1, "", this), new Among ( "ank\u00E9nt", 32, -1, "", this), new Among ( "enk\u00E9nt", 32, -1, "", this), new Among ( "onk\u00E9nt", 32, -1, "", this), new Among ( "ot", 29, -1, "", this), new Among ( "\u00E9rt", 29, -1, "", this), new Among ( "\u00F6t", 29, -1, "", this), new Among ( "hez", -1, -1, "", this), new Among ( "hoz", -1, -1, "", this), new Among ( "h\u00F6z", -1, -1, "", this), new Among ( "v\u00E1", -1, -1, "", this), new Among ( "v\u00E9", -1, -1, "", this) }; private Among a_5[] = { new Among ( "\u00E1n", -1, 2, "", this), new Among ( "\u00E9n", -1, 1, "", this), new Among ( "\u00E1nk\u00E9nt", -1, 3, "", this) }; private Among a_6[] = { new Among ( "stul", -1, 2, "", this), new Among ( "astul", 0, 1, "", this), new Among ( "\u00E1stul", 0, 
3, "", this), new Among ( "st\u00FCl", -1, 2, "", this), new Among ( "est\u00FCl", 3, 1, "", this), new Among ( "\u00E9st\u00FCl", 3, 4, "", this) }; private Among a_7[] = { new Among ( "\u00E1", -1, 1, "", this), new Among ( "\u00E9", -1, 2, "", this) }; private Among a_8[] = { new Among ( "k", -1, 7, "", this), new Among ( "ak", 0, 4, "", this), new Among ( "ek", 0, 6, "", this), new Among ( "ok", 0, 5, "", this), new Among ( "\u00E1k", 0, 1, "", this), new Among ( "\u00E9k", 0, 2, "", this), new Among ( "\u00F6k", 0, 3, "", this) }; private Among a_9[] = { new Among ( "\u00E9i", -1, 7, "", this), new Among ( "\u00E1\u00E9i", 0, 6, "", this), new Among ( "\u00E9\u00E9i", 0, 5, "", this), new Among ( "\u00E9", -1, 9, "", this), new Among ( "k\u00E9", 3, 4, "", this), new Among ( "ak\u00E9", 4, 1, "", this), new Among ( "ek\u00E9", 4, 1, "", this), new Among ( "ok\u00E9", 4, 1, "", this), new Among ( "\u00E1k\u00E9", 4, 3, "", this), new Among ( "\u00E9k\u00E9", 4, 2, "", this), new Among ( "\u00F6k\u00E9", 4, 1, "", this), new Among ( "\u00E9\u00E9", 3, 8, "", this) }; private Among a_10[] = { new Among ( "a", -1, 18, "", this), new Among ( "ja", 0, 17, "", this), new Among ( "d", -1, 16, "", this), new Among ( "ad", 2, 13, "", this), new Among ( "ed", 2, 13, "", this), new Among ( "od", 2, 13, "", this), new Among ( "\u00E1d", 2, 14, "", this), new Among ( "\u00E9d", 2, 15, "", this), new Among ( "\u00F6d", 2, 13, "", this), new Among ( "e", -1, 18, "", this), new Among ( "je", 9, 17, "", this), new Among ( "nk", -1, 4, "", this), new Among ( "unk", 11, 1, "", this), new Among ( "\u00E1nk", 11, 2, "", this), new Among ( "\u00E9nk", 11, 3, "", this), new Among ( "\u00FCnk", 11, 1, "", this), new Among ( "uk", -1, 8, "", this), new Among ( "juk", 16, 7, "", this), new Among ( "\u00E1juk", 17, 5, "", this), new Among ( "\u00FCk", -1, 8, "", this), new Among ( "j\u00FCk", 19, 7, "", this), new Among ( "\u00E9j\u00FCk", 20, 6, "", this), new Among ( "m", -1, 12, "", this), new Among ( "am", 22, 9, "", this), new Among ( "em", 22, 9, "", this), new Among ( "om", 22, 9, "", this), new Among ( "\u00E1m", 22, 10, "", this), new Among ( "\u00E9m", 22, 11, "", this), new Among ( "o", -1, 18, "", this), new Among ( "\u00E1", -1, 19, "", this), new Among ( "\u00E9", -1, 20, "", this) }; private Among a_11[] = { new Among ( "id", -1, 10, "", this), new Among ( "aid", 0, 9, "", this), new Among ( "jaid", 1, 6, "", this), new Among ( "eid", 0, 9, "", this), new Among ( "jeid", 3, 6, "", this), new Among ( "\u00E1id", 0, 7, "", this), new Among ( "\u00E9id", 0, 8, "", this), new Among ( "i", -1, 15, "", this), new Among ( "ai", 7, 14, "", this), new Among ( "jai", 8, 11, "", this), new Among ( "ei", 7, 14, "", this), new Among ( "jei", 10, 11, "", this), new Among ( "\u00E1i", 7, 12, "", this), new Among ( "\u00E9i", 7, 13, "", this), new Among ( "itek", -1, 24, "", this), new Among ( "eitek", 14, 21, "", this), new Among ( "jeitek", 15, 20, "", this), new Among ( "\u00E9itek", 14, 23, "", this), new Among ( "ik", -1, 29, "", this), new Among ( "aik", 18, 26, "", this), new Among ( "jaik", 19, 25, "", this), new Among ( "eik", 18, 26, "", this), new Among ( "jeik", 21, 25, "", this), new Among ( "\u00E1ik", 18, 27, "", this), new Among ( "\u00E9ik", 18, 28, "", this), new Among ( "ink", -1, 20, "", this), new Among ( "aink", 25, 17, "", this), new Among ( "jaink", 26, 16, "", this), new Among ( "eink", 25, 17, "", this), new Among ( "jeink", 28, 16, "", this), new Among ( "\u00E1ink", 25, 18, "", this), 
new Among ( "\u00E9ink", 25, 19, "", this), new Among ( "aitok", -1, 21, "", this), new Among ( "jaitok", 32, 20, "", this), new Among ( "\u00E1itok", -1, 22, "", this), new Among ( "im", -1, 5, "", this), new Among ( "aim", 35, 4, "", this), new Among ( "jaim", 36, 1, "", this), new Among ( "eim", 35, 4, "", this), new Among ( "jeim", 38, 1, "", this), new Among ( "\u00E1im", 35, 2, "", this), new Among ( "\u00E9im", 35, 3, "", this) }; private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 52, 14 }; private int I_p1; private void copy_from(HungarianStemmer other) { I_p1 = other.I_p1; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_2; int v_3; // (, line 44 I_p1 = limit; // or, line 51 lab0: do { v_1 = cursor; lab1: do { // (, line 48 if (!(in_grouping(g_v, 97, 252))) { break lab1; } // goto, line 48 golab2: while(true) { v_2 = cursor; lab3: do { if (!(out_grouping(g_v, 97, 252))) { break lab3; } cursor = v_2; break golab2; } while (false); cursor = v_2; if (cursor >= limit) { break lab1; } cursor++; } // or, line 49 lab4: do { v_3 = cursor; lab5: do { // among, line 49 if (find_among(a_0, 8) == 0) { break lab5; } break lab4; } while (false); cursor = v_3; // next, line 49 if (cursor >= limit) { break lab1; } cursor++; } while (false); // setmark p1, line 50 I_p1 = cursor; break lab0; } while (false); cursor = v_1; // (, line 53 if (!(out_grouping(g_v, 97, 252))) { return false; } // gopast, line 53 golab6: while(true) { lab7: do { if (!(in_grouping(g_v, 97, 252))) { break lab7; } break golab6; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 53 I_p1 = cursor; } while (false); return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_v_ending() { int among_var; // (, line 60 // [, line 61 ket = cursor; // substring, line 61 among_var = find_among_b(a_1, 2); if (among_var == 0) { return false; } // ], line 61 bra = cursor; // call R1, line 61 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 62 // <-, line 62 slice_from("a"); break; case 2: // (, line 63 // <-, line 63 slice_from("e"); break; } return true; } private boolean r_double() { int v_1; // (, line 67 // test, line 68 v_1 = limit - cursor; // among, line 68 if (find_among_b(a_2, 23) == 0) { return false; } cursor = limit - v_1; return true; } private boolean r_undouble() { // (, line 72 // next, line 73 if (cursor <= limit_backward) { return false; } cursor--; // [, line 73 ket = cursor; // hop, line 73 { int c = cursor - 1; if (limit_backward > c || c > limit) { return false; } cursor = c; } // ], line 73 bra = cursor; // delete, line 73 slice_del(); return true; } private boolean r_instrum() { int among_var; // (, line 76 // [, line 77 ket = cursor; // substring, line 77 among_var = find_among_b(a_3, 2); if (among_var == 0) { return false; } // ], line 77 bra = cursor; // call R1, line 77 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 78 // call double, line 78 if (!r_double()) { return false; } break; case 2: // (, line 79 // call double, line 79 if (!r_double()) { return false; } break; } // delete, line 81 slice_del(); // call undouble, line 82 if (!r_undouble()) { return false; } return true; } private boolean r_case() { // (, line 86 // [, line 87 ket = cursor; // substring, line 87 if (find_among_b(a_4, 44) == 0) { return false; } // ], line 87 bra = cursor; // call R1, line 87 
if (!r_R1()) { return false; } // delete, line 111 slice_del(); // call v_ending, line 112 if (!r_v_ending()) { return false; } return true; } private boolean r_case_special() { int among_var; // (, line 115 // [, line 116 ket = cursor; // substring, line 116 among_var = find_among_b(a_5, 3); if (among_var == 0) { return false; } // ], line 116 bra = cursor; // call R1, line 116 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 117 // <-, line 117 slice_from("e"); break; case 2: // (, line 118 // <-, line 118 slice_from("a"); break; case 3: // (, line 119 // <-, line 119 slice_from("a"); break; } return true; } private boolean r_case_other() { int among_var; // (, line 123 // [, line 124 ket = cursor; // substring, line 124 among_var = find_among_b(a_6, 6); if (among_var == 0) { return false; } // ], line 124 bra = cursor; // call R1, line 124 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 125 // delete, line 125 slice_del(); break; case 2: // (, line 126 // delete, line 126 slice_del(); break; case 3: // (, line 127 // <-, line 127 slice_from("a"); break; case 4: // (, line 128 // <-, line 128 slice_from("e"); break; } return true; } private boolean r_factive() { int among_var; // (, line 132 // [, line 133 ket = cursor; // substring, line 133 among_var = find_among_b(a_7, 2); if (among_var == 0) { return false; } // ], line 133 bra = cursor; // call R1, line 133 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 134 // call double, line 134 if (!r_double()) { return false; } break; case 2: // (, line 135 // call double, line 135 if (!r_double()) { return false; } break; } // delete, line 137 slice_del(); // call undouble, line 138 if (!r_undouble()) { return false; } return true; } private boolean r_plural() { int among_var; // (, line 141 // [, line 142 ket = cursor; // substring, line 142 among_var = find_among_b(a_8, 7); if (among_var == 0) { return false; } // ], line 142 bra = cursor; // call R1, line 142 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 143 // <-, line 143 slice_from("a"); break; case 2: // (, line 144 // <-, line 144 slice_from("e"); break; case 3: // (, line 145 // delete, line 145 slice_del(); break; case 4: // (, line 146 // delete, line 146 slice_del(); break; case 5: // (, line 147 // delete, line 147 slice_del(); break; case 6: // (, line 148 // delete, line 148 slice_del(); break; case 7: // (, line 149 // delete, line 149 slice_del(); break; } return true; } private boolean r_owned() { int among_var; // (, line 153 // [, line 154 ket = cursor; // substring, line 154 among_var = find_among_b(a_9, 12); if (among_var == 0) { return false; } // ], line 154 bra = cursor; // call R1, line 154 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 155 // delete, line 155 slice_del(); break; case 2: // (, line 156 // <-, line 156 slice_from("e"); break; case 3: // (, line 157 // <-, line 157 slice_from("a"); break; case 4: // (, line 158 // delete, line 158 slice_del(); break; case 5: // (, line 159 // <-, line 159 slice_from("e"); break; case 6: // (, line 160 // <-, line 160 slice_from("a"); break; case 7: // (, line 161 // delete, line 161 slice_del(); break; case 8: // (, line 162 // <-, line 162 slice_from("e"); break; case 9: // (, line 163 // delete, line 163 slice_del(); break; } return true; } private boolean r_sing_owner() { int among_var; // (, line 
167 // [, line 168 ket = cursor; // substring, line 168 among_var = find_among_b(a_10, 31); if (among_var == 0) { return false; } // ], line 168 bra = cursor; // call R1, line 168 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 169 // delete, line 169 slice_del(); break; case 2: // (, line 170 // <-, line 170 slice_from("a"); break; case 3: // (, line 171 // <-, line 171 slice_from("e"); break; case 4: // (, line 172 // delete, line 172 slice_del(); break; case 5: // (, line 173 // <-, line 173 slice_from("a"); break; case 6: // (, line 174 // <-, line 174 slice_from("e"); break; case 7: // (, line 175 // delete, line 175 slice_del(); break; case 8: // (, line 176 // delete, line 176 slice_del(); break; case 9: // (, line 177 // delete, line 177 slice_del(); break; case 10: // (, line 178 // <-, line 178 slice_from("a"); break; case 11: // (, line 179 // <-, line 179 slice_from("e"); break; case 12: // (, line 180 // delete, line 180 slice_del(); break; case 13: // (, line 181 // delete, line 181 slice_del(); break; case 14: // (, line 182 // <-, line 182 slice_from("a"); break; case 15: // (, line 183 // <-, line 183 slice_from("e"); break; case 16: // (, line 184 // delete, line 184 slice_del(); break; case 17: // (, line 185 // delete, line 185 slice_del(); break; case 18: // (, line 186 // delete, line 186 slice_del(); break; case 19: // (, line 187 // <-, line 187 slice_from("a"); break; case 20: // (, line 188 // <-, line 188 slice_from("e"); break; } return true; } private boolean r_plur_owner() { int among_var; // (, line 192 // [, line 193 ket = cursor; // substring, line 193 among_var = find_among_b(a_11, 42); if (among_var == 0) { return false; } // ], line 193 bra = cursor; // call R1, line 193 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 194 // delete, line 194 slice_del(); break; case 2: // (, line 195 // <-, line 195 slice_from("a"); break; case 3: // (, line 196 // <-, line 196 slice_from("e"); break; case 4: // (, line 197 // delete, line 197 slice_del(); break; case 5: // (, line 198 // delete, line 198 slice_del(); break; case 6: // (, line 199 // delete, line 199 slice_del(); break; case 7: // (, line 200 // <-, line 200 slice_from("a"); break; case 8: // (, line 201 // <-, line 201 slice_from("e"); break; case 9: // (, line 202 // delete, line 202 slice_del(); break; case 10: // (, line 203 // delete, line 203 slice_del(); break; case 11: // (, line 204 // delete, line 204 slice_del(); break; case 12: // (, line 205 // <-, line 205 slice_from("a"); break; case 13: // (, line 206 // <-, line 206 slice_from("e"); break; case 14: // (, line 207 // delete, line 207 slice_del(); break; case 15: // (, line 208 // delete, line 208 slice_del(); break; case 16: // (, line 209 // delete, line 209 slice_del(); break; case 17: // (, line 210 // delete, line 210 slice_del(); break; case 18: // (, line 211 // <-, line 211 slice_from("a"); break; case 19: // (, line 212 // <-, line 212 slice_from("e"); break; case 20: // (, line 214 // delete, line 214 slice_del(); break; case 21: // (, line 215 // delete, line 215 slice_del(); break; case 22: // (, line 216 // <-, line 216 slice_from("a"); break; case 23: // (, line 217 // <-, line 217 slice_from("e"); break; case 24: // (, line 218 // delete, line 218 slice_del(); break; case 25: // (, line 219 // delete, line 219 slice_del(); break; case 26: // (, line 220 // delete, line 220 slice_del(); break; case 27: // (, line 221 // <-, line 221 
slice_from("a"); break; case 28: // (, line 222 // <-, line 222 slice_from("e"); break; case 29: // (, line 223 // delete, line 223 slice_del(); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; // (, line 228 // do, line 229 v_1 = cursor; lab0: do { // call mark_regions, line 229 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 230 limit_backward = cursor; cursor = limit; // (, line 230 // do, line 231 v_2 = limit - cursor; lab1: do { // call instrum, line 231 if (!r_instrum()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 232 v_3 = limit - cursor; lab2: do { // call case, line 232 if (!r_case()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 233 v_4 = limit - cursor; lab3: do { // call case_special, line 233 if (!r_case_special()) { break lab3; } } while (false); cursor = limit - v_4; // do, line 234 v_5 = limit - cursor; lab4: do { // call case_other, line 234 if (!r_case_other()) { break lab4; } } while (false); cursor = limit - v_5; // do, line 235 v_6 = limit - cursor; lab5: do { // call factive, line 235 if (!r_factive()) { break lab5; } } while (false); cursor = limit - v_6; // do, line 236 v_7 = limit - cursor; lab6: do { // call owned, line 236 if (!r_owned()) { break lab6; } } while (false); cursor = limit - v_7; // do, line 237 v_8 = limit - cursor; lab7: do { // call sing_owner, line 237 if (!r_sing_owner()) { break lab7; } } while (false); cursor = limit - v_8; // do, line 238 v_9 = limit - cursor; lab8: do { // call plur_owner, line 238 if (!r_plur_owner()) { break lab8; } } while (false); cursor = limit - v_9; // do, line 239 v_10 = limit - cursor; lab9: do { // call plural, line 239 if (!r_plural()) { break lab9; } } while (false); cursor = limit - v_10; cursor = limit_backward; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/PorterStemmer.java0000644000175000017500000007021511474320235031202 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class PorterStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "s", -1, 3, "", this), new Among ( "ies", 0, 2, "", this), new Among ( "sses", 0, 1, "", this), new Among ( "ss", 0, -1, "", this) }; private Among a_1[] = { new Among ( "", -1, 3, "", this), new Among ( "bb", 0, 2, "", this), new Among ( "dd", 0, 2, "", this), new Among ( "ff", 0, 2, "", this), new Among ( "gg", 0, 2, "", this), new Among ( "bl", 0, 1, "", this), new Among ( "mm", 0, 2, "", this), new Among ( "nn", 0, 2, "", this), new Among ( "pp", 0, 2, "", this), new Among ( "rr", 0, 2, "", this), new Among ( "at", 0, 1, "", this), new Among ( "tt", 0, 2, "", this), new Among ( "iz", 0, 1, "", this) }; private Among a_2[] = { new Among ( "ed", -1, 2, "", this), new Among ( "eed", 0, 1, "", this), new Among ( "ing", -1, 2, "", this) }; private Among a_3[] = { new Among ( "anci", -1, 3, "", this), new Among ( "enci", -1, 2, "", this), new Among ( "abli", -1, 4, "", this), new Among ( "eli", -1, 6, "", this), new Among ( "alli", -1, 9, "", this), new Among ( "ousli", -1, 12, "", this), new Among ( "entli", -1, 5, "", this), new Among ( "aliti", -1, 10, "", this), new Among ( "biliti", -1, 14, "", this), new Among ( "iviti", -1, 13, "", this), new Among ( "tional", -1, 1, "", this), new Among ( "ational", 10, 8, "", this), new Among ( "alism", -1, 10, "", this), new Among ( "ation", -1, 8, "", this), new Among ( "ization", 13, 7, "", this), new Among ( "izer", -1, 7, "", this), new Among ( "ator", -1, 8, "", this), new Among ( "iveness", -1, 13, "", this), new Among ( "fulness", -1, 11, "", this), new Among ( "ousness", -1, 12, "", this) }; private Among a_4[] = { new Among ( "icate", -1, 2, "", this), new Among ( "ative", -1, 3, "", this), new Among ( "alize", -1, 1, "", this), new Among ( "iciti", -1, 2, "", this), new Among ( "ical", -1, 2, "", this), new Among ( "ful", -1, 3, "", this), new Among ( "ness", -1, 3, "", this) }; private Among a_5[] = { new Among ( "ic", -1, 1, "", this), new Among ( "ance", -1, 1, "", this), new Among ( "ence", -1, 1, "", this), new Among ( "able", -1, 1, "", this), new Among ( "ible", -1, 1, "", this), new Among ( "ate", -1, 1, "", this), new Among ( "ive", -1, 1, "", this), new Among ( "ize", -1, 1, "", this), new Among ( "iti", -1, 1, "", this), new Among ( "al", -1, 1, "", this), new Among ( "ism", -1, 1, "", this), new Among ( "ion", -1, 2, "", this), new Among ( "er", -1, 1, "", this), new Among ( "ous", -1, 1, "", this), new Among ( "ant", -1, 1, "", this), new Among ( "ent", -1, 1, "", this), new Among ( "ment", 15, 1, "", this), new Among ( "ement", 16, 1, "", this), new Among ( "ou", -1, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 1 }; private static final char g_v_WXY[] = {1, 17, 65, 208, 1 }; private boolean B_Y_found; private int I_p2; private int I_p1; private void copy_from(PorterStemmer other) { B_Y_found = other.B_Y_found; I_p2 = other.I_p2; I_p1 = other.I_p1; super.copy_from(other); } private boolean r_shortv() { // (, line 19 if (!(out_grouping_b(g_v_WXY, 89, 121))) { return false; } if (!(in_grouping_b(g_v, 97, 121))) { return false; } if (!(out_grouping_b(g_v, 97, 121))) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_Step_1a() { int among_var; // (, line 24 // [, line 25 ket = cursor; // substring, line 25 among_var = find_among_b(a_0, 4); if (among_var == 0) { 
return false; } // ], line 25 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 26 // <-, line 26 slice_from("ss"); break; case 2: // (, line 27 // <-, line 27 slice_from("i"); break; case 3: // (, line 29 // delete, line 29 slice_del(); break; } return true; } private boolean r_Step_1b() { int among_var; int v_1; int v_3; int v_4; // (, line 33 // [, line 34 ket = cursor; // substring, line 34 among_var = find_among_b(a_2, 3); if (among_var == 0) { return false; } // ], line 34 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 35 // call R1, line 35 if (!r_R1()) { return false; } // <-, line 35 slice_from("ee"); break; case 2: // (, line 37 // test, line 38 v_1 = limit - cursor; // gopast, line 38 golab0: while(true) { lab1: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab1; } break golab0; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } cursor = limit - v_1; // delete, line 38 slice_del(); // test, line 39 v_3 = limit - cursor; // substring, line 39 among_var = find_among_b(a_1, 13); if (among_var == 0) { return false; } cursor = limit - v_3; switch(among_var) { case 0: return false; case 1: // (, line 41 // <+, line 41 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; case 2: // (, line 44 // [, line 44 ket = cursor; // next, line 44 if (cursor <= limit_backward) { return false; } cursor--; // ], line 44 bra = cursor; // delete, line 44 slice_del(); break; case 3: // (, line 45 // atmark, line 45 if (cursor != I_p1) { return false; } // test, line 45 v_4 = limit - cursor; // call shortv, line 45 if (!r_shortv()) { return false; } cursor = limit - v_4; // <+, line 45 { int c = cursor; insert(cursor, cursor, "e"); cursor = c; } break; } break; } return true; } private boolean r_Step_1c() { int v_1; // (, line 51 // [, line 52 ket = cursor; // or, line 52 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 52 if (!(eq_s_b(1, "y"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 52 if (!(eq_s_b(1, "Y"))) { return false; } } while (false); // ], line 52 bra = cursor; // gopast, line 53 golab2: while(true) { lab3: do { if (!(in_grouping_b(g_v, 97, 121))) { break lab3; } break golab2; } while (false); if (cursor <= limit_backward) { return false; } cursor--; } // <-, line 54 slice_from("i"); return true; } private boolean r_Step_2() { int among_var; // (, line 57 // [, line 58 ket = cursor; // substring, line 58 among_var = find_among_b(a_3, 20); if (among_var == 0) { return false; } // ], line 58 bra = cursor; // call R1, line 58 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 59 // <-, line 59 slice_from("tion"); break; case 2: // (, line 60 // <-, line 60 slice_from("ence"); break; case 3: // (, line 61 // <-, line 61 slice_from("ance"); break; case 4: // (, line 62 // <-, line 62 slice_from("able"); break; case 5: // (, line 63 // <-, line 63 slice_from("ent"); break; case 6: // (, line 64 // <-, line 64 slice_from("e"); break; case 7: // (, line 66 // <-, line 66 slice_from("ize"); break; case 8: // (, line 68 // <-, line 68 slice_from("ate"); break; case 9: // (, line 69 // <-, line 69 slice_from("al"); break; case 10: // (, line 71 // <-, line 71 slice_from("al"); break; case 11: // (, line 72 // <-, line 72 slice_from("ful"); break; case 12: // (, line 74 // <-, line 74 slice_from("ous"); break; case 13: // (, line 76 // <-, line 76 slice_from("ive"); break; case 14: // (, line 77 // <-, line 77 
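            /* case 14 maps the suffix "biliti" to "ble", completing the Step 2
               suffix table; like the other cases it only fires after r_R1()
               above has confirmed the match lies inside the R1 region. */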
slice_from("ble"); break; } return true; } private boolean r_Step_3() { int among_var; // (, line 81 // [, line 82 ket = cursor; // substring, line 82 among_var = find_among_b(a_4, 7); if (among_var == 0) { return false; } // ], line 82 bra = cursor; // call R1, line 82 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 83 // <-, line 83 slice_from("al"); break; case 2: // (, line 85 // <-, line 85 slice_from("ic"); break; case 3: // (, line 87 // delete, line 87 slice_del(); break; } return true; } private boolean r_Step_4() { int among_var; int v_1; // (, line 91 // [, line 92 ket = cursor; // substring, line 92 among_var = find_among_b(a_5, 19); if (among_var == 0) { return false; } // ], line 92 bra = cursor; // call R2, line 92 if (!r_R2()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 95 // delete, line 95 slice_del(); break; case 2: // (, line 96 // or, line 96 lab0: do { v_1 = limit - cursor; lab1: do { // literal, line 96 if (!(eq_s_b(1, "s"))) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // literal, line 96 if (!(eq_s_b(1, "t"))) { return false; } } while (false); // delete, line 96 slice_del(); break; } return true; } private boolean r_Step_5a() { int v_1; int v_2; // (, line 100 // [, line 101 ket = cursor; // literal, line 101 if (!(eq_s_b(1, "e"))) { return false; } // ], line 101 bra = cursor; // or, line 102 lab0: do { v_1 = limit - cursor; lab1: do { // call R2, line 102 if (!r_R2()) { break lab1; } break lab0; } while (false); cursor = limit - v_1; // (, line 102 // call R1, line 102 if (!r_R1()) { return false; } // not, line 102 { v_2 = limit - cursor; lab2: do { // call shortv, line 102 if (!r_shortv()) { break lab2; } return false; } while (false); cursor = limit - v_2; } } while (false); // delete, line 103 slice_del(); return true; } private boolean r_Step_5b() { // (, line 106 // [, line 107 ket = cursor; // literal, line 107 if (!(eq_s_b(1, "l"))) { return false; } // ], line 107 bra = cursor; // call R2, line 108 if (!r_R2()) { return false; } // literal, line 108 if (!(eq_s_b(1, "l"))) { return false; } // delete, line 109 slice_del(); return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_10; int v_11; int v_12; int v_13; int v_14; int v_15; int v_16; int v_17; int v_18; int v_19; int v_20; // (, line 113 // unset Y_found, line 115 B_Y_found = false; // do, line 116 v_1 = cursor; lab0: do { // (, line 116 // [, line 116 bra = cursor; // literal, line 116 if (!(eq_s(1, "y"))) { break lab0; } // ], line 116 ket = cursor; // <-, line 116 slice_from("Y"); // set Y_found, line 116 B_Y_found = true; } while (false); cursor = v_1; // do, line 117 v_2 = cursor; lab1: do { // repeat, line 117 replab2: while(true) { v_3 = cursor; lab3: do { // (, line 117 // goto, line 117 golab4: while(true) { v_4 = cursor; lab5: do { // (, line 117 if (!(in_grouping(g_v, 97, 121))) { break lab5; } // [, line 117 bra = cursor; // literal, line 117 if (!(eq_s(1, "y"))) { break lab5; } // ], line 117 ket = cursor; cursor = v_4; break golab4; } while (false); cursor = v_4; if (cursor >= limit) { break lab3; } cursor++; } // <-, line 117 slice_from("Y"); // set Y_found, line 117 B_Y_found = true; continue replab2; } while (false); cursor = v_3; break replab2; } } while (false); cursor = v_2; I_p1 = limit; I_p2 = limit; // do, line 121 v_5 = cursor; lab6: do { // (, line 121 // gopast, line 122 golab7: while(true) { lab8: do { if (!(in_grouping(g_v, 97, 121))) { 
break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // gopast, line 122 golab9: while(true) { lab10: do { if (!(out_grouping(g_v, 97, 121))) { break lab10; } break golab9; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // setmark p1, line 122 I_p1 = cursor; // gopast, line 123 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 121))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // gopast, line 123 golab13: while(true) { lab14: do { if (!(out_grouping(g_v, 97, 121))) { break lab14; } break golab13; } while (false); if (cursor >= limit) { break lab6; } cursor++; } // setmark p2, line 123 I_p2 = cursor; } while (false); cursor = v_5; // backwards, line 126 limit_backward = cursor; cursor = limit; // (, line 126 // do, line 127 v_10 = limit - cursor; lab15: do { // call Step_1a, line 127 if (!r_Step_1a()) { break lab15; } } while (false); cursor = limit - v_10; // do, line 128 v_11 = limit - cursor; lab16: do { // call Step_1b, line 128 if (!r_Step_1b()) { break lab16; } } while (false); cursor = limit - v_11; // do, line 129 v_12 = limit - cursor; lab17: do { // call Step_1c, line 129 if (!r_Step_1c()) { break lab17; } } while (false); cursor = limit - v_12; // do, line 130 v_13 = limit - cursor; lab18: do { // call Step_2, line 130 if (!r_Step_2()) { break lab18; } } while (false); cursor = limit - v_13; // do, line 131 v_14 = limit - cursor; lab19: do { // call Step_3, line 131 if (!r_Step_3()) { break lab19; } } while (false); cursor = limit - v_14; // do, line 132 v_15 = limit - cursor; lab20: do { // call Step_4, line 132 if (!r_Step_4()) { break lab20; } } while (false); cursor = limit - v_15; // do, line 133 v_16 = limit - cursor; lab21: do { // call Step_5a, line 133 if (!r_Step_5a()) { break lab21; } } while (false); cursor = limit - v_16; // do, line 134 v_17 = limit - cursor; lab22: do { // call Step_5b, line 134 if (!r_Step_5b()) { break lab22; } } while (false); cursor = limit - v_17; cursor = limit_backward; // do, line 137 v_18 = cursor; lab23: do { // (, line 137 // Boolean test Y_found, line 137 if (!(B_Y_found)) { break lab23; } // repeat, line 137 replab24: while(true) { v_19 = cursor; lab25: do { // (, line 137 // goto, line 137 golab26: while(true) { v_20 = cursor; lab27: do { // (, line 137 // [, line 137 bra = cursor; // literal, line 137 if (!(eq_s(1, "Y"))) { break lab27; } // ], line 137 ket = cursor; cursor = v_20; break golab26; } while (false); cursor = v_20; if (cursor >= limit) { break lab25; } cursor++; } // <-, line 137 slice_from("y"); continue replab24; } while (false); cursor = v_19; break replab24; } } while (false); cursor = v_18; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/TurkishStemmer.java0000644000175000017500000033521511474320235031364 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
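 * <p>
 * Illustrative usage sketch, assuming the stemmers in this package are reached
 * by name through the contrib {@code SnowballFilter}, which is expected to load
 * {@code "org.tartarus.snowball.ext." + name + "Stemmer"} reflectively
 * ({@code tokenizer} below stands for any {@code TokenStream}):
 * <pre>
 *   TokenStream stream = new SnowballFilter(tokenizer, "Turkish");
 * </pre>
 * Calling the stemmer directly via {@code setCurrent}/{@code stem}/{@code getCurrent}
 * works the same way as for the other generated stemmers.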
*/ public class TurkishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "m", -1, -1, "", this), new Among ( "n", -1, -1, "", this), new Among ( "miz", -1, -1, "", this), new Among ( "niz", -1, -1, "", this), new Among ( "muz", -1, -1, "", this), new Among ( "nuz", -1, -1, "", this), new Among ( "m\u00FCz", -1, -1, "", this), new Among ( "n\u00FCz", -1, -1, "", this), new Among ( "m\u0131z", -1, -1, "", this), new Among ( "n\u0131z", -1, -1, "", this) }; private Among a_1[] = { new Among ( "leri", -1, -1, "", this), new Among ( "lar\u0131", -1, -1, "", this) }; private Among a_2[] = { new Among ( "ni", -1, -1, "", this), new Among ( "nu", -1, -1, "", this), new Among ( "n\u00FC", -1, -1, "", this), new Among ( "n\u0131", -1, -1, "", this) }; private Among a_3[] = { new Among ( "in", -1, -1, "", this), new Among ( "un", -1, -1, "", this), new Among ( "\u00FCn", -1, -1, "", this), new Among ( "\u0131n", -1, -1, "", this) }; private Among a_4[] = { new Among ( "a", -1, -1, "", this), new Among ( "e", -1, -1, "", this) }; private Among a_5[] = { new Among ( "na", -1, -1, "", this), new Among ( "ne", -1, -1, "", this) }; private Among a_6[] = { new Among ( "da", -1, -1, "", this), new Among ( "ta", -1, -1, "", this), new Among ( "de", -1, -1, "", this), new Among ( "te", -1, -1, "", this) }; private Among a_7[] = { new Among ( "nda", -1, -1, "", this), new Among ( "nde", -1, -1, "", this) }; private Among a_8[] = { new Among ( "dan", -1, -1, "", this), new Among ( "tan", -1, -1, "", this), new Among ( "den", -1, -1, "", this), new Among ( "ten", -1, -1, "", this) }; private Among a_9[] = { new Among ( "ndan", -1, -1, "", this), new Among ( "nden", -1, -1, "", this) }; private Among a_10[] = { new Among ( "la", -1, -1, "", this), new Among ( "le", -1, -1, "", this) }; private Among a_11[] = { new Among ( "ca", -1, -1, "", this), new Among ( "ce", -1, -1, "", this) }; private Among a_12[] = { new Among ( "im", -1, -1, "", this), new Among ( "um", -1, -1, "", this), new Among ( "\u00FCm", -1, -1, "", this), new Among ( "\u0131m", -1, -1, "", this) }; private Among a_13[] = { new Among ( "sin", -1, -1, "", this), new Among ( "sun", -1, -1, "", this), new Among ( "s\u00FCn", -1, -1, "", this), new Among ( "s\u0131n", -1, -1, "", this) }; private Among a_14[] = { new Among ( "iz", -1, -1, "", this), new Among ( "uz", -1, -1, "", this), new Among ( "\u00FCz", -1, -1, "", this), new Among ( "\u0131z", -1, -1, "", this) }; private Among a_15[] = { new Among ( "siniz", -1, -1, "", this), new Among ( "sunuz", -1, -1, "", this), new Among ( "s\u00FCn\u00FCz", -1, -1, "", this), new Among ( "s\u0131n\u0131z", -1, -1, "", this) }; private Among a_16[] = { new Among ( "lar", -1, -1, "", this), new Among ( "ler", -1, -1, "", this) }; private Among a_17[] = { new Among ( "niz", -1, -1, "", this), new Among ( "nuz", -1, -1, "", this), new Among ( "n\u00FCz", -1, -1, "", this), new Among ( "n\u0131z", -1, -1, "", this) }; private Among a_18[] = { new Among ( "dir", -1, -1, "", this), new Among ( "tir", -1, -1, "", this), new Among ( "dur", -1, -1, "", this), new Among ( "tur", -1, -1, "", this), new Among ( "d\u00FCr", -1, -1, "", this), new Among ( "t\u00FCr", -1, -1, "", this), new Among ( "d\u0131r", -1, -1, "", this), new Among ( "t\u0131r", -1, -1, "", this) }; private Among a_19[] = { new Among ( "cas\u0131na", -1, -1, "", this), new Among ( "cesine", -1, -1, "", this) }; private Among a_20[] = { new Among ( "di", -1, -1, "", this), new Among ( "ti", -1, -1, "", this), new Among ( "dik", 
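    /* a_20 enumerates the surface forms of the past-tense suffix -DI: the onset
       alternates d/t by consonant assimilation, the vowel i/u/\u00FC/\u0131 follows vowel
       harmony, and the forms may carry the person markers -k, -m or -n. */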
-1, -1, "", this), new Among ( "tik", -1, -1, "", this), new Among ( "duk", -1, -1, "", this), new Among ( "tuk", -1, -1, "", this), new Among ( "d\u00FCk", -1, -1, "", this), new Among ( "t\u00FCk", -1, -1, "", this), new Among ( "d\u0131k", -1, -1, "", this), new Among ( "t\u0131k", -1, -1, "", this), new Among ( "dim", -1, -1, "", this), new Among ( "tim", -1, -1, "", this), new Among ( "dum", -1, -1, "", this), new Among ( "tum", -1, -1, "", this), new Among ( "d\u00FCm", -1, -1, "", this), new Among ( "t\u00FCm", -1, -1, "", this), new Among ( "d\u0131m", -1, -1, "", this), new Among ( "t\u0131m", -1, -1, "", this), new Among ( "din", -1, -1, "", this), new Among ( "tin", -1, -1, "", this), new Among ( "dun", -1, -1, "", this), new Among ( "tun", -1, -1, "", this), new Among ( "d\u00FCn", -1, -1, "", this), new Among ( "t\u00FCn", -1, -1, "", this), new Among ( "d\u0131n", -1, -1, "", this), new Among ( "t\u0131n", -1, -1, "", this), new Among ( "du", -1, -1, "", this), new Among ( "tu", -1, -1, "", this), new Among ( "d\u00FC", -1, -1, "", this), new Among ( "t\u00FC", -1, -1, "", this), new Among ( "d\u0131", -1, -1, "", this), new Among ( "t\u0131", -1, -1, "", this) }; private Among a_21[] = { new Among ( "sa", -1, -1, "", this), new Among ( "se", -1, -1, "", this), new Among ( "sak", -1, -1, "", this), new Among ( "sek", -1, -1, "", this), new Among ( "sam", -1, -1, "", this), new Among ( "sem", -1, -1, "", this), new Among ( "san", -1, -1, "", this), new Among ( "sen", -1, -1, "", this) }; private Among a_22[] = { new Among ( "mi\u015F", -1, -1, "", this), new Among ( "mu\u015F", -1, -1, "", this), new Among ( "m\u00FC\u015F", -1, -1, "", this), new Among ( "m\u0131\u015F", -1, -1, "", this) }; private Among a_23[] = { new Among ( "b", -1, 1, "", this), new Among ( "c", -1, 2, "", this), new Among ( "d", -1, 3, "", this), new Among ( "\u011F", -1, 4, "", this) }; private static final char g_vowel[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 8, 0, 0, 0, 0, 0, 0, 1 }; private static final char g_U[] = {1, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 1 }; private static final char g_vowel1[] = {1, 64, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; private static final char g_vowel2[] = {17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 130 }; private static final char g_vowel3[] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }; private static final char g_vowel4[] = {17 }; private static final char g_vowel5[] = {65 }; private static final char g_vowel6[] = {65 }; private boolean B_continue_stemming_noun_suffixes; private int I_strlen; private void copy_from(TurkishStemmer other) { B_continue_stemming_noun_suffixes = other.B_continue_stemming_noun_suffixes; I_strlen = other.I_strlen; super.copy_from(other); } private boolean r_check_vowel_harmony() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; // (, line 111 // test, line 112 v_1 = limit - cursor; // (, line 113 // (, line 114 // goto, line 114 golab0: while(true) { v_2 = limit - cursor; lab1: do { if (!(in_grouping_b(g_vowel, 97, 305))) { break lab1; } cursor = limit - v_2; break golab0; } while (false); cursor = limit - v_2; if (cursor <= limit_backward) { return false; } cursor--; } // (, line 115 // or, line 116 lab2: do { v_3 = limit - cursor; lab3: do { // (, line 116 // literal, line 116 if (!(eq_s_b(1, "a"))) { break lab3; } // goto, line 116 golab4: 
while(true) { v_4 = limit - cursor; lab5: do { if (!(in_grouping_b(g_vowel1, 97, 305))) { break lab5; } cursor = limit - v_4; break golab4; } while (false); cursor = limit - v_4; if (cursor <= limit_backward) { break lab3; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab6: do { // (, line 117 // literal, line 117 if (!(eq_s_b(1, "e"))) { break lab6; } // goto, line 117 golab7: while(true) { v_5 = limit - cursor; lab8: do { if (!(in_grouping_b(g_vowel2, 101, 252))) { break lab8; } cursor = limit - v_5; break golab7; } while (false); cursor = limit - v_5; if (cursor <= limit_backward) { break lab6; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab9: do { // (, line 118 // literal, line 118 if (!(eq_s_b(1, "\u0131"))) { break lab9; } // goto, line 118 golab10: while(true) { v_6 = limit - cursor; lab11: do { if (!(in_grouping_b(g_vowel3, 97, 305))) { break lab11; } cursor = limit - v_6; break golab10; } while (false); cursor = limit - v_6; if (cursor <= limit_backward) { break lab9; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab12: do { // (, line 119 // literal, line 119 if (!(eq_s_b(1, "i"))) { break lab12; } // goto, line 119 golab13: while(true) { v_7 = limit - cursor; lab14: do { if (!(in_grouping_b(g_vowel4, 101, 105))) { break lab14; } cursor = limit - v_7; break golab13; } while (false); cursor = limit - v_7; if (cursor <= limit_backward) { break lab12; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab15: do { // (, line 120 // literal, line 120 if (!(eq_s_b(1, "o"))) { break lab15; } // goto, line 120 golab16: while(true) { v_8 = limit - cursor; lab17: do { if (!(in_grouping_b(g_vowel5, 111, 117))) { break lab17; } cursor = limit - v_8; break golab16; } while (false); cursor = limit - v_8; if (cursor <= limit_backward) { break lab15; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab18: do { // (, line 121 // literal, line 121 if (!(eq_s_b(1, "\u00F6"))) { break lab18; } // goto, line 121 golab19: while(true) { v_9 = limit - cursor; lab20: do { if (!(in_grouping_b(g_vowel6, 246, 252))) { break lab20; } cursor = limit - v_9; break golab19; } while (false); cursor = limit - v_9; if (cursor <= limit_backward) { break lab18; } cursor--; } break lab2; } while (false); cursor = limit - v_3; lab21: do { // (, line 122 // literal, line 122 if (!(eq_s_b(1, "u"))) { break lab21; } // goto, line 122 golab22: while(true) { v_10 = limit - cursor; lab23: do { if (!(in_grouping_b(g_vowel5, 111, 117))) { break lab23; } cursor = limit - v_10; break golab22; } while (false); cursor = limit - v_10; if (cursor <= limit_backward) { break lab21; } cursor--; } break lab2; } while (false); cursor = limit - v_3; // (, line 123 // literal, line 123 if (!(eq_s_b(1, "\u00FC"))) { return false; } // goto, line 123 golab24: while(true) { v_11 = limit - cursor; lab25: do { if (!(in_grouping_b(g_vowel6, 246, 252))) { break lab25; } cursor = limit - v_11; break golab24; } while (false); cursor = limit - v_11; if (cursor <= limit_backward) { return false; } cursor--; } } while (false); cursor = limit - v_1; return true; } private boolean r_mark_suffix_with_optional_n_consonant() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; // (, line 132 // or, line 134 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 133 // (, line 133 // test, line 133 v_2 = limit - cursor; // literal, line 133 if (!(eq_s_b(1, "n"))) { break lab1; } cursor = limit - v_2; // next, line 133 if (cursor <= limit_backward) { break 
lab1; } cursor--; // (, line 133 // test, line 133 v_3 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { break lab1; } cursor = limit - v_3; break lab0; } while (false); cursor = limit - v_1; // (, line 135 // (, line 135 // not, line 135 { v_4 = limit - cursor; lab2: do { // (, line 135 // test, line 135 v_5 = limit - cursor; // literal, line 135 if (!(eq_s_b(1, "n"))) { break lab2; } cursor = limit - v_5; return false; } while (false); cursor = limit - v_4; } // test, line 135 v_6 = limit - cursor; // (, line 135 // next, line 135 if (cursor <= limit_backward) { return false; } cursor--; // (, line 135 // test, line 135 v_7 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { return false; } cursor = limit - v_7; cursor = limit - v_6; } while (false); return true; } private boolean r_mark_suffix_with_optional_s_consonant() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; // (, line 143 // or, line 145 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 144 // (, line 144 // test, line 144 v_2 = limit - cursor; // literal, line 144 if (!(eq_s_b(1, "s"))) { break lab1; } cursor = limit - v_2; // next, line 144 if (cursor <= limit_backward) { break lab1; } cursor--; // (, line 144 // test, line 144 v_3 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { break lab1; } cursor = limit - v_3; break lab0; } while (false); cursor = limit - v_1; // (, line 146 // (, line 146 // not, line 146 { v_4 = limit - cursor; lab2: do { // (, line 146 // test, line 146 v_5 = limit - cursor; // literal, line 146 if (!(eq_s_b(1, "s"))) { break lab2; } cursor = limit - v_5; return false; } while (false); cursor = limit - v_4; } // test, line 146 v_6 = limit - cursor; // (, line 146 // next, line 146 if (cursor <= limit_backward) { return false; } cursor--; // (, line 146 // test, line 146 v_7 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { return false; } cursor = limit - v_7; cursor = limit - v_6; } while (false); return true; } private boolean r_mark_suffix_with_optional_y_consonant() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; // (, line 153 // or, line 155 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 154 // (, line 154 // test, line 154 v_2 = limit - cursor; // literal, line 154 if (!(eq_s_b(1, "y"))) { break lab1; } cursor = limit - v_2; // next, line 154 if (cursor <= limit_backward) { break lab1; } cursor--; // (, line 154 // test, line 154 v_3 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { break lab1; } cursor = limit - v_3; break lab0; } while (false); cursor = limit - v_1; // (, line 156 // (, line 156 // not, line 156 { v_4 = limit - cursor; lab2: do { // (, line 156 // test, line 156 v_5 = limit - cursor; // literal, line 156 if (!(eq_s_b(1, "y"))) { break lab2; } cursor = limit - v_5; return false; } while (false); cursor = limit - v_4; } // test, line 156 v_6 = limit - cursor; // (, line 156 // next, line 156 if (cursor <= limit_backward) { return false; } cursor--; // (, line 156 // test, line 156 v_7 = limit - cursor; if (!(in_grouping_b(g_vowel, 97, 305))) { return false; } cursor = limit - v_7; cursor = limit - v_6; } while (false); return true; } private boolean r_mark_suffix_with_optional_U_vowel() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; // (, line 159 // or, line 161 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 160 // (, line 160 // test, line 160 v_2 = limit - cursor; if (!(in_grouping_b(g_U, 105, 305))) { break lab1; } cursor = limit - v_2; // next, line 
160 if (cursor <= limit_backward) { break lab1; } cursor--; // (, line 160 // test, line 160 v_3 = limit - cursor; if (!(out_grouping_b(g_vowel, 97, 305))) { break lab1; } cursor = limit - v_3; break lab0; } while (false); cursor = limit - v_1; // (, line 162 // (, line 162 // not, line 162 { v_4 = limit - cursor; lab2: do { // (, line 162 // test, line 162 v_5 = limit - cursor; if (!(in_grouping_b(g_U, 105, 305))) { break lab2; } cursor = limit - v_5; return false; } while (false); cursor = limit - v_4; } // test, line 162 v_6 = limit - cursor; // (, line 162 // next, line 162 if (cursor <= limit_backward) { return false; } cursor--; // (, line 162 // test, line 162 v_7 = limit - cursor; if (!(out_grouping_b(g_vowel, 97, 305))) { return false; } cursor = limit - v_7; cursor = limit - v_6; } while (false); return true; } private boolean r_mark_possessives() { // (, line 166 // among, line 167 if (find_among_b(a_0, 10) == 0) { return false; } // (, line 169 // call mark_suffix_with_optional_U_vowel, line 169 if (!r_mark_suffix_with_optional_U_vowel()) { return false; } return true; } private boolean r_mark_sU() { // (, line 172 // call check_vowel_harmony, line 173 if (!r_check_vowel_harmony()) { return false; } if (!(in_grouping_b(g_U, 105, 305))) { return false; } // (, line 175 // call mark_suffix_with_optional_s_consonant, line 175 if (!r_mark_suffix_with_optional_s_consonant()) { return false; } return true; } private boolean r_mark_lArI() { // (, line 178 // among, line 179 if (find_among_b(a_1, 2) == 0) { return false; } return true; } private boolean r_mark_yU() { // (, line 182 // call check_vowel_harmony, line 183 if (!r_check_vowel_harmony()) { return false; } if (!(in_grouping_b(g_U, 105, 305))) { return false; } // (, line 185 // call mark_suffix_with_optional_y_consonant, line 185 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_nU() { // (, line 188 // call check_vowel_harmony, line 189 if (!r_check_vowel_harmony()) { return false; } // among, line 190 if (find_among_b(a_2, 4) == 0) { return false; } return true; } private boolean r_mark_nUn() { // (, line 193 // call check_vowel_harmony, line 194 if (!r_check_vowel_harmony()) { return false; } // among, line 195 if (find_among_b(a_3, 4) == 0) { return false; } // (, line 196 // call mark_suffix_with_optional_n_consonant, line 196 if (!r_mark_suffix_with_optional_n_consonant()) { return false; } return true; } private boolean r_mark_yA() { // (, line 199 // call check_vowel_harmony, line 200 if (!r_check_vowel_harmony()) { return false; } // among, line 201 if (find_among_b(a_4, 2) == 0) { return false; } // (, line 202 // call mark_suffix_with_optional_y_consonant, line 202 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_nA() { // (, line 205 // call check_vowel_harmony, line 206 if (!r_check_vowel_harmony()) { return false; } // among, line 207 if (find_among_b(a_5, 2) == 0) { return false; } return true; } private boolean r_mark_DA() { // (, line 210 // call check_vowel_harmony, line 211 if (!r_check_vowel_harmony()) { return false; } // among, line 212 if (find_among_b(a_6, 4) == 0) { return false; } return true; } private boolean r_mark_ndA() { // (, line 215 // call check_vowel_harmony, line 216 if (!r_check_vowel_harmony()) { return false; } // among, line 217 if (find_among_b(a_7, 2) == 0) { return false; } return true; } private boolean r_mark_DAn() { // (, line 220 // call check_vowel_harmony, line 221 if 
(!r_check_vowel_harmony()) { return false; } // among, line 222 if (find_among_b(a_8, 4) == 0) { return false; } return true; } private boolean r_mark_ndAn() { // (, line 225 // call check_vowel_harmony, line 226 if (!r_check_vowel_harmony()) { return false; } // among, line 227 if (find_among_b(a_9, 2) == 0) { return false; } return true; } private boolean r_mark_ylA() { // (, line 230 // call check_vowel_harmony, line 231 if (!r_check_vowel_harmony()) { return false; } // among, line 232 if (find_among_b(a_10, 2) == 0) { return false; } // (, line 233 // call mark_suffix_with_optional_y_consonant, line 233 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_ki() { // (, line 236 // literal, line 237 if (!(eq_s_b(2, "ki"))) { return false; } return true; } private boolean r_mark_ncA() { // (, line 240 // call check_vowel_harmony, line 241 if (!r_check_vowel_harmony()) { return false; } // among, line 242 if (find_among_b(a_11, 2) == 0) { return false; } // (, line 243 // call mark_suffix_with_optional_n_consonant, line 243 if (!r_mark_suffix_with_optional_n_consonant()) { return false; } return true; } private boolean r_mark_yUm() { // (, line 246 // call check_vowel_harmony, line 247 if (!r_check_vowel_harmony()) { return false; } // among, line 248 if (find_among_b(a_12, 4) == 0) { return false; } // (, line 249 // call mark_suffix_with_optional_y_consonant, line 249 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_sUn() { // (, line 252 // call check_vowel_harmony, line 253 if (!r_check_vowel_harmony()) { return false; } // among, line 254 if (find_among_b(a_13, 4) == 0) { return false; } return true; } private boolean r_mark_yUz() { // (, line 257 // call check_vowel_harmony, line 258 if (!r_check_vowel_harmony()) { return false; } // among, line 259 if (find_among_b(a_14, 4) == 0) { return false; } // (, line 260 // call mark_suffix_with_optional_y_consonant, line 260 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_sUnUz() { // (, line 263 // among, line 264 if (find_among_b(a_15, 4) == 0) { return false; } return true; } private boolean r_mark_lAr() { // (, line 267 // call check_vowel_harmony, line 268 if (!r_check_vowel_harmony()) { return false; } // among, line 269 if (find_among_b(a_16, 2) == 0) { return false; } return true; } private boolean r_mark_nUz() { // (, line 272 // call check_vowel_harmony, line 273 if (!r_check_vowel_harmony()) { return false; } // among, line 274 if (find_among_b(a_17, 4) == 0) { return false; } return true; } private boolean r_mark_DUr() { // (, line 277 // call check_vowel_harmony, line 278 if (!r_check_vowel_harmony()) { return false; } // among, line 279 if (find_among_b(a_18, 8) == 0) { return false; } return true; } private boolean r_mark_cAsInA() { // (, line 282 // among, line 283 if (find_among_b(a_19, 2) == 0) { return false; } return true; } private boolean r_mark_yDU() { // (, line 286 // call check_vowel_harmony, line 287 if (!r_check_vowel_harmony()) { return false; } // among, line 288 if (find_among_b(a_20, 32) == 0) { return false; } // (, line 292 // call mark_suffix_with_optional_y_consonant, line 292 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_ysA() { // (, line 296 // among, line 297 if (find_among_b(a_21, 8) == 0) { return false; } // (, line 298 // call mark_suffix_with_optional_y_consonant, line 298 if 
(!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_ymUs_() { // (, line 301 // call check_vowel_harmony, line 302 if (!r_check_vowel_harmony()) { return false; } // among, line 303 if (find_among_b(a_22, 4) == 0) { return false; } // (, line 304 // call mark_suffix_with_optional_y_consonant, line 304 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_mark_yken() { // (, line 307 // literal, line 308 if (!(eq_s_b(3, "ken"))) { return false; } // (, line 308 // call mark_suffix_with_optional_y_consonant, line 308 if (!r_mark_suffix_with_optional_y_consonant()) { return false; } return true; } private boolean r_stem_nominal_verb_suffixes() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; // (, line 311 // [, line 312 ket = cursor; // set continue_stemming_noun_suffixes, line 313 B_continue_stemming_noun_suffixes = true; // or, line 315 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 314 // or, line 314 lab2: do { v_2 = limit - cursor; lab3: do { // call mark_ymUs_, line 314 if (!r_mark_ymUs_()) { break lab3; } break lab2; } while (false); cursor = limit - v_2; lab4: do { // call mark_yDU, line 314 if (!r_mark_yDU()) { break lab4; } break lab2; } while (false); cursor = limit - v_2; lab5: do { // call mark_ysA, line 314 if (!r_mark_ysA()) { break lab5; } break lab2; } while (false); cursor = limit - v_2; // call mark_yken, line 314 if (!r_mark_yken()) { break lab1; } } while (false); break lab0; } while (false); cursor = limit - v_1; lab6: do { // (, line 316 // call mark_cAsInA, line 316 if (!r_mark_cAsInA()) { break lab6; } // (, line 316 // or, line 316 lab7: do { v_3 = limit - cursor; lab8: do { // call mark_sUnUz, line 316 if (!r_mark_sUnUz()) { break lab8; } break lab7; } while (false); cursor = limit - v_3; lab9: do { // call mark_lAr, line 316 if (!r_mark_lAr()) { break lab9; } break lab7; } while (false); cursor = limit - v_3; lab10: do { // call mark_yUm, line 316 if (!r_mark_yUm()) { break lab10; } break lab7; } while (false); cursor = limit - v_3; lab11: do { // call mark_sUn, line 316 if (!r_mark_sUn()) { break lab11; } break lab7; } while (false); cursor = limit - v_3; lab12: do { // call mark_yUz, line 316 if (!r_mark_yUz()) { break lab12; } break lab7; } while (false); cursor = limit - v_3; } while (false); // call mark_ymUs_, line 316 if (!r_mark_ymUs_()) { break lab6; } break lab0; } while (false); cursor = limit - v_1; lab13: do { // (, line 318 // call mark_lAr, line 319 if (!r_mark_lAr()) { break lab13; } // ], line 319 bra = cursor; // delete, line 319 slice_del(); // try, line 319 v_4 = limit - cursor; lab14: do { // (, line 319 // [, line 319 ket = cursor; // (, line 319 // or, line 319 lab15: do { v_5 = limit - cursor; lab16: do { // call mark_DUr, line 319 if (!r_mark_DUr()) { break lab16; } break lab15; } while (false); cursor = limit - v_5; lab17: do { // call mark_yDU, line 319 if (!r_mark_yDU()) { break lab17; } break lab15; } while (false); cursor = limit - v_5; lab18: do { // call mark_ysA, line 319 if (!r_mark_ysA()) { break lab18; } break lab15; } while (false); cursor = limit - v_5; // call mark_ymUs_, line 319 if (!r_mark_ymUs_()) { cursor = limit - v_4; break lab14; } } while (false); } while (false); // unset continue_stemming_noun_suffixes, line 320 B_continue_stemming_noun_suffixes = false; break lab0; } while (false); cursor = limit - v_1; lab19: do { // (, line 323 // call mark_nUz, line 323 if 
(!r_mark_nUz()) { break lab19; } // (, line 323 // or, line 323 lab20: do { v_6 = limit - cursor; lab21: do { // call mark_yDU, line 323 if (!r_mark_yDU()) { break lab21; } break lab20; } while (false); cursor = limit - v_6; // call mark_ysA, line 323 if (!r_mark_ysA()) { break lab19; } } while (false); break lab0; } while (false); cursor = limit - v_1; lab22: do { // (, line 325 // (, line 325 // or, line 325 lab23: do { v_7 = limit - cursor; lab24: do { // call mark_sUnUz, line 325 if (!r_mark_sUnUz()) { break lab24; } break lab23; } while (false); cursor = limit - v_7; lab25: do { // call mark_yUz, line 325 if (!r_mark_yUz()) { break lab25; } break lab23; } while (false); cursor = limit - v_7; lab26: do { // call mark_sUn, line 325 if (!r_mark_sUn()) { break lab26; } break lab23; } while (false); cursor = limit - v_7; // call mark_yUm, line 325 if (!r_mark_yUm()) { break lab22; } } while (false); // ], line 325 bra = cursor; // delete, line 325 slice_del(); // try, line 325 v_8 = limit - cursor; lab27: do { // (, line 325 // [, line 325 ket = cursor; // call mark_ymUs_, line 325 if (!r_mark_ymUs_()) { cursor = limit - v_8; break lab27; } } while (false); break lab0; } while (false); cursor = limit - v_1; // (, line 327 // call mark_DUr, line 327 if (!r_mark_DUr()) { return false; } // ], line 327 bra = cursor; // delete, line 327 slice_del(); // try, line 327 v_9 = limit - cursor; lab28: do { // (, line 327 // [, line 327 ket = cursor; // (, line 327 // or, line 327 lab29: do { v_10 = limit - cursor; lab30: do { // call mark_sUnUz, line 327 if (!r_mark_sUnUz()) { break lab30; } break lab29; } while (false); cursor = limit - v_10; lab31: do { // call mark_lAr, line 327 if (!r_mark_lAr()) { break lab31; } break lab29; } while (false); cursor = limit - v_10; lab32: do { // call mark_yUm, line 327 if (!r_mark_yUm()) { break lab32; } break lab29; } while (false); cursor = limit - v_10; lab33: do { // call mark_sUn, line 327 if (!r_mark_sUn()) { break lab33; } break lab29; } while (false); cursor = limit - v_10; lab34: do { // call mark_yUz, line 327 if (!r_mark_yUz()) { break lab34; } break lab29; } while (false); cursor = limit - v_10; } while (false); // call mark_ymUs_, line 327 if (!r_mark_ymUs_()) { cursor = limit - v_9; break lab28; } } while (false); } while (false); // ], line 328 bra = cursor; // delete, line 328 slice_del(); return true; } private boolean r_stem_suffix_chain_before_ki() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; // (, line 332 // [, line 333 ket = cursor; // call mark_ki, line 334 if (!r_mark_ki()) { return false; } // (, line 335 // or, line 342 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 336 // call mark_DA, line 336 if (!r_mark_DA()) { break lab1; } // ], line 336 bra = cursor; // delete, line 336 slice_del(); // try, line 336 v_2 = limit - cursor; lab2: do { // (, line 336 // [, line 336 ket = cursor; // or, line 338 lab3: do { v_3 = limit - cursor; lab4: do { // (, line 337 // call mark_lAr, line 337 if (!r_mark_lAr()) { break lab4; } // ], line 337 bra = cursor; // delete, line 337 slice_del(); // try, line 337 v_4 = limit - cursor; lab5: do { // (, line 337 // call stem_suffix_chain_before_ki, line 337 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_4; break lab5; } } while (false); break lab3; } while (false); cursor = limit - v_3; // (, line 339 // call mark_possessives, line 339 if (!r_mark_possessives()) { cursor = limit - v_2; break lab2; } // ], line 339 bra = cursor; // 
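        /* r_stem_suffix_chain_before_ki is self-recursive: after removing a
           suffix that can precede -ki (such as locative -DA or genitive -nUn,
           plus any possessive), it calls itself again so that chained suffix
           forms are peeled off one layer at a time. */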
delete, line 339 slice_del(); // try, line 339 v_5 = limit - cursor; lab6: do { // (, line 339 // [, line 339 ket = cursor; // call mark_lAr, line 339 if (!r_mark_lAr()) { cursor = limit - v_5; break lab6; } // ], line 339 bra = cursor; // delete, line 339 slice_del(); // call stem_suffix_chain_before_ki, line 339 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_5; break lab6; } } while (false); } while (false); } while (false); break lab0; } while (false); cursor = limit - v_1; lab7: do { // (, line 343 // call mark_nUn, line 343 if (!r_mark_nUn()) { break lab7; } // ], line 343 bra = cursor; // delete, line 343 slice_del(); // try, line 343 v_6 = limit - cursor; lab8: do { // (, line 343 // [, line 343 ket = cursor; // or, line 345 lab9: do { v_7 = limit - cursor; lab10: do { // (, line 344 // call mark_lArI, line 344 if (!r_mark_lArI()) { break lab10; } // ], line 344 bra = cursor; // delete, line 344 slice_del(); break lab9; } while (false); cursor = limit - v_7; lab11: do { // (, line 346 // [, line 346 ket = cursor; // or, line 346 lab12: do { v_8 = limit - cursor; lab13: do { // call mark_possessives, line 346 if (!r_mark_possessives()) { break lab13; } break lab12; } while (false); cursor = limit - v_8; // call mark_sU, line 346 if (!r_mark_sU()) { break lab11; } } while (false); // ], line 346 bra = cursor; // delete, line 346 slice_del(); // try, line 346 v_9 = limit - cursor; lab14: do { // (, line 346 // [, line 346 ket = cursor; // call mark_lAr, line 346 if (!r_mark_lAr()) { cursor = limit - v_9; break lab14; } // ], line 346 bra = cursor; // delete, line 346 slice_del(); // call stem_suffix_chain_before_ki, line 346 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_9; break lab14; } } while (false); break lab9; } while (false); cursor = limit - v_7; // (, line 348 // call stem_suffix_chain_before_ki, line 348 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_6; break lab8; } } while (false); } while (false); break lab0; } while (false); cursor = limit - v_1; // (, line 351 // call mark_ndA, line 351 if (!r_mark_ndA()) { return false; } // (, line 351 // or, line 353 lab15: do { v_10 = limit - cursor; lab16: do { // (, line 352 // call mark_lArI, line 352 if (!r_mark_lArI()) { break lab16; } // ], line 352 bra = cursor; // delete, line 352 slice_del(); break lab15; } while (false); cursor = limit - v_10; lab17: do { // (, line 354 // (, line 354 // call mark_sU, line 354 if (!r_mark_sU()) { break lab17; } // ], line 354 bra = cursor; // delete, line 354 slice_del(); // try, line 354 v_11 = limit - cursor; lab18: do { // (, line 354 // [, line 354 ket = cursor; // call mark_lAr, line 354 if (!r_mark_lAr()) { cursor = limit - v_11; break lab18; } // ], line 354 bra = cursor; // delete, line 354 slice_del(); // call stem_suffix_chain_before_ki, line 354 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_11; break lab18; } } while (false); break lab15; } while (false); cursor = limit - v_10; // (, line 356 // call stem_suffix_chain_before_ki, line 356 if (!r_stem_suffix_chain_before_ki()) { return false; } } while (false); } while (false); return true; } private boolean r_stem_noun_suffixes() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; int v_12; int v_13; int v_14; int v_15; int v_16; int v_17; int v_18; int v_19; int v_20; int v_21; int v_22; int v_23; int v_24; int v_25; int v_26; int v_27; // (, line 361 // or, line 363 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 362 // [, 
line 362 ket = cursor; // call mark_lAr, line 362 if (!r_mark_lAr()) { break lab1; } // ], line 362 bra = cursor; // delete, line 362 slice_del(); // try, line 362 v_2 = limit - cursor; lab2: do { // (, line 362 // call stem_suffix_chain_before_ki, line 362 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_2; break lab2; } } while (false); break lab0; } while (false); cursor = limit - v_1; lab3: do { // (, line 364 // [, line 364 ket = cursor; // call mark_ncA, line 364 if (!r_mark_ncA()) { break lab3; } // ], line 364 bra = cursor; // delete, line 364 slice_del(); // try, line 365 v_3 = limit - cursor; lab4: do { // (, line 365 // or, line 367 lab5: do { v_4 = limit - cursor; lab6: do { // (, line 366 // [, line 366 ket = cursor; // call mark_lArI, line 366 if (!r_mark_lArI()) { break lab6; } // ], line 366 bra = cursor; // delete, line 366 slice_del(); break lab5; } while (false); cursor = limit - v_4; lab7: do { // (, line 368 // [, line 368 ket = cursor; // or, line 368 lab8: do { v_5 = limit - cursor; lab9: do { // call mark_possessives, line 368 if (!r_mark_possessives()) { break lab9; } break lab8; } while (false); cursor = limit - v_5; // call mark_sU, line 368 if (!r_mark_sU()) { break lab7; } } while (false); // ], line 368 bra = cursor; // delete, line 368 slice_del(); // try, line 368 v_6 = limit - cursor; lab10: do { // (, line 368 // [, line 368 ket = cursor; // call mark_lAr, line 368 if (!r_mark_lAr()) { cursor = limit - v_6; break lab10; } // ], line 368 bra = cursor; // delete, line 368 slice_del(); // call stem_suffix_chain_before_ki, line 368 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_6; break lab10; } } while (false); break lab5; } while (false); cursor = limit - v_4; // (, line 370 // [, line 370 ket = cursor; // call mark_lAr, line 370 if (!r_mark_lAr()) { cursor = limit - v_3; break lab4; } // ], line 370 bra = cursor; // delete, line 370 slice_del(); // call stem_suffix_chain_before_ki, line 370 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_3; break lab4; } } while (false); } while (false); break lab0; } while (false); cursor = limit - v_1; lab11: do { // (, line 374 // [, line 374 ket = cursor; // (, line 374 // or, line 374 lab12: do { v_7 = limit - cursor; lab13: do { // call mark_ndA, line 374 if (!r_mark_ndA()) { break lab13; } break lab12; } while (false); cursor = limit - v_7; // call mark_nA, line 374 if (!r_mark_nA()) { break lab11; } } while (false); // (, line 375 // or, line 377 lab14: do { v_8 = limit - cursor; lab15: do { // (, line 376 // call mark_lArI, line 376 if (!r_mark_lArI()) { break lab15; } // ], line 376 bra = cursor; // delete, line 376 slice_del(); break lab14; } while (false); cursor = limit - v_8; lab16: do { // (, line 378 // call mark_sU, line 378 if (!r_mark_sU()) { break lab16; } // ], line 378 bra = cursor; // delete, line 378 slice_del(); // try, line 378 v_9 = limit - cursor; lab17: do { // (, line 378 // [, line 378 ket = cursor; // call mark_lAr, line 378 if (!r_mark_lAr()) { cursor = limit - v_9; break lab17; } // ], line 378 bra = cursor; // delete, line 378 slice_del(); // call stem_suffix_chain_before_ki, line 378 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_9; break lab17; } } while (false); break lab14; } while (false); cursor = limit - v_8; // (, line 380 // call stem_suffix_chain_before_ki, line 380 if (!r_stem_suffix_chain_before_ki()) { break lab11; } } while (false); break lab0; } while (false); cursor = limit - v_1; lab18: do { // (, line 384 // [, line 384 ket = 
cursor; // (, line 384 // or, line 384 lab19: do { v_10 = limit - cursor; lab20: do { // call mark_ndAn, line 384 if (!r_mark_ndAn()) { break lab20; } break lab19; } while (false); cursor = limit - v_10; // call mark_nU, line 384 if (!r_mark_nU()) { break lab18; } } while (false); // (, line 384 // or, line 384 lab21: do { v_11 = limit - cursor; lab22: do { // (, line 384 // call mark_sU, line 384 if (!r_mark_sU()) { break lab22; } // ], line 384 bra = cursor; // delete, line 384 slice_del(); // try, line 384 v_12 = limit - cursor; lab23: do { // (, line 384 // [, line 384 ket = cursor; // call mark_lAr, line 384 if (!r_mark_lAr()) { cursor = limit - v_12; break lab23; } // ], line 384 bra = cursor; // delete, line 384 slice_del(); // call stem_suffix_chain_before_ki, line 384 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_12; break lab23; } } while (false); break lab21; } while (false); cursor = limit - v_11; // (, line 384 // call mark_lArI, line 384 if (!r_mark_lArI()) { break lab18; } } while (false); break lab0; } while (false); cursor = limit - v_1; lab24: do { // (, line 386 // [, line 386 ket = cursor; // call mark_DAn, line 386 if (!r_mark_DAn()) { break lab24; } // ], line 386 bra = cursor; // delete, line 386 slice_del(); // try, line 386 v_13 = limit - cursor; lab25: do { // (, line 386 // [, line 386 ket = cursor; // (, line 387 // or, line 389 lab26: do { v_14 = limit - cursor; lab27: do { // (, line 388 // call mark_possessives, line 388 if (!r_mark_possessives()) { break lab27; } // ], line 388 bra = cursor; // delete, line 388 slice_del(); // try, line 388 v_15 = limit - cursor; lab28: do { // (, line 388 // [, line 388 ket = cursor; // call mark_lAr, line 388 if (!r_mark_lAr()) { cursor = limit - v_15; break lab28; } // ], line 388 bra = cursor; // delete, line 388 slice_del(); // call stem_suffix_chain_before_ki, line 388 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_15; break lab28; } } while (false); break lab26; } while (false); cursor = limit - v_14; lab29: do { // (, line 390 // call mark_lAr, line 390 if (!r_mark_lAr()) { break lab29; } // ], line 390 bra = cursor; // delete, line 390 slice_del(); // try, line 390 v_16 = limit - cursor; lab30: do { // (, line 390 // call stem_suffix_chain_before_ki, line 390 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_16; break lab30; } } while (false); break lab26; } while (false); cursor = limit - v_14; // (, line 392 // call stem_suffix_chain_before_ki, line 392 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_13; break lab25; } } while (false); } while (false); break lab0; } while (false); cursor = limit - v_1; lab31: do { // (, line 396 // [, line 396 ket = cursor; // or, line 396 lab32: do { v_17 = limit - cursor; lab33: do { // call mark_nUn, line 396 if (!r_mark_nUn()) { break lab33; } break lab32; } while (false); cursor = limit - v_17; // call mark_ylA, line 396 if (!r_mark_ylA()) { break lab31; } } while (false); // ], line 396 bra = cursor; // delete, line 396 slice_del(); // try, line 397 v_18 = limit - cursor; lab34: do { // (, line 397 // or, line 399 lab35: do { v_19 = limit - cursor; lab36: do { // (, line 398 // [, line 398 ket = cursor; // call mark_lAr, line 398 if (!r_mark_lAr()) { break lab36; } // ], line 398 bra = cursor; // delete, line 398 slice_del(); // call stem_suffix_chain_before_ki, line 398 if (!r_stem_suffix_chain_before_ki()) { break lab36; } break lab35; } while (false); cursor = limit - v_19; lab37: do { // (, line 400 // [, line 400 ket = 
cursor; // or, line 400 lab38: do { v_20 = limit - cursor; lab39: do { // call mark_possessives, line 400 if (!r_mark_possessives()) { break lab39; } break lab38; } while (false); cursor = limit - v_20; // call mark_sU, line 400 if (!r_mark_sU()) { break lab37; } } while (false); // ], line 400 bra = cursor; // delete, line 400 slice_del(); // try, line 400 v_21 = limit - cursor; lab40: do { // (, line 400 // [, line 400 ket = cursor; // call mark_lAr, line 400 if (!r_mark_lAr()) { cursor = limit - v_21; break lab40; } // ], line 400 bra = cursor; // delete, line 400 slice_del(); // call stem_suffix_chain_before_ki, line 400 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_21; break lab40; } } while (false); break lab35; } while (false); cursor = limit - v_19; // call stem_suffix_chain_before_ki, line 402 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_18; break lab34; } } while (false); } while (false); break lab0; } while (false); cursor = limit - v_1; lab41: do { // (, line 406 // [, line 406 ket = cursor; // call mark_lArI, line 406 if (!r_mark_lArI()) { break lab41; } // ], line 406 bra = cursor; // delete, line 406 slice_del(); break lab0; } while (false); cursor = limit - v_1; lab42: do { // (, line 408 // call stem_suffix_chain_before_ki, line 408 if (!r_stem_suffix_chain_before_ki()) { break lab42; } break lab0; } while (false); cursor = limit - v_1; lab43: do { // (, line 410 // [, line 410 ket = cursor; // or, line 410 lab44: do { v_22 = limit - cursor; lab45: do { // call mark_DA, line 410 if (!r_mark_DA()) { break lab45; } break lab44; } while (false); cursor = limit - v_22; lab46: do { // call mark_yU, line 410 if (!r_mark_yU()) { break lab46; } break lab44; } while (false); cursor = limit - v_22; // call mark_yA, line 410 if (!r_mark_yA()) { break lab43; } } while (false); // ], line 410 bra = cursor; // delete, line 410 slice_del(); // try, line 410 v_23 = limit - cursor; lab47: do { // (, line 410 // [, line 410 ket = cursor; // (, line 410 // or, line 410 lab48: do { v_24 = limit - cursor; lab49: do { // (, line 410 // call mark_possessives, line 410 if (!r_mark_possessives()) { break lab49; } // ], line 410 bra = cursor; // delete, line 410 slice_del(); // try, line 410 v_25 = limit - cursor; lab50: do { // (, line 410 // [, line 410 ket = cursor; // call mark_lAr, line 410 if (!r_mark_lAr()) { cursor = limit - v_25; break lab50; } } while (false); break lab48; } while (false); cursor = limit - v_24; // call mark_lAr, line 410 if (!r_mark_lAr()) { cursor = limit - v_23; break lab47; } } while (false); // ], line 410 bra = cursor; // delete, line 410 slice_del(); // [, line 410 ket = cursor; // call stem_suffix_chain_before_ki, line 410 if (!r_stem_suffix_chain_before_ki()) { cursor = limit - v_23; break lab47; } } while (false); break lab0; } while (false); cursor = limit - v_1; // (, line 412 // [, line 412 ket = cursor; // or, line 412 lab51: do { v_26 = limit - cursor; lab52: do { // call mark_possessives, line 412 if (!r_mark_possessives()) { break lab52; } break lab51; } while (false); cursor = limit - v_26; // call mark_sU, line 412 if (!r_mark_sU()) { return false; } } while (false); // ], line 412 bra = cursor; // delete, line 412 slice_del(); // try, line 412 v_27 = limit - cursor; lab53: do { // (, line 412 // [, line 412 ket = cursor; // call mark_lAr, line 412 if (!r_mark_lAr()) { cursor = limit - v_27; break lab53; } // ], line 412 bra = cursor; // delete, line 412 slice_del(); // call stem_suffix_chain_before_ki, line 412 if 
(!r_stem_suffix_chain_before_ki()) { cursor = limit - v_27; break lab53; } } while (false); } while (false); return true; } private boolean r_post_process_last_consonants() { int among_var; // (, line 415 // [, line 416 ket = cursor; // substring, line 416 among_var = find_among_b(a_23, 4); if (among_var == 0) { return false; } // ], line 416 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 417 // <-, line 417 slice_from("p"); break; case 2: // (, line 418 // <-, line 418 slice_from("\u00E7"); break; case 3: // (, line 419 // <-, line 419 slice_from("t"); break; case 4: // (, line 420 // <-, line 420 slice_from("k"); break; } return true; } private boolean r_append_U_to_stems_ending_with_d_or_g() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; int v_11; int v_12; int v_13; int v_14; int v_15; // (, line 430 // test, line 431 v_1 = limit - cursor; // (, line 431 // or, line 431 lab0: do { v_2 = limit - cursor; lab1: do { // literal, line 431 if (!(eq_s_b(1, "d"))) { break lab1; } break lab0; } while (false); cursor = limit - v_2; // literal, line 431 if (!(eq_s_b(1, "g"))) { return false; } } while (false); cursor = limit - v_1; // or, line 433 lab2: do { v_3 = limit - cursor; lab3: do { // (, line 432 // test, line 432 v_4 = limit - cursor; // (, line 432 // (, line 432 // goto, line 432 golab4: while(true) { v_5 = limit - cursor; lab5: do { if (!(in_grouping_b(g_vowel, 97, 305))) { break lab5; } cursor = limit - v_5; break golab4; } while (false); cursor = limit - v_5; if (cursor <= limit_backward) { break lab3; } cursor--; } // or, line 432 lab6: do { v_6 = limit - cursor; lab7: do { // literal, line 432 if (!(eq_s_b(1, "a"))) { break lab7; } break lab6; } while (false); cursor = limit - v_6; // literal, line 432 if (!(eq_s_b(1, "\u0131"))) { break lab3; } } while (false); cursor = limit - v_4; // <+, line 432 { int c = cursor; insert(cursor, cursor, "\u0131"); cursor = c; } break lab2; } while (false); cursor = limit - v_3; lab8: do { // (, line 434 // test, line 434 v_7 = limit - cursor; // (, line 434 // (, line 434 // goto, line 434 golab9: while(true) { v_8 = limit - cursor; lab10: do { if (!(in_grouping_b(g_vowel, 97, 305))) { break lab10; } cursor = limit - v_8; break golab9; } while (false); cursor = limit - v_8; if (cursor <= limit_backward) { break lab8; } cursor--; } // or, line 434 lab11: do { v_9 = limit - cursor; lab12: do { // literal, line 434 if (!(eq_s_b(1, "e"))) { break lab12; } break lab11; } while (false); cursor = limit - v_9; // literal, line 434 if (!(eq_s_b(1, "i"))) { break lab8; } } while (false); cursor = limit - v_7; // <+, line 434 { int c = cursor; insert(cursor, cursor, "i"); cursor = c; } break lab2; } while (false); cursor = limit - v_3; lab13: do { // (, line 436 // test, line 436 v_10 = limit - cursor; // (, line 436 // (, line 436 // goto, line 436 golab14: while(true) { v_11 = limit - cursor; lab15: do { if (!(in_grouping_b(g_vowel, 97, 305))) { break lab15; } cursor = limit - v_11; break golab14; } while (false); cursor = limit - v_11; if (cursor <= limit_backward) { break lab13; } cursor--; } // or, line 436 lab16: do { v_12 = limit - cursor; lab17: do { // literal, line 436 if (!(eq_s_b(1, "o"))) { break lab17; } break lab16; } while (false); cursor = limit - v_12; // literal, line 436 if (!(eq_s_b(1, "u"))) { break lab13; } } while (false); cursor = limit - v_10; // <+, line 436 { int c = cursor; insert(cursor, cursor, "u"); cursor = c; } break lab2; } while (false); cursor = 
limit - v_3; // (, line 438 // test, line 438 v_13 = limit - cursor; // (, line 438 // (, line 438 // goto, line 438 golab18: while(true) { v_14 = limit - cursor; lab19: do { if (!(in_grouping_b(g_vowel, 97, 305))) { break lab19; } cursor = limit - v_14; break golab18; } while (false); cursor = limit - v_14; if (cursor <= limit_backward) { return false; } cursor--; } // or, line 438 lab20: do { v_15 = limit - cursor; lab21: do { // literal, line 438 if (!(eq_s_b(1, "\u00F6"))) { break lab21; } break lab20; } while (false); cursor = limit - v_15; // literal, line 438 if (!(eq_s_b(1, "\u00FC"))) { return false; } } while (false); cursor = limit - v_13; // <+, line 438 { int c = cursor; insert(cursor, cursor, "\u00FC"); cursor = c; } } while (false); return true; } private boolean r_more_than_one_syllable_word() { int v_1; int v_3; // (, line 445 // test, line 446 v_1 = cursor; // (, line 446 // atleast, line 446 { int v_2 = 2; // atleast, line 446 replab0: while(true) { v_3 = cursor; lab1: do { // (, line 446 // gopast, line 446 golab2: while(true) { lab3: do { if (!(in_grouping(g_vowel, 97, 305))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { break lab1; } cursor++; } v_2--; continue replab0; } while (false); cursor = v_3; break replab0; } if (v_2 > 0) { return false; } } cursor = v_1; return true; } private boolean r_is_reserved_word() { int v_1; int v_2; int v_4; // (, line 449 // or, line 451 lab0: do { v_1 = cursor; lab1: do { // test, line 450 v_2 = cursor; // (, line 450 // gopast, line 450 golab2: while(true) { lab3: do { // literal, line 450 if (!(eq_s(2, "ad"))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { break lab1; } cursor++; } // (, line 450 I_strlen = 2; // (, line 450 if (!(I_strlen == limit)) { break lab1; } cursor = v_2; break lab0; } while (false); cursor = v_1; // test, line 452 v_4 = cursor; // (, line 452 // gopast, line 452 golab4: while(true) { lab5: do { // literal, line 452 if (!(eq_s(5, "soyad"))) { break lab5; } break golab4; } while (false); if (cursor >= limit) { return false; } cursor++; } // (, line 452 I_strlen = 5; // (, line 452 if (!(I_strlen == limit)) { return false; } cursor = v_4; } while (false); return true; } private boolean r_postlude() { int v_1; int v_2; int v_3; // (, line 455 // not, line 456 { v_1 = cursor; lab0: do { // (, line 456 // call is_reserved_word, line 456 if (!r_is_reserved_word()) { break lab0; } return false; } while (false); cursor = v_1; } // backwards, line 457 limit_backward = cursor; cursor = limit; // (, line 457 // do, line 458 v_2 = limit - cursor; lab1: do { // call append_U_to_stems_ending_with_d_or_g, line 458 if (!r_append_U_to_stems_ending_with_d_or_g()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 459 v_3 = limit - cursor; lab2: do { // call post_process_last_consonants, line 459 if (!r_post_process_last_consonants()) { break lab2; } } while (false); cursor = limit - v_3; cursor = limit_backward; return true; } public boolean stem() { int v_1; int v_2; // (, line 464 // (, line 465 // call more_than_one_syllable_word, line 465 if (!r_more_than_one_syllable_word()) { return false; } // (, line 466 // backwards, line 467 limit_backward = cursor; cursor = limit; // (, line 467 // do, line 468 v_1 = limit - cursor; lab0: do { // call stem_nominal_verb_suffixes, line 468 if (!r_stem_nominal_verb_suffixes()) { break lab0; } } while (false); cursor = limit - v_1; // Boolean test continue_stemming_noun_suffixes, line 469 if 
(!(B_continue_stemming_noun_suffixes)) { return false; } // do, line 470 v_2 = limit - cursor; lab1: do { // call stem_noun_suffixes, line 470 if (!r_stem_noun_suffixes()) { break lab1; } } while (false); cursor = limit - v_2; cursor = limit_backward; // call postlude, line 473 if (!r_postlude()) { return false; } return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/SpanishStemmer.java0000644000175000017500000012324111474320235031332 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. */ public class SpanishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 6, "", this), new Among ( "\u00E1", 0, 1, "", this), new Among ( "\u00E9", 0, 2, "", this), new Among ( "\u00ED", 0, 3, "", this), new Among ( "\u00F3", 0, 4, "", this), new Among ( "\u00FA", 0, 5, "", this) }; private Among a_1[] = { new Among ( "la", -1, -1, "", this), new Among ( "sela", 0, -1, "", this), new Among ( "le", -1, -1, "", this), new Among ( "me", -1, -1, "", this), new Among ( "se", -1, -1, "", this), new Among ( "lo", -1, -1, "", this), new Among ( "selo", 5, -1, "", this), new Among ( "las", -1, -1, "", this), new Among ( "selas", 7, -1, "", this), new Among ( "les", -1, -1, "", this), new Among ( "los", -1, -1, "", this), new Among ( "selos", 10, -1, "", this), new Among ( "nos", -1, -1, "", this) }; private Among a_2[] = { new Among ( "ando", -1, 6, "", this), new Among ( "iendo", -1, 6, "", this), new Among ( "yendo", -1, 7, "", this), new Among ( "\u00E1ndo", -1, 2, "", this), new Among ( "i\u00E9ndo", -1, 1, "", this), new Among ( "ar", -1, 6, "", this), new Among ( "er", -1, 6, "", this), new Among ( "ir", -1, 6, "", this), new Among ( "\u00E1r", -1, 3, "", this), new Among ( "\u00E9r", -1, 4, "", this), new Among ( "\u00EDr", -1, 5, "", this) }; private Among a_3[] = { new Among ( "ic", -1, -1, "", this), new Among ( "ad", -1, -1, "", this), new Among ( "os", -1, -1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_4[] = { new Among ( "able", -1, 1, "", this), new Among ( "ible", -1, 1, "", this), new Among ( "ante", -1, 1, "", this) }; private Among a_5[] = { new Among ( "ic", -1, 1, "", this), new Among ( "abil", -1, 1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_6[] = { new Among ( "ica", -1, 1, "", this), new Among ( "ancia", -1, 2, "", this), new Among ( "encia", -1, 5, "", this), new Among ( "adora", -1, 2, "", this), new Among ( "osa", -1, 1, "", this), new Among ( "ista", -1, 1, "", this), new Among ( "iva", -1, 9, "", this), new Among ( "anza", -1, 1, "", this), new Among ( "log\u00EDa", -1, 3, "", this), new Among ( "idad", -1, 8, "", this), new Among ( "able", -1, 1, "", this), new Among ( "ible", -1, 1, "", this), new Among ( "ante", -1, 2, "", this), new Among ( "mente", -1, 7, "", this), new Among ( "amente", 13, 6, "", this), new Among ( "aci\u00F3n", -1, 2, "", this), new Among ( "uci\u00F3n", -1, 4, "", this), new Among ( "ico", -1, 1, "", this), new Among ( "ismo", -1, 1, "", this), new Among ( "oso", -1, 1, "", this), new Among ( "amiento", -1, 1, "", this), new Among ( "imiento", -1, 1, "", this), new Among ( "ivo", -1, 9, "", this), new Among ( "ador", -1, 2, "", this), new Among ( "icas", -1, 1, "", this), new Among ( "ancias", -1, 2, "", this), new Among ( "encias", 
-1, 5, "", this), new Among ( "adoras", -1, 2, "", this), new Among ( "osas", -1, 1, "", this), new Among ( "istas", -1, 1, "", this), new Among ( "ivas", -1, 9, "", this), new Among ( "anzas", -1, 1, "", this), new Among ( "log\u00EDas", -1, 3, "", this), new Among ( "idades", -1, 8, "", this), new Among ( "ables", -1, 1, "", this), new Among ( "ibles", -1, 1, "", this), new Among ( "aciones", -1, 2, "", this), new Among ( "uciones", -1, 4, "", this), new Among ( "adores", -1, 2, "", this), new Among ( "antes", -1, 2, "", this), new Among ( "icos", -1, 1, "", this), new Among ( "ismos", -1, 1, "", this), new Among ( "osos", -1, 1, "", this), new Among ( "amientos", -1, 1, "", this), new Among ( "imientos", -1, 1, "", this), new Among ( "ivos", -1, 9, "", this) }; private Among a_7[] = { new Among ( "ya", -1, 1, "", this), new Among ( "ye", -1, 1, "", this), new Among ( "yan", -1, 1, "", this), new Among ( "yen", -1, 1, "", this), new Among ( "yeron", -1, 1, "", this), new Among ( "yendo", -1, 1, "", this), new Among ( "yo", -1, 1, "", this), new Among ( "yas", -1, 1, "", this), new Among ( "yes", -1, 1, "", this), new Among ( "yais", -1, 1, "", this), new Among ( "yamos", -1, 1, "", this), new Among ( "y\u00F3", -1, 1, "", this) }; private Among a_8[] = { new Among ( "aba", -1, 2, "", this), new Among ( "ada", -1, 2, "", this), new Among ( "ida", -1, 2, "", this), new Among ( "ara", -1, 2, "", this), new Among ( "iera", -1, 2, "", this), new Among ( "\u00EDa", -1, 2, "", this), new Among ( "ar\u00EDa", 5, 2, "", this), new Among ( "er\u00EDa", 5, 2, "", this), new Among ( "ir\u00EDa", 5, 2, "", this), new Among ( "ad", -1, 2, "", this), new Among ( "ed", -1, 2, "", this), new Among ( "id", -1, 2, "", this), new Among ( "ase", -1, 2, "", this), new Among ( "iese", -1, 2, "", this), new Among ( "aste", -1, 2, "", this), new Among ( "iste", -1, 2, "", this), new Among ( "an", -1, 2, "", this), new Among ( "aban", 16, 2, "", this), new Among ( "aran", 16, 2, "", this), new Among ( "ieran", 16, 2, "", this), new Among ( "\u00EDan", 16, 2, "", this), new Among ( "ar\u00EDan", 20, 2, "", this), new Among ( "er\u00EDan", 20, 2, "", this), new Among ( "ir\u00EDan", 20, 2, "", this), new Among ( "en", -1, 1, "", this), new Among ( "asen", 24, 2, "", this), new Among ( "iesen", 24, 2, "", this), new Among ( "aron", -1, 2, "", this), new Among ( "ieron", -1, 2, "", this), new Among ( "ar\u00E1n", -1, 2, "", this), new Among ( "er\u00E1n", -1, 2, "", this), new Among ( "ir\u00E1n", -1, 2, "", this), new Among ( "ado", -1, 2, "", this), new Among ( "ido", -1, 2, "", this), new Among ( "ando", -1, 2, "", this), new Among ( "iendo", -1, 2, "", this), new Among ( "ar", -1, 2, "", this), new Among ( "er", -1, 2, "", this), new Among ( "ir", -1, 2, "", this), new Among ( "as", -1, 2, "", this), new Among ( "abas", 39, 2, "", this), new Among ( "adas", 39, 2, "", this), new Among ( "idas", 39, 2, "", this), new Among ( "aras", 39, 2, "", this), new Among ( "ieras", 39, 2, "", this), new Among ( "\u00EDas", 39, 2, "", this), new Among ( "ar\u00EDas", 45, 2, "", this), new Among ( "er\u00EDas", 45, 2, "", this), new Among ( "ir\u00EDas", 45, 2, "", this), new Among ( "es", -1, 1, "", this), new Among ( "ases", 49, 2, "", this), new Among ( "ieses", 49, 2, "", this), new Among ( "abais", -1, 2, "", this), new Among ( "arais", -1, 2, "", this), new Among ( "ierais", -1, 2, "", this), new Among ( "\u00EDais", -1, 2, "", this), new Among ( "ar\u00EDais", 55, 2, "", this), new Among ( "er\u00EDais", 55, 2, "", 
this), new Among ( "ir\u00EDais", 55, 2, "", this), new Among ( "aseis", -1, 2, "", this), new Among ( "ieseis", -1, 2, "", this), new Among ( "asteis", -1, 2, "", this), new Among ( "isteis", -1, 2, "", this), new Among ( "\u00E1is", -1, 2, "", this), new Among ( "\u00E9is", -1, 1, "", this), new Among ( "ar\u00E9is", 64, 2, "", this), new Among ( "er\u00E9is", 64, 2, "", this), new Among ( "ir\u00E9is", 64, 2, "", this), new Among ( "ados", -1, 2, "", this), new Among ( "idos", -1, 2, "", this), new Among ( "amos", -1, 2, "", this), new Among ( "\u00E1bamos", 70, 2, "", this), new Among ( "\u00E1ramos", 70, 2, "", this), new Among ( "i\u00E9ramos", 70, 2, "", this), new Among ( "\u00EDamos", 70, 2, "", this), new Among ( "ar\u00EDamos", 74, 2, "", this), new Among ( "er\u00EDamos", 74, 2, "", this), new Among ( "ir\u00EDamos", 74, 2, "", this), new Among ( "emos", -1, 1, "", this), new Among ( "aremos", 78, 2, "", this), new Among ( "eremos", 78, 2, "", this), new Among ( "iremos", 78, 2, "", this), new Among ( "\u00E1semos", 78, 2, "", this), new Among ( "i\u00E9semos", 78, 2, "", this), new Among ( "imos", -1, 2, "", this), new Among ( "ar\u00E1s", -1, 2, "", this), new Among ( "er\u00E1s", -1, 2, "", this), new Among ( "ir\u00E1s", -1, 2, "", this), new Among ( "\u00EDs", -1, 2, "", this), new Among ( "ar\u00E1", -1, 2, "", this), new Among ( "er\u00E1", -1, 2, "", this), new Among ( "ir\u00E1", -1, 2, "", this), new Among ( "ar\u00E9", -1, 2, "", this), new Among ( "er\u00E9", -1, 2, "", this), new Among ( "ir\u00E9", -1, 2, "", this), new Among ( "i\u00F3", -1, 2, "", this) }; private Among a_9[] = { new Among ( "a", -1, 1, "", this), new Among ( "e", -1, 2, "", this), new Among ( "o", -1, 1, "", this), new Among ( "os", -1, 1, "", this), new Among ( "\u00E1", -1, 1, "", this), new Among ( "\u00E9", -1, 2, "", this), new Among ( "\u00ED", -1, 1, "", this), new Among ( "\u00F3", -1, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 10 }; private int I_p2; private int I_p1; private int I_pV; private void copy_from(SpanishStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_2; int v_3; int v_6; int v_8; // (, line 31 I_pV = limit; I_p1 = limit; I_p2 = limit; // do, line 37 v_1 = cursor; lab0: do { // (, line 37 // or, line 39 lab1: do { v_2 = cursor; lab2: do { // (, line 38 if (!(in_grouping(g_v, 97, 252))) { break lab2; } // or, line 38 lab3: do { v_3 = cursor; lab4: do { // (, line 38 if (!(out_grouping(g_v, 97, 252))) { break lab4; } // gopast, line 38 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 252))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab4; } cursor++; } break lab3; } while (false); cursor = v_3; // (, line 38 if (!(in_grouping(g_v, 97, 252))) { break lab2; } // gopast, line 38 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 252))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab2; } cursor++; } } while (false); break lab1; } while (false); cursor = v_2; // (, line 40 if (!(out_grouping(g_v, 97, 252))) { break lab0; } // or, line 40 lab9: do { v_6 = cursor; lab10: do { // (, line 40 if (!(out_grouping(g_v, 97, 252))) { break lab10; } // gopast, line 40 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 252))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab10; } cursor++; } break 
lab9; } while (false); cursor = v_6; // (, line 40 if (!(in_grouping(g_v, 97, 252))) { break lab0; } // next, line 40 if (cursor >= limit) { break lab0; } cursor++; } while (false); } while (false); // setmark pV, line 41 I_pV = cursor; } while (false); cursor = v_1; // do, line 43 v_8 = cursor; lab13: do { // (, line 43 // gopast, line 44 golab14: while(true) { lab15: do { if (!(in_grouping(g_v, 97, 252))) { break lab15; } break golab14; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 44 golab16: while(true) { lab17: do { if (!(out_grouping(g_v, 97, 252))) { break lab17; } break golab16; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p1, line 44 I_p1 = cursor; // gopast, line 45 golab18: while(true) { lab19: do { if (!(in_grouping(g_v, 97, 252))) { break lab19; } break golab18; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 45 golab20: while(true) { lab21: do { if (!(out_grouping(g_v, 97, 252))) { break lab21; } break golab20; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p2, line 45 I_p2 = cursor; } while (false); cursor = v_8; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 49 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 49 // [, line 50 bra = cursor; // substring, line 50 among_var = find_among(a_0, 6); if (among_var == 0) { break lab1; } // ], line 50 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 51 // <-, line 51 slice_from("a"); break; case 2: // (, line 52 // <-, line 52 slice_from("e"); break; case 3: // (, line 53 // <-, line 53 slice_from("i"); break; case 4: // (, line 54 // <-, line 54 slice_from("o"); break; case 5: // (, line 55 // <-, line 55 slice_from("u"); break; case 6: // (, line 57 // next, line 57 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_RV() { if (!(I_pV <= cursor)) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_attached_pronoun() { int among_var; // (, line 67 // [, line 68 ket = cursor; // substring, line 68 if (find_among_b(a_1, 13) == 0) { return false; } // ], line 68 bra = cursor; // substring, line 72 among_var = find_among_b(a_2, 11); if (among_var == 0) { return false; } // call RV, line 72 if (!r_RV()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 73 // ], line 73 bra = cursor; // <-, line 73 slice_from("iendo"); break; case 2: // (, line 74 // ], line 74 bra = cursor; // <-, line 74 slice_from("ando"); break; case 3: // (, line 75 // ], line 75 bra = cursor; // <-, line 75 slice_from("ar"); break; case 4: // (, line 76 // ], line 76 bra = cursor; // <-, line 76 slice_from("er"); break; case 5: // (, line 77 // ], line 77 bra = cursor; // <-, line 77 slice_from("ir"); break; case 6: // (, line 81 // delete, line 81 slice_del(); break; case 7: // (, line 82 // literal, line 82 if (!(eq_s_b(1, "u"))) { return false; } // delete, line 82 slice_del(); break; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 86 // [, line 87 ket = cursor; // substring, line 87 among_var = find_among_b(a_6, 46); if (among_var == 0) { return false; } // ], line 87 bra = cursor; 
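    /* among_var identifies which a_6 suffix matched (ket/bra now bracket it in
       the buffer); each case below either deletes that suffix with slice_del()
       or rewrites it with slice_from() -- e.g. "log\u00EDa(s)" -> "log",
       "uci\u00F3n"/"uciones" -> "u" -- subject to the R1/R2 region checks. */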
switch(among_var) { case 0: return false; case 1: // (, line 98 // call R2, line 99 if (!r_R2()) { return false; } // delete, line 99 slice_del(); break; case 2: // (, line 104 // call R2, line 105 if (!r_R2()) { return false; } // delete, line 105 slice_del(); // try, line 106 v_1 = limit - cursor; lab0: do { // (, line 106 // [, line 106 ket = cursor; // literal, line 106 if (!(eq_s_b(2, "ic"))) { cursor = limit - v_1; break lab0; } // ], line 106 bra = cursor; // call R2, line 106 if (!r_R2()) { cursor = limit - v_1; break lab0; } // delete, line 106 slice_del(); } while (false); break; case 3: // (, line 110 // call R2, line 111 if (!r_R2()) { return false; } // <-, line 111 slice_from("log"); break; case 4: // (, line 114 // call R2, line 115 if (!r_R2()) { return false; } // <-, line 115 slice_from("u"); break; case 5: // (, line 118 // call R2, line 119 if (!r_R2()) { return false; } // <-, line 119 slice_from("ente"); break; case 6: // (, line 122 // call R1, line 123 if (!r_R1()) { return false; } // delete, line 123 slice_del(); // try, line 124 v_2 = limit - cursor; lab1: do { // (, line 124 // [, line 125 ket = cursor; // substring, line 125 among_var = find_among_b(a_3, 4); if (among_var == 0) { cursor = limit - v_2; break lab1; } // ], line 125 bra = cursor; // call R2, line 125 if (!r_R2()) { cursor = limit - v_2; break lab1; } // delete, line 125 slice_del(); switch(among_var) { case 0: cursor = limit - v_2; break lab1; case 1: // (, line 126 // [, line 126 ket = cursor; // literal, line 126 if (!(eq_s_b(2, "at"))) { cursor = limit - v_2; break lab1; } // ], line 126 bra = cursor; // call R2, line 126 if (!r_R2()) { cursor = limit - v_2; break lab1; } // delete, line 126 slice_del(); break; } } while (false); break; case 7: // (, line 134 // call R2, line 135 if (!r_R2()) { return false; } // delete, line 135 slice_del(); // try, line 136 v_3 = limit - cursor; lab2: do { // (, line 136 // [, line 137 ket = cursor; // substring, line 137 among_var = find_among_b(a_4, 3); if (among_var == 0) { cursor = limit - v_3; break lab2; } // ], line 137 bra = cursor; switch(among_var) { case 0: cursor = limit - v_3; break lab2; case 1: // (, line 140 // call R2, line 140 if (!r_R2()) { cursor = limit - v_3; break lab2; } // delete, line 140 slice_del(); break; } } while (false); break; case 8: // (, line 146 // call R2, line 147 if (!r_R2()) { return false; } // delete, line 147 slice_del(); // try, line 148 v_4 = limit - cursor; lab3: do { // (, line 148 // [, line 149 ket = cursor; // substring, line 149 among_var = find_among_b(a_5, 3); if (among_var == 0) { cursor = limit - v_4; break lab3; } // ], line 149 bra = cursor; switch(among_var) { case 0: cursor = limit - v_4; break lab3; case 1: // (, line 152 // call R2, line 152 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 152 slice_del(); break; } } while (false); break; case 9: // (, line 158 // call R2, line 159 if (!r_R2()) { return false; } // delete, line 159 slice_del(); // try, line 160 v_5 = limit - cursor; lab4: do { // (, line 160 // [, line 161 ket = cursor; // literal, line 161 if (!(eq_s_b(2, "at"))) { cursor = limit - v_5; break lab4; } // ], line 161 bra = cursor; // call R2, line 161 if (!r_R2()) { cursor = limit - v_5; break lab4; } // delete, line 161 slice_del(); } while (false); break; } return true; } private boolean r_y_verb_suffix() { int among_var; int v_1; int v_2; // (, line 167 // setlimit, line 168 v_1 = limit - cursor; // tomark, line 168 if (cursor < I_pV) { return false; } cursor = 
I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 168 // [, line 168 ket = cursor; // substring, line 168 among_var = find_among_b(a_7, 12); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 168 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 171 // literal, line 171 if (!(eq_s_b(1, "u"))) { return false; } // delete, line 171 slice_del(); break; } return true; } private boolean r_verb_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; // (, line 175 // setlimit, line 176 v_1 = limit - cursor; // tomark, line 176 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 176 // [, line 176 ket = cursor; // substring, line 176 among_var = find_among_b(a_8, 96); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 176 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 179 // try, line 179 v_3 = limit - cursor; lab0: do { // (, line 179 // literal, line 179 if (!(eq_s_b(1, "u"))) { cursor = limit - v_3; break lab0; } // test, line 179 v_4 = limit - cursor; // literal, line 179 if (!(eq_s_b(1, "g"))) { cursor = limit - v_3; break lab0; } cursor = limit - v_4; } while (false); // ], line 179 bra = cursor; // delete, line 179 slice_del(); break; case 2: // (, line 200 // delete, line 200 slice_del(); break; } return true; } private boolean r_residual_suffix() { int among_var; int v_1; int v_2; // (, line 204 // [, line 205 ket = cursor; // substring, line 205 among_var = find_among_b(a_9, 8); if (among_var == 0) { return false; } // ], line 205 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 208 // call RV, line 208 if (!r_RV()) { return false; } // delete, line 208 slice_del(); break; case 2: // (, line 210 // call RV, line 210 if (!r_RV()) { return false; } // delete, line 210 slice_del(); // try, line 210 v_1 = limit - cursor; lab0: do { // (, line 210 // [, line 210 ket = cursor; // literal, line 210 if (!(eq_s_b(1, "u"))) { cursor = limit - v_1; break lab0; } // ], line 210 bra = cursor; // test, line 210 v_2 = limit - cursor; // literal, line 210 if (!(eq_s_b(1, "g"))) { cursor = limit - v_1; break lab0; } cursor = limit - v_2; // call RV, line 210 if (!r_RV()) { cursor = limit - v_1; break lab0; } // delete, line 210 slice_del(); } while (false); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; // (, line 215 // do, line 216 v_1 = cursor; lab0: do { // call mark_regions, line 216 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 217 limit_backward = cursor; cursor = limit; // (, line 217 // do, line 218 v_2 = limit - cursor; lab1: do { // call attached_pronoun, line 218 if (!r_attached_pronoun()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 219 v_3 = limit - cursor; lab2: do { // (, line 219 // or, line 219 lab3: do { v_4 = limit - cursor; lab4: do { // call standard_suffix, line 219 if (!r_standard_suffix()) { break lab4; } break lab3; } while (false); cursor = limit - v_4; lab5: do { // call y_verb_suffix, line 220 if (!r_y_verb_suffix()) { break lab5; } break lab3; } while (false); cursor = limit - v_4; // call verb_suffix, line 221 if (!r_verb_suffix()) { break lab2; } } while (false); } while (false); cursor = limit - v_3; // do, line 223 v_5 = limit - cursor; lab6: do { // call residual_suffix, 
line 223 if (!r_residual_suffix()) { break lab6; } } while (false); cursor = limit - v_5; cursor = limit_backward; // do, line 225 v_6 = cursor; lab7: do { // call postlude, line 225 if (!r_postlude()) { break lab7; } } while (false); cursor = v_6; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/DutchStemmer.java0000644000175000017500000006464011474320235031003 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. */ public class DutchStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 6, "", this), new Among ( "\u00E1", 0, 1, "", this), new Among ( "\u00E4", 0, 1, "", this), new Among ( "\u00E9", 0, 2, "", this), new Among ( "\u00EB", 0, 2, "", this), new Among ( "\u00ED", 0, 3, "", this), new Among ( "\u00EF", 0, 3, "", this), new Among ( "\u00F3", 0, 4, "", this), new Among ( "\u00F6", 0, 4, "", this), new Among ( "\u00FA", 0, 5, "", this), new Among ( "\u00FC", 0, 5, "", this) }; private Among a_1[] = { new Among ( "", -1, 3, "", this), new Among ( "I", 0, 2, "", this), new Among ( "Y", 0, 1, "", this) }; private Among a_2[] = { new Among ( "dd", -1, -1, "", this), new Among ( "kk", -1, -1, "", this), new Among ( "tt", -1, -1, "", this) }; private Among a_3[] = { new Among ( "ene", -1, 2, "", this), new Among ( "se", -1, 3, "", this), new Among ( "en", -1, 2, "", this), new Among ( "heden", 2, 1, "", this), new Among ( "s", -1, 3, "", this) }; private Among a_4[] = { new Among ( "end", -1, 1, "", this), new Among ( "ig", -1, 2, "", this), new Among ( "ing", -1, 1, "", this), new Among ( "lijk", -1, 3, "", this), new Among ( "baar", -1, 4, "", this), new Among ( "bar", -1, 5, "", this) }; private Among a_5[] = { new Among ( "aa", -1, -1, "", this), new Among ( "ee", -1, -1, "", this), new Among ( "oo", -1, -1, "", this), new Among ( "uu", -1, -1, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 }; private static final char g_v_I[] = {1, 0, 0, 17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 }; private static final char g_v_j[] = {17, 67, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128 }; private int I_p2; private int I_p1; private boolean B_e_found; private void copy_from(DutchStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; B_e_found = other.B_e_found; super.copy_from(other); } private boolean r_prelude() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; // (, line 41 // test, line 42 v_1 = cursor; // repeat, line 42 replab0: while(true) { v_2 = cursor; lab1: do { // (, line 42 // [, line 43 bra = cursor; // substring, line 43 among_var = find_among(a_0, 11); if (among_var == 0) { break lab1; } // ], line 43 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 45 // <-, line 45 slice_from("a"); break; case 2: // (, line 47 // <-, line 47 slice_from("e"); break; case 3: // (, line 49 // <-, line 49 slice_from("i"); break; case 4: // (, line 51 // <-, line 51 slice_from("o"); break; case 5: // (, line 53 // <-, line 53 slice_from("u"); break; case 6: // (, line 54 // next, line 54 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_2; break replab0; } cursor = v_1; // try, line 57 v_3 = cursor; lab2: do { // (, line 57 // [, line 57 bra = cursor; 
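    /* r_prelude folds the accented vowels listed in a_0 to plain a/e/i/o/u,
       rewrites an initial "y" to "Y", an "i" between vowels to "I" and a "y"
       after a vowel to "Y" so that they behave as consonants during suffix
       removal; r_postlude (table a_1) later maps "I"/"Y" back to "i"/"y". */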
// literal, line 57 if (!(eq_s(1, "y"))) { cursor = v_3; break lab2; } // ], line 57 ket = cursor; // <-, line 57 slice_from("Y"); } while (false); // repeat, line 58 replab3: while(true) { v_4 = cursor; lab4: do { // goto, line 58 golab5: while(true) { v_5 = cursor; lab6: do { // (, line 58 if (!(in_grouping(g_v, 97, 232))) { break lab6; } // [, line 59 bra = cursor; // or, line 59 lab7: do { v_6 = cursor; lab8: do { // (, line 59 // literal, line 59 if (!(eq_s(1, "i"))) { break lab8; } // ], line 59 ket = cursor; if (!(in_grouping(g_v, 97, 232))) { break lab8; } // <-, line 59 slice_from("I"); break lab7; } while (false); cursor = v_6; // (, line 60 // literal, line 60 if (!(eq_s(1, "y"))) { break lab6; } // ], line 60 ket = cursor; // <-, line 60 slice_from("Y"); } while (false); cursor = v_5; break golab5; } while (false); cursor = v_5; if (cursor >= limit) { break lab4; } cursor++; } continue replab3; } while (false); cursor = v_4; break replab3; } return true; } private boolean r_mark_regions() { // (, line 64 I_p1 = limit; I_p2 = limit; // gopast, line 69 golab0: while(true) { lab1: do { if (!(in_grouping(g_v, 97, 232))) { break lab1; } break golab0; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 69 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 232))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 69 I_p1 = cursor; // try, line 70 lab4: do { // (, line 70 if (!(I_p1 < 3)) { break lab4; } I_p1 = 3; } while (false); // gopast, line 71 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 232))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { return false; } cursor++; } // gopast, line 71 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 232))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p2, line 71 I_p2 = cursor; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 75 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 75 // [, line 77 bra = cursor; // substring, line 77 among_var = find_among(a_1, 3); if (among_var == 0) { break lab1; } // ], line 77 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 78 // <-, line 78 slice_from("y"); break; case 2: // (, line 79 // <-, line 79 slice_from("i"); break; case 3: // (, line 80 // next, line 80 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_undouble() { int v_1; // (, line 90 // test, line 91 v_1 = limit - cursor; // among, line 91 if (find_among_b(a_2, 3) == 0) { return false; } cursor = limit - v_1; // [, line 91 ket = cursor; // next, line 91 if (cursor <= limit_backward) { return false; } cursor--; // ], line 91 bra = cursor; // delete, line 91 slice_del(); return true; } private boolean r_e_ending() { int v_1; // (, line 94 // unset e_found, line 95 B_e_found = false; // [, line 96 ket = cursor; // literal, line 96 if (!(eq_s_b(1, "e"))) { return false; } // ], line 96 bra = cursor; // call R1, line 96 if (!r_R1()) { return false; } // test, line 96 v_1 = limit - cursor; if (!(out_grouping_b(g_v, 97, 232))) { return false; } cursor = limit - v_1; // delete, line 96 slice_del(); // set 
e_found, line 97 B_e_found = true; // call undouble, line 98 if (!r_undouble()) { return false; } return true; } private boolean r_en_ending() { int v_1; int v_2; // (, line 101 // call R1, line 102 if (!r_R1()) { return false; } // and, line 102 v_1 = limit - cursor; if (!(out_grouping_b(g_v, 97, 232))) { return false; } cursor = limit - v_1; // not, line 102 { v_2 = limit - cursor; lab0: do { // literal, line 102 if (!(eq_s_b(3, "gem"))) { break lab0; } return false; } while (false); cursor = limit - v_2; } // delete, line 102 slice_del(); // call undouble, line 103 if (!r_undouble()) { return false; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; int v_9; int v_10; // (, line 106 // do, line 107 v_1 = limit - cursor; lab0: do { // (, line 107 // [, line 108 ket = cursor; // substring, line 108 among_var = find_among_b(a_3, 5); if (among_var == 0) { break lab0; } // ], line 108 bra = cursor; switch(among_var) { case 0: break lab0; case 1: // (, line 110 // call R1, line 110 if (!r_R1()) { break lab0; } // <-, line 110 slice_from("heid"); break; case 2: // (, line 113 // call en_ending, line 113 if (!r_en_ending()) { break lab0; } break; case 3: // (, line 116 // call R1, line 116 if (!r_R1()) { break lab0; } if (!(out_grouping_b(g_v_j, 97, 232))) { break lab0; } // delete, line 116 slice_del(); break; } } while (false); cursor = limit - v_1; // do, line 120 v_2 = limit - cursor; lab1: do { // call e_ending, line 120 if (!r_e_ending()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 122 v_3 = limit - cursor; lab2: do { // (, line 122 // [, line 122 ket = cursor; // literal, line 122 if (!(eq_s_b(4, "heid"))) { break lab2; } // ], line 122 bra = cursor; // call R2, line 122 if (!r_R2()) { break lab2; } // not, line 122 { v_4 = limit - cursor; lab3: do { // literal, line 122 if (!(eq_s_b(1, "c"))) { break lab3; } break lab2; } while (false); cursor = limit - v_4; } // delete, line 122 slice_del(); // [, line 123 ket = cursor; // literal, line 123 if (!(eq_s_b(2, "en"))) { break lab2; } // ], line 123 bra = cursor; // call en_ending, line 123 if (!r_en_ending()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 126 v_5 = limit - cursor; lab4: do { // (, line 126 // [, line 127 ket = cursor; // substring, line 127 among_var = find_among_b(a_4, 6); if (among_var == 0) { break lab4; } // ], line 127 bra = cursor; switch(among_var) { case 0: break lab4; case 1: // (, line 129 // call R2, line 129 if (!r_R2()) { break lab4; } // delete, line 129 slice_del(); // or, line 130 lab5: do { v_6 = limit - cursor; lab6: do { // (, line 130 // [, line 130 ket = cursor; // literal, line 130 if (!(eq_s_b(2, "ig"))) { break lab6; } // ], line 130 bra = cursor; // call R2, line 130 if (!r_R2()) { break lab6; } // not, line 130 { v_7 = limit - cursor; lab7: do { // literal, line 130 if (!(eq_s_b(1, "e"))) { break lab7; } break lab6; } while (false); cursor = limit - v_7; } // delete, line 130 slice_del(); break lab5; } while (false); cursor = limit - v_6; // call undouble, line 130 if (!r_undouble()) { break lab4; } } while (false); break; case 2: // (, line 133 // call R2, line 133 if (!r_R2()) { break lab4; } // not, line 133 { v_8 = limit - cursor; lab8: do { // literal, line 133 if (!(eq_s_b(1, "e"))) { break lab8; } break lab4; } while (false); cursor = limit - v_8; } // delete, line 133 slice_del(); break; case 3: // (, line 136 // call R2, line 136 if (!r_R2()) { break lab4; 
} // delete, line 136 slice_del(); // call e_ending, line 136 if (!r_e_ending()) { break lab4; } break; case 4: // (, line 139 // call R2, line 139 if (!r_R2()) { break lab4; } // delete, line 139 slice_del(); break; case 5: // (, line 142 // call R2, line 142 if (!r_R2()) { break lab4; } // Boolean test e_found, line 142 if (!(B_e_found)) { break lab4; } // delete, line 142 slice_del(); break; } } while (false); cursor = limit - v_5; // do, line 146 v_9 = limit - cursor; lab9: do { // (, line 146 if (!(out_grouping_b(g_v_I, 73, 232))) { break lab9; } // test, line 148 v_10 = limit - cursor; // (, line 148 // among, line 149 if (find_among_b(a_5, 4) == 0) { break lab9; } if (!(out_grouping_b(g_v, 97, 232))) { break lab9; } cursor = limit - v_10; // [, line 152 ket = cursor; // next, line 152 if (cursor <= limit_backward) { break lab9; } cursor--; // ], line 152 bra = cursor; // delete, line 152 slice_del(); } while (false); cursor = limit - v_9; return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; // (, line 157 // do, line 159 v_1 = cursor; lab0: do { // call prelude, line 159 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 160 v_2 = cursor; lab1: do { // call mark_regions, line 160 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 161 limit_backward = cursor; cursor = limit; // do, line 162 v_3 = limit - cursor; lab2: do { // call standard_suffix, line 162 if (!r_standard_suffix()) { break lab2; } } while (false); cursor = limit - v_3; cursor = limit_backward; // do, line 163 v_4 = cursor; lab3: do { // call postlude, line 163 if (!r_postlude()) { break lab3; } } while (false); cursor = v_4; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/PortugueseStemmer.java0000644000175000017500000011757211474320235032101 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
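 * <p>Suffix removal is gated by the RV, R1 and R2 regions computed in
 * r_mark_regions; r_prelude rewrites the nasal vowels "\u00E3"/"\u00F5" to the
 * internal markers "a~"/"o~" and r_postlude restores them.</p>
 * <p>A minimal usage sketch (setCurrent and getCurrent are inherited from the
 * SnowballProgram base class and are assumed here; they are not defined in
 * this file):</p>
 * <pre>
 *   PortugueseStemmer stemmer = new PortugueseStemmer();
 *   stemmer.setCurrent("gatos");          // word to stem
 *   stemmer.stem();                       // run the algorithm in place
 *   String stemmed = stemmer.getCurrent();
 * </pre>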
*/ public class PortugueseStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 3, "", this), new Among ( "\u00E3", 0, 1, "", this), new Among ( "\u00F5", 0, 2, "", this) }; private Among a_1[] = { new Among ( "", -1, 3, "", this), new Among ( "a~", 0, 1, "", this), new Among ( "o~", 0, 2, "", this) }; private Among a_2[] = { new Among ( "ic", -1, -1, "", this), new Among ( "ad", -1, -1, "", this), new Among ( "os", -1, -1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_3[] = { new Among ( "ante", -1, 1, "", this), new Among ( "avel", -1, 1, "", this), new Among ( "\u00EDvel", -1, 1, "", this) }; private Among a_4[] = { new Among ( "ic", -1, 1, "", this), new Among ( "abil", -1, 1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_5[] = { new Among ( "ica", -1, 1, "", this), new Among ( "\u00E2ncia", -1, 1, "", this), new Among ( "\u00EAncia", -1, 4, "", this), new Among ( "ira", -1, 9, "", this), new Among ( "adora", -1, 1, "", this), new Among ( "osa", -1, 1, "", this), new Among ( "ista", -1, 1, "", this), new Among ( "iva", -1, 8, "", this), new Among ( "eza", -1, 1, "", this), new Among ( "log\u00EDa", -1, 2, "", this), new Among ( "idade", -1, 7, "", this), new Among ( "ante", -1, 1, "", this), new Among ( "mente", -1, 6, "", this), new Among ( "amente", 12, 5, "", this), new Among ( "\u00E1vel", -1, 1, "", this), new Among ( "\u00EDvel", -1, 1, "", this), new Among ( "uci\u00F3n", -1, 3, "", this), new Among ( "ico", -1, 1, "", this), new Among ( "ismo", -1, 1, "", this), new Among ( "oso", -1, 1, "", this), new Among ( "amento", -1, 1, "", this), new Among ( "imento", -1, 1, "", this), new Among ( "ivo", -1, 8, "", this), new Among ( "a\u00E7a~o", -1, 1, "", this), new Among ( "ador", -1, 1, "", this), new Among ( "icas", -1, 1, "", this), new Among ( "\u00EAncias", -1, 4, "", this), new Among ( "iras", -1, 9, "", this), new Among ( "adoras", -1, 1, "", this), new Among ( "osas", -1, 1, "", this), new Among ( "istas", -1, 1, "", this), new Among ( "ivas", -1, 8, "", this), new Among ( "ezas", -1, 1, "", this), new Among ( "log\u00EDas", -1, 2, "", this), new Among ( "idades", -1, 7, "", this), new Among ( "uciones", -1, 3, "", this), new Among ( "adores", -1, 1, "", this), new Among ( "antes", -1, 1, "", this), new Among ( "a\u00E7o~es", -1, 1, "", this), new Among ( "icos", -1, 1, "", this), new Among ( "ismos", -1, 1, "", this), new Among ( "osos", -1, 1, "", this), new Among ( "amentos", -1, 1, "", this), new Among ( "imentos", -1, 1, "", this), new Among ( "ivos", -1, 8, "", this) }; private Among a_6[] = { new Among ( "ada", -1, 1, "", this), new Among ( "ida", -1, 1, "", this), new Among ( "ia", -1, 1, "", this), new Among ( "aria", 2, 1, "", this), new Among ( "eria", 2, 1, "", this), new Among ( "iria", 2, 1, "", this), new Among ( "ara", -1, 1, "", this), new Among ( "era", -1, 1, "", this), new Among ( "ira", -1, 1, "", this), new Among ( "ava", -1, 1, "", this), new Among ( "asse", -1, 1, "", this), new Among ( "esse", -1, 1, "", this), new Among ( "isse", -1, 1, "", this), new Among ( "aste", -1, 1, "", this), new Among ( "este", -1, 1, "", this), new Among ( "iste", -1, 1, "", this), new Among ( "ei", -1, 1, "", this), new Among ( "arei", 16, 1, "", this), new Among ( "erei", 16, 1, "", this), new Among ( "irei", 16, 1, "", this), new Among ( "am", -1, 1, "", this), new Among ( "iam", 20, 1, "", this), new Among ( "ariam", 21, 1, "", this), new Among ( "eriam", 21, 1, "", this), new Among ( "iriam", 21, 1, "", 
this), new Among ( "aram", 20, 1, "", this), new Among ( "eram", 20, 1, "", this), new Among ( "iram", 20, 1, "", this), new Among ( "avam", 20, 1, "", this), new Among ( "em", -1, 1, "", this), new Among ( "arem", 29, 1, "", this), new Among ( "erem", 29, 1, "", this), new Among ( "irem", 29, 1, "", this), new Among ( "assem", 29, 1, "", this), new Among ( "essem", 29, 1, "", this), new Among ( "issem", 29, 1, "", this), new Among ( "ado", -1, 1, "", this), new Among ( "ido", -1, 1, "", this), new Among ( "ando", -1, 1, "", this), new Among ( "endo", -1, 1, "", this), new Among ( "indo", -1, 1, "", this), new Among ( "ara~o", -1, 1, "", this), new Among ( "era~o", -1, 1, "", this), new Among ( "ira~o", -1, 1, "", this), new Among ( "ar", -1, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "ir", -1, 1, "", this), new Among ( "as", -1, 1, "", this), new Among ( "adas", 47, 1, "", this), new Among ( "idas", 47, 1, "", this), new Among ( "ias", 47, 1, "", this), new Among ( "arias", 50, 1, "", this), new Among ( "erias", 50, 1, "", this), new Among ( "irias", 50, 1, "", this), new Among ( "aras", 47, 1, "", this), new Among ( "eras", 47, 1, "", this), new Among ( "iras", 47, 1, "", this), new Among ( "avas", 47, 1, "", this), new Among ( "es", -1, 1, "", this), new Among ( "ardes", 58, 1, "", this), new Among ( "erdes", 58, 1, "", this), new Among ( "irdes", 58, 1, "", this), new Among ( "ares", 58, 1, "", this), new Among ( "eres", 58, 1, "", this), new Among ( "ires", 58, 1, "", this), new Among ( "asses", 58, 1, "", this), new Among ( "esses", 58, 1, "", this), new Among ( "isses", 58, 1, "", this), new Among ( "astes", 58, 1, "", this), new Among ( "estes", 58, 1, "", this), new Among ( "istes", 58, 1, "", this), new Among ( "is", -1, 1, "", this), new Among ( "ais", 71, 1, "", this), new Among ( "eis", 71, 1, "", this), new Among ( "areis", 73, 1, "", this), new Among ( "ereis", 73, 1, "", this), new Among ( "ireis", 73, 1, "", this), new Among ( "\u00E1reis", 73, 1, "", this), new Among ( "\u00E9reis", 73, 1, "", this), new Among ( "\u00EDreis", 73, 1, "", this), new Among ( "\u00E1sseis", 73, 1, "", this), new Among ( "\u00E9sseis", 73, 1, "", this), new Among ( "\u00EDsseis", 73, 1, "", this), new Among ( "\u00E1veis", 73, 1, "", this), new Among ( "\u00EDeis", 73, 1, "", this), new Among ( "ar\u00EDeis", 84, 1, "", this), new Among ( "er\u00EDeis", 84, 1, "", this), new Among ( "ir\u00EDeis", 84, 1, "", this), new Among ( "ados", -1, 1, "", this), new Among ( "idos", -1, 1, "", this), new Among ( "amos", -1, 1, "", this), new Among ( "\u00E1ramos", 90, 1, "", this), new Among ( "\u00E9ramos", 90, 1, "", this), new Among ( "\u00EDramos", 90, 1, "", this), new Among ( "\u00E1vamos", 90, 1, "", this), new Among ( "\u00EDamos", 90, 1, "", this), new Among ( "ar\u00EDamos", 95, 1, "", this), new Among ( "er\u00EDamos", 95, 1, "", this), new Among ( "ir\u00EDamos", 95, 1, "", this), new Among ( "emos", -1, 1, "", this), new Among ( "aremos", 99, 1, "", this), new Among ( "eremos", 99, 1, "", this), new Among ( "iremos", 99, 1, "", this), new Among ( "\u00E1ssemos", 99, 1, "", this), new Among ( "\u00EAssemos", 99, 1, "", this), new Among ( "\u00EDssemos", 99, 1, "", this), new Among ( "imos", -1, 1, "", this), new Among ( "armos", -1, 1, "", this), new Among ( "ermos", -1, 1, "", this), new Among ( "irmos", -1, 1, "", this), new Among ( "\u00E1mos", -1, 1, "", this), new Among ( "ar\u00E1s", -1, 1, "", this), new Among ( "er\u00E1s", -1, 1, "", this), new Among ( "ir\u00E1s", 
-1, 1, "", this), new Among ( "eu", -1, 1, "", this), new Among ( "iu", -1, 1, "", this), new Among ( "ou", -1, 1, "", this), new Among ( "ar\u00E1", -1, 1, "", this), new Among ( "er\u00E1", -1, 1, "", this), new Among ( "ir\u00E1", -1, 1, "", this) }; private Among a_7[] = { new Among ( "a", -1, 1, "", this), new Among ( "i", -1, 1, "", this), new Among ( "o", -1, 1, "", this), new Among ( "os", -1, 1, "", this), new Among ( "\u00E1", -1, 1, "", this), new Among ( "\u00ED", -1, 1, "", this), new Among ( "\u00F3", -1, 1, "", this) }; private Among a_8[] = { new Among ( "e", -1, 1, "", this), new Among ( "\u00E7", -1, 2, "", this), new Among ( "\u00E9", -1, 1, "", this), new Among ( "\u00EA", -1, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 19, 12, 2 }; private int I_p2; private int I_p1; private int I_pV; private void copy_from(PortugueseStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; super.copy_from(other); } private boolean r_prelude() { int among_var; int v_1; // repeat, line 36 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 36 // [, line 37 bra = cursor; // substring, line 37 among_var = find_among(a_0, 3); if (among_var == 0) { break lab1; } // ], line 37 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 38 // <-, line 38 slice_from("a~"); break; case 2: // (, line 39 // <-, line 39 slice_from("o~"); break; case 3: // (, line 40 // next, line 40 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_mark_regions() { int v_1; int v_2; int v_3; int v_6; int v_8; // (, line 44 I_pV = limit; I_p1 = limit; I_p2 = limit; // do, line 50 v_1 = cursor; lab0: do { // (, line 50 // or, line 52 lab1: do { v_2 = cursor; lab2: do { // (, line 51 if (!(in_grouping(g_v, 97, 250))) { break lab2; } // or, line 51 lab3: do { v_3 = cursor; lab4: do { // (, line 51 if (!(out_grouping(g_v, 97, 250))) { break lab4; } // gopast, line 51 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 250))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab4; } cursor++; } break lab3; } while (false); cursor = v_3; // (, line 51 if (!(in_grouping(g_v, 97, 250))) { break lab2; } // gopast, line 51 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 250))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab2; } cursor++; } } while (false); break lab1; } while (false); cursor = v_2; // (, line 53 if (!(out_grouping(g_v, 97, 250))) { break lab0; } // or, line 53 lab9: do { v_6 = cursor; lab10: do { // (, line 53 if (!(out_grouping(g_v, 97, 250))) { break lab10; } // gopast, line 53 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 250))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab10; } cursor++; } break lab9; } while (false); cursor = v_6; // (, line 53 if (!(in_grouping(g_v, 97, 250))) { break lab0; } // next, line 53 if (cursor >= limit) { break lab0; } cursor++; } while (false); } while (false); // setmark pV, line 54 I_pV = cursor; } while (false); cursor = v_1; // do, line 56 v_8 = cursor; lab13: do { // (, line 56 // gopast, line 57 golab14: while(true) { lab15: do { if (!(in_grouping(g_v, 97, 250))) { break lab15; } break golab14; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 57 golab16: while(true) { lab17: do { if (!(out_grouping(g_v, 97, 
250))) { break lab17; } break golab16; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p1, line 57 I_p1 = cursor; // gopast, line 58 golab18: while(true) { lab19: do { if (!(in_grouping(g_v, 97, 250))) { break lab19; } break golab18; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 58 golab20: while(true) { lab21: do { if (!(out_grouping(g_v, 97, 250))) { break lab21; } break golab20; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p2, line 58 I_p2 = cursor; } while (false); cursor = v_8; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 62 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 62 // [, line 63 bra = cursor; // substring, line 63 among_var = find_among(a_1, 3); if (among_var == 0) { break lab1; } // ], line 63 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 64 // <-, line 64 slice_from("\u00E3"); break; case 2: // (, line 65 // <-, line 65 slice_from("\u00F5"); break; case 3: // (, line 66 // next, line 66 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_RV() { if (!(I_pV <= cursor)) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; // (, line 76 // [, line 77 ket = cursor; // substring, line 77 among_var = find_among_b(a_5, 45); if (among_var == 0) { return false; } // ], line 77 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 92 // call R2, line 93 if (!r_R2()) { return false; } // delete, line 93 slice_del(); break; case 2: // (, line 97 // call R2, line 98 if (!r_R2()) { return false; } // <-, line 98 slice_from("log"); break; case 3: // (, line 101 // call R2, line 102 if (!r_R2()) { return false; } // <-, line 102 slice_from("u"); break; case 4: // (, line 105 // call R2, line 106 if (!r_R2()) { return false; } // <-, line 106 slice_from("ente"); break; case 5: // (, line 109 // call R1, line 110 if (!r_R1()) { return false; } // delete, line 110 slice_del(); // try, line 111 v_1 = limit - cursor; lab0: do { // (, line 111 // [, line 112 ket = cursor; // substring, line 112 among_var = find_among_b(a_2, 4); if (among_var == 0) { cursor = limit - v_1; break lab0; } // ], line 112 bra = cursor; // call R2, line 112 if (!r_R2()) { cursor = limit - v_1; break lab0; } // delete, line 112 slice_del(); switch(among_var) { case 0: cursor = limit - v_1; break lab0; case 1: // (, line 113 // [, line 113 ket = cursor; // literal, line 113 if (!(eq_s_b(2, "at"))) { cursor = limit - v_1; break lab0; } // ], line 113 bra = cursor; // call R2, line 113 if (!r_R2()) { cursor = limit - v_1; break lab0; } // delete, line 113 slice_del(); break; } } while (false); break; case 6: // (, line 121 // call R2, line 122 if (!r_R2()) { return false; } // delete, line 122 slice_del(); // try, line 123 v_2 = limit - cursor; lab1: do { // (, line 123 // [, line 124 ket = cursor; // substring, line 124 among_var = find_among_b(a_3, 3); if (among_var == 0) { cursor = limit - v_2; break lab1; } // ], line 124 bra = cursor; switch(among_var) { case 0: cursor = limit - v_2; break lab1; case 1: // (, line 127 // call R2, line 127 if (!r_R2()) { cursor = limit - v_2; break lab1; } // 
delete, line 127 slice_del(); break; } } while (false); break; case 7: // (, line 133 // call R2, line 134 if (!r_R2()) { return false; } // delete, line 134 slice_del(); // try, line 135 v_3 = limit - cursor; lab2: do { // (, line 135 // [, line 136 ket = cursor; // substring, line 136 among_var = find_among_b(a_4, 3); if (among_var == 0) { cursor = limit - v_3; break lab2; } // ], line 136 bra = cursor; switch(among_var) { case 0: cursor = limit - v_3; break lab2; case 1: // (, line 139 // call R2, line 139 if (!r_R2()) { cursor = limit - v_3; break lab2; } // delete, line 139 slice_del(); break; } } while (false); break; case 8: // (, line 145 // call R2, line 146 if (!r_R2()) { return false; } // delete, line 146 slice_del(); // try, line 147 v_4 = limit - cursor; lab3: do { // (, line 147 // [, line 148 ket = cursor; // literal, line 148 if (!(eq_s_b(2, "at"))) { cursor = limit - v_4; break lab3; } // ], line 148 bra = cursor; // call R2, line 148 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 148 slice_del(); } while (false); break; case 9: // (, line 152 // call RV, line 153 if (!r_RV()) { return false; } // literal, line 153 if (!(eq_s_b(1, "e"))) { return false; } // <-, line 154 slice_from("ir"); break; } return true; } private boolean r_verb_suffix() { int among_var; int v_1; int v_2; // setlimit, line 159 v_1 = limit - cursor; // tomark, line 159 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 159 // [, line 160 ket = cursor; // substring, line 160 among_var = find_among_b(a_6, 120); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 160 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 179 // delete, line 179 slice_del(); break; } limit_backward = v_2; return true; } private boolean r_residual_suffix() { int among_var; // (, line 183 // [, line 184 ket = cursor; // substring, line 184 among_var = find_among_b(a_7, 7); if (among_var == 0) { return false; } // ], line 184 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 187 // call RV, line 187 if (!r_RV()) { return false; } // delete, line 187 slice_del(); break; } return true; } private boolean r_residual_form() { int among_var; int v_1; int v_2; int v_3; // (, line 191 // [, line 192 ket = cursor; // substring, line 192 among_var = find_among_b(a_8, 4); if (among_var == 0) { return false; } // ], line 192 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 194 // call RV, line 194 if (!r_RV()) { return false; } // delete, line 194 slice_del(); // [, line 194 ket = cursor; // or, line 194 lab0: do { v_1 = limit - cursor; lab1: do { // (, line 194 // literal, line 194 if (!(eq_s_b(1, "u"))) { break lab1; } // ], line 194 bra = cursor; // test, line 194 v_2 = limit - cursor; // literal, line 194 if (!(eq_s_b(1, "g"))) { break lab1; } cursor = limit - v_2; break lab0; } while (false); cursor = limit - v_1; // (, line 195 // literal, line 195 if (!(eq_s_b(1, "i"))) { return false; } // ], line 195 bra = cursor; // test, line 195 v_3 = limit - cursor; // literal, line 195 if (!(eq_s_b(1, "c"))) { return false; } cursor = limit - v_3; } while (false); // call RV, line 195 if (!r_RV()) { return false; } // delete, line 195 slice_del(); break; case 2: // (, line 196 // <-, line 196 slice_from("c"); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; 
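    /* stem() pipeline: r_prelude marks the nasal vowels, r_mark_regions computes
       pV/p1/p2, then, scanning backwards from the end of the word, the standard
       and verb suffix rules are tried (falling back to the residual suffix
       rule), the residual form rule is applied, and finally r_postlude restores
       "a~"/"o~" to "\u00E3"/"\u00F5". */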
int v_9; int v_10; // (, line 201 // do, line 202 v_1 = cursor; lab0: do { // call prelude, line 202 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 203 v_2 = cursor; lab1: do { // call mark_regions, line 203 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 204 limit_backward = cursor; cursor = limit; // (, line 204 // do, line 205 v_3 = limit - cursor; lab2: do { // (, line 205 // or, line 209 lab3: do { v_4 = limit - cursor; lab4: do { // (, line 206 // and, line 207 v_5 = limit - cursor; // (, line 206 // or, line 206 lab5: do { v_6 = limit - cursor; lab6: do { // call standard_suffix, line 206 if (!r_standard_suffix()) { break lab6; } break lab5; } while (false); cursor = limit - v_6; // call verb_suffix, line 206 if (!r_verb_suffix()) { break lab4; } } while (false); cursor = limit - v_5; // do, line 207 v_7 = limit - cursor; lab7: do { // (, line 207 // [, line 207 ket = cursor; // literal, line 207 if (!(eq_s_b(1, "i"))) { break lab7; } // ], line 207 bra = cursor; // test, line 207 v_8 = limit - cursor; // literal, line 207 if (!(eq_s_b(1, "c"))) { break lab7; } cursor = limit - v_8; // call RV, line 207 if (!r_RV()) { break lab7; } // delete, line 207 slice_del(); } while (false); cursor = limit - v_7; break lab3; } while (false); cursor = limit - v_4; // call residual_suffix, line 209 if (!r_residual_suffix()) { break lab2; } } while (false); } while (false); cursor = limit - v_3; // do, line 211 v_9 = limit - cursor; lab8: do { // call residual_form, line 211 if (!r_residual_form()) { break lab8; } } while (false); cursor = limit - v_9; cursor = limit_backward; // do, line 213 v_10 = cursor; lab9: do { // call postlude, line 213 if (!r_postlude()) { break lab9; } } while (false); cursor = v_10; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/RomanianStemmer.java0000644000175000017500000011113011474320235031463 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class RomanianStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 3, "", this), new Among ( "I", 0, 1, "", this), new Among ( "U", 0, 2, "", this) }; private Among a_1[] = { new Among ( "ea", -1, 3, "", this), new Among ( "a\u0163ia", -1, 7, "", this), new Among ( "aua", -1, 2, "", this), new Among ( "iua", -1, 4, "", this), new Among ( "a\u0163ie", -1, 7, "", this), new Among ( "ele", -1, 3, "", this), new Among ( "ile", -1, 5, "", this), new Among ( "iile", 6, 4, "", this), new Among ( "iei", -1, 4, "", this), new Among ( "atei", -1, 6, "", this), new Among ( "ii", -1, 4, "", this), new Among ( "ului", -1, 1, "", this), new Among ( "ul", -1, 1, "", this), new Among ( "elor", -1, 3, "", this), new Among ( "ilor", -1, 4, "", this), new Among ( "iilor", 14, 4, "", this) }; private Among a_2[] = { new Among ( "icala", -1, 4, "", this), new Among ( "iciva", -1, 4, "", this), new Among ( "ativa", -1, 5, "", this), new Among ( "itiva", -1, 6, "", this), new Among ( "icale", -1, 4, "", this), new Among ( "a\u0163iune", -1, 5, "", this), new Among ( "i\u0163iune", -1, 6, "", this), new Among ( "atoare", -1, 5, "", this), new Among ( "itoare", -1, 6, "", this), new Among ( "\u0103toare", -1, 5, "", this), new Among ( "icitate", -1, 4, "", this), new Among ( "abilitate", -1, 1, "", this), new Among ( "ibilitate", -1, 2, "", this), new Among ( "ivitate", -1, 3, "", this), new Among ( "icive", -1, 4, "", this), new Among ( "ative", -1, 5, "", this), new Among ( "itive", -1, 6, "", this), new Among ( "icali", -1, 4, "", this), new Among ( "atori", -1, 5, "", this), new Among ( "icatori", 18, 4, "", this), new Among ( "itori", -1, 6, "", this), new Among ( "\u0103tori", -1, 5, "", this), new Among ( "icitati", -1, 4, "", this), new Among ( "abilitati", -1, 1, "", this), new Among ( "ivitati", -1, 3, "", this), new Among ( "icivi", -1, 4, "", this), new Among ( "ativi", -1, 5, "", this), new Among ( "itivi", -1, 6, "", this), new Among ( "icit\u0103i", -1, 4, "", this), new Among ( "abilit\u0103i", -1, 1, "", this), new Among ( "ivit\u0103i", -1, 3, "", this), new Among ( "icit\u0103\u0163i", -1, 4, "", this), new Among ( "abilit\u0103\u0163i", -1, 1, "", this), new Among ( "ivit\u0103\u0163i", -1, 3, "", this), new Among ( "ical", -1, 4, "", this), new Among ( "ator", -1, 5, "", this), new Among ( "icator", 35, 4, "", this), new Among ( "itor", -1, 6, "", this), new Among ( "\u0103tor", -1, 5, "", this), new Among ( "iciv", -1, 4, "", this), new Among ( "ativ", -1, 5, "", this), new Among ( "itiv", -1, 6, "", this), new Among ( "ical\u0103", -1, 4, "", this), new Among ( "iciv\u0103", -1, 4, "", this), new Among ( "ativ\u0103", -1, 5, "", this), new Among ( "itiv\u0103", -1, 6, "", this) }; private Among a_3[] = { new Among ( "ica", -1, 1, "", this), new Among ( "abila", -1, 1, "", this), new Among ( "ibila", -1, 1, "", this), new Among ( "oasa", -1, 1, "", this), new Among ( "ata", -1, 1, "", this), new Among ( "ita", -1, 1, "", this), new Among ( "anta", -1, 1, "", this), new Among ( "ista", -1, 3, "", this), new Among ( "uta", -1, 1, "", this), new Among ( "iva", -1, 1, "", this), new Among ( "ic", -1, 1, "", this), new Among ( "ice", -1, 1, "", this), new Among ( "abile", -1, 1, "", this), new Among ( "ibile", -1, 1, "", this), new Among ( "isme", -1, 3, "", this), new Among ( "iune", -1, 2, "", this), new Among ( "oase", -1, 1, "", this), new Among ( "ate", -1, 1, "", this), new Among ( "itate", 17, 1, "", this), new Among ( "ite", -1, 1, "", this), new Among ( 
"ante", -1, 1, "", this), new Among ( "iste", -1, 3, "", this), new Among ( "ute", -1, 1, "", this), new Among ( "ive", -1, 1, "", this), new Among ( "ici", -1, 1, "", this), new Among ( "abili", -1, 1, "", this), new Among ( "ibili", -1, 1, "", this), new Among ( "iuni", -1, 2, "", this), new Among ( "atori", -1, 1, "", this), new Among ( "osi", -1, 1, "", this), new Among ( "ati", -1, 1, "", this), new Among ( "itati", 30, 1, "", this), new Among ( "iti", -1, 1, "", this), new Among ( "anti", -1, 1, "", this), new Among ( "isti", -1, 3, "", this), new Among ( "uti", -1, 1, "", this), new Among ( "i\u015Fti", -1, 3, "", this), new Among ( "ivi", -1, 1, "", this), new Among ( "it\u0103i", -1, 1, "", this), new Among ( "o\u015Fi", -1, 1, "", this), new Among ( "it\u0103\u0163i", -1, 1, "", this), new Among ( "abil", -1, 1, "", this), new Among ( "ibil", -1, 1, "", this), new Among ( "ism", -1, 3, "", this), new Among ( "ator", -1, 1, "", this), new Among ( "os", -1, 1, "", this), new Among ( "at", -1, 1, "", this), new Among ( "it", -1, 1, "", this), new Among ( "ant", -1, 1, "", this), new Among ( "ist", -1, 3, "", this), new Among ( "ut", -1, 1, "", this), new Among ( "iv", -1, 1, "", this), new Among ( "ic\u0103", -1, 1, "", this), new Among ( "abil\u0103", -1, 1, "", this), new Among ( "ibil\u0103", -1, 1, "", this), new Among ( "oas\u0103", -1, 1, "", this), new Among ( "at\u0103", -1, 1, "", this), new Among ( "it\u0103", -1, 1, "", this), new Among ( "ant\u0103", -1, 1, "", this), new Among ( "ist\u0103", -1, 3, "", this), new Among ( "ut\u0103", -1, 1, "", this), new Among ( "iv\u0103", -1, 1, "", this) }; private Among a_4[] = { new Among ( "ea", -1, 1, "", this), new Among ( "ia", -1, 1, "", this), new Among ( "esc", -1, 1, "", this), new Among ( "\u0103sc", -1, 1, "", this), new Among ( "ind", -1, 1, "", this), new Among ( "\u00E2nd", -1, 1, "", this), new Among ( "are", -1, 1, "", this), new Among ( "ere", -1, 1, "", this), new Among ( "ire", -1, 1, "", this), new Among ( "\u00E2re", -1, 1, "", this), new Among ( "se", -1, 2, "", this), new Among ( "ase", 10, 1, "", this), new Among ( "sese", 10, 2, "", this), new Among ( "ise", 10, 1, "", this), new Among ( "use", 10, 1, "", this), new Among ( "\u00E2se", 10, 1, "", this), new Among ( "e\u015Fte", -1, 1, "", this), new Among ( "\u0103\u015Fte", -1, 1, "", this), new Among ( "eze", -1, 1, "", this), new Among ( "ai", -1, 1, "", this), new Among ( "eai", 19, 1, "", this), new Among ( "iai", 19, 1, "", this), new Among ( "sei", -1, 2, "", this), new Among ( "e\u015Fti", -1, 1, "", this), new Among ( "\u0103\u015Fti", -1, 1, "", this), new Among ( "ui", -1, 1, "", this), new Among ( "ezi", -1, 1, "", this), new Among ( "\u00E2i", -1, 1, "", this), new Among ( "a\u015Fi", -1, 1, "", this), new Among ( "se\u015Fi", -1, 2, "", this), new Among ( "ase\u015Fi", 29, 1, "", this), new Among ( "sese\u015Fi", 29, 2, "", this), new Among ( "ise\u015Fi", 29, 1, "", this), new Among ( "use\u015Fi", 29, 1, "", this), new Among ( "\u00E2se\u015Fi", 29, 1, "", this), new Among ( "i\u015Fi", -1, 1, "", this), new Among ( "u\u015Fi", -1, 1, "", this), new Among ( "\u00E2\u015Fi", -1, 1, "", this), new Among ( "a\u0163i", -1, 2, "", this), new Among ( "ea\u0163i", 38, 1, "", this), new Among ( "ia\u0163i", 38, 1, "", this), new Among ( "e\u0163i", -1, 2, "", this), new Among ( "i\u0163i", -1, 2, "", this), new Among ( "\u00E2\u0163i", -1, 2, "", this), new Among ( "ar\u0103\u0163i", -1, 1, "", this), new Among ( "ser\u0103\u0163i", -1, 2, "", this), 
new Among ( "aser\u0103\u0163i", 45, 1, "", this), new Among ( "seser\u0103\u0163i", 45, 2, "", this), new Among ( "iser\u0103\u0163i", 45, 1, "", this), new Among ( "user\u0103\u0163i", 45, 1, "", this), new Among ( "\u00E2ser\u0103\u0163i", 45, 1, "", this), new Among ( "ir\u0103\u0163i", -1, 1, "", this), new Among ( "ur\u0103\u0163i", -1, 1, "", this), new Among ( "\u00E2r\u0103\u0163i", -1, 1, "", this), new Among ( "am", -1, 1, "", this), new Among ( "eam", 54, 1, "", this), new Among ( "iam", 54, 1, "", this), new Among ( "em", -1, 2, "", this), new Among ( "asem", 57, 1, "", this), new Among ( "sesem", 57, 2, "", this), new Among ( "isem", 57, 1, "", this), new Among ( "usem", 57, 1, "", this), new Among ( "\u00E2sem", 57, 1, "", this), new Among ( "im", -1, 2, "", this), new Among ( "\u00E2m", -1, 2, "", this), new Among ( "\u0103m", -1, 2, "", this), new Among ( "ar\u0103m", 65, 1, "", this), new Among ( "ser\u0103m", 65, 2, "", this), new Among ( "aser\u0103m", 67, 1, "", this), new Among ( "seser\u0103m", 67, 2, "", this), new Among ( "iser\u0103m", 67, 1, "", this), new Among ( "user\u0103m", 67, 1, "", this), new Among ( "\u00E2ser\u0103m", 67, 1, "", this), new Among ( "ir\u0103m", 65, 1, "", this), new Among ( "ur\u0103m", 65, 1, "", this), new Among ( "\u00E2r\u0103m", 65, 1, "", this), new Among ( "au", -1, 1, "", this), new Among ( "eau", 76, 1, "", this), new Among ( "iau", 76, 1, "", this), new Among ( "indu", -1, 1, "", this), new Among ( "\u00E2ndu", -1, 1, "", this), new Among ( "ez", -1, 1, "", this), new Among ( "easc\u0103", -1, 1, "", this), new Among ( "ar\u0103", -1, 1, "", this), new Among ( "ser\u0103", -1, 2, "", this), new Among ( "aser\u0103", 84, 1, "", this), new Among ( "seser\u0103", 84, 2, "", this), new Among ( "iser\u0103", 84, 1, "", this), new Among ( "user\u0103", 84, 1, "", this), new Among ( "\u00E2ser\u0103", 84, 1, "", this), new Among ( "ir\u0103", -1, 1, "", this), new Among ( "ur\u0103", -1, 1, "", this), new Among ( "\u00E2r\u0103", -1, 1, "", this), new Among ( "eaz\u0103", -1, 1, "", this) }; private Among a_5[] = { new Among ( "a", -1, 1, "", this), new Among ( "e", -1, 1, "", this), new Among ( "ie", 1, 1, "", this), new Among ( "i", -1, 1, "", this), new Among ( "\u0103", -1, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 32, 0, 0, 4 }; private boolean B_standard_suffix_removed; private int I_p2; private int I_p1; private int I_pV; private void copy_from(RomanianStemmer other) { B_standard_suffix_removed = other.B_standard_suffix_removed; I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; super.copy_from(other); } private boolean r_prelude() { int v_1; int v_2; int v_3; // (, line 31 // repeat, line 32 replab0: while(true) { v_1 = cursor; lab1: do { // goto, line 32 golab2: while(true) { v_2 = cursor; lab3: do { // (, line 32 if (!(in_grouping(g_v, 97, 259))) { break lab3; } // [, line 33 bra = cursor; // or, line 33 lab4: do { v_3 = cursor; lab5: do { // (, line 33 // literal, line 33 if (!(eq_s(1, "u"))) { break lab5; } // ], line 33 ket = cursor; if (!(in_grouping(g_v, 97, 259))) { break lab5; } // <-, line 33 slice_from("U"); break lab4; } while (false); cursor = v_3; // (, line 34 // literal, line 34 if (!(eq_s(1, "i"))) { break lab3; } // ], line 34 ket = cursor; if (!(in_grouping(g_v, 97, 259))) { break lab3; } // <-, line 34 slice_from("I"); } while (false); cursor = v_2; break golab2; } while (false); cursor = v_2; if (cursor >= limit) { break lab1; } cursor++; 
} continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_mark_regions() { int v_1; int v_2; int v_3; int v_6; int v_8; // (, line 38 I_pV = limit; I_p1 = limit; I_p2 = limit; // do, line 44 v_1 = cursor; lab0: do { // (, line 44 // or, line 46 lab1: do { v_2 = cursor; lab2: do { // (, line 45 if (!(in_grouping(g_v, 97, 259))) { break lab2; } // or, line 45 lab3: do { v_3 = cursor; lab4: do { // (, line 45 if (!(out_grouping(g_v, 97, 259))) { break lab4; } // gopast, line 45 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 259))) { break lab6; } break golab5; } while (false); if (cursor >= limit) { break lab4; } cursor++; } break lab3; } while (false); cursor = v_3; // (, line 45 if (!(in_grouping(g_v, 97, 259))) { break lab2; } // gopast, line 45 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 259))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab2; } cursor++; } } while (false); break lab1; } while (false); cursor = v_2; // (, line 47 if (!(out_grouping(g_v, 97, 259))) { break lab0; } // or, line 47 lab9: do { v_6 = cursor; lab10: do { // (, line 47 if (!(out_grouping(g_v, 97, 259))) { break lab10; } // gopast, line 47 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 259))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab10; } cursor++; } break lab9; } while (false); cursor = v_6; // (, line 47 if (!(in_grouping(g_v, 97, 259))) { break lab0; } // next, line 47 if (cursor >= limit) { break lab0; } cursor++; } while (false); } while (false); // setmark pV, line 48 I_pV = cursor; } while (false); cursor = v_1; // do, line 50 v_8 = cursor; lab13: do { // (, line 50 // gopast, line 51 golab14: while(true) { lab15: do { if (!(in_grouping(g_v, 97, 259))) { break lab15; } break golab14; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 51 golab16: while(true) { lab17: do { if (!(out_grouping(g_v, 97, 259))) { break lab17; } break golab16; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p1, line 51 I_p1 = cursor; // gopast, line 52 golab18: while(true) { lab19: do { if (!(in_grouping(g_v, 97, 259))) { break lab19; } break golab18; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 52 golab20: while(true) { lab21: do { if (!(out_grouping(g_v, 97, 259))) { break lab21; } break golab20; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p2, line 52 I_p2 = cursor; } while (false); cursor = v_8; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 56 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 56 // [, line 58 bra = cursor; // substring, line 58 among_var = find_among(a_0, 3); if (among_var == 0) { break lab1; } // ], line 58 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 59 // <-, line 59 slice_from("i"); break; case 2: // (, line 60 // <-, line 60 slice_from("u"); break; case 3: // (, line 61 // next, line 61 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_RV() { if (!(I_pV <= cursor)) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_step_0() { int among_var; int v_1; // (, line 72 // [, line 73 ket 
= cursor; // substring, line 73 among_var = find_among_b(a_1, 16); if (among_var == 0) { return false; } // ], line 73 bra = cursor; // call R1, line 73 if (!r_R1()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 75 // delete, line 75 slice_del(); break; case 2: // (, line 77 // <-, line 77 slice_from("a"); break; case 3: // (, line 79 // <-, line 79 slice_from("e"); break; case 4: // (, line 81 // <-, line 81 slice_from("i"); break; case 5: // (, line 83 // not, line 83 { v_1 = limit - cursor; lab0: do { // literal, line 83 if (!(eq_s_b(2, "ab"))) { break lab0; } return false; } while (false); cursor = limit - v_1; } // <-, line 83 slice_from("i"); break; case 6: // (, line 85 // <-, line 85 slice_from("at"); break; case 7: // (, line 87 // <-, line 87 slice_from("a\u0163i"); break; } return true; } private boolean r_combo_suffix() { int among_var; int v_1; // test, line 91 v_1 = limit - cursor; // (, line 91 // [, line 92 ket = cursor; // substring, line 92 among_var = find_among_b(a_2, 46); if (among_var == 0) { return false; } // ], line 92 bra = cursor; // call R1, line 92 if (!r_R1()) { return false; } // (, line 92 switch(among_var) { case 0: return false; case 1: // (, line 100 // <-, line 101 slice_from("abil"); break; case 2: // (, line 103 // <-, line 104 slice_from("ibil"); break; case 3: // (, line 106 // <-, line 107 slice_from("iv"); break; case 4: // (, line 112 // <-, line 113 slice_from("ic"); break; case 5: // (, line 117 // <-, line 118 slice_from("at"); break; case 6: // (, line 121 // <-, line 122 slice_from("it"); break; } // set standard_suffix_removed, line 125 B_standard_suffix_removed = true; cursor = limit - v_1; return true; } private boolean r_standard_suffix() { int among_var; int v_1; // (, line 129 // unset standard_suffix_removed, line 130 B_standard_suffix_removed = false; // repeat, line 131 replab0: while(true) { v_1 = limit - cursor; lab1: do { // call combo_suffix, line 131 if (!r_combo_suffix()) { break lab1; } continue replab0; } while (false); cursor = limit - v_1; break replab0; } // [, line 132 ket = cursor; // substring, line 132 among_var = find_among_b(a_3, 62); if (among_var == 0) { return false; } // ], line 132 bra = cursor; // call R2, line 132 if (!r_R2()) { return false; } // (, line 132 switch(among_var) { case 0: return false; case 1: // (, line 148 // delete, line 149 slice_del(); break; case 2: // (, line 151 // literal, line 152 if (!(eq_s_b(1, "\u0163"))) { return false; } // ], line 152 bra = cursor; // <-, line 152 slice_from("t"); break; case 3: // (, line 155 // <-, line 156 slice_from("ist"); break; } // set standard_suffix_removed, line 160 B_standard_suffix_removed = true; return true; } private boolean r_verb_suffix() { int among_var; int v_1; int v_2; int v_3; // setlimit, line 164 v_1 = limit - cursor; // tomark, line 164 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 164 // [, line 165 ket = cursor; // substring, line 165 among_var = find_among_b(a_4, 94); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 165 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 200 // or, line 200 lab0: do { v_3 = limit - cursor; lab1: do { if (!(out_grouping_b(g_v, 97, 259))) { break lab1; } break lab0; } while (false); cursor = limit - v_3; // literal, line 200 if (!(eq_s_b(1, "u"))) { limit_backward = v_2; return false; } } while (false); // delete, line 
200 slice_del(); break; case 2: // (, line 214 // delete, line 214 slice_del(); break; } limit_backward = v_2; return true; } private boolean r_vowel_suffix() { int among_var; // (, line 218 // [, line 219 ket = cursor; // substring, line 219 among_var = find_among_b(a_5, 5); if (among_var == 0) { return false; } // ], line 219 bra = cursor; // call RV, line 219 if (!r_RV()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 220 // delete, line 220 slice_del(); break; } return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; int v_8; // (, line 225 // do, line 226 v_1 = cursor; lab0: do { // call prelude, line 226 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 227 v_2 = cursor; lab1: do { // call mark_regions, line 227 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 228 limit_backward = cursor; cursor = limit; // (, line 228 // do, line 229 v_3 = limit - cursor; lab2: do { // call step_0, line 229 if (!r_step_0()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 230 v_4 = limit - cursor; lab3: do { // call standard_suffix, line 230 if (!r_standard_suffix()) { break lab3; } } while (false); cursor = limit - v_4; // do, line 231 v_5 = limit - cursor; lab4: do { // (, line 231 // or, line 231 lab5: do { v_6 = limit - cursor; lab6: do { // Boolean test standard_suffix_removed, line 231 if (!(B_standard_suffix_removed)) { break lab6; } break lab5; } while (false); cursor = limit - v_6; // call verb_suffix, line 231 if (!r_verb_suffix()) { break lab4; } } while (false); } while (false); cursor = limit - v_5; // do, line 232 v_7 = limit - cursor; lab7: do { // call vowel_suffix, line 232 if (!r_vowel_suffix()) { break lab7; } } while (false); cursor = limit - v_7; cursor = limit_backward; // do, line 234 v_8 = cursor; lab8: do { // call postlude, line 234 if (!r_postlude()) { break lab8; } } while (false); cursor = v_8; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/ItalianStemmer.java0000644000175000017500000012314611474320235031312 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class ItalianStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "", -1, 7, "", this), new Among ( "qu", 0, 6, "", this), new Among ( "\u00E1", 0, 1, "", this), new Among ( "\u00E9", 0, 2, "", this), new Among ( "\u00ED", 0, 3, "", this), new Among ( "\u00F3", 0, 4, "", this), new Among ( "\u00FA", 0, 5, "", this) }; private Among a_1[] = { new Among ( "", -1, 3, "", this), new Among ( "I", 0, 1, "", this), new Among ( "U", 0, 2, "", this) }; private Among a_2[] = { new Among ( "la", -1, -1, "", this), new Among ( "cela", 0, -1, "", this), new Among ( "gliela", 0, -1, "", this), new Among ( "mela", 0, -1, "", this), new Among ( "tela", 0, -1, "", this), new Among ( "vela", 0, -1, "", this), new Among ( "le", -1, -1, "", this), new Among ( "cele", 6, -1, "", this), new Among ( "gliele", 6, -1, "", this), new Among ( "mele", 6, -1, "", this), new Among ( "tele", 6, -1, "", this), new Among ( "vele", 6, -1, "", this), new Among ( "ne", -1, -1, "", this), new Among ( "cene", 12, -1, "", this), new Among ( "gliene", 12, -1, "", this), new Among ( "mene", 12, -1, "", this), new Among ( "sene", 12, -1, "", this), new Among ( "tene", 12, -1, "", this), new Among ( "vene", 12, -1, "", this), new Among ( "ci", -1, -1, "", this), new Among ( "li", -1, -1, "", this), new Among ( "celi", 20, -1, "", this), new Among ( "glieli", 20, -1, "", this), new Among ( "meli", 20, -1, "", this), new Among ( "teli", 20, -1, "", this), new Among ( "veli", 20, -1, "", this), new Among ( "gli", 20, -1, "", this), new Among ( "mi", -1, -1, "", this), new Among ( "si", -1, -1, "", this), new Among ( "ti", -1, -1, "", this), new Among ( "vi", -1, -1, "", this), new Among ( "lo", -1, -1, "", this), new Among ( "celo", 31, -1, "", this), new Among ( "glielo", 31, -1, "", this), new Among ( "melo", 31, -1, "", this), new Among ( "telo", 31, -1, "", this), new Among ( "velo", 31, -1, "", this) }; private Among a_3[] = { new Among ( "ando", -1, 1, "", this), new Among ( "endo", -1, 1, "", this), new Among ( "ar", -1, 2, "", this), new Among ( "er", -1, 2, "", this), new Among ( "ir", -1, 2, "", this) }; private Among a_4[] = { new Among ( "ic", -1, -1, "", this), new Among ( "abil", -1, -1, "", this), new Among ( "os", -1, -1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_5[] = { new Among ( "ic", -1, 1, "", this), new Among ( "abil", -1, 1, "", this), new Among ( "iv", -1, 1, "", this) }; private Among a_6[] = { new Among ( "ica", -1, 1, "", this), new Among ( "logia", -1, 3, "", this), new Among ( "osa", -1, 1, "", this), new Among ( "ista", -1, 1, "", this), new Among ( "iva", -1, 9, "", this), new Among ( "anza", -1, 1, "", this), new Among ( "enza", -1, 5, "", this), new Among ( "ice", -1, 1, "", this), new Among ( "atrice", 7, 1, "", this), new Among ( "iche", -1, 1, "", this), new Among ( "logie", -1, 3, "", this), new Among ( "abile", -1, 1, "", this), new Among ( "ibile", -1, 1, "", this), new Among ( "usione", -1, 4, "", this), new Among ( "azione", -1, 2, "", this), new Among ( "uzione", -1, 4, "", this), new Among ( "atore", -1, 2, "", this), new Among ( "ose", -1, 1, "", this), new Among ( "ante", -1, 1, "", this), new Among ( "mente", -1, 1, "", this), new Among ( "amente", 19, 7, "", this), new Among ( "iste", -1, 1, "", this), new Among ( "ive", -1, 9, "", this), new Among ( "anze", -1, 1, "", this), new Among ( "enze", -1, 5, "", this), new Among ( "ici", -1, 1, "", this), new Among ( "atrici", 25, 1, "", this), new Among ( "ichi", -1, 1, "", this), new Among ( 
"abili", -1, 1, "", this), new Among ( "ibili", -1, 1, "", this), new Among ( "ismi", -1, 1, "", this), new Among ( "usioni", -1, 4, "", this), new Among ( "azioni", -1, 2, "", this), new Among ( "uzioni", -1, 4, "", this), new Among ( "atori", -1, 2, "", this), new Among ( "osi", -1, 1, "", this), new Among ( "anti", -1, 1, "", this), new Among ( "amenti", -1, 6, "", this), new Among ( "imenti", -1, 6, "", this), new Among ( "isti", -1, 1, "", this), new Among ( "ivi", -1, 9, "", this), new Among ( "ico", -1, 1, "", this), new Among ( "ismo", -1, 1, "", this), new Among ( "oso", -1, 1, "", this), new Among ( "amento", -1, 6, "", this), new Among ( "imento", -1, 6, "", this), new Among ( "ivo", -1, 9, "", this), new Among ( "it\u00E0", -1, 8, "", this), new Among ( "ist\u00E0", -1, 1, "", this), new Among ( "ist\u00E8", -1, 1, "", this), new Among ( "ist\u00EC", -1, 1, "", this) }; private Among a_7[] = { new Among ( "isca", -1, 1, "", this), new Among ( "enda", -1, 1, "", this), new Among ( "ata", -1, 1, "", this), new Among ( "ita", -1, 1, "", this), new Among ( "uta", -1, 1, "", this), new Among ( "ava", -1, 1, "", this), new Among ( "eva", -1, 1, "", this), new Among ( "iva", -1, 1, "", this), new Among ( "erebbe", -1, 1, "", this), new Among ( "irebbe", -1, 1, "", this), new Among ( "isce", -1, 1, "", this), new Among ( "ende", -1, 1, "", this), new Among ( "are", -1, 1, "", this), new Among ( "ere", -1, 1, "", this), new Among ( "ire", -1, 1, "", this), new Among ( "asse", -1, 1, "", this), new Among ( "ate", -1, 1, "", this), new Among ( "avate", 16, 1, "", this), new Among ( "evate", 16, 1, "", this), new Among ( "ivate", 16, 1, "", this), new Among ( "ete", -1, 1, "", this), new Among ( "erete", 20, 1, "", this), new Among ( "irete", 20, 1, "", this), new Among ( "ite", -1, 1, "", this), new Among ( "ereste", -1, 1, "", this), new Among ( "ireste", -1, 1, "", this), new Among ( "ute", -1, 1, "", this), new Among ( "erai", -1, 1, "", this), new Among ( "irai", -1, 1, "", this), new Among ( "isci", -1, 1, "", this), new Among ( "endi", -1, 1, "", this), new Among ( "erei", -1, 1, "", this), new Among ( "irei", -1, 1, "", this), new Among ( "assi", -1, 1, "", this), new Among ( "ati", -1, 1, "", this), new Among ( "iti", -1, 1, "", this), new Among ( "eresti", -1, 1, "", this), new Among ( "iresti", -1, 1, "", this), new Among ( "uti", -1, 1, "", this), new Among ( "avi", -1, 1, "", this), new Among ( "evi", -1, 1, "", this), new Among ( "ivi", -1, 1, "", this), new Among ( "isco", -1, 1, "", this), new Among ( "ando", -1, 1, "", this), new Among ( "endo", -1, 1, "", this), new Among ( "Yamo", -1, 1, "", this), new Among ( "iamo", -1, 1, "", this), new Among ( "avamo", -1, 1, "", this), new Among ( "evamo", -1, 1, "", this), new Among ( "ivamo", -1, 1, "", this), new Among ( "eremo", -1, 1, "", this), new Among ( "iremo", -1, 1, "", this), new Among ( "assimo", -1, 1, "", this), new Among ( "ammo", -1, 1, "", this), new Among ( "emmo", -1, 1, "", this), new Among ( "eremmo", 54, 1, "", this), new Among ( "iremmo", 54, 1, "", this), new Among ( "immo", -1, 1, "", this), new Among ( "ano", -1, 1, "", this), new Among ( "iscano", 58, 1, "", this), new Among ( "avano", 58, 1, "", this), new Among ( "evano", 58, 1, "", this), new Among ( "ivano", 58, 1, "", this), new Among ( "eranno", -1, 1, "", this), new Among ( "iranno", -1, 1, "", this), new Among ( "ono", -1, 1, "", this), new Among ( "iscono", 65, 1, "", this), new Among ( "arono", 65, 1, "", this), new Among ( "erono", 65, 1, "", 
this), new Among ( "irono", 65, 1, "", this), new Among ( "erebbero", -1, 1, "", this), new Among ( "irebbero", -1, 1, "", this), new Among ( "assero", -1, 1, "", this), new Among ( "essero", -1, 1, "", this), new Among ( "issero", -1, 1, "", this), new Among ( "ato", -1, 1, "", this), new Among ( "ito", -1, 1, "", this), new Among ( "uto", -1, 1, "", this), new Among ( "avo", -1, 1, "", this), new Among ( "evo", -1, 1, "", this), new Among ( "ivo", -1, 1, "", this), new Among ( "ar", -1, 1, "", this), new Among ( "ir", -1, 1, "", this), new Among ( "er\u00E0", -1, 1, "", this), new Among ( "ir\u00E0", -1, 1, "", this), new Among ( "er\u00F2", -1, 1, "", this), new Among ( "ir\u00F2", -1, 1, "", this) }; private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2, 1 }; private static final char g_AEIO[] = {17, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 128, 8, 2 }; private static final char g_CG[] = {17 }; private int I_p2; private int I_p1; private int I_pV; private void copy_from(ItalianStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; super.copy_from(other); } private boolean r_prelude() { int among_var; int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 34 // test, line 35 v_1 = cursor; // repeat, line 35 replab0: while(true) { v_2 = cursor; lab1: do { // (, line 35 // [, line 36 bra = cursor; // substring, line 36 among_var = find_among(a_0, 7); if (among_var == 0) { break lab1; } // ], line 36 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 37 // <-, line 37 slice_from("\u00E0"); break; case 2: // (, line 38 // <-, line 38 slice_from("\u00E8"); break; case 3: // (, line 39 // <-, line 39 slice_from("\u00EC"); break; case 4: // (, line 40 // <-, line 40 slice_from("\u00F2"); break; case 5: // (, line 41 // <-, line 41 slice_from("\u00F9"); break; case 6: // (, line 42 // <-, line 42 slice_from("qU"); break; case 7: // (, line 43 // next, line 43 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_2; break replab0; } cursor = v_1; // repeat, line 46 replab2: while(true) { v_3 = cursor; lab3: do { // goto, line 46 golab4: while(true) { v_4 = cursor; lab5: do { // (, line 46 if (!(in_grouping(g_v, 97, 249))) { break lab5; } // [, line 47 bra = cursor; // or, line 47 lab6: do { v_5 = cursor; lab7: do { // (, line 47 // literal, line 47 if (!(eq_s(1, "u"))) { break lab7; } // ], line 47 ket = cursor; if (!(in_grouping(g_v, 97, 249))) { break lab7; } // <-, line 47 slice_from("U"); break lab6; } while (false); cursor = v_5; // (, line 48 // literal, line 48 if (!(eq_s(1, "i"))) { break lab5; } // ], line 48 ket = cursor; if (!(in_grouping(g_v, 97, 249))) { break lab5; } // <-, line 48 slice_from("I"); } while (false); cursor = v_4; break golab4; } while (false); cursor = v_4; if (cursor >= limit) { break lab3; } cursor++; } continue replab2; } while (false); cursor = v_3; break replab2; } return true; } private boolean r_mark_regions() { int v_1; int v_2; int v_3; int v_6; int v_8; // (, line 52 I_pV = limit; I_p1 = limit; I_p2 = limit; // do, line 58 v_1 = cursor; lab0: do { // (, line 58 // or, line 60 lab1: do { v_2 = cursor; lab2: do { // (, line 59 if (!(in_grouping(g_v, 97, 249))) { break lab2; } // or, line 59 lab3: do { v_3 = cursor; lab4: do { // (, line 59 if (!(out_grouping(g_v, 97, 249))) { break lab4; } // gopast, line 59 golab5: while(true) { lab6: do { if (!(in_grouping(g_v, 97, 249))) { break lab6; } break golab5; } while (false); 
if (cursor >= limit) { break lab4; } cursor++; } break lab3; } while (false); cursor = v_3; // (, line 59 if (!(in_grouping(g_v, 97, 249))) { break lab2; } // gopast, line 59 golab7: while(true) { lab8: do { if (!(out_grouping(g_v, 97, 249))) { break lab8; } break golab7; } while (false); if (cursor >= limit) { break lab2; } cursor++; } } while (false); break lab1; } while (false); cursor = v_2; // (, line 61 if (!(out_grouping(g_v, 97, 249))) { break lab0; } // or, line 61 lab9: do { v_6 = cursor; lab10: do { // (, line 61 if (!(out_grouping(g_v, 97, 249))) { break lab10; } // gopast, line 61 golab11: while(true) { lab12: do { if (!(in_grouping(g_v, 97, 249))) { break lab12; } break golab11; } while (false); if (cursor >= limit) { break lab10; } cursor++; } break lab9; } while (false); cursor = v_6; // (, line 61 if (!(in_grouping(g_v, 97, 249))) { break lab0; } // next, line 61 if (cursor >= limit) { break lab0; } cursor++; } while (false); } while (false); // setmark pV, line 62 I_pV = cursor; } while (false); cursor = v_1; // do, line 64 v_8 = cursor; lab13: do { // (, line 64 // gopast, line 65 golab14: while(true) { lab15: do { if (!(in_grouping(g_v, 97, 249))) { break lab15; } break golab14; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 65 golab16: while(true) { lab17: do { if (!(out_grouping(g_v, 97, 249))) { break lab17; } break golab16; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p1, line 65 I_p1 = cursor; // gopast, line 66 golab18: while(true) { lab19: do { if (!(in_grouping(g_v, 97, 249))) { break lab19; } break golab18; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // gopast, line 66 golab20: while(true) { lab21: do { if (!(out_grouping(g_v, 97, 249))) { break lab21; } break golab20; } while (false); if (cursor >= limit) { break lab13; } cursor++; } // setmark p2, line 66 I_p2 = cursor; } while (false); cursor = v_8; return true; } private boolean r_postlude() { int among_var; int v_1; // repeat, line 70 replab0: while(true) { v_1 = cursor; lab1: do { // (, line 70 // [, line 72 bra = cursor; // substring, line 72 among_var = find_among(a_1, 3); if (among_var == 0) { break lab1; } // ], line 72 ket = cursor; switch(among_var) { case 0: break lab1; case 1: // (, line 73 // <-, line 73 slice_from("i"); break; case 2: // (, line 74 // <-, line 74 slice_from("u"); break; case 3: // (, line 75 // next, line 75 if (cursor >= limit) { break lab1; } cursor++; break; } continue replab0; } while (false); cursor = v_1; break replab0; } return true; } private boolean r_RV() { if (!(I_pV <= cursor)) { return false; } return true; } private boolean r_R1() { if (!(I_p1 <= cursor)) { return false; } return true; } private boolean r_R2() { if (!(I_p2 <= cursor)) { return false; } return true; } private boolean r_attached_pronoun() { int among_var; // (, line 86 // [, line 87 ket = cursor; // substring, line 87 if (find_among_b(a_2, 37) == 0) { return false; } // ], line 87 bra = cursor; // among, line 97 among_var = find_among_b(a_3, 5); if (among_var == 0) { return false; } // (, line 97 // call RV, line 97 if (!r_RV()) { return false; } switch(among_var) { case 0: return false; case 1: // (, line 98 // delete, line 98 slice_del(); break; case 2: // (, line 99 // <-, line 99 slice_from("e"); break; } return true; } private boolean r_standard_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; // (, line 103 // [, line 104 ket = cursor; // substring, line 104 among_var = find_among_b(a_6, 
51); if (among_var == 0) { return false; } // ], line 104 bra = cursor; switch(among_var) { case 0: return false; case 1: // (, line 111 // call R2, line 111 if (!r_R2()) { return false; } // delete, line 111 slice_del(); break; case 2: // (, line 113 // call R2, line 113 if (!r_R2()) { return false; } // delete, line 113 slice_del(); // try, line 114 v_1 = limit - cursor; lab0: do { // (, line 114 // [, line 114 ket = cursor; // literal, line 114 if (!(eq_s_b(2, "ic"))) { cursor = limit - v_1; break lab0; } // ], line 114 bra = cursor; // call R2, line 114 if (!r_R2()) { cursor = limit - v_1; break lab0; } // delete, line 114 slice_del(); } while (false); break; case 3: // (, line 117 // call R2, line 117 if (!r_R2()) { return false; } // <-, line 117 slice_from("log"); break; case 4: // (, line 119 // call R2, line 119 if (!r_R2()) { return false; } // <-, line 119 slice_from("u"); break; case 5: // (, line 121 // call R2, line 121 if (!r_R2()) { return false; } // <-, line 121 slice_from("ente"); break; case 6: // (, line 123 // call RV, line 123 if (!r_RV()) { return false; } // delete, line 123 slice_del(); break; case 7: // (, line 124 // call R1, line 125 if (!r_R1()) { return false; } // delete, line 125 slice_del(); // try, line 126 v_2 = limit - cursor; lab1: do { // (, line 126 // [, line 127 ket = cursor; // substring, line 127 among_var = find_among_b(a_4, 4); if (among_var == 0) { cursor = limit - v_2; break lab1; } // ], line 127 bra = cursor; // call R2, line 127 if (!r_R2()) { cursor = limit - v_2; break lab1; } // delete, line 127 slice_del(); switch(among_var) { case 0: cursor = limit - v_2; break lab1; case 1: // (, line 128 // [, line 128 ket = cursor; // literal, line 128 if (!(eq_s_b(2, "at"))) { cursor = limit - v_2; break lab1; } // ], line 128 bra = cursor; // call R2, line 128 if (!r_R2()) { cursor = limit - v_2; break lab1; } // delete, line 128 slice_del(); break; } } while (false); break; case 8: // (, line 133 // call R2, line 134 if (!r_R2()) { return false; } // delete, line 134 slice_del(); // try, line 135 v_3 = limit - cursor; lab2: do { // (, line 135 // [, line 136 ket = cursor; // substring, line 136 among_var = find_among_b(a_5, 3); if (among_var == 0) { cursor = limit - v_3; break lab2; } // ], line 136 bra = cursor; switch(among_var) { case 0: cursor = limit - v_3; break lab2; case 1: // (, line 137 // call R2, line 137 if (!r_R2()) { cursor = limit - v_3; break lab2; } // delete, line 137 slice_del(); break; } } while (false); break; case 9: // (, line 141 // call R2, line 142 if (!r_R2()) { return false; } // delete, line 142 slice_del(); // try, line 143 v_4 = limit - cursor; lab3: do { // (, line 143 // [, line 143 ket = cursor; // literal, line 143 if (!(eq_s_b(2, "at"))) { cursor = limit - v_4; break lab3; } // ], line 143 bra = cursor; // call R2, line 143 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 143 slice_del(); // [, line 143 ket = cursor; // literal, line 143 if (!(eq_s_b(2, "ic"))) { cursor = limit - v_4; break lab3; } // ], line 143 bra = cursor; // call R2, line 143 if (!r_R2()) { cursor = limit - v_4; break lab3; } // delete, line 143 slice_del(); } while (false); break; } return true; } private boolean r_verb_suffix() { int among_var; int v_1; int v_2; // setlimit, line 148 v_1 = limit - cursor; // tomark, line 148 if (cursor < I_pV) { return false; } cursor = I_pV; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 148 // [, line 149 ket = cursor; // substring, line 149 
among_var = find_among_b(a_7, 87); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 149 bra = cursor; switch(among_var) { case 0: limit_backward = v_2; return false; case 1: // (, line 163 // delete, line 163 slice_del(); break; } limit_backward = v_2; return true; } private boolean r_vowel_suffix() { int v_1; int v_2; // (, line 170 // try, line 171 v_1 = limit - cursor; lab0: do { // (, line 171 // [, line 172 ket = cursor; if (!(in_grouping_b(g_AEIO, 97, 242))) { cursor = limit - v_1; break lab0; } // ], line 172 bra = cursor; // call RV, line 172 if (!r_RV()) { cursor = limit - v_1; break lab0; } // delete, line 172 slice_del(); // [, line 173 ket = cursor; // literal, line 173 if (!(eq_s_b(1, "i"))) { cursor = limit - v_1; break lab0; } // ], line 173 bra = cursor; // call RV, line 173 if (!r_RV()) { cursor = limit - v_1; break lab0; } // delete, line 173 slice_del(); } while (false); // try, line 175 v_2 = limit - cursor; lab1: do { // (, line 175 // [, line 176 ket = cursor; // literal, line 176 if (!(eq_s_b(1, "h"))) { cursor = limit - v_2; break lab1; } // ], line 176 bra = cursor; if (!(in_grouping_b(g_CG, 99, 103))) { cursor = limit - v_2; break lab1; } // call RV, line 176 if (!r_RV()) { cursor = limit - v_2; break lab1; } // delete, line 176 slice_del(); } while (false); return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; int v_6; int v_7; // (, line 181 // do, line 182 v_1 = cursor; lab0: do { // call prelude, line 182 if (!r_prelude()) { break lab0; } } while (false); cursor = v_1; // do, line 183 v_2 = cursor; lab1: do { // call mark_regions, line 183 if (!r_mark_regions()) { break lab1; } } while (false); cursor = v_2; // backwards, line 184 limit_backward = cursor; cursor = limit; // (, line 184 // do, line 185 v_3 = limit - cursor; lab2: do { // call attached_pronoun, line 185 if (!r_attached_pronoun()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 186 v_4 = limit - cursor; lab3: do { // (, line 186 // or, line 186 lab4: do { v_5 = limit - cursor; lab5: do { // call standard_suffix, line 186 if (!r_standard_suffix()) { break lab5; } break lab4; } while (false); cursor = limit - v_5; // call verb_suffix, line 186 if (!r_verb_suffix()) { break lab3; } } while (false); } while (false); cursor = limit - v_4; // do, line 187 v_6 = limit - cursor; lab6: do { // call vowel_suffix, line 187 if (!r_vowel_suffix()) { break lab6; } } while (false); cursor = limit - v_6; cursor = limit_backward; // do, line 189 v_7 = cursor; lab7: do { // call postlude, line 189 if (!r_postlude()) { break lab7; } } while (false); cursor = v_7; return true; } } lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/ext/DanishStemmer.java0000644000175000017500000003065611474320235031142 0ustar janpascaljanpascal// This file was generated automatically by the Snowball to Java compiler package org.tartarus.snowball.ext; import org.tartarus.snowball.SnowballProgram; import org.tartarus.snowball.Among; /** * Generated class implementing code defined by a snowball script. 
*/ public class DanishStemmer extends SnowballProgram { private Among a_0[] = { new Among ( "hed", -1, 1, "", this), new Among ( "ethed", 0, 1, "", this), new Among ( "ered", -1, 1, "", this), new Among ( "e", -1, 1, "", this), new Among ( "erede", 3, 1, "", this), new Among ( "ende", 3, 1, "", this), new Among ( "erende", 5, 1, "", this), new Among ( "ene", 3, 1, "", this), new Among ( "erne", 3, 1, "", this), new Among ( "ere", 3, 1, "", this), new Among ( "en", -1, 1, "", this), new Among ( "heden", 10, 1, "", this), new Among ( "eren", 10, 1, "", this), new Among ( "er", -1, 1, "", this), new Among ( "heder", 13, 1, "", this), new Among ( "erer", 13, 1, "", this), new Among ( "s", -1, 2, "", this), new Among ( "heds", 16, 1, "", this), new Among ( "es", 16, 1, "", this), new Among ( "endes", 18, 1, "", this), new Among ( "erendes", 19, 1, "", this), new Among ( "enes", 18, 1, "", this), new Among ( "ernes", 18, 1, "", this), new Among ( "eres", 18, 1, "", this), new Among ( "ens", 16, 1, "", this), new Among ( "hedens", 24, 1, "", this), new Among ( "erens", 24, 1, "", this), new Among ( "ers", 16, 1, "", this), new Among ( "ets", 16, 1, "", this), new Among ( "erets", 28, 1, "", this), new Among ( "et", -1, 1, "", this), new Among ( "eret", 30, 1, "", this) }; private Among a_1[] = { new Among ( "gd", -1, -1, "", this), new Among ( "dt", -1, -1, "", this), new Among ( "gt", -1, -1, "", this), new Among ( "kt", -1, -1, "", this) }; private Among a_2[] = { new Among ( "ig", -1, 1, "", this), new Among ( "lig", 0, 1, "", this), new Among ( "elig", 1, 1, "", this), new Among ( "els", -1, 1, "", this), new Among ( "l\u00F8st", -1, 2, "", this) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48, 0, 128 }; private static final char g_s_ending[] = {239, 254, 42, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16 }; private int I_x; private int I_p1; private StringBuffer S_ch = new StringBuffer(); private void copy_from(DanishStemmer other) { I_x = other.I_x; I_p1 = other.I_p1; S_ch = other.S_ch; super.copy_from(other); } private boolean r_mark_regions() { int v_1; int v_2; // (, line 29 I_p1 = limit; // test, line 33 v_1 = cursor; // (, line 33 // hop, line 33 { int c = cursor + 3; if (0 > c || c > limit) { return false; } cursor = c; } // setmark x, line 33 I_x = cursor; cursor = v_1; // goto, line 34 golab0: while(true) { v_2 = cursor; lab1: do { if (!(in_grouping(g_v, 97, 248))) { break lab1; } cursor = v_2; break golab0; } while (false); cursor = v_2; if (cursor >= limit) { return false; } cursor++; } // gopast, line 34 golab2: while(true) { lab3: do { if (!(out_grouping(g_v, 97, 248))) { break lab3; } break golab2; } while (false); if (cursor >= limit) { return false; } cursor++; } // setmark p1, line 34 I_p1 = cursor; // try, line 35 lab4: do { // (, line 35 if (!(I_p1 < I_x)) { break lab4; } I_p1 = I_x; } while (false); return true; } private boolean r_main_suffix() { int among_var; int v_1; int v_2; // (, line 40 // setlimit, line 41 v_1 = limit - cursor; // tomark, line 41 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 41 // [, line 41 ket = cursor; // substring, line 41 among_var = find_among_b(a_0, 32); if (among_var == 0) { limit_backward = v_2; return false; } // ], line 41 bra = cursor; limit_backward = v_2; switch(among_var) { case 0: return false; case 1: // (, line 48 // delete, line 48 slice_del(); break; case 2: // (, line 50 if (!(in_grouping_b(g_s_ending, 
97, 229))) { return false; } // delete, line 50 slice_del(); break; } return true; } private boolean r_consonant_pair() { int v_1; int v_2; int v_3; // (, line 54 // test, line 55 v_1 = limit - cursor; // (, line 55 // setlimit, line 56 v_2 = limit - cursor; // tomark, line 56 if (cursor < I_p1) { return false; } cursor = I_p1; v_3 = limit_backward; limit_backward = cursor; cursor = limit - v_2; // (, line 56 // [, line 56 ket = cursor; // substring, line 56 if (find_among_b(a_1, 4) == 0) { limit_backward = v_3; return false; } // ], line 56 bra = cursor; limit_backward = v_3; cursor = limit - v_1; // next, line 62 if (cursor <= limit_backward) { return false; } cursor--; // ], line 62 bra = cursor; // delete, line 62 slice_del(); return true; } private boolean r_other_suffix() { int among_var; int v_1; int v_2; int v_3; int v_4; // (, line 65 // do, line 66 v_1 = limit - cursor; lab0: do { // (, line 66 // [, line 66 ket = cursor; // literal, line 66 if (!(eq_s_b(2, "st"))) { break lab0; } // ], line 66 bra = cursor; // literal, line 66 if (!(eq_s_b(2, "ig"))) { break lab0; } // delete, line 66 slice_del(); } while (false); cursor = limit - v_1; // setlimit, line 67 v_2 = limit - cursor; // tomark, line 67 if (cursor < I_p1) { return false; } cursor = I_p1; v_3 = limit_backward; limit_backward = cursor; cursor = limit - v_2; // (, line 67 // [, line 67 ket = cursor; // substring, line 67 among_var = find_among_b(a_2, 5); if (among_var == 0) { limit_backward = v_3; return false; } // ], line 67 bra = cursor; limit_backward = v_3; switch(among_var) { case 0: return false; case 1: // (, line 70 // delete, line 70 slice_del(); // do, line 70 v_4 = limit - cursor; lab1: do { // call consonant_pair, line 70 if (!r_consonant_pair()) { break lab1; } } while (false); cursor = limit - v_4; break; case 2: // (, line 72 // <-, line 72 slice_from("l\u00F8s"); break; } return true; } private boolean r_undouble() { int v_1; int v_2; // (, line 75 // setlimit, line 76 v_1 = limit - cursor; // tomark, line 76 if (cursor < I_p1) { return false; } cursor = I_p1; v_2 = limit_backward; limit_backward = cursor; cursor = limit - v_1; // (, line 76 // [, line 76 ket = cursor; if (!(out_grouping_b(g_v, 97, 248))) { limit_backward = v_2; return false; } // ], line 76 bra = cursor; // -> ch, line 76 S_ch = slice_to(S_ch); limit_backward = v_2; // name ch, line 77 if (!(eq_v_b(S_ch))) { return false; } // delete, line 78 slice_del(); return true; } public boolean stem() { int v_1; int v_2; int v_3; int v_4; int v_5; // (, line 82 // do, line 84 v_1 = cursor; lab0: do { // call mark_regions, line 84 if (!r_mark_regions()) { break lab0; } } while (false); cursor = v_1; // backwards, line 85 limit_backward = cursor; cursor = limit; // (, line 85 // do, line 86 v_2 = limit - cursor; lab1: do { // call main_suffix, line 86 if (!r_main_suffix()) { break lab1; } } while (false); cursor = limit - v_2; // do, line 87 v_3 = limit - cursor; lab2: do { // call consonant_pair, line 87 if (!r_consonant_pair()) { break lab2; } } while (false); cursor = limit - v_3; // do, line 88 v_4 = limit - cursor; lab3: do { // call other_suffix, line 88 if (!r_other_suffix()) { break lab3; } } while (false); cursor = limit - v_4; // do, line 89 v_5 = limit - cursor; lab4: do { // call undouble, line 89 if (!r_undouble()) { break lab4; } } while (false); cursor = limit - v_5; cursor = limit_backward; return true; } } 
lucene-2.9.4/contrib/snowball/src/java/org/tartarus/snowball/SnowballProgram.java0000644000175000017500000002465111474320235030706 0ustar janpascaljanpascalpackage org.tartarus.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.lang.reflect.InvocationTargetException; /** * This is the rev 500 of the Snowball SVN trunk, * but modified: * made abstract and introduced abstract method stem * to avoid expensive */ public abstract class SnowballProgram { protected SnowballProgram() { current = new StringBuffer(); setCurrent(""); } public abstract boolean stem(); /** * Set the current string. */ public void setCurrent(String value) { current.replace(0, current.length(), value); cursor = 0; limit = current.length(); limit_backward = 0; bra = cursor; ket = limit; } /** * Get the current string. */ public String getCurrent() { String result = current.toString(); // Make a new StringBuffer. If we reuse the old one, and a user of // the library keeps a reference to the buffer returned (for example, // by converting it to a String in a way which doesn't force a copy), // the buffer size will not decrease, and we will risk wasting a large // amount of memory. // Thanks to Wolfram Esser for spotting this problem. 
current = new StringBuffer(); return result; } // current string protected StringBuffer current; protected int cursor; protected int limit; protected int limit_backward; protected int bra; protected int ket; protected void copy_from(SnowballProgram other) { current = other.current; cursor = other.cursor; limit = other.limit; limit_backward = other.limit_backward; bra = other.bra; ket = other.ket; } protected boolean in_grouping(char [] s, int min, int max) { if (cursor >= limit) return false; char ch = current.charAt(cursor); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor++; return true; } protected boolean in_grouping_b(char [] s, int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if (ch > max || ch < min) return false; ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; cursor--; return true; } protected boolean out_grouping(char [] s, int min, int max) { if (cursor >= limit) return false; char ch = current.charAt(cursor); if (ch > max || ch < min) { cursor++; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor ++; return true; } return false; } protected boolean out_grouping_b(char [] s, int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if (ch > max || ch < min) { cursor--; return true; } ch -= min; if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { cursor--; return true; } return false; } protected boolean in_range(int min, int max) { if (cursor >= limit) return false; char ch = current.charAt(cursor); if (ch > max || ch < min) return false; cursor++; return true; } protected boolean in_range_b(int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if (ch > max || ch < min) return false; cursor--; return true; } protected boolean out_range(int min, int max) { if (cursor >= limit) return false; char ch = current.charAt(cursor); if (!(ch > max || ch < min)) return false; cursor++; return true; } protected boolean out_range_b(int min, int max) { if (cursor <= limit_backward) return false; char ch = current.charAt(cursor - 1); if(!(ch > max || ch < min)) return false; cursor--; return true; } protected boolean eq_s(int s_size, String s) { if (limit - cursor < s_size) return false; int i; for (i = 0; i != s_size; i++) { if (current.charAt(cursor + i) != s.charAt(i)) return false; } cursor += s_size; return true; } protected boolean eq_s_b(int s_size, String s) { if (cursor - limit_backward < s_size) return false; int i; for (i = 0; i != s_size; i++) { if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false; } cursor -= s_size; return true; } protected boolean eq_v(StringBuffer s) { return eq_s(s.length(), s.toString()); } protected boolean eq_v_b(StringBuffer s) { return eq_s_b(s.length(), s.toString()); } protected int find_among(Among v[], int v_size) { int i = 0; int j = v_size; int c = cursor; int l = limit; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while(true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? 
common_i : common_j; // smaller Among w = v[k]; int i2; for (i2 = common; i2 < w.s_size; i2++) { if (c + common == l) { diff = -1; break; } diff = current.charAt(c + common) - w.s.charAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; // v->s has been inspected if (j == i) break; // only one item in v // - but now we need to go round once more to get // v->s inspected. This looks messy, but is actually // the optimal approach. if (first_key_inspected) break; first_key_inspected = true; } } while(true) { Among w = v[i]; if (common_i >= w.s_size) { cursor = c + w.s_size; if (w.method == null) return w.result; boolean res; try { Object resobj = w.method.invoke(w.methodobject, new Object[0]); res = resobj.toString().equals("true"); } catch (InvocationTargetException e) { res = false; // FIXME - debug message } catch (IllegalAccessException e) { res = false; // FIXME - debug message } cursor = c + w.s_size; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } // find_among_b is for backwards processing. Same comments apply protected int find_among_b(Among v[], int v_size) { int i = 0; int j = v_size; int c = cursor; int lb = limit_backward; int common_i = 0; int common_j = 0; boolean first_key_inspected = false; while(true) { int k = i + ((j - i) >> 1); int diff = 0; int common = common_i < common_j ? common_i : common_j; Among w = v[k]; int i2; for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { if (c - common == lb) { diff = -1; break; } diff = current.charAt(c - 1 - common) - w.s.charAt(i2); if (diff != 0) break; common++; } if (diff < 0) { j = k; common_j = common; } else { i = k; common_i = common; } if (j - i <= 1) { if (i > 0) break; if (j == i) break; if (first_key_inspected) break; first_key_inspected = true; } } while(true) { Among w = v[i]; if (common_i >= w.s_size) { cursor = c - w.s_size; if (w.method == null) return w.result; boolean res; try { Object resobj = w.method.invoke(w.methodobject, new Object[0]); res = resobj.toString().equals("true"); } catch (InvocationTargetException e) { res = false; // FIXME - debug message } catch (IllegalAccessException e) { res = false; // FIXME - debug message } cursor = c - w.s_size; if (res) return w.result; } i = w.substring_i; if (i < 0) return 0; } } /* to replace chars between c_bra and c_ket in current by the * chars in s. */ protected int replace_s(int c_bra, int c_ket, String s) { int adjustment = s.length() - (c_ket - c_bra); current.replace(c_bra, c_ket, s); limit += adjustment; if (cursor >= c_ket) cursor += adjustment; else if (cursor > c_bra) cursor = c_bra; return adjustment; } protected void slice_check() { if (bra < 0 || bra > ket || ket > limit || limit > current.length()) // this line could be removed { System.err.println("faulty slice operation"); // FIXME: report error somehow. 
/* fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); exit(1); */ } } protected void slice_from(String s) { slice_check(); replace_s(bra, ket, s); } protected void slice_from(StringBuffer s) { slice_from(s.toString()); } protected void slice_del() { slice_from(""); } protected void insert(int c_bra, int c_ket, String s) { int adjustment = replace_s(c_bra, c_ket, s); if (c_bra <= bra) bra += adjustment; if (c_bra <= ket) ket += adjustment; } protected void insert(int c_bra, int c_ket, StringBuffer s) { insert(c_bra, c_ket, s.toString()); } /* Copy the slice into the supplied StringBuffer */ protected StringBuffer slice_to(StringBuffer s) { slice_check(); int len = ket - bra; s.replace(0, s.length(), current.substring(bra, ket)); return s; } protected StringBuffer assign_to(StringBuffer s) { s.replace(0, s.length(), current.substring(0, limit)); return s; } /* extern void debug(struct SN_env * z, int number, int line_count) { int i; int limit = SIZE(z->p); //if (number >= 0) printf("%3d (line %4d): '", number, line_count); if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } */ }; lucene-2.9.4/contrib/snowball/src/java/overview.html0000644000175000017500000000272411474320235023206 0ustar janpascaljanpascal

Lucene Snowball README file

 This project provides pre-compiled versions of the Snowball stemmers based on revision 500 of the Tartarus Snowball repository, together with classes integrating them with the Lucene search engine.

See the Snowball home page for more information about the algorithms.
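 The snippet below is an editorial sketch, not part of the original distribution: it shows roughly how the bundled SnowballAnalyzer can be used, modeled on the bundled TestSnowball examples. The class name, the "contents" field name and the sample text are illustrative only.

   import java.io.StringReader;
   import org.apache.lucene.analysis.Analyzer;
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
   import org.apache.lucene.analysis.tokenattributes.TermAttribute;

   public class SnowballAnalyzerExample {
       public static void main(String[] args) throws Exception {
           // Analyzer backed by the generated English Snowball stemmer.
           Analyzer analyzer = new SnowballAnalyzer("English");

           // Tokenize and stem some sample text; "contents" is just an illustrative field name.
           TokenStream ts = analyzer.tokenStream("contents", new StringReader("he abhorred accents"));
           TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
           while (ts.incrementToken()) {
               System.out.println(term.term()); // expected output: he, abhor, accent
           }
       }
   }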

IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY!

An index created using the Snowball module in Lucene 2.3.2 and below might not be compatible with the Snowball module in Lucene 2.4 or greater.

For more information about this issue see: https://issues.apache.org/jira/browse/LUCENE-1142

lucene-2.9.4/contrib/snowball/src/test/0000755000175000017500000000000011474320235020503 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/0000755000175000017500000000000011474320235021272 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/apache/0000755000175000017500000000000011474320235022513 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/apache/lucene/0000755000175000017500000000000011474320235023766 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/apache/lucene/analysis/0000755000175000017500000000000011474320235025611 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/0000755000175000017500000000000011554106561027435 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/src/test/org/apache/lucene/analysis/snowball/TestSnowball.java0000644000175000017500000001231011474320235032713 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.snowball; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.index.Payload; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.Version; public class TestSnowball extends BaseTokenStreamTestCase { public void testEnglish() throws Exception { Analyzer a = new SnowballAnalyzer("English"); assertAnalyzesTo(a, "he abhorred accents", new String[]{"he", "abhor", "accent"}); } public void testStopwords() throws Exception { Analyzer a = new SnowballAnalyzer(Version.LUCENE_29, "English", StandardAnalyzer.STOP_WORDS_SET); assertAnalyzesTo(a, "the quick brown fox jumped", new String[]{"quick", "brown", "fox", "jump"}); } public void testReusableTokenStream() throws Exception { Analyzer a = new SnowballAnalyzer("English"); assertAnalyzesToReuse(a, "he abhorred accents", new String[]{"he", "abhor", "accent"}); assertAnalyzesToReuse(a, "she abhorred him", new String[]{"she", "abhor", "him"}); } /** * subclass that acts just like whitespace analyzer for testing */ private class SnowballSubclassAnalyzer extends SnowballAnalyzer { public SnowballSubclassAnalyzer(String name) { super(name); } public TokenStream 
tokenStream(String fieldName, Reader reader) { return new WhitespaceTokenizer(reader); } } public void testLUCENE1678BWComp() throws Exception { Analyzer a = new SnowballSubclassAnalyzer("English"); assertAnalyzesToReuse(a, "he abhorred accents", new String[]{"he", "abhorred", "accents"}); } public void testFilterTokens() throws Exception { SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English"); TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class); TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class); PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class); FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class); filter.incrementToken(); assertEquals("accent", termAtt.term()); assertEquals(2, offsetAtt.startOffset()); assertEquals(7, offsetAtt.endOffset()); assertEquals("wrd", typeAtt.type()); assertEquals(3, posIncAtt.getPositionIncrement()); assertEquals(77, flagsAtt.getFlags()); assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload()); } private final class TestTokenStream extends TokenStream { private TermAttribute termAtt; private OffsetAttribute offsetAtt; private TypeAttribute typeAtt; private PayloadAttribute payloadAtt; private PositionIncrementAttribute posIncAtt; private FlagsAttribute flagsAtt; TestTokenStream() { super(); termAtt = (TermAttribute) addAttribute(TermAttribute.class); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class); posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class); } public boolean incrementToken() { clearAttributes(); termAtt.setTermBuffer("accents"); offsetAtt.setOffset(2, 7); typeAtt.setType("wrd"); posIncAtt.setPositionIncrement(3); payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3})); flagsAtt.setFlags(77); return true; } } }lucene-2.9.4/contrib/snowball/xdocs/0000755000175000017500000000000011554106561020060 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/xdocs/stylesheets/0000755000175000017500000000000011554106561022434 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/xdocs/stylesheets/project.xml0000644000175000017500000000227011474320235024622 0ustar janpascaljanpascal Snowball Stemmers for Lucene lucene-2.9.4/contrib/snowball/xdocs/index.xml0000644000175000017500000000053511474320235021711 0ustar janpascaljanpascal Overview - Snowball Stemmers for Lucene

This project provides a pre-compiled version of the Snowball stemmers, together with classes integrating them with the Lucene search engine.
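
As a rough sketch (the field name and sample text below are invented for illustration, and imports are omitted), the bundled SnowballAnalyzer is used like any other Lucene analyzer:

    Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_29, "English",
                                             StandardAnalyzer.STOP_WORDS_SET);
    TokenStream stream = analyzer.tokenStream("contents",
                                              new StringReader("he abhorred accents"));
    // the stream yields the stemmed terms "he", "abhor", "accent"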

lucene-2.9.4/contrib/snowball/pom.xml.template0000644000175000017500000000265011474320235022067 0ustar janpascaljanpascal 4.0.0 org.apache.lucene lucene-contrib @version@ org.apache.lucene lucene-snowball Lucene Snowball @version@ Snowball Analyzers jar lucene-2.9.4/contrib/snowball/README.txt0000644000175000017500000000111211474320235020426 0ustar janpascaljanpascalLucene Snowball README file This project provides pre-compiled version of the Snowball stemmers based on revision 500 of the Tartarus Snowball repository, together with classes integrating them with the Lucene search engine. IMPORTANT NOTICE ON BACKWARDS COMPATIBILITY! An index created using the Snowball module in Lucene 2.3.2 and below might not be compatible with the Snowball module in Lucene 2.4 or greater. For more information about this issue see: https://issues.apache.org/jira/browse/LUCENE-1142 For more information on Snowball, see: http://snowball.tartarus.org/ lucene-2.9.4/contrib/snowball/bin/0000755000175000017500000000000011554106561017510 5ustar janpascaljanpascallucene-2.9.4/contrib/snowball/bin/snowball.sh0000644000175000017500000000021311474320235021656 0ustar janpascaljanpascal#!/bin/csh -f set infile = $1 set outdir = $2 set name = $infile:h:t:uStemmer exec $0:h/snowball $infile -o $outdir/$name -n $name -java lucene-2.9.4/lucene-demos-pom.xml.template0000644000175000017500000000316511474320267021173 0ustar janpascaljanpascal 4.0.0 org.apache.lucene lucene-parent @version@ org.apache.lucene lucene-demos Lucene Demos @version@ This is the demo for Apache Lucene Java jar org.apache.lucene lucene-core @version@ lucene-2.9.4/docs/0000755000175000017500000000000011554106561014407 5ustar janpascaljanpascallucene-2.9.4/docs/demo4.pdf0000644000175000017500000003412711474320234016115 0ustar janpascaljanpascal%PDF-1.3 %ª«¬­ 4 0 obj << /Type /Info /Producer (FOP 0.20.5) >> endobj 5 0 obj << /Length 755 /Filter [ /ASCII85Decode /FlateDecode ] >> stream Gb!$E9on!^&;KZL(%)M@C]'T];['NJWi`:9,OM2sOA-]])e/(XD#*gb6Q-mN$7g75_gr\bVIQ.jGG&B:^TM++YE+/C'\!1.4)TgL]Yq.A%<+S98#8eJBLQS/]Djgds0HRZt:LJ.rh2\'jags5FX3@$T>UGH^RdfH81CYiIilruGeDt\_4[H6uK_i&?`V`"rm-$p&ik-Q5k)J@n(ZVm*19#/t8+[:2IFrrj^CRjKU%>$2VqQFu,7f+53SUon+Q5QlO?0#)R4cBJY8`'WnJJa80[)Nc>Rgf[b$8iYC;%b>;q3E*a2T>0\],$EP3oW0H]Vt7*%@q"+mU=dY?S&BRc_/#g>&OdMc-LE8>&9K"V]0?('~> endstream endobj 6 0 obj << /Type /Page /Parent 1 0 R /MediaBox [ 0 0 612 792 ] /Resources 3 0 R /Contents 5 0 R /Annots 7 0 R >> endobj 7 0 obj [ 8 0 R 10 0 R 12 0 R 14 0 R 16 0 R 18 0 R 20 0 R 22 0 R ] endobj 8 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 519.166 185.996 507.166 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 9 0 R /H /I >> endobj 10 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 500.966 330.608 488.966 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 11 0 R /H /I >> endobj 12 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 482.766 266.636 470.766 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 13 0 R /H /I >> endobj 14 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 464.566 271.952 452.566 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 15 0 R /H /I >> endobj 16 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 446.366 221.312 434.366 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 17 0 R /H /I >> endobj 18 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 428.166 237.956 416.166 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 19 0 R /H /I >> endobj 20 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 409.966 283.592 397.966 ] /C [ 0 0 0 ] /Border [ 0 0 0 ] /A 21 0 R /H /I >> endobj 22 0 obj << /Type /Annot /Subtype /Link /Rect [ 102.0 391.766 
[remainder of binary PDF data for lucene-2.9.4/docs/demo4.pdf omitted]
lucene-2.9.4/docs/demo.html
 

Apache Lucene - Building and Installing the Basic Demo

About this Document

This document is intended as a "getting started" guide to using and running the Lucene demos. It walks you through some basic installation and configuration.

About the Demos

The Lucene command-line demo code consists of two applications that demonstrate various functionalities of Lucene and how one should go about adding Lucene to their applications.

Setting your CLASSPATH

First, you should download the latest Lucene distribution and then extract it to a working directory. Alternatively, you can check out the sources from Subversion, and then run ant war-demo to generate the JARs and WARs.

You should see the Lucene JAR file in the directory you created when you extracted the archive. It should be named something like lucene-core-{version}.jar. You should also see a file called lucene-demos-{version}.jar. If you checked out the sources from Subversion then the JARs are located under the build subdirectory (after running ant successfully). Put both of these files in your Java CLASSPATH.

Indexing Files

Once you've gotten this far you're probably itching to go. Let's build an index! Assuming you've set your CLASSPATH correctly, just type:

    java org.apache.lucene.demo.IndexFiles {full-path-to-lucene}/src
This will produce a subdirectory called index which will contain an index of all of the Lucene source code.

To search the index type:

    java org.apache.lucene.demo.SearchFiles
You'll be prompted for a query. Type in a swear word and press the enter key. You'll see that the Lucene developers are very well mannered and get no results. Now try entering the word "string". That should return a whole bunch of documents. The results will page at every tenth result and ask you whether you want more results.

About the code...

 
lucene-2.9.4/docs/demo4.html0000644000175000017500000004363311474320234016312 0ustar janpascaljanpascal Apache Lucene - Basic Demo Sources Walkthrough
 

Apache Lucene - Basic Demo Sources Walkthrough

About the Code

In this section we walk through the sources behind the basic Lucene Web Application demo: where to find them, their parts and their function. This section is intended for Java developers wishing to understand how to use Lucene in their applications or for those involved in deploying web applications based on Lucene.

Location of the source (developers/deployers)

Relative to the directory created when you extracted Lucene or retrieved it from Subversion, you should see a directory called src which in turn contains a directory called jsp. This is the root for all of the Lucene web demo.

Within this directory you should see index.jsp. Bring this up in vi or your editor of choice.

index.jsp (developers/deployers)

This jsp page is pretty boring by itself. All it does is include a header, display a form and include a footer. If you look at the form, it has two fields: query (where you enter your search criteria) and maxresults where you specify the number of results per page. By the structure of this JSP it should be easy to customize it without even editing this particular file. You could simply change the header and footer. Let's look at the header.jsp (located in the same directory) next.

header.jsp (developers/deployers)

The header is also very simple by itself. The only thing it does is include the configuration.jsp (which you looked at in the last section of this guide) and set the title and a brief header. This would be a good place to put your own custom HTML to "pretty" things up a bit. We won't cover the footer because all it does is display the footer and close your tags. Let's look at the results.jsp, the meat of this application, next.

results.jsp (developers)

Most of the functionality lies in results.jsp. Much of it is for paging the search results, which we'll not cover here as it's commented well enough. The first thing in this page is the actual imports for the Lucene classes and Lucene demo classes. These classes are loaded from the jars included in the WEB-INF/lib directory in the luceneweb.war file.

You'll notice that this file includes the same header and footer as index.jsp. From there it constructs an IndexSearcher with the indexLocation that was specified in configuration.jsp. If there is an error of any kind in opening the index, it is displayed to the user and the boolean flag error is set to tell the rest of the sections of the jsp not to continue.
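
A minimal sketch of that step (variable names follow the description above, but this is not the demo's exact source):

    IndexSearcher searcher = null;
    boolean error = false;
    try {
        // open the index read-only at the path taken from configuration.jsp
        searcher = new IndexSearcher(FSDirectory.open(new File(indexLocation)), true);
    } catch (IOException e) {
        error = true;   // later sections of the JSP check this flag and skip searching
    }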

From there, this jsp attempts to get the search criteria, the start index (used for paging) and the maximum number of results per page. If the maximum results per page is not set or not valid, then it and the start index are set to default values. If only the start index is invalid, it is set to a default value. If no search criteria are provided, a servlet error is thrown (this is assumed to be the result of URL tampering or some form of browser malfunction).

The jsp moves on to construct a StandardAnalyzer to analyze the search text. This matches the analyzer used during indexing (IndexHTML), which is generally recommended. This is passed to the QueryParser along with the criteria to construct a Query object. You'll also notice the string literal "contents" included. This specifies that the search should cover the contents field and not the title, url or some other field in the indexed documents. If there is any error in constructing a Query object an error is displayed to the user.
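
Roughly, that construction looks like this in the 2.9 API (criteria stands for the text the user typed; imports omitted):

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
    QueryParser parser = new QueryParser(Version.LUCENE_29, "contents", analyzer);
    Query query = null;
    try {
        query = parser.parse(criteria);
    } catch (ParseException e) {
        error = true;   // report the malformed query to the user
    }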

In the next section of the jsp the IndexSearcher is asked to search given the query object. The results are returned in a collection called hits. If the length property of the hits collection is 0 (meaning there were no results) then an error is displayed to the user and the error flag is set.

Finally the jsp iterates through the hits collection, taking the current page into account, and displays properties of the Document objects we talked about in the first walkthrough. These objects contain "known" fields specific to their indexer (in this case IndexHTML constructs a document with "url", "title" and "contents").
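
In outline, the search-and-display loop amounts to the following. The demo JSP itself still uses the older Hits class; this sketch shows the equivalent with the TopDocs API that 2.9 recommends, with maxResults standing in for the per-page limit:

    TopDocs results = searcher.search(query, maxResults);
    if (results.totalHits == 0) {
        // no matches: display an error message and set the error flag
    }
    for (ScoreDoc hit : results.scoreDocs) {
        Document doc = searcher.doc(hit.doc);   // the stored fields for this hit
        String title = doc.get("title");        // fields written by IndexHTML
        String url = doc.get("url");
        // ... render one row of the results page ...
    }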

Please note that in a real deployment of Lucene, it's best to instantiate IndexSearcher and QueryParser once, and then share them across search requests, instead of re-instantiating per search request.
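
One simple way to do that (a sketch only, not something the demo ships with) is to hold the searcher in a lazily initialized static field; an IndexSearcher is safe to share across concurrent searches:

    private static IndexSearcher sharedSearcher;

    static synchronized IndexSearcher getSearcher(String indexLocation) throws IOException {
        if (sharedSearcher == null) {
            sharedSearcher = new IndexSearcher(FSDirectory.open(new File(indexLocation)), true);
        }
        return sharedSearcher;
    }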

More sources (developers)

There are additional sources used by the web app that were not specifically covered by either walkthrough. For example the HTML parser, the IndexHTML class and HTMLDocument class. These are very similar to the classes covered in the first example, with properties specific to parsing and indexing HTML. This is beyond our scope; however, by now you should feel like you're "getting started" with Lucene.

Where to go from here? (everyone!)

There are a number of things this demo doesn't do, or doesn't do quite right. For instance, you may have noticed that documents in the root context are unreachable (unless you reconfigure Tomcat to support that context or redirect to it); anywhere the directory doesn't quite match the context mapping, you'll have a broken link in your results. Indexing non-local files, or other needs along those lines, isn't supported, and there may be security issues with running the indexing application from your webapps directory. There are a number of things left for you, the developer, to do.

In time some of these things may be added to Lucene as features (if you've got a good idea we'd love to hear it!), but for now: this is where you begin and the search engine/indexer ends. Lastly, one would assume you'd want to follow the above advice and customize the application to look a little more fancy than black on white with "Lucene Template" at the top. We'll see you on the Lucene Users' or Developers' mailing lists!

When to contact the Author

Please resist the urge to contact the authors of this document (without bribes of fame and fortune attached). First contact the mailing lists, taking care to Ask Questions The Smart Way. Certainly you'll get the most help that way as well. That being said, feedback, and modifications to this document and samples are ever so greatly appreciated. They are just best sent to the lists or posted as patches, so that everyone can share in them. Thanks for understanding!

 
lucene-2.9.4/docs/contributions.pdf [binary PDF data omitted]
lucene-2.9.4/docs/demo2.html
 

Apache Lucene - Basic Demo Sources Walk-through

About the Code

In this section we walk through the sources behind the command-line Lucene demo: where to find them, their parts and their function. This section is intended for Java developers wishing to understand how to use Lucene in their applications.

Location of the source

Relative to the directory created when you extracted Lucene or retrieved it from Subversion, you should see a directory called src which in turn contains a directory called demo. This is the root for all of the Lucene demos. Under this directory is org/apache/lucene/demo. This is where all the Java sources for the demos live.

Within this directory you should see the IndexFiles.java class we executed earlier. Bring it up in vi or your editor of choice and let's take a look at it.

IndexFiles

As we discussed in the previous walk-through, the IndexFiles class creates a Lucene Index. Let's take a look at how it does this.

The first substantial thing the main function does is instantiate IndexWriter. It passes the string "index" and a new instance of a class called StandardAnalyzer. The "index" string is the name of the filesystem directory where all index information should be stored. Because we're not passing a full path, this will be created as a subdirectory of the current working directory (if it does not already exist). On some platforms, it may be created in other directories (such as the user's home directory).

The IndexWriter is the main class responsible for creating indices. To use it you must instantiate it with a path that it can write the index into. If this path does not exist it will first create it. Otherwise it will refresh the index at that path. You can also create an index using one of the subclasses of Directory. In any case, you must also pass an instance of org.apache.lucene.analysis.Analyzer.
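
In the 2.9 API that instantiation looks roughly like this (a sketch; the demo's own source differs slightly):

    IndexWriter writer = new IndexWriter(
        FSDirectory.open(new File("index")),        // where the index is written
        new StandardAnalyzer(Version.LUCENE_29),    // how text is tokenized
        true,                                       // create: wipe any existing index
        IndexWriter.MaxFieldLength.UNLIMITED);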

The particular Analyzer we are using, StandardAnalyzer, is little more than a standard Java Tokenizer, converting all strings to lowercase and filtering out stop words and characters from the index. By stop words and characters I mean common language words such as articles (a, an, the, etc.) and other strings that may have less value for searching (e.g. 's). It should be noted that there are different rules for every language, and you should use the proper analyzer for each. Lucene currently provides Analyzers for a number of different languages (see the *Analyzer.java sources under contrib/analyzers/src/java/org/apache/lucene/analysis).

Looking further down in the file, you should see the indexDocs() code. This recursive function simply crawls the directories and uses FileDocument to create Document objects. The Document is simply a data object to represent the content in the file as well as its creation time and location. These instances are added to the indexWriter. Take a look inside FileDocument. It's not particularly complicated. It just adds fields to the Document.
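
Stripped of its error handling, the recursion amounts to this sketch (not the demo's exact source):

    static void indexDocs(IndexWriter writer, File file) throws IOException {
        if (file.isDirectory()) {
            String[] children = file.list();
            if (children != null) {
                for (int i = 0; i < children.length; i++) {
                    indexDocs(writer, new File(file, children[i]));  // recurse into subdirectories
                }
            }
        } else {
            // FileDocument builds a Document with path, modification-date and contents fields
            writer.addDocument(FileDocument.Document(file));
        }
    }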

As you can see there isn't much to creating an index. The devil is in the details. You may also wish to examine the other samples in this directory, particularly the IndexHTML class. It is a bit more complex but builds upon this example.

Searching Files

The SearchFiles class is quite simple. It primarily collaborates with an IndexSearcher, StandardAnalyzer (which is used in the IndexFiles class as well) and a QueryParser. The query parser is constructed with an analyzer used to interpret your query text in the same way the documents are interpreted: finding the end of words and removing useless words like 'a', 'an' and 'the'. The Query object contains the results from the QueryParser which is passed to the searcher. Note that it's also possible to programmatically construct a rich Query object without using the query parser. The query parser just enables decoding the Lucene query syntax into the corresponding Query object. Search can be executed in two different ways:

  • Streaming: A Collector subclass simply prints out the document ID and score for each matching document.
  • Paging: Using a TopScoreDocCollector the search results are printed in pages, sorted by score (i.e. relevance), as sketched below.
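
A condensed sketch of both modes, assuming the searcher and query objects from above (error handling and imports omitted):

    // Paging: gather the top hits, sorted by score
    TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
    searcher.search(query, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // Streaming: a Collector that simply prints each match as it is found
    searcher.search(query, new Collector() {
        private int docBase;
        private Scorer scorer;
        public void setScorer(Scorer scorer) { this.scorer = scorer; }
        public void setNextReader(IndexReader reader, int base) { this.docBase = base; }
        public boolean acceptsDocsOutOfOrder() { return true; }
        public void collect(int doc) throws IOException {
            System.out.println("doc=" + (docBase + doc) + " score=" + scorer.score());
        }
    });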

The Web example...

 
lucene-2.9.4/docs/index.pdf [binary PDF data omitted]
lucene-2.9.4/docs/images/lia_3d.jpg, instruction_arrow.png, favicon.ico [binary image data omitted]
lucene-2.9.4/docs/gettingstarted.html
 

Apache Lucene - Getting Started Guide

Getting Started

This document is intended as a "getting started" guide. It has three audiences: first-time users looking to install Apache Lucene in their application or web server; developers looking to modify or base the applications they develop on Lucene; and developers looking to become involved in and contribute to the development of Lucene. This document is written in tutorial and walk-through format. The goal is to help you "get started". It does not go into great depth on some of the conceptual or inner details of Lucene.

Each section listed below builds on one another. More advanced users may wish to skip sections.

 
lucene-2.9.4/docs/index.html0000644000175000017500000002147511474320234016411 0ustar janpascaljanpascal Lucene Java Documentation
 

Lucene Java Documentation

This is the official documentation for Lucene Java 2.9.4
Please use the menu on the left to access the Javadocs and different documents.

Additional documentation is available in the Wiki.

 
lucene-2.9.4/docs/fileformats.html0000644000175000017500000025701611474320234017617 0ustar janpascaljanpascal Apache Lucene - Index File Formats
 

Apache Lucene - Index File Formats

Index File Formats

This document defines the index file formats used in Lucene version 2.9. If you are using a different version of Lucene, please consult the copy of docs/fileformats.html that was distributed with the version you are using.

Apache Lucene is written in Java, but several efforts are underway to write versions of Lucene in other programming languages. If these versions are to remain compatible with Apache Lucene, then a language-independent definition of the Lucene index format is required. This document thus attempts to provide a complete and independent definition of the Apache Lucene 2.9 file formats.

As Lucene evolves, this document should evolve. Versions of Lucene in different programming languages should endeavor to agree on file formats, and generate new versions of this document.

Compatibility notes are provided in this document, describing how file formats have changed from prior versions.

In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching or adding/deleting of docs. When the new segments file is saved (committed), it will be written in the new file format (meaning no specific "upgrade" process is needed). But note that once a commit has occurred, pre-2.1 Lucene will not be able to read the index.

In version 2.3, the file format was changed to allow segments to share a single set of doc store (vectors & stored fields) files. This allows for faster indexing in certain cases. The change is fully backwards compatible (in the same way as the lock-less commits change in 2.1).

Definitions

The fundamental concepts in Lucene are index, document, field and term.

An index contains a sequence of documents.

  • A document is a sequence of fields.

  • A field is a named sequence of terms.

  • A term is a string.

The same string in two different fields is considered a different term. Thus terms are represented as a pair of strings, the first naming the field, and the second naming text within the field.

Inverted Indexing

The index stores statistics about terms in order to make term-based search more efficient. Lucene's index falls into the family of indexes known as an inverted index. This is because it can list, for a term, the documents that contain it. This is the inverse of the natural relationship, in which documents list terms.

Types of Fields

In Lucene, fields may be stored, in which case their text is stored in the index literally, in a non-inverted manner. Fields that are inverted are called indexed. A field may be both stored and indexed.

The text of a field may be tokenized into terms to be indexed, or the text of a field may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is useful for certain identifier fields to be indexed literally.

See the Field java docs for more information on Fields.
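
As a rough illustration (the field names and values are invented for the example), here is how stored, tokenized-and-indexed, and literally indexed fields are declared in the 2.9 API:

    Document doc = new Document();
    // tokenized into terms and indexed; also stored so it can be shown in results
    doc.add(new Field("contents", "the quick brown fox", Field.Store.YES, Field.Index.ANALYZED));
    // indexed literally as a single term, useful for identifiers
    doc.add(new Field("id", "doc-42", Field.Store.NO, Field.Index.NOT_ANALYZED));
    // stored only: retrievable from a hit, but not searchable
    doc.add(new Field("path", "/tmp/fox.txt", Field.Store.YES, Field.Index.NO));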

Segments

Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a fully independent index, which could be searched separately. Indexes evolve by:

  1. Creating new segments for newly added documents.

  2. Merging existing segments.

Searches may involve multiple segments and/or multiple indexes, each index potentially composed of a set of segments.

Document Numbers

Internally, Lucene refers to documents by an integer document number. The first document added to an index is numbered zero, and each subsequent document added gets a number one greater than the previous.


Note that a document's number may change, so caution should be taken when storing these numbers outside of Lucene. In particular, numbers may change in the following situations:

  • The numbers stored in each segment are unique only within the segment, and must be converted before they can be used in a larger context. The standard technique is to allocate each segment a range of values, based on the range of numbers used in that segment. To convert a document number from a segment to an external value, the segment's base document number is added. To convert an external value back to a segment-specific value, the segment is identified by the range that the external value is in, and the segment's base value is subtracted. For example two five document segments might be combined, so that the first segment has a base value of zero, and the second of five. Document three from the second segment would have an external value of eight.

  • When documents are deleted, gaps are created in the numbering. These are eventually removed as the index evolves through merging. Deleted documents are dropped when segments are merged. A freshly-merged segment thus has no gaps in its numbering.

Overview

Each segment index maintains the following:

  • Field names. This contains the set of field names used in the index.

  • Stored Field values. This contains, for each document, a list of attribute-value pairs, where the attributes are field names. These are used to store auxiliary information about the document, such as its title, url, or an identifier to access a database. The set of stored fields are what is returned for each hit when searching. This is keyed by document number.

  • Term dictionary. A dictionary containing all of the terms used in all of the indexed fields of all of the documents. The dictionary also contains the number of documents which contain the term, and pointers to the term's frequency and proximity data.

  • Term Frequency data. For each term in the dictionary, the numbers of all the documents that contain that term, and the frequency of the term in that document if omitTf is false.

  • Term Proximity data. For each term in the dictionary, the positions that the term occurs in each document. Note that this will not exist if all fields in all documents set omitTf to true.

  • Normalization factors. For each field in each document, a value is stored that is multiplied into the score for hits on that field.

  • Term Vectors. For each field in each document, the term vector (sometimes called document vector) may be stored. A term vector consists of term text and term frequency. To add Term Vectors to your index see the Field constructors

  • Deleted documents. An optional file indicating which documents are deleted.

Details on each of these are provided in subsequent sections.

File Naming

All files belonging to a segment have the same name with varying extensions. The extensions correspond to the different file formats described below. When using the Compound File format (default in 1.4 and greater), these files are collapsed into a single .cfs file (see below for details).

Typically, all segments in an index are stored in a single directory, although this is not required.

As of version 2.1 (lock-less commits), file names are never re-used (there is one exception, "segments.gen", see below). That is, when any file is saved to the Directory it is given a never before used filename. This is achieved using a simple generations approach. For example, the first segments file is segments_1, then segments_2, etc. The generation is a sequential long integer represented in alpha-numeric (base 36) form.
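For example, assuming plain Java, the name of a segments file for a given generation could be derived as in this sketch (not the actual Lucene code):

    static String segmentsFileName(long generation) {
        // Sequential generation rendered in base 36 (Character.MAX_RADIX == 36):
        // 1 -> "segments_1", 10 -> "segments_a", 46656 -> "segments_1000".
        return "segments_" + Long.toString(generation, Character.MAX_RADIX);
    }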

Summary of File Extensions

The following table summarizes the names and extensions of the files in Lucene:
Name                   Extension                  Brief Description
Segments File          segments.gen, segments_N   Stores information about segments
Lock File              write.lock                 The write lock prevents multiple IndexWriters from writing to the same file
Compound File          .cfs                       An optional "virtual" file consisting of all the other index files, for systems that frequently run out of file handles
Fields                 .fnm                       Stores information about the fields
Field Index            .fdx                       Contains pointers to field data
Field Data             .fdt                       The stored fields for documents
Term Infos             .tis                       Part of the term dictionary, stores term info
Term Info Index        .tii                       The index into the Term Infos file
Frequencies            .frq                       Contains the list of docs which contain each term along with frequency
Positions              .prx                       Stores position information about where a term occurs in the index
Norms                  .nrm                       Encodes length and boost factors for docs and fields
Term Vector Index      .tvx                       Stores offset into the document data file
Term Vector Documents  .tvd                       Contains information about each document that has term vectors
Term Vector Fields     .tvf                       The field level info about term vectors
Deleted Documents      .del                       Info about which documents are deleted

Primitive Types

Byte

The most primitive type is an eight-bit byte. Files are accessed as sequences of bytes. All other data types are defined as sequences of bytes, so file formats are byte-order independent.

UInt32

32-bit unsigned integers are written as four bytes, high-order bytes first.

UInt32 --> <Byte>^4

UInt64

64-bit unsigned integers are written as eight bytes, high-order bytes first.

UInt64 --> <Byte>^8

VInt

A variable-length format for positive integers is defined where the high-order bit of each byte indicates whether more bytes remain to be read. The low-order seven bits are appended as increasingly more significant bits in the resulting integer value. Thus values from zero to 127 may be stored in a single byte, values from 128 to 16,383 may be stored in two bytes, and so on.

VInt Encoding Example

Value     First byte   Second byte   Third byte
0         00000000
1         00000001
2         00000010
...
127       01111111
128       10000000     00000001
129       10000001     00000001
130       10000010     00000001
...
16,383    11111111     01111111
16,384    10000000     10000000      00000001
16,385    10000001     10000000      00000001
...

This provides compression while still being efficient to decode.
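A minimal Java sketch of this encoding, mirroring the description above (the helper names are illustrative, not the actual Lucene IndexOutput/IndexInput methods):

    // Write a non-negative int as a VInt: low 7 bits per byte, high bit set
    // on every byte except the last.
    static void writeVInt(java.io.OutputStream out, int i) throws java.io.IOException {
        while ((i & ~0x7F) != 0) {
            out.write((i & 0x7F) | 0x80);   // more bytes follow
            i >>>= 7;
        }
        out.write(i);                        // final byte, high bit clear
    }

    // Read a VInt back: accumulate 7 bits per byte, increasingly significant.
    static int readVInt(java.io.InputStream in) throws java.io.IOException {
        int b = in.read();
        int value = b & 0x7F;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            b = in.read();
            value |= (b & 0x7F) << shift;
        }
        return value;
    }

For example, writeVInt(out, 128) emits the two bytes 10000000 00000001 shown in the table above.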

Chars

Lucene writes unicode character sequences as UTF-8 encoded bytes.

String

Lucene writes strings as UTF-8 encoded bytes. First the length, in bytes, is written as a VInt, followed by the bytes.

String --> VInt, Chars
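Continuing the sketch above, a String could be written as its UTF-8 byte length (a VInt) followed by the bytes; writeVInt is the helper from the previous sketch:

    static void writeString(java.io.OutputStream out, String s) throws java.io.IOException {
        byte[] utf8 = s.getBytes("UTF-8");
        writeVInt(out, utf8.length);   // length in bytes, not characters
        out.write(utf8);
    }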

Compound Types

Map<String,String>

In a couple of places Lucene stores a Map<String,String>.

Map<String,String> --> Count, <String,String>^Count

Per-Index Files

The files in this section exist one-per-index.

Segments File

The active segments in the index are stored in the segment info file, segments_N. There may be one or more segments_N files in the index; however, the one with the largest generation is the active one (when older segments_N files are present it's because they temporarily cannot be deleted, or, a writer is in the process of committing, or a custom IndexDeletionPolicy is in use). This file lists each segment by name, has details about the separate norms and deletion files, and also contains the size of each segment.

As of 2.1, there is also a file segments.gen. This file contains the current generation (the _N in segments_N) of the index. This is used only as a fallback in case the current generation cannot be accurately determined by directory listing alone (as is the case for some NFS clients with time-based directory cache expiration). This file simply contains an Int32 version header (SegmentInfos.FORMAT_LOCKLESS = -2), followed by the generation recorded as Int64, written twice.

2.9 Segments --> Format, Version, NameCounter, SegCount, <SegName, SegSize, DelGen, DocStoreOffset, [DocStoreSegment, DocStoreIsCompoundFile], HasSingleNormFile, NumField, NormGen^NumField, IsCompoundFile, DeletionCount, HasProx, Diagnostics>^SegCount, CommitUserData, Checksum

Format, NameCounter, SegCount, SegSize, NumField, DocStoreOffset, DeletionCount --> Int32

Version, DelGen, NormGen, Checksum --> Int64

SegName, DocStoreSegment --> String

Diagnostics --> Map<String,String>

IsCompoundFile, HasSingleNormFile, DocStoreIsCompoundFile, HasProx --> Int8

CommitUserData --> Map<String,String>

Format is -9 (SegmentInfos.FORMAT_DIAGNOSTICS).

Version counts how often the index has been changed by adding or deleting documents.

NameCounter is used to generate names for new segment files.

SegName is the name of the segment, and is used as the file name prefix for all of the files that compose the segment's index.

SegSize is the number of documents contained in the segment index.

DelGen is the generation count of the separate deletes file. If this is -1, there are no separate deletes. If it is 0, this is a pre-2.1 segment and you must check the filesystem for the existence of _X.del. Anything above zero means there are separate deletes (_X_N.del).

NumField is the size of the array for NormGen, or -1 if there are no NormGens stored.

NormGen records the generation of the separate norms files. If NumField is -1, there are no NormGens stored; they are all assumed to be 0 if the segment file was written pre-2.1, and all assumed to be -1 if the segments file is 2.1 or above. The generation then has the same meaning as DelGen (above).

IsCompoundFile records whether the segment is written as a compound file or not. If this is -1, the segment is not a compound file. If it is 1, the segment is a compound file. Otherwise it is 0, which means we check the filesystem to see whether _X.cfs exists.

If HasSingleNormFile is 1, then the field norms are written as a single joined file (with extension .nrm); if it is 0 then each field's norms are stored as separate .fN files. See "Normalization Factors" below for details.

DocStoreOffset, DocStoreSegment, DocStoreIsCompoundFile: If DocStoreOffset is -1, this segment has its own doc store (stored fields values and term vectors) files and DocStoreSegment and DocStoreIsCompoundFile are not stored. In this case all files for stored field values (*.fdt and *.fdx) and term vectors (*.tvf, *.tvd and *.tvx) will be stored with this segment. Otherwise, DocStoreSegment is the name of the segment that has the shared doc store files; DocStoreIsCompoundFile is 1 if that segment is stored in compound file format (as a .cfx file); and DocStoreOffset is the starting document in the shared doc store files where this segment's documents begin. In this case, this segment does not store its own doc store files but instead shares a single set of these files with other segments.

Checksum contains the CRC32 checksum of all bytes in the segments_N file up until the checksum. This is used to verify integrity of the file on opening the index.

DeletionCount records the number of deleted documents in this segment.

HasProx is 1 if any fields in this segment have omitTf set to false; else, it's 0.

CommitUserData stores an optional user-supplied opaque Map<String,String> that was passed to IndexWriter's commit or prepareCommit, or IndexReader's flush methods.

The Diagnostics Map is privately written by IndexWriter, as a debugging aid, for each segment it creates. It includes metadata like the current Lucene version, OS, Java version, why the segment was created (merge, flush, addIndexes), etc.

Lock File

The write lock, which is stored in the index directory by default, is named "write.lock". If the lock directory is different from the index directory then the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index directory. When this file is present, a writer is currently modifying the index (adding or removing documents). This lock file ensures that only one writer is modifying the index at a time.

Deletable File

Instead of maintaining a file that lists deletable files, a writer dynamically computes which files are deletable, so no such file is written.

Compound Files

Starting with Lucene 1.4 the compound file format became default. This is simply a container for all files described in the next section (except for the .del file).

Compound (.cfs) --> FileCount, <DataOffset, FileName> FileCount , FileData FileCount

FileCount --> VInt

DataOffset --> Long

FileName --> String

FileData --> raw file data

The raw file data is the data from the individual files named above.

Starting with Lucene 2.3, doc store files (stored field values and term vectors) can be shared in a single set of files for more than one segment. When compound file is enabled, these shared files will be added into a single compound file (same format as above) but with the extension .cfx.

Per-Segment Files

The remaining files are all per-segment, and are thus defined by suffix.

Fields


Field Info

Field names are stored in the field info file, with suffix .fnm.

FieldInfos (.fnm) --> FNMVersion, FieldsCount, <FieldName, FieldBits> FieldsCount

FNMVersion, FieldsCount --> VInt

FieldName --> String

FieldBits --> Byte

  • The low-order bit is one for indexed fields, and zero for non-indexed fields.
  • The second lowest-order bit is one for fields that have term vectors stored, and zero for fields without term vectors.
  • If the third lowest-order bit is set (0x04), term positions are stored with the term vectors.
  • If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.
  • If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.
  • If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.
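As a small sketch, the flag bits listed above can be decoded like this (the helper name is hypothetical):

    static void describeFieldBits(byte bits) {
        boolean indexed         = (bits & 0x01) != 0;  // indexed field
        boolean storeTermVector = (bits & 0x02) != 0;  // term vectors stored
        boolean tvPositions     = (bits & 0x04) != 0;  // positions stored with term vectors
        boolean tvOffsets       = (bits & 0x08) != 0;  // offsets stored with term vectors
        boolean omitNorms       = (bits & 0x10) != 0;  // norms omitted
        boolean storePayloads   = (bits & 0x20) != 0;  // payloads stored
        System.out.println("indexed=" + indexed + ", termVectors=" + storeTermVector
            + ", tvPositions=" + tvPositions + ", tvOffsets=" + tvOffsets
            + ", omitNorms=" + omitNorms + ", payloads=" + storePayloads);
    }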

FNMVersion (added in 2.9) is always -2.

Fields are numbered by their order in this file. Thus field zero is the first field in the file, field one the next, and so on. Note that, like document numbers, field numbers are segment relative.


Stored Fields

Stored fields are represented by two files:

  1. The field index, or .fdx file.

    This contains, for each document, a pointer to its field data, as follows:

    FieldIndex (.fdx) --> <FieldValuesPosition> SegSize

    FieldValuesPosition --> Uint64

    This is used to find the location within the field data file of the fields of a particular document. Because it contains fixed-length data, this file may be easily randomly accessed. The position of document n's field data is the UInt64 at n*8 in this file (see the sketch after this list).

  2. The field data, or .fdt file.

    This contains the stored fields of each document, as follows:

    FieldData (.fdt) --> <DocFieldData> SegSize

    DocFieldData --> FieldCount, <FieldNum, Bits, Value> FieldCount

    FieldCount --> VInt

    FieldNum --> VInt

    Bits --> Byte

    • low order bit is one for tokenized fields
    • second bit is one for fields containing binary data
    • third bit is one for fields with compression option enabled (if compression is enabled, the algorithm used is ZLIB)

    Value --> String | BinaryValue (depending on Bits)

    BinaryValue --> ValueSize, <Byte>^ValueSize

    ValueSize --> VInt
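As a sketch of the fixed-width lookup from item 1 above (assuming the .fdx file is opened directly and contains only these 8-byte entries):

    static long fieldDataPosition(java.io.RandomAccessFile fdx, int docNum) throws java.io.IOException {
        fdx.seek((long) docNum * 8L);   // each entry is a fixed 8 bytes
        return fdx.readLong();          // UInt64, high-order byte first
    }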

Term Dictionary

The term dictionary is represented as two files:

  1. The term infos, or .tis file.

    TermInfoFile (.tis) --> TIVersion, TermCount, IndexInterval, SkipInterval, MaxSkipLevels, TermInfos

    TIVersion --> UInt32

    TermCount --> UInt64

    IndexInterval --> UInt32

    SkipInterval --> UInt32

    MaxSkipLevels --> UInt32

    TermInfos --> <TermInfo> TermCount

    TermInfo --> <Term, DocFreq, FreqDelta, ProxDelta, SkipDelta>

    Term --> <PrefixLength, Suffix, FieldNum>

    Suffix --> String

    PrefixLength, DocFreq, FreqDelta, ProxDelta, SkipDelta
    --> VInt

    This file is sorted by Term. Terms are ordered first lexicographically (by UTF16 character code) by the term's field name, and within that lexicographically (by UTF16 character code) by the term's text.

    TIVersion names the version of the format of this file and is equal to TermInfosWriter.FORMAT_CURRENT.

    Term text prefixes are shared. The PrefixLength is the number of initial characters from the previous term which must be pre-pended to a term's suffix in order to form the term's text. Thus, if the previous term's text was "bone" and the term is "boy", the PrefixLength is two and the suffix is "y" (see the sketch at the end of this section).

    FieldNum determines the term's field, whose name is stored in the .fnm file.

    DocFreq is the count of documents which contain the term.

    FreqDelta determines the position of this term's TermFreqs within the .frq file. In particular, it is the difference between the position of this term's data in that file and the position of the previous term's data (or zero, for the first term in the file).

    ProxDelta determines the position of this term's TermPositions within the .prx file. In particular, it is the difference between the position of this term's data in that file and the position of the previous term's data (or zero, for the first term in the file). For fields with omitTf true, this will be 0 since prox information is not stored.

    SkipDelta determines the position of this term's SkipData within the .frq file. In particular, it is the number of bytes after TermFreqs that the SkipData starts. In other words, it is the length of the TermFreq data. SkipDelta is only stored if DocFreq is not smaller than SkipInterval.

  2. The term info index, or .tii file.

    This contains every IndexInterval-th entry from the .tis file, along with its location in the .tis file. This is designed to be read entirely into memory and used to provide random access to the .tis file.

    The structure of this file is very similar to the .tis file, with the addition of one item per record, the IndexDelta.

    TermInfoIndex (.tii) --> TIVersion, IndexTermCount, IndexInterval, SkipInterval, MaxSkipLevels, TermIndices

    TIVersion --> UInt32

    IndexTermCount --> UInt64

    IndexInterval --> UInt32

    SkipInterval --> UInt32

    TermIndices --> <TermInfo, IndexDelta> IndexTermCount

    IndexDelta --> VLong

    IndexDelta determines the position of this term's TermInfo within the .tis file. In particular, it is the difference between the position of this term's entry in that file and the position of the previous term's entry.

    SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate TermDocs.skipTo(int). Larger values result in smaller indexes, greater acceleration, but fewer accelerable cases, while smaller values result in bigger indexes, less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases.

    MaxSkipLevels is the maximum number of skip levels stored for each term in the .frq file. A low value results in smaller indexes but less acceleration; a larger value results in slightly larger indexes but greater acceleration. See the format of the .frq file for more information about skip levels.
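The term-text prefix sharing used in the .tis file (and again for term vectors below) can be sketched in one line of Java; the method name is illustrative:

    // Rebuild a term's text from the previous term's text plus PrefixLength and Suffix.
    static String nextTermText(String previousText, int prefixLength, String suffix) {
        return previousText.substring(0, prefixLength) + suffix;   // "bone", 2, "y" -> "boy"
    }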

Frequencies

The .frq file contains the lists of documents which contain each term, along with the frequency of the term in that document (if omitTf is false).

FreqFile (.frq) --> <TermFreqs, SkipData> TermCount

TermFreqs --> <TermFreq> DocFreq

TermFreq --> DocDelta[, Freq?]

SkipData --> <<SkipLevelLength, SkipLevel> NumSkipLevels-1, SkipLevel> <SkipDatum>

SkipLevel --> <SkipDatum> DocFreq/(SkipInterval^(Level + 1))

SkipDatum --> DocSkip,PayloadLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?

DocDelta,Freq,DocSkip,PayloadLength,FreqSkip,ProxSkip --> VInt

SkipChildLevelPointer --> VLong

TermFreqs are ordered by term (the term is implicit, from the .tis file).

TermFreq entries are ordered by increasing document number.

DocDelta: if omitTf is false, this determines both the document number and the frequency. In particular, DocDelta/2 is the difference between this document number and the previous document number (or zero when this is the first document in a TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the frequency is read as another VInt. If omitTf is true, DocDelta contains the gap (not multiplied by 2) between document numbers and no frequency information is stored.

For example, the TermFreqs for a term which occurs once in document seven and three times in document eleven, with omitTf false, would be the following sequence of VInts:

15, 8, 3

If omitTf were true it would be this sequence of VInts instead:

7, 4
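Decoding the first example (omitTf false) can be sketched as follows; reading the VInts themselves is elided and the values are taken from the example:

    static void decodeExampleTermFreqs() {
        int[] vints = {15, 8, 3};    // the example TermFreqs above (omitTf false)
        int doc = 0;
        for (int i = 0; i < vints.length; ) {
            int docDelta = vints[i++];
            doc += docDelta >>> 1;                            // DocDelta/2 is the document gap
            int freq = (docDelta & 1) != 0 ? 1 : vints[i++];  // odd -> freq 1, even -> next VInt is Freq
            System.out.println("doc " + doc + ", freq " + freq);
        }
        // prints: doc 7, freq 1 and doc 11, freq 3
    }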

DocSkip records the document number before every SkipInterval'th document in TermFreqs. If payloads are disabled for the term's field, then DocSkip represents the difference from the previous value in the sequence. If payloads are enabled for the term's field, then DocSkip/2 represents the difference from the previous value in the sequence. If payloads are enabled and DocSkip is odd, then PayloadLength is stored, indicating the length of the last payload before the SkipInterval'th document in TermPositions. FreqSkip and ProxSkip record the position of every SkipInterval'th entry in FreqFile and ProxFile, respectively. File positions are relative to the previous SkipDatum in the sequence, or to the start of TermFreqs and Positions for the first SkipDatum.

For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData entries, containing the 15th and 31st document numbers in TermFreqs. The first FreqSkip names the number of bytes after the beginning of TermFreqs that the 16th SkipDatum starts, and the second the number of bytes after that that the 32nd starts. The first ProxSkip names the number of bytes after the beginning of Positions that the 16th SkipDatum starts, and the second the number of bytes after that that the 32nd starts.

Each term can have multiple skip levels. The number of skip levels for a term is NumSkipLevels = Min(MaxSkipLevels, floor(log(DocFreq)/log(SkipInterval))). The number of SkipData entries for a skip level is DocFreq/(SkipInterval^(Level + 1)), where the lowest skip level is Level=0.
Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0 has 8 SkipData entries, containing the 3rd, 7th, 11th, 15th, 19th, 23rd, 27th, and 31st document numbers in TermFreqs. Skip level 1 has 2 SkipData entries, containing the 15th and 31st document numbers in TermFreqs.
The SkipData entries on all levels above 0 contain a SkipChildLevelPointer referencing the corresponding SkipData entry on the level below. In the example, entry 15 on level 1 has a pointer to entry 15 on level 0, and entry 31 on level 1 has a pointer to entry 31 on level 0.
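In Java terms, the example above works out as in this small sketch:

    static void skipLevelExample() {
        int docFreq = 35, skipInterval = 4, maxSkipLevels = 2;   // the example above
        int numSkipLevels = Math.min(maxSkipLevels,
            (int) Math.floor(Math.log(docFreq) / Math.log(skipInterval)));  // min(2, floor(2.56)) = 2
        int level0Entries = docFreq / skipInterval;                   // DocFreq/SkipInterval^1 = 8
        int level1Entries = docFreq / (skipInterval * skipInterval);  // DocFreq/SkipInterval^2 = 2
        System.out.println(numSkipLevels + " skip levels: " + level0Entries + " entries on level 0, "
            + level1Entries + " on level 1");
    }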

Positions

The .prx file contains the lists of positions that each term occurs at within documents. Note that fields with omitTf true do not store anything into this file, and if all fields in the index have omitTf true then the .prx file will not exist.

ProxFile (.prx) --> <TermPositions> TermCount

TermPositions --> <Positions> DocFreq

Positions --> <PositionDelta,Payload?> Freq

Payload --> <PayloadLength?,PayloadData>

PositionDelta --> VInt

PayloadLength --> VInt

PayloadData --> Byte^PayloadLength

TermPositions are ordered by term (the term is implicit, from the .tis file).

Positions entries are ordered by increasing document number (the document number is implicit from the .frq file).

PositionDelta is, if payloads are disabled for the term's field, the difference between the position of the current occurrence in the document and the previous occurrence (or zero, if this is the first occurrence in this document). If payloads are enabled for the term's field, then PositionDelta/2 is the difference between the current and the previous position. If payloads are enabled and PositionDelta is odd, then PayloadLength is stored, indicating the length of the payload at the current term position.

For example, the TermPositions for a term which occurs as the fourth term in one document, and as the fifth and ninth term in a subsequent document, would be the following sequence of VInts (payloads disabled):

4, 5, 4

PayloadData is metadata associated with the current term position. If PayloadLength is stored at the current position, then it indicates the length of this Payload. If PayloadLength is not stored, then this Payload has the same length as the Payload at the previous position.

Normalization Factors

There's a single .nrm file containing all norms:

AllNorms (.nrm) --> NormsHeader,<Norms> NumFieldsWithNorms

Norms --> <Byte> SegSize

NormsHeader --> 'N','R','M',Version

Version --> Byte

NormsHeader has 4 bytes, last of which is the format version for this file, currently -1.

Each byte encodes a floating point value. Bits 0-2 contain the 3-bit mantissa, and bits 3-7 contain the 5-bit exponent.

These are converted to an IEEE single float value as follows:

  1. If the byte is zero, use a zero float.

  2. Otherwise, set the sign bit of the float to zero;

  3. add 48 to the exponent and use this as the float's exponent;

  4. map the mantissa to the high-order 3 bits of the float's mantissa; and

  5. set the low-order 21 bits of the float's mantissa to zero.
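The steps above can be sketched in Java as follows; this is equivalent in effect to the decoding Lucene performs in SmallFloat.byte315ToFloat, and the method name here is illustrative:

    static float decodeNormByte(byte b) {
        if (b == 0) {
            return 0.0f;                       // step 1: a zero byte decodes to a zero float
        }
        int bits = (b & 0xff) << 21;           // place the mantissa and exponent bits
        bits += 48 << 24;                      // step 3: add 48 to the exponent; sign bit stays 0
        return Float.intBitsToFloat(bits);     // remaining low-order mantissa bits are zero
    }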

A separate norm file is created when the norm values of an existing segment are modified. When field N is modified, a separate norm file .sN is created, to maintain the norm values for that field.

Separate norm files are created (when needed) for both compound and non-compound segments.

Term Vectors

Term Vector support is optional and is enabled on a field-by-field basis. It consists of 3 files.

  1. The Document Index or .tvx file.

    For each document, this stores the offset into the document data (.tvd) and field data (.tvf) files.

    DocumentIndex (.tvx) --> TVXVersion<DocumentPosition,FieldPosition> NumDocs

    TVXVersion --> Int (TermVectorsReader.CURRENT)

    DocumentPosition --> UInt64 (offset in the .tvd file)

    FieldPosition --> UInt64 (offset in the .tvf file)

  2. The Document or .tvd file.

    This contains, for each document, the number of fields, a list of the fields with term vector info and finally a list of pointers to the field information in the .tvf (Term Vector Fields) file.

    Document (.tvd) --> TVDVersion<NumFields, FieldNums, FieldPositions> NumDocs

    TVDVersion --> Int (TermVectorsReader.FORMAT_CURRENT)

    NumFields --> VInt

    FieldNums --> <FieldNumDelta> NumFields

    FieldNumDelta --> VInt

    FieldPositions --> <FieldPositionDelta> NumFields-1

    FieldPositionDelta --> VLong

    The .tvd file is used to map out the fields that have term vectors stored and where the field information is in the .tvf file.

  3. The Field or .tvf file.

    This file contains, for each field that has a term vector stored, a list of the terms, their frequencies and, optionally, position and offset information.

    Field (.tvf) --> TVFVersion<NumTerms, Position/Offset, TermFreqs> NumFields

    TVFVersion --> Int (TermVectorsReader.FORMAT_CURRENT)

    NumTerms --> VInt

    Position/Offset --> Byte

    TermFreqs --> <TermText, TermFreq, Positions?, Offsets?> NumTerms

    TermText --> <PrefixLength, Suffix>

    PrefixLength --> VInt

    Suffix --> String

    TermFreq --> VInt

    Positions --> <VInt>TermFreq

    Offsets --> <VInt, VInt>TermFreq


    Notes:

    • Position/Offset byte stores whether this term vector has position or offset information stored.
    • Term text prefixes are shared. The PrefixLength is the number of initial characters from the previous term which must be pre-pended to a term's suffix in order to form the term's text. Thus, if the previous term's text was "bone" and the term is "boy", the PrefixLength is two and the suffix is "y".
    • Positions are stored as delta encoded VInts. This means we only store the difference of the current position from the last position
    • Offsets are stored as delta encoded VInts. The first VInt is the startOffset, the second is the endOffset.

Deleted Documents

The .del file is optional, and only exists when a segment contains deletions.

Although per-segment, this file is maintained exterior to compound segment files.

Deletions (.del) --> [Format],ByteCount,BitCount, Bits | DGaps (depending on Format)

Format, ByteCount, BitCount --> UInt32

Bits --> <Byte> ByteCount

DGaps --> <DGap,NonzeroByte> NonzeroBytesCount

DGap --> VInt

NonzeroByte --> Byte

Format is optional. -1 indicates DGaps. A non-negative value indicates Bits, and means that Format was omitted (the value read is ByteCount).

ByteCount indicates the number of bytes in Bits. It is typically (SegSize/8)+1.

BitCount indicates the number of bits that are currently set in Bits.

Bits contains one bit for each document indexed. When the bit corresponding to a document number is set, that document is marked as deleted. Bit ordering is from least to most significant. Thus, if Bits contains two bytes, 0x00 and 0x02, then document 9 is marked as deleted.

DGaps represents sparse bit-vectors more efficiently than Bits. It is made of DGaps on indexes of nonzero bytes in Bits, and the nonzero bytes themselves. The number of nonzero bytes in Bits (NonzeroBytesCount) is not stored.

For example, if there are 8000 bits and only bits 10,12,32 are set, DGaps would be used:

(VInt) 1, (Byte) 20, (VInt) 3, (Byte) 1
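A sketch decoding this DGaps example back into deleted document numbers (reading the VInts is elided; the gap and byte values are taken from the example):

    static void decodeExampleDGaps() {
        int[] dGaps   = {1, 3};    // VInt gaps between indexes of nonzero bytes in Bits
        int[] nonzero = {20, 1};   // the nonzero byte values
        int byteIndex = 0;
        for (int i = 0; i < dGaps.length; i++) {
            byteIndex += dGaps[i];
            for (int bit = 0; bit < 8; bit++) {
                if ((nonzero[i] & (1 << bit)) != 0) {
                    System.out.println("document " + (byteIndex * 8 + bit) + " is deleted");
                }
            }
        }
        // prints documents 10, 12 and 32, matching the example
    }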

Limitations

When referring to term numbers, Lucene's current implementation uses a Java int to hold the term index, which means the maximum number of unique terms in any single index segment is ~2.1 billion times the term index interval (default 128) = ~274 billion. This is technically not a limitation of the index file format, just of Lucene's current implementation.

Similarly, Lucene uses a Java int to refer to document numbers, and the index file format uses an Int32 on-disk to store document numbers. This is a limitation of both the index file format and the current implementation. Eventually these should be replaced with either UInt64 values, or better yet, VInt values which have no limit.

 
lucene-2.9.4/docs/gettingstarted.pdf
lucene-2.9.4/docs/queryparsersyntax.pdf
lucene-2.9.4/docs/queryparsersyntax.html

Apache Lucene - Query Parser Syntax

Overview

Although Lucene provides the ability to create your own queries through its API, it also provides a rich query language through the Query Parser, a lexer which interprets a string into a Lucene Query using JavaCC.

Generally, the query parser syntax may change from release to release. This page describes the syntax as of the current release. If you are using a different version of Lucene, please consult the copy of docs/queryparsersyntax.html that was distributed with the version you are using.

Before choosing to use the provided Query Parser, please consider the following:

  1. If you are programmatically generating a query string and then parsing it with the query parser then you should seriously consider building your queries directly with the query API. In other words, the query parser is designed for human-entered text, not for program-generated text.
  2. Untokenized fields are best added directly to queries, and not through the query parser. If a field's values are generated programmatically by the application, then so should query clauses for this field. An analyzer, which the query parser uses, is designed to convert human-entered text to terms. Program-generated values, like dates, keywords, etc., should be consistently program-generated.
  3. In a query form, fields which are general text should use the query parser. All others, such as date ranges, keywords, etc. are better added directly through the query API. A field with a limited set of values, which can be specified with a pull-down menu, should not be added to a query string which is subsequently parsed, but rather added as a TermQuery clause.
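For illustration (assuming the Lucene 2.9 API; the field names and values here are hypothetical), the difference between building a query programmatically and parsing human-entered text looks roughly like this:

    // Classes below are from org.apache.lucene.search, org.apache.lucene.index,
    // org.apache.lucene.queryParser, org.apache.lucene.analysis.standard and org.apache.lucene.util.
    Query buildProgrammatic() {
        // Program-generated values: build the query directly with the API.
        BooleanQuery q = new BooleanQuery();
        q.add(new TermQuery(new Term("id", "DOC-42")), BooleanClause.Occur.MUST);        // hypothetical field
        q.add(new TermQuery(new Term("status", "published")), BooleanClause.Occur.MUST); // hypothetical field
        return q;
    }

    Query parseHumanInput(String userText) throws ParseException {
        // Human-entered text: hand it to the query parser.
        QueryParser parser = new QueryParser(Version.LUCENE_29,
            "text", new StandardAnalyzer(Version.LUCENE_29));
        return parser.parse(userText);   // e.g. "title:\"The Right Way\" AND go"
    }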

Terms

A query is broken up into terms and operators. There are two types of terms: Single Terms and Phrases.

A Single Term is a single word such as "test" or "hello".

A Phrase is a group of words surrounded by double quotes such as "hello dolly".

Multiple terms can be combined together with Boolean operators to form a more complex query (see below).

Note: The analyzer used to create the index will be used on the terms and phrases in the query string. So it is important to choose an analyzer that will not interfere with the terms used in the query string.

Fields

Lucene supports fielded data. When performing a search you can either specify a field, or use the default field. The field names and default field are implementation specific.

You can search any field by typing the field name followed by a colon ":" and then the term you are looking for.

As an example, let's assume a Lucene index contains two fields, title and text and text is the default field. If you want to find the document entitled "The Right Way" which contains the text "don't go this way", you can enter:

title:"The Right Way" AND text:go

or

title:"Do it right" AND right

Since text is the default field, the field indicator is not required.

Note: The field is only valid for the term that it directly precedes, so the query

title:Do it right

Will only find "Do" in the title field. It will find "it" and "right" in the default field (in this case the text field).

Term Modifiers

Lucene supports modifying query terms to provide a wide range of searching options.

Wildcard Searches

Lucene supports single and multiple character wildcard searches within single terms (not within phrase queries).

To perform a single character wildcard search use the "?" symbol.

To perform a multiple character wildcard search use the "*" symbol.

The single character wildcard search looks for terms that match the term with the single character replaced. For example, to search for "text" or "test" you can use the search:

te?t

Multiple character wildcard searches look for 0 or more characters. For example, to search for test, tests or tester, you can use the search:

test*

You can also use the wildcard searches in the middle of a term.

te*t

Note: You cannot use a * or ? symbol as the first character of a search.

Fuzzy Searches

Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search:

roam~

This search will find terms like foam and roams.

Starting with Lucene 1.9, an additional (optional) parameter can specify the required similarity. The value is between 0 and 1; with a value closer to 1, only terms with a higher similarity will be matched. For example:

roam~0.8

The default that is used if the parameter is not given is 0.5.

Proximity Searches

Lucene supports finding words that are within a specific distance of each other. To do a proximity search, use the tilde, "~", symbol at the end of a Phrase. For example, to search for "apache" and "jakarta" within 10 words of each other in a document, use the search:

"jakarta apache"~10

Range Searches

Range Queries allow one to match documents whose field(s) values are between the lower and upper bound specified by the Range Query. Range Queries can be inclusive or exclusive of the upper and lower bounds. Sorting is done lexicographically.

mod_date:[20020101 TO 20030101]

This will find documents whose mod_date fields have values between 20020101 and 20030101, inclusive. Note that Range Queries are not reserved for date fields. You could also use range queries with non-date fields:

title:{Aida TO Carmen}

This will find all documents whose titles are between Aida and Carmen, but not including Aida and Carmen.

Inclusive range queries are denoted by square brackets. Exclusive range queries are denoted by curly brackets.

Boosting a Term

Lucene provides the relevance level of matching documents based on the terms found. To boost a term use the caret, "^", symbol with a boost factor (a number) at the end of the term you are searching. The higher the boost factor, the more relevant the term will be.

Boosting allows you to control the relevance of a document by boosting its term. For example, if you are searching for

jakarta apache

and you want the term "jakarta" to be more relevant boost it using the ^ symbol along with the boost factor next to the term. You would type:

jakarta^4 apache

This will make documents with the term jakarta appear more relevant. You can also boost Phrase Terms as in the example:

"jakarta apache"^4 "Apache Lucene"

By default, the boost factor is 1. Although the boost factor must be positive, it can be less than 1 (e.g. 0.2)

Boolean Operators

Boolean operators allow terms to be combined through logic operators. Lucene supports AND, "+", OR, NOT and "-" as Boolean operators (Note: Boolean operators must be ALL CAPS).

The OR operator is the default conjunction operator. This means that if there is no Boolean operator between two terms, the OR operator is used. The OR operator links two terms and finds a matching document if either of the terms exist in a document. This is equivalent to a union using sets. The symbol || can be used in place of the word OR.

To search for documents that contain either "jakarta apache" or just "jakarta" use the query:

"jakarta apache" jakarta

or

"jakarta apache" OR jakarta

AND

The AND operator matches documents where both terms exist anywhere in the text of a single document. This is equivalent to an intersection using sets. The symbol && can be used in place of the word AND.

To search for documents that contain "jakarta apache" and "Apache Lucene" use the query:

"jakarta apache" AND "Apache Lucene"

+

The "+" or required operator requires that the term after the "+" symbol exist somewhere in a the field of a single document.

To search for documents that must contain "jakarta" and may contain "lucene" use the query:

+jakarta lucene

NOT

The NOT operator excludes documents that contain the term after NOT. This is equivalent to a difference using sets. The symbol ! can be used in place of the word NOT.

To search for documents that contain "jakarta apache" but not "Apache Lucene" use the query:

"jakarta apache" NOT "Apache Lucene"

Note: The NOT operator cannot be used with just one term. For example, the following search will return no results:

NOT "jakarta apache"

-

The "-" or prohibit operator excludes documents that contain the term after the "-" symbol.

To search for documents that contain "jakarta apache" but not "Apache Lucene" use the query:

"jakarta apache" -"Apache Lucene"

Grouping

Lucene supports using parentheses to group clauses to form sub queries. This can be very useful if you want to control the boolean logic for a query.

To search for either "jakarta" or "apache" and "website" use the query:

(jakarta OR apache) AND website

This eliminates any confusion and makes sure that website must exist and that either of the terms jakarta or apache may exist.

Field Grouping

Lucene supports using parentheses to group multiple clauses to a single field.

To search for a title that contains both the word "return" and the phrase "pink panther" use the query:

title:(+return +"pink panther")
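
When parsed, the grouped clauses are all applied to the title field. A minimal sketch against the Lucene 2.9 Java API (the default "contents" field passed to the parser is an illustrative assumption):

  QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
                                       new StandardAnalyzer(Version.LUCENE_29));
  Query q = parser.parse("title:(+return +\"pink panther\")");
  // q is a BooleanQuery whose required clauses all search the "title" field.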

Escaping Special Characters

Lucene supports escaping special characters that are part of the query syntax. The current list of special characters is

+ - && || ! ( ) { } [ ] ^ " ~ * ? : \

To escape these characters use the \ before the character. For example, to search for (1+1):2 use the query:

\(1\+1\)\:2
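
When queries are built from user input in code, QueryParser.escape can add the backslashes automatically. A minimal sketch against the Lucene 2.9 Java API (parser setup and field name are illustrative):

  String escaped = QueryParser.escape("(1+1):2");         // yields \(1\+1\)\:2
  QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
                                       new StandardAnalyzer(Version.LUCENE_29));
  Query q = parser.parse(escaped);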
 
lucene-2.9.4/docs/demo.pdf  [binary PDF payload omitted]
lucene-2.9.4/docs/fileformats.pdf  [binary PDF payload omitted]
lucene-2.9.4/docs/scoring.html
 

Apache Lucene - Scoring

Introduction

Lucene scoring is the heart of why we all love Lucene. It is blazingly fast and it hides almost all of the complexity from the user. In a nutshell, it works. At least, that is, until it doesn't work, or doesn't work as one would expect it to work. Then we are left digging into Lucene internals or asking for help on java-user@lucene.apache.org to figure out why a document with five of our query terms scores lower than a different document with only one of the query terms.

While this document won't answer your specific scoring issues, it will, hopefully, point you to the places that can help you figure out the what and why of Lucene scoring.

Lucene scoring uses a combination of the Vector Space Model (VSM) of Information Retrieval and the Boolean model to determine how relevant a given Document is to a User's query. In general, the idea behind the VSM is the more times a query term appears in a document relative to the number of times the term appears in all the documents in the collection, the more relevant that document is to the query. It uses the Boolean model to first narrow down the documents that need to be scored based on the use of boolean logic in the Query specification. Lucene also adds some capabilities and refinements onto this model to support boolean and fuzzy searching, but it essentially remains a VSM based system at the heart. For some valuable references on VSM and IR in general refer to the Lucene Wiki IR references.

The rest of this document will cover Scoring basics and how to change your Similarity. Next it will cover ways you can customize the Lucene internals in Changing your Scoring -- Expert Level which gives details on implementing your own Query class and related functionality. Finally, we will finish up with some reference material in the Appendix.

Scoring

Scoring is very much dependent on the way documents are indexed, so it is important to understand indexing (see the Apache Lucene - Getting Started Guide and the Lucene file formats) before continuing on with this section. It is also assumed that readers know how to use the Searcher.explain(Query query, int doc) functionality, which can go a long way toward explaining why a particular score is returned.
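
For example, a minimal sketch of asking a searcher to explain the score of the top hit might look like the following (the index path, field name and query text are hypothetical, and error handling is omitted):

    import java.io.File;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryParser.QueryParser;
    import org.apache.lucene.search.*;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    public class ExplainDemo {
      public static void main(String[] args) throws Exception {
        // Open a read-only searcher over an existing index (the path is made up).
        IndexSearcher searcher =
            new IndexSearcher(FSDirectory.open(new File("/path/to/index")), true);
        QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
            new StandardAnalyzer(Version.LUCENE_29));
        Query query = parser.parse("lucene scoring");

        TopDocs hits = searcher.search(query, 10);
        if (hits.scoreDocs.length > 0) {
          // Breaks the top hit's score down into its tf, idf, boost and norm contributions.
          Explanation explanation = searcher.explain(query, hits.scoreDocs[0].doc);
          System.out.println(explanation.toString());
        }
        searcher.close();
      }
    }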

Fields and Documents

In Lucene, the objects we are scoring are Documents. A Document is a collection of Fields. Each Field has semantics about how it is created and stored (i.e. tokenized, untokenized, raw data, compressed, etc.). It is important to note that Lucene scoring works on Fields and then combines the results to return Documents. This is important because two Documents with the exact same content, but with one having the content in two Fields and the other in one Field, will return different scores for the same query due to length normalization (assuming the DefaultSimilarity on the Fields).
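
For illustration (field names are made up), the two documents below carry the same text, but the second splits it across two fields; with the default length normalization they can score differently for the same query:

    // Classes from org.apache.lucene.document.
    Document doc1 = new Document();
    doc1.add(new Field("body", "the quick brown fox jumps over the lazy dog",
        Field.Store.YES, Field.Index.ANALYZED));

    // Same content, but split across a short "title" field and a "body" field.
    Document doc2 = new Document();
    doc2.add(new Field("title", "the quick brown fox",
        Field.Store.YES, Field.Index.ANALYZED));
    doc2.add(new Field("body", "jumps over the lazy dog",
        Field.Store.YES, Field.Index.ANALYZED));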

Score Boosting

Lucene allows influencing search results by "boosting" at more than one level (a short sketch follows the list):

  • Document level boosting - while indexing - by calling document.setBoost() before a document is added to the index.
  • Document's Field level boosting - while indexing - by calling field.setBoost() before adding a field to the document (and before adding the document to the index).
  • Query level boosting - during search, by setting a boost on a query clause, calling Query.setBoost().
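
A minimal sketch of the three levels (the boost values, field names and surrounding objects are made up for illustration):

    // Index-time boosts, set before the document is added to the index.
    doc.setBoost(1.5f);              // boost the whole document
    titleField.setBoost(2.0f);       // boost one field, before doc.add(titleField)
    writer.addDocument(doc);

    // Search-time boost on a query clause.
    Query titleQuery = new TermQuery(new Term("title", "lucene"));
    titleQuery.setBoost(3.0f);       // this clause contributes three times as much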

Indexing time boosts are preprocessed for storage efficiency and written to the directory (when writing the document) in a single byte (!) as follows: For each field of a document, all boosts of that field (i.e. all boosts under the same field name in that doc) are multiplied. The result is multiplied by the boost of the document, and also multiplied by a "field length norm" value that represents the length of that field in that doc (so shorter fields are automatically boosted up). The result is encoded in a single byte (with some precision loss of course) and stored in the directory. The similarity object in effect at indexing computes the length-norm of the field.

This composition of 1-byte representation of norms (that is, indexing time multiplication of field boosts & doc boost & field-length-norm) is nicely described in Fieldable.setBoost().

Encoding and decoding of the resulted float norm in a single byte are done by the static methods of the class Similarity: encodeNorm() and decodeNorm(). Due to loss of precision, it is not guaranteed that decode(encode(x)) = x, e.g. decode(encode(0.89)) = 0.75. At scoring (search) time, this norm is brought into the score of document as norm(t, d), as shown by the formula in Similarity.
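
A small fragment showing that precision loss, using the static helpers on org.apache.lucene.search.Similarity:

    byte encoded = Similarity.encodeNorm(0.89f);     // pack the float norm into one byte
    float decoded = Similarity.decodeNorm(encoded);  // comes back as 0.75f, not 0.89f
    System.out.println(decoded);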

Understanding the Scoring Formula

This scoring formula is described in the Similarity class. Please take the time to study this formula, as it contains much of the information about how the basics of Lucene scoring work, especially the TermQuery.
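
In rough outline, and paraphrasing the formula documented in the Similarity javadocs (consult them for the exact definition of each factor), the practical scoring function has this shape:

    score(q,d) = coord(q,d) \cdot queryNorm(q) \cdot \sum_{t \in q} \Big( tf(t \in d) \cdot idf(t)^{2} \cdot boost(t) \cdot norm(t,d) \Big)

Here coord() rewards documents that match more of the query's terms, queryNorm() only makes scores comparable across queries, and norm(t,d) carries the index-time boosts and field-length normalization described above.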

The Big Picture

OK, so the tf-idf formula and the Similarity are great for understanding the basics of Lucene scoring, but what really drives Lucene scoring is the use of, and interaction between, the Query classes, as created by each application in response to a user's information need.

In this regard, Lucene offers a wide variety of Query implementations, most of which are in the org.apache.lucene.search package. These implementations can be combined in a wide variety of ways to provide complex querying capabilities along with information about where matches took place in the document collection. The Query section below highlights some of the more important Query classes. For information on the other ones, see the package summary. For details on implementing your own Query class, see Changing your Scoring -- Expert Level below.

Once a Query has been created and submitted to the IndexSearcher, the scoring process begins. (See the Appendix Algorithm section for more notes on the process.) After some infrastructure setup, control finally passes to the Weight implementation and its Scorer instance. In the case of any type of BooleanQuery, scoring is handled by the BooleanWeight2 (link goes to ViewVC BooleanQuery java code, which contains the BooleanWeight2 inner class), unless the Weight#scoresDocsOutOfOrder() method returns true, in which case the BooleanWeight (link goes to ViewVC BooleanQuery java code, which contains the BooleanWeight inner class) from the 1.4 version of Lucene is used instead. See CHANGES.txt under release 1.9 RC1 for more information on choosing which Scorer to use.

Assuming the use of the BooleanWeight2, a BooleanScorer2 is created by bringing together all of the Scorers from the sub-clauses of the BooleanQuery. When the BooleanScorer2 is asked to score, it delegates its work to an internal Scorer based on the type of clauses in the Query. This internal Scorer essentially loops over the sub-scorers and sums the scores provided by each scorer while factoring in the coord() score.

Query Classes

For information on the Query Classes, refer to the search package javadocs

Changing Similarity

One of the ways of changing the scoring characteristics of Lucene is to change the similarity factors. For information on how to do this, see the search package javadocs
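
As a sketch of that non-expert route (the subclass and the factor chosen to override are hypothetical; see DefaultSimilarity for the factors you can override):

    // Ignore term-frequency saturation by flattening the tf() factor.
    public class MySimilarity extends org.apache.lucene.search.DefaultSimilarity {
      public float tf(float freq) {
        return freq > 0 ? 1.0f : 0.0f;   // count a term once no matter how often it repeats
      }
    }

    // Normally the same Similarity is used at both index and search time.
    writer.setSimilarity(new MySimilarity());
    searcher.setSimilarity(new MySimilarity());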

Changing your Scoring -- Expert Level

At a much deeper level, you can affect scoring by implementing your own Query classes (and related scoring classes). To learn more about how to do this, refer to the search package javadocs

Appendix

Algorithm

This section is mostly notes on stepping through the Scoring process and serves as fertilizer for the earlier sections.

In the typical search application, a Query is passed to the Searcher, beginning the scoring process.

Once inside the Searcher, a Collector is used for the scoring and sorting of the search results. These important objects are involved in a search:

  1. The Weight object of the Query. The Weight object is an internal representation of the Query that allows the Query to be reused by the Searcher.
  2. The Searcher that initiated the call.
  3. A Filter for limiting the result set. Note, the Filter may be null.
  4. A Sort object for specifying how to sort the results if the standard score based sort method is not desired.

Assuming we are not sorting (since sorting doesn't affect the raw Lucene score), we call one of the search methods of the Searcher, passing in the Weight object created by Searcher.createWeight(Query), the Filter, and the number of results we want. This method returns a TopDocs object, which is an internal collection of search results. The Searcher creates a TopScoreDocCollector and passes it, along with the Weight and Filter, to another expert search method (for more on the Collector mechanism, see Searcher). The TopScoreDocCollector uses a PriorityQueue to collect the top results for the search.
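
From the application's point of view, that machinery is usually hidden behind a single call, roughly like this sketch (the query is assumed to exist; the Filter may be null):

    // Searcher.search(Query, Filter, int) drives the Weight/Scorer/Collector steps
    // described above and hands back the top hits as a TopDocs.
    TopDocs topDocs = searcher.search(query, null, 25);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      System.out.println("doc=" + scoreDoc.doc + " score=" + scoreDoc.score);
    }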

If a Filter is being used, some initial setup is done to determine which docs to include. Otherwise, we ask the Weight for a Scorer for the IndexReader of the current searcher and we proceed by calling the score method on the Scorer.

At last, we are actually going to score some documents. The score method takes in the Collector (most likely the TopScoreDocCollector or TopFieldCollector) and does its business. Of course, here is where things get involved. The Scorer that is returned by the Weight object depends on what type of Query was submitted. In most real world applications with multiple query terms, the Scorer is going to be a BooleanScorer2 (see the section on customizing your scoring for info on changing this.)

Assuming a BooleanScorer2 scorer, we first initialize the Coordinator, which is used to apply the coord() factor. We then get an internal Scorer based on the required, optional and prohibited parts of the query. Using this internal Scorer, the BooleanScorer2 then proceeds into a while loop based on the Scorer#next() method. The next() method advances to the next document matching the query. This is an abstract method in the Scorer class and is thus overridden by all derived implementations. If you have a simple OR query, your internal Scorer is most likely a DisjunctionSumScorer, which essentially combines the scores from the sub-scorers of the OR'd terms.

 
lucene-2.9.4/docs/demo3.html0000644000175000017500000003252011474320234016302 0ustar janpascaljanpascal Apache Lucene - Building and Installing the Basic Demo
 

Apache Lucene - Building and Installing the Basic Demo

About this Document

This document is intended as a "getting started" guide to installing and running the Lucene web application demo. This guide assumes that you have read the information in the previous two examples. We'll use Tomcat as our reference web container. These demos should work with nearly any container, but you may have to adapt them appropriately.

About the Demos

The Lucene Web Application demo is a template web application intended for deployment on Tomcat or a similar web container. It's NOT designed as a "best practices" implementation by ANY means. It's more of a "hello world" type Lucene Web App. The purpose of this application is to demonstrate Lucene. With that being said, it should be relatively simple to create a small searchable website in Tomcat or a similar application server.

Indexing Files

Once you've gotten this far you're probably itching to go. Let's start by creating the index you'll need for the web examples. Since you've already set your CLASSPATH in the previous examples, all you need to do is type:

    java org.apache.lucene.demo.IndexHTML -create -index {index-dir} ..
You'll need to run this from any subdirectory of your {tomcat}/webapps directory (make sure you didn't leave off the .. or you'll get a null pointer exception). {index-dir} should be a directory that Tomcat has permission to read and write, but that is outside of a web-accessible context. By default the webapp is configured to look in /opt/lucene/index for this index.

Deploying the Demos

Located in your distribution directory you should see a war file called luceneweb.war. If you're working with a Subversion checkout, this will be under the build subdirectory. Copy this to your {tomcat-home}/webapps directory. You may need to restart Tomcat.

Configuration

From your Tomcat directory, look in the webapps/luceneweb subdirectory. If it's not present, try browsing to http://localhost:8080/luceneweb (which causes Tomcat to deploy the webapp), then look again. Edit a file called configuration.jsp. Ensure that the indexLocation is equal to the location you used for your index. You may also customize the appTitle and appFooter strings as you see fit. Once you have finished altering the configuration, you may need to restart Tomcat. You may also wish to update the war file by typing jar -uf luceneweb.war configuration.jsp from the luceneweb subdirectory. (The -u option is not available in all versions of jar; in that case, recreate the war file.)
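
For orientation, the settings mentioned above are plain Java String declarations inside configuration.jsp; the values below are only an illustration, and the exact contents of your copy may differ:

    String appTitle = "Lucene Template Web Application";  // shown in the page header
    String appFooter = "Powered by Apache Lucene";        // shown at the bottom of each page
    String indexLocation = "/opt/lucene/index";           // must point at the index you built above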

Running the Demos

Now you're ready to roll. In your browser, set the URL to http://localhost:8080/luceneweb, enter test and the number of items per page, and press search.

You should now be looking either at a number of results (provided you didn't erase the Tomcat examples) or nothing. If you get an error regarding opening the index, then you probably set the path in configuration.jsp incorrectly or Tomcat doesn't have permissions to the index (or you skipped the step of creating it). Try other search terms. Depending on the number of items per page you set and results returned, there may be a link at the bottom that says More Results>>; clicking it takes you to subsequent pages.

About the code...

If you want to know more about how this web app works or how to customize it then read on>>>.

 
lucene-2.9.4/docs/lucene-contrib/index.html0000644000175000017500000004330611474320233021316 0ustar janpascaljanpascal Apache Lucene - Lucene Contrib
 

Apache Lucene - Lucene Contrib

Lucene Contrib

The Lucene Java project also contains a workspace, Lucene Contrib (formerly known as the Lucene Sandbox), that is open both to all Lucene Java core committers and to developers whose commit rights are restricted to the Contrib workspace; these developers are referred to as "Contrib committers". The Lucene Contrib workspace hosts the following types of packages:

  • Various third party contributions.
  • Contributions with third party dependencies - the Lucene Java core distribution has no external runtime dependencies.
  • New ideas that are intended for eventual inclusion into the Lucene Java core.

Users are free to experiment with the components developed in the Contrib workspace, but Contrib packages will not necessarily be maintained, particularly in their current state. The Lucene Java core backwards compatibility commitments (see http://wiki.apache.org/lucene-java/BackwardsCompatibility) do not necessarily extend to the packages in the Contrib workspace. See the README.txt file for each Contrib package for details. If the README.txt file does not address its backwards compatibility commitments, users should assume it does not make any compatibility commitments.

See Contrib CHANGES for changes included in the current release.

You can access the current trunk Contrib repository at http://svn.apache.org/repos/asf/lucene/java/trunk/contrib/.

analyzers

Contributed Analyzers, Tokenizers, and Filters for various uses and languages.

See analyzers javadoc
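
For example, the language-specific analyzers can be dropped in wherever a core Analyzer is expected. A minimal sketch (the field name and sample text are illustrative):

  // contrib/analyzers: org.apache.lucene.analysis.de.GermanAnalyzer
  Analyzer analyzer = new GermanAnalyzer();
  TokenStream stream = analyzer.tokenStream("body", new StringReader("Häuser und Gärten"));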

ant

Ant task to create Lucene indexes.

See ant javadoc

benchmark

The benchmark contribution contains tools for benchmarking Lucene using standard, freely available corpora.

See benchmark javadoc

collation

CollationKeyFilter/Analyzer and ICUCollationKeyFilter/Analyzer can be used as an efficient replacement for Locale-sensitive sorting and Locale range queries, as well as for Locale-specific normalization.

See collation javadoc
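
A minimal sketch of the intended usage (the Locale and field layout are illustrative; the same analyzer must be used at index time and query time):

  // contrib/collation: org.apache.lucene.collation.CollationKeyAnalyzer
  Collator collator = Collator.getInstance(new Locale("fr", "FR"));
  Analyzer sortAnalyzer = new CollationKeyAnalyzer(collator);
  // index a dedicated sort field with sortAnalyzer, then sort or run range queries against it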

db

Provides integration with Berkeley DB.

See db javadoc

highlighter

A set of classes for highlighting matching terms in search results.

See highlighter javadoc
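
For example, a minimal sketch (the query, analyzer, and stored field text are assumed to exist already; exception handling is omitted):

  // contrib/highlighter: org.apache.lucene.search.highlight
  Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
  TokenStream tokens = analyzer.tokenStream("contents", new StringReader(text));
  String snippet = highlighter.getBestFragment(tokens, text);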

fast-vector-highlighter

An alternative set of classes for highlighting matching terms in search results that relies on stored term vectors. This highlighter can be much faster than the standard highlighter, especially on large fields.

See fast-vector-highlighter javadoc
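
A rough sketch of the intended usage, assuming the "contents" field was indexed with term vectors (positions and offsets) and that query, reader, and docId are already available:

  // contrib/fast-vector-highlighter: org.apache.lucene.search.vectorhighlight
  FastVectorHighlighter fvh = new FastVectorHighlighter();
  FieldQuery fieldQuery = fvh.getFieldQuery(query);
  String fragment = fvh.getBestFragment(fieldQuery, reader, docId, "contents", 100);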

instantiated

RAM-based index that enables much faster searching than RAMDirectory in certain situations.

See instantiated javadoc

lucli

An application that allows Lucene index manipulation from the command line.

See lucli javadoc

memory

High-performance single-document main memory index.

See memory javadoc
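
For example, a minimal sketch (the analyzer and query are assumed to exist; the field name and text are illustrative):

  // contrib/memory: org.apache.lucene.index.memory.MemoryIndex
  MemoryIndex index = new MemoryIndex();
  index.addField("content", "Readings about Salmons and other fish", analyzer);
  float score = index.search(query);   // greater than 0.0 when the query matches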

misc

A variety of miscellaneous contributions, including additional QueryParsers and other alternate Lucene class implementations and tools.

See misc javadoc

queryparser

A new Lucene query parser implementation, which matches the syntax of the core QueryParser but offers a more modular architecture to enable customization.

See queryparser javadoc
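
A sketch of the intended usage (the analyzer, default field name, and query string are illustrative, and this does not cover the parser's configuration options):

  // contrib/queryparser: org.apache.lucene.queryParser.standard
  StandardQueryParser parser = new StandardQueryParser(analyzer);
  Query query = parser.parse("+lucene +contrib", "contents");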

regex

Queries with additional regex matching capabilities.

See regex javadoc
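
For example (the field name and pattern are illustrative; searcher is an open IndexSearcher):

  // contrib/regex: org.apache.lucene.search.regex
  RegexQuery query = new RegexQuery(new Term("url", "http.*lucene.*"));
  TopDocs hits = searcher.search(query, 10);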

remote

Classes to help use Lucene with RMI.

See remote javadoc
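
A rough sketch of exporting a searcher over RMI (an RMI registry is assumed to be running, the bind name is illustrative, and exception handling is omitted):

  // contrib/remote: org.apache.lucene.search.RemoteSearchable
  Searchable local = new IndexSearcher(directory, true);
  RemoteSearchable server = new RemoteSearchable(local);
  Naming.rebind("//localhost/LuceneSearchable", server);
  // a client can then do: Searchable remote = (Searchable) Naming.lookup("//localhost/LuceneSearchable");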

snowball

Pre-compiled versions of the Snowball stemmers for Lucene.

See snowball javadoc
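
For example, a minimal sketch (the stemmer name follows the Snowball convention, e.g. "English" or "German"):

  // contrib/snowball: org.apache.lucene.analysis.snowball.SnowballAnalyzer
  Analyzer analyzer = new SnowballAnalyzer("English");
  // use the same analyzer for indexing and for parsing queries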

spatial

Classes to help with efficient distance-based sorting.

See spatial javadoc

spellchecker

Provides tools for spellchecking and suggestions with Lucene.

See spellchecker javadoc
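
For example, a minimal sketch (spellIndexDirectory and reader are assumed to exist; the field name is illustrative, and exception handling is omitted):

  // contrib/spellchecker: org.apache.lucene.search.spell
  SpellChecker spellChecker = new SpellChecker(spellIndexDirectory);
  spellChecker.indexDictionary(new LuceneDictionary(reader, "contents"));
  String[] suggestions = spellChecker.suggestSimilar("lucen", 5);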

surround

A QueryParser that supports the Span family of queries and accepts both prefix and infix notation.

See surround javadoc

swing

Swing components designed to integrate with Lucene.

See swing javadoc

wikipedia

Tools for working with Wikipedia content.

See wikipedia javadoc

wordnet

Tools to help utilize WordNet synonyms with Lucene.

See wordnet javadoc

xml-query-parser

A QueryParser that can read queries written in an XML format.

See xml-query-parser javadoc
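
A sketch of the intended usage (the default field, analyzer, and query.xml file are illustrative; exception handling is omitted):

  // contrib/xml-query-parser: org.apache.lucene.xmlparser.CoreParser
  CoreParser parser = new CoreParser("contents", analyzer);
  Query query = parser.parse(new FileInputStream("query.xml"));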

 
lucene-2.9.4/docs/broken-links.xml0000644000175000017500000000003711474320234017523 0ustar janpascaljanpascal lucene-2.9.4/docs/contributions.html0000644000175000017500000005216311474320234020202 0ustar janpascaljanpascal Apache Lucene - Contributions
 

Apache Lucene - Contributions

Overview

This page lists external Lucene resources. If you have written something that should be included, please post all relevant information to one of the mailing lists. Nothing listed here is directly supported by the Lucene developers, so if you encounter any problems with any of this software, please use the author's contact information to get help.

If you are looking for information on contributing patches or other improvements to Lucene, see How To Contribute on the Lucene Wiki.

Lucene Tools

Software that works with Lucene indices.

Luke

URL http://www.getopt.org/luke/
author Andrzej Bialecki

LIMO (Lucene Index Monitor)

URL http://limo.sf.net/
author Julien Nioche

Lucene Document Converters

Lucene requires that the information you want to index be converted into instances of its Document class. Here are contributed solutions that convert different content types into Lucene Documents, as sketched below.
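
For example, a converter typically builds a Document along these lines (the field names, the file variable, and the open IndexWriter writer are illustrative):

  Document doc = new Document();
  doc.add(new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));
  doc.add(new Field("contents", new FileReader(file)));   // tokenized from the reader, not stored
  writer.addDocument(doc);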

XML Document #1

URL http://marc.theaimsgroup.com/?l=lucene-dev&m=100723333506246&w=2
author Philip Ogren - ogren@mayo.edu

XML Document #2

URL http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00346.html
author Peter Carlson - carlson@bookandhammer.com

PDF Box

URL http://www.pdfbox.org/
author Ben Litchfield - ben@csh.rit.edu

XPDF - PDF Document Conversion

URL http://www.foolabs.com/xpdf
author N/A

PDFTextStream - PDF text and metadata extraction

URL http://snowtide.com
author N/A

PJ Classic & PJ Professional - PDF Document Conversion

URL http://www.etymon.com/
author N/A

Miscellaneous

Arabic Analyzer for Java

URL http://savannah.nongnu.org/projects/aramorph
author Pierrick Brihaye

Phonetix

URL http://www.companywebstore.de/tangentum/mirror/en/products/phonetix/index.html
author tangentum technologies

ejIndex - JBoss MBean for Lucene

URL http://ejindex.sourceforge.net/
author Andy Scholz

JavaCC

URL https://javacc.dev.java.net/
author Sun Microsystems (java.net)
 
lucene-2.9.4/docs/demo3.pdf0000644000175000017500000002414411474320234016112 0ustar janpascaljanpascal
lucene-2.9.4/docs/skin/0000755000175000017500000000000011554106561015353 5ustar 
janpascaljanpascallucene-2.9.4/docs/skin/scripts/0000755000175000017500000000000011474320233017035 5ustar janpascaljanpascallucene-2.9.4/docs/skin/print.css0000644000175000017500000000232611474320233017217 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ body { font-family: Georgia, Palatino, serif; font-size: 12pt; background: white; } #tabs, #menu, #content .toc { display: none; } #content { width: auto; padding: 0; float: none !important; color: black; background: inherit; } a:link, a:visited { color: #336699; background: inherit; text-decoration: underline; } #top .logo { padding: 0; margin: 0 0 2em 0; } #footer { margin-top: 4em; } acronym { border: 0; } lucene-2.9.4/docs/skin/translations/0000755000175000017500000000000011474320233020067 5ustar janpascaljanpascallucene-2.9.4/docs/skin/CommonMessages_en_US.xml0000644000175000017500000000206011474320233022077 0ustar janpascaljanpascal Font size: Last Published: Search Search site with lucene-2.9.4/docs/skin/profile.css.xslt0000644000175000017500000001720511474320233020516 0ustar janpascaljanpascal #top { background-color: ;} #top .header .current { background-color: ;} #top .header .current a:link { color: ; } #top .header .current a:visited { color: ; } #top .header .current a:hover { color: ; } #tabs li { background-color: ;} #tabs li a:link { color: ; } #tabs li a:visited { color: ; } #tabs li a:hover { color: ; } #level2tabs { background-color: ;} #level2tabs a:link { color: ; } #level2tabs a:visited { color: ; } #level2tabs a:hover { color: ; } .heading { background-color: ;} .boxed { background-color: ;} .underlined_5 {border-bottom: solid 5px ;} .underlined_10 {border-bottom: solid 10px ;} table caption { background-color: ; color: ; } #feedback { color: ; background: ; text-align: ; } #feedback #feedbackto { color: ; } #main .breadtrail { background: ; color: ; } #main .breadtrail a:link { color: ; } #main .breadtrail a:visited { color: ; } #main .breadtrail a:hover { color: ; } #top .breadtrail { background: ; color: ; } #top .breadtrail a:link { color: ; } #top .breadtrail a:visited { color: ; } #top .breadtrail a:hover { color: ; } #publishedStrip { color: ; background: ; } #publishedStrip { color: ; background: ; } #menu .menupagetitle { background-color: } #menu { border-color: ;} #menu .menupagetitle { border-color: ;} #menu .menupageitemgroup { border-color: ;} #menu { background-color: ;} #menu { color: ;} #menu a:link { color: ;} #menu a:visited { color: ;} #menu a:hover { background-color: ; color: ;} #menu .menupagetitle { color: ;} #menu .menupageitemgroup { background-color: ; } #menu .menupageitem { color: ; } #menu .menupageitem a:link { color: ;} #menu .menupageitem a:visited { color: ;} #menu .menupageitem a:hover { background-color: ; color: ; } #menu h1 { color: ; 
background-color: ; } #top .searchbox { background-color: ; color: ; } body{ background-color: ; color: ; } a:link { color:} a:visited { color:} a:hover { color:} #footer { background-color: ;} .highlight { background-color: ;} .fixme { border-color: ;} .note { border-color: ;} .warning { border-color: ;} .code { border-color: ;} .ForrestTable { background-color: ;} .ForrestTable td { background-color: ;} lucene-2.9.4/docs/skin/CommonMessages_es.xml0000644000175000017500000000206511474320233021502 0ustar janpascaljanpascal Tamaño del texto: Fecha de publicación: Buscar Buscar en lucene-2.9.4/docs/skin/getBlank.js0000644000175000017500000000310511474320233017432 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * getBlank script - when included in a html file and called from a form text field, will set the value of this field to "" * if the text value is still the standard value. * getPrompt script - when included in a html file and called from a form text field, will set the value of this field to the prompt * if the text value is empty. * * Typical usage: * * */ lucene-2.9.4/docs/skin/note.txt0000644000175000017500000000255011474320233017056 0ustar janpascaljanpascalNotes for developer: --Legend------------------- TODO -> blocker DONE -> blocker ToDo -> enhancement bug done -> enhancement bug --Issues------------------- - the corner images should be rendered through svg with the header color. -> DONE -> ToDo: get rid of the images and use only divs! - the menu points should be displayed "better". -> DONE -- Use the krysalis-site menu approach for the overall menu display. -> DONE -- Use the old lenya innermenu approch to further enhance the menu . -> DONE - the content area needs some attention. -> DONE -- introduce the heading scheme from krysalis () -> DONE -> ToDo: make box with round corners -> done: make underlined with variable border height -> ToDo: make underline with bottom round corner -- introduce the toc for each html-page -> DONE -- introduce the external-link-images. -> DONE - the publish note should be where now only a border is. Like
-> DONE , but make it configurable. -> DONE - footer needs some attention -> DONE -- the footer do not have the color profile! Enable it! -> DONE -- the footer should as well contain a feedback link. See http://issues.apache.org/eyebrowse/ReadMsg?listName=forrest-user@xml.apache.org&msgNo=71 -> DONE - introduce credits alternativ location -> DONE - border for published / breadtrail / menu /tab divs -> ToDolucene-2.9.4/docs/skin/CommonMessages_de.xml0000644000175000017500000000210211474320233021453 0ustar janpascaljanpascal Schriftgrösse: Zuletzt veröffentlicht: Suche: Suche auf der Seite mit lucene-2.9.4/docs/skin/profile.css0000644000175000017500000001034211474320233017520 0ustar janpascaljanpascal /* ==================== aural ============================ */ @media aural { h1, h2, h3, h4, h5, h6 { voice-family: paul, male; stress: 20; richness: 90 } h1 { pitch: x-low; pitch-range: 90 } h2 { pitch: x-low; pitch-range: 80 } h3 { pitch: low; pitch-range: 70 } h4 { pitch: medium; pitch-range: 60 } h5 { pitch: medium; pitch-range: 50 } h6 { pitch: medium; pitch-range: 40 } li, dt, dd { pitch: medium; richness: 60 } dt { stress: 80 } pre, code, tt { pitch: medium; pitch-range: 0; stress: 0; richness: 80 } em { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } strong { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } dfn { pitch: high; pitch-range: 60; stress: 60 } s, strike { richness: 0 } i { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } b { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } u { richness: 0 } :link { voice-family: harry, male } :visited { voice-family: betty, female } :active { voice-family: betty, female; pitch-range: 80; pitch: x-high } } a.external { padding: 0 20px 0px 0px; display:inline; background-repeat: no-repeat; background-position: center right; background-image: url(images/external-link.gif); } #top { background-color: #FFFFFF;} #top .header .current { background-color: #4C6C8F;} #top .header .current a:link { color: #ffffff; } #top .header .current a:visited { color: #ffffff; } #top .header .current a:hover { color: #ffffff; } #tabs li { background-color: #E5E4D9 ;} #tabs li a:link { color: #000000; } #tabs li a:visited { color: #000000; } #tabs li a:hover { color: #000000; } #level2tabs a.selected { background-color: #4C6C8F ;} #level2tabs a:link { color: #ffffff; } #level2tabs a:visited { color: #ffffff; } #level2tabs a:hover { color: #ffffff; } #level2tabs { background-color: #E5E4D9;} #level2tabs a.unselected:link { color: #000000; } #level2tabs a.unselected:visited { color: #000000; } #level2tabs a.unselected:hover { color: #000000; } .heading { background-color: #E5E4D9;} .boxed { background-color: #E5E4D9;} .underlined_5 {border-bottom: solid 5px #E5E4D9;} .underlined_10 {border-bottom: solid 10px #E5E4D9;} table caption { background-color: #E5E4D9; color: #000000; } #feedback { color: #FFFFFF; background: #4C6C8F; text-align: center; } #feedback #feedbackto { color: #FFFFFF; } #publishedStrip { color: #FFFFFF; background: #4C6C8F; } #publishedStrip { color: #000000; background: #E5E4D9; } #menu .menupagetitle { background-color: #CFDCED; color: #000000;} #menu { border-color: #999999;} #menu .menupagetitle { border-color: #999999;} #menu .menupageitemgroup { border-color: #999999;} #menu { background-color: #4C6C8F;} #menu { color: #ffffff;} #menu a:link { color: #ffffff;} #menu a:visited { color: #ffffff;} #menu a:hover { background-color: #4C6C8F; color: #ffffff;} #menu h1 { color: #000000; background-color: 
#cfdced; } #top .searchbox { background-color: #E5E4D9 ; color: #000000; } #menu .menupageitemgroup { background-color: #E5E4D9; } #menu .menupageitem { color: #000000; } #menu .menupageitem a:link { color: #000000;} #menu .menupageitem a:visited { color: #000000;} #menu .menupageitem a:hover { background-color: #E5E4D9; color: #000000; } body{ background-color: #ffffff; color: #000000; } a:link { color:#0000ff} a:visited { color:#009999} a:hover { color:#6587ff} .ForrestTable { background-color: #ccc;} .ForrestTable td { background-color: #ffffff;} .highlight { background-color: #ffff00;} .fixme { border-color: #c60;} .note { border-color: #069;} .warning { border-color: #900;} .code { border-color: #a5b6c6;} #footer { background-color: #E5E4D9;} /* extra-css */ p.quote { margin-left: 2em; padding: .5em; background-color: #f0f0f0; font-family: monospace; } img.float-right { float: right; margin-left: 2em; padding: .5em; } #footer a { color: #0F3660; } #footer a:visited { color: #009999; } pre.code { margin-left: 2em; margin-right: 2em; padding: 0.5em; background-color: #f0f0f0; } lucene-2.9.4/docs/skin/CommonMessages_fr.xml0000644000175000017500000000210311474320233021473 0ustar janpascaljanpascal Taille : Dernière publication : Rechercher Rechercher sur le site avec lucene-2.9.4/docs/skin/prototype.js0000644000175000017500000010020611474320233017750 0ustar janpascaljanpascal/* Prototype JavaScript framework, version 1.4.0_pre4 * (c) 2005 Sam Stephenson * * THIS FILE IS AUTOMATICALLY GENERATED. When sending patches, please diff * against the source tree, available from the Prototype darcs repository. * * Prototype is freely distributable under the terms of an MIT-style license. * * For details, see the Prototype web site: http://prototype.conio.net/ * /*--------------------------------------------------------------------------*/ var Prototype = { Version: '1.4.0_pre4', emptyFunction: function() {}, K: function(x) {return x} } var Class = { create: function() { return function() { this.initialize.apply(this, arguments); } } } var Abstract = new Object(); Object.extend = function(destination, source) { for (property in source) { destination[property] = source[property]; } return destination; } Function.prototype.bind = function(object) { var __method = this; return function() { return __method.apply(object, arguments); } } Function.prototype.bindAsEventListener = function(object) { var __method = this; return function(event) { return __method.call(object, event || window.event); } } Number.prototype.toColorPart = function() { var digits = this.toString(16); if (this < 16) return '0' + digits; return digits; } var Try = { these: function() { var returnValue; for (var i = 0; i < arguments.length; i++) { var lambda = arguments[i]; try { returnValue = lambda(); break; } catch (e) {} } return returnValue; } } /*--------------------------------------------------------------------------*/ var PeriodicalExecuter = Class.create(); PeriodicalExecuter.prototype = { initialize: function(callback, frequency) { this.callback = callback; this.frequency = frequency; this.currentlyExecuting = false; this.registerCallback(); }, registerCallback: function() { setInterval(this.onTimerEvent.bind(this), this.frequency * 1000); }, onTimerEvent: function() { if (!this.currentlyExecuting) { try { this.currentlyExecuting = true; this.callback(); } finally { this.currentlyExecuting = false; } } } } /*--------------------------------------------------------------------------*/ function $() { var elements = new 
Array(); for (var i = 0; i < arguments.length; i++) { var element = arguments[i]; if (typeof element == 'string') element = document.getElementById(element); if (arguments.length == 1) return element; elements.push(element); } return elements; } if (!Array.prototype.push) { Array.prototype.push = function() { var startLength = this.length; for (var i = 0; i < arguments.length; i++) this[startLength + i] = arguments[i]; return this.length; } } if (!Function.prototype.apply) { // Based on code from http://www.youngpup.net/ Function.prototype.apply = function(object, parameters) { var parameterStrings = new Array(); if (!object) object = window; if (!parameters) parameters = new Array(); for (var i = 0; i < parameters.length; i++) parameterStrings[i] = 'parameters[' + i + ']'; object.__apply__ = this; var result = eval('object.__apply__(' + parameterStrings.join(', ') + ')'); object.__apply__ = null; return result; } } Object.extend(String.prototype, { stripTags: function() { return this.replace(/<\/?[^>]+>/gi, ''); }, escapeHTML: function() { var div = document.createElement('div'); var text = document.createTextNode(this); div.appendChild(text); return div.innerHTML; }, unescapeHTML: function() { var div = document.createElement('div'); div.innerHTML = this.stripTags(); return div.childNodes[0].nodeValue; }, parseQuery: function() { var str = this; if (str.substring(0,1) == '?') { str = this.substring(1); } var result = {}; var pairs = str.split('&'); for (var i = 0; i < pairs.length; i++) { var pair = pairs[i].split('='); result[pair[0]] = pair[1]; } return result; } }); var _break = new Object(); var _continue = new Object(); var Enumerable = { each: function(iterator) { var index = 0; try { this._each(function(value) { try { iterator(value, index++); } catch (e) { if (e != _continue) throw e; } }); } catch (e) { if (e != _break) throw e; } }, all: function(iterator) { var result = true; this.each(function(value, index) { if (!(result &= (iterator || Prototype.K)(value, index))) throw _break; }); return result; }, any: function(iterator) { var result = true; this.each(function(value, index) { if (result &= (iterator || Prototype.K)(value, index)) throw _break; }); return result; }, collect: function(iterator) { var results = []; this.each(function(value, index) { results.push(iterator(value, index)); }); return results; }, detect: function (iterator) { var result; this.each(function(value, index) { if (iterator(value, index)) { result = value; throw _break; } }); return result; }, findAll: function(iterator) { var results = []; this.each(function(value, index) { if (iterator(value, index)) results.push(value); }); return results; }, grep: function(pattern, iterator) { var results = []; this.each(function(value, index) { var stringValue = value.toString(); if (stringValue.match(pattern)) results.push((iterator || Prototype.K)(value, index)); }) return results; }, include: function(object) { var found = false; this.each(function(value) { if (value == object) { found = true; throw _break; } }); return found; }, inject: function(memo, iterator) { this.each(function(value, index) { memo = iterator(memo, value, index); }); return memo; }, invoke: function(method) { var args = $A(arguments).slice(1); return this.collect(function(value) { return value[method].apply(value, args); }); }, max: function(iterator) { var result; this.each(function(value, index) { value = (iterator || Prototype.K)(value, index); if (value >= (result || value)) result = value; }); return result; }, min: 
function(iterator) { var result; this.each(function(value, index) { value = (iterator || Prototype.K)(value, index); if (value <= (result || value)) result = value; }); return result; }, partition: function(iterator) { var trues = [], falses = []; this.each(function(value, index) { ((iterator || Prototype.K)(value, index) ? trues : falses).push(value); }); return [trues, falses]; }, pluck: function(property) { var results = []; this.each(function(value, index) { results.push(value[property]); }); return results; }, reject: function(iterator) { var results = []; this.each(function(value, index) { if (!iterator(value, index)) results.push(value); }); return results; }, sortBy: function(iterator) { return this.collect(function(value, index) { return {value: value, criteria: iterator(value, index)}; }).sort(function(left, right) { var a = left.criteria, b = right.criteria; return a < b ? -1 : a > b ? 1 : 0; }).pluck('value'); }, toArray: function() { return this.collect(Prototype.K); }, zip: function() { var iterator = Prototype.K, args = $A(arguments); if (typeof args.last() == 'function') iterator = args.pop(); var collections = [this].concat(args).map($A); return this.map(function(value, index) { iterator(value = collections.pluck(index)); return value; }); } } Object.extend(Enumerable, { map: Enumerable.collect, find: Enumerable.detect, select: Enumerable.findAll, member: Enumerable.include, entries: Enumerable.toArray }); $A = Array.from = function(iterable) { var results = []; for (var i = 0; i < iterable.length; i++) results.push(iterable[i]); return results; } Object.extend(Array.prototype, { _each: function(iterator) { for (var i = 0; i < this.length; i++) iterator(this[i]); }, first: function() { return this[0]; }, last: function() { return this[this.length - 1]; } }); Object.extend(Array.prototype, Enumerable); var Ajax = { getTransport: function() { return Try.these( function() {return new ActiveXObject('Msxml2.XMLHTTP')}, function() {return new ActiveXObject('Microsoft.XMLHTTP')}, function() {return new XMLHttpRequest()} ) || false; } } Ajax.Base = function() {}; Ajax.Base.prototype = { setOptions: function(options) { this.options = { method: 'post', asynchronous: true, parameters: '' } Object.extend(this.options, options || {}); }, responseIsSuccess: function() { return this.transport.status == undefined || this.transport.status == 0 || (this.transport.status >= 200 && this.transport.status < 300); }, responseIsFailure: function() { return !this.responseIsSuccess(); } } Ajax.Request = Class.create(); Ajax.Request.Events = ['Uninitialized', 'Loading', 'Loaded', 'Interactive', 'Complete']; Ajax.Request.prototype = Object.extend(new Ajax.Base(), { initialize: function(url, options) { this.transport = Ajax.getTransport(); this.setOptions(options); this.request(url); }, request: function(url) { var parameters = this.options.parameters || ''; if (parameters.length > 0) parameters += '&_='; try { if (this.options.method == 'get') url += '?' + parameters; this.transport.open(this.options.method, url, this.options.asynchronous); if (this.options.asynchronous) { this.transport.onreadystatechange = this.onStateChange.bind(this); setTimeout((function() {this.respondToReadyState(1)}).bind(this), 10); } this.setRequestHeaders(); var body = this.options.postBody ? this.options.postBody : parameters; this.transport.send(this.options.method == 'post' ? 
body : null); } catch (e) { } }, setRequestHeaders: function() { var requestHeaders = ['X-Requested-With', 'XMLHttpRequest', 'X-Prototype-Version', Prototype.Version]; if (this.options.method == 'post') { requestHeaders.push('Content-type', 'application/x-www-form-urlencoded'); /* Force "Connection: close" for Mozilla browsers to work around * a bug where XMLHttpReqeuest sends an incorrect Content-length * header. See Mozilla Bugzilla #246651. */ if (this.transport.overrideMimeType) requestHeaders.push('Connection', 'close'); } if (this.options.requestHeaders) requestHeaders.push.apply(requestHeaders, this.options.requestHeaders); for (var i = 0; i < requestHeaders.length; i += 2) this.transport.setRequestHeader(requestHeaders[i], requestHeaders[i+1]); }, onStateChange: function() { var readyState = this.transport.readyState; if (readyState != 1) this.respondToReadyState(this.transport.readyState); }, respondToReadyState: function(readyState) { var event = Ajax.Request.Events[readyState]; if (event == 'Complete') (this.options['on' + this.transport.status] || this.options['on' + (this.responseIsSuccess() ? 'Success' : 'Failure')] || Prototype.emptyFunction)(this.transport); (this.options['on' + event] || Prototype.emptyFunction)(this.transport); /* Avoid memory leak in MSIE: clean up the oncomplete event handler */ if (event == 'Complete') this.transport.onreadystatechange = Prototype.emptyFunction; } }); Ajax.Updater = Class.create(); Ajax.Updater.ScriptFragment = '(?:)((\n|.)*?)(?:<\/script>)'; Object.extend(Object.extend(Ajax.Updater.prototype, Ajax.Request.prototype), { initialize: function(container, url, options) { this.containers = { success: container.success ? $(container.success) : $(container), failure: container.failure ? $(container.failure) : (container.success ? null : $(container)) } this.transport = Ajax.getTransport(); this.setOptions(options); var onComplete = this.options.onComplete || Prototype.emptyFunction; this.options.onComplete = (function() { this.updateContent(); onComplete(this.transport); }).bind(this); this.request(url); }, updateContent: function() { var receiver = this.responseIsSuccess() ? 
this.containers.success : this.containers.failure; var match = new RegExp(Ajax.Updater.ScriptFragment, 'img'); var response = this.transport.responseText.replace(match, ''); var scripts = this.transport.responseText.match(match); if (receiver) { if (this.options.insertion) { new this.options.insertion(receiver, response); } else { receiver.innerHTML = response; } } if (this.responseIsSuccess()) { if (this.onComplete) setTimeout((function() {this.onComplete( this.transport)}).bind(this), 10); } if (this.options.evalScripts && scripts) { match = new RegExp(Ajax.Updater.ScriptFragment, 'im'); setTimeout((function() { for (var i = 0; i < scripts.length; i++) eval(scripts[i].match(match)[1]); }).bind(this), 10); } } }); Ajax.PeriodicalUpdater = Class.create(); Ajax.PeriodicalUpdater.prototype = Object.extend(new Ajax.Base(), { initialize: function(container, url, options) { this.setOptions(options); this.onComplete = this.options.onComplete; this.frequency = (this.options.frequency || 2); this.decay = 1; this.updater = {}; this.container = container; this.url = url; this.start(); }, start: function() { this.options.onComplete = this.updateComplete.bind(this); this.onTimerEvent(); }, stop: function() { this.updater.onComplete = undefined; clearTimeout(this.timer); (this.onComplete || Ajax.emptyFunction).apply(this, arguments); }, updateComplete: function(request) { if (this.options.decay) { this.decay = (request.responseText == this.lastText ? this.decay * this.options.decay : 1); this.lastText = request.responseText; } this.timer = setTimeout(this.onTimerEvent.bind(this), this.decay * this.frequency * 1000); }, onTimerEvent: function() { this.updater = new Ajax.Updater(this.container, this.url, this.options); } }); document.getElementsByClassName = function(className) { var children = document.getElementsByTagName('*') || document.all; var elements = new Array(); for (var i = 0; i < children.length; i++) { var child = children[i]; var classNames = child.className.split(' '); for (var j = 0; j < classNames.length; j++) { if (classNames[j] == className) { elements.push(child); break; } } } return elements; } /*--------------------------------------------------------------------------*/ if (!window.Element) { var Element = new Object(); } Object.extend(Element, { toggle: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = (element.style.display == 'none' ? 
'' : 'none'); } }, hide: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = 'none'; } }, show: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = ''; } }, remove: function(element) { element = $(element); element.parentNode.removeChild(element); }, getHeight: function(element) { element = $(element); return element.offsetHeight; }, hasClassName: function(element, className) { element = $(element); if (!element) return; var a = element.className.split(' '); for (var i = 0; i < a.length; i++) { if (a[i] == className) return true; } return false; }, addClassName: function(element, className) { element = $(element); Element.removeClassName(element, className); element.className += ' ' + className; }, removeClassName: function(element, className) { element = $(element); if (!element) return; var newClassName = ''; var a = element.className.split(' '); for (var i = 0; i < a.length; i++) { if (a[i] != className) { if (i > 0) newClassName += ' '; newClassName += a[i]; } } element.className = newClassName; }, // removes whitespace-only text node children cleanWhitespace: function(element) { var element = $(element); for (var i = 0; i < element.childNodes.length; i++) { var node = element.childNodes[i]; if (node.nodeType == 3 && !/\S/.test(node.nodeValue)) Element.remove(node); } } }); var Toggle = new Object(); Toggle.display = Element.toggle; /*--------------------------------------------------------------------------*/ Abstract.Insertion = function(adjacency) { this.adjacency = adjacency; } Abstract.Insertion.prototype = { initialize: function(element, content) { this.element = $(element); this.content = content; if (this.adjacency && this.element.insertAdjacentHTML) { this.element.insertAdjacentHTML(this.adjacency, this.content); } else { this.range = this.element.ownerDocument.createRange(); if (this.initializeRange) this.initializeRange(); this.fragment = this.range.createContextualFragment(this.content); this.insertContent(); } } } var Insertion = new Object(); Insertion.Before = Class.create(); Insertion.Before.prototype = Object.extend(new Abstract.Insertion('beforeBegin'), { initializeRange: function() { this.range.setStartBefore(this.element); }, insertContent: function() { this.element.parentNode.insertBefore(this.fragment, this.element); } }); Insertion.Top = Class.create(); Insertion.Top.prototype = Object.extend(new Abstract.Insertion('afterBegin'), { initializeRange: function() { this.range.selectNodeContents(this.element); this.range.collapse(true); }, insertContent: function() { this.element.insertBefore(this.fragment, this.element.firstChild); } }); Insertion.Bottom = Class.create(); Insertion.Bottom.prototype = Object.extend(new Abstract.Insertion('beforeEnd'), { initializeRange: function() { this.range.selectNodeContents(this.element); this.range.collapse(this.element); }, insertContent: function() { this.element.appendChild(this.fragment); } }); Insertion.After = Class.create(); Insertion.After.prototype = Object.extend(new Abstract.Insertion('afterEnd'), { initializeRange: function() { this.range.setStartAfter(this.element); }, insertContent: function() { this.element.parentNode.insertBefore(this.fragment, this.element.nextSibling); } }); var Field = { clear: function() { for (var i = 0; i < arguments.length; i++) $(arguments[i]).value = ''; }, focus: function(element) { $(element).focus(); }, present: function() { for (var i = 0; i < arguments.length; 
i++) if ($(arguments[i]).value == '') return false; return true; }, select: function(element) { $(element).select(); }, activate: function(element) { $(element).focus(); $(element).select(); } } /*--------------------------------------------------------------------------*/ var Form = { serialize: function(form) { var elements = Form.getElements($(form)); var queryComponents = new Array(); for (var i = 0; i < elements.length; i++) { var queryComponent = Form.Element.serialize(elements[i]); if (queryComponent) queryComponents.push(queryComponent); } return queryComponents.join('&'); }, getElements: function(form) { var form = $(form); var elements = new Array(); for (tagName in Form.Element.Serializers) { var tagElements = form.getElementsByTagName(tagName); for (var j = 0; j < tagElements.length; j++) elements.push(tagElements[j]); } return elements; }, getInputs: function(form, typeName, name) { var form = $(form); var inputs = form.getElementsByTagName('input'); if (!typeName && !name) return inputs; var matchingInputs = new Array(); for (var i = 0; i < inputs.length; i++) { var input = inputs[i]; if ((typeName && input.type != typeName) || (name && input.name != name)) continue; matchingInputs.push(input); } return matchingInputs; }, disable: function(form) { var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; element.blur(); element.disabled = 'true'; } }, enable: function(form) { var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; element.disabled = ''; } }, focusFirstElement: function(form) { var form = $(form); var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; if (element.type != 'hidden' && !element.disabled) { Field.activate(element); break; } } }, reset: function(form) { $(form).reset(); } } Form.Element = { serialize: function(element) { var element = $(element); var method = element.tagName.toLowerCase(); var parameter = Form.Element.Serializers[method](element); if (parameter) return encodeURIComponent(parameter[0]) + '=' + encodeURIComponent(parameter[1]); }, getValue: function(element) { var element = $(element); var method = element.tagName.toLowerCase(); var parameter = Form.Element.Serializers[method](element); if (parameter) return parameter[1]; } } Form.Element.Serializers = { input: function(element) { switch (element.type.toLowerCase()) { case 'submit': case 'hidden': case 'password': case 'text': return Form.Element.Serializers.textarea(element); case 'checkbox': case 'radio': return Form.Element.Serializers.inputSelector(element); } return false; }, inputSelector: function(element) { if (element.checked) return [element.name, element.value]; }, textarea: function(element) { return [element.name, element.value]; }, select: function(element) { var value = ''; if (element.type == 'select-one') { var index = element.selectedIndex; if (index >= 0) value = element.options[index].value || element.options[index].text; } else { value = new Array(); for (var i = 0; i < element.length; i++) { var opt = element.options[i]; if (opt.selected) value.push(opt.value || opt.text); } } return [element.name, value]; } } /*--------------------------------------------------------------------------*/ var $F = Form.Element.getValue; /*--------------------------------------------------------------------------*/ Abstract.TimedObserver = function() {} Abstract.TimedObserver.prototype = { initialize: function(element, 
frequency, callback) { this.frequency = frequency; this.element = $(element); this.callback = callback; this.lastValue = this.getValue(); this.registerCallback(); }, registerCallback: function() { setInterval(this.onTimerEvent.bind(this), this.frequency * 1000); }, onTimerEvent: function() { var value = this.getValue(); if (this.lastValue != value) { this.callback(this.element, value); this.lastValue = value; } } } Form.Element.Observer = Class.create(); Form.Element.Observer.prototype = Object.extend(new Abstract.TimedObserver(), { getValue: function() { return Form.Element.getValue(this.element); } }); Form.Observer = Class.create(); Form.Observer.prototype = Object.extend(new Abstract.TimedObserver(), { getValue: function() { return Form.serialize(this.element); } }); /*--------------------------------------------------------------------------*/ Abstract.EventObserver = function() {} Abstract.EventObserver.prototype = { initialize: function(element, callback) { this.element = $(element); this.callback = callback; this.lastValue = this.getValue(); if (this.element.tagName.toLowerCase() == 'form') this.registerFormCallbacks(); else this.registerCallback(this.element); }, onElementEvent: function() { var value = this.getValue(); if (this.lastValue != value) { this.callback(this.element, value); this.lastValue = value; } }, registerFormCallbacks: function() { var elements = Form.getElements(this.element); for (var i = 0; i < elements.length; i++) this.registerCallback(elements[i]); }, registerCallback: function(element) { if (element.type) { switch (element.type.toLowerCase()) { case 'checkbox': case 'radio': element.target = this; element.prev_onclick = element.onclick || Prototype.emptyFunction; element.onclick = function() { this.prev_onclick(); this.target.onElementEvent(); } break; case 'password': case 'text': case 'textarea': case 'select-one': case 'select-multiple': element.target = this; element.prev_onchange = element.onchange || Prototype.emptyFunction; element.onchange = function() { this.prev_onchange(); this.target.onElementEvent(); } break; } } } } Form.Element.EventObserver = Class.create(); Form.Element.EventObserver.prototype = Object.extend(new Abstract.EventObserver(), { getValue: function() { return Form.Element.getValue(this.element); } }); Form.EventObserver = Class.create(); Form.EventObserver.prototype = Object.extend(new Abstract.EventObserver(), { getValue: function() { return Form.serialize(this.element); } }); if (!window.Event) { var Event = new Object(); } Object.extend(Event, { KEY_BACKSPACE: 8, KEY_TAB: 9, KEY_RETURN: 13, KEY_ESC: 27, KEY_LEFT: 37, KEY_UP: 38, KEY_RIGHT: 39, KEY_DOWN: 40, KEY_DELETE: 46, element: function(event) { return event.target || event.srcElement; }, isLeftClick: function(event) { return (((event.which) && (event.which == 1)) || ((event.button) && (event.button == 1))); }, pointerX: function(event) { return event.pageX || (event.clientX + (document.documentElement.scrollLeft || document.body.scrollLeft)); }, pointerY: function(event) { return event.pageY || (event.clientY + (document.documentElement.scrollTop || document.body.scrollTop)); }, stop: function(event) { if (event.preventDefault) { event.preventDefault(); event.stopPropagation(); } else { event.returnValue = false; } }, // find the first node with the given tagName, starting from the // node the event was triggered on; traverses the DOM upwards findElement: function(event, tagName) { var element = Event.element(event); while (element.parentNode && (!element.tagName || 
(element.tagName.toUpperCase() != tagName.toUpperCase()))) element = element.parentNode; return element; }, observers: false, _observeAndCache: function(element, name, observer, useCapture) { if (!this.observers) this.observers = []; if (element.addEventListener) { this.observers.push([element, name, observer, useCapture]); element.addEventListener(name, observer, useCapture); } else if (element.attachEvent) { this.observers.push([element, name, observer, useCapture]); element.attachEvent('on' + name, observer); } }, unloadCache: function() { if (!Event.observers) return; for (var i = 0; i < Event.observers.length; i++) { Event.stopObserving.apply(this, Event.observers[i]); Event.observers[i][0] = null; } Event.observers = false; }, observe: function(element, name, observer, useCapture) { var element = $(element); useCapture = useCapture || false; if (name == 'keypress' && ((/Konqueror|Safari|KHTML/.test(navigator.userAgent)) || element.attachEvent)) name = 'keydown'; this._observeAndCache(element, name, observer, useCapture); }, stopObserving: function(element, name, observer, useCapture) { var element = $(element); useCapture = useCapture || false; if (name == 'keypress' && ((/Konqueror|Safari|KHTML/.test(navigator.userAgent)) || element.detachEvent)) name = 'keydown'; if (element.removeEventListener) { element.removeEventListener(name, observer, useCapture); } else if (element.detachEvent) { element.detachEvent('on' + name, observer); } } }); /* prevent memory leaks in IE */ Event.observe(window, 'unload', Event.unloadCache, false); var Position = { // set to true if needed, warning: firefox performance problems // NOT neeeded for page scrolling, only if draggable contained in // scrollable elements includeScrollOffsets: false, // must be called before calling withinIncludingScrolloffset, every time the // page is scrolled prepare: function() { this.deltaX = window.pageXOffset || document.documentElement.scrollLeft || document.body.scrollLeft || 0; this.deltaY = window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; }, realOffset: function(element) { var valueT = 0, valueL = 0; do { valueT += element.scrollTop || 0; valueL += element.scrollLeft || 0; element = element.parentNode; } while (element); return [valueL, valueT]; }, cumulativeOffset: function(element) { var valueT = 0, valueL = 0; do { valueT += element.offsetTop || 0; valueL += element.offsetLeft || 0; element = element.offsetParent; } while (element); return [valueL, valueT]; }, // caches x/y coordinate pair to use with overlap within: function(element, x, y) { if (this.includeScrollOffsets) return this.withinIncludingScrolloffsets(element, x, y); this.xcomp = x; this.ycomp = y; this.offset = this.cumulativeOffset(element); return (y >= this.offset[1] && y < this.offset[1] + element.offsetHeight && x >= this.offset[0] && x < this.offset[0] + element.offsetWidth); }, withinIncludingScrolloffsets: function(element, x, y) { var offsetcache = this.realOffset(element); this.xcomp = x + offsetcache[0] - this.deltaX; this.ycomp = y + offsetcache[1] - this.deltaY; this.offset = this.cumulativeOffset(element); return (this.ycomp >= this.offset[1] && this.ycomp < this.offset[1] + element.offsetHeight && this.xcomp >= this.offset[0] && this.xcomp < this.offset[0] + element.offsetWidth); }, // within must be called directly before overlap: function(mode, element) { if (!mode) return 0; if (mode == 'vertical') return ((this.offset[1] + element.offsetHeight) - this.ycomp) / element.offsetHeight; if (mode 
== 'horizontal') return ((this.offset[0] + element.offsetWidth) - this.xcomp) / element.offsetWidth; }, clone: function(source, target) { source = $(source); target = $(target); target.style.position = 'absolute'; var offsets = this.cumulativeOffset(source); target.style.top = offsets[1] + 'px'; target.style.left = offsets[0] + 'px'; target.style.width = source.offsetWidth + 'px'; target.style.height = source.offsetHeight + 'px'; } }
lucene-2.9.4/docs/skin/css/
lucene-2.9.4/docs/skin/images/
lucene-2.9.4/docs/skin/images/rc-b-r-15-1body-2menu-3menu.png
lucene-2.9.4/docs/skin/images/txtdoc.svg.xslt
lucene-2.9.4/docs/skin/images/add.jpg
lucene-2.9.4/docs/skin/images/page.gif
lucene-2.9.4/docs/skin/images/pdfdoc.gif
lucene-2.9.4/docs/skin/images/rc-b-l-15-1body-2menu-3menu.png
lucene-2.9.4/docs/skin/images/txtdoc.png
lucene-2.9.4/docs/skin/images/rss.png
lucene-2.9.4/docs/skin/images/info.png
lucene-2.9.4/docs/skin/images/poddoc.svg.xslt
lucene-2.9.4/docs/skin/images/update.jpg
lucene-2.9.4/docs/skin/images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png
lucene-2.9.4/docs/skin/images/forrest-credit-logo.png
lucene-2.9.4/docs/skin/images/header_white_line.gif
lucene-2.9.4/docs/skin/images/valid-html401.png
lucene-2.9.4/docs/skin/images/rc.svg.xslt
lucene-2.9.4/docs/skin/images/corner-imports.svg.xslt0000644000175000017500000000760011474320233023313 0ustar janpascaljanpascal 0 fill:; fill:; stroke:; 1 -1 1 -1 0 - 0 - lucene-2.9.4/docs/skin/images/fix.jpg0000644000175000017500000000164411474320233020110 0ustar janpascaljanpascalÿØÿàJFIF``ÿÛCÿÛCÿÀ!ÿÄ  ÿÄ ÿÄÿÄ !1ÿÚ ?Ø–ãïÕ+¨câ+6²÷².qFx¨5¢Âø â[“£´1þÎçË­©X3ÖX}¹\çÌ^ |˜ï6|äúÕ·š4Óé=ï³Û~ŽmÒß)ŠÊ‚Oxr»j{¥¢è­«·Ã×ÍaA°YõÚ…MÍ4Snò¤¨º<\ƒs´&™Qù¬g7Y(Î8ßxqÇ&Ã|¨E½¶D­ÝÜx­Ò®à÷¶õJׂʽ²£®¬*¥ÉQ²#³Š=FÌ"ªš®…gMYÁ,ê>ª|Ôm¨E²ÚgQ7öµß_vÖê¿,šðö`RÃRó«ˆ mâ˰JÊ YöÕàSò=ŒûZ-ô1§¢Œhm§ðë&ÛlJoYrw¬y*S,²ypÍ~‚”Ñ¢jº:kª'qLY~§ ϱ™ISM+×Ï.“¥ýèߎ4âרv>Ú¼£ÖyÎ9ô³ÙUö:¿ ×OŒvŸH¢¢Rí–xv'ï†sƒ±×]%–*–µ›b¶¶E¾ëhN!ÕÞ0‘jÄU"‘`%µ§3›D°k÷¥Rw˜3|þ°Ì’Í$’a}›:Bñ³Às 5<å•h‰D>ÕÕ]I¬CÃØ#ø@#ü gú!\º/–êù­XYUÒŠµ‹´è¢’gºnèÀ}#2MïÈ6u…§e¦Ç5àmÉíÀË,5ª3‰€}ihü ޝ0‚+X³@Í —(ia$¢É€F™æGÍ6L~¥Î<’V—“¨§¨ƒ°¨¨°&¥… #м¼Tx‚ƒ .$Q¢D‹è£Ã‡ |8ñù8ãŸÿÙlucene-2.9.4/docs/skin/images/chapter.gif0000644000175000017500000000006111474320233020725 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù,Diì«T(;lucene-2.9.4/docs/skin/images/rc-t-l-5-1header-2tab-selected-3tab-selected.png0000644000175000017500000000032111474320233027115 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+EIDATxÚcü?~þfX¾ý$Ãás7^¿ÿÌÀ(í_Éðäå;†?þ1€HÅÓ—ïá`A–ßþ2 &è p(¡sEé™IEND®B`‚lucene-2.9.4/docs/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.png0000644000175000017500000000031011474320233027077 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+n†°^. }†ä[vV:{â _IEND®B`‚lucene-2.9.4/docs/skin/images/warning.png0000644000175000017500000000227711474320233020776 0ustar janpascaljanpascal‰PNG  IHDR szzôbKGDÿÿÿ ½§“ pHYs  ÒÝ~ütIMEÓ  :7°ÜÄ—LIDATxœí–[ˆUUÆ¿}9çÌœsæŒã̘·¦œ±É—%S ‘ðyE{T„„Â!¨,B ¦´è6¾AtJz°‹e!ÑCƒPVh=$“9Íœ½ÖÿÖÃÞgΨ]PgÁ‹µ÷~Xßïÿ­ï¿ØÀõq­;²c®}¡§ÿ•5S¯dŸð²Ä®,¡xëû´<v.x÷£åàª`ò]ÛÔÇÓŒ” ‹»–oÙ|ÕlßÒ”SŸÀÈ!d‡æææ=_R‚rŒeý¹)Ç÷m—ªÍ0Ê„)±ƒ±¯CÈÉAÈ!àͼt`äÊØÛ‹[MÊ;ÓêS,ë}(â8†2C¼ƒú$]É¡! ©•BÔse”gì”an® [Í~r€¤QA„Gª×l5r(EüàþÕMó. Àö/»Å|~›&UŒ¶¿vûYÂ|¾Ì öì%PòòÈ©›r¶÷½5¥Ü¥;N|†‡(?b9ׯ^ñ ŒÓ†QáÔ%—‰;{¨ c›ãŶ_€í[ºP}´A}R·œ²þ§ô¬Ýéï'OL.'¨*ÄÒýã`×›+‹ÿ¤_$ÞÛ å9t!اí&”^®½ñ742 _žƒIEND®B`‚lucene-2.9.4/docs/skin/images/error.png0000644000175000017500000000325511474320233020457 0ustar janpascaljanpascal‰PNG  IHDR szzôbKGDÿÿÿ ½§“ pHYs  ÒÝ~ütIMEÓ &ýƒ„E:IDATxœí—mlžUÇ羟—¾—u+{ÕV£®c›ÝŒHô“~1çæ^ Ú̈Æ’>,@Ì n‚DZ± P,AsΠØ °tföÆÚ®}úvßçýøáyÚ­];˜Ñè®ä$wNrîßÿºÎuþ÷¹áýø‡¸Òá®ïÁüù ‘X䜭÷Zã“tÀ'i—íë;^»çQÿ_ß×hãh£¶¶Uó-%V)¼R ‘ÒDJõ )Ÿõ©|xþþ=oüG„]»Ë]uÕ–ÔûM‰Ry«$^i‚RxYJBŠb4±VÎ¥éÞ$IîúØÁß üÛ¾§t.ûü°5KŒ”%ˆ.BåèżRX)±Râ¤zs8MV~æèKW, <ñdƒÌçkÝh•,A&Áu±  0RbRÉØi)ÏJùÙ5Ý]¯Mʼn¦„ïüY¹ÊfžRªÑjEÕ²¥T.oA¥)6IqRâ•ħ/‹ÃI‰NSŒ”Ô¯úüö·€JåÌ Ô³æÔOÅÊL5i«*· ½ÄII¦®ŽêË0ù9înFIj®¿žE÷mAÄ1¦ä|µµ¬¹ÿ^*•¦û±ÝD¼`­€¤¿ÿ¸KeÏxÙ“RéÓ”ªeKYüíD¹Þ9vÝò ^ÿÍ’cÇøÅÚVôh‚ˆ">ÿÐvnne„€[‚[ýÞ¹>ïÚ§°è—O{“&Ïg^‚W.nfñöˆÊÊpÖòØ­ùûs¿fN37Š}¹ƒgZoFŽÅ1«òc¾´Šä"¸ ÐçýÁNk¦ß€‘Ñчe’º1¸ÏfX¼íÄåïybÓt<ù³ã˜«£ˆ‚X@ÏŸ_äçm°Je2¬ä§D×^Câ=6@!xÎ:»c2ïŸøí7†FG÷ê4EËâgÕ;wŽ}wn¦}ÏÞ p'Àˆœ>xˆým0Râ¼§Ï;†ƒGÏ[Ö:åÜ “ySÞž^º|F…±¯dµjÔR¡ëfp¦¢œtvRE\Å”!&tºL¤!]{ }Þq¦»›YQLÁûÁ.kVtZÝýžìllZRmÌ¡¼63¥1 x‡ P. 
[‚» ÇlÌt2†‚Çïe·5_`À;N8û§ÎÞØiMÇåïùZ~KEõ 5"º½Bˆ+…(‹„ ”²58ïëõþàYow¼mí ½Á‡w{ïÿ˜ÜÍWÕFÑ'Ë„hPo@¤>ô‚;Þë]û[Îõ¿û[Þÿ£øÍo9L‡§IEND®B`‚lucene-2.9.4/docs/skin/images/hack.jpg0000644000175000017500000000134711474320233020230 0ustar janpascaljanpascalÿØÿàJFIFHHÿÛC       ÿÛC  ÿÀÿÄÿÄ$!"#$2ÿÄÿÄ&!1AaQq‘ð2ÿÚ ?êKV«T¬ÛVšê×kÞÒ€X,#±‘jF#s3óF‚×ä<þvÔ¿“v¢5W”ª¹½ò"l|ZMØ4 ˆ~O´ôXÓ;˜‰ózdëO Jí`%]Eøµp.Ô·¶„>È­r./[)m`«RÛ5Ÿ ßŒŽ–U&”ˆ²SÞfzï[nUÑ?2»‚u´øKФsÞE“ä.Щcõ±›RšºãbåC#lu’ØP«×±–ALʉdî3oã ]Ó¦ÅU_è m²äÙ—·Ë‚#}‰b§á–ùMó¾Ðn3IÌYêÝ„—ŽWYƒ¢úÀteêP?ǯ ÿ^r lucene-2.9.4/docs/skin/images/built-with-forrest-button.png0000644000175000017500000000362011474320233024405 0ustar janpascaljanpascal‰PNG  IHDRXTúÒ tEXtTitleBuilt with Apache Forrest!3þ tEXtAuthorstevenn@apache.orgËœ•](tEXtCopyrightThe Apache Software Foundation»FtEXtCreation Time11/05/02­Ð}TtIMEÒ  F*à+ pHYs ð ðB¬4˜gAMA± üaPLTE„))¬¬¦5Oi£‡1+À bŽ”—EUTbq€¸M&`k æ!.If†eÊõÐ!½Ÿ 4ff3IhŠDA333Á p}ˆwIòÌ Sfx(?]ßXA\Åg]gq¶'SHG¼ e%2RáB;@²’ PXbÍ 6\MÕYs?8d9:…q+DZq’Å L™™™°dÝFOÃ6»·®W`k-;TÖ°‰¸?M‰2tq?ÕÌ»#›fW+ZÓ¡•s`‘2¸£!B2B;A>&1Qàaá8¿,ÄZLT]G‰A-Ì(bC8± kkt`hF2:Â0@íY$*LÑQtp%&å 0Ì o&-:{}6l%Lð¹0/G•)*J `je>#6)EcŒRWfÏ, ¶()¦ l':Z/p/¦/yTÐF’75f=MVa.Ô%XLP®ˆèDuJ'79DK¦àÊlIDATxÚµÖ‰[ÔFð4©ÄAm«Sã⢫Ю8©£F}à ^´Je-[mñE/DÔÝ7W’]p=yß—™¿ŒyoÞnp}=‚TUµ.1 ®¼,á9‹Ü•F-7dö,>Üxº §ÜMG±’B‘ÒRé¡Ì>áäqì†ù{%¸¢r½¶ŒâŠ–æ$"ŠëãP¦—á¾àºŸ™Îc`ýY¢ì¡ï‘’ïs§a«a±æÕÓÑæåÌb®dÈI7½bã…yC±`QH™>c„B„½é¤w\Ïÿ6ñjÿÞæÑ÷†/7À½'Á7‹‹Ü¾[¼xŠ >R¥žaÄøyá¸,þðbbúê_Ãóã9,$í–KRi4 G ¥ƒ¥0»Þ\‡LܨBæÎœËâ¡·ÿnÀ‰¿‹óÛ½, û¬µÁR¸ÏsÇWó·{Gþx2|«Xq Ìô¶0ËÄÇ`Þ§DÈÜñÕ0UÁ‰#Íå¡6íQ&œg±îIþŽ=üŽup¬ëE¯ w|5LÕ{Fúº^½ÃXlIJð=ƒîïøÅÛnºÎCåa ×…q¶~xÑ8>‹;fž}£“»KU¡CÓsh¹™SÌå­!Ì7÷ïlwY\^®Œ›º¶Üškƒ?û·ƒ½Î¹í³8_y<ª7»­0ù2WÃÖ¹í³8?ñkðs×äe½bûƒå«âZ¯sÆ}—õD½·ÛÁKÕêwÿµY=6ø?mõh^ IEND®B`‚lucene-2.9.4/docs/skin/images/chapter_open.gif0000644000175000017500000000006111474320233021746 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù,„j Y(;lucene-2.9.4/docs/skin/images/rc-b-r-5-1header-2tab-selected-3tab-selected.old.png0000644000175000017500000000031011474320233027654 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+n†°^. 
}†ä[vV:{â _IEND®B`‚lucene-2.9.4/docs/skin/images/label.gif0000644000175000017500000000006611474320233020363 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù, Œay¨ßL22u›*;lucene-2.9.4/docs/skin/images/spacer.gif0000644000175000017500000000005311474320233020555 0ustar janpascaljanpascalGIF89a€ÿÿÿ!ù,D;lucene-2.9.4/docs/skin/images/rc-t-r-5-1header-2tab-selected-3tab-selected.png0000644000175000017500000000032711474320233027131 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+KIDATxÚcôÉéÿ/ÀÇÅ`c ÆïgÍÀÁÎÊÀÄ>}cØ~äC^ç†?CAàï¿ ¯Þ~fX¸é(B&qäÂ-TA˜Q]̱}V®IEND®B`‚lucene-2.9.4/docs/skin/images/success.png0000644000175000017500000000241311474320233020771 0ustar janpascaljanpascal‰PNG  IHDR szzôbKGDÿÿÿ ½§“ pHYs  ÒÝ~ütIMEÓ '“Ÿ…’˜IDATxœí–KlTUÇÿg:ÓvÚ2}BN¡E„È«>ˆ ÑÄ€ƒ1ÑlÄÆ%.иB¢‰1‘htA4è]ctà £‰+#%hÀiyTŠÃtîù^.î½3wæNíìå$'÷Ι›ûûß÷?ß=À­ñ®qá‚}¹ƒ{Ô´]¡P(Ø êl· $c`S($|Æ)«Õž%xïUèúߥãÏozï«$/Û( BöN¹TY'ÊãðåÆ ¯j®Cªÿ…ëÉ{¶î½yO¥ë•ÜñŸ¼§Ž4¼¬&4ˆ© d°qMÀ½'TÊAg#/%@Qƒ×“ àM¢êÃ{O ïG&% ªé‚ðÖjpBOf{n?ˆuùûqcöf” ‚'Z\€€]òå)xœ!K_™ ʆíŧQèÂÌÏeüöã4*åžLœ2}“ °K¾´„úˆ“™ MwÏÒG1Ø9‚³çNãÈ»o ¨(„ƒˆRRkîêØtMáBðä±"¿Kw ðœDïHC«»Â]Á aiÁA²Ü{d­ÛG÷ÂÁáõ·ãÒܯX51€Ž%m`f0 „­Í@H›MÁº›‰°½¸ݹ^œúö$>úüÆ&Ñ;ÜQpÖV2Ð`À&uá\m4ÄXïf\¹:ƒ—ïÇŠõ,[Û´„Be…²-nB1±¤ã‹]1”Cˆ+‡ûœ ó€C”ò8r g+&Lv;…àûKŸazî"ŠÃãØ¶ê L_¸†  Ø½w-߉ùJ“/?‹ÂHË×ÉejP¶Z BA-5¢:§çz 'οÏÚú8üZø¹ Y³¯9„™ÒŒnéCç’,Têë®T­‹Û0ê±8Å|Çœ8ó!œsxjç Øµæ9ô´÷ãä×'ðñ`tK?z‡ó€¢.âØ„g‚ÒÛ0ý5¬kÂApfînûsŠ÷¢¿0ˆ™ËÓÑ–[‚eã=ÈdB××"ŽêN ŽÖ´•>0†{€ˆÐÖnøô§·1ûÏeˆ¼2 t—QÜЇ\¾-uò·&ÖÏ„Ô,ñ7<ìóÄŒ [ÂáOöáòÂÔï§1¾u]ýíP‰SN¥†LÄe M} š• ÃJÕ…Dy¨ \!@ûx W®@¾ƒC³º+8Ý„ ’î„;†â£Ó‹' >A0§èì“ÆQ%f" MD)µÐˆÊ7çƒð¥!]]]ª$\UH\UHOOOݲjT|ºùïïîîî?<5™™Î§Q¾K$à¾s]K$³“^¾šK±ÃàNK@NK@ÎÎÎÌÌ̸OOQD<,$¾¾¾÷Á^Ÿnž Ep´®®®¿__¡·Ù×>lz‘îÁ^}e1Þ´XÐÛì 6d®mX+&Y¨ìÏÏ’«ÓL>!U¦<1ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿLLLާÒîòøWSP<<<£9$ÁÏæ D«//,,,M¢®ŒEƒŸÍ>ÿÌfŸŸ³E)s”Ç r8µµµßæò ÀL“£ctRNSÿóÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿóÿÿÿÿ¢ÿÿóÿÿÿÿÿÿÿÿÿÿóÿÿÓÿÿÿÿÿÿÿÿ¢ÿÿÿÿÿÿÿÿÿÿÿ0ÿÿÿÿ¢ÿ 0ÿÿÿÿÿÿÿÿóÿÿÿÿÿÿÿ0Óÿÿÿÿÿÿÿ³z›‘IDATxÚí•}SÓ@Æ×Õj ¢ØúR«DŒEA¡Rû’¾OĈQ’¶'–¿ÿ‡pï.eÒêL+þr§Él/“_îž{vÖfï?pÀMrúx:÷ü¢À/þƒ/ÜÇ¿œÇ¼bDQáà`‹Rg¯ FDÛŠE´sT8€ùÎü9¤@Ü&òWôÄßù~ѧ4œtRç̨|ýD^ ŽNV‰²3h|{/oc@TGliY<êjm Í 1¸2B4kœ…®n‘Ëæ$üüæ{{߀#¥Ó°N—ÿvy WTÖøçÔd‘ï (kŽNzãà›—ŽÛûÆn ìóà.µøÊóÜ}–"i·8–¬°eœPD‡` C1þ²ÄqÜ6à.¶xïØvu5ë²r…O‡I° ÍøÍ½Ž]‘–JÀµFÁ?–t¼4`æu±¡$P:ù–)Û\¬ö¤UmŠ_ ”µ-”@#ÜeÃýö0®¼€u貨A,5z«Ipf®3^¨O¸ XWÊ&8å~ŒÁ½sl_íe]-þhIÛzáë#@(-G'j%‹ËãܽÌ.3ŽC‚ßš¬0,é°)l½QVUˆª²[­êŠfhxË¥Qî° yFUýV¤WJé'Ù[ Dór¥!÷†âžv·¼Ö õ]ƆϞ.S'­úLì× /sŸ¼i›»\pJˆ¢@7¡:mAçàˆ¦›ñ0î<=ó"Ý3=Õ?WT¨ùoq ʦè rîÁx£w¯þ~ç”[šñ ²3äÎühÚÙ4ÜÙŸy ›š{‡éÂbé²N韓cnúx}ín ¾ÿyrÜ›>®>º¢Á¿^a¶õ'‘IEND®B`‚lucene-2.9.4/docs/skin/images/printer.gif0000644000175000017500000000066611474320233020775 0ustar janpascaljanpascalGIF89a ÄÿÿÿÿÿÞßÞÖ×ÖÎÏÎÎ1ÆÇƵ¶µ¥¦¥œžœ”–”„†„sqscžcacÿ0œÀÀÀ!ù, ÿ $ŽdI.¨©®ê2¦l\º3-³4Õw[ê"C$Ò›‰€$a‘”HØ$HåRÒ”äv#i±:0<Ú×Ë8@±¢ê©/‹áA G†¬ä~rt^ yD"nxz €r…(Š‹# ‰ yo“‡B~ š‹4TAp‡¸ª««‹ˆ¯ ”±””ˆ¹o¼« qÁǸˆoɪšjDÁ’“T‰^Îkp¼ÌŒ í” ÎåçÙC# ë íàûòòÐK6„'ÑŠù/ß< †¨2ØhÂbþ)8oˆ½û—Àa5ï3™Pi1‘Æ’Cò½a¹â c ‘猦Œ!à¼ì è³ÈyrÊ+:EÂK)6]É4F;lucene-2.9.4/docs/skin/images/instruction_arrow.png0000644000175000017500000000043511474320233023116 0ustar janpascaljanpascal‰PNG  IHDR Ý"õtRNSøüøZì— pHYs  šœ½IDAT×côðòذi)€éÙÓg_¾|Á%]_[ÿùóg4AFeeqqñ”´555L=î.îÂÂÂS§OuuwEˆ*«(v Sxdø£§~üùñãÏ¢ô@€°°ðœys~üùÁ ©©IRÄÅÇ1$¥$\LLQ1QÏ_=g¸z㪘¸A ›¶n‚ø‡ñÇŸ/ž¿Ø°~çOŸ0•¶6·þùó§ ¨ º¦š‹› * ÑŠ YZY9qM"útzu—êsIEND®B`‚lucene-2.9.4/docs/skin/images/current.gif0000644000175000017500000000006611474320233020766 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù, „¡k`›‰ÒÅ*;lucene-2.9.4/docs/skin/images/xmldoc.gif0000644000175000017500000000120711474320233020570 0ustar janpascaljanpascalGIF89a 
¥sqs„†„ÿÿ÷ÿÿÿœžœRQRýýõôôôÎÏÎÇÇÀããÛççÖ997þþörrn101ŽŽ‰qqmúúòïïïçççÆÇÆÿÿüùùñøøñUURMMMˆˆ…ŸŸŸOOOTTR‰‰…………óóìšššªª¤ºººjjj333ÍÍÍ´´§gggÍ;gg_ššŽMMG33/·¸·€€€ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷ÿÿ÷!þCreated with The GIMP!ù ?, þÀŸðÈäqÈlШT0¨Z §0ív X-“ëR",~vŠ2a±ÎŠÉQƒo|tl[y P ~SŽZxq^`vMx RP˜š`žc] ‡]µ™›®Cx^R©¶·¹B»SЍªÀ™ÂneSµ˜tËÄP´˜ ÓQ  æª ÑÒÔy!"  #µÑÝà}‹Bâ($äW¡D»ïâA ¡ž '8Ü;ñEŠ*J`.¹ "€±aˆ0WˆHë˜-nÒ¼ º¤ ™Œ-ˆÝt~âùë™» BÇ pÖó&9ƒ ܲ4Ê/d0h± Ò©Ÿª6ÅŠ© ^°€1“ªM²Y[´fÁE +À*[šŠt»Á˜ñ‚µª¿²L[è 1\$ÕµÔ(7ˆ.X !³íPîw[Áé×e¨^ͺµkÕÔ ÈžM»¶íÙC‚;lucene-2.9.4/docs/skin/images/external-link.gif0000644000175000017500000000010711474320233022055 0ustar janpascaljanpascalGIF89a ‘ÿÿÿƒ•©^o‚!ù, Œ¢í”`ÂY×ÜûYÉxʃ9ˆ!5;lucene-2.9.4/docs/skin/images/rc-t-r-5-1header-2searchbox-3searchbox.png0000644000175000017500000000032611474320233026174 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+JIDATxÚc|úäæfffN^>>FF&& øû÷/÷/^¿zÈðÿÿ?ˆ ü¿þ0|úô!“øñý3ª Ì(Óò$ù‹TñIEND®B`‚lucene-2.9.4/docs/skin/images/rc-t-l-5-1header-2searchbox-3searchbox.png0000644000175000017500000000030711474320233026165 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+;IDATxÚcüÿ>zÇðíûg†3°€Þ¼~ÌðçÏ/ ,0T €AZÀ‚ 3ÐU>,çÿÆîIEND®B`‚lucene-2.9.4/docs/skin/images/poddoc.png0000644000175000017500000000153011474320233020570 0ustar janpascaljanpascal‰PNG  IHDR©JLÎ cHRMz&€„ú€èu0ê`:˜pœºQ<gAMA»€a¬”bKGDÿÿÿ ½§“ pHYsÄÄ•+¼IDATxÚÅ–;HAÇ'§5z>Žt‚ŠX˜&ZÉ!â ÔÂ(6ÁRIÄJE«¤Áˆ‚………(„ØÄXˆ‡`!ø¸ó)Œ†C =½ËÌ®»î^ÖÛó"䃙ovfþó½×|n·û-Ï H$ÂÝÝ÷÷÷ ËùÑÑÑ×¾¾¾:ã>§Ip:#•••qƒ„ÃaB¡r¹6>…£÷:I´×A´Q>àE€4£F0ùÝÈï÷+Ë\qÔØãñXÀ§q^ ?§¦Þ¸JKc:>,/c’Œˆ¹SúÉíæ]WWl¼ðAwÿÊŠÊϤe¯—Pg§nƸL—i!_0 A~>ìíÁØ””¨k''00ííPS£î]_‡ùy¸ºÒì‚!IŸ57ƒÏ--P[ ‹‹07yy0: 99œ ››ÐÛ úÑß——lll(|"&Ÿ¯b|¶·¡§––„ñ—Um´Kabff ¾^?ú:#ƒRD’Ee±ÑÈáPµ*/‡ós8’&ÓÌHRS“j–‹ ¸¾V×ÒÓÿJM•Yk ö>’¦ëî†ÕUX[SA%µ¶ÂÁùTYlm=Yóì5ÒCø&3†‡¡¡á1·**`v Lù î–Âggg6á=9iþÒÖýýj(OOCuµì#p{«j½¿¯oMKK£¸¸8ÎZ'/3’ ˆÁÁGY†ûÓk*°Váý"•!,€"†Ê`©Ñwøü㡨¾ªªzŸ_T”+mÍò°RT s9J9(òÈc×&D‘ù¨ _::ê\……¹@ÀTú­ZBt{øµ³£¬¹\.û¢*7扒“••³±Ù­ÙvX«€DÀl5’ÕÂ8Þʈô/Zhó››ËÈtDÉÕÒU/ñRoüúõœÙ_aœ‘IEND®B`‚lucene-2.9.4/docs/skin/images/README.txt0000644000175000017500000000010611474320233020306 0ustar janpascaljanpascalThe images in this directory are used if the current skin lacks them. lucene-2.9.4/docs/skin/images/remove.jpg0000644000175000017500000000234311474320233020614 0ustar janpascaljanpascalÿØÿàJFIF``ÿÛCÿÛCÿÀ!ÿÄ ÿÄ! 
ÿÄÿÄ!ÿÚ ?Ô~°o©ÂÀ¯‘ô5ÉS¢¢ç̼Ú9R¤^Í2?ªÝqY;0#SÛÇ8^†>ùñF£8DÐ!~Ìïü©½:üSïËgKWÔΔnDö¡%ÙtFtu»z¦œëè÷QŸ¦Ÿd¯Š5­ŠÄ‹l¾)n µ«8HytýÏ%v¸¢KršØ÷ÇËò¦Øªúª²ôŠÌ¾è’è¾±9ÅLÈæ5VÅV1ÑŒ/3,°‡“0Aåó¢&C(ȹ’‡*4è2{Æï˯‘ññò@é—ÄjÃØwkNCÐK[.ä5ªüÿÚƒŒõcì ÚY­6A”Øx’ð(jæïúL3ICÅC _#nÖ± ‘ÝÕwĆjýÄÏb5Ó&¾²| €p6ƨù)æÝ+[&6é6ˆÐÕÝñ÷+fÒÍ™f‡^)¸)CáÚkê Ýn½Ÿã»–jUì㇩È”ž|©E·Cz¬›yÌçí¥”0‘œˆWB÷œW1 ÂX/iY ¢Î þ¢q‘è—•Åëy/ê¼—CÍö•ìâöØ8‘ñÔQÕÊè³°¤¾¾=Köhò²ÅTiiÙM‹ãT§sz—)š¢ôÝ„ƒˆU#Q+“ÿo7GB·¤šcek¤™¥Cj¸«RttV†å¼õÄ9Ã| Š®“ϳmÄ>ûíhh6+œþ“¸Ï–žgÕÿ›Ïú-'Åôdúo36¥ý úí6a°˜¯ðdg˜e¥¤Y›»óCûîïµõûê9èäúmõƒ}*i‘ÜÞîõ«Þ¿>xå"šTGÐû*iÈ…¬T’Õˆî½7¾z§ôºG*òéQÿX· €C`nO´&4+µû‘â5™-ñÒËÒC8SJ’eE) '•HÉ”¶ÆpQ§£ÏYç:³›¤,SL6BÙdû¹}m² ÎwU`²‰y¯´íßÈøv@ (àY ý¨ì Ñ’C™ðœ5=n0ÌÂ- !Õ–˜inèiî~D´ ÃK?þL‡óø-Í®®‡¤(Z^å.´Œõ-l0 P/EBŠKÉfš‘[0ƒ:ƒû²%IRU^qªÒé:M,]X×âú XXÒt¿®?RçK.`¹så‘>ÒØÒ|‰V—gf’¦œ\ pp:qœábÓ€PÖµ¬D° ÁÁÌÌÁÌÌÌÌÏ“UTÛMÛCë²·¶ªë©¬¢š©¡„×ÑCÚF×=Í3cZÃ&1„FdD[»ÿÙlucene-2.9.4/docs/skin/images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png0000644000175000017500000000030711474320233030027 0ustar janpascaljanpascal‰PNG  IHDRo&å cHRMz&€„ú€èu0ê`:˜pœºQ<bKGDÿÿÿ ½§“ pHYsÄÄ•+;IDATxÚcüÿ>zÇðíûg†3°€Þ¼~ÌðçÏ/ ,0T €AZÀ‚ 3ÐU>,çÿÆîIEND®B`‚lucene-2.9.4/docs/skin/breadcrumbs-optimized.js0000644000175000017500000000564511474320233022211 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ var PREPREND_CRUMBS=new Array(); var link1="@skinconfig.trail.link1.name@"; var link2="@skinconfig.trail.link2.name@"; var link3="@skinconfig.trail.link3.name@"; if(!(link1=="")&&!link1.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link1, @skinconfig.trail.link1.href@ ) ); } if(!(link2=="")&&!link2.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link2, @skinconfig.trail.link2.href@ ) ); } if(!(link3=="")&&!link3.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link3, @skinconfig.trail.link3.href@ ) ); } var DISPLAY_SEPARATOR=" > "; var DISPLAY_PREPREND=" > "; var DISPLAY_POSTPREND=":"; var CSS_CLASS_CRUMB="breadcrumb"; var CSS_CLASS_TRAIL="breadcrumbTrail"; var CSS_CLASS_SEPARATOR="crumbSeparator"; var FILE_EXTENSIONS=new Array( ".html", ".htm", ".jsp", ".php", ".php3", ".php4" ); var PATH_SEPARATOR="/"; function sc(s) { var l=s.toLowerCase(); return l.substr(0,1).toUpperCase()+l.substr(1); } function getdirs() { var t=document.location.pathname.split(PATH_SEPARATOR); var lc=t[t.length-1]; for(var i=0;i < FILE_EXTENSIONS.length;i++) { if(lc.indexOf(FILE_EXTENSIONS[i])) return t.slice(1,t.length-1); } return t.slice(1,t.length); } function getcrumbs( d ) { var pre = "/"; var post = "/"; var c = new Array(); if( d != null ) { for(var i=0;i < d.length;i++) { pre+=d[i]+postfix; c.push(new Array(d[i],pre)); } } if(PREPREND_CRUMBS.length > 0 ) return PREPREND_CRUMBS.concat( c ); return c; } function gettrail( c ) { var h=DISPLAY_PREPREND; for(var i=0;i < c.length;i++) { h+=''+sc(c[i][0])+''; if(i!=(c.length-1)) h+=DISPLAY_SEPARATOR; } return h+DISPLAY_POSTPREND; } function gettrailXHTML( c ) { var h=''+DISPLAY_PREPREND; for(var i=0;i < c.length;i++) { h+=''+sc(c[i][0])+''; if(i!=(c.length-1)) h+=''+DISPLAY_SEPARATOR+''; } return h+DISPLAY_POSTPREND+''; } if(document.location.href.toLowerCase().indexOf("http://")==-1) document.write(gettrail(getcrumbs())); else document.write(gettrail(getcrumbs(getdirs()))); lucene-2.9.4/docs/skin/screen.css0000644000175000017500000003067211474320233017347 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ body { margin: 0px 0px 0px 0px; font-family: Verdana, Helvetica, sans-serif; } h1 { font-size : 160%; margin: 0px 0px 0px 0px; padding: 0px; } h2 { font-size : 140%; margin: 1em 0px 0.8em 0px; padding: 0px; font-weight : bold;} h3 { font-size : 130%; margin: 0.8em 0px 0px 0px; padding: 0px; font-weight : bold; } .h3 { margin: 22px 0px 3px 0px; } h4 { font-size : 120%; margin: 0.7em 0px 0px 0px; padding: 0px; font-weight : normal; text-align: left; } .h4 { margin: 18px 0px 0px 0px; } h4.faq { font-size : 120%; margin: 18px 0px 0px 0px; padding: 0px; font-weight : bold; text-align: left; } h5 { font-size : 100%; margin: 14px 0px 0px 0px; padding: 0px; font-weight : normal; text-align: left; } /** * table */ table .title { background-color: #000000; } .ForrestTable { color: #ffffff; background-color: #7099C5; width: 100%; font-size : 100%; empty-cells: show; } table caption { padding-left: 5px; color: white; text-align: left; font-weight: bold; background-color: #000000; } .ForrestTable td { color: black; background-color: #f0f0ff; } .ForrestTable th { text-align: center; } /** * Page Header */ #top { position: relative; float: left; width: 100%; background: #294563; /* if you want a background in the header, put it here */ } #top .breadtrail { background: #CFDCED; color: black; border-bottom: solid 1px white; padding: 3px 10px; font-size: 75%; } #top .breadtrail a { color: black; } #top .header { float: left; width: 100%; background: url("images/header_white_line.gif") repeat-x bottom; } #top .grouplogo { padding: 7px 0 10px 10px; float: left; text-align: left; } #top .projectlogo { padding: 7px 0 10px 10px; float: left; width: 33%; text-align: right; } #top .projectlogoA1 { padding: 7px 0 10px 10px; float: right; } html>body #top .searchbox { bottom: 0px; } #top .searchbox { position: absolute; right: 10px; height: 42px; font-size: 70%; white-space: nowrap; text-align: right; color: white; background-color: #000000; z-index:0; background-image: url(images/rc-t-l-5-1header-2searchbox-3searchbox.png); background-repeat: no-repeat; background-position: top left; bottom: -1px; /* compensate for IE rendering issue */ } #top .searchbox form { padding: 5px 10px; margin: 0; } #top .searchbox p { padding: 0 0 2px 0; margin: 0; } #top .searchbox input { font-size: 100%; } #tabs { clear: both; padding-left: 10px; margin: 0; list-style: none; } /* background: #CFDCED url("images/tab-right.gif") no-repeat right top;*/ #tabs li { float: left; background-image: url(images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png); background-repeat: no-repeat; background-position: top right; background-color: #000000; margin: 0 3px 0 0; padding: 0; } /*background: url("images/tab-left.gif") no-repeat left top;*/ #tabs li a { float: left; display: block; font-family: verdana, arial, sans-serif; text-decoration: none; color: black; white-space: nowrap; background-image: url(images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png); background-repeat: no-repeat; background-position: top left; padding: 5px 15px 4px; width: .1em; /* IE/Win fix */ } #tabs li a:hover { cursor: pointer; text-decoration:underline; } #tabs > li a { width: auto; } /* Rest of IE/Win fix */ /* Commented Backslash Hack hides rule from IE5-Mac \*/ #tabs a { float: none; } /* End IE5-Mac hack */ #top .header .current { background-color: #4C6C8F; background-image: url(images/rc-t-r-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top right; } #top .header .current a { font-weight: bold; 
padding-bottom: 5px; color: white; background-image: url(images/rc-t-l-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top left; } #publishedStrip { padding-right: 10px; padding-left: 20px; padding-top: 3px; padding-bottom:3px; color: #ffffff; font-size : 60%; font-weight: bold; background-color: #4C6C8F; text-align:right; } #level2tabs { margin: 0; float:left; position:relative; } #level2tabs a:hover { cursor: pointer; text-decoration:underline; } #level2tabs a{ cursor: pointer; text-decoration:none; background-image: url('images/chapter.gif'); background-repeat: no-repeat; background-position: center left; padding-left: 6px; margin-left: 6px; } /* * border-top: solid #4C6C8F 15px; */ #main { position: relative; background: white; clear:both; } #main .breadtrail { clear:both; position: relative; background: #CFDCED; color: black; border-bottom: solid 1px black; border-top: solid 1px black; padding: 0px 180px; font-size: 75%; z-index:10; } /** * Round corner */ #roundtop { background-image: url(images/rc-t-r-15-1body-2menu-3menu.png); background-repeat: no-repeat; background-position: top right; } #roundbottom { background-image: url(images/rc-b-r-15-1body-2menu-3menu.png); background-repeat: no-repeat; background-position: top right; } img.corner { width: 15px; height: 15px; border: none; display: block !important; } .roundtopsmall { background-image: url(images/rc-t-r-5-1header-2searchbox-3searchbox.png); background-repeat: no-repeat; background-position: top right; } #roundbottomsmall { background-image: url(images/rc-b-r-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top right; } img.cornersmall { width: 5px; height: 5px; border: none; display: block !important; } /** * Side menu */ #menu a { font-weight: normal; text-decoration: none;} #menu a:visited { font-weight: normal; } #menu a:active { font-weight: normal; } #menu a:hover { font-weight: normal; text-decoration:underline;} #menuarea { width:10em;} #menu { position: relative; float: left; width: 160px; padding-top: 0px; top:-18px; left:10px; z-index: 20; background-color: #f90; font-size : 70%; } .menutitle { cursor:pointer; padding: 3px 12px; margin-left: 10px; background-image: url('images/chapter.gif'); background-repeat: no-repeat; background-position: center left; font-weight : bold; } .menutitle:hover{text-decoration:underline;cursor: pointer;} #menu .menuitemgroup { margin: 0px 0px 6px 8px; padding: 0px; font-weight : bold; } #menu .selectedmenuitemgroup{ margin: 0px 0px 0px 8px; padding: 0px; font-weight : normal; } #menu .menuitem { padding: 2px 0px 1px 13px; background-image: url('images/page.gif'); background-repeat: no-repeat; background-position: center left; font-weight : normal; margin-left: 10px; } #menu .menupage { margin: 2px 0px 1px 10px; padding: 0px 3px 0px 12px; background-image: url('images/page.gif'); background-repeat: no-repeat; background-position: center left; font-style : normal; } #menu .menupagetitle { padding: 0px 0px 0px 1px; font-style : normal; border-style: solid; border-width: 1px; margin-right: 10px; } #menu .menupageitemgroup { padding: 3px 0px 4px 6px; font-style : normal; border-bottom: 1px solid ; border-left: 1px solid ; border-right: 1px solid ; margin-right: 10px; } #menu .menupageitem { font-style : normal; font-weight : normal; border-width: 0px; font-size : 90%; } #menu #credit { text-align: center; } #menu #credit2 { text-align: center; padding: 3px 3px 3px 3px; background-color: #ffffff; 
} #menu .searchbox { text-align: center; } #menu .searchbox form { padding: 3px 3px; margin: 0; } #menu .searchbox input { font-size: 100%; } #content { padding: 20px 20px 20px 180px; margin: 0; font : small Verdana, Helvetica, sans-serif; font-size : 80%; } #content ul { margin: 0; padding: 0 25px; } #content li { padding: 0 5px; } #feedback { color: black; background: #CFDCED; text-align:center; margin-top: 5px; } #feedback #feedbackto { font-size: 90%; color: black; } #footer { clear: both; position: relative; /* IE bugfix (http://www.dracos.co.uk/web/css/ie6floatbug/) */ width: 100%; background: #CFDCED; border-top: solid 1px #4C6C8F; color: black; } #footer .copyright { position: relative; /* IE bugfix cont'd */ padding: 5px; margin: 0; width: 45%; } #footer .lastmodified { position: relative; /* IE bugfix cont'd */ float: right; width: 45%; padding: 5px; margin: 0; text-align: right; } #footer a { color: white; } #footer #logos { text-align: left; } /** * Misc Styles */ acronym { cursor: help; } .boxed { background-color: #a5b6c6;} .underlined_5 {border-bottom: solid 5px #4C6C8F;} .underlined_10 {border-bottom: solid 10px #4C6C8F;} /* ==================== snail trail ============================ */ .trail { position: relative; /* IE bugfix cont'd */ font-size: 70%; text-align: right; float: right; margin: -10px 5px 0px 5px; padding: 0; } #motd-area { position: relative; /* IE bugfix cont'd */ float: right; width: 35%; background-color: #f0f0ff; border-top: solid 1px #4C6C8F; border-bottom: solid 1px #4C6C8F; margin-bottom: 15px; margin-left: 15px; margin-right: 10%; padding-bottom: 5px; padding-top: 5px; } #minitoc-area { border-top: solid 1px #4C6C8F; border-bottom: solid 1px #4C6C8F; margin: 15px 10% 5px 15px; /* margin-bottom: 15px; margin-left: 15px; margin-right: 10%;*/ padding-bottom: 7px; padding-top: 5px; } .minitoc { list-style-image: url('images/current.gif'); font-weight: normal; } li p { margin: 0; padding: 0; } .pdflink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .pdflink br { margin-top: -10px; padding-left: 1px; } .pdflink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .pdflink img { display: block; height: 16px; width: 16px; } .xmllink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .xmllink br { margin-top: -10px; padding-left: 1px; } .xmllink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .xmllink img { display: block; height: 16px; width: 16px; } .podlink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .podlink br { margin-top: -10px; padding-left: 1px; } .podlink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .podlink img { display: block; height: 16px; width: 16px; } .printlink { position: relative; /* IE bugfix cont'd */ float: right; } .printlink br { margin-top: -10px; padding-left: 1px; } .printlink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .printlink img { display: block; height: 16px; width: 16px; } p.instruction { display: list-item; list-style-image: url('../images/instruction_arrow.png'); list-style-position: outside; margin-left: 2em; } lucene-2.9.4/docs/skin/breadcrumbs.js0000644000175000017500000001467511474320233020212 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * This script, when included in a html file, builds a neat breadcrumb trail * based on its url. That is, if it doesn't contains bugs (I'm relatively * sure it does). * * Typical usage: * */ /** * IE 5 on Mac doesn't know Array.push. * * Implement it - courtesy to fritz. */ var abc = new Array(); if (!abc.push) { Array.prototype.push = function(what){this[this.length]=what} } /* ======================================================================== CONSTANTS ======================================================================== */ /** * Two-dimensional array containing extra crumbs to place at the front of * the trail. Specify first the name of the crumb, then the URI that belongs * to it. You'll need to modify this for every domain or subdomain where * you use this script (you can leave it as an empty array if you wish) */ var PREPREND_CRUMBS = new Array(); var link1 = "@skinconfig.trail.link1.name@"; var link2 = "@skinconfig.trail.link2.name@"; var link3 = "@skinconfig.trail.link3.name@"; var href1 = "@skinconfig.trail.link1.href@"; var href2 = "@skinconfig.trail.link2.href@"; var href3 = "@skinconfig.trail.link3.href@"; if(!(link1=="")&&!link1.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link1, href1 ) ); } if(!(link2=="")&&!link2.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link2, href2 ) ); } if(!(link3=="")&&!link3.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link3, href3 ) ); } /** * String to include between crumbs: */ var DISPLAY_SEPARATOR = " > "; /** * String to include at the beginning of the trail */ var DISPLAY_PREPREND = " > "; /** * String to include at the end of the trail */ var DISPLAY_POSTPREND = ""; /** * CSS Class to use for a single crumb: */ var CSS_CLASS_CRUMB = "breadcrumb"; /** * CSS Class to use for the complete trail: */ var CSS_CLASS_TRAIL = "breadcrumbTrail"; /** * CSS Class to use for crumb separator: */ var CSS_CLASS_SEPARATOR = "crumbSeparator"; /** * Array of strings containing common file extensions. We use this to * determine what part of the url to ignore (if it contains one of the * string specified here, we ignore it). */ var FILE_EXTENSIONS = new Array( ".html", ".htm", ".jsp", ".php", ".php3", ".php4" ); /** * String that separates parts of the breadcrumb trail from each other. * When this is no longer a slash, I'm sure I'll be old and grey. */ var PATH_SEPARATOR = "/"; /* ======================================================================== UTILITY FUNCTIONS ======================================================================== */ /** * Capitalize first letter of the provided string and return the modified * string. 
*/ function sentenceCase( string ) { return string; //var lower = string.toLowerCase(); //return lower.substr(0,1).toUpperCase() + lower.substr(1); } /** * Returns an array containing the names of all the directories in the * current document URL */ function getDirectoriesInURL() { var trail = document.location.pathname.split( PATH_SEPARATOR ); // check whether last section is a file or a directory var lastcrumb = trail[trail.length-1]; for( var i = 0; i < FILE_EXTENSIONS.length; i++ ) { if( lastcrumb.indexOf( FILE_EXTENSIONS[i] ) ) { // it is, remove it and send results return trail.slice( 1, trail.length-1 ); } } // it's not; send the trail unmodified return trail.slice( 1, trail.length ); } /* ======================================================================== BREADCRUMB FUNCTIONALITY ======================================================================== */ /** * Return a two-dimensional array describing the breadcrumbs based on the * array of directories passed in. */ function getBreadcrumbs( dirs ) { var prefix = "/"; var postfix = "/"; // the array we will return var crumbs = new Array(); if( dirs != null ) { for( var i = 0; i < dirs.length; i++ ) { prefix += dirs[i] + postfix; crumbs.push( new Array( dirs[i], prefix ) ); } } // preprend the PREPREND_CRUMBS if(PREPREND_CRUMBS.length > 0 ) { return PREPREND_CRUMBS.concat( crumbs ); } return crumbs; } /** * Return a string containing a simple text breadcrumb trail based on the * two-dimensional array passed in. */ function getCrumbTrail( crumbs ) { var xhtml = DISPLAY_PREPREND; for( var i = 0; i < crumbs.length; i++ ) { xhtml += ''; xhtml += unescape( crumbs[i][0] ) + ''; if( i != (crumbs.length-1) ) { xhtml += DISPLAY_SEPARATOR; } } xhtml += DISPLAY_POSTPREND; return xhtml; } /** * Return a string containing an XHTML breadcrumb trail based on the * two-dimensional array passed in. */ function getCrumbTrailXHTML( crumbs ) { var xhtml = ''; xhtml += DISPLAY_PREPREND; for( var i = 0; i < crumbs.length; i++ ) { xhtml += ''; xhtml += unescape( crumbs[i][0] ) + ''; if( i != (crumbs.length-1) ) { xhtml += '' + DISPLAY_SEPARATOR + ''; } } xhtml += DISPLAY_POSTPREND; xhtml += ''; return xhtml; } /* ======================================================================== PRINT BREADCRUMB TRAIL ======================================================================== */ // check if we're local; if so, only print the PREPREND_CRUMBS if( document.location.href.toLowerCase().indexOf( "http://" ) == -1 ) { document.write( getCrumbTrail( getBreadcrumbs() ) ); } else { document.write( getCrumbTrail( getBreadcrumbs( getDirectoriesInURL() ) ) ); } lucene-2.9.4/docs/skin/basic.css0000644000175000017500000000564311474320233017151 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * General */ img { border: 0; } #content table { border: 0; width: 100%; } /*Hack to get IE to render the table at 100%*/ * html #content table { margin-left: -3px; } #content th, #content td { margin: 0; padding: 0; vertical-align: top; } .clearboth { clear: both; } .note, .warning, .fixme { border: solid black 1px; margin: 1em 3em; } .note .label { background: #369; color: white; font-weight: bold; padding: 5px 10px; } .note .content { background: #F0F0FF; color: black; line-height: 120%; font-size: 90%; padding: 5px 10px; } .warning .label { background: #C00; color: white; font-weight: bold; padding: 5px 10px; } .warning .content { background: #FFF0F0; color: black; line-height: 120%; font-size: 90%; padding: 5px 10px; } .fixme .label { background: #C6C600; color: black; font-weight: bold; padding: 5px 10px; } .fixme .content { padding: 5px 10px; } /** * Typography */ body { font-family: verdana, "Trebuchet MS", arial, helvetica, sans-serif; font-size: 100%; } #content { font-family: Georgia, Palatino, Times, serif; font-size: 95%; } #tabs { font-size: 70%; } #menu { font-size: 80%; } #footer { font-size: 70%; } h1, h2, h3, h4, h5, h6 { font-family: "Trebuchet MS", verdana, arial, helvetica, sans-serif; font-weight: bold; margin-top: 1em; margin-bottom: .5em; } h1 { margin-top: 0; margin-bottom: 1em; font-size: 1.4em; } #content h1 { font-size: 160%; margin-bottom: .5em; } #menu h1 { margin: 0; padding: 10px; background: #336699; color: white; } h2 { font-size: 120%; } h3 { font-size: 100%; } h4 { font-size: 90%; } h5 { font-size: 80%; } h6 { font-size: 75%; } p { line-height: 120%; text-align: left; margin-top: .5em; margin-bottom: 1em; } #content li, #content th, #content td, #content li ul, #content li ol{ margin-top: .5em; margin-bottom: .5em; } #content li li, #minitoc-area li{ margin-top: 0em; margin-bottom: 0em; } #content .attribution { text-align: right; font-style: italic; font-size: 85%; margin-top: 1em; } .codefrag { font-family: "Courier New", Courier, monospace; font-size: 110%; } lucene-2.9.4/docs/skin/menu.js0000644000175000017500000000322311474320233016650 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * This script, when included in a html file, can be used to make collapsible menus * * Typical usage: * */ if (document.getElementById){ document.write('') } function SwitchMenu(obj) { if(document.getElementById) { var el = document.getElementById(obj); var title = document.getElementById(obj+'Title'); if(obj.indexOf("_selected_")==0&&el.style.display == ""){ el.style.display = "block"; title.className = "pagegroupselected"; } if(el.style.display != "block"){ el.style.display = "block"; title.className = "pagegroupopen"; } else{ el.style.display = "none"; title.className = "pagegroup"; } }// end - if(document.getElementById) }//end - function SwitchMenu(obj) lucene-2.9.4/docs/skin/fontsize.js0000644000175000017500000000605011474320233017546 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ function init() { //embedded in the doc //ndeSetTextSize(); } function checkBrowser(){ if (!document.getElementsByTagName){ return true; } else{ return false; } } function ndeSetTextSize(chgsize,rs) { var startSize; var newSize; if (!checkBrowser) { return; } startSize = parseInt(ndeGetDocTextSize()); if (!startSize) { startSize = 16; } switch (chgsize) { case 'incr': newSize = startSize + 2; break; case 'decr': newSize = startSize - 2; break; case 'reset': if (rs) {newSize = rs;} else {newSize = 16;} break; default: try{ newSize = parseInt(ndeReadCookie("nde-textsize")); } catch(e){ alert(e); } if (!newSize || newSize == 'NaN') { newSize = startSize; } break; } if (newSize < 10) { newSize = 10; } newSize += 'px'; document.getElementsByTagName('html')[0].style.fontSize = newSize; document.getElementsByTagName('body')[0].style.fontSize = newSize; ndeCreateCookie("nde-textsize", newSize, 365); } function ndeGetDocTextSize() { if (!checkBrowser) { return 0; } var size = 0; var body = document.getElementsByTagName('body')[0]; if (body.style && body.style.fontSize) { size = body.style.fontSize; } else if (typeof(getComputedStyle) != 'undefined') { size = getComputedStyle(body,'').getPropertyValue('font-size'); } else if (body.currentStyle) { size = body.currentStyle.fontSize; } //fix IE bug if( isNaN(size)){ if(size.substring(size.length-1)=="%"){ return } } return size; } function ndeCreateCookie(name,value,days) { var cookie = name + "=" + value + ";"; if (days) { var date = new Date(); date.setTime(date.getTime()+(days*24*60*60*1000)); cookie += " expires=" + date.toGMTString() + ";"; } cookie += " path=/"; document.cookie = cookie; } function ndeReadCookie(name) { var nameEQ = name + "="; var ca = document.cookie.split(';'); for(var i = 0; i < ca.length; i++) { var c = ca[i]; while (c.charAt(0) == ' ') { c = c.substring(1, c.length); } ctest = c.substring(0,name.length); if(ctest == name){ return c.substring(nameEQ.length,c.length); } } return null; } 
lucene-2.9.4/docs/skin/forrest.css.xslt0000644000175000017500000000551611474320233020544 0ustar janpascaljanpascal /* ==================== aural ============================ */ @media aural { h1, h2, h3, h4, h5, h6 { voice-family: paul, male; stress: 20; richness: 90 } h1 { pitch: x-low; pitch-range: 90 } h2 { pitch: x-low; pitch-range: 80 } h3 { pitch: low; pitch-range: 70 } h4 { pitch: medium; pitch-range: 60 } h5 { pitch: medium; pitch-range: 50 } h6 { pitch: medium; pitch-range: 40 } li, dt, dd { pitch: medium; richness: 60 } dt { stress: 80 } pre, code, tt { pitch: medium; pitch-range: 0; stress: 0; richness: 80 } em { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } strong { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } dfn { pitch: high; pitch-range: 60; stress: 60 } s, strike { richness: 0 } i { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } b { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } u { richness: 0 } :link { voice-family: harry, male } :visited { voice-family: betty, female } :active { voice-family: betty, female; pitch-range: 80; pitch: x-high } } a.external { padding: 0 20px 0px 0px; display:inline; background-repeat: no-repeat; background-position: center right; background-image: url(images/external-link.gif); } /* extra-css */ lucene-2.9.4/docs/skin/skinconf.xsl0000644000175000017500000001424111474320233017712 0ustar janpascaljanpascal lucene-2.9.4/docs/skin/getMenu.js0000644000175000017500000000317411474320233017315 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * This script, when included in a html file, can be used to make collapsible menus * * Typical usage: * */ if (document.getElementById){ document.write('') } function SwitchMenu(obj, thePath) { var open = 'url("'+thePath + 'images/chapter_open.gif")'; var close = 'url("'+thePath + 'images/chapter.gif")'; if(document.getElementById) { var el = document.getElementById(obj); var title = document.getElementById(obj+'Title'); if(el.style.display != "block"){ title.style.backgroundImage = open; el.style.display = "block"; }else{ title.style.backgroundImage = close; el.style.display = "none"; } }// end - if(document.getElementById) }//end - function SwitchMenu(obj) lucene-2.9.4/docs/linkmap.html0000644000175000017500000003535711474320234016741 0ustar janpascaljanpascal Site Linkmap Table of Contents
 

Site Linkmap Table of Contents

This is a map of the complete site and its structure.

  • Lucene  ___________________  site
      • Documentation  ___________________  docs
          • Overview  ___________________  overview
          • Changes  ___________________  changes
              • Core  ___________________  changes-core
              • Contrib  ___________________  changes-contrib
          • Javadocs  ___________________  javadoc
              • All  ___________________  javadoc-all
              • Core  ___________________  javadoc-core
              • Demo  ___________________  javadoc-demo
              • Contrib  ___________________  javadoc-contrib
                  • Analyzers  ___________________  javadoc-contrib-analyzers
                  • Ant  ___________________  javadoc-contrib-ant
                  • Bdb  ___________________  javadoc-contrib-bdb
                  • Bdb-je  ___________________  javadoc-contrib-bdb-je
                  • Benchmark  ___________________  javadoc-contrib-benchmark
                  • Collation  ___________________  javadoc-contrib-collation
                  • Highlighter  ___________________  javadoc-contrib-highlighter
                  • Instantiated  ___________________  javadoc-contrib-instantiated
                  • Lucli  ___________________  javadoc-contrib-lucli
                  • Memory  ___________________  javadoc-contrib-memory
                  • Queries  ___________________  javadoc-contrib-queries
                  • Regex  ___________________  javadoc-contrib-regex
                  • Remote  ___________________  javadoc-contrib-remote
                  • Snowball  ___________________  javadoc-contrib-snowball
                  • Spatial  ___________________  javadoc-contrib-spatial
                  • Spellchecker  ___________________  javadoc-contrib-spellchecker
                  • Surround  ___________________  javadoc-contrib-surround
                  • Swing  ___________________  javadoc-contrib-swing
                  • Wikipedia  ___________________  javadoc-contrib-wikipedia
                  • Wordnet  ___________________  javadoc-contrib-wordnet
          • FAQ  ___________________  faq
          • Scoring  ___________________  scoring
          • Wiki  ___________________  wiki
 
lucene-2.9.4/CHANGES.txt0000644000175000017500000056526111474505321015275 0ustar janpascaljanpascal
Lucene Change Log $Id: CHANGES.txt 1039905 2010-11-28 16:58:14Z uschindler $

======================= Release 2.9.4 2010-12-03 =======================

Changes in runtime behavior

* LUCENE-2689: NativeFSLockFactory no longer attempts to acquire a test lock just before the real lock is acquired. (Surinder Pal Singh Bindra via Mike McCandless)

* LUCENE-2762: Fixed bug in IndexWriter causing it to hold open file handles against deleted files when compound-file was enabled (the default) and readers are pooled. As a result of this the peak worst-case free disk space required during optimize is now 3X the index size, when compound file is enabled (else 2X). (Mike McCandless)

* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default = 0.1), which means any time a merged segment is greater than 10% of the index size, it will be left in non-compound format even if compound format is on. This change was made to reduce peak transient disk usage during optimize which increased due to LUCENE-2762. (Mike McCandless)

Bug fixes

* LUCENE-2142 (correct fix): FieldCacheImpl.getStringIndex no longer throws an exception when term count exceeds doc count. (Mike McCandless, Uwe Schindler)

* LUCENE-2513: when opening writable IndexReader on a not-current commit, do not overwrite "future" commits. (Mike McCandless)

* LUCENE-2536: IndexWriter.rollback was failing to properly rollback buffered deletions against segments that were flushed (Mark Harwood via Mike McCandless)

* LUCENE-2541: Fixed NumericRangeQuery that returned incorrect results with endpoints near Long.MIN_VALUE and Long.MAX_VALUE: NumericUtils.splitRange() overflowed, if
  - the range contained a LOWER bound that was greater than (Long.MAX_VALUE - (1L << precisionStep))
  - the range contained an UPPER bound that was less than (Long.MIN_VALUE + (1L << precisionStep))
  With standard precision steps around 4, this had no effect on most queries, only those that met the above conditions. Queries with large precision steps failed more easily. Queries with precision step >=64 were not affected. Also 32 bit data types int and float were not affected. (Yonik Seeley, Uwe Schindler)

* LUCENE-2593: Fixed certain rare cases where a disk full could lead to a corrupted index (Robert Muir, Mike McCandless)

* LUCENE-2620: Fixed a bug in WildcardQuery where too many asterisks would result in unbearably slow performance. (Nick Barkas via Robert Muir)

* LUCENE-2627: Fixed bug in MMapDirectory chunking when a file is an exact multiple of the chunk size. (Robert Muir)

* LUCENE-2634: isCurrent on an NRT reader was failing to return false if the writer had just committed (Nikolay Zamosenchuk via Mike McCandless)

* LUCENE-2650: Added extra safety to MMapIndexInput clones to prevent accessing an unmapped buffer if the input is closed (Mike McCandless, Uwe Schindler, Robert Muir)

* LUCENE-2384: Reset zzBuffer in StandardTokenizerImpl when lexer is reset. (Ruben Laguna via Uwe Schindler, sub-issue of LUCENE-2074)

* LUCENE-2658: Exceptions while processing term vectors enabled for multiple fields could lead to invalid ArrayIndexOutOfBoundsExceptions. (Robert Muir, Mike McCandless)

* LUCENE-2235: Implement missing PerFieldAnalyzerWrapper.getOffsetGap(). (Javier Godoy via Uwe Schindler)

* LUCENE-2328: Fixed memory leak in how IndexWriter/Reader tracked already sync'd files. (Earwin Burrfoot via Mike McCandless)

* LUCENE-2549: Fix TimeLimitingCollector#TimeExceededException to record the absolute docid. (Uwe Schindler)

* LUCENE-2533: fix FileSwitchDirectory.listAll to not return dups when primary & secondary dirs share the same underlying directory. (Michael McCandless)

* LUCENE-2365: IndexWriter.newestSegment (used normally for testing) is fixed to return null if there are no segments. (Karthick Sankarachary via Mike McCandless)

* LUCENE-2730: Fix two rare deadlock cases in IndexWriter (Mike McCandless)

* LUCENE-2744: CheckIndex was stating total number of fields, not the number that have norms enabled, on the "test: field norms..." output. (Mark Kristensson via Mike McCandless)

* LUCENE-2759: Fixed two near-real-time cases where doc store files may be opened for read even though they are still open for write. (Mike McCandless)

* LUCENE-2618: Fix rare thread safety issue whereby IndexWriter.optimize could sometimes return even though the index wasn't fully optimized (Mike McCandless)

* LUCENE-2767: Fix thread safety issue in addIndexes(IndexReader[]) that could potentially result in index corruption. (Mike McCandless)

* LUCENE-2762: Fixed bug in IndexWriter causing it to hold open file handles against deleted files when compound-file was enabled (the default) and readers are pooled. As a result of this the peak worst-case free disk space required during optimize is now 3X the index size, when compound file is enabled (else 2X). (Mike McCandless)

* LUCENE-2216: OpenBitSet.hashCode returned different hash codes for sets that only differed by trailing zeros. (Dawid Weiss, yonik)

* LUCENE-2782: Fix rare potential thread hazard with IndexWriter.commit (Mike McCandless)

API Changes

* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default = 0.1), which means any time a merged segment is greater than 10% of the index size, it will be left in non-compound format even if compound format is on. This change was made to reduce peak transient disk usage during optimize which increased due to LUCENE-2762. (Mike McCandless)

Optimizations

* LUCENE-2556: Improve memory usage after cloning TermAttribute. (Adriano Crestani via Uwe Schindler)

* LUCENE-2098: Improve the performance of BaseCharFilter, especially for large documents. (Robin Wojciki, Koji Sekiguchi, Robert Muir)

New features

* LUCENE-2675: Add support for Lucene 3.0 stored field files also in 2.9. The file format did not change, only the version number was upgraded to mark segments that have no compression. FieldsWriter still only writes 2.9 segments as they could contain compressed fields. This cross-version index format compatibility is provided here solely because Lucene 2.9 and 3.0 have the same bugfix level, features, and the same index format with this slight compression difference. In general, Lucene does not support reading newer indexes with older library versions. (Uwe Schindler)

Documentation

* LUCENE-2239: Documented limitations in NIOFSDirectory and MMapDirectory due to Java NIO behavior when a Thread is interrupted while blocking on IO. (Simon Willnauer, Robert Muir)

======================= Release 2.9.3 2010-06-18 =======================

Changes in backwards compatibility policy

* LUCENE-2135: Added FieldCache.purge(IndexReader) method to the interface. Anyone implementing FieldCache externally will need to fix their code to implement this, on upgrading. (Mike McCandless)

Changes in runtime behavior

* LUCENE-2421: NativeFSLockFactory does not throw LockReleaseFailedException if it cannot delete the lock file, since obtaining the lock does not fail if the file is there. (Shai Erera)

* LUCENE-2060: Changed ConcurrentMergeScheduler's default for maxNumThreads from 3 to 1, because in practice we get the most gains from running a single merge in the background. More than one concurrent merge causes a lot of thrashing (though it's possible on SSD storage that there would be net gains). (Jason Rutherglen, Mike McCandless)

Bug fixes

* LUCENE-2046: IndexReader should not see the index as changed, after IndexWriter.prepareCommit has been called but before IndexWriter.commit is called. (Peter Keegan via Mike McCandless)

* LUCENE-2119: Don't throw NegativeArraySizeException if you pass Integer.MAX_VALUE as nDocs to IndexSearcher search methods. (Paul Taylor via Mike McCandless)

* LUCENE-2142: FieldCacheImpl.getStringIndex no longer throws an exception when term count exceeds doc count. (Mike McCandless)

* LUCENE-2104: NativeFSLock.release() would silently fail if the lock is held by another thread/process. (Shai Erera via Uwe Schindler)

* LUCENE-2283: Use shared memory pool for term vector and stored fields buffers. This memory will be reclaimed if needed according to the configured RAM Buffer Size for the IndexWriter. This also fixes potentially excessive memory usage when many threads are indexing a mix of small and large documents. (Tim Smith via Mike McCandless)

* LUCENE-2300: If IndexWriter is pooling reader (because NRT reader has been obtained), and addIndexes* is run, do not pool the readers from the external directory. This is harmless (NRT reader is correct), but a waste of resources. (Mike McCandless)

* LUCENE-2422: Don't reuse byte[] in IndexInput/Output -- it gains little performance, and ties up possibly large amounts of memory for apps that index large docs. (Ross Woolf via Mike McCandless)

* LUCENE-2387: Don't hang onto Fieldables from the last doc indexed, in IndexWriter, nor the Reader in Tokenizer after close is called. (Ruben Laguna, Uwe Schindler, Mike McCandless)

* LUCENE-2417: IndexCommit did not implement hashCode() and equals() consistently. Now they both take Directory and version into consideration. In addition, all of IndexCommit methods which threw UnsupportedOperationException are now abstract. (Shai Erera)

* LUCENE-2467: Fixed memory leaks in IndexWriter when large documents are indexed. (Mike McCandless)

* LUCENE-2473: Clicking on the "More Results" link in the luceneweb.war demo resulted in ArrayIndexOutOfBoundsException. (Sami Siren via Robert Muir)

* LUCENE-2476: If any exception is hit init'ing IW, release the write lock (previously we only released on IOException). (Tamas Cservenak via Mike McCandless)

* LUCENE-2478: Fix CachingWrapperFilter to not throw NPE when Filter.getDocIdSet() returns null. (Uwe Schindler, Daniel Noll)

* LUCENE-2468: Allow specifying how new deletions should be handled in CachingWrapperFilter and CachingSpanFilter. By default, new deletions are ignored in CachingWrapperFilter, since typically this filter is AND'd with a query that correctly takes new deletions into account. This should be a performance gain (higher cache hit rate) in apps that reopen readers, or use near-real-time reader (IndexWriter.getReader()), but may introduce invalid search results (allowing deleted docs to be returned) for certain cases, so a new expert ctor was added to CachingWrapperFilter to enforce deletions at a performance cost. CachingSpanFilter by default recaches if there are new deletions (Shay Banon via Mike McCandless)

* LUCENE-2299: If you open an NRT reader while addIndexes* is running, it may miss some segments (Earwin Burrfoot via Mike McCandless)

* LUCENE-2397: Don't throw NPE from SnapshotDeletionPolicy.snapshot if there are no commits yet (Shai Erera)

* LUCENE-2424: Fix FieldDoc.toString to actually return its fields (Stephen Green via Mike McCandless)

* LUCENE-2311: Always pass a "fully loaded" (terms index & doc stores) SegmentsReader to IndexWriter's mergedSegmentWarmer (if set), so that warming is free to do whatever it needs to. (Earwin Burrfoot via Mike McCandless)

* LUCENE-2486: Fixed intermittent FileNotFoundException on doc store files when a mergedSegmentWarmer is set on IndexWriter. (Mike McCandless)

API Changes

* LUCENE-2281: added doBeforeFlush to IndexWriter to allow extensions to perform operations before flush starts. Also exposed doAfterFlush as protected instead of package-private. (Shai Erera via Mike McCandless)

* LUCENE-2356: Add IndexWriter.set/getReaderTermsIndexDivisor, to set what IndexWriter passes for termsIndexDivisor to the readers it opens internally when applying deletions or creating a near-real-time reader. (Earwin Burrfoot via Mike McCandless)

Optimizations

* LUCENE-2135: On IndexReader.close, forcefully evict any entries from the FieldCache rather than waiting for the WeakHashMap to release the reference (Mike McCandless)

* LUCENE-2161: Improve concurrency of IndexReader, especially in the context of near real-time readers. (Mike McCandless)

* LUCENE-2360: Small speedup to recycling of reused per-doc RAM in IndexWriter (Robert Muir, Mike McCandless)

Build

* LUCENE-2488: Support build with JDK 1.4 and exclude Java 1.5 contrib modules on request (pass '-Dforce.jdk14.build=true') when compiling/testing/packaging. This marks the benchmark contrib also as Java 1.5, as it depends on fast-vector-highlighter. (Uwe Schindler)

======================= Release 2.9.2 2010-02-26 =======================

Bug fixes

* LUCENE-2045: Fix silly FileNotFoundException hit if you enable infoStream on IndexWriter and then add an empty document and commit (Shai Erera via Mike McCandless)

* LUCENE-2088: addAttribute() should only accept interfaces that extend Attribute. (Shai Erera, Uwe Schindler)

* LUCENE-2092: BooleanQuery was ignoring disableCoord in its hashCode and equals methods, causing bad things to happen when caching BooleanQueries. (Chris Hostetter, Mike McCandless)

* LUCENE-2095: Fixes: when two threads call IndexWriter.commit() at the same time, it's possible for commit to return control back to one of the threads before all changes are actually committed. (Sanne Grinovero via Mike McCandless)

* LUCENE-2166: Don't incorrectly keep warning about the same immense term, when IndexWriter.infoStream is on. (Mike McCandless)

* LUCENE-2158: At high indexing rates, NRT reader could temporarily lose deletions. (Mike McCandless)

* LUCENE-2182: DEFAULT_ATTRIBUTE_FACTORY was failing to load implementation class when interface was loaded by a different class loader.
(Uwe Schindler, reported on java-user by Ahmed El-dawy) * LUCENE-2257: Increase max number of unique terms in one segment to termIndexInterval (default 128) * ~2.1 billion = ~274 billion. (Tom Burton-West via Mike McCandless) * LUCENE-2260: Fixed AttributeSource to not hold a strong reference to the Attribute/AttributeImpl classes which prevents unloading of custom attributes loaded by other classloaders (e.g. in Solr plugins). (Uwe Schindler) * LUCENE-1941: Fix Min/MaxPayloadFunction returns 0 when only one payload is present. (Erik Hatcher, Mike McCandless via Uwe Schindler) * LUCENE-2270: Queries consisting of all zero-boost clauses (for example, text:foo^0) sorted incorrectly and produced invalid docids. (yonik) API Changes * LUCENE-2190: Added a new class CustomScoreProvider to function package that can be subclassed to provide custom scoring to CustomScoreQuery. The methods in CustomScoreQuery that did this before were deprecated and replaced by a method getCustomScoreProvider(IndexReader) that returns a custom score implementation using the above class. The change is necessary with per-segment searching, as CustomScoreQuery is a stateless class (like all other Queries) and does not know about the currently searched segment. This API works similar to Filter's getDocIdSet(IndexReader). (Paul chez Jamespot via Mike McCandless, Uwe Schindler) * LUCENE-2080: Deprecate Version.LUCENE_CURRENT, as using this constant will cause backwards compatibility problems when upgrading Lucene. See the Version javadocs for additional information. (Robert Muir) Optimizations * LUCENE-2086: When resolving deleted terms, do so in term sort order for better performance (Bogdan Ghidireac via Mike McCandless) * LUCENE-2258: Remove unneeded synchronization in FuzzyTermEnum. (Uwe Schindler, Robert Muir) Test Cases * LUCENE-2114: Change TestFilteredSearch to test on multi-segment index as well. (Simon Willnauer via Mike McCandless) * LUCENE-2211: Improves BaseTokenStreamTestCase to use a fake attribute that checks if clearAttributes() was called correctly. (Uwe Schindler, Robert Muir) * LUCENE-2207, LUCENE-2219: Improve BaseTokenStreamTestCase to check if end() is implemented correctly. (Koji Sekiguchi, Robert Muir) Documentation * LUCENE-2114: Improve javadocs of Filter to call out that the provided reader is per-segment (Simon Willnauer via Mike McCandless) ======================= Release 2.9.1 2009-11-06 ======================= Changes in backwards compatibility policy * LUCENE-2002: Add required Version matchVersion argument when constructing QueryParser or MultiFieldQueryParser and, default (as of 2.9) enablePositionIncrements to true to match StandardAnalyzer's 2.9 default (Uwe Schindler, Mike McCandless) Bug fixes * LUCENE-1974: Fixed nasty bug in BooleanQuery (when it used BooleanScorer for scoring), whereby some matching documents fail to be collected. (Fulin Tang via Mike McCandless) * LUCENE-1124: Make sure FuzzyQuery always matches the precise term. (stefatwork@gmail.com via Mike McCandless) * LUCENE-1976: Fix IndexReader.isCurrent() to return the right thing when the reader is a near real-time reader. (Jake Mannix via Mike McCandless) * LUCENE-1986: Fix NPE when scoring PayloadNearQuery (Peter Keegan, Mark Miller via Mike McCandless) * LUCENE-1992: Fix thread hazard if a merge is committing just as an exception occurs during sync (Uwe Schindler, Mike McCandless) * LUCENE-1995: Note in javadocs that IndexWriter.setRAMBufferSizeMB cannot exceed 2048 MB, and throw IllegalArgumentException if it does. 
(Aaron McKee, Yonik Seeley, Mike McCandless) * LUCENE-2004: Fix Constants.LUCENE_MAIN_VERSION to not be inlined by client code. (Uwe Schindler) * LUCENE-2016: Replace illegal U+FFFF character with the replacement char (U+FFFD) during indexing, to prevent silent index corruption. (Peter Keegan, Mike McCandless) API Changes * Un-deprecate search(Weight weight, Filter filter, int n) from Searchable interface (deprecated by accident). (Uwe Schindler) * Un-deprecate o.a.l.util.Version constants. (Mike McCandless) * LUCENE-1987: Un-deprecate some ctors of Token, as they will not be removed in 3.0 and are still useful. Also add some missing o.a.l.util.Version constants for enabling invalid acronym settings in StandardAnalyzer to be compatible with the coming Lucene 3.0. (Uwe Schindler) * LUCENE-1973: Un-deprecate IndexSearcher.setDefaultFieldSortScoring, to allow controlling per-IndexSearcher whether scores are computed when sorting by field. (Uwe Schindler, Mike McCandless) Documentation * LUCENE-1955: Fix Hits deprecation notice to point users in right direction. (Mike McCandless, Mark Miller) * Fix javadoc about score tracking done by search methods in Searcher and IndexSearcher. (Mike McCandless) * LUCENE-2008: Javadoc improvements for TokenStream/Tokenizer/Token (Luke Nezda via Mike McCandless) ======================= Release 2.9.0 2009-09-23 ======================= Changes in backwards compatibility policy * LUCENE-1575: Searchable.search(Weight, Filter, int, Sort) no longer computes a document score for each hit by default. If document score tracking is still needed, you can call IndexSearcher.setDefaultFieldSortScoring(true, true) to enable both per-hit and maxScore tracking; however, this is deprecated and will be removed in 3.0. Alternatively, use Searchable.search(Weight, Filter, Collector) and pass in a TopFieldCollector instance, using the following code sample: TopFieldCollector tfc = TopFieldCollector.create(sort, numHits, fillFields, true /* trackDocScores */, true /* trackMaxScore */, false /* docsInOrder */); searcher.search(query, tfc); TopDocs results = tfc.topDocs(); Note that your Sort object cannot use SortField.AUTO when you directly instantiate TopFieldCollector. Also, the method search(Weight, Filter, Collector) was added to the Searchable interface and the Searcher abstract class to replace the deprecated HitCollector versions. If you either implement Searchable or extend Searcher, you should change your code to implement this method. If you already extend IndexSearcher, no further changes are needed to use Collector. Finally, the values Float.NaN and Float.NEGATIVE_INFINITY are not valid scores. Lucene uses these values internally in certain places, so if you have hits with such scores, it will cause problems. (Shai Erera via Mike McCandless) * LUCENE-1687: All methods and parsers from the interface ExtendedFieldCache have been moved into FieldCache. ExtendedFieldCache is now deprecated and contains only a few declarations for binary backwards compatibility. ExtendedFieldCache will be removed in version 3.0. Users of FieldCache and ExtendedFieldCache will be able to plug in Lucene 2.9 without recompilation. The auto cache (FieldCache.getAuto) is now deprecated. Due to the merge of ExtendedFieldCache and FieldCache, FieldCache can now additionally return long[] and double[] arrays in addition to int[] and float[] and StringIndex. 
The interface changes are only notable for users implementing the interfaces, which was unlikely done, because there is no possibility to change Lucene's FieldCache implementation. (Grant Ingersoll, Uwe Schindler) * LUCENE-1630, LUCENE-1771: Weight, previously an interface, is now an abstract class. Some of the method signatures have changed, but it should be fairly easy to see what adjustments must be made to existing code to sync up with the new API. You can find more detail in the API Changes section. Going forward Searchable will be kept for convenience only and may be changed between minor releases without any deprecation process. It is not recommended that you implement it, but rather extend Searcher. (Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless) * LUCENE-1422, LUCENE-1693: The new Attribute based TokenStream API (see below) has some backwards breaks in rare cases. We did our best to make the transition as easy as possible and you are not likely to run into any problems. If your tokenizers still implement next(Token) or next(), the calls are automatically wrapped. The indexer and query parser use the new API (eg use incrementToken() calls). All core TokenStreams are implemented using the new API. You can mix old and new API style TokenFilters/TokenStream. Problems only occur when you have done the following: You have overridden next(Token) or next() in one of the non-abstract core TokenStreams/-Filters. These classes should normally be final, but some of them are not. In this case, next(Token)/next() would never be called. To fail early with a hard compile/runtime error, the next(Token)/next() methods in these TokenStreams/-Filters were made final in this release. (Michael Busch, Uwe Schindler) * LUCENE-1763: MergePolicy now requires an IndexWriter instance to be passed upon instantiation. As a result, IndexWriter was removed as a method argument from all MergePolicy methods. (Shai Erera via Mike McCandless) * LUCENE-1748: LUCENE-1001 introduced PayloadSpans, but this was a back compat break and caused custom SpanQuery implementations to fail at runtime in a variety of ways. This issue attempts to remedy things by causing a compile time break on custom SpanQuery implementations and removing the PayloadSpans class, with its functionality now moved to Spans. To help in alleviating future back compat pain, Spans has been changed from an interface to an abstract class. (Hugh Cayless, Mark Miller) * LUCENE-1808: Query.createWeight has been changed from protected to public. This will be a back compat break if you have overridden this method - but you are likely already affected by the LUCENE-1693 (make Weight abstract rather than an interface) back compat break if you have overridden Query.creatWeight, so we have taken the opportunity to make this change. (Tim Smith, Shai Erera via Mark Miller) * LUCENE-1708 - IndexReader.document() no longer checks if the document is deleted. You can call IndexReader.isDeleted(n) prior to calling document(n). (Shai Erera via Mike McCandless) Changes in runtime behavior * LUCENE-1424: QueryParser now by default uses constant score auto rewriting when it generates a WildcardQuery and PrefixQuery (it already does so for TermRangeQuery, as well). Call setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) to revert to slower BooleanQuery rewriting method. 
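  A minimal sketch of reverting to the old rewrite behavior (field name,
  analyzer choice and query string are illustrative only):

    QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
        new StandardAnalyzer(Version.LUCENE_29));
    parser.setMultiTermRewriteMethod(
        MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    Query query = parser.parse("lucen* OR searc*");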
(Mark Miller via Mike McCandless) * LUCENE-1575: As of 2.9, the core collectors as well as IndexSearcher's search methods that return top N results, no longer filter documents with scores <= 0.0. If you rely on this functionality you can use PositiveScoresOnlyCollector like this: TopDocsCollector tdc = new TopScoreDocCollector(10); Collector c = new PositiveScoresOnlyCollector(tdc); searcher.search(query, c); TopDocs hits = tdc.topDocs(); ... * LUCENE-1604: IndexReader.norms(String field) is now allowed to return null if the field has no norms, as long as you've previously called IndexReader.setDisableFakeNorms(true). This setting now defaults to false (to preserve the fake norms back compatible behavior) but in 3.0 will be hardwired to true. (Shon Vella via Mike McCandless). * LUCENE-1624: If you open IndexWriter with create=true and autoCommit=false on an existing index, IndexWriter no longer writes an empty commit when it's created. (Paul Taylor via Mike McCandless) * LUCENE-1593: When you call Sort() or Sort.setSort(String field, boolean reverse), the resulting SortField array no longer ends with SortField.FIELD_DOC (it was unnecessary as Lucene breaks ties internally by docID). (Shai Erera via Michael McCandless) * LUCENE-1542: When the first token(s) have 0 position increment, IndexWriter used to incorrectly record the position as -1, if no payload is present, or Integer.MAX_VALUE if a payload is present. This causes positional queries to fail to match. The bug is now fixed, but if your app relies on the buggy behavior then you must call IndexWriter.setAllowMinus1Position(). That API is deprecated so you must fix your application, and rebuild your index, to not rely on this behavior by the 3.0 release of Lucene. (Jonathan Mamou, Mark Miller via Mike McCandless) * LUCENE-1715: Finalizers have been removed from the 4 core classes that still had them, since they will cause GC to take longer, thus tying up memory for longer, and at best they mask buggy app code. DirectoryReader (returned from IndexReader.open) & IndexWriter previously released the write lock during finalize. SimpleFSDirectory.FSIndexInput closed the descriptor in its finalizer, and NativeFSLock released the lock. It's possible applications will be affected by this, but only if the application is failing to close reader/writers. (Brian Groose via Mike McCandless) * LUCENE-1717: Fixed IndexWriter to account for RAM usage of buffered deletions. (Mike McCandless) * LUCENE-1727: Ensure that fields are stored & retrieved in the exact order in which they were added to the document. This was true in all Lucene releases before 2.3, but was broken in 2.3 and 2.4, and is now fixed in 2.9. (Mike McCandless) * LUCENE-1678: The addition of Analyzer.reusableTokenStream accidentally broke back compatibility of external analyzers that subclassed core analyzers that implemented tokenStream but not reusableTokenStream. This is now fixed, such that if reusableTokenStream is invoked on such a subclass, that method will forcefully fallback to tokenStream. (Mike McCandless) * LUCENE-1801: Token.clear() and Token.clearNoTermBuffer() now also clear startOffset, endOffset and type. This is not likely to affect any Tokenizer chains, as Tokenizers normally always set these three values. This change was made to be conform to the new AttributeImpl.clear() and AttributeSource.clearAttributes() to work identical for Token as one for all AttributeImpl and the 6 separate AttributeImpls. 
(Uwe Schindler, Michael Busch) * LUCENE-1483: When searching over multiple segments, a new Scorer is now created for each segment. Searching has been telescoped out a level and IndexSearcher now operates much like MultiSearcher does. The Weight is created only once for the top level Searcher, but each Scorer is passed a per-segment IndexReader. This will result in doc ids in the Scorer being internal to the per-segment IndexReader. It has always been outside of the API to count on a given IndexReader to contain every doc id in the index - and if you have been ignoring MultiSearcher in your custom code and counting on this fact, you will find your code no longer works correctly. If a custom Scorer implementation uses any caches/filters that rely on being based on the top level IndexReader, it will need to be updated to correctly use contextless caches/filters eg you can't count on the IndexReader to contain any given doc id or all of the doc ids. (Mark Miller, Mike McCandless) * LUCENE-1846: DateTools now uses the US locale to format the numbers in its date/time strings instead of the default locale. For most locales there will be no change in the index format, as DateFormatSymbols is using ASCII digits. The usage of the US locale is important to guarantee correct ordering of generated terms. (Uwe Schindler) * LUCENE-1860: MultiTermQuery now defaults to CONSTANT_SCORE_AUTO_REWRITE_DEFAULT rewrite method (previously it was SCORING_BOOLEAN_QUERY_REWRITE). This means that PrefixQuery and WildcardQuery will now produce constant score for all matching docs, equal to the boost of the query. (Mike McCandless) API Changes * LUCENE-1419: Add expert API to set custom indexing chain. This API is package-protected for now, so we don't have to officially support it. Yet, it will give us the possibility to try out different consumers in the chain. (Michael Busch) * LUCENE-1427: DocIdSet.iterator() is now allowed to throw IOException. (Paul Elschot, Mike McCandless) * LUCENE-1422, LUCENE-1693: New TokenStream API that uses a new class called AttributeSource instead of the Token class, which is now a utility class that holds common Token attributes. All attributes that the Token class had have been moved into separate classes: TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute, TypeAttribute and FlagsAttribute. The new API is much more flexible; it allows to combine the Attributes arbitrarily and also to define custom Attributes. The new API has the same performance as the old next(Token) approach. For conformance with this new API Tee-/SinkTokenizer was deprecated and replaced by a new TeeSinkTokenFilter. (Michael Busch, Uwe Schindler; additional contributions and bug fixes by Daniel Shane, Doron Cohen) * LUCENE-1467: Add nextDoc() and next(int) methods to OpenBitSetIterator. These methods can be used to avoid additional calls to doc(). (Michael Busch) * LUCENE-1468: Deprecate Directory.list(), which sometimes (in FSDirectory) filters out files that don't look like index files, in favor of new Directory.listAll(), which does no filtering. Also, listAll() will never return null; instead, it throws an IOException (or subclass). Specifically, FSDirectory.listAll() will throw the newly added NoSuchDirectoryException if the directory does not exist. (Marcel Reutegger, Mike McCandless) * LUCENE-1546: Add IndexReader.flush(Map commitUserData), allowing you to record an opaque commitUserData (maps String -> String) into the commit written by IndexReader. 
This matches IndexWriter's commit methods. (Jason Rutherglen via Mike McCandless) * LUCENE-652: Added org.apache.lucene.document.CompressionTools, to enable compressing & decompressing binary content, external to Lucene's indexing. Deprecated Field.Store.COMPRESS. * LUCENE-1561: Renamed Field.omitTf to Field.omitTermFreqAndPositions (Otis Gospodnetic via Mike McCandless) * LUCENE-1500: Added new InvalidTokenOffsetsException to Highlighter methods to denote issues when offsets in TokenStream tokens exceed the length of the provided text. (Mark Harwood) * LUCENE-1575, LUCENE-1483: HitCollector is now deprecated in favor of a new Collector abstract class. For easy migration, people can use HitCollectorWrapper which translates (wraps) HitCollector into Collector. Note that this class is also deprecated and will be removed when HitCollector is removed. Also TimeLimitedCollector is deprecated in favor of the new TimeLimitingCollector which extends Collector. (Shai Erera, Mark Miller, Mike McCandless) * LUCENE-1592: The method TermsEnum.skipTo() was deprecated, because it is used nowhere in core/contrib and there is only a very ineffective default implementation available. If you want to position a TermEnum to another Term, create a new one using IndexReader.terms(Term). (Uwe Schindler) * LUCENE-1621: MultiTermQuery.getTerm() has been deprecated as it does not make sense for all subclasses of MultiTermQuery. Check individual subclasses to see if they support getTerm(). (Mark Miller) * LUCENE-1636: Make TokenFilter.input final so it's set only once. (Wouter Heijke, Uwe Schindler via Mike McCandless). * LUCENE-1658, LUCENE-1451: Renamed FSDirectory to SimpleFSDirectory (but left an FSDirectory base class). Added an FSDirectory.open static method to pick a good default FSDirectory implementation given the OS. FSDirectories should now be instantiated using FSDirectory.open or with public constructors rather than FSDirectory.getDirectory(), which has been deprecated. (Michael McCandless, Uwe Schindler, yonik) * LUCENE-1665: Deprecate SortField.AUTO, to be removed in 3.0. Instead, when sorting by field, the application should explicitly state the type of the field. (Mike McCandless) * LUCENE-1660: StopFilter, StandardAnalyzer, StopAnalyzer now require up front specification of enablePositionIncrement (Mike McCandless) * LUCENE-1614: DocIdSetIterator's next() and skipTo() were deprecated in favor of the new nextDoc() and advance(). The new methods return the doc Id they landed on, saving an extra call to doc() in most cases. For easy migration of the code, you can change the calls to next() to nextDoc() != DocIdSetIterator.NO_MORE_DOCS and similarly for skipTo(). However it is advised that you take advantage of the returned doc ID and not call doc() following those two. Also, doc() was deprecated in favor of docID(). docID() should return -1 or NO_MORE_DOCS if nextDoc/advance were not called yet, or NO_MORE_DOCS if the iterator has exhausted. Otherwise it should return the current doc ID. (Shai Erera via Mike McCandless) * LUCENE-1672: All ctors/opens and other methods using String/File to specify the directory in IndexReader, IndexWriter, and IndexSearcher were deprecated. You should instantiate the Directory manually before and pass it to these classes (LUCENE-1451, LUCENE-1658). (Uwe Schindler) * LUCENE-1407: Move RemoteSearchable, RemoteCachingWrapperFilter out of Lucene's core into new contrib/remote package. 
Searchable no longer extends java.rmi.Remote (Simon Willnauer via Mike McCandless) * LUCENE-1677: The global property org.apache.lucene.SegmentReader.class, and ReadOnlySegmentReader.class are now deprecated, to be removed in 3.0. src/gcj/* has been removed. (Earwin Burrfoot via Mike McCandless) * LUCENE-1673: Deprecated NumberTools in favour of the new NumericRangeQuery and its new indexing format for numeric or date values. (Uwe Schindler) * LUCENE-1630, LUCENE-1771: Weight is now an abstract class, and adds a scorer(IndexReader, boolean /* scoreDocsInOrder */, boolean /* topScorer */) method instead of scorer(IndexReader). IndexSearcher uses this method to obtain a scorer matching the capabilities of the Collector wrt orderedness of docIDs. Some Scorers (like BooleanScorer) are much more efficient if out-of-order documents scoring is allowed by a Collector. Collector must now implement acceptsDocsOutOfOrder. If you write a Collector which does not care about doc ID orderness, it is recommended that you return true. Weight has a scoresDocsOutOfOrder method, which by default returns false. If you create a Weight which will score documents out of order if requested, you should override that method to return true. BooleanQuery's setAllowDocsOutOfOrder and getAllowDocsOutOfOrder have been deprecated as they are not needed anymore. BooleanQuery will now score docs out of order when used with a Collector that can accept docs out of order. Finally, Weight#explain now takes a sub-reader and sub-docID, rather than a top level reader and docID. (Shai Erera, Chris Hostetter, Martin Ruckli, Mark Miller via Mike McCandless) * LUCENE-1466, LUCENE-1906: Added CharFilter and MappingCharFilter, which allows chaining & mapping of characters before tokenizers run. CharStream (subclass of Reader) is the base class for custom java.io.Reader's, that support offset correction. Tokenizers got an additional method correctOffset() that is passed down to the underlying CharStream if input is a subclass of CharStream/-Filter. (Koji Sekiguchi via Mike McCandless, Uwe Schindler) * LUCENE-1703: Add IndexWriter.waitForMerges. (Tim Smith via Mike McCandless) * LUCENE-1625: CheckIndex's programmatic API now returns separate classes detailing the status of each component in the index, and includes more detailed status than previously. (Tim Smith via Mike McCandless) * LUCENE-1713: Deprecated RangeQuery and RangeFilter and renamed to TermRangeQuery and TermRangeFilter. TermRangeQuery is in constant score auto rewrite mode by default. The new classes also have new ctors taking field and term ranges as Strings (see also LUCENE-1424). (Uwe Schindler) * LUCENE-1609: The termInfosIndexDivisor must now be specified up-front when opening the IndexReader. Attempts to call IndexReader.setTermInfosIndexDivisor will hit an UnsupportedOperationException. This was done to enable removal of all synchronization in TermInfosReader, which previously could cause threads to pile up in certain cases. (Dan Rosher via Mike McCandless) * LUCENE-1688: Deprecate static final String stop word array in and StopAnalzyer and replace it with an immutable implementation of CharArraySet. (Simon Willnauer via Mark Miller) * LUCENE-1742: SegmentInfos, SegmentInfo and SegmentReader have been made public as expert, experimental APIs. These APIs may suddenly change from release to release (Jason Rutherglen via Mike McCandless). * LUCENE-1754: QueryWeight.scorer() can return null if no documents are going to be matched by the query. 
Similarly, Filter.getDocIdSet() can return null if no documents are going to be accepted by the Filter. Note that these 'can' return null, however they don't have to and can return a Scorer/DocIdSet which does not match / reject all documents. This is already the behavior of some QueryWeight/Filter implementations, and is documented here just for emphasis. (Shai Erera via Mike McCandless) * LUCENE-1705: Added IndexWriter.deleteAllDocuments. (Tim Smith via Mike McCandless) * LUCENE-1460: Changed TokenStreams/TokenFilters in contrib to use the new TokenStream API. (Robert Muir, Michael Busch) * LUCENE-1748: LUCENE-1001 introduced PayloadSpans, but this was a back compat break and caused custom SpanQuery implementations to fail at runtime in a variety of ways. This issue attempts to remedy things by causing a compile time break on custom SpanQuery implementations and removing the PayloadSpans class, with its functionality now moved to Spans. To help in alleviating future back compat pain, Spans has been changed from an interface to an abstract class. (Hugh Cayless, Mark Miller) * LUCENE-1808: Query.createWeight has been changed from protected to public. (Tim Smith, Shai Erera via Mark Miller) * LUCENE-1826: Add constructors that take AttributeSource and AttributeFactory to all Tokenizer implementations. (Michael Busch) * LUCENE-1847: Similarity#idf for both a Term and Term Collection have been deprecated. New versions that return an IDFExplanation have been added. (Yasoja Seneviratne, Mike McCandless, Mark Miller) * LUCENE-1877: Made NativeFSLockFactory the default for the new FSDirectory API (open(), FSDirectory subclass ctors). All FSDirectory system properties were deprecated and all lock implementations use no lock prefix if the locks are stored inside the index directory. Because the deprecated String/File ctors of IndexWriter and IndexReader (LUCENE-1672) and FSDirectory.getDirectory() still use the old SimpleFSLockFactory and the new API NativeFSLockFactory, we strongly recommend not to mix deprecated and new API. (Uwe Schindler, Mike McCandless) * LUCENE-1911: Added a new method isCacheable() to DocIdSet. This method should return true, if the underlying implementation does not use disk I/O and is fast enough to be directly cached by CachingWrapperFilter. OpenBitSet, SortedVIntList, and DocIdBitSet are such candidates. The default implementation of the abstract DocIdSet class returns false. In this case, CachingWrapperFilter copies the DocIdSetIterator into an OpenBitSet for caching. (Uwe Schindler, Thomas Becker) Bug fixes * LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals() implementation - Leads to Solr Cache misses. (Todd Feak, Mark Miller via yonik) * LUCENE-1327: Fix TermSpans#skipTo() to behave as specified in javadocs of Terms#skipTo(). (Michael Busch) * LUCENE-1573: Do not ignore InterruptedException (caused by Thread.interrupt()) nor enter deadlock/spin loop. Now, an interrupt will cause a RuntimeException to be thrown. In 3.0 we will change public APIs to throw InterruptedException. (Jeremy Volkman via Mike McCandless) * LUCENE-1590: Fixed stored-only Field instances do not change the value of omitNorms, omitTermFreqAndPositions in FieldInfo; when you retrieve such fields they will now have omitNorms=true and omitTermFreqAndPositions=false (though these values are unused). (Uwe Schindler via Mike McCandless) * LUCENE-1587: RangeQuery#equals() could consider a RangeQuery without a collator equal to one with a collator. 
(Mark Platvoet via Mark Miller) * LUCENE-1600: Don't call String.intern unnecessarily in some cases when loading documents from the index. (P Eger via Mike McCandless) * LUCENE-1611: Fix case where OutOfMemoryException in IndexWriter could cause "infinite merging" to happen. (Christiaan Fluit via Mike McCandless) * LUCENE-1623: Properly handle back-compatibility of 2.3.x indexes that contain field names with non-ascii characters. (Mike Streeton via Mike McCandless) * LUCENE-1593: MultiSearcher and ParallelMultiSearcher did not break ties (in sort) by doc Id in a consistent manner (i.e., if Sort.FIELD_DOC was used vs. when it wasn't). (Shai Erera via Michael McCandless) * LUCENE-1647: Fix case where IndexReader.undeleteAll would cause the segment's deletion count to be incorrect. (Mike McCandless) * LUCENE-1542: When the first token(s) have 0 position increment, IndexWriter used to incorrectly record the position as -1, if no payload is present, or Integer.MAX_VALUE if a payload is present. This causes positional queries to fail to match. The bug is now fixed, but if your app relies on the buggy behavior then you must call IndexWriter.setAllowMinus1Position(). That API is deprecated so you must fix your application, and rebuild your index, to not rely on this behavior by the 3.0 release of Lucene. (Jonathan Mamou, Mark Miller via Mike McCandless) * LUCENE-1658: Fixed MMapDirectory to correctly throw IOExceptions on EOF, removed numeric overflow possibilities and added support for a hack to unmap the buffers on closing IndexInput. (Uwe Schindler) * LUCENE-1681: Fix infinite loop caused by a call to DocValues methods getMinValue, getMaxValue, getAverageValue. (Simon Willnauer via Mark Miller) * LUCENE-1599: Add clone support for SpanQuerys. SpanRegexQuery counts on this functionality and does not work correctly without it. (Billow Gao, Mark Miller) * LUCENE-1718: Fix termInfosIndexDivisor to carry over to reopened readers (Mike McCandless) * LUCENE-1583: SpanOrQuery skipTo() doesn't always move forwards as Spans documentation indicates it should. (Moti Nisenson via Mark Miller) * LUCENE-1566: Sun JVM Bug http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6478546 causes invalid OutOfMemoryError when reading too many bytes at once from a file on 32bit JVMs that have a large maximum heap size. This fix adds set/getReadChunkSize to FSDirectory so that large reads are broken into chunks, to work around this JVM bug. On 32bit JVMs the default chunk size is 100 MB; on 64bit JVMs, which don't show the bug, the default is Integer.MAX_VALUE. (Simon Willnauer via Mike McCandless) * LUCENE-1448: Added TokenStream.end() to perform end-of-stream operations (ie to return the end offset of the tokenization). This is important when multiple fields with the same name are added to a document, to ensure offsets recorded in term vectors for all of the instances are correct. (Mike McCandless, Mark Miller, Michael Busch) * LUCENE-1805: CloseableThreadLocal did not allow a null Object in get(), although it does allow it in set(Object). Fix get() to not assert the object is not null. (Shai Erera via Mike McCandless) * LUCENE-1801: Changed all Tokenizers or TokenStreams in core/contrib) that are the source of Tokens to always call AttributeSource.clearAttributes() first. (Uwe Schindler) * LUCENE-1819: MatchAllDocsQuery.toString(field) should produce output that is parsable by the QueryParser. 
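  Relating to the LUCENE-1566 entry above, a minimal sketch of the new chunk
  size setter (the index path is illustrative only):

    FSDirectory dir = FSDirectory.open(new File("/path/to/index"));
    dir.setReadChunkSize(50 * 1024 * 1024);  // cap single reads at 50 MB on 32-bit JVMs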
(John Wang, Mark Miller) * LUCENE-1836: Fix localization bug in the new query parser and add new LocalizedTestCase as base class for localization junit tests. (Robert Muir, Uwe Schindler via Michael Busch) * LUCENE-1847: PhraseQuery/TermQuery/SpanQuery use IndexReader specific stats in their Weight#explain methods - these stats should be corpus wide. (Yasoja Seneviratne, Mike McCandless, Mark Miller) * LUCENE-1885: Fix the bug that NativeFSLock.isLocked() did not work, if the lock was obtained by another NativeFSLock(Factory) instance. Because of this IndexReader.isLocked() and IndexWriter.isLocked() did not work correctly. (Uwe Schindler) * LUCENE-1899: Fix O(N^2) CPU cost when setting docIDs in order in an OpenBitSet, due to an inefficiency in how the underlying storage is reallocated. (Nadav Har'El via Mike McCandless) * LUCENE-1918: Fixed cases where a ParallelReader would generate exceptions on being passed to IndexWriter.addIndexes(IndexReader[]). First case was when the ParallelReader was empty. Second case was when the ParallelReader used to contain documents with TermVectors, but all such documents have been deleted. (Christian Kohlschütter via Mike McCandless) New features * LUCENE-1411: Added expert API to open an IndexWriter on a prior commit, obtained from IndexReader.listCommits. This makes it possible to rollback changes to an index even after you've closed the IndexWriter that made the changes, assuming you are using an IndexDeletionPolicy that keeps past commits around. This is useful when building transactional support on top of Lucene. (Mike McCandless) * LUCENE-1382: Add an optional arbitrary Map (String -> String) "commitUserData" to IndexWriter.commit(), which is stored in the segments file and is then retrievable via IndexReader.getCommitUserData instance and static methods. (Shalin Shekhar Mangar via Mike McCandless) * LUCENE-1420: Similarity now has a computeNorm method that allows custom Similarity classes to override how norm is computed. It's provided a FieldInvertState instance that contains details from inverting the field. The default impl is boost * lengthNorm(numTerms), to be backwards compatible. Also added {set/get}DiscountOverlaps to DefaultSimilarity, to control whether overlapping tokens (tokens with 0 position increment) should be counted in lengthNorm. (Andrzej Bialecki via Mike McCandless) * LUCENE-1424: Moved constant score query rewrite capability into MultiTermQuery, allowing TermRangeQuery, PrefixQuery and WildcardQuery to switch between constant-score rewriting or BooleanQuery expansion rewriting via a new setRewriteMethod method. Deprecated ConstantScoreRangeQuery (Mark Miller via Mike McCandless) * LUCENE-1461: Added FieldCacheRangeFilter, a RangeFilter for single-term fields that uses FieldCache to compute the filter. If your documents all have a single term for a given field, and you need to create many RangeFilters with varying lower/upper bounds, then this is likely a much faster way to create the filters than RangeFilter. FieldCacheRangeFilter allows ranges on all data types, FieldCache supports (term ranges, byte, short, int, long, float, double). However, it comes at the expense of added RAM consumption and slower first-time usage due to populating the FieldCache. 
It also does not support collation (Tim Sturge, Matt Ericson via Mike McCandless and Uwe Schindler) * LUCENE-1296: add protected method CachingWrapperFilter.docIdSetToCache to allow subclasses to choose which DocIdSet implementation to use (Paul Elschot via Mike McCandless) * LUCENE-1390: Added ASCIIFoldingFilter, a Filter that converts alphabetic, numeric, and symbolic Unicode characters which are not in the first 127 ASCII characters (the "Basic Latin" Unicode block) into their ASCII equivalents, if one exists. ISOLatin1AccentFilter, which handles a subset of this filter, has been deprecated. (Andi Vajda, Steven Rowe via Mark Miller) * LUCENE-1478: Added new SortField constructor allowing you to specify a custom FieldCache parser to generate numeric values from terms for a field. (Uwe Schindler via Mike McCandless) * LUCENE-1528: Add support for Ideographic Space to the queryparser. (Luis Alves via Michael Busch) * LUCENE-1487: Added FieldCacheTermsFilter, to filter by multiple terms on single-valued fields. The filter loads the FieldCache for the field the first time it's called, and subsequent usage of that field, even with different Terms in the filter, are fast. (Tim Sturge, Shalin Shekhar Mangar via Mike McCandless). * LUCENE-1314: Add clone(), clone(boolean readOnly) and reopen(boolean readOnly) to IndexReader. Cloning an IndexReader gives you a new reader which you can make changes to (deletions, norms) without affecting the original reader. Now, with clone or reopen you can change the readOnly of the original reader. (Jason Rutherglen, Mike McCandless) * LUCENE-1506: Added FilteredDocIdSet, an abstract class which you subclass to implement the "match" method to accept or reject each docID. Unlike ChainedFilter (under contrib/misc), FilteredDocIdSet never requires you to materialize the full bitset. Instead, match() is called on demand per docID. (John Wang via Mike McCandless) * LUCENE-1398: Add ReverseStringFilter to contrib/analyzers, a filter to reverse the characters in each token. (Koji Sekiguchi via yonik) * LUCENE-1551: Add expert IndexReader.reopen(IndexCommit) to allow efficiently opening a new reader on a specific commit, sharing resources with the original reader. (Torin Danil via Mike McCandless) * LUCENE-1434: Added org.apache.lucene.util.IndexableBinaryStringTools, to encode byte[] as String values that are valid terms, and maintain sort order of the original byte[] when the bytes are interpreted as unsigned. (Steven Rowe via Mike McCandless) * LUCENE-1543: Allow MatchAllDocsQuery to optionally use norms from a specific fields to set the score for a document. (Karl Wettin via Mike McCandless) * LUCENE-1586: Add IndexReader.getUniqueTermCount(). (Mike McCandless via Derek) * LUCENE-1516: Added "near real-time search" to IndexWriter, via a new expert getReader() method. This method returns a reader that searches the full index, including any uncommitted changes in the current IndexWriter session. This should result in a faster turnaround than the normal approach of commiting the changes and then reopening a reader. (Jason Rutherglen via Mike McCandless) * LUCENE-1603: Added new MultiTermQueryWrapperFilter, to wrap any MultiTermQuery as a Filter. Also made some improvements to MultiTermQuery: return DocIdSet.EMPTY_DOCIDSET if there are no terms in the enum; track the total number of terms it visited during rewrite (getTotalNumberOfTerms). FilteredTermEnum is also more friendly to subclassing. (Uwe Schindler via Mike McCandless) * LUCENE-1605: Added BitVector.subset(). 
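  Relating to the LUCENE-1516 entry above, a minimal near-real-time sketch
  (dir, analyzer and doc are assumed to exist elsewhere):

    IndexWriter writer = new IndexWriter(dir, analyzer,
        IndexWriter.MaxFieldLength.UNLIMITED);
    writer.addDocument(doc);
    IndexReader nrtReader = writer.getReader();  // sees the uncommitted doc
    IndexSearcher searcher = new IndexSearcher(nrtReader);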
(Jeremy Volkman via Mike McCandless) * LUCENE-1618: Added FileSwitchDirectory that enables files with specified extensions to be stored in a primary directory and the rest of the files to be stored in the secondary directory. For example, this can be useful for the large doc-store (stored fields, term vectors) files in FSDirectory and the rest of the index files in a RAMDirectory. (Jason Rutherglen via Mike McCandless) * LUCENE-1494: Added FieldMaskingSpanQuery which can be used to cross-correlate Spans from different fields. (Paul Cowan and Chris Hostetter) * LUCENE-1634: Add calibrateSizeByDeletes to LogMergePolicy, to take deletions into account when considering merges. (Yasuhiro Matsuda via Mike McCandless) * LUCENE-1550: Added new n-gram based String distance measure for spell checking. See the Javadocs for NGramDistance.java for a reference paper on why this is helpful (Tom Morton via Grant Ingersoll) * LUCENE-1470, LUCENE-1582, LUCENE-1602, LUCENE-1673, LUCENE-1701, LUCENE-1712: Added NumericRangeQuery and NumericRangeFilter, a fast alternative to RangeQuery/RangeFilter for numeric searches. They depend on a specific structure of terms in the index that can be created by indexing using the new NumericField or NumericTokenStream classes. NumericField can only be used for indexing and optionally stores the values as string representation in the doc store. Documents returned from IndexReader/IndexSearcher will return only the String value using the standard Fieldable interface. NumericFields can be sorted on and loaded into the FieldCache. (Uwe Schindler, Yonik Seeley, Mike McCandless) * LUCENE-1405: Added support for Ant resource collections in contrib/ant task. (Przemyslaw Sztoch via Erik Hatcher) * LUCENE-1699: Allow setting a TokenStream on Field/Fieldable for indexing in conjunction with any other ways to specify stored field values, currently binary or string values. (yonik) * LUCENE-1701: Made the standard FieldCache.Parsers public and added parsers for fields generated using NumericField/NumericTokenStream. All standard parsers now also implement Serializable and enforce their singleton status. (Uwe Schindler, Mike McCandless) * LUCENE-1741: User configurable maximum chunk size in MMapDirectory. On 32 bit platforms, the address space can be very fragmented, so one big ByteBuffer for the whole file may not fit into address space. (Eks Dev via Uwe Schindler) * LUCENE-1644: Enable 4 rewrite modes for queries deriving from MultiTermQuery (WildcardQuery, PrefixQuery, TermRangeQuery, NumericRangeQuery): CONSTANT_SCORE_FILTER_REWRITE first creates a filter and then assigns constant score (boost) to docs; CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE create a BooleanQuery but uses a constant score (boost); SCORING_BOOLEAN_QUERY_REWRITE also creates a BooleanQuery but keeps the BooleanQuery's scores; CONSTANT_SCORE_AUTO_REWRITE tries to pick the most performant constant-score rewrite method. (Mike McCandless) * LUCENE-1448: Added TokenStream.end(), to perform end-of-stream operations. This is currently used to fix offset problems when multiple fields with the same name are added to a document. (Mike McCandless, Mark Miller, Michael Busch) * LUCENE-1776: Add an option to not collect payloads for an ordered SpanNearQuery. Payloads were not lazily loaded in this case as the javadocs implied. If you have payloads and want to use an ordered SpanNearQuery that does not need to use the payloads, you can disable loading them with a new constructor switch. 
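  A minimal sketch of the new constructor switch (field and terms are
  illustrative only):

    SpanQuery[] clauses = new SpanQuery[] {
        new SpanTermQuery(new Term("body", "quick")),
        new SpanTermQuery(new Term("body", "fox")) };
    // slop=3, in-order matching, collectPayloads=false skips payload loading
    SpanNearQuery query = new SpanNearQuery(clauses, 3, true, false);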
(Mark Miller) * LUCENE-1341: Added PayloadNearQuery to enable SpanNearQuery functionality with payloads (Peter Keegan, Grant Ingersoll, Mark Miller) * LUCENE-1790: Added PayloadTermQuery to enable scoring of payloads based on the maximum payload seen for a document. Slight refactoring of Similarity and other payload queries (Grant Ingersoll, Mark Miller) * LUCENE-1749: Addition of FieldCacheSanityChecker utility, and hooks to use it in all existing Lucene Tests. This class can be used by any application to inspect the FieldCache and provide diagnostic information about the possibility of inconsistent FieldCache usage. Namely: FieldCache entries for the same field with different datatypes or parsers; and FieldCache entries for the same field in both a reader, and one of it's (descendant) sub readers. (Chris Hostetter, Mark Miller) * LUCENE-1789: Added utility class oal.search.function.MultiValueSource to ease the transition to segment based searching for any apps that directly call oal.search.function.* APIs. This class wraps any other ValueSource, but takes care when composite (multi-segment) are passed to not double RAM usage in the FieldCache. (Chris Hostetter, Mark Miller, Mike McCandless) Optimizations * LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing scores of the query, since they are just discarded. Also, made it more efficient (single pass) by not creating & populating an intermediate OpenBitSet (Paul Elschot, Mike McCandless) * LUCENE-1443: Performance improvement for OpenBitSetDISI.inPlaceAnd() (Paul Elschot via yonik) * LUCENE-1484: Remove synchronization of IndexReader.document() by using CloseableThreadLocal internally. (Jason Rutherglen via Mike McCandless). * LUCENE-1124: Short circuit FuzzyQuery.rewrite when input token length is small compared to minSimilarity. (Timo Nentwig, Mark Miller) * LUCENE-1316: MatchAllDocsQuery now avoids the synchronized IndexReader.isDeleted() call per document, by directly accessing the underlying deleteDocs BitVector. This improves performance with non-readOnly readers, especially in a multi-threaded environment. (Todd Feak, Yonik Seeley, Jason Rutherglen via Mike McCandless) * LUCENE-1483: When searching over multiple segments we now visit each sub-reader one at a time. This speeds up warming, since FieldCache entries (if required) can be shared across reopens for those segments that did not change, and also speeds up searches that sort by relevance or by field values. (Mark Miller, Mike McCandless) * LUCENE-1575: The new Collector class decouples collect() from score computation. Collector.setScorer is called to establish the current Scorer in-use per segment. Collectors that require the score should then call Scorer.score() per hit inside collect(). (Shai Erera via Mike McCandless) * LUCENE-1596: MultiTermDocs speedup when set with MultiTermDocs.seek(MultiTermEnum) (yonik) * LUCENE-1653: Avoid creating a Calendar in every call to DateTools#dateToString, DateTools#timeToString and DateTools#round. (Shai Erera via Mark Miller) * LUCENE-1688: Deprecate static final String stop word array and replace it with an immutable implementation of CharArraySet. Removes conversions between Set and array. (Simon Willnauer via Mark Miller) * LUCENE-1754: BooleanQuery.queryWeight.scorer() will return null if it won't match any documents (e.g. if there are no required and optional scorers, or not enough optional scorers to satisfy minShouldMatch). 
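  Relating to the LUCENE-1575 entry above, a minimal Collector sketch that
  reads the score inside collect() (class name and threshold are made up):

    public class CountAboveThreshold extends Collector {
      private Scorer scorer;
      public int count;

      public void setScorer(Scorer scorer) {
        this.scorer = scorer;              // current Scorer, set per segment
      }

      public void setNextReader(IndexReader reader, int docBase) {
      }

      public void collect(int doc) throws IOException {
        if (scorer.score() > 1.0f) {       // score is only valid inside collect()
          count++;
        }
      }

      public boolean acceptsDocsOutOfOrder() {
        return true;
      }
    }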
(Shai Erera via Mike McCandless) * LUCENE-1607: To speed up string interning for commonly used strings, the StringHelper.intern() interface was added with a default implementation that uses a lockless cache. (Earwin Burrfoot, yonik) * LUCENE-1800: QueryParser should use reusable TokenStreams. (yonik) Documentation * LUCENE-1908: Scoring documentation imrovements in Similarity javadocs. (Mark Miller, Shai Erera, Ted Dunning, Jiri Kuhn, Marvin Humphrey, Doron Cohen) * LUCENE-1872: NumericField javadoc improvements (Michael McCandless, Uwe Schindler) * LUCENE-1875: Make TokenStream.end javadoc less confusing. (Uwe Schindler) * LUCENE-1862: Rectified duplicate package level javadocs for o.a.l.queryParser and o.a.l.analysis.cn. (Chris Hostetter) * LUCENE-1886: Improved hyperlinking in key Analysis javadocs (Bernd Fondermann via Chris Hostetter) * LUCENE-1884: massive javadoc and comment cleanup, primarily dealing with typos. (Robert Muir via Chris Hostetter) * LUCENE-1898: Switch changes to use bullets rather than numbers and update changes-to-html script to handle the new format. (Steven Rowe, Mark Miller) * LUCENE-1900: Improve Searchable Javadoc. (Nadav Har'El, Doron Cohen, Marvin Humphrey, Mark Miller) * LUCENE-1896: Improve Similarity#queryNorm javadocs. (Jiri Kuhn, Mark Miller) Build * LUCENE-1440: Add new targets to build.xml that allow downloading and executing the junit testcases from an older release for backwards-compatibility testing. (Michael Busch) * LUCENE-1446: Add compatibility tag to common-build.xml and run backwards-compatibility tests in the nightly build. (Michael Busch) * LUCENE-1529: Properly test "drop-in" replacement of jar with backwards-compatibility tests. (Mike McCandless, Michael Busch) * LUCENE-1851: Change 'javacc' and 'clean-javacc' targets to build and clean contrib/surround files. (Luis Alves via Michael Busch) * LUCENE-1854: tar task should use longfile="gnu" to avoid false file name length warnings. (Mark Miller) Test Cases * LUCENE-1791: Enhancements to the QueryUtils and CheckHits utility classes to wrap IndexReaders and Searchers in MultiReaders or MultiSearcher when possible to help exercise more edge cases. (Chris Hostetter, Mark Miller) * LUCENE-1852: Fix localization test failures. (Robert Muir via Michael Busch) * LUCENE-1843: Refactored all tests that use assertAnalyzesTo() & others in core and contrib to use a new BaseTokenStreamTestCase base class. Also rewrote some tests to use this general analysis assert functions instead of own ones (e.g. TestMappingCharFilter). The new base class also tests tokenization with the TokenStream.next() backwards layer enabled (using Token/TokenWrapper as attribute implementation) and disabled (default for Lucene 3.0) (Uwe Schindler, Robert Muir) * LUCENE-1836: Added a new LocalizedTestCase as base class for localization junit tests. (Robert Muir, Uwe Schindler via Michael Busch) ======================= Release 2.4.1 2009-03-09 ======================= API Changes 1. LUCENE-1186: Add Analyzer.close() to free internal ThreadLocal resources. (Christian Kohlschütter via Mike McCandless) Bug fixes 1. LUCENE-1452: Fixed silent data-loss case whereby binary fields are truncated to 0 bytes during merging if the segments being merged are non-congruent (same field name maps to different field numbers). This bug was introduced with LUCENE-1219. (Andrzej Bialecki via Mike McCandless). 2. LUCENE-1429: Don't throw incorrect IllegalStateException from IndexWriter.close() if you've hit an OOM when autoCommit is true. 
(Mike McCandless) 3. LUCENE-1474: If IndexReader.flush() is called twice when there were pending deletions, it could lead to later false AssertionError during IndexReader.open. (Mike McCandless) 4. LUCENE-1430: Fix false AlreadyClosedException from IndexReader.open (masking an actual IOException) that takes String or File path. (Mike McCandless) 5. LUCENE-1442: Multiple-valued NOT_ANALYZED fields can double-count token offsets. (Mike McCandless) 6. LUCENE-1453: Ensure IndexReader.reopen()/clone() does not result in incorrectly closing the shared FSDirectory. This bug would only happen if you use IndexReader.open() with a File or String argument. The returned readers are wrapped by a FilterIndexReader that correctly handles closing of directory after reopen()/clone(). (Mark Miller, Uwe Schindler, Mike McCandless) 7. LUCENE-1457: Fix possible overflow bugs during binary searches. (Mark Miller via Mike McCandless) 8. LUCENE-1459: Fix CachingWrapperFilter to not throw exception if both bits() and getDocIdSet() methods are called. (Matt Jones via Mike McCandless) 9. LUCENE-1519: Fix int overflow bug during segment merging. (Deepak via Mike McCandless) 10. LUCENE-1521: Fix int overflow bug when flushing segment. (Shon Vella via Mike McCandless). 11. LUCENE-1544: Fix deadlock in IndexWriter.addIndexes(IndexReader[]). (Mike McCandless via Doug Sale) 12. LUCENE-1547: Fix rare thread safety issue if two threads call IndexWriter commit() at the same time. (Mike McCandless) 13. LUCENE-1465: NearSpansOrdered returns payloads from first possible match rather than the correct, shortest match; Payloads could be returned even if the max slop was exceeded; The wrong payload could be returned in certain situations. (Jonathan Mamou, Greg Shackles, Mark Miller) 14. LUCENE-1186: Add Analyzer.close() to free internal ThreadLocal resources. (Christian Kohlschütter via Mike McCandless) 15. LUCENE-1552: Fix IndexWriter.addIndexes(IndexReader[]) to properly rollback IndexWriter's internal state on hitting an exception. (Scott Garland via Mike McCandless) ======================= Release 2.4.0 2008-10-06 ======================= Changes in backwards compatibility policy 1. LUCENE-1340: In a minor change to Lucene's backward compatibility policy, we are now allowing the Fieldable interface to have changes, within reason, and made on a case-by-case basis. If an application implements it's own Fieldable, please be aware of this. Otherwise, no need to be concerned. This is in effect for all 2.X releases, starting with 2.4. Also note, that in all likelihood, Fieldable will be changed in 3.0. Changes in runtime behavior 1. LUCENE-1151: Fix StandardAnalyzer to not mis-identify host names (eg lucene.apache.org) as an ACRONYM. To get back to the pre-2.4 backwards compatible, but buggy, behavior, you can either call StandardAnalyzer.setDefaultReplaceInvalidAcronym(false) (static method), or, set system property org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym to "false" on JVM startup. All StandardAnalyzer instances created after that will then show the pre-2.4 behavior. Alternatively, you can call setReplaceInvalidAcronym(false) to change the behavior per instance of StandardAnalyzer. This backwards compatibility will be removed in 3.0 (hardwiring the value to true). (Mike McCandless) 2. LUCENE-1044: IndexWriter with autoCommit=true now commits (such that a reader can see the changes) far less often than it used to. Previously, every flush was also a commit. 
2. LUCENE-1044: IndexWriter with autoCommit=true now commits (such that a reader can see the changes) far less often than it used to. Previously, every flush was also a commit. You can always force a commit by calling IndexWriter.commit(). Furthermore, in 3.0, autoCommit will be hardwired to false (IndexWriter constructors that take an autoCommit argument have been deprecated). (Mike McCandless) 3. LUCENE-1335: IndexWriter.addIndexes(Directory[]) and addIndexesNoOptimize no longer allow the same Directory instance to be passed in more than once. Internally, IndexWriter uses Directory and segment name to uniquely identify segments, so adding the same Directory more than once was causing duplicates which led to problems. (Mike McCandless) 4. LUCENE-1396: Improve PhraseQuery.toString() so that gaps in the positions are indicated with a ? and multiple terms at the same position are joined with a |. (Andrzej Bialecki via Mike McCandless) API Changes 1. LUCENE-1084: Changed all IndexWriter constructors to take an explicit parameter for maximum field size. Deprecated all the pre-existing constructors; these will be removed in release 3.0. NOTE: these new constructors set autoCommit to false. (Steven Rowe via Mike McCandless) 2. LUCENE-584: Changed Filter API to return a DocIdSet instead of a java.util.BitSet. This allows using more efficient data structures for Filters and makes them more flexible. This deprecates Filter.bits(), so all filters that implement this outside the Lucene code base will need to be adapted. See also the javadocs of the Filter class. (Paul Elschot, Michael Busch) 3. LUCENE-1044: Added IndexWriter.commit() which flushes any buffered adds/deletes and then commits a new segments file so readers will see the changes. Deprecate IndexWriter.flush() in favor of IndexWriter.commit(). (Mike McCandless) 4. LUCENE-325: Added IndexWriter.expungeDeletes methods, which consult the MergePolicy to find merges necessary to merge away all deletes from the index. This should be a somewhat lower cost operation than optimize. (John Wang via Mike McCandless) 5. LUCENE-1233: Return empty array instead of null when no fields match the specified name in these methods in Document: getFieldables, getFields, getValues, getBinaryValues. (Stefan Trcek via Mike McCandless) 6. LUCENE-1234: Make BoostingSpanScorer protected. (Andi Vajda via Grant Ingersoll) 7. LUCENE-510: The index now stores strings as true UTF-8 bytes (previously it was Java's modified UTF-8). If any text, either stored fields or a token, has illegal UTF-16 surrogate characters, these characters are now silently replaced with the Unicode replacement character U+FFFD. This is a change to the index file format. (Marvin Humphrey via Mike McCandless) 8. LUCENE-852: Let the SpellChecker caller specify IndexWriter mergeFactor and RAM buffer size. (Otis Gospodnetic) 9. LUCENE-1290: Deprecate org.apache.lucene.search.Hits, Hit and HitIterator and remove all references to these classes from the core. Also update demos and tutorials. (Michael Busch) 10. LUCENE-1288: Add getVersion() and getGeneration() to IndexCommit. getVersion() returns the same value that IndexReader.getVersion() returns when the reader is opened on the same commit. (Jason Rutherglen via Mike McCandless) 11. LUCENE-1311: Added IndexReader.listCommits(Directory) static method to list all commits in a Directory, plus IndexReader.open methods that accept an IndexCommit and open the index as of that commit. These methods are only useful if you implement a custom DeletionPolicy that keeps more than the last commit around. (Jason Rutherglen via Mike McCandless) 12. LUCENE-1325: Added IndexCommit.isOptimized(). (Shalin Shekhar Mangar via Mike McCandless)
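A minimal sketch of the IndexCommit additions in items 10-12 above, assuming a custom DeletionPolicy (item 11) keeps more than the most recent commit; the index path is hypothetical:

  import java.util.Collection;
  import java.util.Iterator;
  import org.apache.lucene.index.IndexCommit;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;

  public class ListCommitsSketch {
    public static void main(String[] args) throws Exception {
      Directory dir = FSDirectory.getDirectory("/path/to/index"); // hypothetical path
      // Item 11: enumerate every commit still present in the Directory
      Collection commits = IndexReader.listCommits(dir);
      IndexCommit last = null;
      for (Iterator it = commits.iterator(); it.hasNext();) {
        IndexCommit commit = (IndexCommit) it.next();
        // Items 10 and 12: per-commit version and optimized state
        System.out.println(commit.getSegmentsFileName()
            + " version=" + commit.getVersion()
            + " optimized=" + commit.isOptimized());
        last = commit;
      }
      if (last != null) {
        // Open a reader as of a specific (here: the last listed) commit
        IndexReader reader = IndexReader.open(last);
        System.out.println("docs at that commit: " + reader.numDocs());
        reader.close();
      }
    }
  }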
13. LUCENE-1324: Added TokenFilter.reset(). (Shai Erera via Mike McCandless) 14. LUCENE-1340: Added Fieldable.omitTf() method to skip indexing term frequency, positions and payloads. This saves index space, and indexing/searching time. (Eks Dev via Mike McCandless) 15. LUCENE-1219: Add basic reuse API to Fieldable for binary fields: getBinaryValue/Offset/Length(); currently only lazy fields reuse the provided byte[] result to getBinaryValue. (Eks Dev via Mike McCandless) 16. LUCENE-1334: Add new constructor for Term: Term(String fieldName) which defaults term text to "". (DM Smith via Mike McCandless) 17. LUCENE-1333: Added Token.reinit(*) APIs to re-initialize (reuse) a Token. Also added term() method to return a String, with a performance penalty clearly documented. Also implemented hashCode() and equals() in Token, and fixed all core and contrib analyzers to use the re-use APIs. (DM Smith via Mike McCandless) 18. LUCENE-1329: Add optional readOnly boolean when opening an IndexReader. A readOnly reader is not allowed to make changes (deletions, norms) to the index; in exchange, the isDeleted method, often a bottleneck when searching with many threads, is not synchronized. The default for readOnly is still false, but in 3.0 the default will become true. (Jason Rutherglen via Mike McCandless) 19. LUCENE-1367: Add IndexCommit.isDeleted(). (Shalin Shekhar Mangar via Mike McCandless) 20. LUCENE-1061: Factored out all "new XXXQuery(...)" in QueryParser.java into protected methods newXXXQuery(...) so that subclasses can create their own subclasses of each Query type. (John Wang via Mike McCandless) 21. LUCENE-753: Added new Directory implementation org.apache.lucene.store.NIOFSDirectory, which uses java.nio's FileChannel to do file reads. On most non-Windows platforms, with many threads sharing a single searcher, this may yield sizable improvement to query throughput when compared to FSDirectory, which only allows a single thread to read from an open file at a time. (Jason Rutherglen via Mike McCandless) 22. LUCENE-1371: Added convenience method TopDocs Searcher.search(Query query, int n); see the sketch below. (Mike McCandless) 23. LUCENE-1356: Allow easy extensions of TopDocCollector by turning constructor and fields from package to protected. (Shai Erera via Doron Cohen) 24. LUCENE-1375: Added convenience method IndexCommit.getTimestamp, which is equivalent to getDirectory().fileModified(getSegmentsFileName()). (Mike McCandless) 25. LUCENE-1366: Rename Field.Index options to be more accurate: TOKENIZED becomes ANALYZED; UN_TOKENIZED becomes NOT_ANALYZED; NO_NORMS becomes NOT_ANALYZED_NO_NORMS and a new ANALYZED_NO_NORMS is added. (Mike McCandless) 26. LUCENE-1131: Added numDeletedDocs method to IndexReader (Otis Gospodnetic) Bug fixes 1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single clause query if minNumShouldMatch<=0. (Shai Erera via Michael Busch) 2. LUCENE-1169: Fixed bug in IndexSearcher.search(): searching with a filter might miss some hits because scorer.skipTo() is called without checking if the scorer is already at the right position. scorer.skipTo(scorer.doc()) is not a NOOP, it behaves as scorer.next(). (Eks Dev, Michael Busch) 3. LUCENE-1182: Added scorePayload to SimilarityDelegator (Andi Vajda via Grant Ingersoll) 4. LUCENE-1213: MultiFieldQueryParser was ignoring slop in case of a single field phrase. (Trejkaz via Doron Cohen) 5. LUCENE-1228: IndexWriter.commit() was not updating the index version and as a result IndexReader.reopen() failed to sense index changes. (Doron Cohen)
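A minimal sketch of the readOnly open (item 18) and the TopDocs convenience search (item 22) described above; the index path, field name and query term are hypothetical:

  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.ScoreDoc;
  import org.apache.lucene.search.TermQuery;
  import org.apache.lucene.search.TopDocs;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;

  public class ReadOnlySearchSketch {
    public static void main(String[] args) throws Exception {
      Directory dir = FSDirectory.getDirectory("/path/to/index"); // hypothetical path
      // Item 18: readOnly=true, so isDeleted() is no longer a synchronization bottleneck
      IndexReader reader = IndexReader.open(dir, true);
      IndexSearcher searcher = new IndexSearcher(reader);
      // Item 22: returns TopDocs directly, replacing the deprecated Hits API
      TopDocs top = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);
      for (int i = 0; i < top.scoreDocs.length; i++) {
        ScoreDoc sd = top.scoreDocs[i];
        System.out.println("doc=" + sd.doc + " score=" + sd.score);
      }
      searcher.close();
      reader.close();
    }
  }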
6. LUCENE-1267: Added numDocs() and maxDoc() to IndexWriter; deprecated docCount(). (Mike McCandless) 7. LUCENE-1274: Added new prepareCommit() method to IndexWriter, which does phase 1 of a 2-phase commit (commit() does phase 2). This is needed when you want to update an index as part of a transaction involving external resources (eg a database). Also deprecated abort(), renaming it to rollback(). (Mike McCandless) 8. LUCENE-1003: Stop RussianAnalyzer from removing numbers. (TUSUR OpenTeam, Dmitry Lihachev via Otis Gospodnetic) 9. LUCENE-1152: SpellChecker fix around clearIndex and indexDictionary methods, plus removal of IndexReader reference. (Naveen Belkale via Otis Gospodnetic) 10. LUCENE-1046: Removed dead code in SpellChecker (Daniel Naber via Otis Gospodnetic) 11. LUCENE-1189: Fixed the QueryParser to handle escaped characters within quoted terms correctly. (Tomer Gabel via Michael Busch) 12. LUCENE-1299: Fixed NPE in SpellChecker when IndexReader is not null and field is null (Grant Ingersoll) 13. LUCENE-1303: Fixed BoostingTermQuery's explanation to be marked as a Match depending only upon the non-payload score part, regardless of the effect of the payload on the score. Prior to this, score of a query containing a BTQ differed from its explanation. (Doron Cohen) 14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more than twice in the query. (Doron Cohen) 15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll) 16. LUCENE-1383: Workaround a nasty "leak" in Java's builtin ThreadLocal, to prevent Lucene from causing unexpected OutOfMemoryError in certain situations (notably J2EE applications). (Chris Lu via Mike McCandless) New features 1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis process. The flag is not indexed/stored and is thus only used by analysis. 2. LUCENE-1147: Add -segment option to CheckIndex tool so you can check only a specific segment or segments in your index. (Mike McCandless) 3. LUCENE-1045: Reopened this issue to add support for short and bytes. 4. LUCENE-584: Added new data structures to o.a.l.util, such as OpenBitSet and SortedVIntList. These extend DocIdSet and can directly be used for Filters with the new Filter API. Also changed the core Filters to use OpenBitSet instead of java.util.BitSet. (Paul Elschot, Michael Busch) 5. LUCENE-494: Added QueryAutoStopWordAnalyzer to allow for the automatic removal, from a query, of frequently occurring terms. This Analyzer is not intended for use during indexing. (Mark Harwood via Grant Ingersoll) 6. LUCENE-1044: Change Lucene to properly "sync" files after committing, to ensure that, on a machine or OS crash or power cut, even with cached writes, the index remains consistent. Also added explicit commit() method to IndexWriter to force a commit without having to close. (Mike McCandless) 7. LUCENE-997: Add search timeout (partial) support. A TimeLimitedCollector was added to allow limiting search time. It is a partial solution since timeout is checked only when collecting a hit, and therefore a search for rare words in a huge index might not stop within the specified time. (Sean Timm via Doron Cohen) 8. LUCENE-1184: Allow SnapshotDeletionPolicy to be re-used across close/re-open of IndexWriter while still protecting an open snapshot (Tim Brennan via Mike McCandless) 9. LUCENE-1194: Added IndexWriter.deleteDocuments(Query) to delete documents matching the specified query. Also added static unlock and isLocked methods (deprecating the ones in IndexReader). (Mike McCandless)
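A minimal sketch of deleting by query with the new method above; the directory path, analyzer choice and field/term are hypothetical:

  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.TermQuery;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.store.FSDirectory;

  public class DeleteByQuerySketch {
    public static void main(String[] args) throws Exception {
      Directory dir = FSDirectory.getDirectory("/path/to/index"); // hypothetical path
      IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(),
          IndexWriter.MaxFieldLength.UNLIMITED);
      // LUCENE-1194: delete every document matching the query
      writer.deleteDocuments(new TermQuery(new Term("category", "obsolete")));
      // LUCENE-1044: make the change visible to newly opened readers
      writer.commit();
      writer.close();
    }
  }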
10. LUCENE-1201: Add IndexReader.getIndexCommit() method. (Tim Brennan via Mike McCandless) 11. LUCENE-550: Added InstantiatedIndex implementation. Experimental Index store similar to MemoryIndex but allows for multiple documents in memory. (Karl Wettin via Grant Ingersoll) 12. LUCENE-400: Added word based n-gram filter (in contrib/analyzers) called ShingleFilter and an Analyzer wrapper that wraps another Analyzer's token stream with a ShingleFilter (Sebastian Kirsch, Steve Rowe via Grant Ingersoll) 13. LUCENE-1166: Decomposition tokenfilter for languages like German and Swedish (Thomas Peuss via Grant Ingersoll) 14. LUCENE-1187: ChainedFilter and BooleanFilter now work with new Filter API and DocIdSetIterator-based filters. Backwards-compatibility with old BitSet-based filters is ensured. (Paul Elschot via Michael Busch) 15. LUCENE-1295: Added new method to MoreLikeThis for retrieving interesting terms and made retrieveTerms(int) public. (Grant Ingersoll) 16. LUCENE-1298: MoreLikeThis can now accept a custom Similarity (Grant Ingersoll) 17. LUCENE-1297: Allow other string distance measures for the SpellChecker (Thomas Morton via Otis Gospodnetic) 18. LUCENE-1001: Provide access to Payloads via Spans. All existing Span Query implementations in Lucene implement it. (Mark Miller, Grant Ingersoll) 19. LUCENE-1354: Provide programmatic access to CheckIndex (Grant Ingersoll, Mike McCandless) 20. LUCENE-1279: Add support for Collators to RangeFilter/Query and Query Parser. (Steve Rowe via Grant Ingersoll) Optimizations 1. LUCENE-705: When building a compound file, use RandomAccessFile.setLength() to tell the OS/filesystem to pre-allocate space for the file. This may reduce fragmentation in how the CFS file is stored, and allows us to detect an upcoming disk full situation before actually filling up the disk. (Mike McCandless) 2. LUCENE-1120: Speed up merging of term vectors by bulk-copying the raw bytes for each contiguous range of non-deleted documents. (Mike McCandless) 3. LUCENE-1185: Avoid checking if the TermBuffer 'scratch' in SegmentTermEnum is null for every call of scanTo(). (Christian Kohlschuetter via Michael Busch) 4. LUCENE-1217: Internal to Field.java, use isBinary instead of runtime type checking for possible speedup of binaryValue(). (Eks Dev via Mike McCandless) 5. LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses less memory than the previous version. (Cédrik LIME via Otis Gospodnetic) 6. LUCENE-1195: Improve term lookup performance by adding a LRU cache to the TermInfosReader. In performance experiments the speedup was about 25% on average on mid-size indexes with ~500,000 documents for queries with 3 terms and about 7% on larger indexes with ~4.3M documents. (Michael Busch) Documentation 1. LUCENE-1236: Added some clarifying remarks to EdgeNGram*.java (Hiroaki Kawai via Grant Ingersoll) 2. LUCENE-1157 and LUCENE-1256: HTML changes log, created automatically from CHANGES.txt. This HTML file is currently visible only via the developers page. (Steven Rowe via Doron Cohen) 3. LUCENE-1349: Fieldable can now be changed without breaking backward compatibility rules (within reason; see the note at the top of this file and also on Fieldable.java). (Grant Ingersoll) 4. LUCENE-1873: Update documentation to reflect current Contrib area status. (Steven Rowe, Mark Miller) Build 1. LUCENE-1153: Added JUnit JAR to new lib directory.
Updated build to rely on local JUnit instead of ANT/lib. 2. LUCENE-1202: Small fixes to the way Clover is used to work better with contribs. Of particular note: a single clover db is used regardless of whether tests are run globally or in the specific contrib directories. 3. LUCENE-1353: Javacc target in contrib/miscellaneous for generating the precedence query parser. Test Cases 1. LUCENE-1238: Fixed intermittent failures of TestTimeLimitedCollector.testTimeoutMultiThreaded. Within this fix, a "greedy" flag was added to TimeLimitedCollector to allow the wrapped collector to also collect the last doc, after the allowed time has passed. (Doron Cohen) 2. LUCENE-1348: Relax TestTimeLimitedCollector to not fail when the timeout is exceeded (just because the test machine is very busy). ======================= Release 2.3.2 2008-05-05 ======================= Bug fixes 1. LUCENE-1191: On hitting OutOfMemoryError in any index-modifying methods in IndexWriter, do not commit any further changes to the index to prevent risk of possible corruption. (Mike McCandless) 2. LUCENE-1197: Fixed issue whereby IndexWriter would flush by RAM too early when TermVectors were in use. (Mike McCandless) 3. LUCENE-1198: Don't corrupt index if an exception happens inside DocumentsWriter.init (Mike McCandless) 4. LUCENE-1199: Added defensive check for null indexReader before calling close in IndexModifier.close() (Mike McCandless) 5. LUCENE-1200: Fix rare deadlock case in addIndexes* when ConcurrentMergeScheduler is in use (Mike McCandless) 6. LUCENE-1208: Fix deadlock case on hitting an exception while processing a document that had triggered a flush (Mike McCandless) 7. LUCENE-1210: Fix deadlock case on hitting an exception while starting a merge when using ConcurrentMergeScheduler (Mike McCandless) 8. LUCENE-1222: Fix IndexWriter.doAfterFlush to always be called on flush (Mark Ferguson via Mike McCandless) 9. LUCENE-1226: Fixed IndexWriter.addIndexes(IndexReader[]) to commit successfully created compound files. (Michael Busch) 10. LUCENE-1150: Re-expose StandardTokenizer's constants publicly; this was accidentally lost with LUCENE-966. (Nicolas Lalevée via Mike McCandless) 11. LUCENE-1262: Fixed bug in BufferedIndexInput.refill whereby on hitting an exception in readInternal, the buffer is incorrectly filled with stale bytes such that subsequent calls to readByte() return incorrect results. (Trejkaz via Mike McCandless) 12. LUCENE-1270: Fixed intermittent case where IndexWriter.close() would hang after IndexWriter.addIndexesNoOptimize had been called. (Stu Hood via Mike McCandless) Build 1. LUCENE-1230: Include *pom.xml* in source release files. (Michael Busch) ======================= Release 2.3.1 2008-02-22 ======================= Bug fixes 1. LUCENE-1168: Fixed corruption cases when autoCommit=false and documents have mixed term vectors (Suresh Guvvala via Mike McCandless). 2. LUCENE-1171: Fixed some cases where OOM errors could cause deadlock in IndexWriter (Mike McCandless). 3. LUCENE-1173: Fixed corruption case when autoCommit=false and bulk merging of stored fields is used (Yonik via Mike McCandless). 4. LUCENE-1163: Fixed bug in CharArraySet.contains(char[] buffer, int offset, int len) that was ignoring offset and thus giving the wrong answer. (Thomas Peuss via Mike McCandless) 5. LUCENE-1177: Fix rare case where IndexWriter.optimize might do too many merges at the end. (Mike McCandless) 6. LUCENE-1176: Fix corruption case when documents with no term vector fields are added before documents with term vector fields.
(Mike McCandless) 7. LUCENE-1179: Fixed assert statement that was incorrectly preventing Fields with empty-string field name from working. (Sergey Kabashnyuk via Mike McCandless) ======================= Release 2.3.0 2008-01-21 ======================= Changes in runtime behavior 1. LUCENE-994: Defaults for IndexWriter have been changed to maximize out-of-the-box indexing speed. First, IndexWriter now flushes by RAM usage (16 MB by default) instead of a fixed doc count (call IndexWriter.setMaxBufferedDocs to get backwards compatible behavior). Second, ConcurrentMergeScheduler is used to run merges using background threads (call IndexWriter.setMergeScheduler(new SerialMergeScheduler()) to get backwards compatible behavior). Third, merges are chosen based on size in bytes of each segment rather than document count of each segment (call IndexWriter.setMergePolicy(new LogDocMergePolicy()) to get backwards compatible behavior). NOTE: users of ParallelReader must change back all of these defaults in order to ensure the docIDs "align" across all parallel indices. (Mike McCandless) 2. LUCENE-1045: SortField.AUTO didn't work with long. When detecting the field type for sorting automatically, numbers used to be interpreted as int, then as float, if parsing the number as an int failed. Now the detection checks for int, then for long, then for float. (Daniel Naber) API Changes 1. LUCENE-843: Added IndexWriter.setRAMBufferSizeMB(...) to have IndexWriter flush whenever the buffered documents are using more than the specified amount of RAM. Also added new APIs to Token that allow one to set a char[] plus offset and length to specify a token (to avoid creating a new String() for each Token). (Mike McCandless) 2. LUCENE-963: Add setters to Field to allow for re-using a single Field instance during indexing. This is a sizable performance gain, especially for small documents. (Mike McCandless) 3. LUCENE-969: Add new APIs to Token, TokenStream and Analyzer to permit re-using of Token and TokenStream instances during indexing. Changed Token to use a char[] as the store for the termText instead of String. This gives faster tokenization performance (~10-15%). (Mike McCandless) 4. LUCENE-847: Factored MergePolicy, which determines which merges should take place and when, as well as MergeScheduler, which determines when the selected merges should actually run, out of IndexWriter. The default merge policy is now LogByteSizeMergePolicy (see LUCENE-845) and the default merge scheduler is now ConcurrentMergeScheduler (see LUCENE-870). (Steven Parkes via Mike McCandless) 5. LUCENE-1052: Add IndexReader.setTermInfosIndexDivisor(int) method that allows you to reduce memory usage of the termInfos by further sub-sampling (over the termIndexInterval that was used during indexing) which terms are loaded into memory. (Chuck Williams, Doug Cutting via Mike McCandless) 6. LUCENE-743: Add IndexReader.reopen() method that re-opens an existing IndexReader (see New features -> 8.) (Michael Busch) 7. LUCENE-1062: Add setData(byte[] data), setData(byte[] data, int offset, int length), getData(), getOffset() and clone() methods to o.a.l.index.Payload. Also add the field name as arg to Similarity.scorePayload(). (Michael Busch) 8. LUCENE-982: Add IndexWriter.optimize(int maxNumSegments) method to "partially optimize" an index down to maxNumSegments segments. (Mike McCandless) 9. LUCENE-1080: Changed Token.DEFAULT_TYPE to be public. 10. LUCENE-1064: Changed TopDocs constructor to be public. (Shai Erera via Michael Busch) 11. 
LUCENE-1079: DocValues cleanup: constructor now has no params, and getInnerArray() now throws UnsupportedOperationException (Doron Cohen) 12. LUCENE-1089: Added PriorityQueue.insertWithOverflow, which returns the Object (if any) that was bumped from the queue to allow re-use. (Shai Erera via Mike McCandless) 13. LUCENE-1101: Token reuse 'contract' (defined LUCENE-969) modified so it is token producer's responsibility to call Token.clear(). (Doron Cohen) 14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default > 255 characters) tokens. You can increase this limit by calling StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless) Bug fixes 1. LUCENE-933: QueryParser fixed to not produce empty sub BooleanQueries "()" even if the Analyzer produced no tokens for input. (Doron Cohen) 2. LUCENE-955: Fixed SegmentTermPositions to work correctly with the first term in the dictionary. (Michael Busch) 3. LUCENE-951: Fixed NullPointerException in MultiLevelSkipListReader that was thrown after a call of TermPositions.seek(). (Rich Johnson via Michael Busch) 4. LUCENE-938: Fixed cases where an unhandled exception in IndexWriter's methods could cause deletes to be lost. (Steven Parkes via Mike McCandless) 5. LUCENE-962: Fixed case where an unhandled exception in IndexWriter.addDocument or IndexWriter.updateDocument could cause unreferenced files in the index to not be deleted (Steven Parkes via Mike McCandless) 6. LUCENE-957: RAMDirectory fixed to properly handle directories larger than Integer.MAX_VALUE. (Doron Cohen) 7. LUCENE-781: MultiReader fixed to not throw NPE if isCurrent(), isOptimized() or getVersion() is called. Separated MultiReader into two classes: MultiSegmentReader extends IndexReader, is package-protected and is created automatically by IndexReader.open() in case the index has multiple segments. The public MultiReader now extends MultiSegmentReader and is intended to be used by users who want to add their own subreaders. (Daniel Naber, Michael Busch) 8. LUCENE-970: FilterIndexReader now implements isOptimized(). Before a call of isOptimized() would throw a NPE. (Michael Busch) 9. LUCENE-832: ParallelReader fixed to not throw NPE if isCurrent(), isOptimized() or getVersion() is called. (Michael Busch) 10. LUCENE-948: Fix FNFE exception caused by stale NFS client directory listing caches when writers on different machines are sharing an index over NFS and using a custom deletion policy (Mike McCandless) 11. LUCENE-978: Ensure TermInfosReader, FieldsReader, and FieldsReader close any streams they had opened if an exception is hit in the constructor. (Ning Li via Mike McCandless) 12. LUCENE-985: If an extremely long term is in a doc (> 16383 chars), we now throw an IllegalArgumentException saying the term is too long, instead of cryptic ArrayIndexOutOfBoundsException. (Karl Wettin via Mike McCandless) 13. LUCENE-991: The explain() method of BoostingTermQuery had errors when no payloads were present on a document. (Peter Keegan via Grant Ingersoll) 14. LUCENE-992: Fixed IndexWriter.updateDocument to be atomic again (this was broken by LUCENE-843). (Ning Li via Mike McCandless) 15. LUCENE-1008: Fixed corruption case when document with no term vector fields is added after documents with term vector fields. This bug was introduced with LUCENE-843. (Grant Ingersoll via Mike McCandless) 16. LUCENE-1006: Fixed QueryParser to accept a "" field value (zero length quoted string.) (yonik) 17. 
LUCENE-1010: Fixed corruption case when document with no term vector fields is added after documents with term vector fields. This case is hit during merge and would cause an EOFException. This bug was introduced with LUCENE-984. (Andi Vajda via Mike McCandless) 19. LUCENE-1009: Fix merge slowdown with LogByteSizeMergePolicy when autoCommit=false and documents are using stored fields and/or term vectors. (Mark Miller via Mike McCandless) 20. LUCENE-1011: Fixed corruption case when two or more machines, sharing an index over NFS, can be writers in quick succession. (Patrick Kimber via Mike McCandless) 21. LUCENE-1028: Fixed Weight serialization for a few queries: DisjunctionMaxQuery, ValueSourceQuery, CustomScoreQuery. Serialization check added for all queries. (Kyle Maxwell via Doron Cohen) 22. LUCENE-1048: Fixed incorrect behavior in Lock.obtain(...) when the timeout argument is very large (eg Long.MAX_VALUE). Also added Lock.LOCK_OBTAIN_WAIT_FOREVER constant to never timeout. (Nikolay Diakov via Mike McCandless) 23. LUCENE-1050: Throw LockReleaseFailedException in Simple/NativeFSLockFactory if we fail to delete the lock file when releasing the lock. (Nikolay Diakov via Mike McCandless) 24. LUCENE-1071: Fixed SegmentMerger to correctly set payload bit in the merged segment. (Michael Busch) 25. LUCENE-1042: Remove throwing of IOException in getTermFreqVector(int, String, TermVectorMapper) to be consistent with other getTermFreqVector calls. Also removed the throwing of the other IOException in that method to be consistent. (Karl Wettin via Grant Ingersoll) 26. LUCENE-1096: Fixed Hits behavior when hits' docs are deleted while iterating over the hits. Deleting docs already retrieved now works seamlessly. If docs not yet retrieved are deleted (e.g. from another thread), and then, relying on the initial Hits.length(), an application attempts to retrieve more hits than actually exist, a ConcurrentModificationException is thrown. (Doron Cohen) 27. LUCENE-1068: Changed StandardTokenizer to fix an issue with it marking the type of some tokens incorrectly. This is done by adding a new flag named replaceInvalidAcronym which defaults to false, the current, incorrect behavior. Setting this flag to true fixes the problem. This flag is a temporary fix and is already marked as being deprecated. 3.x will implement the correct approach. (Shai Erera via Grant Ingersoll) LUCENE-1140: Fixed NPE caused by LUCENE-1068 (Alexei Dets via Grant Ingersoll) 28. LUCENE-749: ChainedFilter behavior fixed when logic of first filter is ANDNOT. (Antonio Bruno via Doron Cohen) 29. LUCENE-508: Make sure SegmentTermEnum.prev() is accurate (= last term) after next() returns false. (Steven Tamm via Mike McCandless) New features 1. LUCENE-906: Elision filter for French. (Mathieu Lecarme via Otis Gospodnetic) 2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll) 3. LUCENE-868: Added new Term Vector access features. New callback mechanism allows application to define how and where to read Term Vectors from disk. This implementation contains several extensions of the new abstract TermVectorMapper class. The new API should be back-compatible. No changes in the actual storage of Term Vectors have taken place. 3.1 LUCENE-1038: Added setDocumentNumber() method to TermVectorMapper to provide information about what document is being accessed. (Karl Wettin via Grant Ingersoll) 4. 
LUCENE-975: Added PositionBasedTermVectorMapper that allows for position based lookup of term vector information. See item #3 above (LUCENE-868). 5. LUCENE-1011: Added simple tools (all in org.apache.lucene.store) to verify that locking is working properly. LockVerifyServer runs a separate server to verify locks. LockStressTest runs a simple tool that rapidly obtains and releases locks. VerifyingLockFactory is a LockFactory that wraps any other LockFactory and consults the LockVerifyServer whenever a lock is obtained or released, throwing an exception if an illegal lock obtain occurred. (Patrick Kimber via Mike McCandless) 6. LUCENE-1015: Added FieldCache extension (ExtendedFieldCache) to support doubles and longs. Added support into SortField for sorting on doubles and longs as well. (Grant Ingersoll) 7. LUCENE-1020: Created basic index checking & repair tool (o.a.l.index.CheckIndex). When run without -fix it does a detailed test of all segments in the index and reports summary information and any errors it hit. With -fix it will remove segments that had errors. (Mike McCandless) 8. LUCENE-743: Add IndexReader.reopen() method that re-opens an existing IndexReader by only loading those portions of an index that have changed since the reader was (re)opened. reopen() can be significantly faster than open(), depending on the amount of index changes. SegmentReader, MultiSegmentReader, MultiReader, and ParallelReader implement reopen(). (Michael Busch) 9. LUCENE-1040: CharArraySet useful for efficiently checking set membership of text specified by char[]. (yonik) 10. LUCENE-1073: Created SnapshotDeletionPolicy to facilitate taking a live backup of an index without pausing indexing. (Mike McCandless) 11. LUCENE-1019: CustomScoreQuery enhanced to support multiple ValueSource queries. (Kyle Maxwell via Doron Cohen) 12. LUCENE-1095: Added an option to StopFilter to increase positionIncrement of the token succeeding a stopped token. Disabled by default. Similar option added to QueryParser to consider token positions when creating PhraseQuery and MultiPhraseQuery. Disabled by default (so by default the query parser ignores position increments). (Doron Cohen) 13. LUCENE-1380: Added TokenFilter for setting position increment in special cases related to the ShingleFilter (Mck SembWever, Steve Rowe, Karl Wettin via Grant Ingersoll) Optimizations 1. LUCENE-937: CachingTokenFilter now uses an iterator to access the Tokens that are cached in the LinkedList. This increases performance significantly, especially when the number of Tokens is large. (Mark Miller via Michael Busch) 2. LUCENE-843: Substantial optimizations to improve how IndexWriter uses RAM for buffering documents and to speed up indexing (2X-8X faster). A single shared hash table now records the in-memory postings per unique term and is directly flushed into a single segment. (Mike McCandless) 3. LUCENE-892: Fixed extra "buffer to buffer copy" that sometimes takes place when using compound files. (Mike McCandless) 4. LUCENE-959: Remove synchronization in Document (yonik) 5. LUCENE-963: Add setters to Field to allow for re-using a single Field instance during indexing. This is a sizable performance gain, especially for small documents. (Mike McCandless) 6. LUCENE-939: Check explicitly for boundary conditions in FieldInfos and don't rely on exceptions. (Michael Busch) 7. LUCENE-966: Very substantial speedups (~6X faster) for StandardTokenizer (StandardAnalyzer) by using JFlex instead of JavaCC to generate the tokenizer. 
(Stanislaw Osinski via Mike McCandless) 8. LUCENE-969: Changed core tokenizers & filters to re-use Token and TokenStream instances when possible to improve tokenization performance (~10-15%). (Mike McCandless) 9. LUCENE-871: Speedup ISOLatin1AccentFilter (Ian Boston via Mike McCandless) 10. LUCENE-986: Refactored SegmentInfos from IndexReader into the new subclass DirectoryIndexReader. SegmentReader and MultiSegmentReader now extend DirectoryIndexReader and are the only IndexReader implementations that use SegmentInfos to access an index and acquire a write lock for index modifications. (Michael Busch) 11. LUCENE-1007: Allow flushing in IndexWriter to be triggered by either RAM usage or document count or both (whichever comes first), by adding symbolic constant DISABLE_AUTO_FLUSH to disable one of the flush triggers. (Ning Li via Mike McCandless) 12. LUCENE-1043: Speed up merging of stored fields by bulk-copying the raw bytes for each contiguous range of non-deleted documents. (Robert Engels via Mike McCandless) 13. LUCENE-693: Speed up nested conjunctions (~2x) that match many documents, and a slight performance increase for top level conjunctions. (yonik) 14. LUCENE-1098: Make inner class StandardAnalyzer.SavedStreams static and final. (Nathan Beyer via Michael Busch) Documentation 1. LUCENE-1051: Generate separate javadocs for core, demo and contrib classes, as well as an unified view. Also add an appropriate menu structure to the website. (Michael Busch) 2. LUCENE-746: Fix error message in AnalyzingQueryParser.getPrefixQuery. (Ronnie Kolehmainen via Michael Busch) Build 1. LUCENE-908: Improvements and simplifications for how the MANIFEST file and the META-INF dir are created. (Michael Busch) 2. LUCENE-935: Various improvements for the maven artifacts. Now the artifacts also include the sources as .jar files. (Michael Busch) 3. Added apply-patch target to top-level build. Defaults to looking for a patch in ${basedir}/../patches with name specified by -Dpatch.name. Can also specify any location by -Dpatch.file property on the command line. This should be helpful for easy application of patches, but it is also a step towards integrating automatic patch application with JIRA and Hudson, and is thus subject to change. (Grant Ingersoll) 4. LUCENE-935: Defined property "m2.repository.url" to allow setting the url to a maven remote repository to deploy to. (Michael Busch) 5. LUCENE-1051: Include javadocs in the maven artifacts. (Michael Busch) 6. LUCENE-1055: Remove gdata-server from build files and its sources from trunk. (Michael Busch) 7. LUCENE-935: Allow to deploy maven artifacts to a remote m2 repository via scp and ssh authentication. (Michael Busch) 8. LUCENE-1123: Allow overriding the specification version for MANIFEST.MF (Michael Busch) Test Cases 1. LUCENE-766: Test adding two fields with the same name but different term vector setting. (Nicolas Lalevée via Doron Cohen) ======================= Release 2.2.0 2007-06-19 ======================= Changes in runtime behavior API Changes 1. LUCENE-793: created new exceptions and added them to throws clause for many methods (all subclasses of IOException for backwards compatibility): index.StaleReaderException, index.CorruptIndexException, store.LockObtainFailedException. This was done to better call out the possible root causes of an IOException from these methods. (Mike McCandless) 2. 
LUCENE-811: make SegmentInfos class, plus a few methods from related classes, package-private again (they were unnecessarily made public as part of LUCENE-701). (Mike McCandless) 3. LUCENE-710: added optional autoCommit boolean to IndexWriter constructors. When this is false, index changes are not committed until the writer is closed. This gives explicit control over when a reader will see the changes. Also added optional custom deletion policy to explicitly control when prior commits are removed from the index. This is intended to allow applications to share an index over NFS by customizing when prior commits are deleted. (Mike McCandless) 4. LUCENE-818: changed most public methods of IndexWriter, IndexReader (and its subclasses), FieldsReader and RAMDirectory to throw AlreadyClosedException if they are accessed after being closed. (Mike McCandless) 5. LUCENE-834: Changed some access levels for certain Span classes to allow them to be overridden. They have been marked expert only and not for public consumption. (Grant Ingersoll) 6. LUCENE-796: Removed calls to super.* from various get*Query methods in MultiFieldQueryParser, in order to allow sub-classes to override them. (Steven Parkes via Otis Gospodnetic) 7. LUCENE-857: Removed caching from QueryFilter and deprecated QueryFilter in favour of QueryWrapperFilter or QueryWrapperFilter + CachingWrapperFilter combination when caching is desired. (Chris Hostetter, Otis Gospodnetic) 8. LUCENE-869: Changed FSIndexInput and FSIndexOutput to inner classes of FSDirectory to enable extensibility of these classes. (Michael Busch) 9. LUCENE-580: Added the public method reset() to TokenStream. This method does nothing by default, but may be overridden by subclasses to support consuming the TokenStream more than once. (Michael Busch) 10. LUCENE-580: Added a new constructor to Field that takes a TokenStream as argument, available as tokenStreamValue(). This is useful to avoid the need of "dummy analyzers" for pre-analyzed fields; see the sketch below. (Karl Wettin, Michael Busch) 11. LUCENE-730: Added the new methods setAllowDocsOutOfOrder() and getAllowDocsOutOfOrder() to BooleanQuery. Deprecated the methods setUseScorer14() and getUseScorer14(). The optimization patch LUCENE-730 (see Optimizations->3.) improves performance for certain queries but results in scoring out of docid order. This patch reverses that change, so now by default hit docs are scored in docid order unless setAllowDocsOutOfOrder(true) is explicitly called. This patch also enables the tests in QueryUtils again that check for docid order. (Paul Elschot, Doron Cohen, Michael Busch) 12. LUCENE-888: Added Directory.openInput(File path, int bufferSize) to optionally specify the size of the read buffer. Also added BufferedIndexInput.setBufferSize(int) to change the buffer size. (Mike McCandless) 13. LUCENE-923: Make SegmentTermPositionVector package-private. It does not need to be public because it implements the public interface TermPositionVector. (Michael Busch) Bug fixes 1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen) 2. LUCENE-813: Leading wildcard fixed to work with trailing wildcard. Query parser modified to create a prefix query only for the case that there is a single trailing wildcard (and no additional wildcard or '?' in the query text). (Doron Cohen) 3. LUCENE-812: Add no-argument constructors to NativeFSLockFactory and SimpleFSLockFactory. This enables all 4 builtin LockFactory implementations to be specified via the System property org.apache.lucene.store.FSDirectoryLockFactoryClass. (Mike McCandless)
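A minimal sketch of the pre-analyzed field constructor from item 10 above; the field name and the use of WhitespaceAnalyzer as the token source are only illustrative stand-ins for a custom TokenStream:

  import java.io.StringReader;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.WhitespaceAnalyzer;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;

  public class PreAnalyzedFieldSketch {
    public static void main(String[] args) {
      // Any TokenStream can be handed to the Field directly (LUCENE-580),
      // so no "dummy analyzer" is needed when the text is already analyzed.
      TokenStream tokens = new WhitespaceAnalyzer().tokenStream(
          "contents", new StringReader("already analyzed text"));
      Document doc = new Document();
      doc.add(new Field("contents", tokens));
      System.out.println(doc);
    }
  }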
4. LUCENE-821: The new single-norm-file introduced by LUCENE-756 failed to reduce the number of open descriptors since it was still opened once per field with norms. (yonik) 5. LUCENE-823: Make sure internal file handles are closed when hitting an exception (eg disk full) while flushing deletes in IndexWriter's mergeSegments, and also during IndexWriter.addIndexes. (Mike McCandless) 6. LUCENE-825: If directory is removed after FSDirectory.getDirectory() but before IndexReader.open you now get a FileNotFoundException like Lucene pre-2.1 (before this fix you got an NPE). (Mike McCandless) 7. LUCENE-800: Removed backslash from the TERM_CHAR list in the queryparser, because the backslash is the escape character. Also changed the ESCAPED_CHAR list to contain all possible characters, because every character that follows a backslash should be considered as escaped. (Michael Busch) 8. LUCENE-372: QueryParser.parse() now ensures that the entire input string is consumed. Now a ParseException is thrown if a query contains too many closing parentheses. (Andreas Neumann via Michael Busch) 9. LUCENE-814: javacc build targets now fix line-end-style of generated files. Now also deleting all javacc generated files before calling javacc. (Steven Parkes, Doron Cohen) 10. LUCENE-829: close readers in contrib/benchmark. (Karl Wettin, Doron Cohen) 11. LUCENE-828: Minor fix for Term's equals(). (Paul Cowan via Otis Gospodnetic) 12. LUCENE-846: Fixed: if IndexWriter is opened with autoCommit=false, and you call addIndexes, and hit an exception (eg disk full) then when IndexWriter rolls back its internal state this could corrupt the instance of IndexWriter (but, not the index itself) by referencing already deleted segments. This bug was only present in 2.2 (trunk), ie was never released. (Mike McCandless) 13. LUCENE-736: Sloppy phrase query with repeating terms matches wrong docs. For example query "B C B"~2 matches the doc "A B C D E". (Doron Cohen) 14. LUCENE-789: Fixed: custom similarity is ignored when using MultiSearcher (problem reported by Alexey Lef). Now the similarity applied by MultiSearcher.setSimilarity(sim) is being used. Note that, as before this fix, creating a MultiSearcher from Searchers for which a custom similarity was set has no effect - it is masked by the similarity of the MultiSearcher. This is as designed, because MultiSearcher operates on Searchables (not Searchers). (Doron Cohen) 15. LUCENE-880: Fixed DocumentWriter to close the TokenStreams after it has written the postings. Then the resources associated with the TokenStreams can safely be released. (Michael Busch) 16. LUCENE-883: consecutive calls to Spellchecker.indexDictionary() won't insert terms twice anymore. (Daniel Naber) 17. LUCENE-881: QueryParser.escape() now also escapes the characters '|' and '&' which are part of the queryparser syntax. (Michael Busch) 18. LUCENE-886: Spellchecker clean up: exceptions are no longer printed to STDERR and ignored, but re-thrown. Some javadoc improvements. (Daniel Naber) 19. LUCENE-698: FilteredQuery now takes the query boost into account for scoring. (Michael Busch) 20. LUCENE-763: Spellchecker: LuceneDictionary used to skip first word in enumeration. (Christian Mallwitz via Daniel Naber) 21. LUCENE-903: FilteredQuery explanation inaccuracy with boost. Explanation tests now "deep" check the explanation details.
(Chris Hostetter, Doron Cohen) 22. LUCENE-912: DisjunctionMaxScorer first skipTo(target) call ignores the skip target param and ends up at the first match. (Sudaakeran B. via Chris Hostetter & Doron Cohen) 23. LUCENE-913: Two consecutive score() calls return different scores for Boolean Queries. (Michael Busch, Doron Cohen) 24. LUCENE-1013: Fix IndexWriter.setMaxMergeDocs to work "out of the box", again, by moving set/getMaxMergeDocs up from LogDocMergePolicy into LogMergePolicy. This fixes the API breakage (non backwards compatible change) caused by LUCENE-994. (Yonik Seeley via Mike McCandless) New features 1. LUCENE-759: Added two n-gram-producing TokenFilters. (Otis Gospodnetic) 2. LUCENE-822: Added FieldSelector capabilities to Searchable for use with RemoteSearcher, and other Searchable implementations. (Mark Miller, Grant Ingersoll) 3. LUCENE-755: Added the ability to store arbitrary binary metadata in the posting list. These metadata are called Payloads. For every position of a Token one Payload in the form of a variable length byte array can be stored in the prox file. Remark: The APIs introduced with this feature are in experimental state and thus contain appropriate warnings in the javadocs. (Michael Busch) 4. LUCENE-834: Added BoostingTermQuery which can boost scores based on the values of a payload (see #3 above.) (Grant Ingersoll) 5. LUCENE-834: Similarity has a new method for scoring payloads called scorePayloads that can be overridden to take advantage of payload storage (see #3 above) 6. LUCENE-834: Added isPayloadAvailable() onto TermPositions interface and implemented it in the appropriate places (Grant Ingersoll) 7. LUCENE-853: Added RemoteCachingWrapperFilter to enable caching of Filters on the remote side of the RMI connection. (Matt Ericson via Otis Gospodnetic) 8. LUCENE-446: Added Solr's search.function for scores based on field values, plus CustomScoreQuery for simple score (post) customization. (Yonik Seeley, Doron Cohen) 9. LUCENE-1058: Added new TeeTokenFilter (like the UNIX 'tee' command) and SinkTokenizer which can be used to share tokens between two or more Fields such that the other Fields do not have to go through the whole Analysis process over again. For instance, if you have two Fields that share all the same analysis steps except one lowercases tokens and the other does not, you can coordinate the operations between the two using the TeeTokenFilter and the SinkTokenizer. See TeeSinkTokenTest.java for examples. (Grant Ingersoll, Michael Busch, Yonik Seeley) Optimizations 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions when nextPosition() is called for the first time. This allows using instances of SegmentTermPositions instead of SegmentTermDocs without additional costs. (Michael Busch) 2. LUCENE-431: RAMInputStream and RAMOutputStream extend IndexInput and IndexOutput directly now. This avoids further buffering and thus avoids unnecessary array copies. (Michael Busch) 3. LUCENE-730: Updated BooleanScorer2 to make use of BooleanScorer in some cases and possibly improve scoring performance. Documents can now be delivered out-of-order as they are scored (e.g. to HitCollector). N.B. A bit of code had to be disabled in QueryUtils in order for TestBoolean2 test to keep passing. (Paul Elschot via Otis Gospodnetic) 4. LUCENE-882: Spellchecker doesn't store the ngrams anymore but only indexes them to keep the spell index small. (Daniel Naber) 5. LUCENE-430: Delay allocation of the buffer after a clone of BufferedIndexInput. 
Together with LUCENE-888 this will allow to adjust the buffer size dynamically. (Paul Elschot, Michael Busch) 6. LUCENE-888: Increase buffer sizes inside CompoundFileWriter and BufferedIndexOutput. Also increase buffer size in BufferedIndexInput, but only when used during merging. Together, these increases yield 10-18% overall performance gain vs the previous 1K defaults. (Mike McCandless) 7. LUCENE-866: Adds multi-level skip lists to the posting lists. This speeds up most queries that use skipTo(), especially on big indexes with large posting lists. For average AND queries the speedup is about 20%, for queries that contain very frequent and very unique terms the speedup can be over 80%. (Michael Busch) Documentation 1. LUCENE 791 && INFRA-1173: Infrastructure moved the Wiki to http://wiki.apache.org/lucene-java/ Updated the links in the docs and wherever else I found references. (Grant Ingersoll, Joe Schaefer) 2. LUCENE-807: Fixed the javadoc for ScoreDocComparator.compare() to be consistent with java.util.Comparator.compare(): Any integer is allowed to be returned instead of only -1/0/1. (Paul Cowan via Michael Busch) 3. LUCENE-875: Solved javadoc warnings & errors under jdk1.4. Solved javadoc errors under jdk5 (jars in path for gdata). Made "javadocs" target depend on "build-contrib" for first downloading contrib jars configured for dynamic downloaded. (Note: when running behind firewall, a firewall prompt might pop up) (Doron Cohen) 4. LUCENE-740: Added SNOWBALL-LICENSE.txt to the snowball package and a remark about the license to NOTICE.TXT. (Steven Parkes via Michael Busch) 5. LUCENE-925: Added analysis package javadocs. (Grant Ingersoll and Doron Cohen) 6. LUCENE-926: Added document package javadocs. (Grant Ingersoll) Build 1. LUCENE-802: Added LICENSE.TXT and NOTICE.TXT to Lucene jars. (Steven Parkes via Michael Busch) 2. LUCENE-885: "ant test" now includes all contrib tests. The new "ant test-core" target can be used to run only the Core (non contrib) tests. (Chris Hostetter) 3. LUCENE-900: "ant test" now enables Java assertions (in Lucene packages). (Doron Cohen) 4. LUCENE-894: Add custom build file for binary distributions that includes targets to build the demos. (Chris Hostetter, Michael Busch) 5. LUCENE-904: The "package" targets in build.xml now also generate .md5 checksum files. (Chris Hostetter, Michael Busch) 6. LUCENE-907: Include LICENSE.TXT and NOTICE.TXT in the META-INF dirs of demo war, demo jar, and the contrib jars. (Michael Busch) 7. LUCENE-909: Demo targets for running the demo. (Doron Cohen) 8. LUCENE-908: Improves content of MANIFEST file and makes it customizable for the contribs. Adds SNOWBALL-LICENSE.txt to META-INF of the snowball jar and makes sure that the lucli jar contains LICENSE.txt and NOTICE.txt. (Chris Hostetter, Michael Busch) 9. LUCENE-930: Various contrib building improvements to ensure contrib dependencies are met, and test compilation errors fail the build. (Steven Parkes, Chris Hostetter) 10. LUCENE-622: Add ant target and pom.xml files for building maven artifacts of the Lucene core and the contrib modules. (Sami Siren, Karl Wettin, Michael Busch) ======================= Release 2.1.0 2007-02-14 ======================= Changes in runtime behavior 1. 's' and 't' have been removed from the list of default stopwords in StopAnalyzer (also used in by StandardAnalyzer). Having e.g. 's' as a stopword meant that 's-class' led to the same results as 'class'. Note that this problem still exists for 'a', e.g. 
in 'a-class' as 'a' continues to be a stopword. (Daniel Naber) 2. LUCENE-478: Updated the list of Unicode code point ranges for CJK (now split into CJ and K) in StandardAnalyzer. (John Wang and Steven Rowe via Otis Gospodnetic) 3. Modified some CJK Unicode code point ranges in StandardTokenizer.jj, and added a few more of them to increase CJK character coverage. Also documented some of the ranges. (Otis Gospodnetic) 4. LUCENE-489: Add support for leading wildcard characters (*, ?) to QueryParser. Default is to disallow them, as before. (Steven Parkes via Otis Gospodnetic) 5. LUCENE-703: QueryParser changed to default to use of ConstantScoreRangeQuery for range queries. Added useOldRangeQuery property to QueryParser to allow selection of old RangeQuery class if required. (Mark Harwood) 6. LUCENE-543: WildcardQuery now performs a TermQuery if the provided term does not contain a wildcard character (? or *), when previously a StringIndexOutOfBoundsException was thrown. (Michael Busch via Erik Hatcher) 7. LUCENE-726: Removed the use of deprecated doc.fields() method and Enumeration. (Michael Busch via Otis Gospodnetic) 8. LUCENE-436: Removed finalize() in TermInfosReader and SegmentReader, and added a call to enumerators.remove() in TermInfosReader.close(). The finalize() overrides were added to help with a pre-1.4.2 JVM bug that has since been fixed, plus we no longer support pre-1.4.2 JVMs. (Otis Gospodnetic) 9. LUCENE-771: The default location of the write lock is now the index directory, and is named simply "write.lock" (without a big digest prefix). The system properties "org.apache.lucene.lockDir" nor "java.io.tmpdir" are no longer used as the global directory for storing lock files, and the LOCK_DIR field of FSDirectory is now deprecated. (Mike McCandless) New features 1. LUCENE-503: New ThaiAnalyzer and ThaiWordFilter in contrib/analyzers (Samphan Raruenrom via Chris Hostetter) 2. LUCENE-545: New FieldSelector API and associated changes to IndexReader and implementations. New Fieldable interface for use with the lazy field loading mechanism. (Grant Ingersoll and Chuck Williams via Grant Ingersoll) 3. LUCENE-676: Move Solr's PrefixFilter to Lucene core. (Yura Smolsky, Yonik Seeley) 4. LUCENE-678: Added NativeFSLockFactory, which implements locking using OS native locking (via java.nio.*). (Michael McCandless via Yonik Seeley) 5. LUCENE-544: Added the ability to specify different boosts for different fields when using MultiFieldQueryParser (Matt Ericson via Otis Gospodnetic) 6. LUCENE-528: New IndexWriter.addIndexesNoOptimize() that doesn't optimize the index when adding new segments, only performing merges as needed. (Ning Li via Yonik Seeley) 7. LUCENE-573: QueryParser now allows backslash escaping in quoted terms and phrases. (Michael Busch via Yonik Seeley) 8. LUCENE-716: QueryParser now allows specification of Unicode characters in terms via a unicode escape of the form \uXXXX (Michael Busch via Yonik Seeley) 9. LUCENE-709: Added RAMDirectory.sizeInBytes(), IndexWriter.ramSizeInBytes() and IndexWriter.flushRamSegments(), allowing applications to control the amount of memory used to buffer documents. (Chuck Williams via Yonik Seeley) 10. LUCENE-723: QueryParser now parses *:* as MatchAllDocsQuery (Yonik Seeley) 11. LUCENE-741: Command-line utility for modifying or removing norms on fields in an existing index. This is mostly based on LUCENE-496 and lives in contrib/miscellaneous. (Chris Hostetter, Otis Gospodnetic) 12. 
LUCENE-759: Added NGramTokenizer and EdgeNGramTokenizer classes and their passing unit tests. (Otis Gospodnetic) 13. LUCENE-565: Added methods to IndexWriter to more efficiently handle updating documents (the "delete then add" use case). This is intended to be an eventual replacement for the existing IndexModifier. Added IndexWriter.flush() (renamed from flushRamSegments()) to flush all pending updates (held in RAM), to the Directory. (Ning Li via Mike McCandless) 14. LUCENE-762: Added in SIZE and SIZE_AND_BREAK FieldSelectorResult options which allow one to retrieve the size of a field without retrieving the actual field. (Chuck Williams via Grant Ingersoll) 15. LUCENE-799: Properly handle lazy, compressed fields. (Mike Klaas via Grant Ingersoll) API Changes 1. LUCENE-438: Remove "final" from Token, implement Cloneable, allow changing of termText via setTermText(). (Yonik Seeley) 2. org.apache.lucene.analysis.nl.WordlistLoader has been deprecated and is supposed to be replaced with the WordlistLoader class in package org.apache.lucene.analysis (Daniel Naber) 3. LUCENE-609: Revert return type of Document.getField(s) to Field for backward compatibility, added new Document.getFieldable(s) for access to new lazy loaded fields. (Yonik Seeley) 4. LUCENE-608: Document.fields() has been deprecated and a new method Document.getFields() has been added that returns a List instead of an Enumeration (Daniel Naber) 5. LUCENE-605: New Explanation.isMatch() method and new ComplexExplanation subclass allows explain methods to produce Explanations which model "matching" independent of having a positive value. (Chris Hostetter) 6. LUCENE-621: New static methods IndexWriter.setDefaultWriteLockTimeout and IndexWriter.setDefaultCommitLockTimeout for overriding default timeout values for all future instances of IndexWriter (as well as for any other classes that may reference the static values, ie: IndexReader). (Michael McCandless via Chris Hostetter) 7. LUCENE-638: FSDirectory.list() now only returns the directory's Lucene-related files. Thanks to this change one can now construct a RAMDirectory from a file system directory that contains files not related to Lucene. (Simon Willnauer via Daniel Naber) 8. LUCENE-635: Decoupling locking implementation from Directory implementation. Added set/getLockFactory to Directory and moved all locking code into subclasses of abstract class LockFactory. FSDirectory and RAMDirectory still default to their prior locking implementations, but now you can mix & match, for example using SingleInstanceLockFactory (ie, in memory locking) locking with an FSDirectory. Note that now you must call setDisableLocks before the instantiation a FSDirectory if you wish to disable locking for that Directory. (Michael McCandless, Jeff Patterson via Yonik Seeley) 9. LUCENE-657: Made FuzzyQuery non-final and inner ScoreTerm protected. (Steven Parkes via Otis Gospodnetic) 10. LUCENE-701: Lockless commits: a commit lock is no longer required when a writer commits and a reader opens the index. This includes a change to the index file format (see docs/fileformats.html for details). It also removes all APIs associated with the commit lock & its timeout. Readers are now truly read-only and do not block one another on startup. This is the first step to getting Lucene to work correctly over NFS (second step is LUCENE-710). (Mike McCandless) 11. LUCENE-722: DEFAULT_MIN_DOC_FREQ was misspelled DEFALT_MIN_DOC_FREQ in Similarity's MoreLikeThis class. The misspelling has been replaced by the correct spelling. 
(Andi Vajda via Daniel Naber) 12. LUCENE-738: Reduce the size of the file that keeps track of which documents are deleted when the number of deleted documents is small. This changes the index file format and cannot be read by previous versions of Lucene. (Doron Cohen via Yonik Seeley) 13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the number of open files and file descriptors for the non-compound index format. This changes the index file format, but maintains the ability to read and update older indices. The first segment merge on an older format index will create a single .nrm file for the new segment. (Doron Cohen via Yonik Seeley) 14. LUCENE-732: DateTools support has been added to QueryParser, with setters for both the default Resolution, and per-field Resolution. For backwards compatibility, DateField is still used if no Resolutions are specified. (Michael Busch via Chris Hostetter) 15. Added isOptimized() method to IndexReader. (Otis Gospodnetic) 16. LUCENE-773: Deprecate the FSDirectory.getDirectory(*) methods that take a boolean "create" argument. Instead you should use IndexWriter's "create" argument to create a new index. (Mike McCandless) 17. LUCENE-780: Add a static Directory.copy() method to copy files from one Directory to another. (Jiri Kuhn via Mike McCandless) 18. LUCENE-773: Added Directory.clearLock(String name) to forcefully remove an old lock. The default implementation is to ask the lockFactory (if non null) to clear the lock. (Mike McCandless) 19. LUCENE-795: Directory.renameFile() has been deprecated as it is not used anymore inside Lucene. (Daniel Naber) Bug fixes 1. Fixed the web application demo (built with "ant war-demo") which didn't work because it used a QueryParser method that had been removed (Daniel Naber) 2. LUCENE-583: ISOLatin1AccentFilter fails to preserve positionIncrement (Yonik Seeley) 3. LUCENE-575: SpellChecker min score is incorrectly changed by suggestSimilar (Karl Wettin via Yonik Seeley) 4. LUCENE-587: Explanation.toHtml was producing malformed HTML (Chris Hostetter) 5. Fix to allow MatchAllDocsQuery to be used with RemoteSearcher (Yonik Seeley) 6. LUCENE-601: RAMDirectory and RAMFile made Serializable (Karl Wettin via Otis Gospodnetic) 7. LUCENE-557: Fixes to BooleanQuery and FilteredQuery so that the score Explanations match up with the real scores. (Chris Hostetter) 8. LUCENE-607: ParallelReader's TermEnum fails to advance properly to new fields (Chuck Williams, Christian Kohlschuetter via Yonik Seeley) 9. LUCENE-610,LUCENE-611: Simple syntax changes to allow compilation with ecj: disambiguate inner class scorer's use of doc() in BooleanScorer2, other test code changes. (DM Smith via Yonik Seeley) 10. LUCENE-451: All core query types now use ComplexExplanations so that boosts of zero don't confuse the BooleanWeight explain method. (Chris Hostetter) 11. LUCENE-593: Fixed LuceneDictionary's inner Iterator (Kåre Fiedler Christiansen via Otis Gospodnetic) 12. LUCENE-641: fixed an off-by-one bug with IndexWriter.setMaxFieldLength() (Daniel Naber) 13. LUCENE-659: Make PerFieldAnalyzerWrapper delegate getPositionIncrementGap() to the correct analyzer for the field. (Chuck Williams via Yonik Seeley) 14. LUCENE-650: Fixed NPE in Locale specific String Sort when Document has no value. (Oliver Hutchison via Chris Hostetter) 15. LUCENE-683: Fixed data corruption when reading lazy loaded fields. (Yonik Seeley) 16. LUCENE-678: Fixed bug in NativeFSLockFactory which caused the same lock to be shared between different directories. 
(Michael McCandless via Yonik Seeley) 17. LUCENE-690: Fixed thread unsafe use of IndexInput by lazy loaded fields. (Yonik Seeley) 18. LUCENE-696: Fix bug when scorer for DisjunctionMaxQuery has skipTo() called on it before next(). (Yonik Seeley) 19. LUCENE-569: Fixed SpanNearQuery bug, for 'inOrder' queries it would fail to recognize ordered spans if they overlapped with unordered spans. (Paul Elschot via Chris Hostetter) 20. LUCENE-706: Updated fileformats.xml|html concerning the docdelta value in the frequency file. (Johan Stuyts, Doron Cohen via Grant Ingersoll) 21. LUCENE-715: Fixed private constructor in IndexWriter.java to properly release the acquired write lock if there is an IOException after acquiring the write lock but before finishing instantiation. (Matthew Bogosian via Mike McCandless) 22. LUCENE-651: Multiple different threads requesting the same FieldCache entry (often for Sorting by a field) at the same time caused multiple generations of that entry, which was detrimental to performance and memory use. (Oliver Hutchison via Otis Gospodnetic) 23. LUCENE-717: Fixed build.xml not to fail when there is no lib dir. (Doron Cohen via Otis Gospodnetic) 24. LUCENE-728: Removed duplicate/old MoreLikeThis and SimilarityQueries classes from contrib/similarity, as their new home is under contrib/queries. (Otis Gospodnetic) 25. LUCENE-669: Do not double-close the RandomAccessFile in FSIndexInput/Output during finalize(). Besides sending an IOException up to the GC, this may also be the cause intermittent "The handle is invalid" IOExceptions on Windows when trying to close readers or writers. (Michael Busch via Mike McCandless) 26. LUCENE-702: Fix IndexWriter.addIndexes(*) to not corrupt the index on any exceptions (eg disk full). The semantics of these methods is now transactional: either all indices are merged or none are. Also fixed IndexWriter.mergeSegments (called outside of addIndexes(*) by addDocument, optimize, flushRamSegments) and IndexReader.commit() (called by close) to clean up and keep the instance state consistent to what's actually in the index (Mike McCandless). 27. LUCENE-129: Change finalizers to do "try {...} finally {super.finalize();}" to make sure we don't miss finalizers in classes above us. (Esmond Pitt via Mike McCandless) 28. LUCENE-754: Fix a problem introduced by LUCENE-651, causing IndexReaders to hang around forever, in addition to not fixing the original FieldCache performance problem. (Chris Hostetter, Yonik Seeley) 29. LUCENE-140: Fix IndexReader.deleteDocument(int docNum) to correctly raise ArrayIndexOutOfBoundsException when docNum is too large. Previously, if docNum was only slightly too large (within the same multiple of 8, ie, up to 7 ints beyond maxDoc), no exception would be raised and instead the index would become silently corrupted. The corruption then only appears much later, in mergeSegments, when the corrupted segment is merged with segment(s) after it. (Mike McCandless) 30. LUCENE-768: Fix case where an Exception during deleteDocument, undeleteAll or setNorm in IndexReader could leave the reader in a state where close() fails to release the write lock. (Mike McCandless) 31. Remove "tvp" from known index file extensions because it is never used. (Nicolas Lalevée via Bernhard Messer) 32. LUCENE-767: Change how SegmentReader.maxDoc() is computed to not rely on file length check and instead use the SegmentInfo's docCount that's already stored explicitly in the index. 
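A rough sketch of the addIndexes(*) call whose failure semantics item 26 above makes transactional; the SimpleAnalyzer and the assumption that the target index is created from scratch are arbitrary choices for illustration:

    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;

    public class AddIndexesSketch {
        // Merge several existing source indexes into a freshly created target.
        // Per item 26 above, a failure (e.g. disk full) leaves the target in its
        // previous state rather than partially merged.
        public static void merge(Directory target, Directory[] sources) throws Exception {
            IndexWriter writer = new IndexWriter(target, new SimpleAnalyzer(), true,
                                                 IndexWriter.MaxFieldLength.UNLIMITED);
            try {
                writer.addIndexes(sources);
            } finally {
                writer.close();
            }
        }
    }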
This is a defensive bug fix (ie, there is no known problem seen "in real life" due to this, just a possible future problem). (Chuck Williams via Mike McCandless) Optimizations 1. LUCENE-586: TermDocs.skipTo() is now more efficient for multi-segment indexes. This will improve the performance of many types of queries against a non-optimized index. (Andrew Hudson via Yonik Seeley) 2. LUCENE-623: RAMDirectory.close now nulls out its reference to all internal "files", allowing them to be GCed even if references to the RAMDirectory itself still exist. (Nadav Har'El via Chris Hostetter) 3. LUCENE-629: Compressed fields are no longer uncompressed and recompressed during segment merges (e.g. during indexing or optimizing), thus improving performance . (Michael Busch via Otis Gospodnetic) 4. LUCENE-388: Improve indexing performance when maxBufferedDocs is large by keeping a count of buffered documents rather than counting after each document addition. (Doron Cohen, Paul Smith, Yonik Seeley) 5. Modified TermScorer.explain to use TermDocs.skipTo() instead of looping through docs. (Grant Ingersoll) 6. LUCENE-672: New indexing segment merge policy flushes all buffered docs to their own segment and delays a merge until mergeFactor segments of a certain level have been accumulated. This increases indexing performance in the presence of deleted docs or partially full segments as well as enabling future optimizations. NOTE: this also fixes an "under-merging" bug whereby it is possible to get far too many segments in your index (which will drastically slow down search, risks exhausting file descriptor limit, etc.). This can happen when the number of buffered docs at close, plus the number of docs in the last non-ram segment is greater than mergeFactor. (Ning Li, Yonik Seeley) 7. Lazy loaded fields unnecessarily retained an extra copy of loaded String data. (Yonik Seeley) 8. LUCENE-443: ConjunctionScorer performance increase. Speed up any BooleanQuery with more than one mandatory clause. (Abdul Chaudhry, Paul Elschot via Yonik Seeley) 9. LUCENE-365: DisjunctionSumScorer performance increase of ~30%. Speeds up queries with optional clauses. (Paul Elschot via Yonik Seeley) 10. LUCENE-695: Optimized BufferedIndexInput.readBytes() for medium size buffers, which will speed up merging and retrieving binary and compressed fields. (Nadav Har'El via Yonik Seeley) 11. LUCENE-687: Lazy skipping on proximity file speeds up most queries involving term positions, including phrase queries. (Michael Busch via Yonik Seeley) 12. LUCENE-714: Replaced 2 cases of manual for-loop array copying with calls to System.arraycopy instead, in DocumentWriter.java. (Nicolas Lalevee via Mike McCandless) 13. LUCENE-729: Non-recursive skipTo and next implementation of TermDocs for a MultiReader. The old implementation could recurse up to the number of segments in the index. (Yonik Seeley) 14. LUCENE-739: Improve segment merging performance by reusing the norm array across different fields and doing bulk writes of norms of segments with no deleted docs. (Michael Busch via Yonik Seeley) 15. LUCENE-745: Add BooleanQuery.clauses(), allowing direct access to the List of clauses and replaced the internal synchronized Vector with an unsynchronized List. (Yonik Seeley) 16. LUCENE-750: Remove finalizers from FSIndexOutput and move the FSIndexInput finalizer to the actual file so all clones don't register a new finalizer. (Yonik Seeley) Test Cases 1. Added TestTermScorer.java (Grant Ingersoll) 2. 
Added TestWindowsMMap.java (Benson Margulies via Mike McCandless) 3. LUCENE-744 Append the user.name property onto the temporary directory that is created so it doesn't interfere with other users. (Grant Ingersoll) Documentation 1. Added style sheet to xdocs named lucene.css and included in the Anakia VSL descriptor. (Grant Ingersoll) 2. Added scoring.xml document into xdocs. Updated Similarity.java scoring formula.(Grant Ingersoll and Steve Rowe. Updates from: Michael McCandless, Doron Cohen, Chris Hostetter, Doug Cutting). Issue 664. 3. Added javadocs for FieldSelectorResult.java. (Grant Ingersoll) 4. Moved xdocs directory to src/site/src/documentation/content/xdocs per Issue 707. Site now builds using Forrest, just like the other Lucene siblings. See http://wiki.apache.org/jakarta-lucene/HowToUpdateTheWebsite for info on updating the website. (Grant Ingersoll with help from Steve Rowe, Chris Hostetter, Doug Cutting, Otis Gospodnetic, Yonik Seeley) 5. Added in Developer and System Requirements sections under Resources (Grant Ingersoll) 6. LUCENE-713 Updated the Term Vector section of File Formats to include documentation on how Offset and Position info are stored in the TVF file. (Grant Ingersoll, Samir Abdou) 7. Added in link to Clover Test Code Coverage Reports under the Develop section in Resources (Grant Ingersoll) 8. LUCENE-748: Added details for semantics of IndexWriter.close on hitting an Exception. (Jed Wesley-Smith via Mike McCandless) 9. Added some text about what is contained in releases. (Eric Haszlakiewicz via Grant Ingersoll) 10. LUCENE-758: Fix javadoc to clarify that RAMDirectory(Directory) makes a full copy of the starting Directory. (Mike McCandless) 11. LUCENE-764: Fix javadocs to detail temporary space requirements for IndexWriter's optimize(), addIndexes(*) and addDocument(...) methods. (Mike McCandless) Build 1. Added in clover test code coverage per http://issues.apache.org/jira/browse/LUCENE-721 To enable clover code coverage, you must have clover.jar in the ANT classpath and specify -Drun.clover=true on the command line. (Michael Busch and Grant Ingersoll) 2. Added a sysproperty in common-build.xml per Lucene 752 to map java.io.tmpdir to ${build.dir}/test just like the tempDir sysproperty. 3. LUCENE-757 Added new target named init-dist that does setup for distribution of both binary and source distributions. Called by package and package-*-src ======================= Release 2.0.0 2006-05-26 ======================= API Changes 1. All deprecated methods and fields have been removed, except DateField, which will still be supported for some time so Lucene can read its date fields from old indexes (Yonik Seeley & Grant Ingersoll) 2. DisjunctionSumScorer is no longer public. (Paul Elschot via Otis Gospodnetic) 3. Creating a Field with both an empty name and an empty value now throws an IllegalArgumentException (Daniel Naber) 4. LUCENE-301: Added new IndexWriter({String,File,Directory}, Analyzer) constructors that do not take a boolean "create" argument. These new constructors will create a new index if necessary, else append to the existing one. (Dan Armbrust via Mike McCandless) New features 1. LUCENE-496: Command line tool for modifying the field norms of an existing index; added to contrib/miscellaneous. (Chris Hostetter) 2. LUCENE-577: SweetSpotSimilarity added to contrib/miscellaneous. (Chris Hostetter) Bug fixes 1. LUCENE-330: Fix issue of FilteredQuery not working properly within BooleanQuery. (Paul Elschot via Erik Hatcher) 2. 
LUCENE-515: Make ConstantScoreRangeQuery and ConstantScoreQuery work with RemoteSearchable. (Philippe Laflamme via Yonik Seeley) 3. Added methods to get/set writeLockTimeout and commitLockTimeout in IndexWriter. These could be set in Lucene 1.4 using a system property. This feature had been removed without adding the corresponding getter/setter methods. (Daniel Naber) 4. LUCENE-413: Fixed ArrayIndexOutOfBoundsException exceptions when using SpanQueries. (Paul Elschot via Yonik Seeley) 5. Implemented FilterIndexReader.getVersion() and isCurrent() (Yonik Seeley) 6. LUCENE-540: Fixed a bug with IndexWriter.addIndexes(Directory[]) that sometimes caused the index order of documents to change. (Yonik Seeley) 7. LUCENE-526: Fixed a bug in FieldSortedHitQueue that caused subsequent String sorts with different locales to sort identically. (Paul Cowan via Yonik Seeley) 8. LUCENE-541: Add missing extractTerms() to DisjunctionMaxQuery (Stefan Will via Yonik Seeley) 9. LUCENE-514: Added getTermArrays() and extractTerms() to MultiPhraseQuery (Eric Jain & Yonik Seeley) 10. LUCENE-512: Fixed ClassCastException in ParallelReader.getTermFreqVectors (frederic via Yonik) 11. LUCENE-352: Fixed bug in SpanNotQuery that manifested as NullPointerException when "exclude" query was not a SpanTermQuery. (Chris Hostetter) 12. LUCENE-572: Fixed bug in SpanNotQuery hashCode, was ignoring exclude clause (Chris Hostetter) 13. LUCENE-561: Fixed some ParallelReader bugs. NullPointerException if the reader didn't know about the field yet, reader didn't keep track if it had deletions, and deleteDocument calls could circumvent synchronization on the subreaders. (Chuck Williams via Yonik Seeley) 14. LUCENE-556: Added empty extractTerms() implementation to MatchAllDocsQuery and ConstantScoreQuery in order to allow their use with a MultiSearcher. (Yonik Seeley) 15. LUCENE-546: Removed 2GB file size limitations for RAMDirectory. (Peter Royal, Michael Chan, Yonik Seeley) 16. LUCENE-485: Don't hold commit lock while removing obsolete index files. (Luc Vanlerberghe via cutting) 1.9.1 Bug fixes 1. LUCENE-511: Fix a bug in the BufferedIndexOutput optimization introduced in 1.9-final. (Shay Banon & Steven Tamm via cutting) 1.9 final Note that this release is mostly but not 100% source compatible with the previous release of Lucene (1.4.3). In other words, you should make sure your application compiles with this version of Lucene before you replace the old Lucene JAR with the new one. Many methods have been deprecated in anticipation of release 2.0, so deprecation warnings are to be expected when upgrading from 1.4.3 to 1.9. Bug fixes 1. The fix that made IndexWriter.setMaxBufferedDocs(1) work had negative effects on indexing performance and has thus been reverted. The argument for setMaxBufferedDocs(int) must now at least be 2, otherwise an exception is thrown. (Daniel Naber) Optimizations 1. Optimized BufferedIndexOutput.writeBytes() to use System.arraycopy() in more cases, rather than copying byte-by-byte. (Lukas Zapletal via Cutting) 1.9 RC1 Requirements 1. To compile and use Lucene you now need Java 1.4 or later. Changes in runtime behavior 1. FuzzyQuery can no longer throw a TooManyClauses exception. If a FuzzyQuery expands to more than BooleanQuery.maxClauseCount terms only the BooleanQuery.maxClauseCount most similar terms go into the rewritten query and thus the exception is avoided. (Christoph) 2. 
Changed system property from "org.apache.lucene.lockdir" to "org.apache.lucene.lockDir", so that its casing follows the existing pattern used in other Lucene system properties. (Bernhard) 3. The terms of RangeQueries and FuzzyQueries are now converted to lowercase by default (as it has been the case for PrefixQueries and WildcardQueries before). Use setLowercaseExpandedTerms(false) to disable that behavior but note that this also affects PrefixQueries and WildcardQueries. (Daniel Naber) 4. Document frequency that is computed when MultiSearcher is used is now computed correctly and "globally" across subsearchers and indices, while before it used to be computed locally to each index, which caused ranking across multiple indices not to be equivalent. (Chuck Williams, Wolf Siberski via Otis, bug #31841) 5. When opening an IndexWriter with create=true, Lucene now only deletes its own files from the index directory (looking at the file name suffixes to decide if a file belongs to Lucene). The old behavior was to delete all files. (Daniel Naber and Bernhard Messer, bug #34695) 6. The version of an IndexReader, as returned by getCurrentVersion() and getVersion() doesn't start at 0 anymore for new indexes. Instead, it is now initialized by the system time in milliseconds. (Bernhard Messer via Daniel Naber) 7. Several default values cannot be set via system properties anymore, as this has been considered inappropriate for a library like Lucene. For most properties there are set/get methods available in IndexWriter which you should use instead. This affects the following properties: See IndexWriter for getter/setter methods: org.apache.lucene.writeLockTimeout, org.apache.lucene.commitLockTimeout, org.apache.lucene.minMergeDocs, org.apache.lucene.maxMergeDocs, org.apache.lucene.maxFieldLength, org.apache.lucene.termIndexInterval, org.apache.lucene.mergeFactor, See BooleanQuery for getter/setter methods: org.apache.lucene.maxClauseCount See FSDirectory for getter/setter methods: disableLuceneLocks (Daniel Naber) 8. Fixed FieldCacheImpl to use user-provided IntParser and FloatParser, instead of using Integer and Float classes for parsing. (Yonik Seeley via Otis Gospodnetic) 9. Expert level search routines returning TopDocs and TopFieldDocs no longer normalize scores. This also fixes bugs related to MultiSearchers and score sorting/normalization. (Luc Vanlerberghe via Yonik Seeley, LUCENE-469) New features 1. Added support for stored compressed fields (patch #31149) (Bernhard Messer via Christoph) 2. Added support for binary stored fields (patch #29370) (Drew Farris and Bernhard Messer via Christoph) 3. Added support for position and offset information in term vectors (patch #18927). (Grant Ingersoll & Christoph) 4. A new class DateTools has been added. It allows you to format dates in a readable format adequate for indexing. Unlike the existing DateField class DateTools can cope with dates before 1970 and it forces you to specify the desired date resolution (e.g. month, day, second, ...) which can make RangeQuerys on those fields more efficient. (Daniel Naber) 5. QueryParser now correctly works with Analyzers that can return more than one token per position. For example, a query "+fast +car" would be parsed as "+fast +(car automobile)" if the Analyzer returns "car" and "automobile" at the same position whenever it finds "car" (Patch #23307). (Pierrick Brihaye, Daniel Naber) 6. Permit unbuffered Directory implementations (e.g., using mmap). 
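For the DateTools entry above (new feature 4), a small sketch of encoding a date at a chosen resolution before indexing it; the "modified" field name and the DAY resolution are example choices only:

    import java.util.Date;
    import org.apache.lucene.document.DateTools;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public class DateToolsSketch {
        public static void main(String[] args) throws Exception {
            // Encode a date at day resolution; the resulting string compares
            // lexicographically, so it works with range queries and filters.
            String encoded = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);

            Document doc = new Document();
            doc.add(new Field("modified", encoded, Field.Store.YES, Field.Index.NOT_ANALYZED));

            // And back again when displaying a stored value.
            Date roundTripped = DateTools.stringToDate(encoded);
            System.out.println(encoded + " -> " + roundTripped);
        }
    }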
InputStream is replaced by the new classes IndexInput and BufferedIndexInput. OutputStream is replaced by the new classes IndexOutput and BufferedIndexOutput. InputStream and OutputStream are now deprecated and FSDirectory is now subclassable. (cutting) 7. Add native Directory and TermDocs implementations that work under GCJ. These require GCC 3.4.0 or later and have only been tested on Linux. Use 'ant gcj' to build demo applications. (cutting) 8. Add MMapDirectory, which uses nio to mmap input files. This is still somewhat slower than FSDirectory. However it uses less memory per query term, since a new buffer is not allocated per term, which may help applications which use, e.g., wildcard queries. It may also someday be faster. (cutting & Paul Elschot) 9. Added javadocs-internal to build.xml - bug #30360 (Paul Elschot via Otis) 10. Added RangeFilter, a more generically useful filter than DateFilter. (Chris M Hostetter via Erik) 11. Added NumberTools, a utility class indexing numeric fields. (adapted from code contributed by Matt Quail; committed by Erik) 12. Added public static IndexReader.main(String[] args) method. IndexReader can now be used directly at command line level to list and optionally extract the individual files from an existing compound index file. (adapted from code contributed by Garrett Rooney; committed by Bernhard) 13. Add IndexWriter.setTermIndexInterval() method. See javadocs. (Doug Cutting) 14. Added LucenePackage, whose static get() method returns java.util.Package, which lets the caller get the Lucene version information specified in the Lucene Jar. (Doug Cutting via Otis) 15. Added Hits.iterator() method and corresponding HitIterator and Hit objects. This provides standard java.util.Iterator iteration over Hits. Each call to the iterator's next() method returns a Hit object. (Jeremy Rayner via Erik) 16. Add ParallelReader, an IndexReader that combines separate indexes over different fields into a single virtual index. (Doug Cutting) 17. Add IntParser and FloatParser interfaces to FieldCache, so that fields in arbitrarily formats can be cached as ints and floats. (Doug Cutting) 18. Added class org.apache.lucene.index.IndexModifier which combines IndexWriter and IndexReader, so you can add and delete documents without worrying about synchronization/locking issues. (Daniel Naber) 19. Lucene can now be used inside an unsigned applet, as Lucene's access to system properties will not cause a SecurityException anymore. (Jon Schuster via Daniel Naber, bug #34359) 20. Added a new class MatchAllDocsQuery that matches all documents. (John Wang via Daniel Naber, bug #34946) 21. Added ability to omit norms on a per field basis to decrease index size and memory consumption when there are many indexed fields. See Field.setOmitNorms() (Yonik Seeley, LUCENE-448) 22. Added NullFragmenter to contrib/highlighter, which is useful for highlighting entire documents or fields. (Erik Hatcher) 23. Added regular expression queries, RegexQuery and SpanRegexQuery. Note the same term enumeration caveats apply with these queries as apply to WildcardQuery and other term expanding queries. These two new queries are not currently supported via QueryParser. (Erik Hatcher) 24. Added ConstantScoreQuery which wraps a filter and produces a score equal to the query boost for every matching document. (Yonik Seeley, LUCENE-383) 25. Added ConstantScoreRangeQuery which produces a constant score for every document in the range. 
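Item 21 above points to Field.setOmitNorms(); a minimal sketch, with made-up field names, of omitting norms on a keyword-like field while keeping them on a free-text field:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    public class OmitNormsSketch {
        public static Document makeDoc(String id, String body) {
            Document doc = new Document();

            // A keyword-style identifier gains nothing from length normalization,
            // so omitting norms saves one byte per document for this field.
            Field idField = new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED);
            idField.setOmitNorms(true);
            doc.add(idField);

            // The free-text field keeps norms so scoring still favours shorter fields.
            doc.add(new Field("body", body, Field.Store.NO, Field.Index.ANALYZED));
            return doc;
        }
    }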
One advantage over a normal RangeQuery is that it doesn't expand to a BooleanQuery and thus doesn't have a maximum number of terms the range can cover. Both endpoints may also be open. (Yonik Seeley, LUCENE-383) 26. Added ability to specify a minimum number of optional clauses that must match in a BooleanQuery. See BooleanQuery.setMinimumNumberShouldMatch(). (Paul Elschot, Chris Hostetter via Yonik Seeley, LUCENE-395) 27. Added DisjunctionMaxQuery which provides the maximum score across its clauses. It's very useful for searching across multiple fields. (Chuck Williams via Yonik Seeley, LUCENE-323) 28. New class ISOLatin1AccentFilter that replaces accented characters in the ISO Latin 1 character set by their unaccented equivalent. (Sven Duzont via Erik Hatcher) 29. New class KeywordAnalyzer. "Tokenizes" the entire stream as a single token. This is useful for data like zip codes, ids, and some product names. (Erik Hatcher) 30. Copied LengthFilter from contrib area to core. Removes words that are too long and too short from the stream. (David Spencer via Otis and Daniel) 31. Added getPositionIncrementGap(String fieldName) to Analyzer. This allows custom analyzers to put gaps between Field instances with the same field name, preventing phrase or span queries crossing these boundaries. The default implementation issues a gap of 0, allowing the default token position increment of 1 to put the next field's first token into a successive position. (Erik Hatcher, with advice from Yonik) 32. StopFilter can now ignore case when checking for stop words. (Grant Ingersoll via Yonik, LUCENE-248) 33. Add TopDocCollector and TopFieldDocCollector. These simplify the implementation of hit collectors that collect only the top-scoring or top-sorting hits. API Changes 1. Several methods and fields have been deprecated. The API documentation contains information about the recommended replacements. It is planned that most of the deprecated methods and fields will be removed in Lucene 2.0. (Daniel Naber) 2. The Russian and the German analyzers have been moved to contrib/analyzers. Also, the WordlistLoader class has been moved one level up in the hierarchy and is now org.apache.lucene.analysis.WordlistLoader (Daniel Naber) 3. The API contained methods that declared to throw an IOException but that never did this. These declarations have been removed. If your code tries to catch these exceptions you might need to remove those catch clauses to avoid compile errors. (Daniel Naber) 4. Add a serializable Parameter Class to standardize parameter enum classes in BooleanClause and Field. (Christoph) 5. Added rewrite methods to all SpanQuery subclasses that nest other SpanQuerys. This allows custom SpanQuery subclasses that rewrite (for term expansion, for example) to nest within the built-in SpanQuery classes successfully. Bug fixes 1. The JSP demo page (src/jsp/results.jsp) now properly closes the IndexSearcher it opens. (Daniel Naber) 2. Fixed a bug in IndexWriter.addIndexes(IndexReader[] readers) that prevented deletion of obsolete segments. (Christoph Goller) 3. Fix in FieldInfos to avoid the return of an extra blank field in IndexReader.getFieldNames() (Patch #19058). (Mark Harwood via Bernhard) 4. Some combinations of BooleanQuery and MultiPhraseQuery (formerly PhrasePrefixQuery) could provoke UnsupportedOperationException (bug #33161). (Rhett Sutphin via Daniel Naber) 5. Small bug in skipTo of ConjunctionScorer that caused NullPointerException if skipTo() was called without prior call to next() fixed. (Christoph) 6. 
Disable Similiarty.coord() in the scoring of most automatically generated boolean queries. The coord() score factor is appropriate when clauses are independently specified by a user, but is usually not appropriate when clauses are generated automatically, e.g., by a fuzzy, wildcard or range query. Matches on such automatically generated queries are no longer penalized for not matching all terms. (Doug Cutting, Patch #33472) 7. Getting a lock file with Lock.obtain(long) was supposed to wait for a given amount of milliseconds, but this didn't work. (John Wang via Daniel Naber, Bug #33799) 8. Fix FSDirectory.createOutput() to always create new files. Previously, existing files were overwritten, and an index could be corrupted when the old version of a file was longer than the new. Now any existing file is first removed. (Doug Cutting) 9. Fix BooleanQuery containing nested SpanTermQuery's, which previously could return an incorrect number of hits. (Reece Wilton via Erik Hatcher, Bug #35157) 10. Fix NullPointerException that could occur with a MultiPhraseQuery inside a BooleanQuery. (Hans Hjelm and Scotty Allen via Daniel Naber, Bug #35626) 11. Fixed SnowballFilter to pass through the position increment from the original token. (Yonik Seeley via Erik Hatcher, LUCENE-437) 12. Added Unicode range of Korean characters to StandardTokenizer, grouping contiguous characters into a token rather than one token per character. This change also changes the token type to "" for Chinese and Japanese character tokens (previously it was ""). (Cheolgoo Kang via Otis and Erik, LUCENE-444 and LUCENE-461) 13. FieldsReader now looks at FieldInfo.storeOffsetWithTermVector and FieldInfo.storePositionWithTermVector and creates the Field with correct TermVector parameter. (Frank Steinmann via Bernhard, LUCENE-455) 14. Fixed WildcardQuery to prevent "cat" matching "ca??". (Xiaozheng Ma via Bernhard, LUCENE-306) 15. Fixed a bug where MultiSearcher and ParallelMultiSearcher could change the sort order when sorting by string for documents without a value for the sort field. (Luc Vanlerberghe via Yonik, LUCENE-453) 16. Fixed a sorting problem with MultiSearchers that can lead to missing or duplicate docs due to equal docs sorting in an arbitrary order. (Yonik Seeley, LUCENE-456) 17. A single hit using the expert level sorted search methods resulted in the score not being normalized. (Yonik Seeley, LUCENE-462) 18. Fixed inefficient memory usage when loading an index into RAMDirectory. (Volodymyr Bychkoviak via Bernhard, LUCENE-475) 19. Corrected term offsets returned by ChineseTokenizer. (Ray Tsang via Erik Hatcher, LUCENE-324) 20. Fixed MultiReader.undeleteAll() to correctly update numDocs. (Robert Kirchgessner via Doug Cutting, LUCENE-479) 21. Race condition in IndexReader.getCurrentVersion() and isCurrent() fixed by acquiring the commit lock. (Luc Vanlerberghe via Yonik Seeley, LUCENE-481) 22. IndexWriter.setMaxBufferedDocs(1) didn't have the expected effect, this has now been fixed. (Daniel Naber) 23. Fixed QueryParser when called with a date in local form like "[1/16/2000 TO 1/18/2000]". This query did not include the documents of 1/18/2000, i.e. the last day was not included. (Daniel Naber) 24. Removed sorting constraint that threw an exception if there were not yet any values for the sort field (Yonik Seeley, LUCENE-374) Optimizations 1. Disk usage (peak requirements during indexing and optimization) in case of compound file format has been improved. (Bernhard, Dmitry, and Christoph) 2. 
Optimize the performance of certain uses of BooleanScorer, TermScorer and IndexSearcher. In particular, a BooleanQuery composed of TermQuery, with not all terms required, that returns a TopDocs (e.g., through a Hits with no Sort specified) runs much faster. (cutting) 3. Removed synchronization from reading of term vectors with an IndexReader (Patch #30736). (Bernhard Messer via Christoph) 4. Optimize term-dictionary lookup to allocate far fewer terms when scanning for the matching term. This speeds searches involving low-frequency terms, where the cost of dictionary lookup can be significant. (cutting) 5. Optimize fuzzy queries so the standard fuzzy queries with a prefix of 0 now run 20-50% faster (Patch #31882). (Jonathan Hager via Daniel Naber) 6. A Version of BooleanScorer (BooleanScorer2) added that delivers documents in increasing order and implements skipTo. For queries with required or forbidden clauses it may be faster than the old BooleanScorer, for BooleanQueries consisting only of optional clauses it is probably slower. The new BooleanScorer is now the default. (Patch 31785 by Paul Elschot via Christoph) 7. Use uncached access to norms when merging to reduce RAM usage. (Bug #32847). (Doug Cutting) 8. Don't read term index when random-access is not required. This reduces time to open IndexReaders and they use less memory when random access is not required, e.g., when merging segments. The term index is now read into memory lazily at the first random-access. (Doug Cutting) 9. Optimize IndexWriter.addIndexes(Directory[]) when the number of added indexes is larger than mergeFactor. Previously this could result in quadratic performance. Now performance is n log(n). (Doug Cutting) 10. Speed up the creation of TermEnum for indices with multiple segments and deleted documents, and thus speed up PrefixQuery, RangeQuery, WildcardQuery, FuzzyQuery, RangeFilter, DateFilter, and sorting the first time on a field. (Yonik Seeley, LUCENE-454) 11. Optimized and generalized 32 bit floating point to byte (custom 8 bit floating point) conversions. Increased the speed of Similarity.encodeNorm() anywhere from 10% to 250%, depending on the JVM. (Yonik Seeley, LUCENE-467) Infrastructure 1. Lucene's source code repository has converted from CVS to Subversion. The new repository is at http://svn.apache.org/repos/asf/lucene/java/trunk 2. Lucene's issue tracker has migrated from Bugzilla to JIRA. Lucene's JIRA is at http://issues.apache.org/jira/browse/LUCENE The old issues are still available at http://issues.apache.org/bugzilla/show_bug.cgi?id=xxxx (use the bug number instead of xxxx) 1.4.3 1. The JSP demo page (src/jsp/results.jsp) now properly escapes error messages which might contain user input (e.g. error messages about query parsing). If you used that page as a starting point for your own code please make sure your code also properly escapes HTML characters from user input in order to avoid so-called cross site scripting attacks. (Daniel Naber) 2. QueryParser changes in 1.4.2 broke the QueryParser API. Now the old API is supported again. (Christoph) 1.4.2 1. Fixed bug #31241: Sorting could lead to incorrect results (documents missing, others duplicated) if the sort keys were not unique and there were more than 100 matches. (Daniel Naber) 2. Memory leak in Sort code (bug #31240) eliminated. (Rafal Krzewski via Christoph and Daniel) 3. FuzzyQuery now takes an additional parameter that specifies the minimum similarity that is required for a term to match the query. 
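A hedged sketch of the programmatic form of this FuzzyQuery change (the field name, similarity threshold and prefix length are arbitrary); the equivalent QueryParser syntax is described as the entry continues below:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.FuzzyQuery;

    public class FuzzyQuerySketch {
        public static FuzzyQuery build() {
            // Only terms with similarity >= 0.6 to "lucene" can match, and only
            // terms sharing the first two characters ("lu") are enumerated at all,
            // which keeps the term expansion cheap on large indexes.
            return new FuzzyQuery(new Term("title", "lucene"), 0.6f, 2);
        }
    }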
The QueryParser syntax for this is term~x, where x is a floating point number >= 0 and < 1 (a bigger number means that a higher similarity is required). Furthermore, a prefix can be specified for FuzzyQuerys so that only those terms are considered similar that start with this prefix. This can speed up FuzzyQuery greatly. (Daniel Naber, Christoph Goller) 4. PhraseQuery and PhrasePrefixQuery now allow the explicit specification of relative positions. (Christoph Goller) 5. QueryParser changes: Fix for ArrayIndexOutOfBoundsExceptions (patch #9110); some unused method parameters removed; The ability to specify a minimum similarity for FuzzyQuery has been added. (Christoph Goller) 6. IndexSearcher optimization: a new ScoreDoc is no longer allocated for every non-zero-scoring hit. This makes 'OR' queries that contain common terms substantially faster. (cutting) 1.4.1 1. Fixed a performance bug in hit sorting code, where values were not correctly cached. (Aviran via cutting) 2. Fixed errors in file format documentation. (Daniel Naber) 1.4 final 1. Added "an" to the list of stop words in StopAnalyzer, to complement the existing "a" there. Fix for bug 28960 (http://issues.apache.org/bugzilla/show_bug.cgi?id=28960). (Otis) 2. Added new class FieldCache to manage in-memory caches of field term values. (Tim Jones) 3. Added overloaded getFieldQuery method to QueryParser which accepts the slop factor specified for the phrase (or the default phrase slop for the QueryParser instance). This allows overriding methods to replace a PhraseQuery with a SpanNearQuery instead, keeping the proper slop factor. (Erik Hatcher) 4. Changed the encoding of GermanAnalyzer.java and GermanStemmer.java to UTF-8 and changed the build encoding to UTF-8, to make changed files compile. (Otis Gospodnetic) 5. Removed synchronization from term lookup under IndexReader methods termFreq(), termDocs() or termPositions() to improve multi-threaded performance. (cutting) 6. Fix a bug where obsolete segment files were not deleted on Win32. 1.4 RC3 1. Fixed several search bugs introduced by the skipTo() changes in release 1.4RC1. The index file format was changed a bit, so collections must be re-indexed to take advantage of the skipTo() optimizations. (Christoph Goller) 2. Added new Document methods, removeField() and removeFields(). (Christoph Goller) 3. Fixed inconsistencies with index closing. Indexes and directories are now only closed automatically by Lucene when Lucene opened them automatically. (Christoph Goller) 4. Added new class: FilteredQuery. (Tim Jones) 5. Added a new SortField type for custom comparators. (Tim Jones) 6. Lock obtain timed out message now displays the full path to the lock file. (Daniel Naber via Erik) 7. Fixed a bug in SpanNearQuery when ordered. (Paul Elschot via cutting) 8. Fixed so that FSDirectory's locks still work when the java.io.tmpdir system property is null. (cutting) 9. Changed FilteredTermEnum's constructor to take no parameters, as the parameters were ignored anyway (bug #28858) 1.4 RC2 1. GermanAnalyzer now throws an exception if the stopword file cannot be found (bug #27987). It now uses LowerCaseFilter (bug #18410) (Daniel Naber via Otis, Erik) 2. Fixed a few bugs in the file format documentation. (cutting) 1.4 RC1 1. Changed the format of the .tis file, so that: - it has a format version number, which makes it easier to back-compatibly change file formats in the future. - the term count is now stored as a long. This was the one aspect of the Lucene's file formats which limited index size. 
- a few internal index parameters are now stored in the index, so that they can (in theory) now be changed from index to index, although there is not yet an API to do so. These changes are back compatible. The new code can read old indexes. But old code will not be able read new indexes. (cutting) 2. Added an optimized implementation of TermDocs.skipTo(). A skip table is now stored for each term in the .frq file. This only adds a percent or two to overall index size, but can substantially speedup many searches. (cutting) 3. Restructured the Scorer API and all Scorer implementations to take advantage of an optimized TermDocs.skipTo() implementation. In particular, PhraseQuerys and conjunctive BooleanQuerys are faster when one clause has substantially fewer matches than the others. (A conjunctive BooleanQuery is a BooleanQuery where all clauses are required.) (cutting) 4. Added new class ParallelMultiSearcher. Combined with RemoteSearchable this makes it easy to implement distributed search systems. (Jean-Francois Halleux via cutting) 5. Added support for hit sorting. Results may now be sorted by any indexed field. For details see the javadoc for Searcher#search(Query, Sort). (Tim Jones via Cutting) 6. Changed FSDirectory to auto-create a full directory tree that it needs by using mkdirs() instead of mkdir(). (Mladen Turk via Otis) 7. Added a new span-based query API. This implements, among other things, nested phrases. See javadocs for details. (Doug Cutting) 8. Added new method Query.getSimilarity(Searcher), and changed scorers to use it. This permits one to subclass a Query class so that it can specify its own Similarity implementation, perhaps one that delegates through that of the Searcher. (Julien Nioche via Cutting) 9. Added MultiReader, an IndexReader that combines multiple other IndexReaders. (Cutting) 10. Added support for term vectors. See Field#isTermVectorStored(). (Grant Ingersoll, Cutting & Dmitry) 11. Fixed the old bug with escaping of special characters in query strings: http://issues.apache.org/bugzilla/show_bug.cgi?id=24665 (Jean-Francois Halleux via Otis) 12. Added support for overriding default values for the following, using system properties: - default commit lock timeout - default maxFieldLength - default maxMergeDocs - default mergeFactor - default minMergeDocs - default write lock timeout (Otis) 13. Changed QueryParser.jj to allow '-' and '+' within tokens: http://issues.apache.org/bugzilla/show_bug.cgi?id=27491 (Morus Walter via Otis) 14. Changed so that the compound index format is used by default. This makes indexing a bit slower, but vastly reduces the chances of file handle problems. (Cutting) 1.3 final 1. Added catch of BooleanQuery$TooManyClauses in QueryParser to throw ParseException instead. (Erik Hatcher) 2. Fixed a NullPointerException in Query.explain(). (Doug Cutting) 3. Added a new method IndexReader.setNorm(), that permits one to alter the boosting of fields after an index is created. 4. Distinguish between the final position and length when indexing a field. The length is now defined as the total number of tokens, instead of the final position, as it was previously. Length is used for score normalization (Similarity.lengthNorm()) and for controlling memory usage (IndexWriter.maxFieldLength). In both of these cases, the total number of tokens is a better value to use than the final token position. Position is used in phrase searching (see PhraseQuery and Token.setPositionIncrement()). 5. 
Fix StandardTokenizer's handling of CJK characters (Chinese, Japanese and Korean ideograms). Previously contiguous sequences were combined in a single token, which is not very useful. Now each ideogram generates a separate token, which is more useful. 1.3 RC3 1. Added minMergeDocs in IndexWriter. This can be raised to speed indexing without altering the number of files, but only using more memory. (Julien Nioche via Otis) 2. Fix bug #24786, in query rewriting. (bschneeman via Cutting) 3. Fix bug #16952, in demo HTML parser, skip comments in javascript. (Christoph Goller) 4. Fix bug #19253, in demo HTML parser, add whitespace as needed to output (Daniel Naber via Christoph Goller) 5. Fix bug #24301, in demo HTML parser, long titles no longer hang things. (Christoph Goller) 6. Fix bug #23534, Replace use of file timestamp of segments file with an index version number stored in the segments file. This resolves problems when running on file systems with low-resolution timestamps, e.g., HFS under MacOS X. (Christoph Goller) 7. Fix QueryParser so that TokenMgrError is not thrown, only ParseException. (Erik Hatcher) 8. Fix some bugs introduced by change 11 of RC2. (Christoph Goller) 9. Fixed a problem compiling TestRussianStem. (Christoph Goller) 10. Cleaned up some build stuff. (Erik Hatcher) 1.3 RC2 1. Added getFieldNames(boolean) to IndexReader, SegmentReader, and SegmentsReader. (Julien Nioche via otis) 2. Changed file locking to place lock files in System.getProperty("java.io.tmpdir"), where all users are permitted to write files. This way folks can open and correctly lock indexes which are read-only to them. 3. IndexWriter: added a new method, addDocument(Document, Analyzer), permitting one to easily use different analyzers for different documents in the same index. 4. Minor enhancements to FuzzyTermEnum. (Christoph Goller via Otis) 5. PriorityQueue: added insert(Object) method and adjusted IndexSearcher and MultiIndexSearcher to use it. (Christoph Goller via Otis) 6. Fixed a bug in IndexWriter that returned incorrect docCount(). (Christoph Goller via Otis) 7. Fixed SegmentsReader to eliminate the confusing and slightly different behaviour of TermEnum when dealing with an enumeration of all terms, versus an enumeration starting from a specific term. This patch also fixes incorrect term document frequencies when the same term is present in multiple segments. (Christoph Goller via Otis) 8. Added CachingWrapperFilter and PerFieldAnalyzerWrapper. (Erik Hatcher) 9. Added support for the new "compound file" index format (Dmitry Serebrennikov) 10. Added Locale setting to QueryParser, for use by date range parsing. 11. Changed IndexReader so that it can be subclassed by classes outside of its package. Previously it had package-private abstract methods. Also modified the index merging code so that it can work on an arbitrary IndexReader implementation, and added a new method, IndexWriter.addIndexes(IndexReader[]), to take advantage of this. (cutting) 12. Added a limit to the number of clauses which may be added to a BooleanQuery. The default limit is 1024 clauses. This should stop most OutOfMemoryExceptions by prefix, wildcard and fuzzy queries which run amok. (cutting) 13. Add new method: IndexReader.undeleteAll(). This undeletes all deleted documents which still remain in the index. (cutting) 1.3 RC1 1. Fixed PriorityQueue's clear() method. Fix for bug 9454, http://nagoya.apache.org/bugzilla/show_bug.cgi?id=9454 (Matthijs Bomhoff via otis) 2. Changed StandardTokenizer.jj grammar for EMAIL tokens. 
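A small sketch of the PerFieldAnalyzerWrapper added in 1.3 RC2 item 8 above; the SimpleAnalyzer default, the KeywordAnalyzer override and the "partnum" field name are illustrative only:

    import org.apache.lucene.analysis.KeywordAnalyzer;
    import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
    import org.apache.lucene.analysis.SimpleAnalyzer;

    public class PerFieldAnalyzerSketch {
        public static PerFieldAnalyzerWrapper build() {
            // SimpleAnalyzer handles every field by default...
            PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
            // ...except "partnum", which is kept as a single untokenized term.
            wrapper.addAnalyzer("partnum", new KeywordAnalyzer());
            return wrapper;
        }
    }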
Fix for bug 9015, http://nagoya.apache.org/bugzilla/show_bug.cgi?id=9015 (Dale Anson via otis) 3. Added the ability to disable lock creation by using disableLuceneLocks system property. This is useful for read-only media, such as CD-ROMs. (otis) 4. Added id method to Hits to be able to access the index global id. Required for sorting options. (carlson) 5. Added support for new range query syntax to QueryParser.jj. (briangoetz) 6. Added the ability to retrieve HTML documents' META tag values to HTMLParser.jj. (Mark Harwood via otis) 7. Modified QueryParser to make it possible to programmatically specify the default Boolean operator (OR or AND). (Péter Halácsy via otis) 8. Made many search methods and classes non-final, per requests. This includes IndexWriter and IndexSearcher, among others. (cutting) 9. Added class RemoteSearchable, providing support for remote searching via RMI. The test class RemoteSearchableTest.java provides an example of how this can be used. (cutting) 10. Added PhrasePrefixQuery (and supporting MultipleTermPositions). The test class TestPhrasePrefixQuery provides the usage example. (Anders Nielsen via otis) 11. Changed the German stemming algorithm to ignore case while stripping. The new algorithm is faster and produces more equal stems from nouns and verbs derived from the same word. (gschwarz) 12. Added support for boosting the score of documents and fields via the new methods Document.setBoost(float) and Field.setBoost(float). Note: This changes the encoding of an indexed value. Indexes should be re-created from scratch in order for search scores to be correct. With the new code and an old index, searches will yield very large scores for shorter fields, and very small scores for longer fields. Once the index is re-created, scores will be as before. (cutting) 13. Added new method Token.setPositionIncrement(). This permits, for the purpose of phrase searching, placing multiple terms in a single position. This is useful with stemmers that produce multiple possible stems for a word. This also permits the introduction of gaps between terms, so that terms which are adjacent in a token stream will not be matched by and exact phrase query. This makes it possible, e.g., to build an analyzer where phrases are not matched over stop words which have been removed. Finally, repeating a token with an increment of zero can also be used to boost scores of matches on that token. (cutting) 14. Added new Filter class, QueryFilter. This constrains search results to only match those which also match a provided query. Results are cached, so that searches after the first on the same index using this filter are very fast. This could be used, for example, with a RangeQuery on a formatted date field to implement date filtering. One could re-use a single QueryFilter that matches, e.g., only documents modified within the last week. The QueryFilter and RangeQuery would only need to be reconstructed once per day. (cutting) 15. Added a new IndexWriter method, getAnalyzer(). This returns the analyzer used when adding documents to this index. (cutting) 16. Fixed a bug with IndexReader.lastModified(). Before, document deletion did not update this. Now it does. (cutting) 17. Added Russian Analyzer. (Boris Okner via otis) 18. Added a public, extensible scoring API. For details, see the javadoc for org.apache.lucene.search.Similarity. 19. Fixed return of Hits.id() from float to int. (Terry Steichen via Peter). 20. Added getFieldNames() to IndexReader and Segment(s)Reader classes. 
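A minimal sketch of the QueryFilter usage described in item 14 above; the "published" term, the idea of holding one filter instance for reuse, and the Hits-returning search(Query, Filter) call are illustrative assumptions against the 2.x API:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.QueryFilter;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.store.Directory;

    public class QueryFilterSketch {
        // One filter instance is reused so its cached bit set only has to be
        // computed on the first filtered search against a given index.
        private static final QueryFilter publishedOnly =
            new QueryFilter(new TermQuery(new Term("published", "true")));

        public static int countMatches(Directory dir, Query userQuery) throws Exception {
            IndexSearcher searcher = new IndexSearcher(dir);
            try {
                Hits hits = searcher.search(userQuery, publishedOnly);
                return hits.length();
            } finally {
                searcher.close();
            }
        }
    }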
(Peter Mularien via otis) 21. Added getFields(String) and getValues(String) methods. Contributed by Rasik Pandey on 2002-10-09 (Rasik Pandey via otis) 22. Revised internal search APIs. Changes include: a. Queries are no longer modified during a search. This makes it possible, e.g., to reuse the same query instance with multiple indexes from multiple threads. b. Term-expanding queries (e.g. PrefixQuery, WildcardQuery, etc.) now work correctly with MultiSearcher, fixing bugs 12619 and 12667. c. Boosting BooleanQuery's now works, and is supported by the query parser (problem reported by Lee Mallabone). Thus a query like "(+foo +bar)^2 +baz" is now supported and equivalent to "(+foo^2 +bar^2) +baz". d. New method: Query.rewrite(IndexReader). This permits a query to re-write itself as an alternate, more primitive query. Most of the term-expanding query classes (PrefixQuery, WildcardQuery, etc.) are now implemented using this method. e. New method: Searchable.explain(Query q, int doc). This returns an Explanation instance that describes how a particular document is scored against a query. An explanation can be displayed as either plain text, with the toString() method, or as HTML, with the toHtml() method. Note that computing an explanation is as expensive as executing the query over the entire index. This is intended to be used in developing Similarity implementations, and, for good performance, should not be displayed with every hit. f. Scorer and Weight are public, not package protected. It now possible for someone to write a Scorer implementation that is not in the org.apache.lucene.search package. This is still fairly advanced programming, and I don't expect anyone to do this anytime soon, but at least now it is possible. g. Added public accessors to the primitive query classes (TermQuery, PhraseQuery and BooleanQuery), permitting access to their terms and clauses. Caution: These are extensive changes and they have not yet been tested extensively. Bug reports are appreciated. (cutting) 23. Added convenience RAMDirectory constructors taking File and String arguments, for easy FSDirectory to RAMDirectory conversion. (otis) 24. Added code for manual renaming of files in FSDirectory, since it has been reported that java.io.File's renameTo(File) method sometimes fails on Windows JVMs. (Matt Tucker via otis) 25. Refactored QueryParser to make it easier for people to extend it. Added the ability to automatically lower-case Wildcard terms in the QueryParser. (Tatu Saloranta via otis) 1.2 RC6 1. Changed QueryParser.jj to have "?" be a special character which allowed it to be used as a wildcard term. Updated TestWildcard unit test also. (Ralf Hettesheimer via carlson) 1.2 RC5 1. Renamed build.properties to default.properties and updated the BUILD.txt document to describe how to override the default.property settings without having to edit the file. This brings the build process closer to Scarab's build process. (jon) 2. Added MultiFieldQueryParser class. (Kelvin Tan, via otis) 3. Updated "powered by" links. (otis) 4. Fixed instruction for setting up JavaCC - Bug #7017 (otis) 5. Added throwing exception if FSDirectory could not create directory - Bug #6914 (Eugene Gluzberg via otis) 6. Update MultiSearcher, MultiFieldParse, Constants, DateFilter, LowerCaseTokenizer javadoc (otis) 7. Added fix to avoid NullPointerException in results.jsp (Mark Hayes via otis) 8. Changed Wildcard search to find 0 or more char instead of 1 or more (Lee Mallobone, via otis) 9. 
Fixed error in offset issue in GermanStemFilter - Bug #7412 (Rodrigo Reyes, via otis) 10. Added unit tests for wildcard search and DateFilter (otis) 11. Allow co-existence of indexed and non-indexed fields with the same name (cutting/casper, via otis) 12. Add escape character to query parser. (briangoetz) 13. Applied a patch that ensures that searches that use DateFilter don't throw an exception when no matches are found. (David Smiley, via otis) 14. Fixed bugs in DateFilter and wildcardquery unit tests. (cutting, otis, carlson) 1.2 RC4 1. Updated contributions section of website. Add XML Document #3 implementation to Document Section. Also added Term Highlighting to Misc Section. (carlson) 2. Fixed NullPointerException for phrase searches containing unindexed terms, introduced in 1.2RC3. (cutting) 3. Changed document deletion code to obtain the index write lock, enforcing the fact that document addition and deletion cannot be performed concurrently. (cutting) 4. Various documentation cleanups. (otis, acoliver) 5. Updated "powered by" links. (cutting, jon) 6. Fixed a bug in the GermanStemmer. (Bernhard Messer, via otis) 7. Changed Term and Query to implement Serializable. (scottganyo) 8. Fixed to never delete indexes added with IndexWriter.addIndexes(). (cutting) 9. Upgraded to JUnit 3.7. (otis) 1.2 RC3 1. IndexWriter: fixed a bug where adding an optimized index to an empty index failed. This was encountered using addIndexes to copy a RAMDirectory index to an FSDirectory. 2. RAMDirectory: fixed a bug where RAMInputStream could not read across more than across a single buffer boundary. 3. Fix query parser so it accepts queries with unicode characters. (briangoetz) 4. Fix query parser so that PrefixQuery is used in preference to WildcardQuery when there's only an asterisk at the end of the term. Previously PrefixQuery would never be used. 5. Fix tests so they compile; fix ant file so it compiles tests properly. Added test cases for Analyzers and PriorityQueue. 6. Updated demos, added Getting Started documentation. (acoliver) 7. Added 'contributions' section to website & docs. (carlson) 8. Removed JavaCC from source distribution for copyright reasons. Folks must now download this separately from metamata in order to compile Lucene. (cutting) 9. Substantially improved the performance of DateFilter by adding the ability to reuse TermDocs objects. (cutting) 10. Added IndexReader methods: public static boolean indexExists(String directory); public static boolean indexExists(File directory); public static boolean indexExists(Directory directory); public static boolean isLocked(Directory directory); public static void unlock(Directory directory); (cutting, otis) 11. Fixed bugs in GermanAnalyzer (gschwarz) 1.2 RC2, 19 October 2001: - added sources to distribution - removed broken build scripts and libraries from distribution - SegmentsReader: fixed potential race condition - FSDirectory: fixed so that getDirectory(xxx,true) correctly erases the directory contents, even when the directory has already been accessed in this JVM. - RangeQuery: Fix issue where an inclusive range query would include the nearest term in the index above a non-existant specified upper term. - SegmentTermEnum: Fix NullPointerException in clone() method when the Term is null. - JDK 1.1 compatibility fix: disabled lock files for JDK 1.1, since they rely on a feature added in JDK 1.2. 
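The static IndexReader helpers listed in 1.2 RC3 item 10 above can be combined roughly as follows; the index path is a placeholder, and forcibly unlocking is only safe when no writer process is actually running:

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class IndexAdminSketch {
        public static void main(String[] args) throws Exception {
            Directory dir = FSDirectory.getDirectory("/path/to/index"); // placeholder path

            if (!IndexReader.indexExists(dir)) {
                System.out.println("No index found at that location.");
                return;
            }
            // A stale lock can be left behind if an indexing process died;
            // clear it only when you know no writer is still working.
            if (IndexReader.isLocked(dir)) {
                IndexReader.unlock(dir);
            }
            dir.close();
        }
    }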
1.2 RC1 (first Apache release), 2 October 2001: - packages renamed from com.lucene to org.apache.lucene - license switched from LGPL to Apache - ant-only build -- no more makefiles - addition of lock files--now fully thread & process safe - addition of German stemmer - MultiSearcher now supports low-level search API - added RangeQuery, for term-range searching - Analyzers can choose tokenizer based on field name - misc bug fixes. 1.01b (last Sourceforge release), 2 July 2001 . a few bug fixes . new Query Parser . new prefix query (search for "foo*" matches "food") 1.0, 2000-10-04 This release fixes a few serious bugs and also includes some performance optimizations, a stemmer, and a few other minor enhancements. 0.04 2000-04-19 Lucene now includes a grammar-based tokenizer, StandardTokenizer. The only tokenizer included in the previous release (LetterTokenizer) identified terms consisting entirely of alphabetic characters. The new tokenizer uses a regular-expression grammar to identify more complex classes of terms, including numbers, acronyms, email addresses, etc. StandardTokenizer serves two purposes: 1. It is a much better, general purpose tokenizer for use by applications as is. The easiest way for applications to start using StandardTokenizer is to use StandardAnalyzer. 2. It provides a good example of grammar-based tokenization. If an application has special tokenization requirements, it can implement a custom tokenizer by copying the directory containing the new tokenizer into the application and modifying it accordingly. 0.01, 2000-03-30 First open source release. The code has been re-organized into a new package and directory structure for this release. It builds OK, but has not been tested beyond that since the re-organization. lucene-2.9.4/lucene-contrib-pom.xml.template0000644000175000017500000000361711474320267021526 0ustar janpascaljanpascal 4.0.0 org.apache.lucene lucene-parent @version@ lucene-contrib Lucene Java Contrib POM @version@ pom org.apache.lucene lucene-core @version@ 1.0.4 1.7 3.1 1.7.0 1.4 4.0 lucene-2.9.4/lucene-core-pom.xml.template0000644000175000017500000000264411474320267021015 0ustar janpascaljanpascal org.apache.lucene lucene-parent @version@ 4.0.0 org.apache.lucene lucene-core Lucene Core @version@ Apache Lucene Java Core jar lucene-2.9.4/build.xml0000644000175000017500000010005711474505321015301 0ustar janpascaljanpascal Specified tag '${tag}' could not be found in directory '${tags.dir}/${tag}'. DEPRECATED - Doing Nothing. See http://wiki.apache.org/lucene-java/HowToUpdateTheWebsite ${Name} ${version} Javadoc Index

${Name} ${version} Javadoc Index

  • All
  • Core
  • Contrib packages:
  • ]]> Contrib tests failed! Building checksum for '@{file}' lucene-2.9.4/LICENSE.txt0000644000175000017500000003053511474320267015312 0ustar janpascaljanpascal Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was derived from unicode conversion examples available at http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright from those sources: /* * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine * applicability of information provided. If this file has been * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. * * Limitations on Rights to Redistribute This Code * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form * for internal or external distribution as long as this notice * remains attached. */ Some code in src/java/org/apache/lucene/util/ArrayUtil.java was derived from Python 2.4.2 sources available at http://www.python.org. Full license is here: http://www.python.org/download/releases/2.4.2/license/ lucene-2.9.4/lucene-parent-pom.xml.template0000644000175000017500000000607511474320267021360 0ustar janpascaljanpascal org.apache apache 4 4.0.0 org.apache.lucene lucene-parent Lucene Java POM @version@ Apache Lucene Java POM http://lucene.apache.org/java pom JIRA http://issues.apache.org/jira/browse/LUCENE Hudson http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/ Java User List java-user-subscribe@lucene.apache.org java-user-unsubscribe@lucene.apache.org http://mail-archives.apache.org/mod_mbox/java-user/ Java Developer List java-dev-subscribe@lucene.apache.org java-dev-unsubscribe@lucene.apache.org http://mail-archives.apache.org/mod_mbox/java-dev/ Java Commits List java-commits-subscribe@lucene.apache.org java-commits-unsubscribe@lucene.apache.org http://mail-archives.apache.org/mod_mbox/java-commits/ 2000 Apache 2 http://www.apache.org/licenses/LICENSE-2.0.txt scm:svn:http://svn.apache.org/repos/asf/lucene/java scm:svn:https://svn.apache.org/repos/asf/lucene/java lucene-2.9.4/src/0000755000175000017500000000000011474320233014241 5ustar janpascaljanpascallucene-2.9.4/src/java/0000755000175000017500000000000011554106562015170 5ustar janpascaljanpascallucene-2.9.4/src/java/org/0000755000175000017500000000000011474320221015746 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/0000755000175000017500000000000011474320221017167 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/0000755000175000017500000000000011554106562020453 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/search/0000755000175000017500000000000011554106562021720 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/search/QueryFilter.java0000644000175000017500000000305011474320224025026 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Constrains search results to only match those which also match a provided * query. Results are cached, so that searches after the first on the same * index using this filter are much faster. * * @version $Id: QueryFilter.java 528298 2007-04-13 00:59:28Z hossman $ * @deprecated use a CachingWrapperFilter with QueryWrapperFilter */ public class QueryFilter extends CachingWrapperFilter { /** Constructs a filter which only matches documents matching * query. */ public QueryFilter(Query query) { super(new QueryWrapperFilter(query)); } public boolean equals(Object o) { return super.equals((QueryFilter)o); } public int hashCode() { return super.hashCode() ^ 0x923F64B9; } } lucene-2.9.4/src/java/org/apache/lucene/search/SpanQueryFilter.java0000644000175000017500000000570011474320224025654 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.util.OpenBitSet; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** * Constrains search results to only match those which also match a provided * query. Also provides position information about where each document matches * at the cost of extra space compared with the QueryWrapperFilter. * There is an added cost to this above what is stored in a {@link QueryWrapperFilter}. Namely, * the position information for each matching document is stored. *
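 * A hedged usage sketch (the field name and term text are illustrative, and an
 * open IndexReader named reader is assumed):
 *
 *   SpanQueryFilter filter = new SpanQueryFilter(
 *       new SpanTermQuery(new Term("contents", "lucene")));
 *   SpanFilterResult result = filter.bitSpans(reader);  // doc ids plus match positions
 *   DocIdSet matchingDocs = result.getDocIdSet();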

    * This filter does not cache. See the {@link org.apache.lucene.search.CachingSpanFilter} for a wrapper that * caches. * * * @version $Id:$ */ public class SpanQueryFilter extends SpanFilter { protected SpanQuery query; protected SpanQueryFilter() { } /** Constructs a filter which only matches documents matching * query. * @param query The {@link org.apache.lucene.search.spans.SpanQuery} to use as the basis for the Filter. */ public SpanQueryFilter(SpanQuery query) { this.query = query; } public DocIdSet getDocIdSet(IndexReader reader) throws IOException { SpanFilterResult result = bitSpans(reader); return result.getDocIdSet(); } public SpanFilterResult bitSpans(IndexReader reader) throws IOException { final OpenBitSet bits = new OpenBitSet(reader.maxDoc()); Spans spans = query.getSpans(reader); List tmp = new ArrayList(20); int currentDoc = -1; SpanFilterResult.PositionInfo currentInfo = null; while (spans.next()) { int doc = spans.doc(); bits.set(doc); if (currentDoc != doc) { currentInfo = new SpanFilterResult.PositionInfo(doc); tmp.add(currentInfo); currentDoc = doc; } currentInfo.addPosition(spans.start(), spans.end()); } return new SpanFilterResult(bits, tmp); } public SpanQuery getQuery() { return query; } public String toString() { return "SpanQueryFilter(" + query + ")"; } public boolean equals(Object o) { return o instanceof SpanQueryFilter && this.query.equals(((SpanQueryFilter) o).query); } public int hashCode() { return query.hashCode() ^ 0x923F64B9; } } lucene-2.9.4/src/java/org/apache/lucene/search/TermQuery.java0000644000175000017500000001347211474320224024521 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Set; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing a term. This may be combined with other terms with a {@link BooleanQuery}. 
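 * A hedged sketch (field name and term text are illustrative):
 *
 *   Query single = new TermQuery(new Term("contents", "apache"));
 *   BooleanQuery both = new BooleanQuery();
 *   both.add(single, BooleanClause.Occur.MUST);
 *   both.add(new TermQuery(new Term("contents", "lucene")), BooleanClause.Occur.SHOULD);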
*/ public class TermQuery extends Query { private Term term; private class TermWeight extends Weight { private Similarity similarity; private float value; private float idf; private float queryNorm; private float queryWeight; private IDFExplanation idfExp; public TermWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); idfExp = similarity.idfExplain(term, searcher); idf = idfExp.getIdf(); } public String toString() { return "weight(" + TermQuery.this + ")"; } public Query getQuery() { return TermQuery.this; } public float getValue() { return value; } public float sumOfSquaredWeights() { queryWeight = idf * getBoost(); // compute query weight return queryWeight * queryWeight; // square it } public void normalize(float queryNorm) { this.queryNorm = queryNorm; queryWeight *= queryNorm; // normalize query weight value = queryWeight * idf; // idf for document } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { TermDocs termDocs = reader.termDocs(term); if (termDocs == null) return null; return new TermScorer(this, termDocs, similarity, reader.norms(term.field())); } public Explanation explain(IndexReader reader, int doc) throws IOException { ComplexExplanation result = new ComplexExplanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); Explanation expl = new Explanation(idf, idfExp.explain()); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); Explanation boostExpl = new Explanation(getBoost(), "boost"); if (getBoost() != 1.0f) queryExpl.addDetail(boostExpl); queryExpl.addDetail(expl); Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); queryExpl.addDetail(queryNormExpl); queryExpl.setValue(boostExpl.getValue() * expl.getValue() * queryNormExpl.getValue()); result.addDetail(queryExpl); // explain field weight String field = term.field(); ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.setDescription("fieldWeight("+term+" in "+doc+ "), product of:"); Explanation tfExpl = scorer(reader, true, false).explain(doc); fieldExpl.addDetail(tfExpl); fieldExpl.addDetail(expl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); float fieldNorm = fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); fieldExpl.setValue(tfExpl.getValue() * expl.getValue() * fieldNormExpl.getValue()); result.addDetail(fieldExpl); result.setMatch(fieldExpl.getMatch()); // combine them result.setValue(queryExpl.getValue() * fieldExpl.getValue()); if (queryExpl.getValue() == 1.0f) return fieldExpl; return result; } } /** Constructs a query for the term t. */ public TermQuery(Term t) { term = t; } /** Returns the term of this query. */ public Term getTerm() { return term; } public Weight createWeight(Searcher searcher) throws IOException { return new TermWeight(searcher); } public void extractTerms(Set terms) { terms.add(getTerm()); } /** Prints a user-readable version of this query. 
*/ public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (!term.field().equals(field)) { buffer.append(term.field()); buffer.append(":"); } buffer.append(term.text()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (!(o instanceof TermQuery)) return false; TermQuery other = (TermQuery)o; return (this.getBoost() == other.getBoost()) && this.term.equals(other.term); } /** Returns a hash code value for this object.*/ public int hashCode() { return Float.floatToIntBits(getBoost()) ^ term.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/IndexSearcher.java0000644000175000017500000003014111474320225025301 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.ReaderUtil; /** Implements search over a single IndexReader. * *

    Applications usually need only call the inherited * {@link #search(Query,int)} * or {@link #search(Query,Filter,int)} methods. For performance reasons it is * recommended to open only one IndexSearcher and use it for all of your searches. * *
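 * A hedged usage sketch (the Directory named directory, the field name and the
 * query text are illustrative):
 *
 *   IndexSearcher searcher = new IndexSearcher(directory, true); // read-only for best concurrency
 *   TopDocs top = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);
 *   for (int i = 0; i < top.scoreDocs.length; i++) {
 *     Document hit = searcher.doc(top.scoreDocs[i].doc);
 *   }
 *   searcher.close();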

    Note that you can only access the deprecated {@link Hits} from an IndexSearcher as long as it is * not yet closed, otherwise an IOException will be thrown. * *

    NOTE: {@link * IndexSearcher} instances are completely * thread safe, meaning multiple threads can call any of its * methods, concurrently. If your application requires * external synchronization, you should not * synchronize on the IndexSearcher instance; * use your own (non-Lucene) objects instead.

    */ public class IndexSearcher extends Searcher { IndexReader reader; private boolean closeReader; private IndexReader[] subReaders; private int[] docStarts; /** Creates a searcher searching the index in the named directory. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #IndexSearcher(Directory, boolean)} instead */ public IndexSearcher(String path) throws CorruptIndexException, IOException { this(IndexReader.open(path), true); } /** Creates a searcher searching the index in the named * directory. You should pass readOnly=true, since it * gives much better concurrent performance, unless you * intend to do write operations (delete documents or * change norms) with the underlying IndexReader. * @param path directory where IndexReader will be opened * @param readOnly if true, the underlying IndexReader * will be opened readOnly * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #IndexSearcher(Directory, boolean)} instead */ public IndexSearcher(String path, boolean readOnly) throws CorruptIndexException, IOException { this(IndexReader.open(path, readOnly), true); } /** Creates a searcher searching the index in the provided directory. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #IndexSearcher(Directory, boolean)} instead */ public IndexSearcher(Directory directory) throws CorruptIndexException, IOException { this(IndexReader.open(directory), true); } /** Creates a searcher searching the index in the named * directory. You should pass readOnly=true, since it * gives much better concurrent performance, unless you * intend to do write operations (delete documents or * change norms) with the underlying IndexReader. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @param path directory where IndexReader will be opened * @param readOnly if true, the underlying IndexReader * will be opened readOnly */ public IndexSearcher(Directory path, boolean readOnly) throws CorruptIndexException, IOException { this(IndexReader.open(path, readOnly), true); } /** Creates a searcher searching the provided index. */ public IndexSearcher(IndexReader r) { this(r, false); } private IndexSearcher(IndexReader r, boolean closeReader) { reader = r; this.closeReader = closeReader; List subReadersList = new ArrayList(); gatherSubReaders(subReadersList, reader); subReaders = (IndexReader[]) subReadersList.toArray(new IndexReader[subReadersList.size()]); docStarts = new int[subReaders.length]; int maxDoc = 0; for (int i = 0; i < subReaders.length; i++) { docStarts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); } } protected void gatherSubReaders(List allSubReaders, IndexReader r) { ReaderUtil.gatherSubReaders(allSubReaders, r); } /** Return the {@link IndexReader} this searches. */ public IndexReader getIndexReader() { return reader; } /** * Note that the underlying IndexReader is not closed, if * IndexSearcher was constructed with IndexSearcher(IndexReader r). * If the IndexReader was supplied implicitly by specifying a directory, then * the IndexReader gets closed. 
*/ public void close() throws IOException { if(closeReader) reader.close(); } // inherit javadoc public int docFreq(Term term) throws IOException { return reader.docFreq(term); } // inherit javadoc public Document doc(int i) throws CorruptIndexException, IOException { return reader.document(i); } // inherit javadoc public Document doc(int i, FieldSelector fieldSelector) throws CorruptIndexException, IOException { return reader.document(i, fieldSelector); } // inherit javadoc public int maxDoc() throws IOException { return reader.maxDoc(); } // inherit javadoc public TopDocs search(Weight weight, Filter filter, int nDocs) throws IOException { if (nDocs <= 0) { throw new IllegalArgumentException("nDocs must be > 0"); } nDocs = Math.min(nDocs, reader.maxDoc()); TopScoreDocCollector collector = TopScoreDocCollector.create(nDocs, !weight.scoresDocsOutOfOrder()); search(weight, filter, collector); return collector.topDocs(); } public TopFieldDocs search(Weight weight, Filter filter, final int nDocs, Sort sort) throws IOException { return search(weight, filter, nDocs, sort, true); } /** * Just like {@link #search(Weight, Filter, int, Sort)}, but you choose * whether or not the fields in the returned {@link FieldDoc} instances should * be set by specifying fillFields.
    * *

    NOTE: this does not compute scores by default. If you * need scores, create a {@link TopFieldCollector} * instance by calling {@link TopFieldCollector#create} and * then pass that to {@link #search(Weight, Filter, * Collector)}.
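 * A hedged sketch of that score-tracking path (sort, weight, filter and the
 * searcher itself are assumed to already exist):
 *
 *   TopFieldCollector collector = TopFieldCollector.create(
 *       sort, 10, true, true, true, true); // fillFields, trackDocScores, trackMaxScore, docsInOrder
 *   searcher.search(weight, filter, collector);
 *   TopFieldDocs docs = (TopFieldDocs) collector.topDocs();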

    */ public TopFieldDocs search(Weight weight, Filter filter, int nDocs, Sort sort, boolean fillFields) throws IOException { nDocs = Math.min(nDocs, reader.maxDoc()); SortField[] fields = sort.fields; boolean legacy = false; for(int i = 0; i < fields.length; i++) { SortField field = fields[i]; String fieldname = field.getField(); int type = field.getType(); // Resolve AUTO into its true type if (type == SortField.AUTO) { int autotype = SortField.detectFieldType(reader, fieldname); if (autotype == SortField.STRING) { fields[i] = new SortField (fieldname, field.getLocale(), field.getReverse()); } else { fields[i] = new SortField (fieldname, autotype, field.getReverse()); } } if (field.getUseLegacySearch()) { legacy = true; } } if (legacy) { // Search the single top-level reader TopDocCollector collector = new TopFieldDocCollector(reader, sort, nDocs); HitCollectorWrapper hcw = new HitCollectorWrapper(collector); hcw.setNextReader(reader, 0); if (filter == null) { Scorer scorer = weight.scorer(reader, true, true); if (scorer != null) { scorer.score(hcw); } } else { searchWithFilter(reader, weight, filter, hcw); } return (TopFieldDocs) collector.topDocs(); } TopFieldCollector collector = TopFieldCollector.create(sort, nDocs, fillFields, fieldSortDoTrackScores, fieldSortDoMaxScore, !weight.scoresDocsOutOfOrder()); search(weight, filter, collector); return (TopFieldDocs) collector.topDocs(); } public void search(Weight weight, Filter filter, Collector collector) throws IOException { if (filter == null) { for (int i = 0; i < subReaders.length; i++) { // search each subreader collector.setNextReader(subReaders[i], docStarts[i]); Scorer scorer = weight.scorer(subReaders[i], !collector.acceptsDocsOutOfOrder(), true); if (scorer != null) { scorer.score(collector); } } } else { for (int i = 0; i < subReaders.length; i++) { // search each subreader collector.setNextReader(subReaders[i], docStarts[i]); searchWithFilter(subReaders[i], weight, filter, collector); } } } private void searchWithFilter(IndexReader reader, Weight weight, final Filter filter, final Collector collector) throws IOException { assert filter != null; Scorer scorer = weight.scorer(reader, true, false); if (scorer == null) { return; } int docID = scorer.docID(); assert docID == -1 || docID == DocIdSetIterator.NO_MORE_DOCS; // CHECKME: use ConjunctionScorer here? DocIdSet filterDocIdSet = filter.getDocIdSet(reader); if (filterDocIdSet == null) { // this means the filter does not accept any documents. return; } DocIdSetIterator filterIter = filterDocIdSet.iterator(); if (filterIter == null) { // this means the filter does not accept any documents. return; } int filterDoc = filterIter.nextDoc(); int scorerDoc = scorer.advance(filterDoc); collector.setScorer(scorer); while (true) { if (scorerDoc == filterDoc) { // Check if scorer has exhausted, only before collecting. 
if (scorerDoc == DocIdSetIterator.NO_MORE_DOCS) { break; } collector.collect(scorerDoc); filterDoc = filterIter.nextDoc(); scorerDoc = scorer.advance(filterDoc); } else if (scorerDoc > filterDoc) { filterDoc = filterIter.advance(scorerDoc); } else { scorerDoc = scorer.advance(filterDoc); } } } public Query rewrite(Query original) throws IOException { Query query = original; for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query; rewrittenQuery = query.rewrite(reader)) { query = rewrittenQuery; } return query; } public Explanation explain(Weight weight, int doc) throws IOException { int n = ReaderUtil.subIndex(doc, docStarts); int deBasedDoc = doc - docStarts[n]; return weight.explain(subReaders[n], deBasedDoc); } private boolean fieldSortDoTrackScores; private boolean fieldSortDoMaxScore; /** By default, no scores are computed when sorting by * field (using {@link #search(Query,Filter,int,Sort)}). * You can change that, per IndexSearcher instance, by * calling this method. Note that this will incur a CPU * cost. * * @param doTrackScores If true, then scores are * returned for every matching document in {@link * TopFieldDocs}. * * @param doMaxScore If true, then the max score for all * matching docs is computed. */ public void setDefaultFieldSortScoring(boolean doTrackScores, boolean doMaxScore) { fieldSortDoTrackScores = doTrackScores; fieldSortDoMaxScore = doMaxScore; } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/0000755000175000017500000000000011554106562023534 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/search/payloads/PayloadFunction.java0000644000175000017500000000464311474320222027475 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; /** * An abstract class that defines a way for Payload*Query instances * to transform the cumulative effects of payload scores for a document. * * @see org.apache.lucene.search.payloads.PayloadTermQuery for more information * *
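 * A hedged sketch of a custom implementation that simply sums payload scores (the
 * class name is illustrative; MaxPayloadFunction, MinPayloadFunction and
 * AveragePayloadFunction are the implementations shipped in this package):
 *
 *   public class SumPayloadFunction extends PayloadFunction {
 *     public float currentScore(int docId, String field, int start, int end,
 *         int numPayloadsSeen, float currentScore, float currentPayloadScore) {
 *       return currentScore + currentPayloadScore;
 *     }
 *     public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) {
 *       return numPayloadsSeen == 0 ? 1 : payloadScore;
 *     }
 *     public int hashCode() { return getClass().hashCode(); }
 *     public boolean equals(Object o) { return o != null && getClass() == o.getClass(); }
 *   }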

    * This class and its derivations are experimental and subject to change * **/ public abstract class PayloadFunction implements Serializable { /** * Calculate the score up to this point for this doc and field * @param docId The current doc * @param field The field * @param start The start position of the matching Span * @param end The end position of the matching Span * @param numPayloadsSeen The number of payloads seen so far * @param currentScore The current score so far * @param currentPayloadScore The score for the current payload * @return The new current Score * * @see org.apache.lucene.search.spans.Spans */ public abstract float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore); /** * Calculate the final score for all the payloads seen so far for this doc/field * @param docId The current doc * @param field The current field * @param numPayloadsSeen The total number of payloads seen on this document * @param payloadScore The raw score for those payloads * @return The final score for the payloads */ public abstract float docScore(int docId, String field, int numPayloadsSeen, float payloadScore); public abstract int hashCode(); public abstract boolean equals(Object o); } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/PayloadTermQuery.java0000644000175000017500000001635311474320222027646 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermPositions; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.spans.TermSpans; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.SpanScorer; import java.io.IOException; /** * This class is very similar to * {@link org.apache.lucene.search.spans.SpanTermQuery} except that it factors * in the value of the payload located at each of the positions where the * {@link org.apache.lucene.index.Term} occurs. *

    * In order to take advantage of this, you must override * {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)} * which returns 1 by default. *
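 * A hedged usage sketch (field name, term text and the IndexReader named reader
 * are illustrative):
 *
 *   PayloadTermQuery query = new PayloadTermQuery(
 *       new Term("body", "lucene"), new MaxPayloadFunction());
 *   TopDocs top = new IndexSearcher(reader).search(query, 10);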

    * Payload scores are aggregated using a pluggable {@link PayloadFunction}. **/ public class PayloadTermQuery extends SpanTermQuery { protected PayloadFunction function; private boolean includeSpanScore; public PayloadTermQuery(Term term, PayloadFunction function) { this(term, function, true); } public PayloadTermQuery(Term term, PayloadFunction function, boolean includeSpanScore) { super(term); this.function = function; this.includeSpanScore = includeSpanScore; } public Weight createWeight(Searcher searcher) throws IOException { return new PayloadTermWeight(this, searcher); } protected class PayloadTermWeight extends SpanWeight { public PayloadTermWeight(PayloadTermQuery query, Searcher searcher) throws IOException { super(query, searcher); } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new PayloadTermSpanScorer((TermSpans) query.getSpans(reader), this, similarity, reader.norms(query.getField())); } protected class PayloadTermSpanScorer extends SpanScorer { // TODO: is this the best way to allocate this? protected byte[] payload = new byte[256]; protected TermPositions positions; protected float payloadScore; protected int payloadsSeen; public PayloadTermSpanScorer(TermSpans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); positions = spans.getPositions(); } protected boolean setFreqCurrentDoc() throws IOException { if (!more) { return false; } doc = spans.doc(); freq = 0.0f; payloadScore = 0; payloadsSeen = 0; Similarity similarity1 = getSimilarity(); while (more && doc == spans.doc()) { int matchLength = spans.end() - spans.start(); freq += similarity1.sloppyFreq(matchLength); processPayload(similarity1); more = spans.next();// this moves positions to the next match in this // document } return more || (freq != 0); } protected void processPayload(Similarity similarity) throws IOException { if (positions.isPayloadAvailable()) { payload = positions.getPayload(payload, 0); payloadScore = function.currentScore(doc, term.field(), spans.start(), spans.end(), payloadsSeen, payloadScore, similarity.scorePayload(doc, term.field(), spans.start(), spans .end(), payload, 0, positions.getPayloadLength())); payloadsSeen++; } else { // zero out the payload? } } /** * * @return {@link #getSpanScore()} * {@link #getPayloadScore()} * @throws IOException */ public float score() throws IOException { return includeSpanScore ? getSpanScore() * getPayloadScore() : getPayloadScore(); } /** * Returns the SpanScorer score only. *

    * Should not be overridden without good cause! * * @return the score for just the Span part w/o the payload * @throws IOException * * @see #score() */ protected float getSpanScore() throws IOException { return super.score(); } /** * The score for the payload * * @return The score, as calculated by * {@link PayloadFunction#docScore(int, String, int, float)} */ protected float getPayloadScore() { return function.docScore(doc, term.field(), payloadsSeen, payloadScore); } public Explanation explain(final int doc) throws IOException { ComplexExplanation result = new ComplexExplanation(); Explanation nonPayloadExpl = super.explain(doc); result.addDetail(nonPayloadExpl); // QUESTION: Is there a way to avoid this skipTo call? We need to know // whether to load the payload or not Explanation payloadBoost = new Explanation(); result.addDetail(payloadBoost); float payloadScore = getPayloadScore(); payloadBoost.setValue(payloadScore); // GSI: I suppose we could toString the payload, but I don't think that // would be a good idea payloadBoost.setDescription("scorePayload(...)"); result.setValue(nonPayloadExpl.getValue() * payloadScore); result.setDescription("btq, product of:"); result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303 return result; } } } public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((function == null) ? 0 : function.hashCode()); result = prime * result + (includeSpanScore ? 1231 : 1237); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; PayloadTermQuery other = (PayloadTermQuery) obj; if (function == null) { if (other.function != null) return false; } else if (!function.equals(other.function)) return false; if (includeSpanScore != other.includeSpanScore) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/MaxPayloadFunction.java0000644000175000017500000000347411474320222030144 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Returns the maximum payload score seen, else 1 if there are no payloads on the doc. *
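 * For example (hypothetical payload scores), scores of 0.5, 2.0 and 1.0 on a single
 * document yield a docScore of 2.0; a document whose matches carry no payloads scores 1.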

    * Is thread safe and completely reusable. * **/ public class MaxPayloadFunction extends PayloadFunction { public float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore) { if (numPayloadsSeen == 0) { return currentPayloadScore; } else { return Math.max(currentPayloadScore, currentScore); } } public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { return numPayloadsSeen > 0 ? payloadScore : 1; } public int hashCode() { final int prime = 31; int result = 1; result = prime * result + this.getClass().hashCode(); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java0000644000175000017500000000525111474320222030034 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Weight; import org.apache.lucene.search.spans.TermSpans; /** * Copyright 2004 The Apache Software Foundation *

    * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

    * http://www.apache.org/licenses/LICENSE-2.0 *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * The BoostingTermQuery is very similar to the {@link org.apache.lucene.search.spans.SpanTermQuery} except * that it factors in the value of the payload located at each of the positions where the * {@link org.apache.lucene.index.Term} occurs. *

    * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)} * which returns 1 by default. *

    * Payload scores are averaged across term occurrences in the document. * * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, int) * * @deprecated See {@link org.apache.lucene.search.payloads.PayloadTermQuery} */ public class BoostingTermQuery extends PayloadTermQuery { public BoostingTermQuery(Term term) { this(term, true); } public BoostingTermQuery(Term term, boolean includeSpanScore) { super(term, new AveragePayloadFunction(), includeSpanScore); } public Weight createWeight(Searcher searcher) throws IOException { return new BoostingTermWeight(this, searcher); } protected class BoostingTermWeight extends PayloadTermWeight { public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException { super(query, searcher); } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new PayloadTermSpanScorer((TermSpans) query.getSpans(reader), this, similarity, reader.norms(query.getField())); } } public boolean equals(Object o) { if (!(o instanceof BoostingTermQuery)) return false; BoostingTermQuery other = (BoostingTermQuery) o; return (this.getBoost() == other.getBoost()) && this.term.equals(other.term); } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/package.html0000644000175000017500000000267611474320222026020 0ustar janpascaljanpascal org.apache.lucene.search.payloads

    The payloads package provides Query mechanisms for finding and using payloads. The following Query implementations are provided; a short usage sketch follows the list:
    1. PayloadTermQuery -- Boosts a term's score based on the value of the payload located at that term.
    2. PayloadNearQuery -- A SpanNearQuery that factors in the value of the payloads located at each of the positions where the spans occur.
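    A hedged usage sketch (the field name, term text and the IndexReader named reader are illustrative):
    
      Query termQuery = new PayloadTermQuery(new Term("body", "lucene"), new MaxPayloadFunction());
      SpanQuery[] clauses = new SpanQuery[] {
          new SpanTermQuery(new Term("body", "open")),
          new SpanTermQuery(new Term("body", "source")) };
      Query nearQuery = new PayloadNearQuery(clauses, 3, true, new AveragePayloadFunction());
      TopDocs top = new IndexSearcher(reader).search(nearQuery, 10);  // either query can be searched this way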
     
    lucene-2.9.4/src/java/org/apache/lucene/search/payloads/MinPayloadFunction.java0000644000175000017500000000333211474320222030133 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Calculates the minimum payload seen * **/ public class MinPayloadFunction extends PayloadFunction { public float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore) { if (numPayloadsSeen == 0) { return currentPayloadScore; } else { return Math.min(currentPayloadScore, currentScore); } } public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { return numPayloadsSeen > 0 ? payloadScore : 1; } public int hashCode() { final int prime = 31; int result = 1; result = prime * result + this.getClass().hashCode(); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/PayloadNearQuery.java0000644000175000017500000002057611474320222027626 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; import org.apache.lucene.search.spans.NearSpansOrdered; import org.apache.lucene.search.spans.NearSpansUnordered; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanScorer; import org.apache.lucene.search.spans.SpanWeight; import org.apache.lucene.search.spans.Spans; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.Collection; import java.util.Iterator; /** * This class is very similar to * {@link org.apache.lucene.search.spans.SpanNearQuery} except that it factors * in the value of the payloads located at each of the positions where the * {@link org.apache.lucene.search.spans.TermSpans} occurs. *

    * In order to take advantage of this, you must override * {@link org.apache.lucene.search.Similarity#scorePayload(String, byte[],int,int)} * which returns 1 by default. *
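 * A hedged usage sketch (field name and term text are illustrative; the two terms
 * must appear within a slop of 3 positions, in order):
 *
 *   SpanQuery[] clauses = new SpanQuery[] {
 *       new SpanTermQuery(new Term("body", "apache")),
 *       new SpanTermQuery(new Term("body", "lucene")) };
 *   PayloadNearQuery query = new PayloadNearQuery(clauses, 3, true, new AveragePayloadFunction());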

    * Payload scores are aggregated using a pluggable {@link PayloadFunction}. * * @see org.apache.lucene.search.Similarity#scorePayload(String, byte[], int, * int) */ public class PayloadNearQuery extends SpanNearQuery { protected String fieldName; protected PayloadFunction function; public PayloadNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) { this(clauses, slop, inOrder, new AveragePayloadFunction()); } public PayloadNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, PayloadFunction function) { super(clauses, slop, inOrder); fieldName = clauses[0].getField(); // all clauses must have same field this.function = function; } public Weight createWeight(Searcher searcher) throws IOException { return new PayloadNearSpanWeight(this, searcher); } public Object clone() { int sz = clauses.size(); SpanQuery[] newClauses = new SpanQuery[sz]; for (int i = 0; i < sz; i++) { SpanQuery clause = (SpanQuery) clauses.get(i); newClauses[i] = (SpanQuery) clause.clone(); } PayloadNearQuery boostingNearQuery = new PayloadNearQuery(newClauses, slop, inOrder); boostingNearQuery.setBoost(getBoost()); return boostingNearQuery; } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("payloadNear(["); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery) i.next(); buffer.append(clause.toString(field)); if (i.hasNext()) { buffer.append(", "); } } buffer.append("], "); buffer.append(slop); buffer.append(", "); buffer.append(inOrder); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } // @Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((fieldName == null) ? 0 : fieldName.hashCode()); result = prime * result + ((function == null) ? 
0 : function.hashCode()); return result; } // @Override public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; PayloadNearQuery other = (PayloadNearQuery) obj; if (fieldName == null) { if (other.fieldName != null) return false; } else if (!fieldName.equals(other.fieldName)) return false; if (function == null) { if (other.function != null) return false; } else if (!function.equals(other.function)) return false; return true; } public class PayloadNearSpanWeight extends SpanWeight { public PayloadNearSpanWeight(SpanQuery query, Searcher searcher) throws IOException { super(query, searcher); } public Scorer scorer(IndexReader reader) throws IOException { return new PayloadNearSpanScorer(query.getSpans(reader), this, similarity, reader.norms(query.getField())); } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new PayloadNearSpanScorer(query.getSpans(reader), this, similarity, reader.norms(query.getField())); } } public class PayloadNearSpanScorer extends SpanScorer { Spans spans; protected float payloadScore; private int payloadsSeen; Similarity similarity = getSimilarity(); protected PayloadNearSpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(spans, weight, similarity, norms); this.spans = spans; } // Get the payloads associated with all underlying subspans public void getPayloads(Spans[] subSpans) throws IOException { for (int i = 0; i < subSpans.length; i++) { if (subSpans[i] instanceof NearSpansOrdered) { if (((NearSpansOrdered) subSpans[i]).isPayloadAvailable()) { processPayloads(((NearSpansOrdered) subSpans[i]).getPayload(), subSpans[i].start(), subSpans[i].end()); } getPayloads(((NearSpansOrdered) subSpans[i]).getSubSpans()); } else if (subSpans[i] instanceof NearSpansUnordered) { if (((NearSpansUnordered) subSpans[i]).isPayloadAvailable()) { processPayloads(((NearSpansUnordered) subSpans[i]).getPayload(), subSpans[i].start(), subSpans[i].end()); } getPayloads(((NearSpansUnordered) subSpans[i]).getSubSpans()); } } } /** * By default, uses the {@link PayloadFunction} to score the payloads, but * can be overridden to do other things. * * @param payLoads The payloads * @param start The start position of the span being scored * @param end The end position of the span being scored * * @see Spans */ protected void processPayloads(Collection payLoads, int start, int end) { for (Iterator iterator = payLoads.iterator(); iterator.hasNext();) { byte[] thePayload = (byte[]) iterator.next(); payloadScore = function.currentScore(doc, fieldName, start, end, payloadsSeen, payloadScore, similarity.scorePayload(doc, fieldName, spans.start(), spans.end(), thePayload, 0, thePayload.length)); ++payloadsSeen; } } // protected boolean setFreqCurrentDoc() throws IOException { if (!more) { return false; } Spans[] spansArr = new Spans[1]; spansArr[0] = spans; payloadScore = 0; payloadsSeen = 0; getPayloads(spansArr); return super.setFreqCurrentDoc(); } public float score() throws IOException { return super.score() * function.docScore(doc, fieldName, payloadsSeen, payloadScore); } public Explanation explain(int doc) throws IOException { Explanation result = new Explanation(); Explanation nonPayloadExpl = super.explain(doc); result.addDetail(nonPayloadExpl); Explanation payloadBoost = new Explanation(); result.addDetail(payloadBoost); float avgPayloadScore = (payloadsSeen > 0 ? 
(payloadScore / payloadsSeen) : 1); payloadBoost.setValue(avgPayloadScore); payloadBoost.setDescription("scorePayload(...)"); result.setValue(nonPayloadExpl.getValue() * avgPayloadScore); result.setDescription("bnq, product of:"); return result; } } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/AveragePayloadFunction.java0000644000175000017500000000334411474320222030765 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Calculate the final score as the average score of all payloads seen. *
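 * For example (hypothetical payload scores), scores of 1.0, 2.0 and 3.0 on a single
 * document average to a docScore of 2.0; a document whose matches carry no payloads scores 1.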

    * Is thread safe and completely reusable. * **/ public class AveragePayloadFunction extends PayloadFunction{ public float currentScore(int docId, String field, int start, int end, int numPayloadsSeen, float currentScore, float currentPayloadScore) { return currentPayloadScore + currentScore; } public float docScore(int docId, String field, int numPayloadsSeen, float payloadScore) { return numPayloadsSeen > 0 ? (payloadScore / numPayloadsSeen) : 1; } public int hashCode() { final int prime = 31; int result = 1; result = prime * result + this.getClass().hashCode(); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/payloads/PayloadSpanUtil.java0000644000175000017500000001461711474320222027451 0ustar janpascaljanpascalpackage org.apache.lucene.search.payloads; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.DisjunctionMaxQuery; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.spans.SpanNearQuery; import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.search.spans.Spans; /** * Experimental class to get set of payloads for most standard Lucene queries. * Operates like Highlighter - IndexReader should only contain doc of interest, * best to use MemoryIndex. * *
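 * A hedged usage sketch (the IndexReader named reader is assumed to hold only the
 * document of interest, e.g. via MemoryIndex; field name and term text are illustrative):
 *
 *   PayloadSpanUtil util = new PayloadSpanUtil(reader);
 *   Collection payloads = util.getPayloadsForQuery(
 *       new TermQuery(new Term("body", "lucene")));  // a Collection of byte[] payloads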

    * * WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * */ public class PayloadSpanUtil { private IndexReader reader; /** * @param reader * that contains doc with payloads to extract */ public PayloadSpanUtil(IndexReader reader) { this.reader = reader; } /** * Query should be rewritten for wild/fuzzy support. * * @param query * @return payloads Collection * @throws IOException */ public Collection getPayloadsForQuery(Query query) throws IOException { Collection payloads = new ArrayList(); queryToSpanQuery(query, payloads); return payloads; } private void queryToSpanQuery(Query query, Collection payloads) throws IOException { if (query instanceof BooleanQuery) { BooleanClause[] queryClauses = ((BooleanQuery) query).getClauses(); for (int i = 0; i < queryClauses.length; i++) { if (!queryClauses[i].isProhibited()) { queryToSpanQuery(queryClauses[i].getQuery(), payloads); } } } else if (query instanceof PhraseQuery) { Term[] phraseQueryTerms = ((PhraseQuery) query).getTerms(); SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.length]; for (int i = 0; i < phraseQueryTerms.length; i++) { clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); } int slop = ((PhraseQuery) query).getSlop(); boolean inorder = false; if (slop == 0) { inorder = true; } SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); sp.setBoost(query.getBoost()); getPayloads(payloads, sp); } else if (query instanceof TermQuery) { SpanTermQuery stq = new SpanTermQuery(((TermQuery) query).getTerm()); stq.setBoost(query.getBoost()); getPayloads(payloads, stq); } else if (query instanceof SpanQuery) { getPayloads(payloads, (SpanQuery) query); } else if (query instanceof FilteredQuery) { queryToSpanQuery(((FilteredQuery) query).getQuery(), payloads); } else if (query instanceof DisjunctionMaxQuery) { for (Iterator iterator = ((DisjunctionMaxQuery) query).iterator(); iterator .hasNext();) { queryToSpanQuery((Query) iterator.next(), payloads); } } else if (query instanceof MultiPhraseQuery) { final MultiPhraseQuery mpq = (MultiPhraseQuery) query; final List termArrays = mpq.getTermArrays(); final int[] positions = mpq.getPositions(); if (positions.length > 0) { int maxPosition = positions[positions.length - 1]; for (int i = 0; i < positions.length - 1; ++i) { if (positions[i] > maxPosition) { maxPosition = positions[i]; } } final List[] disjunctLists = new List[maxPosition + 1]; int distinctPositions = 0; for (int i = 0; i < termArrays.size(); ++i) { final Term[] termArray = (Term[]) termArrays.get(i); List disjuncts = disjunctLists[positions[i]]; if (disjuncts == null) { disjuncts = (disjunctLists[positions[i]] = new ArrayList( termArray.length)); ++distinctPositions; } for (int j = 0; j < termArray.length; ++j) { disjuncts.add(new SpanTermQuery(termArray[j])); } } int positionGaps = 0; int position = 0; final SpanQuery[] clauses = new SpanQuery[distinctPositions]; for (int i = 0; i < disjunctLists.length; ++i) { List disjuncts = disjunctLists[i]; if (disjuncts != null) { clauses[position++] = new SpanOrQuery((SpanQuery[]) disjuncts .toArray(new SpanQuery[disjuncts.size()])); } else { ++positionGaps; } } final int slop = mpq.getSlop(); final boolean inorder = (slop == 0); SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); sp.setBoost(query.getBoost()); getPayloads(payloads, sp); } } } private void getPayloads(Collection payloads, SpanQuery query) throws IOException { Spans spans = 
query.getSpans(reader); while (spans.next() == true) { if (spans.isPayloadAvailable()) { Collection payload = spans.getPayload(); Iterator it = payload.iterator(); while (it.hasNext()) { byte[] bytes = (byte[]) it.next(); payloads.add(bytes); } } } } } lucene-2.9.4/src/java/org/apache/lucene/search/TimeLimitingCollector.java0000644000175000017500000001731211474320224027023 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** * The {@link TimeLimitingCollector} is used to timeout search requests that * take longer than the maximum allowed search time limit. After this time is * exceeded, the search thread is stopped by throwing a * {@link TimeExceededException}. */ public class TimeLimitingCollector extends Collector { /** * Default timer resolution. * @see #setResolution(long) */ public static final int DEFAULT_RESOLUTION = 20; /** * Default for {@link #isGreedy()}. * @see #isGreedy() */ public boolean DEFAULT_GREEDY = false; private static long resolution = DEFAULT_RESOLUTION; private boolean greedy = DEFAULT_GREEDY ; private static final class TimerThread extends Thread { // NOTE: we can avoid explicit synchronization here for several reasons: // * updates to volatile long variables are atomic // * only single thread modifies this value // * use of volatile keyword ensures that it does not reside in // a register, but in main memory (so that changes are visible to // other threads). // * visibility of changes does not need to be instantaneous, we can // afford losing a tick or two. // // See section 17 of the Java Language Specification for details. private volatile long time = 0; /** * TimerThread provides a pseudo-clock service to all searching * threads, so that they can count elapsed time with less overhead * than repeatedly calling System.currentTimeMillis. A single * thread should be created to be used for all searches. */ private TimerThread() { super("TimeLimitedCollector timer thread"); this.setDaemon( true ); } public void run() { while (true) { // TODO: Use System.nanoTime() when Lucene moves to Java SE 5. time += resolution; try { Thread.sleep( resolution ); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } } /** * Get the timer value in milliseconds. */ public long getMilliseconds() { return time; } } /** Thrown when elapsed search time exceeds allowed search time. 
*/ public static class TimeExceededException extends RuntimeException { private long timeAllowed; private long timeElapsed; private int lastDocCollected; private TimeExceededException(long timeAllowed, long timeElapsed, int lastDocCollected) { super("Elapsed time: " + timeElapsed + "Exceeded allowed search time: " + timeAllowed + " ms."); this.timeAllowed = timeAllowed; this.timeElapsed = timeElapsed; this.lastDocCollected = lastDocCollected; } /** Returns allowed time (milliseconds). */ public long getTimeAllowed() { return timeAllowed; } /** Returns elapsed time (milliseconds). */ public long getTimeElapsed() { return timeElapsed; } /** Returns last doc (absolute doc id) that was collected when the search time exceeded. */ public int getLastDocCollected() { return lastDocCollected; } } // Declare and initialize a single static timer thread to be used by // all TimeLimitedCollector instances. The JVM assures that // this only happens once. private final static TimerThread TIMER_THREAD = new TimerThread(); static { TIMER_THREAD.start(); } private final long t0; private final long timeout; private final Collector collector; private int docBase; /** * Create a TimeLimitedCollector wrapper over another {@link Collector} with a specified timeout. * @param collector the wrapped {@link Collector} * @param timeAllowed max time allowed for collecting hits after which {@link TimeExceededException} is thrown */ public TimeLimitingCollector(final Collector collector, final long timeAllowed ) { this.collector = collector; t0 = TIMER_THREAD.getMilliseconds(); this.timeout = t0 + timeAllowed; } /** * Return the timer resolution. * @see #setResolution(long) */ public static long getResolution() { return resolution; } /** * Set the timer resolution. * The default timer resolution is 20 milliseconds. * This means that a search required to take no longer than * 800 milliseconds may be stopped after 780 to 820 milliseconds. *
    Note that:
    • Finer (smaller) resolution is more accurate but less efficient.
    • Setting resolution to less than 5 milliseconds will be silently modified to 5 milliseconds.
    • Setting resolution smaller than the current resolution might take effect only after the current resolution elapses. (For example, if the current resolution of 20 milliseconds is changed to 5 milliseconds, the change can take up to 20 milliseconds to take effect.)
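    A usage sketch for this collector (illustrative only, not part of the original javadoc; "searcher", "query" and "otherCollector" are assumed to already exist):

      TimeLimitingCollector collector = new TimeLimitingCollector(otherCollector, 1000); // allow at most 1000 ms
      TimeLimitingCollector.setResolution(10);   // 10 ms timer ticks; values below 5 are raised to 5
      searcher.search(query, collector);         // throws TimeExceededException once the limit is exceeded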
    */ public static void setResolution(long newResolution) { resolution = Math.max(newResolution,5); // 5 milliseconds is about the minimum reasonable time for a Object.wait(long) call. } /** * Checks if this time limited collector is greedy in collecting the last hit. * A non greedy collector, upon a timeout, would throw a {@link TimeExceededException} * without allowing the wrapped collector to collect current doc. A greedy one would * first allow the wrapped hit collector to collect current doc and only then * throw a {@link TimeExceededException}. * @see #setGreedy(boolean) */ public boolean isGreedy() { return greedy; } /** * Sets whether this time limited collector is greedy. * @param greedy true to make this time limited greedy * @see #isGreedy() */ public void setGreedy(boolean greedy) { this.greedy = greedy; } /** * Calls {@link Collector#collect(int)} on the decorated {@link Collector} * unless the allowed time has passed, in which case it throws an exception. * * @throws TimeExceededException * if the time allowed has exceeded. */ public void collect(final int doc) throws IOException { long time = TIMER_THREAD.getMilliseconds(); if (timeout < time) { if (greedy) { //System.out.println(this+" greedy: before failing, collecting doc: "+(docBase + doc)+" "+(time-t0)); collector.collect(doc); } //System.out.println(this+" failing on: "+(docBase + doc)+" "+(time-t0)); throw new TimeExceededException( timeout-t0, time-t0, docBase + doc ); } //System.out.println(this+" collecting: "+(docBase + doc)+" "+(time-t0)); collector.collect(doc); } public void setNextReader(IndexReader reader, int base) throws IOException { collector.setNextReader(reader, base); this.docBase = base; } public void setScorer(Scorer scorer) throws IOException { collector.setScorer(scorer); } public boolean acceptsDocsOutOfOrder() { return collector.acceptsDocsOutOfOrder(); } } lucene-2.9.4/src/java/org/apache/lucene/search/Sort.java0000644000175000017500000002154611474320224023514 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import java.util.Arrays; /** * Encapsulates sort criteria for returned hits. * *
    The fields used to determine sort order must be carefully chosen. Documents must contain a single term in such a field, and the value of the term should indicate the document's relative position in a given sort order. The field must be indexed, but should not be tokenized, and does not need to be stored (unless you happen to want it back with the rest of your document data). In other words:

      document.add(new Field("byNumber", Integer.toString(x), Field.Store.NO, Field.Index.NOT_ANALYZED));

    (A hedged sketch of searching with such a sort field appears at the end of this overview.)

    Valid Types of Values

    There are four possible kinds of term values which may be put into sorting fields: Integers, Longs, Floats, or Strings. Unless {@link SortField SortField} objects are specified, the type of value in the field is determined by parsing the first term in the field.

    Integer term values should contain only digits and an optional preceding negative sign. Values must be base 10 and in the range Integer.MIN_VALUE and Integer.MAX_VALUE inclusive. Documents which should appear first in the sort should have low value integers, later documents high values (i.e. the documents should be numbered 1..n where 1 is the first and n the last).

    Long term values should contain only digits and an optional preceding negative sign. Values must be base 10 and in the range Long.MIN_VALUE and Long.MAX_VALUE inclusive. Documents which should appear first in the sort should have low value longs, later documents high values.

    Float term values should conform to values accepted by {@link Float Float.valueOf(String)} (except that NaN and Infinity are not supported). Documents which should appear first in the sort should have low values, later documents high values.

    String term values can contain any valid String, but should not be tokenized. The values are sorted according to their {@link Comparable natural order}. Note that using this type of term value has higher memory requirements than the other types.

    Object Reuse

    One of these objects can be used multiple times and the sort order changed between usages.

    This class is thread safe.

    Memory Usage

    Sorting uses caches of term values maintained by the internal HitQueue(s). The cache is static and contains an integer or float array of length IndexReader.maxDoc() for each field name for which a sort is performed. In other words, the size of the cache in bytes is:

      4 * IndexReader.maxDoc() * (# of different fields actually used to sort)

    For String fields, the cache is larger: in addition to the above array, the value of every term in the field is kept in memory. If there are many unique terms in the field, this could be quite large.

    Note that the size of the cache is not affected by how many fields are in the index and might be used to sort - only by the ones actually used to sort a result set.
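    For illustration only (a hedged sketch, not part of the original javadoc; "searcher" and "query" are assumed to already exist), a search sorted by the "byNumber" field indexed above might look like:

      Sort byNumber = new Sort(new SortField("byNumber", SortField.INT));
      TopFieldDocs docs = searcher.search(query, null, 10, byNumber);  // top 10 hits, ordered by the byNumber field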
    Created: Feb 12, 2004 10:53:57 AM * * @since lucene 1.4 * @version $Id: Sort.java 795179 2009-07-17 18:23:30Z mikemccand $ */ public class Sort implements Serializable { /** * Represents sorting by computed relevance. Using this sort criteria returns * the same results as calling * {@link Searcher#search(Query) Searcher#search()}without a sort criteria, * only with slightly more overhead. */ public static final Sort RELEVANCE = new Sort(); /** Represents sorting by index order. */ public static final Sort INDEXORDER = new Sort(SortField.FIELD_DOC); // internal representation of the sort criteria SortField[] fields; /** * Sorts by computed relevance. This is the same sort criteria as calling * {@link Searcher#search(Query) Searcher#search()}without a sort criteria, * only with slightly more overhead. */ public Sort() { this(SortField.FIELD_SCORE); } /** * Sorts by the terms in field then by index order (document * number). The type of value in field is determined * automatically. * * @see SortField#AUTO * @deprecated Please specify the type explicitly by * first creating a {@link SortField} and then use {@link * #Sort(SortField)} */ public Sort(String field) { setSort(field, false); } /** * Sorts possibly in reverse by the terms in field then by * index order (document number). The type of value in field is * determined automatically. * * @see SortField#AUTO * @deprecated Please specify the type explicitly by * first creating a {@link SortField} and then use {@link * #Sort(SortField)} */ public Sort(String field, boolean reverse) { setSort(field, reverse); } /** * Sorts in succession by the terms in each field. The type of value in * field is determined automatically. * * @see SortField#AUTO * @deprecated Please specify the type explicitly by * first creating {@link SortField}s and then use {@link * #Sort(SortField[])} */ public Sort(String[] fields) { setSort(fields); } /** Sorts by the criteria in the given SortField. */ public Sort(SortField field) { setSort(field); } /** Sorts in succession by the criteria in each SortField. */ public Sort(SortField[] fields) { setSort(fields); } /** * Sets the sort to the terms in field then by index order * (document number). * @deprecated Please specify the type explicitly by * first creating a {@link SortField} and then use {@link * #setSort(SortField)} */ public final void setSort(String field) { setSort(field, false); } /** * Sets the sort to the terms in field possibly in reverse, * then by index order (document number). * @deprecated Please specify the type explicitly by * first creating a {@link SortField} and then use {@link * #setSort(SortField)} */ public void setSort(String field, boolean reverse) { fields = new SortField[] { new SortField(field, SortField.AUTO, reverse) }; } /** Sets the sort to the terms in each field in succession. * @deprecated Please specify the type explicitly by * first creating {@link SortField}s and then use {@link * #setSort(SortField[])} */ public void setSort(String[] fieldnames) { final int n = fieldnames.length; SortField[] nfields = new SortField[n]; for (int i = 0; i < n; ++i) { nfields[i] = new SortField(fieldnames[i], SortField.AUTO); } fields = nfields; } /** Sets the sort to the given criteria. */ public void setSort(SortField field) { this.fields = new SortField[] { field }; } /** Sets the sort to the given criteria in succession. */ public void setSort(SortField[] fields) { this.fields = fields; } /** * Representation of the sort criteria. 
* @return Array of SortField objects used in this sort criteria */ public SortField[] getSort() { return fields; } public String toString() { StringBuffer buffer = new StringBuffer(); for (int i = 0; i < fields.length; i++) { buffer.append(fields[i].toString()); if ((i+1) < fields.length) buffer.append(','); } return buffer.toString(); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sort)) return false; final Sort other = (Sort)o; return Arrays.equals(this.fields, other.fields); } /** Returns a hash code value for this object. */ public int hashCode() { // TODO in Java 1.5: switch to Arrays.hashCode(). The // Java 1.4 workaround below calculates the same hashCode // as Java 1.5's new Arrays.hashCode() return 0x45aaf665 + Arrays.asList(fields).hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/0000755000175000017500000000000011554106562023545 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/search/function/ByteFieldSource.java0000644000175000017500000001063711474320224027441 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.function.DocValues; import java.io.IOException; /** * Expert: obtains single byte field values from the * {@link org.apache.lucene.search.FieldCache FieldCache} * using getBytes() and makes those values * available as other numeric types, casting as needed. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * @see org.apache.lucene.search.function.FieldCacheSource for requirements * on the field. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class ByteFieldSource extends FieldCacheSource { private FieldCache.ByteParser parser; /** * Create a cached byte field source with default string-to-byte parser. */ public ByteFieldSource(String field) { this(field, null); } /** * Create a cached byte field source with a specific string-to-byte parser. */ public ByteFieldSource(String field, FieldCache.ByteParser parser) { super(field); this.parser = parser; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "byte(" + super.description() + ')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { final byte[] arr = cache.getBytes(reader, field, parser); return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return (float) arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ public int intVal(int doc) { return arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + intVal(doc); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ public boolean cachedFieldSourceEquals(FieldCacheSource o) { if (o.getClass() != ByteFieldSource.class) { return false; } ByteFieldSource other = (ByteFieldSource)o; return this.parser==null ? other.parser==null : this.parser.getClass() == other.parser.getClass(); } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ public int cachedFieldSourceHashCode() { return parser==null ? Byte.class.hashCode() : parser.getClass().hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/FieldCacheSource.java0000644000175000017500000001040611474320224027533 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; /** * Expert: A base class for ValueSource implementations that retrieve values for * a single field from the {@link org.apache.lucene.search.FieldCache FieldCache}. *

    * Fields used herein must be indexed (doesn't matter if these fields are stored or not). *

    * It is assumed that each such indexed field is untokenized, or at least has a single token in a document. * For documents with multiple tokens of the same field, behavior is undefined (It is likely that current * code would use the value of one of these tokens, but this is not guaranteed). *

    * Document with no tokens in this field are assigned the Zero value. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public abstract class FieldCacheSource extends ValueSource { private String field; /** * Create a cached field source for the input field. */ public FieldCacheSource(String field) { this.field=field; } /* (non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ public final DocValues getValues(IndexReader reader) throws IOException { return getCachedFieldValues(FieldCache.DEFAULT, field, reader); } /* (non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return field; } /** * Return cached DocValues for input field and reader. * @param cache FieldCache so that values of a field are loaded once per reader (RAM allowing) * @param field Field for which values are required. * @see ValueSource */ public abstract DocValues getCachedFieldValues(FieldCache cache, String field, IndexReader reader) throws IOException; /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ public final boolean equals(Object o) { if (!(o instanceof FieldCacheSource)) { return false; } FieldCacheSource other = (FieldCacheSource) o; return this.field.equals(other.field) && cachedFieldSourceEquals(other); } /*(non-Javadoc) @see java.lang.Object#hashCode() */ public final int hashCode() { return field.hashCode() + cachedFieldSourceHashCode(); } /** * Check if equals to another {@link FieldCacheSource}, already knowing that cache and field are equal. * @see Object#equals(java.lang.Object) */ public abstract boolean cachedFieldSourceEquals(FieldCacheSource other); /** * Return a hash code of a {@link FieldCacheSource}, without the hash-codes of the field * and the cache (those are taken care of elsewhere). * @see Object#hashCode() */ public abstract int cachedFieldSourceHashCode(); } lucene-2.9.4/src/java/org/apache/lucene/search/function/DocValues.java0000644000175000017500000001331111474320224026266 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.search.Explanation; /** * Expert: represents field values as different types. * Normally created via a * {@link org.apache.lucene.search.function.ValueSource ValueSuorce} * for a particular field and reader. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * */ public abstract class DocValues { /* * DocValues is distinct from ValueSource because * there needs to be an object created at query evaluation time that * is not referenced by the query itself because: * - Query objects should be MT safe * - For caching, Query objects are often used as keys... you don't * want the Query carrying around big objects */ /** * Return doc value as a float. *

    Mandatory: every DocValues implementation must implement at least this method. * @param doc document whose float value is requested. */ public abstract float floatVal(int doc); /** * Return doc value as an int. *

    Optional: DocValues implementation can (but don't have to) override this method. * @param doc document whose int value is requested. */ public int intVal(int doc) { return (int) floatVal(doc); } /** * Return doc value as a long. *

    Optional: DocValues implementation can (but don't have to) override this method. * @param doc document whose long value is requested. */ public long longVal(int doc) { return (long) floatVal(doc); } /** * Return doc value as a double. *

    Optional: DocValues implementation can (but don't have to) override this method. * @param doc document whose double value is requested. */ public double doubleVal(int doc) { return (double) floatVal(doc); } /** * Return doc value as a string. *

    Optional: DocValues implementation can (but don't have to) override this method. * @param doc document whose string value is requested. */ public String strVal(int doc) { return Float.toString(floatVal(doc)); } /** * Return a string representation of a doc value, as required for Explanations. */ public abstract String toString(int doc); /** * Explain the scoring value for the input doc. */ public Explanation explain(int doc) { return new Explanation(floatVal(doc), toString(doc)); } /** * Expert: for test purposes only, return the inner array of values, or null if not applicable. *
    * Allows tests to verify that loaded values are:
    1. indeed cached/reused.
    2. stored in the expected size/type (byte/short/int/float).
    * Note: implementations of DocValues must override this method for * these test elements to be tested, Otherwise the test would not fail, just * print a warning. */ Object getInnerArray() { throw new UnsupportedOperationException("this optional method is for test purposes only"); } // --- some simple statistics on values private float minVal = Float.NaN; private float maxVal = Float.NaN; private float avgVal = Float.NaN; private boolean computed=false; // compute optional values private void compute() { if (computed) { return; } float sum = 0; int n = 0; while (true) { float val; try { val = floatVal(n); } catch (ArrayIndexOutOfBoundsException e) { break; } sum += val; minVal = Float.isNaN(minVal) ? val : Math.min(minVal, val); maxVal = Float.isNaN(maxVal) ? val : Math.max(maxVal, val); ++n; } avgVal = n == 0 ? Float.NaN : sum / n; computed = true; } /** * Returns the minimum of all values or Float.NaN if this * DocValues instance does not contain any value. *

    * This operation is optional *

    * * @return the minimum of all values or Float.NaN if this * DocValues instance does not contain any value. */ public float getMinValue() { compute(); return minVal; } /** * Returns the maximum of all values or Float.NaN if this * DocValues instance does not contain any value. *

    * This operation is optional *

    * * @return the maximum of all values or Float.NaN if this * DocValues instance does not contain any value. */ public float getMaxValue() { compute(); return maxVal; } /** * Returns the average of all values or Float.NaN if this * DocValues instance does not contain any value. * *

    * This operation is optional *

    * * @return the average of all values or Float.NaN if this * DocValues instance does not contain any value */ public float getAverageValue() { compute(); return avgVal; } } lucene-2.9.4/src/java/org/apache/lucene/search/function/FloatFieldSource.java0000644000175000017500000001040311474320224027572 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.function.DocValues; import java.io.IOException; /** * Expert: obtains float field values from the * {@link org.apache.lucene.search.FieldCache FieldCache} * using getFloats() and makes those values * available as other numeric types, casting as needed. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * @see org.apache.lucene.search.function.FieldCacheSource for requirements * on the field. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class FloatFieldSource extends FieldCacheSource { private FieldCache.FloatParser parser; /** * Create a cached float field source with default string-to-float parser. */ public FloatFieldSource(String field) { this(field, null); } /** * Create a cached float field source with a specific string-to-float parser. */ public FloatFieldSource(String field, FieldCache.FloatParser parser) { super(field); this.parser = parser; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "float(" + super.description() + ')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { final float[] arr = cache.getFloats(reader, field, parser); return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ public boolean cachedFieldSourceEquals(FieldCacheSource o) { if (o.getClass() != FloatFieldSource.class) { return false; } FloatFieldSource other = (FloatFieldSource)o; return this.parser==null ? other.parser==null : this.parser.getClass() == other.parser.getClass(); } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ public int cachedFieldSourceHashCode() { return parser==null ? Float.class.hashCode() : parser.getClass().hashCode(); } }lucene-2.9.4/src/java/org/apache/lucene/search/function/IntFieldSource.java0000644000175000017500000001061111474320224027260 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.function.DocValues; import java.io.IOException; /** * Expert: obtains int field values from the * {@link org.apache.lucene.search.FieldCache FieldCache} * using getInts() and makes those values * available as other numeric types, casting as needed. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * @see org.apache.lucene.search.function.FieldCacheSource for requirements * on the field. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class IntFieldSource extends FieldCacheSource { private FieldCache.IntParser parser; /** * Create a cached int field source with default string-to-int parser. */ public IntFieldSource(String field) { this(field, null); } /** * Create a cached int field source with a specific string-to-int parser. */ public IntFieldSource(String field, FieldCache.IntParser parser) { super(field); this.parser = parser; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "int(" + super.description() + ')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { final int[] arr = cache.getInts(reader, field, parser); return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return (float) arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ public int intVal(int doc) { return arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + intVal(doc); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ public boolean cachedFieldSourceEquals(FieldCacheSource o) { if (o.getClass() != IntFieldSource.class) { return false; } IntFieldSource other = (IntFieldSource)o; return this.parser==null ? other.parser==null : this.parser.getClass() == other.parser.getClass(); } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ public int cachedFieldSourceHashCode() { return parser==null ? Integer.class.hashCode() : parser.getClass().hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/MultiValueSource.java0000644000175000017500000000753711474320224027666 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.util.ReaderUtil; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation; /** This class wraps another ValueSource, but protects * against accidental double RAM usage in FieldCache when * a composite reader is passed to {@link #getValues}. * *

    NOTE: this class adds a CPU penalty to every * lookup, as it must resolve the incoming document to the * right sub-reader using a binary search.

    * * @deprecated This class is temporary, to ease the * migration to segment-based searching. Please change your * code to not pass composite readers to these APIs. */ public final class MultiValueSource extends ValueSource { final ValueSource other; public MultiValueSource(ValueSource other) { this.other = other; } public DocValues getValues(IndexReader reader) throws IOException { IndexReader[] subReaders = reader.getSequentialSubReaders(); if (subReaders != null) { // This is a composite reader return new MultiDocValues(subReaders); } else { // Already an atomic reader -- just delegate return other.getValues(reader); } } public String description() { return other.description(); } public boolean equals(Object o) { if (o instanceof MultiValueSource) { return ((MultiValueSource) o).other.equals(other); } else { return false; } } public int hashCode() { return 31 * other.hashCode(); } private final class MultiDocValues extends DocValues { final DocValues[] docValues; final int[] docStarts; MultiDocValues(IndexReader[] subReaders) throws IOException { docValues = new DocValues[subReaders.length]; docStarts = new int[subReaders.length]; int base = 0; for(int i=0;i * This query provides a score for each and every undeleted document in the index. *

    * The value source can be based on a (cached) value of an indexed field, but it * can also be based on an external source, e.g. values read from an external database. *

    * Score is set as: Score(doc,query) = query.getBoost()^2 * valueSource(doc).

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ public class ValueSourceQuery extends Query { ValueSource valSrc; /** * Create a value source query * @param valSrc provides the values defines the function to be used for scoring */ public ValueSourceQuery(ValueSource valSrc) { this.valSrc=valSrc; } /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ public Query rewrite(IndexReader reader) throws IOException { return this; } /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ public void extractTerms(Set terms) { // no terms involved here } class ValueSourceWeight extends Weight { Similarity similarity; float queryNorm; float queryWeight; public ValueSourceWeight(Searcher searcher) { this.similarity = getSimilarity(searcher); } /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ public Query getQuery() { return ValueSourceQuery.this; } /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ public float getValue() { return queryWeight; } /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ public float sumOfSquaredWeights() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ public void normalize(float norm) { this.queryNorm = norm; queryWeight *= this.queryNorm; } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new ValueSourceScorer(similarity, reader, this); } /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ public Explanation explain(IndexReader reader, int doc) throws IOException { return new ValueSourceScorer(similarity, reader, this).explain(doc); } } /** * A scorer that (simply) matches all documents, and scores each document with * the value of the value source in effect. As an example, if the value source * is a (cached) field source, then value of that field in that document will * be used. (assuming field is indexed for this doc, with a single token.) */ private class ValueSourceScorer extends Scorer { private final ValueSourceWeight weight; private final float qWeight; private final DocValues vals; private final TermDocs termDocs; private int doc = -1; // constructor private ValueSourceScorer(Similarity similarity, IndexReader reader, ValueSourceWeight w) throws IOException { super(similarity); this.weight = w; this.qWeight = w.getValue(); // this is when/where the values are first created. vals = valSrc.getValues(reader); termDocs = reader.termDocs(null); } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return termDocs.next(); } public int nextDoc() throws IOException { return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return termDocs.doc(); } public int docID() { return doc; } /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ public float score() throws IOException { return qWeight * vals.floatVal(termDocs.doc()); } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return termDocs.skipTo(target); } public int advance(int target) throws IOException { return doc = termDocs.skipTo(target) ? 
termDocs.doc() : NO_MORE_DOCS; } /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */ public Explanation explain(int doc) throws IOException { float sc = qWeight * vals.floatVal(doc); Explanation result = new ComplexExplanation( true, sc, ValueSourceQuery.this.toString() + ", product of:"); result.addDetail(vals.explain(doc)); result.addDetail(new Explanation(getBoost(), "boost")); result.addDetail(new Explanation(weight.queryNorm,"queryNorm")); return result; } } public Weight createWeight(Searcher searcher) { return new ValueSourceQuery.ValueSourceWeight(searcher); } public String toString(String field) { return valSrc.toString() + ToStringUtils.boost(getBoost()); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (getClass() != o.getClass()) { return false; } ValueSourceQuery other = (ValueSourceQuery)o; return this.getBoost() == other.getBoost() && this.valSrc.equals(other.valSrc); } /** Returns a hash code value for this object. */ public int hashCode() { return (getClass().hashCode() + valSrc.hashCode()) ^ Float.floatToIntBits(getBoost()); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/ReverseOrdFieldSource.java0000644000175000017500000001111611474320224030607 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.function; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import java.io.IOException; /** * Expert: obtains the ordinal of the field value from the default Lucene * {@link org.apache.lucene.search.FieldCache FieldCache} using getStringIndex() * and reverses the order. *

    * The native lucene index order is used to assign an ordinal value for each field value. *

    * Field values (terms) are lexicographically ordered by unicode value, and numbered starting at 1. *
    * Example of reverse ordinal (rord): *
    If there were only three field values: "apple","banana","pear" *
    then rord("apple")=3, rord("banana")=2, rord("pear")=1

    * WARNING: * rord() depends on the position in an index and can thus change * when other documents are inserted or deleted, * or if a MultiSearcher is used. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class ReverseOrdFieldSource extends ValueSource { public String field; /** * Contructor for a certain field. * @param field field whose values reverse order is used. */ public ReverseOrdFieldSource(String field) { this.field = field; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "rord("+field+')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ public DocValues getValues(IndexReader reader) throws IOException { final FieldCache.StringIndex sindex = FieldCache.DEFAULT.getStringIndex(reader, field); final int arr[] = sindex.order; final int end = sindex.lookup.length; return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return (float)(end - arr[doc]); } /* (non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ public int intVal(int doc) { return end - arr[doc]; } /* (non-Javadoc) @see org.apache.lucene.search.function.DocValues#strVal(int) */ public String strVal(int doc) { // the string value of the ordinal, not the string itself return Integer.toString(intVal(doc)); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + strVal(doc); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ public boolean equals(Object o) { if (o.getClass() != ReverseOrdFieldSource.class) return false; ReverseOrdFieldSource other = (ReverseOrdFieldSource)o; return this.field.equals(other.field); } private static final int hcode = ReverseOrdFieldSource.class.hashCode(); /*(non-Javadoc) @see java.lang.Object#hashCode() */ public int hashCode() { return hcode + field.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/ShortFieldSource.java0000644000175000017500000001065011474320224027630 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.function.DocValues; import java.io.IOException; /** * Expert: obtains short field values from the * {@link org.apache.lucene.search.FieldCache FieldCache} * using getShorts() and makes those values * available as other numeric types, casting as needed. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * @see org.apache.lucene.search.function.FieldCacheSource for requirements * on the field. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class ShortFieldSource extends FieldCacheSource { private FieldCache.ShortParser parser; /** * Create a cached short field source with default string-to-short parser. */ public ShortFieldSource(String field) { this(field, null); } /** * Create a cached short field source with a specific string-to-short parser. */ public ShortFieldSource(String field, FieldCache.ShortParser parser) { super(field); this.parser = parser; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "short(" + super.description() + ')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { final short[] arr = cache.getShorts(reader, field, parser); return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return (float) arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ public int intVal(int doc) { return arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + intVal(doc); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ public boolean cachedFieldSourceEquals(FieldCacheSource o) { if (o.getClass() != ShortFieldSource.class) { return false; } ShortFieldSource other = (ShortFieldSource)o; return this.parser==null ? other.parser==null : this.parser.getClass() == other.parser.getClass(); } /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ public int cachedFieldSourceHashCode() { return parser==null ? Short.class.hashCode() : parser.getClass().hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/package.html0000644000175000017500000001541111474320224026022 0ustar janpascaljanpascal org.apache.lucene.search.function
Programmatic control over document scores.
The function package provides tight control over document scores.
    WARNING: The status of the search.function package is experimental. The APIs introduced here might change in the future and will not be supported anymore in such a case.
    Two types of queries are available in this package:
1. Custom Score queries - allow setting the score of a matching document as a mathematical expression over the scores of that document by its contained (sub) queries.
2. Field score queries - allow basing the score of a document on the numeric values of indexed fields.
     
    Some possible uses of these queries:
    1. Normalizing the document scores by values indexed in a special field - for instance, experimenting with a different doc length normalization.
2. Introducing a static scoring element into the score of a document - for instance, using some topological attribute of the links to/from a document.
    3. Computing the score of a matching document as an arbitrary odd function of its score by a certain query.
    Performance and Quality Considerations:
    1. When scoring by values of indexed fields, these values are loaded into memory. Unlike the regular scoring, where the required information is read from disk as necessary, here field values are loaded once and cached by Lucene in memory for further use, anticipating reuse by further queries. While all this is carefully cached with performance in mind, it is recommended to use these features only when the default Lucene scoring does not match your "special" application needs.
    2. Use only with carefully selected fields, because in most cases, search quality with regular Lucene scoring would outperform that of scoring by field values.
3. Values of fields used for scoring should be consistent. Do not apply this on a field containing arbitrary (long) text, and do not mix value formats in the same field if that field is used for scoring.
4. Smaller (shorter) field tokens mean less RAM (something always desired). When using FieldScoreQuery, select the shortest FieldScoreQuery.Type that is sufficient for the used field values.
    5. Reusing IndexReaders/IndexSearchers is essential, because the caching of field tokens is based on an IndexReader. Whenever a new IndexReader is used, values currently in the cache cannot be used and new values must be loaded from disk. So replace/refresh readers/searchers in a controlled manner.
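To make point 5 concrete, here is a minimal sketch of holding a single searcher open and running every field-based scoring query through it, so the cached field values are loaded only once per reader. The class and method names (SharedSearcher, scoreByField) are illustrative only, not Lucene APIs.

      // Illustrative sketch: one long-lived, read-only IndexSearcher shared by all queries,
      // so the FieldCache entry for the scoring field is built once per reader.
      import java.io.IOException;
      import org.apache.lucene.search.IndexSearcher;
      import org.apache.lucene.search.TopDocs;
      import org.apache.lucene.search.function.FieldScoreQuery;
      import org.apache.lucene.store.Directory;

      public class SharedSearcher {
        private final IndexSearcher searcher; // opened once, reused for all queries

        public SharedSearcher(Directory dir) throws IOException {
          this.searcher = new IndexSearcher(dir, true); // read-only searcher
        }

        public TopDocs scoreByField(String field, int n) throws IOException {
          // Repeated calls reuse the field values already cached for this reader.
          return searcher.search(new FieldScoreQuery(field, FieldScoreQuery.Type.BYTE), n);
        }

        public void close() throws IOException {
          searcher.close();
        }
      }

Opening a fresh IndexSearcher for every query would instead reload the field values from disk each time.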
    History and Credits:
• A large part of the code of this package originated from Yonik's FunctionQuery code that was imported from Solr (see LUCENE-446).
• The idea behind CustomScoreQuery is borrowed from the "Easily create queries that transform sub-query scores arbitrarily" contribution by Mike Klaas (see LUCENE-850), though the implementation and API here are different.
    Code sample:

Note: the code snippets here should work, but they were never really compiled... so the test sources under TestCustomScoreQuery, TestFieldScoreQuery and TestOrdValues may also be useful. A combined, self-contained sketch also follows the code samples below.

1. Using field (byte) values as scores:

      Indexing:

            f = new Field("score", "7", Field.Store.NO, Field.Index.UN_TOKENIZED);
            f.setOmitNorms(true);
            d1.add(f);
          

      Search:

            Query q = new FieldScoreQuery("score", FieldScoreQuery.Type.BYTE);
          
      Document d1 above would get a score of 7.
    2. Manipulating scores

Dividing the original score of each document by the square root of its doc id (just to demonstrate what it takes to manipulate scores this way):

            Query q = queryParser.parse("my query text");
            CustomScoreQuery customQ = new CustomScoreQuery(q) {
              public float customScore(int doc, float subQueryScore, float valSrcScore) {
                return (float) (subQueryScore / Math.sqrt(doc));
              }
            };
          

      For more informative debug info on the custom query, also override the name() method:

            CustomScoreQuery customQ = new CustomScoreQuery(q) {
              public float customScore(int doc, float subQueryScore, float valSrcScore) {
                return (float) (subQueryScore / Math.sqrt(doc));
              }
              public String name() {
                return "1/sqrt(docid)";
              }
            };
          

Taking the square root of the original score and multiplying it by a "short field driven score", i.e., the short value that was indexed for the scored doc in a certain field:

            Query q = queryParser.parse("my query text");
            FieldScoreQuery qf = new FieldScoreQuery("shortScore", FieldScoreQuery.Type.SHORT);
            CustomScoreQuery customQ = new CustomScoreQuery(q,qf) {
              public float customScore(int doc, float subQueryScore, float valSrcScore) {
                return (float) (Math.sqrt(subQueryScore) * valSrcScore);
              }
              public String name() {
                return "shortVal*sqrt(score)";
              }
            };
          
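To tie the snippets above together, here is a self-contained sketch that indexes a numeric "score" field as in sample 1 and searches it with a FieldScoreQuery, so each document's score equals its indexed value. The surrounding setup (RAMDirectory, WhitespaceAnalyzer, the FieldScoreExample class name and the three-document loop) is illustrative only; the Lucene classes and calls used are standard 2.9 APIs.

      import org.apache.lucene.analysis.WhitespaceAnalyzer;
      import org.apache.lucene.document.Document;
      import org.apache.lucene.document.Field;
      import org.apache.lucene.index.IndexWriter;
      import org.apache.lucene.search.IndexSearcher;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.search.ScoreDoc;
      import org.apache.lucene.search.TopDocs;
      import org.apache.lucene.search.function.FieldScoreQuery;
      import org.apache.lucene.store.Directory;
      import org.apache.lucene.store.RAMDirectory;

      public class FieldScoreExample {
        public static void main(String[] args) throws Exception {
          Directory dir = new RAMDirectory();
          IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true,
              IndexWriter.MaxFieldLength.UNLIMITED);
          for (int i = 1; i <= 3; i++) {
            Document d = new Document();
            // One un-tokenized numeric token per document, norms omitted (as in sample 1).
            Field f = new Field("score", Integer.toString(i),
                Field.Store.NO, Field.Index.UN_TOKENIZED);
            f.setOmitNorms(true);
            d.add(f);
            writer.addDocument(d);
          }
          writer.close();

          IndexSearcher searcher = new IndexSearcher(dir, true);
          // Matches all documents; each one scores as the byte value of its "score" field.
          Query q = new FieldScoreQuery("score", FieldScoreQuery.Type.BYTE);
          TopDocs hits = searcher.search(q, 10);
          for (int i = 0; i < hits.scoreDocs.length; i++) {
            ScoreDoc sd = hits.scoreDocs[i];
            System.out.println("doc=" + sd.doc + " score=" + sd.score);
          }
          searcher.close();
        }
      }

The same searcher can then be reused for the CustomScoreQuery variants shown in sample 2.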
    lucene-2.9.4/src/java/org/apache/lucene/search/function/FieldScoreQuery.java0000644000175000017500000001211611474320224027450 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A query that scores each document as the value of the numeric input field. *

    * The query matches all documents, and scores each document according to the numeric * value of that field. *

    * It is assumed, and expected, that: *

• The field used here is indexed, and has exactly one token in every scored document.
• Best if this field is un_tokenized.
• That token is parseable to the selected type.

* Combining this query in a FunctionQuery allows much freedom in affecting document scores. * Note that with this freedom comes responsibility: it is more than likely that the * default Lucene scoring is superior in quality to scoring modified as explained here. * However, in some cases, and certainly for research experiments, this capability may prove useful. *

    * When constructing this query, select the appropriate type. That type should match the data stored in the * field. So in fact the "right" type should be selected before indexing. Type selection * has effect on the RAM usage: *

• {@link Type#BYTE} consumes 1 * maxDocs bytes.
• {@link Type#SHORT} consumes 2 * maxDocs bytes.
• {@link Type#INT} consumes 4 * maxDocs bytes.
• {@link Type#FLOAT} consumes 8 * maxDocs bytes.

    * Caching: * Values for the numeric field are loaded once and cached in memory for further use with the same IndexReader. * To take advantage of this, it is extremely important to reuse index-readers or index-searchers, * otherwise, for instance if for each query a new index reader is opened, large penalties would be * paid for loading the field values into memory over and over again! * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ public class FieldScoreQuery extends ValueSourceQuery { /** * Type of score field, indicating how field values are interpreted/parsed. *

* The type selected at search time should match the data stored in the field. * Different types have different RAM requirements: *

• {@link #BYTE} consumes 1 * maxDocs bytes.
• {@link #SHORT} consumes 2 * maxDocs bytes.
• {@link #INT} consumes 4 * maxDocs bytes.
• {@link #FLOAT} consumes 8 * maxDocs bytes.
    */ public static class Type { /** field values are interpreted as numeric byte values. */ public static final Type BYTE = new Type("byte"); /** field values are interpreted as numeric short values. */ public static final Type SHORT = new Type("short"); /** field values are interpreted as numeric int values. */ public static final Type INT = new Type("int"); /** field values are interpreted as numeric float values. */ public static final Type FLOAT = new Type("float"); private String typeName; private Type (String name) { this.typeName = name; } /*(non-Javadoc) @see java.lang.Object#toString() */ public String toString() { return getClass().getName()+"::"+typeName; } } /** * Create a FieldScoreQuery - a query that scores each document as the value of the numeric input field. *

    * The type param tells how to parse the field string values into a numeric score value. * @param field the numeric field to be used. * @param type the type of the field: either * {@link Type#BYTE}, {@link Type#SHORT}, {@link Type#INT}, or {@link Type#FLOAT}. */ public FieldScoreQuery(String field, Type type) { super(getValueSource(field,type)); } // create the appropriate (cached) field value source. private static ValueSource getValueSource(String field, Type type) { if (type == Type.BYTE) { return new ByteFieldSource(field); } if (type == Type.SHORT) { return new ShortFieldSource(field); } if (type == Type.INT) { return new IntFieldSource(field); } if (type == Type.FLOAT) { return new FloatFieldSource(field); } throw new IllegalArgumentException(type+" is not a known Field Score Query Type!"); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/CustomScoreProvider.java0000644000175000017500000001412211474320224030363 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.FieldCache; // for javadocs /** * An instance of this subclass should be returned by * {@link CustomScoreQuery#getCustomScoreProvider}, if you want * to modify the custom score calculation of a {@link CustomScoreQuery}. *

    Since Lucene 2.9, queries operate on each segment of an Index separately, * so overriding the similar (now deprecated) methods in {@link CustomScoreQuery} * is no longer suitable, as the supplied doc ID is per-segment * and without knowledge of the IndexReader you cannot access the * document or {@link FieldCache}. * * @lucene.experimental * @since 2.9.2 */ public class CustomScoreProvider { protected final IndexReader reader; /** * Creates a new instance of the provider class for the given {@link IndexReader}. */ public CustomScoreProvider(IndexReader reader) { this.reader = reader; } /** * Compute a custom score by the subQuery score and a number of * {@link ValueSourceQuery} scores. *

    * Subclasses can override this method to modify the custom score. *

    * If your custom scoring is different than the default herein you * should override at least one of the two customScore() methods. * If the number of ValueSourceQueries is always < 2 it is * sufficient to override the other * {@link #customScore(int, float, float) customScore()} * method, which is simpler. *

    * The default computation herein is a multiplication of given scores: *

*     ModifiedScore = subQueryScore * valSrcScores[0] * valSrcScores[1] * ...
       * 
    * * @param doc id of scored doc. * @param subQueryScore score of that doc by the subQuery. * @param valSrcScores scores of that doc by the ValueSourceQuery. * @return custom score. */ public float customScore(int doc, float subQueryScore, float valSrcScores[]) throws IOException { if (valSrcScores.length == 1) { return customScore(doc, subQueryScore, valSrcScores[0]); } if (valSrcScores.length == 0) { return customScore(doc, subQueryScore, 1); } float score = subQueryScore; for(int i = 0; i < valSrcScores.length; i++) { score *= valSrcScores[i]; } return score; } /** * Compute a custom score by the subQuery score and the ValueSourceQuery score. *

    * Subclasses can override this method to modify the custom score. *

    * If your custom scoring is different than the default herein you * should override at least one of the two customScore() methods. * If the number of ValueSourceQueries is always < 2 it is * sufficient to override this customScore() method, which is simpler. *

    * The default computation herein is a multiplication of the two scores: *

       *     ModifiedScore = subQueryScore * valSrcScore
       * 
    * * @param doc id of scored doc. * @param subQueryScore score of that doc by the subQuery. * @param valSrcScore score of that doc by the ValueSourceQuery. * @return custom score. */ public float customScore(int doc, float subQueryScore, float valSrcScore) throws IOException { return subQueryScore * valSrcScore; } /** * Explain the custom score. * Whenever overriding {@link #customScore(int, float, float[])}, * this method should also be overridden to provide the correct explanation * for the part of the custom scoring. * * @param doc doc being explained. * @param subQueryExpl explanation for the sub-query part. * @param valSrcExpls explanation for the value source part. * @return an explanation for the custom score */ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpls[]) throws IOException { if (valSrcExpls.length == 1) { return customExplain(doc, subQueryExpl, valSrcExpls[0]); } if (valSrcExpls.length == 0) { return subQueryExpl; } float valSrcScore = 1; for (int i = 0; i < valSrcExpls.length; i++) { valSrcScore *= valSrcExpls[i].getValue(); } Explanation exp = new Explanation( valSrcScore * subQueryExpl.getValue(), "custom score: product of:"); exp.addDetail(subQueryExpl); for (int i = 0; i < valSrcExpls.length; i++) { exp.addDetail(valSrcExpls[i]); } return exp; } /** * Explain the custom score. * Whenever overriding {@link #customScore(int, float, float)}, * this method should also be overridden to provide the correct explanation * for the part of the custom scoring. * * @param doc doc being explained. * @param subQueryExpl explanation for the sub-query part. * @param valSrcExpl explanation for the value source part. * @return an explanation for the custom score */ public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) throws IOException { float valSrcScore = 1; if (valSrcExpl != null) { valSrcScore *= valSrcExpl.getValue(); } Explanation exp = new Explanation( valSrcScore * subQueryExpl.getValue(), "custom score: product of:"); exp.addDetail(subQueryExpl); exp.addDetail(valSrcExpl); return exp; } } lucene-2.9.4/src/java/org/apache/lucene/search/function/OrdFieldSource.java0000644000175000017500000001033511474320224027255 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.function; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import java.io.IOException; /** * Expert: obtains the ordinal of the field value from the default Lucene * {@link org.apache.lucene.search.FieldCache Fieldcache} using getStringIndex(). *

    * The native lucene index order is used to assign an ordinal value for each field value. *

    * Example: *
    If there were only three field values: "apple","banana","pear" *
    then ord("apple")=1, ord("banana")=2, ord("pear")=3 *

    * WARNING: * ord() depends on the position in an index and can thus change * when other documents are inserted or deleted, * or if a MultiSearcher is used. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * *

    NOTE: with the switch in 2.9 to segment-based * searching, if {@link #getValues} is invoked with a * composite (multi-segment) reader, this can easily cause * double RAM usage for the values in the FieldCache. It's * best to switch your application to pass only atomic * (single segment) readers to this API. Alternatively, for * a short-term fix, you could wrap your ValueSource using * {@link MultiValueSource}, which costs more CPU per lookup * but will not consume double the FieldCache RAM.

    */ public class OrdFieldSource extends ValueSource { protected String field; /** * Constructor for a certain field. * @param field field whose values order is used. */ public OrdFieldSource(String field) { this.field = field; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ public String description() { return "ord(" + field + ')'; } /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ public DocValues getValues(IndexReader reader) throws IOException { final int[] arr = FieldCache.DEFAULT.getStringIndex(reader, field).order; return new DocValues() { /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ public float floatVal(int doc) { return (float)arr[doc]; } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#strVal(int) */ public String strVal(int doc) { // the string value of the ordinal, not the string itself return Integer.toString(arr[doc]); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ public String toString(int doc) { return description() + '=' + intVal(doc); } /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ Object getInnerArray() { return arr; } }; } /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ public boolean equals(Object o) { if (o.getClass() != OrdFieldSource.class) return false; OrdFieldSource other = (OrdFieldSource)o; return this.field.equals(other.field); } private static final int hcode = OrdFieldSource.class.hashCode(); /*(non-Javadoc) @see java.lang.Object#hashCode() */ public int hashCode() { return hcode + field.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/function/CustomScoreQuery.java0000644000175000017500000004764111474320224027712 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.ComplexExplanation; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Query; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.ToStringUtils; /** * Query that sets document score as a programmatic function of several (sub) scores: *
1. the score of its subQuery (any query)
2. (optional) the score of its ValueSourceQuery (or queries). For most simple/convenient use cases this query is likely to be a {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}
    * Subclasses can modify the computation by overriding {@link #getCustomScoreProvider}. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. */ public class CustomScoreQuery extends Query { private Query subQuery; private ValueSourceQuery[] valSrcQueries; // never null (empty array if there are no valSrcQueries). private boolean strict = false; // if true, valueSource part of query does not take part in weights normalization. /** * Create a CustomScoreQuery over input subQuery. * @param subQuery the sub query whose scored is being customed. Must not be null. */ public CustomScoreQuery(Query subQuery) { this(subQuery, new ValueSourceQuery[0]); } /** * Create a CustomScoreQuery over input subQuery and a {@link ValueSourceQuery}. * @param subQuery the sub query whose score is being customized. Must not be null. * @param valSrcQuery a value source query whose scores are used in the custom score * computation. For most simple/convenient use case this would be a * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}. * This parameter is optional - it can be null. */ public CustomScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery) { this(subQuery, valSrcQuery!=null ? // don't want an array that contains a single null.. new ValueSourceQuery[] {valSrcQuery} : new ValueSourceQuery[0]); } /** * Create a CustomScoreQuery over input subQuery and a {@link ValueSourceQuery}. * @param subQuery the sub query whose score is being customized. Must not be null. * @param valSrcQueries value source queries whose scores are used in the custom score * computation. For most simple/convenient use case these would be * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQueries}. * This parameter is optional - it can be null or even an empty array. */ public CustomScoreQuery(Query subQuery, ValueSourceQuery valSrcQueries[]) { this.subQuery = subQuery; this.valSrcQueries = valSrcQueries!=null? valSrcQueries : new ValueSourceQuery[0]; if (subQuery == null) throw new IllegalArgumentException(" must not be null!"); } /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ public Query rewrite(IndexReader reader) throws IOException { CustomScoreQuery clone = null; final Query sq = subQuery.rewrite(reader); if (sq != subQuery) { clone = (CustomScoreQuery) clone(); clone.subQuery = sq; } for(int i = 0; i < valSrcQueries.length; i++) { final ValueSourceQuery v = (ValueSourceQuery) valSrcQueries[i].rewrite(reader); if (v != valSrcQueries[i]) { if (clone == null) clone = (CustomScoreQuery) clone(); clone.valSrcQueries[i] = v; } } return (clone == null) ? 
this : clone; } /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ public void extractTerms(Set terms) { subQuery.extractTerms(terms); for(int i = 0; i < valSrcQueries.length; i++) { valSrcQueries[i].extractTerms(terms); } } /*(non-Javadoc) @see org.apache.lucene.search.Query#clone() */ public Object clone() { CustomScoreQuery clone = (CustomScoreQuery)super.clone(); clone.subQuery = (Query) subQuery.clone(); clone.valSrcQueries = new ValueSourceQuery[valSrcQueries.length]; for(int i = 0; i < valSrcQueries.length; i++) { clone.valSrcQueries[i] = (ValueSourceQuery) valSrcQueries[i].clone(); } return clone; } /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */ public String toString(String field) { StringBuffer sb = new StringBuffer(name()).append("("); sb.append(subQuery.toString(field)); for(int i = 0; i < valSrcQueries.length; i++) { sb.append(", ").append(valSrcQueries[i].toString(field)); } sb.append(")"); sb.append(strict?" STRICT" : ""); return sb.toString() + ToStringUtils.boost(getBoost()); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (getClass() != o.getClass()) { return false; } CustomScoreQuery other = (CustomScoreQuery)o; if (this.getBoost() != other.getBoost() || !this.subQuery.equals(other.subQuery) || this.strict != other.strict || this.valSrcQueries.length != other.valSrcQueries.length) { return false; } for (int i=0; i * Note: only has effect when the ValueSource part is not null. */ public boolean isStrict() { return strict; } /** * Set the strict mode of this query. * @param strict The strict mode to set. * @see #isStrict() */ public void setStrict(boolean strict) { this.strict = strict; } /** * A short name of this query, used in {@link #toString(String)}. */ public String name() { return "custom"; } } lucene-2.9.4/src/java/org/apache/lucene/search/function/ValueSource.java0000644000175000017500000000516211474320224026643 0ustar janpascaljanpascalpackage org.apache.lucene.search.function; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.function.DocValues; import java.io.IOException; import java.io.Serializable; /** * Expert: source of values for basic function queries. *

In its default/simplest form, values - one per doc - are used as the score of that doc. *

    Values are instantiated as * {@link org.apache.lucene.search.function.DocValues DocValues} for a particular reader. *

    ValueSource implementations differ in RAM requirements: it would always be a factor * of the number of documents, but for each document the number of bytes can be 1, 2, 4, or 8. * *

    * WARNING: The status of the search.function package is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * */ public abstract class ValueSource implements Serializable { /** * Return the DocValues used by the function query. * @param reader the IndexReader used to read these values. * If any caching is involved, that caching would also be IndexReader based. * @throws IOException for any error. */ public abstract DocValues getValues(IndexReader reader) throws IOException; /** * description of field, used in explain() */ public abstract String description(); /* (non-Javadoc) @see java.lang.Object#toString() */ public String toString() { return description(); } /** * Needed for possible caching of query results - used by {@link ValueSourceQuery#equals(Object)}. * @see Object#equals(Object) */ public abstract boolean equals(Object o); /** * Needed for possible caching of query results - used by {@link ValueSourceQuery#hashCode()}. * @see Object#hashCode() */ public abstract int hashCode(); } lucene-2.9.4/src/java/org/apache/lucene/search/TopScoreDocCollector.java0000644000175000017500000001176611474320224026623 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** * A {@link Collector} implementation that collects the top-scoring hits, * returning them as a {@link TopDocs}. This is used by {@link IndexSearcher} to * implement {@link TopDocs}-based search. Hits are sorted by score descending * and then (when the scores are tied) docID ascending. When you create an * instance of this collector you should know in advance whether documents are * going to be collected in doc Id order or not. * *

    NOTE: The values {@link Float#NaN} and * {Float#NEGATIVE_INFINITY} are not valid scores. This * collector will not properly collect hits with such * scores. */ public abstract class TopScoreDocCollector extends TopDocsCollector { // Assumes docs are scored in order. private static class InOrderTopScoreDocCollector extends TopScoreDocCollector { private InOrderTopScoreDocCollector(int numHits) { super(numHits); } public void collect(int doc) throws IOException { float score = scorer.score(); // This collector cannot handle these scores: assert score != Float.NEGATIVE_INFINITY; assert !Float.isNaN(score); totalHits++; if (score <= pqTop.score) { // Since docs are returned in-order (i.e., increasing doc Id), a document // with equal score to pqTop.score cannot compete since HitQueue favors // documents with lower doc Ids. Therefore reject those docs too. return; } pqTop.doc = doc + docBase; pqTop.score = score; pqTop = (ScoreDoc) pq.updateTop(); } public boolean acceptsDocsOutOfOrder() { return false; } } // Assumes docs are scored out of order. private static class OutOfOrderTopScoreDocCollector extends TopScoreDocCollector { private OutOfOrderTopScoreDocCollector(int numHits) { super(numHits); } public void collect(int doc) throws IOException { float score = scorer.score(); // This collector cannot handle NaN assert !Float.isNaN(score); totalHits++; doc += docBase; if (score < pqTop.score || (score == pqTop.score && doc > pqTop.doc)) { return; } pqTop.doc = doc; pqTop.score = score; pqTop = (ScoreDoc) pq.updateTop(); } public boolean acceptsDocsOutOfOrder() { return true; } } /** * Creates a new {@link TopScoreDocCollector} given the number of hits to * collect and whether documents are scored in order by the input * {@link Scorer} to {@link #setScorer(Scorer)}. * *

    NOTE: The instances returned by this method * pre-allocate a full array of length * numHits, and fill the array with sentinel * objects. */ public static TopScoreDocCollector create(int numHits, boolean docsScoredInOrder) { if (docsScoredInOrder) { return new InOrderTopScoreDocCollector(numHits); } else { return new OutOfOrderTopScoreDocCollector(numHits); } } ScoreDoc pqTop; int docBase = 0; Scorer scorer; // prevents instantiation private TopScoreDocCollector(int numHits) { super(new HitQueue(numHits, true)); // HitQueue implements getSentinelObject to return a ScoreDoc, so we know // that at this point top() is already initialized. pqTop = (ScoreDoc) pq.top(); } protected TopDocs newTopDocs(ScoreDoc[] results, int start) { if (results == null) { return EMPTY_TOPDOCS; } // We need to compute maxScore in order to set it in TopDocs. If start == 0, // it means the largest element is already in results, use its score as // maxScore. Otherwise pop everything else, until the largest element is // extracted and use its score as maxScore. float maxScore = Float.NaN; if (start == 0) { maxScore = results[0].score; } else { for (int i = pq.size(); i > 1; i--) { pq.pop(); } maxScore = ((ScoreDoc) pq.pop()).score; } return new TopDocs(totalHits, results, maxScore); } public void setNextReader(IndexReader reader, int base) { docBase = base; } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; } } lucene-2.9.4/src/java/org/apache/lucene/search/ReqOptSumScorer.java0000644000175000017500000000660611474320224025642 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** A Scorer for queries with a required part and an optional part. * Delays skipTo() on the optional part until a score() is needed. *
    * This Scorer implements {@link Scorer#skipTo(int)}. */ class ReqOptSumScorer extends Scorer { /** The scorers passed from the constructor. * These are set to null as soon as their next() or skipTo() returns false. */ private Scorer reqScorer; private Scorer optScorer; /** Construct a ReqOptScorer. * @param reqScorer The required scorer. This must match. * @param optScorer The optional scorer. This is used for scoring only. */ public ReqOptSumScorer( Scorer reqScorer, Scorer optScorer) { super(null); // No similarity used. this.reqScorer = reqScorer; this.optScorer = optScorer; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return reqScorer.next(); } public int nextDoc() throws IOException { return reqScorer.nextDoc(); } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return reqScorer.skipTo(target); } public int advance(int target) throws IOException { return reqScorer.advance(target); } /** @deprecated use {@link #docID()} instead. */ public int doc() { return reqScorer.doc(); } public int docID() { return reqScorer.docID(); } /** Returns the score of the current document matching the query. * Initially invalid, until {@link #next()} is called the first time. * @return The score of the required scorer, eventually increased by the score * of the optional scorer when it also matches the current document. */ public float score() throws IOException { int curDoc = reqScorer.docID(); float reqScore = reqScorer.score(); if (optScorer == null) { return reqScore; } int optScorerDoc = optScorer.docID(); if (optScorerDoc < curDoc && (optScorerDoc = optScorer.advance(curDoc)) == NO_MORE_DOCS) { optScorer = null; return reqScore; } return optScorerDoc == curDoc ? reqScore + optScorer.score() : reqScore; } /** Explain the score of a document. * TODO: Also show the total score. * See BooleanScorer.explain() on how to do this. */ public Explanation explain(int doc) throws IOException { Explanation res = new Explanation(); res.setDescription("required, optional"); res.addDetail(reqScorer.explain(doc)); res.addDetail(optScorer.explain(doc)); return res; } } lucene-2.9.4/src/java/org/apache/lucene/search/DisjunctionMaxScorer.java0000644000175000017500000001600311474320224026672 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * The Scorer for DisjunctionMaxQuery. The union of all documents generated by the the subquery scorers * is generated in document number order. The score for each document is the maximum of the scores computed * by the subquery scorers that generate that document, plus tieBreakerMultiplier times the sum of the scores * for the other subqueries that generate the document. */ class DisjunctionMaxScorer extends Scorer { /* The scorers for subqueries that have remaining docs, kept as a min heap by number of next doc. 
*/ private final Scorer[] subScorers; private int numScorers; /* Multiplier applied to non-maximum-scoring subqueries for a document as they are summed into the result. */ private final float tieBreakerMultiplier; private int doc = -1; /** * Creates a new instance of DisjunctionMaxScorer * * @param tieBreakerMultiplier * Multiplier applied to non-maximum-scoring subqueries for a * document as they are summed into the result. * @param similarity * -- not used since our definition involves neither coord nor terms * directly * @param subScorers * The sub scorers this Scorer should iterate on * @param numScorers * The actual number of scorers to iterate on. Note that the array's * length may be larger than the actual number of scorers. */ public DisjunctionMaxScorer(float tieBreakerMultiplier, Similarity similarity, Scorer[] subScorers, int numScorers) throws IOException { super(similarity); this.tieBreakerMultiplier = tieBreakerMultiplier; // The passed subScorers array includes only scorers which have documents // (DisjunctionMaxQuery takes care of that), and their nextDoc() was already // called. this.subScorers = subScorers; this.numScorers = numScorers; heapify(); } /** * Generate the next document matching our associated DisjunctionMaxQuery. * * @return true iff there is a next document * @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (numScorers == 0) return doc = NO_MORE_DOCS; while (subScorers[0].docID() == doc) { if (subScorers[0].nextDoc() != NO_MORE_DOCS) { heapAdjust(0); } else { heapRemoveRoot(); if (numScorers == 0) { return doc = NO_MORE_DOCS; } } } return doc = subScorers[0].docID(); } /** @deprecated use {@link #docID()} instead. */ public int doc() { return subScorers[0].doc(); } public int docID() { return doc; } /** Determine the current document score. Initially invalid, until {@link #next()} is called the first time. * @return the score of the current generated document */ public float score() throws IOException { int doc = subScorers[0].docID(); float[] sum = { subScorers[0].score() }, max = { sum[0] }; int size = numScorers; scoreAll(1, size, doc, sum, max); scoreAll(2, size, doc, sum, max); return max[0] + (sum[0] - max[0]) * tieBreakerMultiplier; } // Recursively iterate all subScorers that generated last doc computing sum and max private void scoreAll(int root, int size, int doc, float[] sum, float[] max) throws IOException { if (root < size && subScorers[root].docID() == doc) { float sub = subScorers[root].score(); sum[0] += sub; max[0] = Math.max(max[0], sub); scoreAll((root<<1)+1, size, doc, sum, max); scoreAll((root<<1)+2, size, doc, sum, max); } } /** * Advance to the first document beyond the current whose number is greater * than or equal to target. * * @param target * the minimum number of the next desired document * @return true iff there is a document to be generated whose number is at * least target * @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { if (numScorers == 0) return doc = NO_MORE_DOCS; while (subScorers[0].docID() < target) { if (subScorers[0].advance(target) != NO_MORE_DOCS) { heapAdjust(0); } else { heapRemoveRoot(); if (numScorers == 0) { return doc = NO_MORE_DOCS; } } } return doc = subScorers[0].docID(); } /** Explain a score that we computed. 
UNSUPPORTED -- see explanation capability in DisjunctionMaxQuery. * @param doc the number of a document we scored * @return the Explanation for our score */ public Explanation explain(int doc) throws IOException { throw new UnsupportedOperationException(); } // Organize subScorers into a min heap with scorers generating the earliest document on top. private void heapify() { for (int i = (numScorers >> 1) - 1; i >= 0; i--) { heapAdjust(i); } } /* The subtree of subScorers at root is a min heap except possibly for its root element. * Bubble the root down as required to make the subtree a heap. */ private void heapAdjust(int root) { Scorer scorer = subScorers[root]; int doc = scorer.docID(); int i = root; while (i <= (numScorers >> 1) - 1) { int lchild = (i << 1) + 1; Scorer lscorer = subScorers[lchild]; int ldoc = lscorer.docID(); int rdoc = Integer.MAX_VALUE, rchild = (i << 1) + 2; Scorer rscorer = null; if (rchild < numScorers) { rscorer = subScorers[rchild]; rdoc = rscorer.docID(); } if (ldoc < doc) { if (rdoc < ldoc) { subScorers[i] = rscorer; subScorers[rchild] = scorer; i = rchild; } else { subScorers[i] = lscorer; subScorers[lchild] = scorer; i = lchild; } } else if (rdoc < doc) { subScorers[i] = rscorer; subScorers[rchild] = scorer; i = rchild; } else { return; } } } // Remove the root Scorer from subScorers and re-establish it as a heap private void heapRemoveRoot() { if (numScorers == 1) { subScorers[0] = null; numScorers = 0; } else { subScorers[0] = subScorers[numScorers - 1]; subScorers[numScorers - 1] = null; --numScorers; heapAdjust(0); } } } lucene-2.9.4/src/java/org/apache/lucene/search/Query.java0000644000175000017500000001766211474320224023676 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import org.apache.lucene.index.IndexReader; /** The abstract base class for queries.

    Instantiable subclasses are:

    • {@link TermQuery}
    • {@link MultiTermQuery}
    • {@link BooleanQuery}
    • {@link WildcardQuery}
    • {@link PhraseQuery}
    • {@link PrefixQuery}
    • {@link MultiPhraseQuery}
    • {@link FuzzyQuery}
    • {@link TermRangeQuery}
    • {@link NumericRangeQuery}
    • {@link org.apache.lucene.search.spans.SpanQuery}

    A parser for queries is contained in:

    • {@link org.apache.lucene.queryParser.QueryParser QueryParser}
    */ public abstract class Query implements java.io.Serializable, Cloneable { private float boost = 1.0f; // query boost factor /** Sets the boost for this query clause to b. Documents * matching this clause will (in addition to the normal weightings) have * their score multiplied by b. */ public void setBoost(float b) { boost = b; } /** Gets the boost for this clause. Documents matching * this clause will (in addition to the normal weightings) have their score * multiplied by b. The boost is 1.0 by default. */ public float getBoost() { return boost; } /** Prints a query to a string, with field assumed to be the * default field and omitted. *

    The representation used is one that is supposed to be readable * by {@link org.apache.lucene.queryParser.QueryParser QueryParser}. However, * there are the following limitations: *

• If the query was created by the parser, the printed representation may not be exactly what was parsed. For example, characters that need to be escaped will be represented without the required backslash.
• Some of the more complicated queries (e.g. span queries) don't have a representation that can be parsed by QueryParser.
    */ public abstract String toString(String field); /** Prints a query to a string. */ public String toString() { return toString(""); } /** * Expert: Constructs an appropriate Weight implementation for this query. * *

    * Only implemented by primitive queries, which re-write to themselves. */ public Weight createWeight(Searcher searcher) throws IOException { throw new UnsupportedOperationException(); } /** * Expert: Constructs and initializes a Weight for a top-level query. */ public Weight weight(Searcher searcher) throws IOException { Query query = searcher.rewrite(this); Weight weight = query.createWeight(searcher); float sum = weight.sumOfSquaredWeights(); float norm = getSimilarity(searcher).queryNorm(sum); if (Float.isInfinite(norm) || Float.isNaN(norm)) norm = 1.0f; weight.normalize(norm); return weight; } /** Expert: called to re-write queries into primitive queries. For example, * a PrefixQuery will be rewritten into a BooleanQuery that consists * of TermQuerys. */ public Query rewrite(IndexReader reader) throws IOException { return this; } /** Expert: called when re-writing queries under MultiSearcher. * * Create a single query suitable for use by all subsearchers (in 1-1 * correspondence with queries). This is an optimization of the OR of * all queries. We handle the common optimization cases of equal * queries and overlapping clauses of boolean OR queries (as generated * by MultiTermQuery.rewrite()). * Be careful overriding this method as queries[0] determines which * method will be called and is not necessarily of the same type as * the other queries. */ public Query combine(Query[] queries) { HashSet uniques = new HashSet(); for (int i = 0; i < queries.length; i++) { Query query = queries[i]; BooleanClause[] clauses = null; // check if we can split the query into clauses boolean splittable = (query instanceof BooleanQuery); if(splittable){ BooleanQuery bq = (BooleanQuery) query; splittable = bq.isCoordDisabled(); clauses = bq.getClauses(); for (int j = 0; splittable && j < clauses.length; j++) { splittable = (clauses[j].getOccur() == BooleanClause.Occur.SHOULD); } } if(splittable){ for (int j = 0; j < clauses.length; j++) { uniques.add(clauses[j].getQuery()); } } else { uniques.add(query); } } // optimization: if we have just one query, just return it if(uniques.size() == 1){ return (Query)uniques.iterator().next(); } Iterator it = uniques.iterator(); BooleanQuery result = new BooleanQuery(true); while (it.hasNext()) result.add((Query) it.next(), BooleanClause.Occur.SHOULD); return result; } /** * Expert: adds all terms occurring in this query to the terms set. Only * works if this query is in its {@link #rewrite rewritten} form. * * @throws UnsupportedOperationException if this query is not yet rewritten */ public void extractTerms(Set terms) { // needs to be implemented by query subclasses throw new UnsupportedOperationException(); } /** Expert: merges the clauses of a set of BooleanQuery's into a single * BooleanQuery. * *

    A utility for use by {@link #combine(Query[])} implementations. */ public static Query mergeBooleanQueries(BooleanQuery[] queries) { HashSet allClauses = new HashSet(); for (int i = 0; i < queries.length; i++) { BooleanClause[] clauses = queries[i].getClauses(); for (int j = 0; j < clauses.length; j++) { allClauses.add(clauses[j]); } } boolean coordDisabled = queries.length==0? false : queries[0].isCoordDisabled(); BooleanQuery result = new BooleanQuery(coordDisabled); Iterator i = allClauses.iterator(); while (i.hasNext()) { result.add((BooleanClause)i.next()); } return result; } /** Expert: Returns the Similarity implementation to be used for this query. * Subclasses may override this method to specify their own Similarity * implementation, perhaps one that delegates through that of the Searcher. * By default the Searcher's Similarity implementation is returned.*/ public Similarity getSimilarity(Searcher searcher) { return searcher.getSimilarity(); } /** Returns a clone of this query. */ public Object clone() { try { return super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException("Clone not supported: " + e.getMessage()); } } public int hashCode() { final int prime = 31; int result = 1; result = prime * result + Float.floatToIntBits(boost); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; Query other = (Query) obj; if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/Explanation.java0000644000175000017500000001043511474320224025042 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import java.util.ArrayList; /** Expert: Describes the score computation for document and query. */ public class Explanation implements java.io.Serializable { private float value; // the value of this node private String description; // what it represents private ArrayList details; // sub-explanations public Explanation() {} public Explanation(float value, String description) { this.value = value; this.description = description; } /** * Indicates whether or not this Explanation models a good match. * *

    * By default, an Explanation represents a "match" if the value is positive. *

    * @see #getValue */ public boolean isMatch() { return (0.0f < getValue()); } /** The value assigned to this explanation node. */ public float getValue() { return value; } /** Sets the value assigned to this explanation node. */ public void setValue(float value) { this.value = value; } /** A description of this explanation node. */ public String getDescription() { return description; } /** Sets the description of this explanation node. */ public void setDescription(String description) { this.description = description; } /** * A short one line summary which should contain all high level * information about this Explanation, without the "Details" */ protected String getSummary() { return getValue() + " = " + getDescription(); } /** The sub-nodes of this explanation node. */ public Explanation[] getDetails() { if (details == null) return null; return (Explanation[])details.toArray(new Explanation[0]); } /** Adds a sub-node to this explanation node. */ public void addDetail(Explanation detail) { if (details == null) details = new ArrayList(); details.add(detail); } /** Render an explanation as text. */ public String toString() { return toString(0); } protected String toString(int depth) { StringBuffer buffer = new StringBuffer(); for (int i = 0; i < depth; i++) { buffer.append(" "); } buffer.append(getSummary()); buffer.append("\n"); Explanation[] details = getDetails(); if (details != null) { for (int i = 0 ; i < details.length; i++) { buffer.append(details[i].toString(depth+1)); } } return buffer.toString(); } /** Render an explanation as HTML. */ public String toHtml() { StringBuffer buffer = new StringBuffer(); buffer.append("
<ul>\n"); buffer.append("
<li>"); buffer.append(getSummary()); buffer.append("<br />
\n"); Explanation[] details = getDetails(); if (details != null) { for (int i = 0 ; i < details.length; i++) { buffer.append(details[i].toHtml()); } } buffer.append("
</li>\n"); buffer.append("</ul>
    \n"); return buffer.toString(); } /** * Small Util class used to pass both an idf factor as well as an * explanation for that factor. * * This class will likely be held on a {@link Weight}, so be aware * before storing any large or un-serializable fields. * */ public static abstract class IDFExplanation implements Serializable { /** * @return the idf factor */ public abstract float getIdf(); /** * This should be calculated lazily if possible. * * @return the explanation for the idf factor. */ public abstract String explain(); } } lucene-2.9.4/src/java/org/apache/lucene/search/MultiSearcher.java0000644000175000017500000002571711474320224025340 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.ReaderUtil; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; /** Implements search over a set of Searchables. * *

    Applications usually need only call the inherited {@link #search(Query)} * or {@link #search(Query,Filter)} methods. */ public class MultiSearcher extends Searcher { /** * Document Frequency cache acting as a Dummy-Searcher. This class is no * full-fledged Searcher, but only supports the methods necessary to * initialize Weights. */ private static class CachedDfSource extends Searcher { private Map dfMap; // Map from Terms to corresponding doc freqs private int maxDoc; // document count public CachedDfSource(Map dfMap, int maxDoc, Similarity similarity) { this.dfMap = dfMap; this.maxDoc = maxDoc; setSimilarity(similarity); } public int docFreq(Term term) { int df; try { df = ((Integer) dfMap.get(term)).intValue(); } catch (NullPointerException e) { throw new IllegalArgumentException("df for term " + term.text() + " not available"); } return df; } public int[] docFreqs(Term[] terms) { int[] result = new int[terms.length]; for (int i = 0; i < terms.length; i++) { result[i] = docFreq(terms[i]); } return result; } public int maxDoc() { return maxDoc; } public Query rewrite(Query query) { // this is a bit of a hack. We know that a query which // creates a Weight based on this Dummy-Searcher is // always already rewritten (see preparedWeight()). // Therefore we just return the unmodified query here return query; } public void close() { throw new UnsupportedOperationException(); } public Document doc(int i) { throw new UnsupportedOperationException(); } public Document doc(int i, FieldSelector fieldSelector) { throw new UnsupportedOperationException(); } public Explanation explain(Weight weight,int doc) { throw new UnsupportedOperationException(); } public void search(Weight weight, Filter filter, Collector results) { throw new UnsupportedOperationException(); } public TopDocs search(Weight weight,Filter filter,int n) { throw new UnsupportedOperationException(); } public TopFieldDocs search(Weight weight,Filter filter,int n,Sort sort) { throw new UnsupportedOperationException(); } } private Searchable[] searchables; private int[] starts; private int maxDoc = 0; /** Creates a searcher which searches searchers. */ public MultiSearcher(Searchable[] searchables) throws IOException { this.searchables = searchables; starts = new int[searchables.length + 1]; // build starts array for (int i = 0; i < searchables.length; i++) { starts[i] = maxDoc; maxDoc += searchables[i].maxDoc(); // compute maxDocs } starts[searchables.length] = maxDoc; } /** Return the array of {@link Searchable}s this searches. */ public Searchable[] getSearchables() { return searchables; } protected int[] getStarts() { return starts; } // inherit javadoc public void close() throws IOException { for (int i = 0; i < searchables.length; i++) searchables[i].close(); } public int docFreq(Term term) throws IOException { int docFreq = 0; for (int i = 0; i < searchables.length; i++) docFreq += searchables[i].docFreq(term); return docFreq; } // inherit javadoc public Document doc(int n) throws CorruptIndexException, IOException { int i = subSearcher(n); // find searcher index return searchables[i].doc(n - starts[i]); // dispatch to searcher } // inherit javadoc public Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { int i = subSearcher(n); // find searcher index return searchables[i].doc(n - starts[i], fieldSelector); // dispatch to searcher } /** Returns index of the searcher for document n in the array * used to construct this searcher. 
*/ public int subSearcher(int n) { // find searcher for doc n: return ReaderUtil.subIndex(n, starts); } /** Returns the document number of document n within its * sub-index. */ public int subDoc(int n) { return n - starts[subSearcher(n)]; } public int maxDoc() throws IOException { return maxDoc; } public TopDocs search(Weight weight, Filter filter, int nDocs) throws IOException { HitQueue hq = new HitQueue(nDocs, false); int totalHits = 0; for (int i = 0; i < searchables.length; i++) { // search each searcher TopDocs docs = searchables[i].search(weight, filter, nDocs); totalHits += docs.totalHits; // update totalHits ScoreDoc[] scoreDocs = docs.scoreDocs; for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq ScoreDoc scoreDoc = scoreDocs[j]; scoreDoc.doc += starts[i]; // convert doc if(!hq.insert(scoreDoc)) break; // no more scores > minScore } } ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size()-1; i >= 0; i--) // put docs in array scoreDocs[i] = (ScoreDoc)hq.pop(); float maxScore = (totalHits==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score; return new TopDocs(totalHits, scoreDocs, maxScore); } public TopFieldDocs search (Weight weight, Filter filter, int n, Sort sort) throws IOException { FieldDocSortedHitQueue hq = null; int totalHits = 0; float maxScore=Float.NEGATIVE_INFINITY; for (int i = 0; i < searchables.length; i++) { // search each searcher TopFieldDocs docs = searchables[i].search (weight, filter, n, sort); // If one of the Sort fields is FIELD_DOC, need to fix its values, so that // it will break ties by doc Id properly. Otherwise, it will compare to // 'relative' doc Ids, that belong to two different searchers. for (int j = 0; j < docs.fields.length; j++) { if (docs.fields[j].getType() == SortField.DOC) { // iterate over the score docs and change their fields value for (int j2 = 0; j2 < docs.scoreDocs.length; j2++) { FieldDoc fd = (FieldDoc) docs.scoreDocs[j2]; fd.fields[j] = new Integer(((Integer) fd.fields[j]).intValue() + starts[i]); } break; } } if (hq == null) hq = new FieldDocSortedHitQueue (docs.fields, n); totalHits += docs.totalHits; // update totalHits maxScore = Math.max(maxScore, docs.getMaxScore()); ScoreDoc[] scoreDocs = docs.scoreDocs; for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq ScoreDoc scoreDoc = scoreDocs[j]; scoreDoc.doc += starts[i]; // convert doc if (!hq.insert (scoreDoc)) break; // no more scores > minScore } } ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size() - 1; i >= 0; i--) // put docs in array scoreDocs[i] = (ScoreDoc) hq.pop(); return new TopFieldDocs (totalHits, scoreDocs, hq.getFields(), maxScore); } // inherit javadoc public void search(Weight weight, Filter filter, final Collector collector) throws IOException { for (int i = 0; i < searchables.length; i++) { final int start = starts[i]; final Collector hc = new Collector() { public void setScorer(Scorer scorer) throws IOException { collector.setScorer(scorer); } public void collect(int doc) throws IOException { collector.collect(doc); } public void setNextReader(IndexReader reader, int docBase) throws IOException { collector.setNextReader(reader, start + docBase); } public boolean acceptsDocsOutOfOrder() { return collector.acceptsDocsOutOfOrder(); } }; searchables[i].search(weight, filter, hc); } } public Query rewrite(Query original) throws IOException { Query[] queries = new Query[searchables.length]; for (int i = 0; i < searchables.length; i++) { queries[i] = searchables[i].rewrite(original); } return 
queries[0].combine(queries); } public Explanation explain(Weight weight, int doc) throws IOException { int i = subSearcher(doc); // find searcher index return searchables[i].explain(weight, doc - starts[i]); // dispatch to searcher } /** * Create weight in multiple index scenario. * * Distributed query processing is done in the following steps: * 1. rewrite query * 2. extract necessary terms * 3. collect dfs for these terms from the Searchables * 4. create query weight using aggregate dfs. * 5. distribute that weight to Searchables * 6. merge results * * Steps 1-4 are done here, 5+6 in the search() methods * * @return rewritten queries */ protected Weight createWeight(Query original) throws IOException { // step 1 Query rewrittenQuery = rewrite(original); // step 2 Set terms = new HashSet(); rewrittenQuery.extractTerms(terms); // step3 Term[] allTermsArray = new Term[terms.size()]; terms.toArray(allTermsArray); int[] aggregatedDfs = new int[terms.size()]; for (int i = 0; i < searchables.length; i++) { int[] dfs = searchables[i].docFreqs(allTermsArray); for(int j=0; j> 1; for (int i = 0; i < max; i++) { Scorer tmp = scorers[i]; int idx = end - i - 1; scorers[i] = scorers[idx]; scorers[idx] = tmp; } } private int doNext() throws IOException { int first = 0; int doc = scorers[scorers.length - 1].docID(); Scorer firstScorer; while ((firstScorer = scorers[first]).docID() < doc) { doc = firstScorer.advance(doc); first = first == scorers.length - 1 ? 0 : first + 1; } return doc; } public int advance(int target) throws IOException { if (lastDoc == NO_MORE_DOCS) { return lastDoc; } else if (scorers[(scorers.length - 1)].docID() < target) { scorers[(scorers.length - 1)].advance(target); } return lastDoc = doNext(); } /** @deprecated use {@link #docID()} instead. */ public int doc() { return lastDoc; } public int docID() { return lastDoc; } public Explanation explain(int doc) { throw new UnsupportedOperationException(); } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (lastDoc == NO_MORE_DOCS) { return lastDoc; } else if (lastDoc == -1) { return lastDoc = scorers[scorers.length - 1].docID(); } scorers[(scorers.length - 1)].nextDoc(); return lastDoc = doNext(); } public float score() throws IOException { float sum = 0.0f; for (int i = 0; i < scorers.length; i++) { sum += scorers[i].score(); } return sum * coord; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } } lucene-2.9.4/src/java/org/apache/lucene/search/TermRangeFilter.java0000644000175000017500000001140511474320224025610 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.text.Collator; /** * A Filter that restricts search results to a range of term * values in a given field. * *

    This filter matches the documents looking for terms that fall into the * supplied range according to {@link * String#compareTo(String)}, unless a Collator is provided. It is not intended * for numerical ranges; use {@link NumericRangeFilter} instead. * *
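 *
 * A brief usage sketch (illustrative only; the field name, the bounds and the
 * searcher variable are assumptions, not part of this file):
 *
 *   Filter dateRange = new TermRangeFilter("date", "20100101", "20101231", true, true);
 *   TopDocs hits = searcher.search(new MatchAllDocsQuery(), dateRange, 10);
 *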

    If you construct a large number of range filters with different ranges but on the * same field, {@link FieldCacheRangeFilter} may have significantly better performance. * @since 2.9 */ public class TermRangeFilter extends MultiTermQueryWrapperFilter { /** * @param fieldName The field this range applies to * @param lowerTerm The lower bound on this range * @param upperTerm The upper bound on this range * @param includeLower Does this range include the lower bound? * @param includeUpper Does this range include the upper bound? * @throws IllegalArgumentException if both terms are null or if * lowerTerm is null and includeLower is true (similar for upperTerm * and includeUpper) */ public TermRangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) { super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper)); } /** * WARNING: Using this constructor and supplying a non-null * value in the collator parameter will cause every single * index Term in the Field referenced by lowerTerm and/or upperTerm to be * examined. Depending on the number of index Terms in this Field, the * operation could be very slow. * * @param lowerTerm The lower bound on this range * @param upperTerm The upper bound on this range * @param includeLower Does this range include the lower bound? * @param includeUpper Does this range include the upper bound? * @param collator The collator to use when determining range inclusion; set * to null to use Unicode code point ordering instead of collation. * @throws IllegalArgumentException if both terms are null or if * lowerTerm is null and includeLower is true (similar for upperTerm * and includeUpper) */ public TermRangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper, Collator collator) { super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator)); } /** * Constructs a filter for field fieldName matching * less than or equal to upperTerm. */ public static TermRangeFilter Less(String fieldName, String upperTerm) { return new TermRangeFilter(fieldName, null, upperTerm, false, true); } /** * Constructs a filter for field fieldName matching * greater than or equal to lowerTerm. */ public static TermRangeFilter More(String fieldName, String lowerTerm) { return new TermRangeFilter(fieldName, lowerTerm, null, true, false); } /** Returns the field name for this filter */ public String getField() { return ((TermRangeQuery) query).getField(); } /** Returns the lower value of this range filter */ public String getLowerTerm() { return ((TermRangeQuery) query).getLowerTerm(); } /** Returns the upper value of this range filter */ public String getUpperTerm() { return ((TermRangeQuery) query).getUpperTerm(); } /** Returns true if the lower endpoint is inclusive */ public boolean includesLower() { return ((TermRangeQuery) query).includesLower(); } /** Returns true if the upper endpoint is inclusive */ public boolean includesUpper() { return ((TermRangeQuery) query).includesUpper(); } /** Returns the collator used to determine range inclusion, if any. */ public Collator getCollator() { return ((TermRangeQuery) query).getCollator(); } } lucene-2.9.4/src/java/org/apache/lucene/search/Scorer.java0000644000175000017500000001153311474320224024015 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * Expert: Common scoring functionality for different types of queries. * *

    * A Scorer iterates over documents matching a * query in increasing order of doc Id. *

    *

    * Document scores are computed using a given Similarity * implementation. *
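 *
 * A minimal consumption sketch (illustrative; the scorer variable is assumed
 * to have been obtained from a Weight):
 *
 *   int doc;
 *   while ((doc = scorer.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
 *     float score = scorer.score();   // score of the current document
 *   }
 *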

    * *

    NOTE: The values Float.Nan, * Float.NEGATIVE_INFINITY and Float.POSITIVE_INFINITY are * not valid scores. Certain collectors (eg {@link * TopScoreDocCollector}) will not properly collect hits * with these scores. * * @see BooleanQuery#setAllowDocsOutOfOrder */ public abstract class Scorer extends DocIdSetIterator { private Similarity similarity; /** Constructs a Scorer. * @param similarity The Similarity implementation used by this scorer. */ protected Scorer(Similarity similarity) { this.similarity = similarity; } /** Returns the Similarity implementation used by this scorer. */ public Similarity getSimilarity() { return this.similarity; } /** Scores and collects all matching documents. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. *
    When this method is used the {@link #explain(int)} method should not be used. * @deprecated use {@link #score(Collector)} instead. */ public void score(HitCollector hc) throws IOException { score(new HitCollectorWrapper(hc)); } /** Scores and collects all matching documents. * @param collector The collector to which all matching documents are passed. *
    When this method is used the {@link #explain(int)} method should not be used. */ public void score(Collector collector) throws IOException { collector.setScorer(this); int doc; while ((doc = nextDoc()) != NO_MORE_DOCS) { collector.collect(doc); } } /** Expert: Collects matching documents in a range. Hook for optimization. * Note that {@link #next()} must be called once before this method is called * for the first time. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. * @param max Do not score documents past this. * @return true if more matching documents may remain. * @deprecated use {@link #score(Collector, int, int)} instead. */ protected boolean score(HitCollector hc, int max) throws IOException { return score(new HitCollectorWrapper(hc), max, docID()); } /** * Expert: Collects matching documents in a range. Hook for optimization. * Note, firstDocID is added to ensure that {@link #nextDoc()} * was called before this method. * * @param collector * The collector to which all matching documents are passed. * @param max * Do not score documents past this. * @param firstDocID * The first document ID (ensures {@link #nextDoc()} is called before * this method. * @return true if more matching documents may remain. */ protected boolean score(Collector collector, int max, int firstDocID) throws IOException { collector.setScorer(this); int doc = firstDocID; while (doc < max) { collector.collect(doc); doc = nextDoc(); } return doc != NO_MORE_DOCS; } /** Returns the score of the current document matching the query. * Initially invalid, until {@link #next()} or {@link #skipTo(int)} * is called the first time, or when called from within * {@link Collector#collect}. */ public abstract float score() throws IOException; /** Returns an explanation of the score for a document. *
    When this method is used, the {@link #next()}, {@link #skipTo(int)} and * {@link #score(HitCollector)} methods should not be used. * @param doc The document number for the explanation. * * @deprecated Please use {@link IndexSearcher#explain} * or {@link Weight#explain} instead. */ public Explanation explain(int doc) throws IOException { throw new UnsupportedOperationException(); } } lucene-2.9.4/src/java/org/apache/lucene/search/TermRangeTermEnum.java0000644000175000017500000001222211474320224026115 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.text.Collator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.StringHelper; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * specified range parameters. *
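 *
 * A hedged iteration sketch (the reader, field and bounds are assumptions):
 *
 *   TermEnum te = new TermRangeTermEnum(reader, "title", "a", "f", true, false, null);
 *   try {
 *     Term t;
 *     while ((t = te.term()) != null) {
 *       // t falls inside the range ["a", "f")
 *       if (!te.next()) break;
 *     }
 *   } finally {
 *     te.close();
 *   }
 *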

    * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * @since 2.9 */ public class TermRangeTermEnum extends FilteredTermEnum { private Collator collator = null; private boolean endEnum = false; private String field; private String upperTermText; private String lowerTermText; private boolean includeLower; private boolean includeUpper; /** * Enumerates all terms greater/equal than lowerTerm * but less/equal than upperTerm. * * If an endpoint is null, it is said to be "open". Either or both * endpoints may be open. Open endpoints may not be exclusive * (you can't select all but the first or last term without * explicitly specifying the term to exclude.) * * @param reader * @param field * An interned field that holds both lower and upper terms. * @param lowerTermText * The term text at the lower end of the range * @param upperTermText * The term text at the upper end of the range * @param includeLower * If true, the lowerTerm is included in the range. * @param includeUpper * If true, the upperTerm is included in the range. * @param collator * The collator to use to collate index Terms, to determine their * membership in the range bounded by lowerTerm and * upperTerm. * * @throws IOException */ public TermRangeTermEnum(IndexReader reader, String field, String lowerTermText, String upperTermText, boolean includeLower, boolean includeUpper, Collator collator) throws IOException { this.collator = collator; this.upperTermText = upperTermText; this.lowerTermText = lowerTermText; this.includeLower = includeLower; this.includeUpper = includeUpper; this.field = StringHelper.intern(field); // do a little bit of normalization... // open ended range queries should always be inclusive. if (this.lowerTermText == null) { this.lowerTermText = ""; this.includeLower = true; } if (this.upperTermText == null) { this.includeUpper = true; } String startTermText = collator == null ? this.lowerTermText : ""; setEnum(reader.terms(new Term(this.field, startTermText))); } public float difference() { return 1.0f; } protected boolean endEnum() { return endEnum; } protected boolean termCompare(Term term) { if (collator == null) { // Use Unicode code point ordering boolean checkLower = false; if (!includeLower) // make adjustments to set to exclusive checkLower = true; if (term != null && term.field() == field) { // interned comparison if (!checkLower || null==lowerTermText || term.text().compareTo(lowerTermText) > 0) { checkLower = false; if (upperTermText != null) { int compare = upperTermText.compareTo(term.text()); /* * if beyond the upper term, or is exclusive and this is equal to * the upper term, break out */ if ((compare < 0) || (!includeUpper && compare==0)) { endEnum = true; return false; } } return true; } } else { // break endEnum = true; return false; } return false; } else { if (term != null && term.field() == field) { // interned comparison if ((lowerTermText == null || (includeLower ? collator.compare(term.text(), lowerTermText) >= 0 : collator.compare(term.text(), lowerTermText) > 0)) && (upperTermText == null || (includeUpper ? 
collator.compare(term.text(), upperTermText) <= 0 : collator.compare(term.text(), upperTermText) < 0))) { return true; } return false; } endEnum = true; return false; } } } lucene-2.9.4/src/java/org/apache/lucene/search/Searcher.java0000644000175000017500000002267211474320224024322 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; /** * An abstract base class for search implementations. Implements the main search * methods. * *

    * Note that you can only access hits from a Searcher as long as it is not yet * closed, otherwise an IOException will be thrown. */ public abstract class Searcher implements Searchable { /** Returns the documents matching query. * @throws BooleanQuery.TooManyClauses * @deprecated Hits will be removed in Lucene 3.0. Use * {@link #search(Query, Filter, int)} instead. */ public final Hits search(Query query) throws IOException { return search(query, (Filter)null); } /** Returns the documents matching query and * filter. * @throws BooleanQuery.TooManyClauses * @deprecated Hits will be removed in Lucene 3.0. Use * {@link #search(Query, Filter, int)} instead. */ public Hits search(Query query, Filter filter) throws IOException { return new Hits(this, query, filter); } /** Returns documents matching query sorted by * sort. * @throws BooleanQuery.TooManyClauses * @deprecated Hits will be removed in Lucene 3.0. Use * {@link #search(Query, Filter, int, Sort)} instead. */ public Hits search(Query query, Sort sort) throws IOException { return new Hits(this, query, null, sort); } /** Returns documents matching query and filter, * sorted by sort. * @throws BooleanQuery.TooManyClauses * @deprecated Hits will be removed in Lucene 3.0. Use * {@link #search(Query, Filter, int, Sort)} instead. */ public Hits search(Query query, Filter filter, Sort sort) throws IOException { return new Hits(this, query, filter, sort); } /** Search implementation with arbitrary sorting. Finds * the top n hits for query, applying * filter if non-null, and sorting the hits by the criteria in * sort. * *
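 *
 * Illustrative call (the field names and the query are assumptions):
 *
 *   Sort byTitle = new Sort(new SortField("title", SortField.STRING));
 *   TopFieldDocs docs = searcher.search(
 *       new TermQuery(new Term("body", "lucene")), null, 20, byTitle);
 *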

    NOTE: this does not compute scores by default; use * {@link IndexSearcher#setDefaultFieldSortScoring} to * enable scoring. * * @throws BooleanQuery.TooManyClauses */ public TopFieldDocs search(Query query, Filter filter, int n, Sort sort) throws IOException { return search(createWeight(query), filter, n, sort); } /** Lower-level search API. * *

    {@link HitCollector#collect(int,float)} is called for every matching * document. * *

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query)}) is usually more efficient, as it skips * non-high-scoring hits. *

    Note: The score passed to this method is a raw score. * In other words, the score will not necessarily be a float whose value is * between 0 and 1. * @throws BooleanQuery.TooManyClauses * @deprecated use {@link #search(Query, Collector)} instead. */ public void search(Query query, HitCollector results) throws IOException { search(createWeight(query), null, new HitCollectorWrapper(results)); } /** Lower-level search API. * *

    {@link Collector#collect(int)} is called for every matching document. * *
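 *
 * Sketch of a simple counting Collector (illustrative only; the query and
 * searcher variables are assumptions):
 *
 *   final int[] count = new int[1];
 *   searcher.search(query, new Collector() {
 *     public void setScorer(Scorer scorer) { }
 *     public void collect(int doc) { count[0]++; }  // doc is relative to the current reader
 *     public void setNextReader(IndexReader reader, int docBase) { }
 *     public boolean acceptsDocsOutOfOrder() { return true; }
 *   });
 *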

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query, int)}) is usually more efficient, as it skips * non-high-scoring hits. *

    Note: The score passed to this method is a raw score. * In other words, the score will not necessarily be a float whose value is * between 0 and 1. * @throws BooleanQuery.TooManyClauses */ public void search(Query query, Collector results) throws IOException { search(createWeight(query), null, results); } /** Lower-level search API. * *

    {@link HitCollector#collect(int,float)} is called for every matching * document. *
    HitCollector-based access to remote indexes is discouraged. * *

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query, Filter, int)}) is usually more efficient, as it skips * non-high-scoring hits. * * @param query to match documents * @param filter if non-null, used to permit documents to be collected. * @param results to receive hits * @throws BooleanQuery.TooManyClauses * @deprecated use {@link #search(Query, Filter, Collector)} instead. */ public void search(Query query, Filter filter, HitCollector results) throws IOException { search(createWeight(query), filter, new HitCollectorWrapper(results)); } /** Lower-level search API. * *

    {@link Collector#collect(int)} is called for every matching * document. *
    Collector-based access to remote indexes is discouraged. * *

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query, Filter, int)}) is usually more efficient, as it skips * non-high-scoring hits. * * @param query to match documents * @param filter if non-null, used to permit documents to be collected. * @param results to receive hits * @throws BooleanQuery.TooManyClauses */ public void search(Query query, Filter filter, Collector results) throws IOException { search(createWeight(query), filter, results); } /** Finds the top n * hits for query, applying filter if non-null. * * @throws BooleanQuery.TooManyClauses */ public TopDocs search(Query query, Filter filter, int n) throws IOException { return search(createWeight(query), filter, n); } /** Finds the top n * hits for query. * * @throws BooleanQuery.TooManyClauses */ public TopDocs search(Query query, int n) throws IOException { return search(query, null, n); } /** Returns an Explanation that describes how doc scored against * query. * *
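 *
 * Illustrative call (the query variable and the hit picked below are
 * assumptions; it presumes at least one match):
 *
 *   TopDocs top = searcher.search(query, 10);
 *   Explanation expl = searcher.explain(query, top.scoreDocs[0].doc);
 *   System.out.println(expl.toString());
 *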

    This is intended to be used in developing Similarity implementations, * and, for good performance, should not be displayed with every hit. * Computing an explanation is as expensive as executing the query over the * entire index. */ public Explanation explain(Query query, int doc) throws IOException { return explain(createWeight(query), doc); } /** The Similarity implementation used by this searcher. */ private Similarity similarity = Similarity.getDefault(); /** Expert: Set the Similarity implementation used by this Searcher. * * @see Similarity#setDefault(Similarity) */ public void setSimilarity(Similarity similarity) { this.similarity = similarity; } /** Expert: Return the Similarity implementation used by this Searcher. * *

    This defaults to the current value of {@link Similarity#getDefault()}. */ public Similarity getSimilarity() { return this.similarity; } /** * creates a weight for query * @return new weight */ protected Weight createWeight(Query query) throws IOException { return query.weight(this); } // inherit javadoc public int[] docFreqs(Term[] terms) throws IOException { int[] result = new int[terms.length]; for (int i = 0; i < terms.length; i++) { result[i] = docFreq(terms[i]); } return result; } /* The following abstract methods were added as a workaround for GCJ bug #15411. * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=15411 */ /** * @deprecated use {@link #search(Weight, Filter, Collector)} instead. */ public void search(Weight weight, Filter filter, HitCollector results) throws IOException { search(weight, filter, new HitCollectorWrapper(results)); } abstract public void search(Weight weight, Filter filter, Collector results) throws IOException; abstract public void close() throws IOException; abstract public int docFreq(Term term) throws IOException; abstract public int maxDoc() throws IOException; abstract public TopDocs search(Weight weight, Filter filter, int n) throws IOException; abstract public Document doc(int i) throws CorruptIndexException, IOException; abstract public Query rewrite(Query query) throws IOException; abstract public Explanation explain(Weight weight, int doc) throws IOException; abstract public TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort) throws IOException; /* End patch for GCJ bug #15411. */ } lucene-2.9.4/src/java/org/apache/lucene/search/PrefixFilter.java0000644000175000017500000000270311474320224025162 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.Term; /** * A Filter that restricts search results to values that have a matching prefix in a given * field. */ public class PrefixFilter extends MultiTermQueryWrapperFilter { public PrefixFilter(Term prefix) { super(new PrefixQuery(prefix)); } public Term getPrefix() { return ((PrefixQuery)query).getPrefix(); } /** Prints a user-readable version of this query. */ public String toString () { StringBuffer buffer = new StringBuffer(); buffer.append("PrefixFilter("); buffer.append(getPrefix().toString()); buffer.append(")"); return buffer.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/search/ScoreDoc.java0000644000175000017500000000257311474320224024265 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Expert: Returned by low-level search implementations. * @see TopDocs */ public class ScoreDoc implements java.io.Serializable { /** Expert: The score of this document for the query. */ public float score; /** Expert: A hit document's number. * @see Searcher#doc(int) */ public int doc; /** Expert: Constructs a ScoreDoc. */ public ScoreDoc(int doc, float score) { this.doc = doc; this.score = score; } // A convenience method for debugging. public String toString() { return "doc=" + doc + " score=" + score; } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldSortedHitQueue.java0000644000175000017500000004422611474320224026443 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; import java.io.IOException; import java.text.Collator; import java.util.Locale; /** * Expert: A hit queue for sorting by hits by terms in more than one field. * Uses FieldCache.DEFAULT for maintaining internal term lookup tables. * *

    Created: Dec 8, 2003 12:56:03 PM * * @since lucene 1.4 * @version $Id: FieldSortedHitQueue.java 811070 2009-09-03 18:31:41Z hossman $ * @see Searcher#search(Query,Filter,int,Sort) * @see FieldCache * @deprecated see {@link FieldValueHitQueue} */ public class FieldSortedHitQueue extends PriorityQueue { /** * Creates a hit queue sorted by the given list of fields. * @param reader Index to use. * @param fields Fieldable names, in priority order (highest priority first). Cannot be null or empty. * @param size The number of hits to retain. Must be greater than zero. * @throws IOException */ public FieldSortedHitQueue (IndexReader reader, SortField[] fields, int size) throws IOException { final int n = fields.length; comparators = new ScoreDocComparator[n]; this.fields = new SortField[n]; for (int i=0; ia is less relevant than b. * @param a ScoreDoc * @param b ScoreDoc * @return true if document a should be sorted after document b. */ protected boolean lessThan (final Object a, final Object b) { final ScoreDoc docA = (ScoreDoc) a; final ScoreDoc docB = (ScoreDoc) b; // run comparators final int n = comparators.length; int c = 0; for (int i=0; i docB.doc; return c > 0; } /** * Given a FieldDoc object, stores the values used * to sort the given document. These values are not the raw * values out of the index, but the internal representation * of them. This is so the given search hit can be collated * by a MultiSearcher with other search hits. * @param doc The FieldDoc to store sort values into. * @return The same FieldDoc passed in. * @see Searchable#search(Weight,Filter,int,Sort) */ FieldDoc fillFields (final FieldDoc doc) { final int n = comparators.length; final Comparable[] fields = new Comparable[n]; for (int i=0; i 1.0f) doc.score /= maxscore; // normalize scores return doc; } /** Returns the SortFields being used by this hit queue. */ SortField[] getFields() { return fields; } static ScoreDocComparator getCachedComparator (IndexReader reader, String field, int type, FieldCache.Parser parser, Locale locale, SortComparatorSource factory) throws IOException { if (type == SortField.DOC) return ScoreDocComparator.INDEXORDER; if (type == SortField.SCORE) return ScoreDocComparator.RELEVANCE; FieldCacheImpl.Entry entry = (factory != null) ? new FieldCacheImpl.Entry (field, factory) : ( (parser != null) ? new FieldCacheImpl.Entry (field, type, parser) : new FieldCacheImpl.Entry (field, type, locale) ); return (ScoreDocComparator)Comparators.get(reader, entry); } /** Internal cache of comparators. Similar to FieldCache, only * caches comparators instead of term values. 
*/ static final FieldCacheImpl.Cache Comparators = new FieldCacheImpl.Cache() { protected Object createValue(IndexReader reader, FieldCacheImpl.Entry entryKey) throws IOException { FieldCacheImpl.Entry entry = (FieldCacheImpl.Entry) entryKey; String fieldname = entry.field; int type = entry.type; Locale locale = entry.locale; FieldCache.Parser parser = null; SortComparatorSource factory = null; if (entry.custom instanceof SortComparatorSource) { factory = (SortComparatorSource) entry.custom; } else { parser = (FieldCache.Parser) entry.custom; } ScoreDocComparator comparator; switch (type) { case SortField.AUTO: comparator = comparatorAuto (reader, fieldname); break; case SortField.INT: comparator = comparatorInt (reader, fieldname, (FieldCache.IntParser)parser); break; case SortField.FLOAT: comparator = comparatorFloat (reader, fieldname, (FieldCache.FloatParser)parser); break; case SortField.LONG: comparator = comparatorLong(reader, fieldname, (FieldCache.LongParser)parser); break; case SortField.DOUBLE: comparator = comparatorDouble(reader, fieldname, (FieldCache.DoubleParser)parser); break; case SortField.SHORT: comparator = comparatorShort(reader, fieldname, (FieldCache.ShortParser)parser); break; case SortField.BYTE: comparator = comparatorByte(reader, fieldname, (FieldCache.ByteParser)parser); break; case SortField.STRING: if (locale != null) comparator = comparatorStringLocale (reader, fieldname, locale); else comparator = comparatorString (reader, fieldname); break; case SortField.CUSTOM: comparator = factory.newComparator (reader, fieldname); break; default: throw new RuntimeException ("unknown field type: "+type); } return comparator; } }; /** * Returns a comparator for sorting hits according to a field containing bytes. * @param reader Index to use. * @param fieldname Fieldable containing integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorByte(final IndexReader reader, final String fieldname, final FieldCache.ByteParser parser) throws IOException { final String field = fieldname.intern(); final byte[] fieldOrder = FieldCache.DEFAULT.getBytes(reader, field, parser); return new ScoreDocComparator() { public final int compare (final ScoreDoc i, final ScoreDoc j) { final int fi = fieldOrder[i.doc]; final int fj = fieldOrder[j.doc]; if (fi < fj) return -1; if (fi > fj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Byte(fieldOrder[i.doc]); } public int sortType() { return SortField.BYTE; } }; } /** * Returns a comparator for sorting hits according to a field containing shorts. * @param reader Index to use. * @param fieldname Fieldable containing integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. 
*/ static ScoreDocComparator comparatorShort(final IndexReader reader, final String fieldname, final FieldCache.ShortParser parser) throws IOException { final String field = fieldname.intern(); final short[] fieldOrder = FieldCache.DEFAULT.getShorts(reader, field, parser); return new ScoreDocComparator() { public final int compare (final ScoreDoc i, final ScoreDoc j) { final int fi = fieldOrder[i.doc]; final int fj = fieldOrder[j.doc]; if (fi < fj) return -1; if (fi > fj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Short(fieldOrder[i.doc]); } public int sortType() { return SortField.SHORT; } }; } /** * Returns a comparator for sorting hits according to a field containing integers. * @param reader Index to use. * @param fieldname Fieldable containing integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorInt (final IndexReader reader, final String fieldname, final FieldCache.IntParser parser) throws IOException { final String field = fieldname.intern(); final int[] fieldOrder = FieldCache.DEFAULT.getInts(reader, field, parser); return new ScoreDocComparator() { public final int compare (final ScoreDoc i, final ScoreDoc j) { final int fi = fieldOrder[i.doc]; final int fj = fieldOrder[j.doc]; if (fi < fj) return -1; if (fi > fj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Integer (fieldOrder[i.doc]); } public int sortType() { return SortField.INT; } }; } /** * Returns a comparator for sorting hits according to a field containing integers. * @param reader Index to use. * @param fieldname Fieldable containing integer values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorLong (final IndexReader reader, final String fieldname, final FieldCache.LongParser parser) throws IOException { final String field = fieldname.intern(); final long[] fieldOrder = FieldCache.DEFAULT.getLongs (reader, field, parser); return new ScoreDocComparator() { public final int compare (final ScoreDoc i, final ScoreDoc j) { final long li = fieldOrder[i.doc]; final long lj = fieldOrder[j.doc]; if (li < lj) return -1; if (li > lj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Long(fieldOrder[i.doc]); } public int sortType() { return SortField.LONG; } }; } /** * Returns a comparator for sorting hits according to a field containing floats. * @param reader Index to use. * @param fieldname Fieldable containing float values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorFloat (final IndexReader reader, final String fieldname, final FieldCache.FloatParser parser) throws IOException { final String field = fieldname.intern(); final float[] fieldOrder = FieldCache.DEFAULT.getFloats (reader, field, parser); return new ScoreDocComparator () { public final int compare (final ScoreDoc i, final ScoreDoc j) { final float fi = fieldOrder[i.doc]; final float fj = fieldOrder[j.doc]; if (fi < fj) return -1; if (fi > fj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Float (fieldOrder[i.doc]); } public int sortType() { return SortField.FLOAT; } }; } /** * Returns a comparator for sorting hits according to a field containing doubles. * @param reader Index to use. * @param fieldname Fieldable containing float values. 
* @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorDouble(final IndexReader reader, final String fieldname, final FieldCache.DoubleParser parser) throws IOException { final String field = fieldname.intern(); final double[] fieldOrder = FieldCache.DEFAULT.getDoubles (reader, field, parser); return new ScoreDocComparator () { public final int compare (final ScoreDoc i, final ScoreDoc j) { final double di = fieldOrder[i.doc]; final double dj = fieldOrder[j.doc]; if (di < dj) return -1; if (di > dj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return new Double (fieldOrder[i.doc]); } public int sortType() { return SortField.DOUBLE; } }; } /** * Returns a comparator for sorting hits according to a field containing strings. * @param reader Index to use. * @param fieldname Fieldable containing string values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorString (final IndexReader reader, final String fieldname) throws IOException { final String field = fieldname.intern(); final FieldCache.StringIndex index = FieldCache.DEFAULT.getStringIndex (reader, field); return new ScoreDocComparator () { public final int compare (final ScoreDoc i, final ScoreDoc j) { final int fi = index.order[i.doc]; final int fj = index.order[j.doc]; if (fi < fj) return -1; if (fi > fj) return 1; return 0; } public Comparable sortValue (final ScoreDoc i) { return index.lookup[index.order[i.doc]]; } public int sortType() { return SortField.STRING; } }; } /** * Returns a comparator for sorting hits according to a field containing strings. * @param reader Index to use. * @param fieldname Fieldable containing string values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. */ static ScoreDocComparator comparatorStringLocale (final IndexReader reader, final String fieldname, final Locale locale) throws IOException { final Collator collator = Collator.getInstance (locale); final String field = fieldname.intern(); final String[] index = FieldCache.DEFAULT.getStrings (reader, field); return new ScoreDocComparator() { public final int compare(final ScoreDoc i, final ScoreDoc j) { String is = index[i.doc]; String js = index[j.doc]; if (is == js) { return 0; } else if (is == null) { return -1; } else if (js == null) { return 1; } else { return collator.compare(is, js); } } public Comparable sortValue (final ScoreDoc i) { return index[i.doc]; } public int sortType() { return SortField.STRING; } }; } /** * Returns a comparator for sorting hits according to values in the given field. * The terms in the field are looked at to determine whether they contain integers, * floats or strings. Once the type is determined, one of the other static methods * in this class is called to get the comparator. * @param reader Index to use. * @param fieldname Fieldable containing values. * @return Comparator for sorting hits. * @throws IOException If an error occurs reading the index. 
*/ static ScoreDocComparator comparatorAuto (final IndexReader reader, final String fieldname) throws IOException { final String field = fieldname.intern(); Object lookupArray = FieldCache.DEFAULT.getAuto (reader, field); if (lookupArray instanceof FieldCache.StringIndex) { return comparatorString (reader, field); } else if (lookupArray instanceof int[]) { return comparatorInt (reader, field, null); } else if (lookupArray instanceof long[]) { return comparatorLong (reader, field, null); } else if (lookupArray instanceof float[]) { return comparatorFloat (reader, field, null); } else if (lookupArray instanceof String[]) { return comparatorString (reader, field); } else { throw new RuntimeException ("unknown data type in field '"+field+"'"); } } } lucene-2.9.4/src/java/org/apache/lucene/search/MultiPhraseQuery.java0000644000175000017500000002634411474320224026051 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.*; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultipleTermPositions; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositions; import org.apache.lucene.util.ToStringUtils; /** * MultiPhraseQuery is a generalized version of PhraseQuery, with an added * method {@link #add(Term[])}. * To use this class, to search for the phrase "Microsoft app*" first use * add(Term) on the term "Microsoft", then find all terms that have "app" as * prefix using IndexReader.terms(Term), and use MultiPhraseQuery.add(Term[] * terms) to add them to the query. * * @version 1.0 */ public class MultiPhraseQuery extends Query { private String field; private ArrayList termArrays = new ArrayList(); private ArrayList positions = new ArrayList(); private int slop = 0; /** Sets the phrase slop for this query. * @see PhraseQuery#setSlop(int) */ public void setSlop(int s) { slop = s; } /** Sets the phrase slop for this query. * @see PhraseQuery#getSlop() */ public int getSlop() { return slop; } /** Add a single term at the next position in the phrase. * @see PhraseQuery#add(Term) */ public void add(Term term) { add(new Term[]{term}); } /** Add multiple terms at the next position in the phrase. Any of the terms * may match. * * @see PhraseQuery#add(Term) */ public void add(Term[] terms) { int position = 0; if (positions.size() > 0) position = ((Integer) positions.get(positions.size()-1)).intValue() + 1; add(terms, position); } /** * Allows to specify the relative position of terms within the phrase. 
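 *
 * A sketch of the "Microsoft app*" expansion described in the class comment
 * (illustrative only; the reader and the field name are assumptions):
 *
 *   MultiPhraseQuery mpq = new MultiPhraseQuery();
 *   mpq.add(new Term("body", "microsoft"));                 // first position
 *   List prefixed = new ArrayList();
 *   TermEnum te = reader.terms(new Term("body", "app"));
 *   do {
 *     Term t = te.term();
 *     if (t == null || !"body".equals(t.field()) || !t.text().startsWith("app")) break;
 *     prefixed.add(t);
 *   } while (te.next());
 *   te.close();
 *   mpq.add((Term[]) prefixed.toArray(new Term[prefixed.size()]));  // second position
 *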
* * @see PhraseQuery#add(Term, int) * @param terms * @param position */ public void add(Term[] terms, int position) { if (termArrays.size() == 0) field = terms[0].field(); for (int i = 0; i < terms.length; i++) { if (terms[i].field() != field) { throw new IllegalArgumentException( "All phrase terms must be in the same field (" + field + "): " + terms[i]); } } termArrays.add(terms); positions.add(new Integer(position)); } /** * Returns a List of the terms in the multiphrase. * Do not modify the List or its contents. */ public List getTermArrays() { return Collections.unmodifiableList(termArrays); } /** * Returns the relative positions of terms in this phrase. */ public int[] getPositions() { int[] result = new int[positions.size()]; for (int i = 0; i < positions.size(); i++) result[i] = ((Integer) positions.get(i)).intValue(); return result; } // inherit javadoc public void extractTerms(Set terms) { for (Iterator iter = termArrays.iterator(); iter.hasNext();) { Term[] arr = (Term[])iter.next(); for (int i=0; i 1) p = new MultipleTermPositions(reader, terms); else p = reader.termPositions(terms[0]); if (p == null) return null; tps[i] = p; } if (slop == 0) return new ExactPhraseScorer(this, tps, getPositions(), similarity, reader.norms(field)); else return new SloppyPhraseScorer(this, tps, getPositions(), similarity, slop, reader.norms(field)); } public Explanation explain(IndexReader reader, int doc) throws IOException { ComplexExplanation result = new ComplexExplanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); Explanation idfExpl = new Explanation(idf, "idf("+getQuery()+")"); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); Explanation boostExpl = new Explanation(getBoost(), "boost"); if (getBoost() != 1.0f) queryExpl.addDetail(boostExpl); queryExpl.addDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); queryExpl.addDetail(queryNormExpl); queryExpl.setValue(boostExpl.getValue() * idfExpl.getValue() * queryNormExpl.getValue()); result.addDetail(queryExpl); // explain field weight ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+ "), product of:"); Scorer scorer = scorer(reader, true, false); if (scorer == null) { return new Explanation(0.0f, "no matching docs"); } Explanation tfExpl = scorer.explain(doc); fieldExpl.addDetail(tfExpl); fieldExpl.addDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); float fieldNorm = fieldNorms!=null ? 
Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); fieldExpl.setValue(tfExpl.getValue() * idfExpl.getValue() * fieldNormExpl.getValue()); result.addDetail(fieldExpl); result.setMatch(fieldExpl.getMatch()); // combine them result.setValue(queryExpl.getValue() * fieldExpl.getValue()); if (queryExpl.getValue() == 1.0f) return fieldExpl; return result; } } public Query rewrite(IndexReader reader) { if (termArrays.size() == 1) { // optimize one-term case Term[] terms = (Term[])termArrays.get(0); BooleanQuery boq = new BooleanQuery(true); for (int i=0; i 1) { buffer.append("("); for (int j = 0; j < terms.length; j++) { buffer.append(terms[j].text()); if (j < terms.length-1) buffer.append(" "); } buffer.append(")"); } else { buffer.append(terms[0].text()); } if (i.hasNext()) buffer.append(" "); } buffer.append("\""); if (slop != 0) { buffer.append("~"); buffer.append(slop); } buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (!(o instanceof MultiPhraseQuery)) return false; MultiPhraseQuery other = (MultiPhraseQuery)o; return this.getBoost() == other.getBoost() && this.slop == other.slop && termArraysEquals(this.termArrays, other.termArrays) && this.positions.equals(other.positions); } /** Returns a hash code value for this object.*/ public int hashCode() { return Float.floatToIntBits(getBoost()) ^ slop ^ termArraysHashCode() ^ positions.hashCode() ^ 0x4AC65113; } // Breakout calculation of the termArrays hashcode private int termArraysHashCode() { int hashCode = 1; Iterator iterator = termArrays.iterator(); while (iterator.hasNext()) { Term[] termArray = (Term[]) iterator.next(); hashCode = 31 * hashCode + (termArray == null ? 0 : arraysHashCode(termArray)); } return hashCode; } private int arraysHashCode(Term[] termArray) { if (termArray == null) return 0; int result = 1; for (int i = 0; i < termArray.length; i++) { Term term = termArray[i]; result = 31 * result + (term == null ? 0 : term.hashCode()); } return result; } // Breakout calculation of the termArrays equals private boolean termArraysEquals(List termArrays1, List termArrays2) { if (termArrays1.size() != termArrays2.size()) { return false; } ListIterator iterator1 = termArrays1.listIterator(); ListIterator iterator2 = termArrays2.listIterator(); while (iterator1.hasNext()) { Term[] termArray1 = (Term[]) iterator1.next(); Term[] termArray2 = (Term[]) iterator2.next(); if (!(termArray1 == null ? termArray2 == null : Arrays.equals(termArray1, termArray2))) { return false; } } return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/SortComparator.java0000644000175000017500000000701311474320224025535 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; /** * Abstract base class for sorting hits returned by a Query. * *

    * This class should only be used if the other SortField types (SCORE, DOC, * STRING, INT, FLOAT) do not provide an adequate sorting. It maintains an * internal cache of values which could be quite large. The cache is an array of * Comparable, one for each document in the index. There is a distinct * Comparable for each unique term in the field - if some documents have the * same term in the field, the cache array will have entries which reference the * same Comparable. * * This class will be used as part of a key to a FieldCache value. You must * implement hashCode and equals to avoid an explosion in RAM usage if you use * instances that are not the same instance. If you are searching using the * Remote contrib, the same instance of this class on the client will be a new * instance on every call to the server, so hashCode/equals is very important in * that situation. * *
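 *
 * A minimal subclass sketch (assumes the field stores integers as text; the
 * field name is an assumption, and a real implementation should also define
 * hashCode and equals as noted above):
 *
 *   SortComparatorSource byNumericText = new SortComparator() {
 *     protected Comparable getComparable(String termtext) {
 *       return Integer.valueOf(termtext);
 *     }
 *   };
 *   Sort sort = new Sort(new SortField("size", byNumericText));
 *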

    * Created: Apr 21, 2004 5:08:38 PM * * * @version $Id: SortComparator.java 800119 2009-08-02 17:59:21Z markrmiller $ * @since 1.4 * @deprecated Please use {@link FieldComparatorSource} instead. */ public abstract class SortComparator implements SortComparatorSource { // inherit javadocs public ScoreDocComparator newComparator (final IndexReader reader, final String fieldname) throws IOException { final String field = fieldname.intern(); final Comparable[] cachedValues = FieldCache.DEFAULT.getCustom (reader, field, SortComparator.this); return new ScoreDocComparator() { public int compare (ScoreDoc i, ScoreDoc j) { return cachedValues[i.doc].compareTo (cachedValues[j.doc]); } public Comparable sortValue (ScoreDoc i) { return cachedValues[i.doc]; } public int sortType(){ return SortField.CUSTOM; } }; } /** * Returns an object which, when sorted according to natural order, * will order the Term values in the correct order. *

    For example, if the Terms contained integer values, this method * would return new Integer(termtext). Note that this * might not always be the most efficient implementation - for this * particular example, a better implementation might be to make a * ScoreDocLookupComparator that uses an internal lookup table of int. * @param termtext The textual value of the term. * @return An object representing termtext that sorts according to the natural order of termtext. * @see Comparable * @see ScoreDocComparator */ protected abstract Comparable getComparable (String termtext); } lucene-2.9.4/src/java/org/apache/lucene/search/MultiTermQuery.java0000644000175000017500000003733111474320224025534 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.queryParser.QueryParser; // for javadoc /** * An abstract {@link Query} that matches documents * containing a subset of terms provided by a {@link * FilteredTermEnum} enumeration. * *

    This query cannot be used directly; you must subclass * it and define {@link #getEnum} to provide a {@link * FilteredTermEnum} that iterates through the terms to be * matched. * *

    NOTE: if {@link #setRewriteMethod} is either * {@link #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} or {@link * #SCORING_BOOLEAN_QUERY_REWRITE}, you may encounter a * {@link BooleanQuery.TooManyClauses} exception during * searching, which happens when the number of terms to be * searched exceeds {@link * BooleanQuery#getMaxClauseCount()}. Setting {@link * #setRewriteMethod} to {@link #CONSTANT_SCORE_FILTER_REWRITE} * prevents this. * *
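     * For example (a sketch; the field name, prefix and searcher are made up), a prefix query that
     * might expand to very many terms can be switched to the filter rewrite before searching:
     *   PrefixQuery query = new PrefixQuery(new Term("contents", "app"));
     *   query.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
     *   TopDocs hits = searcher.search(query, 10);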

    The recommended rewrite method is {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU * computing unhelpful scores, and it tries to pick the most * performant rewrite method given the query. * * Note that {@link QueryParser} produces * MultiTermQueries using {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default. */ public abstract class MultiTermQuery extends Query { /* @deprecated move to sub class */ protected Term term; protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; /** Abstract class that defines how the query is rewritten. */ public static abstract class RewriteMethod implements Serializable { public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; } private static final class ConstantScoreFilterRewrite extends RewriteMethod implements Serializable { public Query rewrite(IndexReader reader, MultiTermQuery query) { Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); result.setBoost(query.getBoost()); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return CONSTANT_SCORE_FILTER_REWRITE; } } /** A rewrite method that first creates a private Filter, * by visiting each term in sequence and marking all docs * for that term. Matching documents are assigned a * constant score equal to the query's boost. * *

    This method is faster than the BooleanQuery * rewrite methods when the number of matched terms or * matched documents is non-trivial. Also, it will never * hit an errant {@link BooleanQuery.TooManyClauses} * exception. * * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite(); private static class ScoringBooleanQueryRewrite extends RewriteMethod implements Serializable { public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { FilteredTermEnum enumerator = query.getEnum(reader); BooleanQuery result = new BooleanQuery(true); int count = 0; try { do { Term t = enumerator.term(); if (t != null) { TermQuery tq = new TermQuery(t); // found a match tq.setBoost(query.getBoost() * enumerator.difference()); // set the boost result.add(tq, BooleanClause.Occur.SHOULD); // add to query count++; } } while (enumerator.next()); } finally { enumerator.close(); } query.incTotalNumberOfTerms(count); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return SCORING_BOOLEAN_QUERY_REWRITE; } } /** A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a * BooleanQuery, and keeps the scores as computed by the * query. Note that typically such scores are * meaningless to the user, and require non-trivial CPU * to compute, so it's almost always better to use {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. * *

    NOTE: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms * exceeds {@link BooleanQuery#getMaxClauseCount}. * * @see #setRewriteMethod */ public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite(); private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { // strip the scores off Query result = new ConstantScoreQuery(new QueryWrapperFilter(super.rewrite(reader, query))); result.setBoost(query.getBoost()); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE; } } /** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except * scores are not computed. Instead, each matching * document receives a constant score equal to the * query's boost. * *

    NOTE: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms * exceeds {@link BooleanQuery#getMaxClauseCount}. * * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite(); /** A rewrite method that tries to pick the best * constant-score rewrite method based on term and * document counts from the query. If both the number of * terms and documents is small enough, then {@link * #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used. * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is * used. */ public static class ConstantScoreAutoRewrite extends RewriteMethod implements Serializable { // Defaults derived from rough tests with a 20.0 million // doc Wikipedia index. With more than 350 terms in the // query, the filter method is fastest: public static int DEFAULT_TERM_COUNT_CUTOFF = 350; // If the query will hit more than 1 in 1000 of the docs // in the index (0.1%), the filter method is fastest: public static double DEFAULT_DOC_COUNT_PERCENT = 0.1; private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF; private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT; /** If the number of terms in this query is equal to or * larger than this setting then {@link * #CONSTANT_SCORE_FILTER_REWRITE} is used. */ public void setTermCountCutoff(int count) { termCountCutoff = count; } /** @see #setTermCountCutoff */ public int getTermCountCutoff() { return termCountCutoff; } /** If the number of documents to be visited in the * postings exceeds this specified percentage of the * maxDoc() for the index, then {@link * #CONSTANT_SCORE_FILTER_REWRITE} is used. * @param percent 0.0 to 100.0 */ public void setDocCountPercent(double percent) { docCountPercent = percent; } /** @see #setDocCountPercent */ public double getDocCountPercent() { return docCountPercent; } public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { // Get the enum and start visiting terms. If we // exhaust the enum before hitting either of the // cutoffs, we use ConstantBooleanQueryRewrite; else, // ConstantFilterRewrite: final Collection pendingTerms = new ArrayList(); final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); int docVisitCount = 0; FilteredTermEnum enumerator = query.getEnum(reader); try { while(true) { Term t = enumerator.term(); if (t != null) { pendingTerms.add(t); // Loading the TermInfo from the terms dict here // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: docVisitCount += reader.docFreq(t); } if (pendingTerms.size() >= termCountLimit || docVisitCount >= docCountCutoff) { // Too many terms -- make a filter. 
Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter(query)); result.setBoost(query.getBoost()); return result; } else if (!enumerator.next()) { // Enumeration is done, and we hit a small // enough number of terms & docs -- just make a // BooleanQuery, now Iterator it = pendingTerms.iterator(); BooleanQuery bq = new BooleanQuery(true); while(it.hasNext()) { TermQuery tq = new TermQuery((Term) it.next()); bq.add(tq, BooleanClause.Occur.SHOULD); } // Strip scores Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); query.incTotalNumberOfTerms(pendingTerms.size()); return result; } } } finally { enumerator.close(); } } public int hashCode() { final int prime = 1279; return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent)); } public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj; if (other.termCountCutoff != termCountCutoff) { return false; } if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) { return false; } return true; } } /** Read-only default instance of {@link * ConstantScoreAutoRewrite}, with {@link * ConstantScoreAutoRewrite#setTermCountCutoff} set to * {@link * ConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF} * and {@link * ConstantScoreAutoRewrite#setDocCountPercent} set to * {@link * ConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}. * Note that you cannot alter the configuration of this * instance; you'll need to create a private instance * instead. */ public final static RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new ConstantScoreAutoRewrite() { public void setTermCountCutoff(int count) { throw new UnsupportedOperationException("Please create a private instance"); } public void setDocCountPercent(double percent) { throw new UnsupportedOperationException("Please create a private instance"); } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; } }; /** * Constructs a query for terms matching term. * @deprecated check sub class for possible term access - the Term does not * make sense for all MultiTermQuerys and will be removed. */ public MultiTermQuery(Term term) { this.term = term; } /** * Constructs a query matching terms that cannot be represented with a single * Term. */ public MultiTermQuery() { } /** * Returns the pattern term. * @deprecated check sub class for possible term access - getTerm does not * make sense for all MultiTermQuerys and will be removed. */ public Term getTerm() { return term; } /** Construct the enumeration to be used, expanding the pattern term. */ protected abstract FilteredTermEnum getEnum(IndexReader reader) throws IOException; /** * Expert: Return the number of unique terms visited during execution of the query. * If there are many of them, you may consider using another query type * or optimize your total term count in index. *

     This method is not thread safe; be sure to call it only when no query is running! * If you re-use the same query instance for another * search, be sure to first reset the term counter * with {@link #clearTotalNumberOfTerms}.
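     * A typical pattern (sketch; the searcher is assumed to exist and the query to be some
     * MultiTermQuery subclass such as a WildcardQuery):
     *   query.clearTotalNumberOfTerms();
     *   searcher.search(query, 10);
     *   int visited = query.getTotalNumberOfTerms();   // unique terms expanded by this search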

    On optimized indexes / no MultiReaders, you get the correct number of * unique terms for the whole index. Use this number to compare different queries. * For non-optimized indexes this number can also be achieved in * non-constant-score mode. In constant-score mode you get the total number of * terms seeked for all segments / sub-readers. * @see #clearTotalNumberOfTerms */ public int getTotalNumberOfTerms() { return numberOfTerms; } /** * Expert: Resets the counting of unique terms. * Do this before executing the query/filter. * @see #getTotalNumberOfTerms */ public void clearTotalNumberOfTerms() { numberOfTerms = 0; } protected void incTotalNumberOfTerms(int inc) { numberOfTerms += inc; } public Query rewrite(IndexReader reader) throws IOException { return rewriteMethod.rewrite(reader, this); } /* Prints a user-readable version of this query. * Implemented for back compat in case MultiTermQuery * subclasses do no implement. */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (term != null) { if (!term.field().equals(field)) { buffer.append(term.field()); buffer.append(":"); } buffer.append(term.text()); } else { buffer.append("termPattern:unknown"); } buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** * @see #setRewriteMethod */ public RewriteMethod getRewriteMethod() { return rewriteMethod; } /** * Sets the rewrite method to be used when executing the * query. You can use one of the four core methods, or * implement your own subclass of {@link RewriteMethod}. */ public void setRewriteMethod(RewriteMethod method) { rewriteMethod = method; } //@Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + Float.floatToIntBits(getBoost()); result = prime * result; result += rewriteMethod.hashCode(); return result; } //@Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; MultiTermQuery other = (MultiTermQuery) obj; if (Float.floatToIntBits(getBoost()) != Float.floatToIntBits(other.getBoost())) return false; if (!rewriteMethod.equals(other.rewriteMethod)) { return false; } return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/PrefixQuery.java0000644000175000017500000000527111474320224025045 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing terms with a specified prefix. A PrefixQuery * is built by QueryParser for input like app*. * *
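     * A minimal usage sketch (the field name, prefix and searcher are made up):
     *   Query query = new PrefixQuery(new Term("filename", "app"));
     *   TopDocs hits = searcher.search(query, 10);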

    This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. */ public class PrefixQuery extends MultiTermQuery { private Term prefix; /** Constructs a query for terms starting with prefix. */ public PrefixQuery(Term prefix) { super(prefix); //will be removed in 3.0 this.prefix = prefix; } /** Returns the prefix of this query. */ public Term getPrefix() { return prefix; } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new PrefixTermEnum(reader, prefix); } /** Prints a user-readable version of this query. */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (!prefix.field().equals(field)) { buffer.append(prefix.field()); buffer.append(":"); } buffer.append(prefix.text()); buffer.append('*'); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } //@Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((prefix == null) ? 0 : prefix.hashCode()); return result; } //@Override public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; PrefixQuery other = (PrefixQuery) obj; if (prefix == null) { if (other.prefix != null) return false; } else if (!prefix.equals(other.prefix)) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/ReqExclScorer.java0000644000175000017500000001063711474320224025305 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** A Scorer for queries with a required subscorer * and an excluding (prohibited) sub DocIdSetIterator. *
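     * Scorers of this kind typically result from boolean queries that combine a required
     * clause with a prohibited one, e.g. (hypothetical terms):
     *   BooleanQuery bq = new BooleanQuery();
     *   bq.add(new TermQuery(new Term("body", "lucene")), BooleanClause.Occur.MUST);
     *   bq.add(new TermQuery(new Term("body", "deprecated")), BooleanClause.Occur.MUST_NOT);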
    * This Scorer implements {@link Scorer#skipTo(int)}, * and it uses the skipTo() on the given scorers. */ class ReqExclScorer extends Scorer { private Scorer reqScorer; private DocIdSetIterator exclDisi; private int doc = -1; /** Construct a ReqExclScorer. * @param reqScorer The scorer that must match, except where * @param exclDisi indicates exclusion. */ public ReqExclScorer(Scorer reqScorer, DocIdSetIterator exclDisi) { super(null); // No similarity used. this.reqScorer = reqScorer; this.exclDisi = exclDisi; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (reqScorer == null) { return doc; } doc = reqScorer.nextDoc(); if (doc == NO_MORE_DOCS) { reqScorer = null; // exhausted, nothing left return doc; } if (exclDisi == null) { return doc; } return doc = toNonExcluded(); } /** Advance to non excluded doc. *
    On entry: *

      *
    • reqScorer != null, *
     • exclDisi != null, *
    • reqScorer was advanced once via next() or skipTo() * and reqScorer.doc() may still be excluded. *
    * Advances reqScorer a non excluded required doc, if any. * @return true iff there is a non excluded required doc. */ private int toNonExcluded() throws IOException { int exclDoc = exclDisi.docID(); int reqDoc = reqScorer.docID(); // may be excluded do { if (reqDoc < exclDoc) { return reqDoc; // reqScorer advanced to before exclScorer, ie. not excluded } else if (reqDoc > exclDoc) { exclDoc = exclDisi.advance(reqDoc); if (exclDoc == NO_MORE_DOCS) { exclDisi = null; // exhausted, no more exclusions return reqDoc; } if (exclDoc > reqDoc) { return reqDoc; // not excluded } } } while ((reqDoc = reqScorer.nextDoc()) != NO_MORE_DOCS); reqScorer = null; // exhausted, nothing left return NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return reqScorer.doc(); // reqScorer may be null when next() or skipTo() already return false } public int docID() { return doc; } /** Returns the score of the current document matching the query. * Initially invalid, until {@link #next()} is called the first time. * @return The score of the required scorer. */ public float score() throws IOException { return reqScorer.score(); // reqScorer may be null when next() or skipTo() already return false } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { if (reqScorer == null) { return doc = NO_MORE_DOCS; } if (exclDisi == null) { return doc = reqScorer.advance(target); } if (reqScorer.advance(target) == NO_MORE_DOCS) { reqScorer = null; return doc = NO_MORE_DOCS; } return doc = toNonExcluded(); } public Explanation explain(int doc) throws IOException { Explanation res = new Explanation(); if (exclDisi.advance(doc) == doc) { res.setDescription("excluded"); } else { res.setDescription("not excluded"); res.addDetail(reqScorer.explain(doc)); } return res; } } lucene-2.9.4/src/java/org/apache/lucene/search/Hit.java0000644000175000017500000000733311474320225023310 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search; import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; /** * Wrapper used by {@link HitIterator} to provide a lazily loaded hit * from {@link Hits}. * * @deprecated Use {@link TopScoreDocCollector} and {@link TopDocs} instead. Hits will be removed in Lucene 3.0. 
*/ public class Hit implements java.io.Serializable { private Document doc = null; private boolean resolved = false; private Hits hits = null; private int hitNumber; /** * Constructed from {@link HitIterator} * @param hits Hits returned from a search * @param hitNumber Hit index in Hits */ Hit(Hits hits, int hitNumber) { this.hits = hits; this.hitNumber = hitNumber; } /** * Returns document for this hit. * * @see Hits#doc(int) * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public Document getDocument() throws CorruptIndexException, IOException { if (!resolved) fetchTheHit(); return doc; } /** * Returns score for this hit. * * @see Hits#score(int) */ public float getScore() throws IOException { return hits.score(hitNumber); } /** * Returns id for this hit. * * @see Hits#id(int) */ public int getId() throws IOException { return hits.id(hitNumber); } private void fetchTheHit() throws CorruptIndexException, IOException { doc = hits.doc(hitNumber); resolved = true; } // provide some of the Document style interface (the simple stuff) /** * Returns the boost factor for this hit on any field of the underlying document. * * @see Document#getBoost() * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public float getBoost() throws CorruptIndexException, IOException { return getDocument().getBoost(); } /** * Returns the string value of the field with the given name if any exist in * this document, or null. If multiple fields exist with this name, this * method returns the first value added. If only binary fields with this name * exist, returns null. * * @see Document#get(String) * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public String get(String name) throws CorruptIndexException, IOException { return getDocument().get(name); } /** * Prints the parameters to be used to discover the promised result. */ public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("Hit<"); buffer.append(hits.toString()); buffer.append(" ["); buffer.append(hitNumber); buffer.append("] "); if (resolved) { buffer.append("resolved"); } else { buffer.append("unresolved"); } buffer.append(">"); return buffer.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/search/RangeQuery.java0000644000175000017500000001362711474320224024650 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.text.Collator; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; /** * A Query that matches documents within an exclusive range of terms. * *

     This query matches documents containing terms that fall within the * supplied range, according to {@link Term#compareTo(Term)}. It is not intended * for numerical ranges; use {@link NumericRangeQuery} instead. * *
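     * As a sketch of the recommended replacements (field names and bounds are made up):
     *   Query textRange = new TermRangeQuery("author", "adams", "baker", true, true);
     *   Query numRange = NumericRangeQuery.newIntRange("year", Integer.valueOf(1990), Integer.valueOf(2000), true, true);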

    This query uses {@linkplain * MultiTermQuery#SCORING_BOOLEAN_QUERY_REWRITE}. If you * want to change this, use the new {@link TermRangeQuery} * instead. * * @deprecated Use {@link TermRangeQuery} for term ranges or * {@link NumericRangeQuery} for numeric ranges instead. * This class will be removed in Lucene 3.0. */ public class RangeQuery extends Query { private final TermRangeQuery delegate; /** Constructs a query selecting all terms greater than * lowerTerm but less than upperTerm. * There must be at least one term and either term may be null, * in which case there is no bound on that side, but if there are * two terms, both terms must be for the same field. * * @param lowerTerm The Term at the lower end of the range * @param upperTerm The Term at the upper end of the range * @param inclusive If true, both lowerTerm and * upperTerm will themselves be included in the range. */ public RangeQuery(Term lowerTerm, Term upperTerm, boolean inclusive) { this(lowerTerm, upperTerm, inclusive, null); } /** Constructs a query selecting all terms greater than * lowerTerm but less than upperTerm. * There must be at least one term and either term may be null, * in which case there is no bound on that side, but if there are * two terms, both terms must be for the same field. *

    * If collator is not null, it will be used to decide whether * index terms are within the given range, rather than using the Unicode code * point order in which index terms are stored. *

    * WARNING: Using this constructor and supplying a non-null * value in the collator parameter will cause every single * index Term in the Field referenced by lowerTerm and/or upperTerm to be * examined. Depending on the number of index Terms in this Field, the * operation could be very slow. * * @param lowerTerm The Term at the lower end of the range * @param upperTerm The Term at the upper end of the range * @param inclusive If true, both lowerTerm and * upperTerm will themselves be included in the range. * @param collator The collator to use to collate index Terms, to determine * their membership in the range bounded by lowerTerm and * upperTerm. */ public RangeQuery(Term lowerTerm, Term upperTerm, boolean inclusive, Collator collator) { if (lowerTerm == null && upperTerm == null) throw new IllegalArgumentException("At least one term must be non-null"); if (lowerTerm != null && upperTerm != null && lowerTerm.field() != upperTerm.field()) throw new IllegalArgumentException("Both terms must have the same field"); delegate = new TermRangeQuery( (lowerTerm == null) ? upperTerm.field() : lowerTerm.field(), (lowerTerm == null) ? null : lowerTerm.text(), (upperTerm == null) ? null : upperTerm.text(), inclusive, inclusive, collator ); delegate.setRewriteMethod(TermRangeQuery.SCORING_BOOLEAN_QUERY_REWRITE); } public void setBoost(float b) { super.setBoost(b); delegate.setBoost(b); } public Query rewrite(IndexReader reader) throws IOException { return delegate.rewrite(reader); } /** Returns the field name for this query */ public String getField() { return delegate.getField(); } /** Returns the lower term of this range query. */ public Term getLowerTerm() { final String term = delegate.getLowerTerm(); return (term == null) ? null : new Term(getField(), term); } /** Returns the upper term of this range query. */ public Term getUpperTerm() { final String term = delegate.getUpperTerm(); return (term == null) ? null : new Term(getField(), term); } /** Returns true if the range query is inclusive */ public boolean isInclusive() { return delegate.includesLower() && delegate.includesUpper(); } /** Returns the collator used to determine range inclusion, if any. */ public Collator getCollator() { return delegate.getCollator(); } /** Prints a user-readable version of this query. */ public String toString(String field) { return delegate.toString(field); } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof RangeQuery)) return false; final RangeQuery other = (RangeQuery) o; return this.delegate.equals(other.delegate); } /** Returns a hash code value for this object.*/ public int hashCode() { return delegate.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/SloppyPhraseScorer.java0000644000175000017500000002167111474320224026373 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.TermPositions; import java.io.IOException; import java.util.HashMap; final class SloppyPhraseScorer extends PhraseScorer { private int slop; private PhrasePositions repeats[]; private PhrasePositions tmpPos[]; // for flipping repeating pps. private boolean checkedRepeats; SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms) { super(weight, tps, offsets, similarity, norms); this.slop = slop; } /** * Score a candidate doc for all slop-valid position-combinations (matches) * encountered while traversing/hopping the PhrasePositions. *
    The score contribution of a match depends on the distance: *
    - highest score for distance=0 (exact match). *
    - score gets lower as distance gets higher. *
    Example: for query "a b"~2, a document "x a b a y" can be scored twice: * once for "a b" (distance=0), and once for "b a" (distance=2). *
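     (Assuming the default Similarity, where sloppyFreq(distance) = 1/(distance+1), those two
     matches would contribute 1.0 and 1/3 respectively to the phrase frequency.) *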
    Possibly not all valid combinations are encountered, because for efficiency * we always propagate the least PhrasePosition. This allows to base on * PriorityQueue and move forward faster. * As result, for example, document "a b c b a" * would score differently for queries "a b c"~4 and "c b a"~4, although * they really are equivalent. * Similarly, for doc "a b c b a f g", query "c b"~2 * would get same score as "g f"~2, although "c b"~2 could be matched twice. * We may want to fix this in the future (currently not, for performance reasons). */ protected final float phraseFreq() throws IOException { int end = initPhrasePositions(); float freq = 0.0f; boolean done = (end<0); while (!done) { PhrasePositions pp = (PhrasePositions) pq.pop(); int start = pp.position; int next = ((PhrasePositions) pq.top()).position; boolean tpsDiffer = true; for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) { if (pos<=next && tpsDiffer) start = pos; // advance pp to min window if (!pp.nextPosition()) { done = true; // ran out of a term -- done break; } PhrasePositions pp2 = null; tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null; if (pp2!=null && pp2!=pp) { pp = flip(pp,pp2); // flip pp to pp2 } } int matchLength = end - start; if (matchLength <= slop) freq += getSimilarity().sloppyFreq(matchLength); // score match if (pp.position > end) end = pp.position; pq.put(pp); // restore pq } return freq; } // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back. // assumes: pp!=pp2, pp2 in pq, pp not in pq. // called only when there are repeating pps. private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) { int n=0; PhrasePositions pp3; //pop until finding pp2 while ((pp3=(PhrasePositions)pq.pop()) != pp2) { tmpPos[n++] = pp3; } //insert back all but pp2 for (n--; n>=0; n--) { pq.insert(tmpPos[n]); } //insert pp back pq.put(pp); return pp2; } /** * Init PhrasePositions in place. * There is a one time initialization for this scorer: *
     - Put in repeats[] each pp that has another pp with the same position in the doc. *
    - Also mark each such pp by pp.repeats = true. *
     Later, termPositionsDiffer(pp) can consult repeats[], making that check efficient. * In particular, this lets queries with no repetitions be scored with no overhead from this computation. *
    - Example 1 - query with no repetitions: "ho my"~2 *
    - Example 2 - query with repetitions: "ho my my"~2 *
    - Example 3 - query with repetitions: "my ho my"~2 *
    Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection. * @return end (max position), or -1 if any term ran out (i.e. done) * @throws IOException */ private int initPhrasePositions() throws IOException { int end = 0; // no repeats at all (most common case is also the simplest one) if (checkedRepeats && repeats==null) { // build queue from list pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { pp.firstPosition(); if (pp.position > end) end = pp.position; pq.put(pp); // build pq from list } return end; } // position the pp's for (PhrasePositions pp = first; pp != null; pp = pp.next) pp.firstPosition(); // one time initializatin for this scorer if (!checkedRepeats) { checkedRepeats = true; // check for repeats HashMap m = null; for (PhrasePositions pp = first; pp != null; pp = pp.next) { int tpPos = pp.position + pp.offset; for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) { int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) { if (m == null) m = new HashMap(); pp.repeats = true; pp2.repeats = true; m.put(pp,null); m.put(pp2,null); } } } if (m!=null) repeats = (PhrasePositions[]) m.keySet().toArray(new PhrasePositions[0]); } // with repeats must advance some repeating pp's so they all start with differing tp's if (repeats!=null) { for (int i = 0; i < repeats.length; i++) { PhrasePositions pp = repeats[i]; PhrasePositions pp2; while ((pp2 = termPositionsDiffer(pp)) != null) { if (!pp2.nextPosition()) // out of pps that do not differ, advance the pp with higher offset return -1; // ran out of a term -- done } } } // build queue from list pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { if (pp.position > end) end = pp.position; pq.put(pp); // build pq from list } if (repeats!=null) { tmpPos = new PhrasePositions[pq.size()]; } return end; } /** * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences * in the query of the same word would go elsewhere in the matched doc. * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions * out of the first two PPs found to not differ. */ private PhrasePositions termPositionsDiffer(PhrasePositions pp) { // efficiency note: a more efficient implementation could keep a map between repeating // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. // However this would complicate code, for a rather rare case, so choice is to compromise here. int tpPos = pp.position + pp.offset; for (int i = 0; i < repeats.length; i++) { PhrasePositions pp2 = repeats[i]; if (pp2 == pp) continue; int tpPos2 = pp2.position + pp2.offset; if (tpPos2 == tpPos) return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset. } return null; } } lucene-2.9.4/src/java/org/apache/lucene/search/Hits.java0000644000175000017500000002106011474320224023463 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.Vector; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; /** A ranked list of documents, used to hold search results. *

    * Caution: Iterate only over the hits needed. Iterating over all * hits is generally not desirable and may be the source of * performance issues. If you need to iterate over many or all hits, consider * using the search method that takes a {@link HitCollector}. *

    *

     Note: Deleting matching documents concurrently with traversing the hits * may, when the deleted hits were not yet retrieved, decrease * {@link #length()}. In that case, * {@link java.util.ConcurrentModificationException ConcurrentModificationException} * is thrown when accessing hit n ≥ current_{@link #length()} * (but n < {@link #length()}_at_start). * * @deprecated * see {@link Searcher#search(Query, int)}, {@link Searcher#search(Query, Filter, int)} * and {@link Searcher#search(Query, Filter, int, Sort)}:
    *

     *   TopDocs topDocs = searcher.search(query, numHits);
     *   ScoreDoc[] hits = topDocs.scoreDocs;
     *   for (int i = 0; i < hits.length; i++) {
     *     int docId = hits[i].doc;
     *     Document d = searcher.doc(docId);
     *     // do something with current hit
     *     ...
     *   }
     * 
    */ public final class Hits { private Weight weight; private Searcher searcher; private Filter filter = null; private Sort sort = null; private int length; // the total number of hits private Vector hitDocs = new Vector(); // cache of hits retrieved private HitDoc first; // head of LRU cache private HitDoc last; // tail of LRU cache private int numDocs = 0; // number cached private int maxDocs = 200; // max to cache private int nDeletions; // # deleted docs in the index. private int lengthAtStart; // this is the number apps usually count on (although deletions can bring it down). private int nDeletedHits = 0; // # of already collected hits that were meanwhile deleted. boolean debugCheckedForDeletions = false; // for test purposes. Hits(Searcher s, Query q, Filter f) throws IOException { weight = q.weight(s); searcher = s; filter = f; nDeletions = countDeletions(s); getMoreDocs(50); // retrieve 100 initially lengthAtStart = length; } Hits(Searcher s, Query q, Filter f, Sort o) throws IOException { weight = q.weight(s); searcher = s; filter = f; sort = o; nDeletions = countDeletions(s); getMoreDocs(50); // retrieve 100 initially lengthAtStart = length; } // count # deletions, return -1 if unknown. private int countDeletions(Searcher s) throws IOException { int cnt = -1; if (s instanceof IndexSearcher) { cnt = s.maxDoc() - ((IndexSearcher) s).getIndexReader().numDocs(); } return cnt; } /** * Tries to add new documents to hitDocs. * Ensures that the hit numbered min has been retrieved. */ private final void getMoreDocs(int min) throws IOException { if (hitDocs.size() > min) { min = hitDocs.size(); } int n = min * 2; // double # retrieved TopDocs topDocs = (sort == null) ? searcher.search(weight, filter, n) : searcher.search(weight, filter, n, sort); length = topDocs.totalHits; ScoreDoc[] scoreDocs = topDocs.scoreDocs; float scoreNorm = 1.0f; if (length > 0 && topDocs.getMaxScore() > 1.0f) { scoreNorm = 1.0f / topDocs.getMaxScore(); } int start = hitDocs.size() - nDeletedHits; // any new deletions? int nDels2 = countDeletions(searcher); debugCheckedForDeletions = false; if (nDeletions < 0 || nDels2 > nDeletions) { // either we cannot count deletions, or some "previously valid hits" might have been deleted, so find exact start point nDeletedHits = 0; debugCheckedForDeletions = true; int i2 = 0; for (int i1=0; i1th document in this set. *

    Documents are cached, so that repeated requests for the same element may * return the same Document object. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public final Document doc(int n) throws CorruptIndexException, IOException { HitDoc hitDoc = hitDoc(n); // Update LRU cache of documents remove(hitDoc); // remove from list, if there addToFront(hitDoc); // add to front of list if (numDocs > maxDocs) { // if cache is full HitDoc oldLast = last; remove(last); // flush last oldLast.doc = null; // let doc get gc'd } if (hitDoc.doc == null) { hitDoc.doc = searcher.doc(hitDoc.id); // cache miss: read document } return hitDoc.doc; } /** Returns the score for the nth document in this set. */ public final float score(int n) throws IOException { return hitDoc(n).score; } /** Returns the id for the nth document in this set. * Note that ids may change when the index changes, so you cannot * rely on the id to be stable. */ public final int id(int n) throws IOException { return hitDoc(n).id; } /** * Returns a {@link HitIterator} to navigate the Hits. Each item returned * from {@link Iterator#next()} is a {@link Hit}. *

    * Caution: Iterate only over the hits needed. Iterating over all * hits is generally not desirable and may be the source of * performance issues. If you need to iterate over many or all hits, consider * using a search method that takes a {@link HitCollector}. *

    */ public Iterator iterator() { return new HitIterator(this); } private final HitDoc hitDoc(int n) throws IOException { if (n >= lengthAtStart) { throw new IndexOutOfBoundsException("Not a valid hit number: " + n); } if (n >= hitDocs.size()) { getMoreDocs(n); } if (n >= length) { throw new ConcurrentModificationException("Not a valid hit number: " + n); } return (HitDoc) hitDocs.elementAt(n); } private final void addToFront(HitDoc hitDoc) { // insert at front of cache if (first == null) { last = hitDoc; } else { first.prev = hitDoc; } hitDoc.next = first; first = hitDoc; hitDoc.prev = null; numDocs++; } private final void remove(HitDoc hitDoc) { // remove from cache if (hitDoc.doc == null) { // it's not in the list return; // abort } if (hitDoc.next == null) { last = hitDoc.prev; } else { hitDoc.next.prev = hitDoc.prev; } if (hitDoc.prev == null) { first = hitDoc.next; } else { hitDoc.prev.next = hitDoc.next; } numDocs--; } } final class HitDoc { float score; int id; Document doc = null; HitDoc next; // in doubly-linked cache HitDoc prev; // in doubly-linked cache HitDoc(float s, int i) { score = s; id = i; } } lucene-2.9.4/src/java/org/apache/lucene/search/TopDocCollector.java0000644000175000017500000000642511474320224025623 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.PriorityQueue; /** A {@link HitCollector} implementation that collects the top-scoring * documents, returning them as a {@link TopDocs}. This is used by {@link * IndexSearcher} to implement {@link TopDocs}-based search. * *
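     * A usage sketch (the searcher and query are assumed to exist):
     *   TopDocCollector collector = new TopDocCollector(10);
     *   searcher.search(query, collector);
     *   TopDocs topDocs = collector.topDocs();
     *   int totalHits = collector.getTotalHits();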

    This may be extended, overriding the collect method to, e.g., * conditionally invoke super() in order to filter which * documents are collected. * * @deprecated Please use {@link TopScoreDocCollector} * instead, which has better performance. **/ public class TopDocCollector extends HitCollector { private ScoreDoc reusableSD; /** The total number of hits the collector encountered. */ protected int totalHits; /** The priority queue which holds the top-scoring documents. */ protected PriorityQueue hq; /** Construct to collect a given number of hits. * @param numHits the maximum number of hits to collect */ public TopDocCollector(int numHits) { this(new HitQueue(numHits, false)); } /** @deprecated use TopDocCollector(hq) instead. numHits is not used by this * constructor. It will be removed in a future release. */ TopDocCollector(int numHits, PriorityQueue hq) { this.hq = hq; } /** Constructor to collect the top-scoring documents by using the given PQ. * @param hq the PQ to use by this instance. */ protected TopDocCollector(PriorityQueue hq) { this.hq = hq; } // javadoc inherited public void collect(int doc, float score) { if (score > 0.0f) { totalHits++; if (reusableSD == null) { reusableSD = new ScoreDoc(doc, score); } else if (score >= reusableSD.score) { // reusableSD holds the last "rejected" entry, so, if // this new score is not better than that, there's no // need to try inserting it reusableSD.doc = doc; reusableSD.score = score; } else { return; } reusableSD = (ScoreDoc) hq.insertWithOverflow(reusableSD); } } /** The total number of documents that matched this query. */ public int getTotalHits() { return totalHits; } /** The top-scoring hits. */ public TopDocs topDocs() { ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size()-1; i >= 0; i--) // put docs in array scoreDocs[i] = (ScoreDoc)hq.pop(); float maxScore = (totalHits==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score; return new TopDocs(totalHits, scoreDocs, maxScore); } } lucene-2.9.4/src/java/org/apache/lucene/search/PrefixTermEnum.java0000644000175000017500000000363711474320224025500 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * specified prefix filter term. *
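     * A usage sketch (the reader, field name and prefix are made up), using the same
     * do/while idiom as the rewrite code elsewhere in this package:
     *   FilteredTermEnum termEnum = new PrefixTermEnum(reader, new Term("title", "app"));
     *   try {
     *     do {
     *       Term t = termEnum.term();
     *       if (t == null) break;
     *       // every t here starts with "app" in field "title"
     *     } while (termEnum.next());
     *   } finally {
     *     termEnum.close();
     *   }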

    * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * */ public class PrefixTermEnum extends FilteredTermEnum { private final Term prefix; private boolean endEnum = false; public PrefixTermEnum(IndexReader reader, Term prefix) throws IOException { this.prefix = prefix; setEnum(reader.terms(new Term(prefix.field(), prefix.text()))); } public float difference() { return 1.0f; } protected boolean endEnum() { return endEnum; } protected Term getPrefixTerm() { return prefix; } protected boolean termCompare(Term term) { if (term.field() == prefix.field() && term.text().startsWith(prefix.text())) { return true; } endEnum = true; return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/WildcardTermEnum.java0000644000175000017500000001323411474320224025766 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * specified wildcard filter term. *

    * Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. * * @version $Id: WildcardTermEnum.java 988682 2010-08-24 19:30:07Z rmuir $ */ public class WildcardTermEnum extends FilteredTermEnum { final Term searchTerm; final String field; final String text; final String pre; final int preLen; boolean endEnum = false; /** * Creates a new WildcardTermEnum. *
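     * The supplied term's text may contain '*' (matches zero or more characters) and
     * '?' (matches exactly one character). For example (illustrative field and pattern):
     *   new WildcardTermEnum(reader, new Term("body", "te?t*"));   // enumerates e.g. "test", "texture"
     *   WildcardTermEnum.wildcardEquals("te?t*", 0, "tet", 0);     // false: "tet" is too short for te?t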

    * After calling the constructor the enumeration is already pointing to the first * valid term if such a term exists. */ public WildcardTermEnum(IndexReader reader, Term term) throws IOException { super(); searchTerm = term; field = searchTerm.field(); final String searchTermText = searchTerm.text(); final int sidx = searchTermText.indexOf(WILDCARD_STRING); final int cidx = searchTermText.indexOf(WILDCARD_CHAR); int idx = sidx; if (idx == -1) { idx = cidx; } else if (cidx >= 0) { idx = Math.min(idx, cidx); } pre = idx != -1?searchTerm.text().substring(0,idx): ""; preLen = pre.length(); text = searchTermText.substring(preLen); setEnum(reader.terms(new Term(searchTerm.field(), pre))); } protected final boolean termCompare(Term term) { if (field == term.field()) { String searchText = term.text(); if (searchText.startsWith(pre)) { return wildcardEquals(text, 0, searchText, preLen); } } endEnum = true; return false; } public float difference() { return 1.0f; } public final boolean endEnum() { return endEnum; } /******************************************** * String equality with support for wildcards ********************************************/ public static final char WILDCARD_STRING = '*'; public static final char WILDCARD_CHAR = '?'; /** * Determines if a word matches a wildcard pattern. * Work released by Granta Design Ltd after originally being done on * company time. */ public static final boolean wildcardEquals(String pattern, int patternIdx, String string, int stringIdx) { int p = patternIdx; for (int s = stringIdx; ; ++p, ++s) { // End of string yet? boolean sEnd = (s >= string.length()); // End of pattern yet? boolean pEnd = (p >= pattern.length()); // If we're looking at the end of the string... if (sEnd) { // Assume the only thing left on the pattern is/are wildcards boolean justWildcardsLeft = true; // Current wildcard position int wildcardSearchPos = p; // While we haven't found the end of the pattern, // and haven't encountered any non-wildcard characters while (wildcardSearchPos < pattern.length() && justWildcardsLeft) { // Check the character at the current position char wildchar = pattern.charAt(wildcardSearchPos); // If it's not a wildcard character, then there is more // pattern information after this/these wildcards. if (wildchar != WILDCARD_CHAR && wildchar != WILDCARD_STRING) { justWildcardsLeft = false; } else { // to prevent "cat" matches "ca??" if (wildchar == WILDCARD_CHAR) { return false; } // Look at the next character wildcardSearchPos++; } } // This was a prefix wildcard search, and we've matched, so // return true. if (justWildcardsLeft) { return true; } } // If we've gone past the end of the string, or the pattern, // return false. if (sEnd || pEnd) { break; } // Match a single character, so continue. if (pattern.charAt(p) == WILDCARD_CHAR) { continue; } // if (pattern.charAt(p) == WILDCARD_STRING) { // Look at the character beyond the '*' characters. while (p < pattern.length() && pattern.charAt(p) == WILDCARD_STRING) ++p; // Examine the string, starting at the last character. for (int i = string.length(); i >= s; --i) { if (wildcardEquals(pattern, p, string, i)) { return true; } } break; } if (pattern.charAt(p) != string.charAt(s)) { break; } } return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/DocIdSet.java0000644000175000017500000000451411474320224024217 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * A DocIdSet contains a set of doc ids. Implementing classes must * only implement {@link #iterator} to provide access to the set. */ public abstract class DocIdSet { /** An empty {@code DocIdSet} instance for easy use, e.g. in Filters that hit no documents. */ public static final DocIdSet EMPTY_DOCIDSET = new DocIdSet() { private final DocIdSetIterator iterator = new DocIdSetIterator() { public int advance(int target) throws IOException { return NO_MORE_DOCS; } public int docID() { return NO_MORE_DOCS; } public int nextDoc() throws IOException { return NO_MORE_DOCS; } }; public DocIdSetIterator iterator() { return iterator; } public boolean isCacheable() { return true; } }; /** Provides a {@link DocIdSetIterator} to access the set. * This implementation can return null or * {@linkplain #EMPTY_DOCIDSET}.iterator() if there * are no docs that match. */ public abstract DocIdSetIterator iterator() throws IOException; /** * This method is a hint for {@link CachingWrapperFilter}, if this DocIdSet * should be cached without copying it into a BitSet. The default is to return * false. If you have an own DocIdSet implementation * that does its iteration very effective and fast without doing disk I/O, * override this method and return true. */ public boolean isCacheable() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldCacheImpl.java0000644000175000017500000007066211474320225025362 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.WeakHashMap; import org.apache.lucene.document.NumericField; // javadoc import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.FieldCacheSanityChecker; /** * Expert: The default cache implementation, storing all values in memory. * A WeakHashMap is used for storage. * *
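 * <p>Normally reached through the {@link FieldCache#DEFAULT} singleton rather than
 * constructed directly. A minimal usage sketch (the field name "price" and the
 * reader variable are illustrative assumptions, not part of this class):
 * <pre>
 *   int[] prices = FieldCache.DEFAULT.getInts(reader, "price"); // reader is an open IndexReader
 * </pre>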

    Created: May 19, 2004 4:40:36 PM * * @since lucene 1.4 * @version $Id: FieldCacheImpl.java 957500 2010-06-24 10:37:32Z uschindler $ */ // TODO: change interface to FieldCache in 3.0 when removed class FieldCacheImpl implements ExtendedFieldCache { private Map caches; FieldCacheImpl() { init(); } private synchronized void init() { caches = new HashMap(7); caches.put(Byte.TYPE, new ByteCache(this)); caches.put(Short.TYPE, new ShortCache(this)); caches.put(Integer.TYPE, new IntCache(this)); caches.put(Float.TYPE, new FloatCache(this)); caches.put(Long.TYPE, new LongCache(this)); caches.put(Double.TYPE, new DoubleCache(this)); caches.put(String.class, new StringCache(this)); caches.put(StringIndex.class, new StringIndexCache(this)); caches.put(Comparable.class, new CustomCache(this)); caches.put(Object.class, new AutoCache(this)); } public void purgeAllCaches() { init(); } public void purge(IndexReader r) { Iterator it = caches.values().iterator(); while(it.hasNext()) { Cache c = (Cache) it.next(); c.purge(r); } } public CacheEntry[] getCacheEntries() { List result = new ArrayList(17); Iterator outerKeys = caches.keySet().iterator(); while (outerKeys.hasNext()) { Class cacheType = (Class)outerKeys.next(); Cache cache = (Cache)caches.get(cacheType); Iterator innerKeys = cache.readerCache.keySet().iterator(); while (innerKeys.hasNext()) { // we've now materialized a hard ref Object readerKey = innerKeys.next(); // innerKeys was backed by WeakHashMap, sanity check // that it wasn't GCed before we made hard ref if (null != readerKey && cache.readerCache.containsKey(readerKey)) { Map innerCache = ((Map)cache.readerCache.get(readerKey)); Iterator entrySetIterator = innerCache.entrySet().iterator(); while (entrySetIterator.hasNext()) { Map.Entry mapEntry = (Map.Entry) entrySetIterator.next(); Entry entry = (Entry) mapEntry.getKey(); result.add(new CacheEntryImpl(readerKey, entry.field, cacheType, entry.type, entry.custom, entry.locale, mapEntry.getValue())); } } } } return (CacheEntry[]) result.toArray(new CacheEntry[result.size()]); } private static final class CacheEntryImpl extends CacheEntry { /** * @deprecated Only needed because of Entry (ab)use by * FieldSortedHitQueue, remove when FieldSortedHitQueue * is removed */ private final int sortFieldType; /** * @deprecated Only needed because of Entry (ab)use by * FieldSortedHitQueue, remove when FieldSortedHitQueue * is removed */ private final Locale locale; private final Object readerKey; private final String fieldName; private final Class cacheType; private final Object custom; private final Object value; CacheEntryImpl(Object readerKey, String fieldName, Class cacheType, int sortFieldType, Object custom, Locale locale, Object value) { this.readerKey = readerKey; this.fieldName = fieldName; this.cacheType = cacheType; this.sortFieldType = sortFieldType; this.custom = custom; this.locale = locale; this.value = value; // :HACK: for testing. 
// if (null != locale || SortField.CUSTOM != sortFieldType) { // throw new RuntimeException("Locale/sortFieldType: " + this); // } } public Object getReaderKey() { return readerKey; } public String getFieldName() { return fieldName; } public Class getCacheType() { return cacheType; } public Object getCustom() { return custom; } public Object getValue() { return value; } /** * Adds warning to super.toString if Local or sortFieldType were specified * @deprecated Only needed because of Entry (ab)use by * FieldSortedHitQueue, remove when FieldSortedHitQueue * is removed */ public String toString() { String r = super.toString(); if (null != locale) { r = r + "...!!!Locale:" + locale + "???"; } if (SortField.CUSTOM != sortFieldType) { r = r + "...!!!SortType:" + sortFieldType + "???"; } return r; } } /** * Hack: When thrown from a Parser (NUMERIC_UTILS_* ones), this stops * processing terms and returns the current FieldCache * array. */ static final class StopFillCacheException extends RuntimeException { } /** Expert: Internal cache. */ abstract static class Cache { Cache() { this.wrapper = null; } Cache(FieldCache wrapper) { this.wrapper = wrapper; } final FieldCache wrapper; final Map readerCache = new WeakHashMap(); protected abstract Object createValue(IndexReader reader, Entry key) throws IOException; /** Remove this reader from the cache, if present. */ public void purge(IndexReader r) { Object readerKey = r.getFieldCacheKey(); synchronized(readerCache) { readerCache.remove(readerKey); } } public Object get(IndexReader reader, Entry key) throws IOException { Map innerCache; Object value; final Object readerKey = reader.getFieldCacheKey(); synchronized (readerCache) { innerCache = (Map) readerCache.get(readerKey); if (innerCache == null) { innerCache = new HashMap(); readerCache.put(readerKey, innerCache); value = null; } else { value = innerCache.get(key); } if (value == null) { value = new CreationPlaceholder(); innerCache.put(key, value); } } if (value instanceof CreationPlaceholder) { synchronized (value) { CreationPlaceholder progress = (CreationPlaceholder) value; if (progress.value == null) { progress.value = createValue(reader, key); synchronized (readerCache) { innerCache.put(key, progress.value); } // Only check if key.custom (the parser) is // non-null; else, we check twice for a single // call to FieldCache.getXXX if (key.custom != null && wrapper != null) { final PrintStream infoStream = wrapper.getInfoStream(); if (infoStream != null) { printNewInsanity(infoStream, progress.value); } } } return progress.value; } } return value; } private void printNewInsanity(PrintStream infoStream, Object value) { final FieldCacheSanityChecker.Insanity[] insanities = FieldCacheSanityChecker.checkSanity(wrapper); for(int i=0;i= mterms.length) break; // store term text mterms[t] = term.text(); termDocs.seek (termEnum); while (termDocs.next()) { retArray[termDocs.doc()] = t; } t++; } while (termEnum.next()); } finally { termDocs.close(); termEnum.close(); } if (t == 0) { // if there are no terms, make the term array // have a single null entry mterms = new String[1]; } else if (t < mterms.length) { // if there are less terms than documents, // trim off the dead array space String[] terms = new String[t]; System.arraycopy (mterms, 0, terms, 0, t); mterms = terms; } StringIndex value = new StringIndex (retArray, mterms); return value; } }; /** The pattern used to detect integer values in a field */ /** removed for java 1.3 compatibility protected static final Pattern pIntegers = Pattern.compile 
("[0-9\\-]+"); **/ /** The pattern used to detect float values in a field */ /** * removed for java 1.3 compatibility * protected static final Object pFloats = Pattern.compile ("[0-9+\\-\\.eEfFdD]+"); */ // inherit javadocs public Object getAuto(IndexReader reader, String field) throws IOException { return ((Cache)caches.get(Object.class)).get(reader, new Entry(field, (Parser)null)); } /** * @deprecated Please specify the exact type, instead. * Especially, guessing does not work with the new * {@link NumericField} type. */ static final class AutoCache extends Cache { AutoCache(FieldCache wrapper) { super(wrapper); } protected Object createValue(IndexReader reader, Entry entryKey) throws IOException { String field = StringHelper.intern((String) entryKey.field); TermEnum enumerator = reader.terms (new Term (field)); try { Term term = enumerator.term(); if (term == null) { throw new RuntimeException ("no terms in field " + field + " - cannot determine type"); } Object ret = null; if (term.field() == field) { String termtext = term.text().trim(); try { Integer.parseInt (termtext); ret = wrapper.getInts (reader, field); } catch (NumberFormatException nfe1) { try { Long.parseLong(termtext); ret = wrapper.getLongs (reader, field); } catch (NumberFormatException nfe2) { try { Float.parseFloat (termtext); ret = wrapper.getFloats (reader, field); } catch (NumberFormatException nfe3) { ret = wrapper.getStringIndex (reader, field); } } } } else { throw new RuntimeException ("field \"" + field + "\" does not appear to be indexed"); } return ret; } finally { enumerator.close(); } } }; /** @deprecated */ public Comparable[] getCustom(IndexReader reader, String field, SortComparator comparator) throws IOException { return (Comparable[]) ((Cache)caches.get(Comparable.class)).get(reader, new Entry(field, comparator)); } /** @deprecated */ static final class CustomCache extends Cache { CustomCache(FieldCache wrapper) { super(wrapper); } protected Object createValue(IndexReader reader, Entry entryKey) throws IOException { Entry entry = (Entry) entryKey; String field = entry.field; SortComparator comparator = (SortComparator) entry.custom; final Comparable[] retArray = new Comparable[reader.maxDoc()]; TermDocs termDocs = reader.termDocs(); TermEnum termEnum = reader.terms (new Term (field)); try { do { Term term = termEnum.term(); if (term==null || term.field() != field) break; Comparable termval = comparator.getComparable (term.text()); termDocs.seek (termEnum); while (termDocs.next()) { retArray[termDocs.doc()] = termval; } } while (termEnum.next()); } finally { termDocs.close(); termEnum.close(); } return retArray; } }; private volatile PrintStream infoStream; public void setInfoStream(PrintStream stream) { infoStream = stream; } public PrintStream getInfoStream() { return infoStream; } } lucene-2.9.4/src/java/org/apache/lucene/search/ExactPhraseScorer.java0000644000175000017500000000366311474320225026153 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.*; final class ExactPhraseScorer extends PhraseScorer { ExactPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { super(weight, tps, offsets, similarity, norms); } protected final float phraseFreq() throws IOException { // sort list with pq pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { pp.firstPosition(); pq.put(pp); // build pq from list } pqToList(); // rebuild list from pq // for counting how many times the exact phrase is found in current document, // just count how many times all PhrasePosition's have exactly the same position. int freq = 0; do { // find position w/ all terms while (first.position < last.position) { // scan forward in first do { if (!first.nextPosition()) return freq; } while (first.position < last.position); firstToLast(); } freq++; // all equal: a match } while (last.nextPosition()); return freq; } } lucene-2.9.4/src/java/org/apache/lucene/search/BooleanClause.java0000644000175000017500000000655611474320224025305 0ustar janpascaljanpascalpackage org.apache.lucene.search; import org.apache.lucene.util.Parameter; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** A clause in a BooleanQuery. */ public class BooleanClause implements java.io.Serializable { /** Specifies how clauses are to occur in matching documents. */ public static final class Occur extends Parameter implements java.io.Serializable { private Occur(String name) { // typesafe enum pattern, no public constructor super(name); } public String toString() { if (this == MUST) return "+"; if (this == MUST_NOT) return "-"; return ""; } /** Use this operator for clauses that must appear in the matching documents. */ public static final Occur MUST = new Occur("MUST"); /** Use this operator for clauses that should appear in the * matching documents. For a BooleanQuery with no MUST * clauses one or more SHOULD clauses must match a document * for the BooleanQuery to match. * @see BooleanQuery#setMinimumNumberShouldMatch */ public static final Occur SHOULD = new Occur("SHOULD"); /** Use this operator for clauses that must not appear in the matching documents. * Note that it is not possible to search for queries that only consist * of a MUST_NOT clause. 
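   * <p>A sketch of a query that requires one term and excludes another (the field
   * and term values are illustrative only):
   * <pre>
   *   BooleanQuery query = new BooleanQuery();
   *   query.add(new TermQuery(new Term("body", "lucene")), BooleanClause.Occur.MUST);
   *   query.add(new TermQuery(new Term("body", "draft")), BooleanClause.Occur.MUST_NOT);
   * </pre>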
*/ public static final Occur MUST_NOT = new Occur("MUST_NOT"); } /** The query whose matching documents are combined by the boolean query. */ private Query query; private Occur occur; /** Constructs a BooleanClause. */ public BooleanClause(Query query, Occur occur) { this.query = query; this.occur = occur; } public Occur getOccur() { return occur; } public void setOccur(Occur occur) { this.occur = occur; } public Query getQuery() { return query; } public void setQuery(Query query) { this.query = query; } public boolean isProhibited() { return Occur.MUST_NOT.equals(occur); } public boolean isRequired() { return Occur.MUST.equals(occur); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (o == null || !(o instanceof BooleanClause)) return false; BooleanClause other = (BooleanClause)o; return this.query.equals(other.query) && this.occur.equals(other.occur); } /** Returns a hash code value for this object.*/ public int hashCode() { return query.hashCode() ^ (Occur.MUST.equals(occur)?1:0) ^ (Occur.MUST_NOT.equals(occur)?2:0); } public String toString() { return occur.toString() + query.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/search/RangeFilter.java0000644000175000017500000000774211474320224024771 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.text.Collator; /** * A Filter that restricts search results to a range of values in a given * field. * *

 * <p>This filter matches the documents looking for terms that fall into the
 * supplied range according to {@link String#compareTo(String)}. It is not intended
 * for numerical ranges, use {@link NumericRangeFilter} instead.
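 * <p>A minimal sketch of constructing and applying the filter (the field name,
 * the date bounds, and the searcher/query variables are illustrative assumptions):
 * <pre>
 *   Filter filter = new RangeFilter("modified", "20020101", "20021231", true, true);
 *   TopDocs top = searcher.search(query, filter, 10); // searcher is an IndexSearcher
 * </pre>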

    If you construct a large number of range filters with different ranges but on the * same field, {@link FieldCacheRangeFilter} may have significantly better performance. * * @deprecated Use {@link TermRangeFilter} for term ranges or * {@link NumericRangeFilter} for numeric ranges instead. * This class will be removed in Lucene 3.0. */ public class RangeFilter extends MultiTermQueryWrapperFilter { /** * @param fieldName The field this range applies to * @param lowerTerm The lower bound on this range * @param upperTerm The upper bound on this range * @param includeLower Does this range include the lower bound? * @param includeUpper Does this range include the upper bound? * @throws IllegalArgumentException if both terms are null or if * lowerTerm is null and includeLower is true (similar for upperTerm * and includeUpper) */ public RangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) { super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper)); } /** * WARNING: Using this constructor and supplying a non-null * value in the collator parameter will cause every single * index Term in the Field referenced by lowerTerm and/or upperTerm to be * examined. Depending on the number of index Terms in this Field, the * operation could be very slow. * * @param lowerTerm The lower bound on this range * @param upperTerm The upper bound on this range * @param includeLower Does this range include the lower bound? * @param includeUpper Does this range include the upper bound? * @param collator The collator to use when determining range inclusion; set * to null to use Unicode code point ordering instead of collation. * @throws IllegalArgumentException if both terms are null or if * lowerTerm is null and includeLower is true (similar for upperTerm * and includeUpper) */ public RangeFilter(String fieldName, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper, Collator collator) { super(new TermRangeQuery(fieldName, lowerTerm, upperTerm, includeLower, includeUpper, collator)); } /** * Constructs a filter for field fieldName matching * less than or equal to upperTerm. */ public static RangeFilter Less(String fieldName, String upperTerm) { return new RangeFilter(fieldName, null, upperTerm, false, true); } /** * Constructs a filter for field fieldName matching * greater than or equal to lowerTerm. */ public static RangeFilter More(String fieldName, String lowerTerm) { return new RangeFilter(fieldName, lowerTerm, null, true, false); } } lucene-2.9.4/src/java/org/apache/lucene/search/MatchAllDocsQuery.java0000644000175000017500000001132011474320224026076 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermDocs; import org.apache.lucene.util.ToStringUtils; import java.util.Set; import java.io.IOException; /** * A query that matches all documents. * */ public class MatchAllDocsQuery extends Query { public MatchAllDocsQuery() { this(null); } private final String normsField; /** * @param normsField Field used for normalization factor (document boost). Null if nothing. */ public MatchAllDocsQuery(String normsField) { this.normsField = normsField; } private class MatchAllScorer extends Scorer { final TermDocs termDocs; final float score; final byte[] norms; private int doc = -1; MatchAllScorer(IndexReader reader, Similarity similarity, Weight w, byte[] norms) throws IOException { super(similarity); this.termDocs = reader.termDocs(null); score = w.getValue(); this.norms = norms; } public Explanation explain(int doc) { return null; // not called... see MatchAllDocsWeight.explain() } /** @deprecated use {@link #docID()} instead. */ public int doc() { return termDocs.doc(); } public int docID() { return doc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { return doc = termDocs.next() ? termDocs.doc() : NO_MORE_DOCS; } public float score() { return norms == null ? score : score * Similarity.decodeNorm(norms[docID()]); } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { return doc = termDocs.skipTo(target) ? termDocs.doc() : NO_MORE_DOCS; } } private class MatchAllDocsWeight extends Weight { private Similarity similarity; private float queryWeight; private float queryNorm; public MatchAllDocsWeight(Searcher searcher) { this.similarity = searcher.getSimilarity(); } public String toString() { return "weight(" + MatchAllDocsQuery.this + ")"; } public Query getQuery() { return MatchAllDocsQuery.this; } public float getValue() { return queryWeight; } public float sumOfSquaredWeights() { queryWeight = getBoost(); return queryWeight * queryWeight; } public void normalize(float queryNorm) { this.queryNorm = queryNorm; queryWeight *= this.queryNorm; } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new MatchAllScorer(reader, similarity, this, normsField != null ? 
reader.norms(normsField) : null); } public Explanation explain(IndexReader reader, int doc) { // explain query weight Explanation queryExpl = new ComplexExplanation (true, getValue(), "MatchAllDocsQuery, product of:"); if (getBoost() != 1.0f) { queryExpl.addDetail(new Explanation(getBoost(),"boost")); } queryExpl.addDetail(new Explanation(queryNorm,"queryNorm")); return queryExpl; } } public Weight createWeight(Searcher searcher) { return new MatchAllDocsWeight(searcher); } public void extractTerms(Set terms) { } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("*:*"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public boolean equals(Object o) { if (!(o instanceof MatchAllDocsQuery)) return false; MatchAllDocsQuery other = (MatchAllDocsQuery) o; return this.getBoost() == other.getBoost(); } public int hashCode() { return Float.floatToIntBits(getBoost()) ^ 0x1AA71190; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/0000755000175000017500000000000011554106562023044 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/search/spans/Spans.java0000644000175000017500000000725011474320224024771 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; /** Expert: an enumeration of span matches. Used to implement span searching. * Each span represents a range of term positions within a document. Matches * are enumerated in order, by increasing document number, within that by * increasing start position and finally by increasing end position. */ public abstract class Spans { /** Move to the next match, returning true iff any such exists. */ public abstract boolean next() throws IOException; /** Skips to the first match beyond the current, whose document number is * greater than or equal to target.

   * <p>Returns true iff there is such a match.
   * <p>Behaves as if written:
   * <pre>
   *   boolean skipTo(int target) {
   *     do {
   *       if (!next())
   *         return false;
   *     } while (target > doc());
   *     return true;
   *   }
   * </pre>
    * Most implementations are considerably more efficient than that. */ public abstract boolean skipTo(int target) throws IOException; /** Returns the document number of the current match. Initially invalid. */ public abstract int doc(); /** Returns the start position of the current match. Initially invalid. */ public abstract int start(); /** Returns the end position of the current match. Initially invalid. */ public abstract int end(); /** * Returns the payload data for the current span. * This is invalid until {@link #next()} is called for * the first time. * This method must not be called more than once after each call * of {@link #next()}. However, most payloads are loaded lazily, * so if the payload data for the current position is not needed, * this method may not be called at all for performance reasons. An ordered * SpanQuery does not lazy load, so if you have payloads in your index and * you do not want ordered SpanNearQuerys to collect payloads, you can * disable collection with a constructor option.
   * <br>
   * Note that the return type is a collection, thus the ordering should not be relied upon.
   * <br>
    * WARNING: The status of the Payloads feature is experimental. * The APIs introduced here might change in the future and will not be * supported anymore in such a case. * * @return a List of byte arrays containing the data of this payload, otherwise null if isPayloadAvailable is false * @throws java.io.IOException */ // TODO: Remove warning after API has been finalized public abstract Collection/**/ getPayload() throws IOException; /** * Checks if a payload can be loaded at this position. *
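   * <p>A typical guard before loading payloads (sketch; spans stands for any
   * concrete Spans implementation positioned on a match):
   * <pre>
   *   if (spans.isPayloadAvailable()) {
   *     Collection payloads = spans.getPayload(); // elements are byte[]
   *   }
   * </pre>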

    * Payloads can only be loaded once per call to * {@link #next()}. * * @return true if there is a payload available at this position that can be loaded */ public abstract boolean isPayloadAvailable(); } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanQuery.java0000644000175000017500000000332411474320224025632 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Weight; /** Base class for span-based queries. */ public abstract class SpanQuery extends Query { /** Expert: Returns the matches for this query in an index. Used internally * to search for spans. */ public abstract Spans getSpans(IndexReader reader) throws IOException; /** Returns the name of the field matched by this query.*/ public abstract String getField(); /** Returns a collection of all terms matched by this query. * @deprecated use extractTerms instead * @see Query#extractTerms(Set) */ public abstract Collection getTerms(); public Weight createWeight(Searcher searcher) throws IOException { return new SpanWeight(this, searcher); } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanScorer.java0000644000175000017500000000672511474320224025772 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Similarity; /** * Public for extension only. 
*/ public class SpanScorer extends Scorer { protected Spans spans; protected Weight weight; protected byte[] norms; protected float value; /** @deprecated not needed anymore */ protected boolean firstTime = true; protected boolean more = true; protected int doc; protected float freq; protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(similarity); this.spans = spans; this.norms = norms; this.weight = weight; this.value = weight.getValue(); if (this.spans.next()) { doc = -1; } else { doc = NO_MORE_DOCS; more = false; } } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (!setFreqCurrentDoc()) { doc = NO_MORE_DOCS; } return doc; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { if (!more) { return doc = NO_MORE_DOCS; } if (spans.doc() < target) { // setFreqCurrentDoc() leaves spans.doc() ahead more = spans.skipTo(target); } if (!setFreqCurrentDoc()) { doc = NO_MORE_DOCS; } return doc; } protected boolean setFreqCurrentDoc() throws IOException { if (!more) { return false; } doc = spans.doc(); freq = 0.0f; do { int matchLength = spans.end() - spans.start(); freq += getSimilarity().sloppyFreq(matchLength); more = spans.next(); } while (more && (doc == spans.doc())); return true; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return doc; } public int docID() { return doc; } public float score() throws IOException { float raw = getSimilarity().tf(freq) * value; // raw score return norms == null? raw : raw * Similarity.decodeNorm(norms[doc]); // normalize } public Explanation explain(final int doc) throws IOException { Explanation tfExplanation = new Explanation(); int expDoc = advance(doc); float phraseFreq = (expDoc == doc) ? freq : 0.0f; tfExplanation.setValue(getSimilarity().tf(phraseFreq)); tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); return tfExplanation; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/TermSpans.java0000644000175000017500000000513111474320224025615 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositions; import java.io.IOException; import java.util.Collections; import java.util.Collection; /** * Expert: * Public for extension only */ public class TermSpans extends Spans { protected TermPositions positions; protected Term term; protected int doc; protected int freq; protected int count; protected int position; public TermSpans(TermPositions positions, Term term) throws IOException { this.positions = positions; this.term = term; doc = -1; } public boolean next() throws IOException { if (count == freq) { if (!positions.next()) { doc = Integer.MAX_VALUE; return false; } doc = positions.doc(); freq = positions.freq(); count = 0; } position = positions.nextPosition(); count++; return true; } public boolean skipTo(int target) throws IOException { if (!positions.skipTo(target)) { doc = Integer.MAX_VALUE; return false; } doc = positions.doc(); freq = positions.freq(); count = 0; position = positions.nextPosition(); count++; return true; } public int doc() { return doc; } public int start() { return position; } public int end() { return position + 1; } // TODO: Remove warning after API has been finalized public Collection/**/ getPayload() throws IOException { byte [] bytes = new byte[positions.getPayloadLength()]; bytes = positions.getPayload(bytes, 0); return Collections.singletonList(bytes); } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return positions.isPayloadAvailable(); } public String toString() { return "spans(" + term.toString() + ")@" + (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position); } public TermPositions getPositions() { return positions; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanFirstQuery.java0000644000175000017500000001132211474320224026637 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Set; import java.util.ArrayList; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.util.ToStringUtils; /** Matches spans near the beginning of a field. */ public class SpanFirstQuery extends SpanQuery implements Cloneable { private SpanQuery match; private int end; /** Construct a SpanFirstQuery matching spans in match whose end * position is less than or equal to end. */ public SpanFirstQuery(SpanQuery match, int end) { this.match = match; this.end = end; } /** Return the SpanQuery whose matches are filtered. */ public SpanQuery getMatch() { return match; } /** Return the maximum end position permitted in a match. 
*/ public int getEnd() { return end; } public String getField() { return match.getField(); } /** Returns a collection of all terms matched by this query. * @deprecated use extractTerms instead * @see #extractTerms(Set) */ public Collection getTerms() { return match.getTerms(); } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("spanFirst("); buffer.append(match.toString(field)); buffer.append(", "); buffer.append(end); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public Object clone() { SpanFirstQuery spanFirstQuery = new SpanFirstQuery((SpanQuery) match.clone(), end); spanFirstQuery.setBoost(getBoost()); return spanFirstQuery; } public void extractTerms(Set terms) { match.extractTerms(terms); } public Spans getSpans(final IndexReader reader) throws IOException { return new Spans() { private Spans spans = match.getSpans(reader); public boolean next() throws IOException { while (spans.next()) { // scan to next match if (end() <= end) return true; } return false; } public boolean skipTo(int target) throws IOException { if (!spans.skipTo(target)) return false; return spans.end() <= end || next(); } public int doc() { return spans.doc(); } public int start() { return spans.start(); } public int end() { return spans.end(); } // TODO: Remove warning after API has been finalized public Collection/**/ getPayload() throws IOException { ArrayList result = null; if (spans.isPayloadAvailable()) { result = new ArrayList(spans.getPayload()); } return result;//TODO: any way to avoid the new construction? } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return spans.isPayloadAvailable(); } public String toString() { return "spans(" + SpanFirstQuery.this.toString() + ")"; } }; } public Query rewrite(IndexReader reader) throws IOException { SpanFirstQuery clone = null; SpanQuery rewritten = (SpanQuery) match.rewrite(reader); if (rewritten != match) { clone = (SpanFirstQuery) this.clone(); clone.match = rewritten; } if (clone != null) { return clone; // some clauses rewrote } else { return this; // no clauses rewrote } } public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof SpanFirstQuery)) return false; SpanFirstQuery other = (SpanFirstQuery)o; return this.end == other.end && this.match.equals(other.match) && this.getBoost() == other.getBoost(); } public int hashCode() { int h = match.hashCode(); h ^= (h << 8) | (h >>> 25); // reversible h ^= Float.floatToRawIntBits(getBoost()) ^ end; return h; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanOrQuery.java0000644000175000017500000001726411474320224026143 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.util.List; import java.util.Collection; import java.util.ArrayList; import java.util.Iterator; import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.search.Query; /** Matches the union of its clauses.*/ public class SpanOrQuery extends SpanQuery implements Cloneable { private List clauses; private String field; /** Construct a SpanOrQuery merging the provided clauses. */ public SpanOrQuery(SpanQuery[] clauses) { // copy clauses array into an ArrayList this.clauses = new ArrayList(clauses.length); for (int i = 0; i < clauses.length; i++) { SpanQuery clause = clauses[i]; if (i == 0) { // check field field = clause.getField(); } else if (!clause.getField().equals(field)) { throw new IllegalArgumentException("Clauses must have same field."); } this.clauses.add(clause); } } /** Return the clauses whose spans are matched. */ public SpanQuery[] getClauses() { return (SpanQuery[])clauses.toArray(new SpanQuery[clauses.size()]); } public String getField() { return field; } /** Returns a collection of all terms matched by this query. * @deprecated use extractTerms instead * @see #extractTerms(Set) */ public Collection getTerms() { Collection terms = new ArrayList(); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); terms.addAll(clause.getTerms()); } return terms; } public void extractTerms(Set terms) { Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); clause.extractTerms(terms); } } public Object clone() { int sz = clauses.size(); SpanQuery[] newClauses = new SpanQuery[sz]; for (int i = 0; i < sz; i++) { SpanQuery clause = (SpanQuery) clauses.get(i); newClauses[i] = (SpanQuery) clause.clone(); } SpanOrQuery soq = new SpanOrQuery(newClauses); soq.setBoost(getBoost()); return soq; } public Query rewrite(IndexReader reader) throws IOException { SpanOrQuery clone = null; for (int i = 0 ; i < clauses.size(); i++) { SpanQuery c = (SpanQuery)clauses.get(i); SpanQuery query = (SpanQuery) c.rewrite(reader); if (query != c) { // clause rewrote: must clone if (clone == null) clone = (SpanOrQuery) this.clone(); clone.clauses.set(i,query); } } if (clone != null) { return clone; // some clauses rewrote } else { return this; // no clauses rewrote } } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("spanOr(["); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); buffer.append(clause.toString(field)); if (i.hasNext()) { buffer.append(", "); } } buffer.append("])"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final SpanOrQuery that = (SpanOrQuery) o; if (!clauses.equals(that.clauses)) return false; if (!clauses.isEmpty() && !field.equals(that.field)) return false; return getBoost() == that.getBoost(); } public int hashCode() { int h = clauses.hashCode(); h ^= (h << 10) | (h >>> 23); h ^= Float.floatToRawIntBits(getBoost()); return h; } private class SpanQueue extends PriorityQueue { public SpanQueue(int size) { initialize(size); } protected final boolean lessThan(Object o1, Object o2) { Spans spans1 = (Spans)o1; Spans spans2 = (Spans)o2; if (spans1.doc() == spans2.doc()) { if (spans1.start() == spans2.start()) { return 
spans1.end() < spans2.end(); } else { return spans1.start() < spans2.start(); } } else { return spans1.doc() < spans2.doc(); } } } public Spans getSpans(final IndexReader reader) throws IOException { if (clauses.size() == 1) // optimize 1-clause case return ((SpanQuery)clauses.get(0)).getSpans(reader); return new Spans() { private SpanQueue queue = null; private boolean initSpanQueue(int target) throws IOException { queue = new SpanQueue(clauses.size()); Iterator i = clauses.iterator(); while (i.hasNext()) { Spans spans = ((SpanQuery)i.next()).getSpans(reader); if ( ((target == -1) && spans.next()) || ((target != -1) && spans.skipTo(target))) { queue.put(spans); } } return queue.size() != 0; } public boolean next() throws IOException { if (queue == null) { return initSpanQueue(-1); } if (queue.size() == 0) { // all done return false; } if (top().next()) { // move to next queue.adjustTop(); return true; } queue.pop(); // exhausted a clause return queue.size() != 0; } private Spans top() { return (Spans)queue.top(); } public boolean skipTo(int target) throws IOException { if (queue == null) { return initSpanQueue(target); } boolean skipCalled = false; while (queue.size() != 0 && top().doc() < target) { if (top().skipTo(target)) { queue.adjustTop(); } else { queue.pop(); } skipCalled = true; } if (skipCalled) { return queue.size() != 0; } return next(); } public int doc() { return top().doc(); } public int start() { return top().start(); } public int end() { return top().end(); } // TODO: Remove warning after API has been finalized public Collection/**/ getPayload() throws IOException { ArrayList result = null; Spans theTop = top(); if (theTop != null && theTop.isPayloadAvailable()) { result = new ArrayList(theTop.getPayload()); } return result; } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { Spans top = top(); return top != null && top.isPayloadAvailable(); } public String toString() { return "spans("+SpanOrQuery.this+")@"+ ((queue == null)?"START" :(queue.size()>0?(doc()+":"+start()+"-"+end()):"END")); } }; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanNotQuery.java0000644000175000017500000001622711474320224026321 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Set; /** Removes matches which overlap with another SpanQuery. 
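 * <p>A sketch of excluding overlaps (the field and term values are illustrative only;
 * both clauses must use the same field):
 * <pre>
 *   SpanQuery include = new SpanTermQuery(new Term("body", "lucene"));
 *   SpanQuery exclude = new SpanTermQuery(new Term("body", "apache"));
 *   SpanNotQuery query = new SpanNotQuery(include, exclude);
 * </pre>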
*/ public class SpanNotQuery extends SpanQuery implements Cloneable { private SpanQuery include; private SpanQuery exclude; /** Construct a SpanNotQuery matching spans from include which * have no overlap with spans from exclude.*/ public SpanNotQuery(SpanQuery include, SpanQuery exclude) { this.include = include; this.exclude = exclude; if (!include.getField().equals(exclude.getField())) throw new IllegalArgumentException("Clauses must have same field."); } /** Return the SpanQuery whose matches are filtered. */ public SpanQuery getInclude() { return include; } /** Return the SpanQuery whose matches must not overlap those returned. */ public SpanQuery getExclude() { return exclude; } public String getField() { return include.getField(); } /** Returns a collection of all terms matched by this query. * @deprecated use extractTerms instead * @see #extractTerms(Set) */ public Collection getTerms() { return include.getTerms(); } public void extractTerms(Set terms) { include.extractTerms(terms); } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("spanNot("); buffer.append(include.toString(field)); buffer.append(", "); buffer.append(exclude.toString(field)); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public Object clone() { SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),(SpanQuery) exclude.clone()); spanNotQuery.setBoost(getBoost()); return spanNotQuery; } public Spans getSpans(final IndexReader reader) throws IOException { return new Spans() { private Spans includeSpans = include.getSpans(reader); private boolean moreInclude = true; private Spans excludeSpans = exclude.getSpans(reader); private boolean moreExclude = excludeSpans.next(); public boolean next() throws IOException { if (moreInclude) // move to next include moreInclude = includeSpans.next(); while (moreInclude && moreExclude) { if (includeSpans.doc() > excludeSpans.doc()) // skip exclude moreExclude = excludeSpans.skipTo(includeSpans.doc()); while (moreExclude // while exclude is before && includeSpans.doc() == excludeSpans.doc() && excludeSpans.end() <= includeSpans.start()) { moreExclude = excludeSpans.next(); // increment exclude } if (!moreExclude // if no intersection || includeSpans.doc() != excludeSpans.doc() || includeSpans.end() <= excludeSpans.start()) break; // we found a match moreInclude = includeSpans.next(); // intersected: keep scanning } return moreInclude; } public boolean skipTo(int target) throws IOException { if (moreInclude) // skip include moreInclude = includeSpans.skipTo(target); if (!moreInclude) return false; if (moreExclude // skip exclude && includeSpans.doc() > excludeSpans.doc()) moreExclude = excludeSpans.skipTo(includeSpans.doc()); while (moreExclude // while exclude is before && includeSpans.doc() == excludeSpans.doc() && excludeSpans.end() <= includeSpans.start()) { moreExclude = excludeSpans.next(); // increment exclude } if (!moreExclude // if no intersection || includeSpans.doc() != excludeSpans.doc() || includeSpans.end() <= excludeSpans.start()) return true; // we found a match return next(); // scan to next match } public int doc() { return includeSpans.doc(); } public int start() { return includeSpans.start(); } public int end() { return includeSpans.end(); } // TODO: Remove warning after API has been finalized public Collection/**/ getPayload() throws IOException { ArrayList result = null; if (includeSpans.isPayloadAvailable()) { result = new 
ArrayList(includeSpans.getPayload()); } return result; } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return includeSpans.isPayloadAvailable(); } public String toString() { return "spans(" + SpanNotQuery.this.toString() + ")"; } }; } public Query rewrite(IndexReader reader) throws IOException { SpanNotQuery clone = null; SpanQuery rewrittenInclude = (SpanQuery) include.rewrite(reader); if (rewrittenInclude != include) { clone = (SpanNotQuery) this.clone(); clone.include = rewrittenInclude; } SpanQuery rewrittenExclude = (SpanQuery) exclude.rewrite(reader); if (rewrittenExclude != exclude) { if (clone == null) clone = (SpanNotQuery) this.clone(); clone.exclude = rewrittenExclude; } if (clone != null) { return clone; // some clauses rewrote } else { return this; // no clauses rewrote } } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof SpanNotQuery)) return false; SpanNotQuery other = (SpanNotQuery)o; return this.include.equals(other.include) && this.exclude.equals(other.exclude) && this.getBoost() == other.getBoost(); } public int hashCode() { int h = include.hashCode(); h = (h<<1) | (h >>> 31); // rotate left h ^= exclude.hashCode(); h = (h<<1) | (h >>> 31); // rotate left h ^= Float.floatToRawIntBits(getBoost()); return h; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/NearSpansUnordered.java0000644000175000017500000002125311474320224027446 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Set; import java.util.HashSet; /** * Similar to {@link NearSpansOrdered}, but for the unordered case. * * Expert: * Only public for subclassing. 
Most implementations should not need this class */ public class NearSpansUnordered extends Spans { private SpanNearQuery query; private List ordered = new ArrayList(); // spans in query order private Spans[] subSpans; private int slop; // from query private SpansCell first; // linked list of spans private SpansCell last; // sorted by doc only private int totalLength; // sum of current lengths private CellQueue queue; // sorted queue of spans private SpansCell max; // max element in queue private boolean more = true; // true iff not done private boolean firstTime = true; // true before first next() private class CellQueue extends PriorityQueue { public CellQueue(int size) { initialize(size); } protected final boolean lessThan(Object o1, Object o2) { SpansCell spans1 = (SpansCell)o1; SpansCell spans2 = (SpansCell)o2; if (spans1.doc() == spans2.doc()) { return NearSpansOrdered.docSpansOrdered(spans1, spans2); } else { return spans1.doc() < spans2.doc(); } } } /** Wraps a Spans, and can be used to form a linked list. */ private class SpansCell extends Spans { private Spans spans; private SpansCell next; private int length = -1; private int index; public SpansCell(Spans spans, int index) { this.spans = spans; this.index = index; } public boolean next() throws IOException { return adjust(spans.next()); } public boolean skipTo(int target) throws IOException { return adjust(spans.skipTo(target)); } private boolean adjust(boolean condition) { if (length != -1) { totalLength -= length; // subtract old length } if (condition) { length = end() - start(); totalLength += length; // add new length if (max == null || doc() > max.doc() || (doc() == max.doc()) && (end() > max.end())) { max = this; } } more = condition; return condition; } public int doc() { return spans.doc(); } public int start() { return spans.start(); } public int end() { return spans.end(); } // TODO: Remove warning after API has been finalized public Collection/**/ getPayload() throws IOException { return new ArrayList(spans.getPayload()); } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return spans.isPayloadAvailable(); } public String toString() { return spans.toString() + "#" + index; } } public NearSpansUnordered(SpanNearQuery query, IndexReader reader) throws IOException { this.query = query; this.slop = query.getSlop(); SpanQuery[] clauses = query.getClauses(); queue = new CellQueue(clauses.length); subSpans = new Spans[clauses.length]; for (int i = 0; i < clauses.length; i++) { SpansCell cell = new SpansCell(clauses[i].getSpans(reader), i); ordered.add(cell); subSpans[i] = cell.spans; } } public Spans[] getSubSpans() { return subSpans; } public boolean next() throws IOException { if (firstTime) { initList(true); listToQueue(); // initialize queue firstTime = false; } else if (more) { if (min().next()) { // trigger further scanning queue.adjustTop(); // maintain queue } else { more = false; } } while (more) { boolean queueStale = false; if (min().doc() != max.doc()) { // maintain list queueToList(); queueStale = true; } // skip to doc w/ all clauses while (more && first.doc() < last.doc()) { more = first.skipTo(last.doc()); // skip first upto last firstToLast(); // and move it to the end queueStale = true; } if (!more) return false; // found doc w/ all clauses if (queueStale) { // maintain the queue listToQueue(); queueStale = false; } if (atMatch()) { return true; } more = min().next(); if (more) { queue.adjustTop(); // maintain queue } } return false; // no more matches } public 
boolean skipTo(int target) throws IOException { if (firstTime) { // initialize initList(false); for (SpansCell cell = first; more && cell!=null; cell=cell.next) { more = cell.skipTo(target); // skip all } if (more) { listToQueue(); } firstTime = false; } else { // normal case while (more && min().doc() < target) { // skip as needed if (min().skipTo(target)) { queue.adjustTop(); } else { more = false; } } } return more && (atMatch() || next()); } private SpansCell min() { return (SpansCell)queue.top(); } public int doc() { return min().doc(); } public int start() { return min().start(); } public int end() { return max.end(); } // TODO: Remove warning after API has been finalized /** * WARNING: The List is not necessarily in order of the the positions * @return Collection of byte[] payloads * @throws IOException */ public Collection/**/ getPayload() throws IOException { Set/* * The formed spans only contains minimum slop matches.
    * The matching slop is computed from the distance(s) between
    * the non overlapping matching Spans.
    * Successive matches are always formed from the successive Spans
    * of the SpanNearQuery.
    *
    * The formed spans may contain overlaps when the slop is at least 1.
    * For example, when querying using
    *
    *     t1 t2 t3
    *
    * with slop at least 1, the fragment:
    *
    *     t1 t2 t1 t3 t2 t3
    *
    * matches twice:
    *
    *     t1 t2 .. t3
    *          t1 .. t2 t3
    * * * Expert: * Only public for subclassing. Most implementations should not need this class */ public class NearSpansOrdered extends Spans { private final int allowedSlop; private boolean firstTime = true; private boolean more = false; /** The spans in the same order as the SpanNearQuery */ private final Spans[] subSpans; /** Indicates that all subSpans have same doc() */ private boolean inSameDoc = false; private int matchDoc = -1; private int matchStart = -1; private int matchEnd = -1; private List/**/ matchPayload; private final Spans[] subSpansByDoc; private final Comparator spanDocComparator = new Comparator() { public int compare(Object o1, Object o2) { return ((Spans)o1).doc() - ((Spans)o2).doc(); } }; private SpanNearQuery query; private boolean collectPayloads = true; public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException { this(spanNearQuery, reader, true); } public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads) throws IOException { if (spanNearQuery.getClauses().length < 2) { throw new IllegalArgumentException("Less than 2 clauses: " + spanNearQuery); } this.collectPayloads = collectPayloads; allowedSlop = spanNearQuery.getSlop(); SpanQuery[] clauses = spanNearQuery.getClauses(); subSpans = new Spans[clauses.length]; matchPayload = new LinkedList(); subSpansByDoc = new Spans[clauses.length]; for (int i = 0; i < clauses.length; i++) { subSpans[i] = clauses[i].getSpans(reader); subSpansByDoc[i] = subSpans[i]; // used in toSameDoc() } query = spanNearQuery; // kept for toString() only. } // inherit javadocs public int doc() { return matchDoc; } // inherit javadocs public int start() { return matchStart; } // inherit javadocs public int end() { return matchEnd; } public Spans[] getSubSpans() { return subSpans; } // TODO: Remove warning after API has been finalized // TODO: Would be nice to be able to lazy load payloads public Collection/**/ getPayload() throws IOException { return matchPayload; } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return matchPayload.isEmpty() == false; } // inherit javadocs public boolean next() throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].next()) { more = false; return false; } } more = true; } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } // inherit javadocs public boolean skipTo(int target) throws IOException { if (firstTime) { firstTime = false; for (int i = 0; i < subSpans.length; i++) { if (! subSpans[i].skipTo(target)) { more = false; return false; } } more = true; } else if (more && (subSpans[0].doc() < target)) { if (subSpans[0].skipTo(target)) { inSameDoc = false; } else { more = false; return false; } } if(collectPayloads) { matchPayload.clear(); } return advanceAfterOrdered(); } /** Advances the subSpans to just after an ordered match with a minimum slop * that is smaller than the slop allowed by the SpanNearQuery. * @return true iff there is such a match. 
*/ private boolean advanceAfterOrdered() throws IOException { while (more && (inSameDoc || toSameDoc())) { if (stretchToOrder() && shrinkToAfterShortestMatch()) { return true; } } return false; // no more matches } /** Advance the subSpans to the same document */ private boolean toSameDoc() throws IOException { Arrays.sort(subSpansByDoc, spanDocComparator); int firstIndex = 0; int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc(); while (subSpansByDoc[firstIndex].doc() != maxDoc) { if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) { more = false; inSameDoc = false; return false; } maxDoc = subSpansByDoc[firstIndex].doc(); if (++firstIndex == subSpansByDoc.length) { firstIndex = 0; } } for (int i = 0; i < subSpansByDoc.length; i++) { assert (subSpansByDoc[i].doc() == maxDoc) : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0] + "\n at doc " + subSpansByDoc[i].doc() + ", but should be at " + maxDoc; } inSameDoc = true; return true; } /** Check whether two Spans in the same document are ordered. * @param spans1 * @param spans2 * @return true iff spans1 starts before spans2 * or the spans start at the same position, * and spans1 ends before spans2. */ static final boolean docSpansOrdered(Spans spans1, Spans spans2) { assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc(); int start1 = spans1.start(); int start2 = spans2.start(); /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */ return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2); } /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans * starts and ends as parameters. */ private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) { return (start1 == start2) ? (end1 < end2) : (start1 < start2); } /** Order the subSpans within the same document by advancing all later spans * after the previous one. */ private boolean stretchToOrder() throws IOException { matchDoc = subSpans[0].doc(); for (int i = 1; inSameDoc && (i < subSpans.length); i++) { while (! docSpansOrdered(subSpans[i-1], subSpans[i])) { if (! subSpans[i].next()) { inSameDoc = false; more = false; break; } else if (matchDoc != subSpans[i].doc()) { inSameDoc = false; break; } } } return inSameDoc; } /** The subSpans are ordered in the same doc, so there is a possible match. * Compute the slop while making the match as short as possible by advancing * all subSpans except the last one in reverse order. */ private boolean shrinkToAfterShortestMatch() throws IOException { matchStart = subSpans[subSpans.length - 1].start(); matchEnd = subSpans[subSpans.length - 1].end(); Set possibleMatchPayloads = new HashSet(); if (subSpans[subSpans.length - 1].isPayloadAvailable()) { possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload()); } Collection possiblePayload = null; int matchSlop = 0; int lastStart = matchStart; int lastEnd = matchEnd; for (int i = subSpans.length - 2; i >= 0; i--) { Spans prevSpans = subSpans[i]; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection payload = prevSpans.getPayload(); possiblePayload = new ArrayList(payload.size()); possiblePayload.addAll(payload); } int prevStart = prevSpans.start(); int prevEnd = prevSpans.end(); while (true) { // Advance prevSpans until after (lastStart, lastEnd) if (! prevSpans.next()) { inSameDoc = false; more = false; break; // Check remaining subSpans for final match. } else if (matchDoc != prevSpans.doc()) { inSameDoc = false; // The last subSpans is not advanced here. 
break; // Check remaining subSpans for last match in this document. } else { int ppStart = prevSpans.start(); int ppEnd = prevSpans.end(); // Cannot avoid invoking .end() if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) { break; // Check remaining subSpans. } else { // prevSpans still before (lastStart, lastEnd) prevStart = ppStart; prevEnd = ppEnd; if (collectPayloads && prevSpans.isPayloadAvailable()) { Collection payload = prevSpans.getPayload(); possiblePayload = new ArrayList(payload.size()); possiblePayload.addAll(payload); } } } } if (collectPayloads && possiblePayload != null) { possibleMatchPayloads.addAll(possiblePayload); } assert prevStart <= matchStart; if (matchStart > prevEnd) { // Only non overlapping spans add to slop. matchSlop += (matchStart - prevEnd); } /* Do not break on (matchSlop > allowedSlop) here to make sure * that subSpans[0] is advanced after the match, if any. */ matchStart = prevStart; lastStart = prevStart; lastEnd = prevEnd; } boolean match = matchSlop <= allowedSlop; if(collectPayloads && match && possibleMatchPayloads.size() > 0) { matchPayload.addAll(possibleMatchPayloads); } return match; // ordered and allowed slop } public String toString() { return getClass().getName() + "("+query.toString()+")@"+ (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END")); } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanWeight.java0000644000175000017500000001100111474320224025743 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.*; import org.apache.lucene.search.Explanation.IDFExplanation; import java.io.IOException; import java.util.HashSet; import java.util.Set; /** * Expert-only. 
Public for use by other weight implementations */ public class SpanWeight extends Weight { protected Similarity similarity; protected float value; protected float idf; protected float queryNorm; protected float queryWeight; protected Set terms; protected SpanQuery query; private IDFExplanation idfExp; public SpanWeight(SpanQuery query, Searcher searcher) throws IOException { this.similarity = query.getSimilarity(searcher); this.query = query; terms=new HashSet(); query.extractTerms(terms); idfExp = similarity.idfExplain(terms, searcher); idf = idfExp.getIdf(); } public Query getQuery() { return query; } public float getValue() { return value; } public float sumOfSquaredWeights() throws IOException { queryWeight = idf * query.getBoost(); // compute query weight return queryWeight * queryWeight; // square it } public void normalize(float queryNorm) { this.queryNorm = queryNorm; queryWeight *= queryNorm; // normalize query weight value = queryWeight * idf; // idf for document } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new SpanScorer(query.getSpans(reader), this, similarity, reader .norms(query.getField())); } public Explanation explain(IndexReader reader, int doc) throws IOException { ComplexExplanation result = new ComplexExplanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); String field = ((SpanQuery)getQuery()).getField(); Explanation idfExpl = new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")"); // explain query weight Explanation queryExpl = new Explanation(); queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost"); if (getQuery().getBoost() != 1.0f) queryExpl.addDetail(boostExpl); queryExpl.addDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); queryExpl.addDetail(queryNormExpl); queryExpl.setValue(boostExpl.getValue() * idfExpl.getValue() * queryNormExpl.getValue()); result.addDetail(queryExpl); // explain field weight ComplexExplanation fieldExpl = new ComplexExplanation(); fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+ " in "+doc+"), product of:"); Explanation tfExpl = scorer(reader, true, false).explain(doc); fieldExpl.addDetail(tfExpl); fieldExpl.addDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); float fieldNorm = fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch())); fieldExpl.setValue(tfExpl.getValue() * idfExpl.getValue() * fieldNormExpl.getValue()); result.addDetail(fieldExpl); result.setMatch(fieldExpl.getMatch()); // combine them result.setValue(queryExpl.getValue() * fieldExpl.getValue()); if (queryExpl.getValue() == 1.0f) return fieldExpl; return result; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/package.html0000644000175000017500000000657711474320224025336 0ustar janpascaljanpascal The calculus of spans.

    A span is a <doc,startPosition,endPosition> tuple.

    The following span query operators are implemented:

    • A SpanTermQuery matches all spans containing a particular Term.
    • A SpanNearQuery matches spans which occur near one another, and can be used to implement things like phrase search (when constructed from SpanTermQueries) and inter-phrase proximity (when constructed from other SpanNearQueries).
    • A SpanOrQuery merges spans from a number of other SpanQueries.
    • A SpanNotQuery removes spans matching one SpanQuery which overlap another. This can be used, e.g., to implement within-paragraph search.
    • A SpanFirstQuery matches spans matching q whose end position is less than n. This can be used to constrain matches to the first part of the document.
    In all cases, output spans are minimally inclusive. In other words, a span formed by matching a span in x and y starts at the lesser of the two starts and ends at the greater of the two ends.
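    To make the SpanOrQuery and SpanNotQuery operators above concrete, here is a minimal, hypothetical sketch (the field name "content" and the terms used are assumptions chosen only for illustration):

    SpanQuery apache = new SpanTermQuery(new Term("content", "apache"));
    SpanQuery lucene = new SpanTermQuery(new Term("content", "lucene"));
    SpanQuery solr   = new SpanTermQuery(new Term("content", "solr"));

    // spans matching either "lucene" or "solr"
    SpanQuery luceneOrSolr = new SpanOrQuery(new SpanQuery[] {lucene, solr});

    // "apache" immediately followed by one of them, in order
    SpanQuery apacheProduct =
       new SpanNearQuery(new SpanQuery[] {apache, luceneOrSolr}, 0, true);

    // ... minus any such match that overlaps a span of "solr"
    SpanQuery apacheLuceneOnly = new SpanNotQuery(apacheProduct, solr);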

    For example, a span query which matches "John Kerry" within ten words of "George Bush" within the first 100 words of the document could be constructed with:

    SpanQuery john   = new SpanTermQuery(new Term("content", "john"));
    SpanQuery kerry  = new SpanTermQuery(new Term("content", "kerry"));
    SpanQuery george = new SpanTermQuery(new Term("content", "george"));
    SpanQuery bush   = new SpanTermQuery(new Term("content", "bush"));
    
    SpanQuery johnKerry =
       new SpanNearQuery(new SpanQuery[] {john, kerry}, 0, true);
    
    SpanQuery georgeBush =
       new SpanNearQuery(new SpanQuery[] {george, bush}, 0, true);
    
    SpanQuery johnKerryNearGeorgeBush =
       new SpanNearQuery(new SpanQuery[] {johnKerry, georgeBush}, 10, false);
    
    SpanQuery johnKerryNearGeorgeBushAtStart =
       new SpanFirstQuery(johnKerryNearGeorgeBush, 100);
    

    Span queries may be freely intermixed with other Lucene queries. So, for example, the above query can be restricted to documents which also use the word "iraq" with:

    BooleanQuery query = new BooleanQuery();
    query.add(johnKerryNearGeorgeBushAtStart, BooleanClause.Occur.MUST);
    query.add(new TermQuery(new Term("content", "iraq")), BooleanClause.Occur.MUST);
    
    lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanTermQuery.java0000644000175000017500000000534011474320224026462 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Set; /** Matches spans containing a term. */ public class SpanTermQuery extends SpanQuery { protected Term term; /** Construct a SpanTermQuery matching the named term's spans. */ public SpanTermQuery(Term term) { this.term = term; } /** Return the term whose spans are matched. */ public Term getTerm() { return term; } public String getField() { return term.field(); } /** Returns a collection of all terms matched by this query. * @deprecated use extractTerms instead * @see #extractTerms(Set) */ public Collection getTerms() { Collection terms = new ArrayList(); terms.add(term); return terms; } public void extractTerms(Set terms) { terms.add(term); } public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (term.field().equals(field)) buffer.append(term.text()); else buffer.append(term.toString()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; SpanTermQuery other = (SpanTermQuery) obj; if (term == null) { if (other.term != null) return false; } else if (!term.equals(other.term)) return false; return true; } public Spans getSpans(final IndexReader reader) throws IOException { return new TermSpans(reader.termPositions(term), term); } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/SpanNearQuery.java0000644000175000017500000001501611474320224026441 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.ArrayList; import java.util.Iterator; import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.util.ToStringUtils; /** Matches spans which are near one another. One can specify slop, the * maximum number of intervening unmatched positions, as well as whether * matches are required to be in-order. */ public class SpanNearQuery extends SpanQuery implements Cloneable { protected List clauses; protected int slop; protected boolean inOrder; protected String field; private boolean collectPayloads; /** Construct a SpanNearQuery. Matches spans matching a span from each * clause, with up to slop total unmatched positions between * them. * When inOrder is true, the spans from each clause * must be * ordered as in clauses. */ public SpanNearQuery(SpanQuery[] clauses, int slop, boolean inOrder) { this(clauses, slop, inOrder, true); } public SpanNearQuery(SpanQuery[] clauses, int slop, boolean inOrder, boolean collectPayloads) { // copy clauses array into an ArrayList this.clauses = new ArrayList(clauses.length); for (int i = 0; i < clauses.length; i++) { SpanQuery clause = clauses[i]; if (i == 0) { // check field field = clause.getField(); } else if (!clause.getField().equals(field)) { throw new IllegalArgumentException("Clauses must have same field."); } this.clauses.add(clause); } this.collectPayloads = collectPayloads; this.slop = slop; this.inOrder = inOrder; } /** Return the clauses whose spans are matched. */ public SpanQuery[] getClauses() { return (SpanQuery[])clauses.toArray(new SpanQuery[clauses.size()]); } /** Return the maximum number of intervening unmatched positions permitted.*/ public int getSlop() { return slop; } /** Return true if matches are required to be in-order.*/ public boolean isInOrder() { return inOrder; } public String getField() { return field; } /** Returns a collection of all terms matched by this query. 
* @deprecated use extractTerms instead * @see #extractTerms(Set) */ public Collection getTerms() { Collection terms = new ArrayList(); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); terms.addAll(clause.getTerms()); } return terms; } public void extractTerms(Set terms) { Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); clause.extractTerms(terms); } } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("spanNear(["); Iterator i = clauses.iterator(); while (i.hasNext()) { SpanQuery clause = (SpanQuery)i.next(); buffer.append(clause.toString(field)); if (i.hasNext()) { buffer.append(", "); } } buffer.append("], "); buffer.append(slop); buffer.append(", "); buffer.append(inOrder); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } public Spans getSpans(final IndexReader reader) throws IOException { if (clauses.size() == 0) // optimize 0-clause case return new SpanOrQuery(getClauses()).getSpans(reader); if (clauses.size() == 1) // optimize 1-clause case return ((SpanQuery)clauses.get(0)).getSpans(reader); return inOrder ? (Spans) new NearSpansOrdered(this, reader, collectPayloads) : (Spans) new NearSpansUnordered(this, reader); } public Query rewrite(IndexReader reader) throws IOException { SpanNearQuery clone = null; for (int i = 0 ; i < clauses.size(); i++) { SpanQuery c = (SpanQuery)clauses.get(i); SpanQuery query = (SpanQuery) c.rewrite(reader); if (query != c) { // clause rewrote: must clone if (clone == null) clone = (SpanNearQuery) this.clone(); clone.clauses.set(i,query); } } if (clone != null) { return clone; // some clauses rewrote } else { return this; // no clauses rewrote } } public Object clone() { int sz = clauses.size(); SpanQuery[] newClauses = new SpanQuery[sz]; for (int i = 0; i < sz; i++) { SpanQuery clause = (SpanQuery) clauses.get(i); newClauses[i] = (SpanQuery) clause.clone(); } SpanNearQuery spanNearQuery = new SpanNearQuery(newClauses, slop, inOrder); spanNearQuery.setBoost(getBoost()); return spanNearQuery; } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof SpanNearQuery)) return false; final SpanNearQuery spanNearQuery = (SpanNearQuery) o; if (inOrder != spanNearQuery.inOrder) return false; if (slop != spanNearQuery.slop) return false; if (!clauses.equals(spanNearQuery.clauses)) return false; return getBoost() == spanNearQuery.getBoost(); } public int hashCode() { int result; result = clauses.hashCode(); // Mix bits before folding in things like boost, since it could cancel the // last element of clauses. This particular mix also serves to // differentiate SpanNearQuery hashcodes from others. result ^= (result << 14) | (result >>> 19); // reversible result += Float.floatToRawIntBits(getBoost()); result += slop; result ^= (inOrder ? 0x99AFD3BD : 0); return result; } } lucene-2.9.4/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java0000644000175000017500000001204411474320224030107 0ustar janpascaljanpascalpackage org.apache.lucene.search.spans; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Query; import org.apache.lucene.search.Weight; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.util.ToStringUtils; /** *

    Wrapper to allow {@link SpanQuery} objects to participate in composite * single-field SpanQueries by 'lying' about their search field. That is, * the masked SpanQuery will function as normal, * but {@link SpanQuery#getField()} simply hands back the value supplied * in this class's constructor.

    * *

    This can be used to support Queries like {@link SpanNearQuery} or * {@link SpanOrQuery} across different fields, which is not ordinarily * permitted.

    * *

    This can be useful for denormalized relational data: for example, when * indexing a document with conceptually many 'children':

    * *
     *  teacherid: 1
     *  studentfirstname: james
     *  studentsurname: jones
     *  
     *  teacherid: 2
     *  studentfirstname: james
     *  studentsurname: smith
     *  studentfirstname: sally
     *  studentsurname: jones
     * 
    * *

    a SpanNearQuery with a slop of 0 can be applied across two * {@link SpanTermQuery} objects as follows: *

     *    SpanQuery q1  = new SpanTermQuery(new Term("studentfirstname", "james"));
     *    SpanQuery q2  = new SpanTermQuery(new Term("studentsurname", "jones"));
     *    SpanQuery q2m = new FieldMaskingSpanQuery(q2, "studentfirstname");
     *    Query q = new SpanNearQuery(new SpanQuery[]{q1, q2m}, -1, false);
     * 
    * to search for 'studentfirstname:james studentsurname:jones' and find * teacherid 1 without matching teacherid 2 (which has a 'james' in position 0 * and 'jones' in position 1).

    * *

    Note: as {@link #getField()} returns the masked field, scoring will be * done using the norms of the field name supplied. This may lead to unexpected * scoring behaviour.

    */ public class FieldMaskingSpanQuery extends SpanQuery { private SpanQuery maskedQuery; private String field; public FieldMaskingSpanQuery(SpanQuery maskedQuery, String maskedField) { this.maskedQuery = maskedQuery; this.field = maskedField; } public String getField() { return field; } public SpanQuery getMaskedQuery() { return maskedQuery; } // :NOTE: getBoost and setBoost are not proxied to the maskedQuery // ...this is done to be more consistent with things like SpanFirstQuery public Spans getSpans(IndexReader reader) throws IOException { return maskedQuery.getSpans(reader); } /** @deprecated use {@link #extractTerms(Set)} instead. */ public Collection getTerms() { return maskedQuery.getTerms(); } public void extractTerms(Set terms) { maskedQuery.extractTerms(terms); } public Weight createWeight(Searcher searcher) throws IOException { return maskedQuery.createWeight(searcher); } public Similarity getSimilarity(Searcher searcher) { return maskedQuery.getSimilarity(searcher); } public Query rewrite(IndexReader reader) throws IOException { FieldMaskingSpanQuery clone = null; SpanQuery rewritten = (SpanQuery) maskedQuery.rewrite(reader); if (rewritten != maskedQuery) { clone = (FieldMaskingSpanQuery) this.clone(); clone.maskedQuery = rewritten; } if (clone != null) { return clone; } else { return this; } } public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("mask("); buffer.append(maskedQuery.toString(field)); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); buffer.append(" as "); buffer.append(this.field); return buffer.toString(); } public boolean equals(Object o) { if (!(o instanceof FieldMaskingSpanQuery)) return false; FieldMaskingSpanQuery other = (FieldMaskingSpanQuery) o; return (this.getField().equals(other.getField()) && (this.getBoost() == other.getBoost()) && this.getMaskedQuery().equals(other.getMaskedQuery())); } public int hashCode() { return getMaskedQuery().hashCode() ^ getField().hashCode() ^ Float.floatToRawIntBits(getBoost()); } } lucene-2.9.4/src/java/org/apache/lucene/search/FilteredDocIdSetIterator.java0000644000175000017500000000541711474320224027413 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * Abstract decorator class of a DocIdSetIterator * implementation that provides on-demand filter/validation * mechanism on an underlying DocIdSetIterator. See {@link * FilteredDocIdSet}. */ public abstract class FilteredDocIdSetIterator extends DocIdSetIterator { protected DocIdSetIterator _innerIter; private int doc; /** * Constructor. * @param innerIter Underlying DocIdSetIterator. 
*/ public FilteredDocIdSetIterator(DocIdSetIterator innerIter) { if (innerIter == null) { throw new IllegalArgumentException("null iterator"); } _innerIter = innerIter; doc = -1; } /** * Validation method to determine whether a docid should be in the result set. * @param doc docid to be tested * @return true if input docid should be in the result set, false otherwise. * @see #FilteredDocIdSetIterator(DocIdSetIterator). */ abstract protected boolean match(int doc) throws IOException; /** @deprecated use {@link #docID()} instead. */ public final int doc() { return doc; } public int docID() { return doc; } /** @deprecated use {@link #nextDoc()} instead. */ public final boolean next() throws IOException{ return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { while ((doc = _innerIter.nextDoc()) != NO_MORE_DOCS) { if (match(doc)) { return doc; } } return doc; } /** @deprecated use {@link #advance(int)} instead. */ public final boolean skipTo(int n) throws IOException{ return advance(n) != NO_MORE_DOCS; } public int advance(int target) throws IOException { doc = _innerIter.advance(target); if (doc != NO_MORE_DOCS) { if (match(doc)) { return doc; } else { while ((doc = _innerIter.nextDoc()) != NO_MORE_DOCS) { if (match(doc)) { return doc; } } return doc; } } return doc; } } lucene-2.9.4/src/java/org/apache/lucene/search/MultiTermQueryWrapperFilter.java0000644000175000017500000001260511474320224030240 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.apache.lucene.util.OpenBitSet; import java.io.IOException; import java.util.BitSet; /** * A wrapper for {@link MultiTermQuery}, that exposes its * functionality as a {@link Filter}. *

    * MultiTermQueryWrapperFilter is not designed to * be used by itself. Normally you subclass it to provide a Filter * counterpart for a {@link MultiTermQuery} subclass. *

    * For example, {@link TermRangeFilter} and {@link PrefixFilter} extend * MultiTermQueryWrapperFilter. * This class also provides the functionality behind * {@link MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE}; * this is why it is not abstract. */ public class MultiTermQueryWrapperFilter extends Filter { protected final MultiTermQuery query; /** * Wrap a {@link MultiTermQuery} as a Filter. */ protected MultiTermQueryWrapperFilter(MultiTermQuery query) { this.query = query; } //@Override public String toString() { // query.toString should be ok for the filter, too, if the query boost is 1.0f return query.toString(); } //@Override public final boolean equals(final Object o) { if (o==this) return true; if (o==null) return false; if (this.getClass().equals(o.getClass())) { return this.query.equals( ((MultiTermQueryWrapperFilter)o).query ); } return false; } //@Override public final int hashCode() { return query.hashCode(); } /** * Expert: Return the number of unique terms visited during execution of the filter. * If there are many of them, you may consider using another filter type * or optimize your total term count in index. *

    This method is not thread safe, be sure to only call it when no filter is running! * If you re-use the same filter instance for another * search, be sure to first reset the term counter * with {@link #clearTotalNumberOfTerms}. * @see #clearTotalNumberOfTerms */ public int getTotalNumberOfTerms() { return query.getTotalNumberOfTerms(); } /** * Expert: Resets the counting of unique terms. * Do this before executing the filter. * @see #getTotalNumberOfTerms */ public void clearTotalNumberOfTerms() { query.clearTotalNumberOfTerms(); } abstract class TermGenerator { public void generate(IndexReader reader, TermEnum enumerator) throws IOException { final int[] docs = new int[32]; final int[] freqs = new int[32]; TermDocs termDocs = reader.termDocs(); try { int termCount = 0; do { Term term = enumerator.term(); if (term == null) break; termCount++; termDocs.seek(term); while (true) { final int count = termDocs.read(docs, freqs); if (count != 0) { for(int i=0;i shortcut if (enumerator.term() == null) return DocIdSet.EMPTY_DOCIDSET; // else fill into a OpenBitSet final OpenBitSet bitSet = new OpenBitSet(reader.maxDoc()); new TermGenerator() { public void handleDoc(int doc) { bitSet.set(doc); } }.generate(reader, enumerator); return bitSet; } finally { enumerator.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/search/SortComparatorSource.java0000644000175000017500000000406611474320224026723 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; import java.io.Serializable; /** * Expert: returns a comparator for sorting ScoreDocs. * *

    * Created: Apr 21, 2004 3:49:28 PM * * This class will be used as part of a key to a FieldCache value. You must * implement hashCode and equals to avoid an explosion in RAM usage if you use * instances that are not the same instance. If you are searching using the * Remote contrib, the same instance of this class on the client will be a new * instance on every call to the server, so hashCode/equals is very important in * that situation. * * @version $Id: SortComparatorSource.java 747019 2009-02-23 13:59:50Z * mikemccand $ * @since 1.4 * @deprecated Please use {@link FieldComparatorSource} instead. */ public interface SortComparatorSource extends Serializable { /** * Creates a comparator for the field in the given index. * @param reader Index to create comparator for. * @param fieldname Name of the field to create comparator for. * @return Comparator of ScoreDoc objects. * @throws IOException If an error occurs reading the index. */ ScoreDocComparator newComparator (IndexReader reader, String fieldname) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/search/FieldDoc.java0000644000175000017500000000547711474320224024243 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Expert: A ScoreDoc which also contains information about * how to sort the referenced document. In addition to the * document number and score, this object contains an array * of values for the document from the field(s) used to sort. * For example, if the sort criteria was to sort by fields * "a", "b" then "c", the fields object array * will have three elements, corresponding respectively to * the term values for the document in fields "a", "b" and "c". * The class of each element in the array will be either * Integer, Float or String depending on the type of values * in the terms of each field. * *

    Created: Feb 11, 2004 1:23:38 PM * * @since lucene 1.4 * @version $Id: FieldDoc.java 950095 2010-06-01 14:44:59Z mikemccand $ * @see ScoreDoc * @see TopFieldDocs */ public class FieldDoc extends ScoreDoc { /** Expert: The values which are used to sort the referenced document. * The order of these will match the original sort criteria given by a * Sort object. Each Object will be either an Integer, Float or String, * depending on the type of values in the terms of the original field. * @see Sort * @see Searcher#search(Query,Filter,int,Sort) */ public Comparable[] fields; /** Expert: Creates one of these objects with empty sort information. */ public FieldDoc (int doc, float score) { super (doc, score); } /** Expert: Creates one of these objects with the given sort information. */ public FieldDoc (int doc, float score, Comparable[] fields) { super (doc, score); this.fields = fields; } // A convenience method for debugging. public String toString() { // super.toString returns the doc and score information, so just add the // fields information StringBuffer sb = new StringBuffer(super.toString()); sb.append("["); for (int i = 0; i < fields.length; i++) { sb.append(fields[i]).append(", "); } sb.setLength(sb.length() - 2); // discard last ", " sb.append("]"); return sb.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/search/HitCollectorWrapper.java0000644000175000017500000000347611474320224026523 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** * Wrapper for ({@link HitCollector}) implementations, which simply re-bases the * incoming docID before calling {@link HitCollector#collect}. * * @deprecated Please migrate custom HitCollectors to the new {@link Collector} * class. This class will be removed when {@link HitCollector} is * removed. */ public class HitCollectorWrapper extends Collector { private HitCollector collector; private int base = 0; private Scorer scorer = null; public HitCollectorWrapper(HitCollector collector) { this.collector = collector; } public void setNextReader(IndexReader reader, int docBase) { base = docBase; } public void collect(int doc) throws IOException { collector.collect(doc + base, scorer.score()); } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; } public boolean acceptsDocsOutOfOrder() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/HitIterator.java0000644000175000017500000000422411474320224025015 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search; import java.util.Iterator; import java.util.NoSuchElementException; /** * An iterator over {@link Hits} that provides lazy fetching of each document. * {@link Hits#iterator()} returns an instance of this class. Calls to {@link #next()} * return a {@link Hit} instance. * * @deprecated Use {@link TopScoreDocCollector} and {@link TopDocs} instead. Hits will be removed in Lucene 3.0. */ public class HitIterator implements Iterator { private Hits hits; private int hitNumber = 0; /** * Constructed from {@link Hits#iterator()}. */ HitIterator(Hits hits) { this.hits = hits; } /** * @return true if current hit is less than the total number of {@link Hits}. */ public boolean hasNext() { return hitNumber < hits.length(); } /** * Returns a {@link Hit} instance representing the next hit in {@link Hits}. * * @return Next {@link Hit}. */ public Object next() { if (hitNumber == hits.length()) throw new NoSuchElementException(); Object next = new Hit(hits, hitNumber); hitNumber++; return next; } /** * Unsupported operation. * * @throws UnsupportedOperationException */ public void remove() { throw new UnsupportedOperationException(); } /** * Returns the total number of hits. */ public int length() { return hits.length(); } } lucene-2.9.4/src/java/org/apache/lucene/search/WildcardQuery.java0000644000175000017500000000631511474320224025341 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; /** Implements the wildcard search query. Supported wildcards are *, which * matches any character sequence (including the empty one), and ?, * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, * a Wildcard term should not start with one of the wildcards * or * ?. * *

    This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * * @see WildcardTermEnum */ public class WildcardQuery extends MultiTermQuery { private boolean termContainsWildcard; protected Term term; public WildcardQuery(Term term) { super(term); //will be removed in 3.0 this.term = term; this.termContainsWildcard = (term.text().indexOf('*') != -1) || (term.text().indexOf('?') != -1); } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new WildcardTermEnum(reader, getTerm()); } /** * Returns the pattern term. */ public Term getTerm() { return term; } public Query rewrite(IndexReader reader) throws IOException { if (!termContainsWildcard) return new TermQuery(getTerm()); else return super.rewrite(reader); } /** Prints a user-readable version of this query. */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (!term.field().equals(field)) { buffer.append(term.field()); buffer.append(":"); } buffer.append(term.text()); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } //@Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } //@Override public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; WildcardQuery other = (WildcardQuery) obj; if (term == null) { if (other.term != null) return false; } else if (!term.equals(other.term)) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/BooleanScorer.java0000644000175000017500000003051111474320224025312 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Iterator; import java.util.List; import org.apache.lucene.index.IndexReader; /* Description from Doug Cutting (excerpted from * LUCENE-1483): * * BooleanScorer uses a ~16k array to score windows of * docs. So it scores docs 0-16k first, then docs 16-32k, * etc. For each window it iterates through all query terms * and accumulates a score in table[doc%16k]. It also stores * in the table a bitmask representing which terms * contributed to the score. Non-zero scores are chained in * a linked list. At the end of scoring each window it then * iterates through the linked list and, if the bitmask * matches the boolean constraints, collects a hit. For * boolean queries with lots of frequent terms this can be * much faster, since it does not need to update a priority * queue for each posting, instead performing constant-time * operations per posting. 
The only downside is that it * results in hits being delivered out-of-order within the * window, which means it cannot be nested within other * scorers. But it works well as a top-level scorer. * * The new BooleanScorer2 implementation instead works by * merging priority queues of postings, albeit with some * clever tricks. For example, a pure conjunction (all terms * required) does not require a priority queue. Instead it * sorts the posting streams at the start, then repeatedly * skips the first to to the last. If the first ever equals * the last, then there's a hit. When some terms are * required and some terms are optional, the conjunction can * be evaluated first, then the optional terms can all skip * to the match and be added to the score. Thus the * conjunction can reduce the number of priority queue * updates for the optional terms. */ final class BooleanScorer extends Scorer { private static final class BooleanScorerCollector extends Collector { private BucketTable bucketTable; private int mask; private Scorer scorer; public BooleanScorerCollector(int mask, BucketTable bucketTable) { this.mask = mask; this.bucketTable = bucketTable; } public final void collect(final int doc) throws IOException { final BucketTable table = bucketTable; final int i = doc & BucketTable.MASK; Bucket bucket = table.buckets[i]; if (bucket == null) table.buckets[i] = bucket = new Bucket(); if (bucket.doc != doc) { // invalid bucket bucket.doc = doc; // set doc bucket.score = scorer.score(); // initialize score bucket.bits = mask; // initialize mask bucket.coord = 1; // initialize coord bucket.next = table.first; // push onto valid list table.first = bucket; } else { // valid bucket bucket.score += scorer.score(); // increment score bucket.bits |= mask; // add bits in mask bucket.coord++; // increment coord } } public void setNextReader(IndexReader reader, int docBase) { // not needed by this implementation } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; } public boolean acceptsDocsOutOfOrder() { return true; } } // An internal class which is used in score(Collector, int) for setting the // current score. This is required since Collector exposes a setScorer method // and implementations that need the score will call scorer.score(). // Therefore the only methods that are implemented are score() and doc(). private static final class BucketScorer extends Scorer { float score; int doc = NO_MORE_DOCS; public BucketScorer() { super(null); } public int advance(int target) throws IOException { return NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return doc; } public int docID() { return doc; } public Explanation explain(int doc) throws IOException { return null; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return false; } public int nextDoc() throws IOException { return NO_MORE_DOCS; } public float score() throws IOException { return score; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return false; } } static final class Bucket { int doc = -1; // tells if bucket is valid float score; // incremental score int bits; // used for bool constraints int coord; // count of terms in score Bucket next; // next valid bucket } /** A simple hash table of document scores within a range. 
*/ static final class BucketTable { public static final int SIZE = 1 << 11; public static final int MASK = SIZE - 1; final Bucket[] buckets = new Bucket[SIZE]; Bucket first = null; // head of valid list public BucketTable() {} public Collector newCollector(int mask) { return new BooleanScorerCollector(mask, this); } public final int size() { return SIZE; } } static final class SubScorer { public Scorer scorer; public boolean required = false; public boolean prohibited = false; public Collector collector; public SubScorer next; public SubScorer(Scorer scorer, boolean required, boolean prohibited, Collector collector, SubScorer next) throws IOException { this.scorer = scorer; this.required = required; this.prohibited = prohibited; this.collector = collector; this.next = next; } } private SubScorer scorers = null; private BucketTable bucketTable = new BucketTable(); private int maxCoord = 1; private final float[] coordFactors; private int requiredMask = 0; private int prohibitedMask = 0; private int nextMask = 1; private final int minNrShouldMatch; private int end; private Bucket current; private int doc = -1; BooleanScorer(Similarity similarity, int minNrShouldMatch, List optionalScorers, List prohibitedScorers) throws IOException { super(similarity); this.minNrShouldMatch = minNrShouldMatch; if (optionalScorers != null && optionalScorers.size() > 0) { for (Iterator si = optionalScorers.iterator(); si.hasNext();) { Scorer scorer = (Scorer) si.next(); maxCoord++; if (scorer.nextDoc() != NO_MORE_DOCS) { scorers = new SubScorer(scorer, false, false, bucketTable.newCollector(0), scorers); } } } if (prohibitedScorers != null && prohibitedScorers.size() > 0) { for (Iterator si = prohibitedScorers.iterator(); si.hasNext();) { Scorer scorer = (Scorer) si.next(); int mask = nextMask; nextMask = nextMask << 1; prohibitedMask |= mask; // update prohibited mask if (scorer.nextDoc() != NO_MORE_DOCS) { scorers = new SubScorer(scorer, false, true, bucketTable.newCollector(mask), scorers); } } } coordFactors = new float[maxCoord]; Similarity sim = getSimilarity(); for (int i = 0; i < maxCoord; i++) { coordFactors[i] = sim.coord(i, maxCoord - 1); } } // firstDocID is ignored since nextDoc() initializes 'current' protected boolean score(Collector collector, int max, int firstDocID) throws IOException { boolean more; Bucket tmp; BucketScorer bs = new BucketScorer(); // The internal loop will set the score and doc before calling collect. collector.setScorer(bs); do { bucketTable.first = null; while (current != null) { // more queued // check prohibited & required if ((current.bits & prohibitedMask) == 0 && (current.bits & requiredMask) == requiredMask) { if (current.doc >= max){ tmp = current; current = current.next; tmp.next = bucketTable.first; bucketTable.first = tmp; continue; } if (current.coord >= minNrShouldMatch) { bs.score = current.score * coordFactors[current.coord]; bs.doc = current.doc; collector.collect(current.doc); } } current = current.next; // pop the queue } if (bucketTable.first != null){ current = bucketTable.first; bucketTable.first = current.next; return true; } // refill the queue more = false; end += BucketTable.SIZE; for (SubScorer sub = scorers; sub != null; sub = sub.next) { int subScorerDocID = sub.scorer.docID(); if (subScorerDocID != NO_MORE_DOCS) { more |= sub.scorer.score(sub.collector, end, subScorerDocID); } } current = bucketTable.first; } while (current != null || more); return false; } /** @deprecated use {@link #score(Collector, int, int)} instead. 
*/ protected boolean score(HitCollector hc, int max) throws IOException { return score(new HitCollectorWrapper(hc), max, docID()); } public int advance(int target) throws IOException { throw new UnsupportedOperationException(); } /** @deprecated use {@link #docID()} instead. */ public int doc() { return current.doc; } public int docID() { return doc; } public Explanation explain(int doc) { throw new UnsupportedOperationException(); } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { boolean more; do { while (bucketTable.first != null) { // more queued current = bucketTable.first; bucketTable.first = current.next; // pop the queue // check prohibited & required, and minNrShouldMatch if ((current.bits & prohibitedMask) == 0 && (current.bits & requiredMask) == requiredMask && current.coord >= minNrShouldMatch) { return doc = current.doc; } } // refill the queue more = false; end += BucketTable.SIZE; for (SubScorer sub = scorers; sub != null; sub = sub.next) { Scorer scorer = sub.scorer; sub.collector.setScorer(scorer); int doc = scorer.docID(); while (doc < end) { sub.collector.collect(doc); doc = scorer.nextDoc(); } more |= (doc != NO_MORE_DOCS); } } while (bucketTable.first != null || more); return doc = NO_MORE_DOCS; } public float score() { return current.score * coordFactors[current.coord]; } public void score(Collector collector) throws IOException { score(collector, Integer.MAX_VALUE, nextDoc()); } /** @deprecated use {@link #score(Collector)} instead. */ public void score(HitCollector hc) throws IOException { score(new HitCollectorWrapper(hc)); } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) { throw new UnsupportedOperationException(); } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("boolean("); for (SubScorer sub = scorers; sub != null; sub = sub.next) { buffer.append(sub.scorer.toString()); buffer.append(" "); } buffer.append(")"); return buffer.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/search/package.html0000644000175000017500000004537311474320224024207 0ustar janpascaljanpascal Code to search indices.

    Table Of Contents

    1. Search Basics
    2. The Query Classes
    3. Changing the Scoring

    Search

    Search over indices. Applications usually call {@link org.apache.lucene.search.Searcher#search(Query)} or {@link org.apache.lucene.search.Searcher#search(Query,Filter)}.
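
    As a rough sketch (the index location, field name, and query text below are placeholders, and the usual org.apache.lucene imports are omitted for brevity), a search for the top ten hits might look like this:

            Directory dir = FSDirectory.open(new File("/path/to/index")); // hypothetical index location
            Searcher searcher = new IndexSearcher(dir, true);             // open a read-only searcher
            Query query = new TermQuery(new Term("contents", "lucene"));
            TopDocs topDocs = searcher.search(query, 10);                 // collect the ten best-scoring hits
            for (int i = 0; i < topDocs.scoreDocs.length; i++) {
              Document hit = searcher.doc(topDocs.scoreDocs[i].doc);      // load the stored fields of a hit
              // ... use the hit ...
            }
            searcher.close();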

    Query Classes

    TermQuery

    Of the various implementations of Query, the TermQuery is the easiest to understand and the most often used in applications. A TermQuery matches all the documents that contain the specified Term, which is a word that occurs in a certain Field. Thus, a TermQuery identifies and scores all Documents that have a Field with the specified string in it. Constructing a TermQuery is as simple as:

            TermQuery tq = new TermQuery(new Term("fieldName", "term"));
        
    In this example, the Query identifies all Documents that have the Field named "fieldName" containing the word "term".

    BooleanQuery

    Things start to get interesting when one combines multiple TermQuery instances into a BooleanQuery. A BooleanQuery contains multiple BooleanClauses, where each clause contains a sub-query (Query instance) and an operator (from BooleanClause.Occur) describing how that sub-query is combined with the other clauses:

    1. SHOULD — Use this operator when a clause can occur in the result set, but is not required. If a query is made up of all SHOULD clauses, then every document in the result set matches at least one of these clauses.

    2. MUST — Use this operator when a clause is required to occur in the result set. Every document in the result set will match all such clauses.

    3. MUST NOT — Use this operator when a clause must not occur in the result set. No document in the result set will match any such clauses.

    Boolean queries are constructed by adding two or more BooleanClause instances. If too many clauses are added, a TooManyClauses exception will be thrown during searching. This most often occurs when a Query is rewritten into a BooleanQuery with many TermQuery clauses, for example by WildcardQuery. The default setting for the maximum number of clauses is 1024, but this can be changed via the static method setMaxClauseCount in BooleanQuery.
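
    As an illustration (the field and terms are placeholders), a query that requires "lucene", prefers documents that also contain "search", and excludes documents containing "jakarta" could be assembled like this:

            BooleanQuery bq = new BooleanQuery();
            bq.add(new TermQuery(new Term("contents", "lucene")),  BooleanClause.Occur.MUST);     // required
            bq.add(new TermQuery(new Term("contents", "search")),  BooleanClause.Occur.SHOULD);   // optional
            bq.add(new TermQuery(new Term("contents", "jakarta")), BooleanClause.Occur.MUST_NOT); // excluded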

    Phrases

    Another common search is to find documents containing certain phrases. This is handled two different ways:

    1. PhraseQuery — Matches a sequence of Terms. PhraseQuery uses a slop factor to determine how many positions may occur between any two terms in the phrase and still be considered a match.

    2. SpanNearQuery — Matches a sequence of other SpanQuery instances. SpanNearQuery allows for much more complicated phrase queries since it is constructed from other SpanQuery instances, instead of only TermQuery instances.
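
    For example (the field and terms are placeholders), the same two-word phrase can be expressed either way; both sketches below allow one intervening position between the words:

            PhraseQuery phrase = new PhraseQuery();
            phrase.add(new Term("contents", "quick"));
            phrase.add(new Term("contents", "fox"));
            phrase.setSlop(1);                                         // allow one position between the terms

            SpanQuery[] clauses = new SpanQuery[] {
                new SpanTermQuery(new Term("contents", "quick")),
                new SpanTermQuery(new Term("contents", "fox"))
            };
            SpanNearQuery near = new SpanNearQuery(clauses, 1, true);  // slop of 1, terms must appear in order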

    TermRangeQuery

    The TermRangeQuery matches all documents that occur in the exclusive range of a lower Term and an upper Term, according to {@link java.lang.String#compareTo(String)}. It is not intended for numerical ranges; use NumericRangeQuery instead. For example, one could find all documents that have terms beginning with the letters a through c. This type of Query is frequently used to find documents that occur in a specific date range.
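
    For instance (the field name and values are placeholders), the following sketch matches terms that sort between two date strings, excluding both endpoints:

            TermRangeQuery range =
                new TermRangeQuery("modified", "20020101", "20030101", false, false); // exclusive bounds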

    NumericRangeQuery

    The NumericRangeQuery matches all documents that occur in a numeric range. For NumericRangeQuery to work, you must index the values using a special NumericField.
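
    A minimal sketch (the field name and values are placeholders): index the value with a NumericField, then query it with a matching numeric range:

            Document doc = new Document();
            doc.add(new NumericField("price").setIntValue(42));          // indexing side

            Query priceRange = NumericRangeQuery.newIntRange(
                "price", Integer.valueOf(10), Integer.valueOf(100),
                true, true);                                             // search side: 10 <= price <= 100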

    PrefixQuery, WildcardQuery

    While the PrefixQuery has a different implementation, it is essentially a special case of the WildcardQuery. The PrefixQuery allows an application to identify all documents with terms that begin with a certain string. The WildcardQuery generalizes this by allowing for the use of * (matches 0 or more characters) and ? (matches exactly one character) wildcards. Note that the WildcardQuery can be quite slow. Also note that WildcardQuery should not start with * and ?, as these are extremely slow. To remove this protection and allow a wildcard at the beginning of a term, see method setAllowLeadingWildcard in QueryParser.
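
    For example (the field and patterns are placeholders):

            PrefixQuery prefix = new PrefixQuery(new Term("name", "luc"));          // matches luc, lucene, ...
            WildcardQuery wildcard = new WildcardQuery(new Term("name", "l*cen?")); // * = any run of characters, ? = exactly one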

    FuzzyQuery

    A FuzzyQuery matches documents that contain terms similar to the specified term. Similarity is determined using Levenshtein (edit) distance. This type of query can be useful when accounting for spelling variations in the collection.
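
    For example (the field, term, and thresholds are placeholders), the following requires a similarity of at least 0.7 and an exact match on the first two characters:

            FuzzyQuery fuzzy = new FuzzyQuery(new Term("name", "lucene"), 0.7f, 2);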

    Changing Similarity

    Chances are DefaultSimilarity is sufficient for all your searching needs. However, in some applications it may be necessary to customize your Similarity implementation. For instance, some applications do not need to distinguish between shorter and longer documents (see a "fair" similarity).

    To change Similarity, one must do so for both indexing and searching, and the changes must happen before either of these actions take place. Although in theory there is nothing stopping you from changing mid-stream, it just isn't well-defined what is going to happen.

    To make this change, implement your own Similarity (likely you'll want to simply subclass DefaultSimilarity) and then use the new class by calling IndexWriter.setSimilarity before indexing and Searcher.setSimilarity before searching.
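
    A minimal sketch of such a subclass (the class name is invented here, and writer and searcher are assumed to be an existing IndexWriter and Searcher):

            public class FlatSimilarity extends DefaultSimilarity {
              // score a matching term the same regardless of how often it occurs
              public float tf(float freq) {
                return freq > 0 ? 1.0f : 0.0f;
              }
              // treat short and long fields alike
              public float lengthNorm(String fieldName, int numTerms) {
                return 1.0f;
              }
            }

            writer.setSimilarity(new FlatSimilarity());    // before indexing
            searcher.setSimilarity(new FlatSimilarity());  // before searching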

    If you are interested in use cases for changing your similarity, see the Lucene users' mailing list thread Overriding Similarity. In summary, here are a few use cases:

    1. SweetSpotSimilarity — SweetSpotSimilarity gives small increases as the frequency increases a small amount and then greater increases when you hit the "sweet spot", i.e. where you think the frequency of terms is more significant.

    2. Overriding tf — In some applications, it doesn't matter what the score of a document is as long as a matching term occurs. In these cases people have overridden Similarity to return 1 from the tf() method.

    3. Changing Length Normalization — By overriding lengthNorm, it is possible to discount how the length of a field contributes to a score. In DefaultSimilarity, lengthNorm = 1 / (numTerms in field)^0.5, but if one changes this to be 1 / (numTerms in field), all fields will be treated "fairly".

    In general, Chris Hostetter sums it up best in saying (from the Lucene users' mailing list):
    [One would override the Similarity in] ... any situation where you know more about your data than just that it's "text" is a situation where it *might* make sense to override your Similarity method.

    Changing Scoring — Expert Level

    Changing scoring is an expert level task, so tread carefully and be prepared to share your code if you want help.

    With the warning out of the way, it is possible to change a lot more than just the Similarity when it comes to scoring in Lucene. Lucene's scoring is a complex mechanism that is grounded by three main classes:

    1. Query — The abstract object representation of the user's information need.
    2. Weight — The internal interface representation of the user's Query, so that Query objects may be reused.
    3. Scorer — An abstract class containing common functionality for scoring. Provides both scoring and explanation capabilities.
    Details on each of these classes, and their children, can be found in the subsections below.

    The Query Class

    In some sense, the Query class is where it all begins. Without a Query, there would be nothing to score. Furthermore, the Query class is the catalyst for the other scoring classes as it is often responsible for creating them or coordinating the functionality between them. The Query class has several methods that are important for derived classes:

    1. createWeight(Searcher searcher) — A Weight is the internal representation of the Query, so each Query implementation must provide an implementation of Weight. See the subsection on The Weight Interface below for details on implementing the Weight interface.
    2. rewrite(IndexReader reader) — Rewrites queries into primitive queries. Primitive queries are: TermQuery, BooleanQuery, and other queries that implement createWeight(Searcher searcher). A brief illustration of rewriting follows this list.
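
    A brief illustration of rewriting (the field and pattern are placeholders, and searcher is an existing Searcher):

            Query wildcard = new WildcardQuery(new Term("name", "lu*"));
            Query primitive = searcher.rewrite(wildcard); // typically a BooleanQuery over the matching terms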

    The Weight Interface

    The Weight interface provides an internal representation of the Query so that it can be reused. Any Searcher dependent state should be stored in the Weight implementation, not in the Query class. The interface defines six methods that must be implemented:

    1. Weight#getQuery() — Pointer to the Query that this Weight represents.
    2. Weight#getValue() — The weight for this Query. For example, the TermQuery.TermWeight value is equal to the idf^2 * boost * queryNorm
    3. Weight#sumOfSquaredWeights() — The sum of squared weights. For TermQuery, this is (idf * boost)^2
    4. Weight#normalize(float) — Determine the query normalization factor. The query normalization may allow for comparing scores between queries.
    5. Weight#scorer(IndexReader, boolean, boolean) — Construct a new Scorer for this Weight. See The Scorer Class below for help defining a Scorer. As the name implies, the Scorer is responsible for doing the actual scoring of documents given the Query.
    6. Weight#explain(Searcher, IndexReader, int) — Provide a means for explaining why a given document was scored the way it was.

    The Scorer Class

    The Scorer abstract class provides common scoring functionality for all Scorer implementations and is the heart of the Lucene scoring process. The Scorer defines the following abstract (they are not yet abstract, but will be in Lucene 3.0 and should be considered as such now) methods which must be implemented, some of them inherited from DocIdSetIterator; a small sketch that puts them together appears after this list:

    1. DocIdSetIterator#nextDoc() — Advances to the next document that matches this Query, returning its document id, or NO_MORE_DOCS if there are no more matching documents.
    2. DocIdSetIterator#docID() — Returns the id of the Document that contains the match. It is not valid until nextDoc() has been called at least once.
    3. Scorer#score(Collector) — Scores and collects all matching documents using the given Collector.
    4. Scorer#score() — Return the score of the current document. This value can be determined in any appropriate way for an application. For instance, the TermScorer returns the tf * Weight.getValue() * fieldNorm.
    5. DocIdSetIterator#advance(int) — Skip ahead in the document matches to the document whose id is greater than or equal to the passed in value. In many instances, advance can be implemented more efficiently than simply looping through all the matching documents until the target document is identified.
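
    A minimal sketch that implements the methods above by delegating to an existing DocIdSetIterator and giving every matching document the same score (the class name and constructor arguments are illustrative only):

            class ConstantScorer extends Scorer {
              private final DocIdSetIterator iterator;
              private final float constantScore;

              ConstantScorer(Similarity similarity, DocIdSetIterator iterator, float constantScore) {
                super(similarity);
                this.iterator = iterator;
                this.constantScore = constantScore;
              }

              public int docID() { return iterator.docID(); }                        // id of the current match

              public int nextDoc() throws IOException { return iterator.nextDoc(); } // advance to the next match

              public int advance(int target) throws IOException { return iterator.advance(target); }

              public float score() { return constantScore; }                         // same score for every hit
            }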

    Why would I want to add my own Query?

    In a nutshell, you want to add your own custom Query implementation when you think that Lucene's aren't appropriate for the task that you want to do. You might be doing some cutting edge research or you need more information back out of Lucene (similar to Doug adding SpanQuery functionality).

    lucene-2.9.4/src/java/org/apache/lucene/search/CachingWrapperFilter.java0000644000175000017500000002037311474320224026625 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.DocIdBitSet; import org.apache.lucene.util.OpenBitSetDISI; import org.apache.lucene.util.Parameter; import java.util.BitSet; import java.util.WeakHashMap; import java.util.Map; /** * Wraps another filter's result and caches it. The purpose is to allow * filters to simply filter, and then wrap with this class to add caching. */ public class CachingWrapperFilter extends Filter { protected Filter filter; /** * Expert: Specifies how new deletions against a reopened * reader should be handled. * *

    The default is IGNORE, which means the cache entry * will be re-used for a given segment, even when that * segment has been reopened due to changes in deletions. * This is a big performance gain, especially with * near-real-time readers, since you don't hit a cache * miss on every reopened reader for prior segments.

    * *

    However, in some cases this can cause invalid query * results, allowing deleted documents to be returned. * This only happens if the main query does not rule out * deleted documents on its own, such as a toplevel * ConstantScoreQuery. To fix this, use RECACHE to * re-create the cached filter (at a higher per-reopen * cost, but at faster subsequent search performance), or * use DYNAMIC to dynamically intersect deleted docs (fast * reopen time but some hit to search performance).

    */ public static final class DeletesMode extends Parameter implements Serializable { private DeletesMode(String name) { super(name); } public static DeletesMode IGNORE = new DeletesMode("IGNORE"); public static DeletesMode RECACHE = new DeletesMode("RECACHE"); public static DeletesMode DYNAMIC = new DeletesMode("DYNAMIC"); } protected final FilterCache cache; static abstract class FilterCache implements Serializable { /** * A transient Filter cache (package private because of test) */ // NOTE: not final so that we can dynamically re-init // after de-serialize transient Map cache; private final DeletesMode deletesMode; public FilterCache(DeletesMode deletesMode) { this.deletesMode = deletesMode; } public synchronized Object get(IndexReader reader, Object coreKey, Object delCoreKey) throws IOException { Object value; if (cache == null) { cache = new WeakHashMap(); } if (deletesMode == DeletesMode.IGNORE) { // key on core value = cache.get(coreKey); } else if (deletesMode == DeletesMode.RECACHE) { // key on deletes, if any, else core value = cache.get(delCoreKey); } else { assert deletesMode == DeletesMode.DYNAMIC; // first try for exact match value = cache.get(delCoreKey); if (value == null) { // now for core match, but dynamically AND NOT // deletions value = cache.get(coreKey); if (value != null && reader.hasDeletions()) { value = mergeDeletes(reader, value); } } } return value; } protected abstract Object mergeDeletes(IndexReader reader, Object value); public synchronized void put(Object coreKey, Object delCoreKey, Object value) { if (deletesMode == DeletesMode.IGNORE) { cache.put(coreKey, value); } else if (deletesMode == DeletesMode.RECACHE) { cache.put(delCoreKey, value); } else { cache.put(coreKey, value); cache.put(delCoreKey, value); } } } /** * New deletes are ignored by default, which gives higher * cache hit rate on reopened readers. Most of the time * this is safe, because the filter will be AND'd with a * Query that fully enforces deletions. If instead you * need this filter to always enforce deletions, pass * either {@link DeletesMode#RECACHE} or {@link * DeletesMode#DYNAMIC}. * @param filter Filter to cache results of */ public CachingWrapperFilter(Filter filter) { this(filter, DeletesMode.IGNORE); } /** * Expert: by default, the cached filter will be shared * across reopened segments that only had changes to their * deletions. * * @param filter Filter to cache results of * @param deletesMode See {@link DeletesMode} */ public CachingWrapperFilter(Filter filter, DeletesMode deletesMode) { this.filter = filter; cache = new FilterCache(deletesMode) { public Object mergeDeletes(final IndexReader r, final Object docIdSet) { return new FilteredDocIdSet((DocIdSet) docIdSet) { protected boolean match(int docID) { return !r.isDeleted(docID); } }; } }; } /** * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { final Object coreKey = reader.getFieldCacheKey(); final Object delCoreKey = reader.hasDeletions() ? 
reader.getDeletesCacheKey() : coreKey; Object cached = cache.get(reader, coreKey, delCoreKey); if (cached != null) { if (cached instanceof BitSet) { return (BitSet) cached; } else if (cached instanceof DocIdBitSet) return ((DocIdBitSet) cached).getBitSet(); // It would be nice to handle the DocIdSet case, but that's not really possible } final BitSet bits = filter.bits(reader); if (bits != null) { cache.put(coreKey, delCoreKey, bits); } return bits; } /** Provide the DocIdSet to be cached, using the DocIdSet provided * by the wrapped Filter. *

    This implementation returns the given {@link DocIdSet}, if {@link DocIdSet#isCacheable} * returns true, else it copies the {@link DocIdSetIterator} into * an {@link OpenBitSetDISI}. */ protected DocIdSet docIdSetToCache(DocIdSet docIdSet, IndexReader reader) throws IOException { if (docIdSet == null) { // this is better than returning null, as the nonnull result can be cached return DocIdSet.EMPTY_DOCIDSET; } else if (docIdSet.isCacheable()) { return docIdSet; } else { final DocIdSetIterator it = docIdSet.iterator(); // null is allowed to be returned by iterator(), // in this case we wrap with the empty set, // which is cacheable. return (it == null) ? DocIdSet.EMPTY_DOCIDSET : new OpenBitSetDISI(it, reader.maxDoc()); } } // for testing int hitCount, missCount; public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final Object coreKey = reader.getFieldCacheKey(); final Object delCoreKey = reader.hasDeletions() ? reader.getDeletesCacheKey() : coreKey; Object cached = cache.get(reader, coreKey, delCoreKey); if (cached != null) { hitCount++; if (cached instanceof DocIdSet) return (DocIdSet) cached; else return new DocIdBitSet((BitSet) cached); } missCount++; // cache miss final DocIdSet docIdSet = docIdSetToCache(filter.getDocIdSet(reader), reader); if (docIdSet != null) { cache.put(coreKey, delCoreKey, docIdSet); } return docIdSet; } public String toString() { return "CachingWrapperFilter("+filter+")"; } public boolean equals(Object o) { if (!(o instanceof CachingWrapperFilter)) return false; return this.filter.equals(((CachingWrapperFilter)o).filter); } public int hashCode() { return filter.hashCode() ^ 0x1117BF25; } } lucene-2.9.4/src/java/org/apache/lucene/search/CachingSpanFilter.java0000644000175000017500000000661611474320224026112 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; import java.util.BitSet; /** * Wraps another SpanFilter's result and caches it. The purpose is to allow * filters to simply filter, and then wrap with this class to add caching. */ public class CachingSpanFilter extends SpanFilter { protected SpanFilter filter; /** * A transient Filter cache. */ private final CachingWrapperFilter.FilterCache cache; /** * New deletions always result in a cache miss, by default * ({@link CachingWrapperFilter.DeletesMode#RECACHE}. 
* @param filter Filter to cache results of */ public CachingSpanFilter(SpanFilter filter) { this(filter, CachingWrapperFilter.DeletesMode.RECACHE); } /** * @param filter Filter to cache results of * @param deletesMode See {@link CachingWrapperFilter.DeletesMode} */ public CachingSpanFilter(SpanFilter filter, CachingWrapperFilter.DeletesMode deletesMode) { this.filter = filter; if (deletesMode == CachingWrapperFilter.DeletesMode.DYNAMIC) { throw new IllegalArgumentException("DeletesMode.DYNAMIC is not supported"); } this.cache = new CachingWrapperFilter.FilterCache(deletesMode) { protected Object mergeDeletes(final IndexReader r, final Object value) { throw new IllegalStateException("DeletesMode.DYNAMIC is not supported"); } }; } /** * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { SpanFilterResult result = getCachedResult(reader); return result != null ? result.getBits() : null; } public DocIdSet getDocIdSet(IndexReader reader) throws IOException { SpanFilterResult result = getCachedResult(reader); return result != null ? result.getDocIdSet() : null; } // for testing int hitCount, missCount; private SpanFilterResult getCachedResult(IndexReader reader) throws IOException { final Object coreKey = reader.getFieldCacheKey(); final Object delCoreKey = reader.hasDeletions() ? reader.getDeletesCacheKey() : coreKey; SpanFilterResult result = (SpanFilterResult) cache.get(reader, coreKey, delCoreKey); if (result != null) { hitCount++; return result; } missCount++; result = filter.bitSpans(reader); cache.put(coreKey, delCoreKey, result); return result; } public SpanFilterResult bitSpans(IndexReader reader) throws IOException { return getCachedResult(reader); } public String toString() { return "CachingSpanFilter("+filter+")"; } public boolean equals(Object o) { if (!(o instanceof CachingSpanFilter)) return false; return this.filter.equals(((CachingSpanFilter)o).filter); } public int hashCode() { return filter.hashCode() ^ 0x1117BF25; } } lucene-2.9.4/src/java/org/apache/lucene/search/FuzzyQuery.java0000644000175000017500000001767611474320224024753 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; /** Implements the fuzzy search query. The similarity measurement * is based on the Levenshtein (edit distance) algorithm. * * Warning: this query is not very scalable with its default prefix * length of 0 - in this case, *every* term will be enumerated and * cause an edit score calculation. 
* */ public class FuzzyQuery extends MultiTermQuery { public final static float defaultMinSimilarity = 0.5f; public final static int defaultPrefixLength = 0; private float minimumSimilarity; private int prefixLength; private boolean termLongEnough = false; protected Term term; /** * Create a new FuzzyQuery that will match terms with a similarity * of at least minimumSimilarity to term. * If a prefixLength > 0 is specified, a common prefix * of that length is also required. * * @param term the term to search for * @param minimumSimilarity a value between 0 and 1 to set the required similarity * between the query term and the matching terms. For example, for a * minimumSimilarity of 0.5 a term of the same length * as the query term is considered similar to the query term if the edit distance * between both terms is less than length(term)*0.5 * @param prefixLength length of common (non-fuzzy) prefix * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0 * or if prefixLength < 0 */ public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) throws IllegalArgumentException { super(term); // will be removed in 3.0 this.term = term; if (minimumSimilarity >= 1.0f) throw new IllegalArgumentException("minimumSimilarity >= 1"); else if (minimumSimilarity < 0.0f) throw new IllegalArgumentException("minimumSimilarity < 0"); if (prefixLength < 0) throw new IllegalArgumentException("prefixLength < 0"); if (term.text().length() > 1.0f / (1.0f - minimumSimilarity)) { this.termLongEnough = true; } this.minimumSimilarity = minimumSimilarity; this.prefixLength = prefixLength; rewriteMethod = SCORING_BOOLEAN_QUERY_REWRITE; } /** * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0)}. */ public FuzzyQuery(Term term, float minimumSimilarity) throws IllegalArgumentException { this(term, minimumSimilarity, defaultPrefixLength); } /** * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, 0.5f, 0)}. */ public FuzzyQuery(Term term) { this(term, defaultMinSimilarity, defaultPrefixLength); } /** * Returns the minimum similarity that is required for this query to match. * @return float value between 0.0 and 1.0 */ public float getMinSimilarity() { return minimumSimilarity; } /** * Returns the non-fuzzy prefix length. This is the number of characters at the start * of a term that must be identical (not fuzzy) to the query term if the query * is to match that term. */ public int getPrefixLength() { return prefixLength; } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new FuzzyTermEnum(reader, getTerm(), minimumSimilarity, prefixLength); } /** * Returns the pattern term. 
*/ public Term getTerm() { return term; } public void setRewriteMethod(RewriteMethod method) { throw new UnsupportedOperationException("FuzzyQuery cannot change rewrite method"); } public Query rewrite(IndexReader reader) throws IOException { if(!termLongEnough) { // can only match if it's exact return new TermQuery(term); } FilteredTermEnum enumerator = getEnum(reader); int maxClauseCount = BooleanQuery.getMaxClauseCount(); ScoreTermQueue stQueue = new ScoreTermQueue(maxClauseCount); ScoreTerm reusableST = null; try { do { float score = 0.0f; Term t = enumerator.term(); if (t != null) { score = enumerator.difference(); if (reusableST == null) { reusableST = new ScoreTerm(t, score); } else if (score >= reusableST.score) { // reusableST holds the last "rejected" entry, so, if // this new score is not better than that, there's no // need to try inserting it reusableST.score = score; reusableST.term = t; } else { continue; } reusableST = (ScoreTerm) stQueue.insertWithOverflow(reusableST); } } while (enumerator.next()); } finally { enumerator.close(); } BooleanQuery query = new BooleanQuery(true); int size = stQueue.size(); for(int i = 0; i < size; i++){ ScoreTerm st = (ScoreTerm) stQueue.pop(); TermQuery tq = new TermQuery(st.term); // found a match tq.setBoost(getBoost() * st.score); // set the boost query.add(tq, BooleanClause.Occur.SHOULD); // add to query } return query; } public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (!term.field().equals(field)) { buffer.append(term.field()); buffer.append(":"); } buffer.append(term.text()); buffer.append('~'); buffer.append(Float.toString(minimumSimilarity)); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } protected static class ScoreTerm { public Term term; public float score; public ScoreTerm(Term term, float score){ this.term = term; this.score = score; } } protected static class ScoreTermQueue extends PriorityQueue { public ScoreTermQueue(int size){ initialize(size); } /* (non-Javadoc) * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object) */ protected boolean lessThan(Object a, Object b) { ScoreTerm termA = (ScoreTerm)a; ScoreTerm termB = (ScoreTerm)b; if (termA.score == termB.score) return termA.term.compareTo(termB.term) > 0; else return termA.score < termB.score; } } public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + Float.floatToIntBits(minimumSimilarity); result = prime * result + prefixLength; result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; FuzzyQuery other = (FuzzyQuery) obj; if (Float.floatToIntBits(minimumSimilarity) != Float .floatToIntBits(other.minimumSimilarity)) return false; if (prefixLength != other.prefixLength) return false; if (term == null) { if (other.term != null) return false; } else if (!term.equals(other.term)) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/Searchable.java0000644000175000017500000001700011474320224024604 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; /** * The interface for search implementations. * *

    * Searchable is the abstract network protocol for searching. Implementations * provide search over a single index, over multiple indices, and over indices * on remote servers. * *

    * Queries, filters and sort criteria are designed to be compact so that they * may be efficiently passed to a remote index, with only the top-scoring hits * being returned, rather than every matching hit. * * NOTE: this interface is kept public for convenience. Since it is not * expected to be implemented directly, it may be changed unexpectedly between * releases. */ public interface Searchable { /** Lower-level search API. * *

    {@link HitCollector#collect(int,float)} is called for every non-zero * scoring document. *
    HitCollector-based access to remote indexes is discouraged. * *

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query)}) is usually more efficient, as it skips * non-high-scoring hits. * * @param weight to match documents * @param filter if non-null, used to permit documents to be collected. * @param results to receive hits * @throws BooleanQuery.TooManyClauses * @deprecated use {@link #search(Weight, Filter, Collector)} instead. */ void search(Weight weight, Filter filter, HitCollector results) throws IOException; /** * Lower-level search API. * *

    * {@link Collector#collect(int)} is called for every document.
    * Collector-based access to remote indexes is discouraged. * *

    * Applications should only use this if they need all of the matching * documents. The high-level search API ({@link Searcher#search(Query)}) is * usually more efficient, as it skips non-high-scoring hits. * * @param weight * to match documents * @param filter * if non-null, used to permit documents to be collected. * @param collector * to receive hits * @throws BooleanQuery.TooManyClauses */ void search(Weight weight, Filter filter, Collector collector) throws IOException; /** Frees resources associated with this Searcher. * Be careful not to call this method while you are still using objects * like {@link Hits}. */ void close() throws IOException; /** Expert: Returns the number of documents containing term. * * @see org.apache.lucene.index.IndexReader#docFreq(Term) */ int docFreq(Term term) throws IOException; /** Expert: For each term in the terms array, calculates the number of * documents containing term. Returns an array with these * document frequencies. Used to minimize number of remote calls. */ int[] docFreqs(Term[] terms) throws IOException; /** Expert: Returns one greater than the largest possible document number. * * @see org.apache.lucene.index.IndexReader#maxDoc() */ int maxDoc() throws IOException; /** Expert: Low-level search implementation. Finds the top n * hits for query, applying filter if non-null. * *

    Applications should usually call {@link Searcher#search(Query)} or * {@link Searcher#search(Query,Filter)} instead. * @throws BooleanQuery.TooManyClauses */ TopDocs search(Weight weight, Filter filter, int n) throws IOException; /** * Returns the stored fields of document i. * * @see org.apache.lucene.index.IndexReader#document(int) * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ Document doc(int i) throws CorruptIndexException, IOException; /** * Get the {@link org.apache.lucene.document.Document} at the nth position. The {@link org.apache.lucene.document.FieldSelector} * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded. * * NOTE: If the underlying Reader (more specifically, the underlying FieldsReader) is closed before the lazy {@link org.apache.lucene.document.Field} is * loaded an exception may be thrown. If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing you must * explicitly load it or fetch the Document again with a new loader. * * * @param n Get the document at the nth position * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document. May be null, in which case all Fields will be loaded. * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * * @see org.apache.lucene.index.IndexReader#document(int, FieldSelector) * @see org.apache.lucene.document.Fieldable * @see org.apache.lucene.document.FieldSelector * @see org.apache.lucene.document.SetBasedFieldSelector * @see org.apache.lucene.document.LoadFirstFieldSelector */ Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException; /** Expert: called to re-write queries into primitive queries. * @throws BooleanQuery.TooManyClauses */ Query rewrite(Query query) throws IOException; /** Expert: low-level implementation method * Returns an Explanation that describes how doc scored against * weight. * *

    This is intended to be used in developing Similarity implementations, * and, for good performance, should not be displayed with every hit. * Computing an explanation is as expensive as executing the query over the * entire index. *

    Applications should call {@link Searcher#explain(Query, int)}. * @throws BooleanQuery.TooManyClauses */ Explanation explain(Weight weight, int doc) throws IOException; /** Expert: Low-level search implementation with arbitrary sorting. Finds * the top n hits for query, applying * filter if non-null, and sorting the hits by the criteria in * sort. * *

    Applications should usually call {@link * Searcher#search(Query,Filter,int,Sort)} instead. * * @throws BooleanQuery.TooManyClauses */ TopFieldDocs search(Weight weight, Filter filter, int n, Sort sort) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/search/DocIdSetIterator.java0000644000175000017500000001205511474320224025730 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * This abstract class defines methods to iterate over a set of non-decreasing * doc ids. Note that this class assumes it iterates on doc Ids, and therefore * {@link #NO_MORE_DOCS} is set to {@value #NO_MORE_DOCS} in order to be used as * a sentinel object. Implementations of this class are expected to consider * {@link Integer#MAX_VALUE} as an invalid value. */ public abstract class DocIdSetIterator { // TODO (3.0): review the javadocs and remove any references to '3.0'. private int doc = -1; /** * When returned by {@link #nextDoc()}, {@link #advance(int)} and * {@link #doc()} it means there are no more docs in the iterator. */ public static final int NO_MORE_DOCS = Integer.MAX_VALUE; /** * Unsupported anymore. Call {@link #docID()} instead. This method throws * {@link UnsupportedOperationException} if called. * * @deprecated use {@link #docID()} instead. */ public int doc() { throw new UnsupportedOperationException("Call docID() instead."); } /** * Returns the following: *

      *
    • -1 or {@link #NO_MORE_DOCS} if {@link #nextDoc()} or * {@link #advance(int)} were not called yet. *
    • {@link #NO_MORE_DOCS} if the iterator has exhausted. *
    • Otherwise it should return the doc ID it is currently on. *
    *

    * NOTE: in 3.0, this method will become abstract. * * @since 2.9 */ public int docID() { return doc; } /** * Unsupported anymore. Call {@link #nextDoc()} instead. This method throws * {@link UnsupportedOperationException} if called. * * @deprecated use {@link #nextDoc()} instead. This will be removed in 3.0 */ public boolean next() throws IOException { throw new UnsupportedOperationException("Call nextDoc() instead."); } /** * Unsupported anymore. Call {@link #advance(int)} instead. This method throws * {@link UnsupportedOperationException} if called. * * @deprecated use {@link #advance(int)} instead. This will be removed in 3.0 */ public boolean skipTo(int target) throws IOException { throw new UnsupportedOperationException("Call advance() instead."); } /** * Advances to the next document in the set and returns the doc it is * currently on, or {@link #NO_MORE_DOCS} if there are no more docs in the * set.
    * * NOTE: in 3.0 this method will become abstract, following the removal * of {@link #next()}. For backward compatibility it is implemented as: * *

       * public int nextDoc() throws IOException {
       *   return next() ? doc() : NO_MORE_DOCS;
       * }
       * 
    * * NOTE: after the iterator has exhausted you should not call this * method, as it may result in unpredicted behavior. * * @since 2.9 */ public int nextDoc() throws IOException { return doc = next() ? doc() : NO_MORE_DOCS; } /** * Advances to the first beyond the current whose document number is greater * than or equal to target. Returns the current document number or * {@link #NO_MORE_DOCS} if there are no more docs in the set. *

    * Behaves as if written: * *

       * int advance(int target) {
       *   int doc;
       *   while ((doc = nextDoc()) < target) {
       *   }
       *   return doc;
       * }
       * 
    * * Some implementations are considerably more efficient than that. *

    * NOTE: certain implementations may return a different value (each * time) if called several times in a row with the same target. *

    * NOTE: this method may be called with {@value #NO_MORE_DOCS} for * efficiency by some Scorers. If your implementation cannot efficiently * determine that it should exhaust, it is recommended that you check for that * value in each call to this method. *

    * NOTE: after the iterator has exhausted you should not call this * method, as it may result in unpredicted behavior. *

    * NOTE: in 3.0 this method will become abstract, following the removal * of {@link #skipTo(int)}. * * @since 2.9 */ public int advance(int target) throws IOException { if (target == NO_MORE_DOCS) { return doc = NO_MORE_DOCS; } return doc = skipTo(target) ? doc() : NO_MORE_DOCS; } } lucene-2.9.4/src/java/org/apache/lucene/search/DisjunctionMaxQuery.java0000644000175000017500000002540411474320224026547 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.Set; import org.apache.lucene.index.IndexReader; /** * A query that generates the union of documents produced by its subqueries, and that scores each document with the maximum * score for that document as produced by any subquery, plus a tie breaking increment for any additional matching subqueries. * This is useful when searching for a word in multiple fields with different boost factors (so that the fields cannot be * combined equivalently into a single search field). We want the primary score to be the one associated with the highest boost, * not the sum of the field scores (as BooleanQuery would give). * If the query is "albino elephant" this ensures that "albino" matching one field and "elephant" matching * another gets a higher score than "albino" matching both fields. * To get this result, use both BooleanQuery and DisjunctionMaxQuery: for each term a DisjunctionMaxQuery searches for it in * each field, while the set of these DisjunctionMaxQuery's is combined into a BooleanQuery. * The tie breaker capability allows results that include the same term in multiple fields to be judged better than results that * include this term in only the best of those multiple fields, without confusing this with the better case of two different terms * in the multiple fields. */ public class DisjunctionMaxQuery extends Query { /* The subqueries */ private ArrayList disjuncts = new ArrayList(); /* Multiple of the non-max disjunct scores added into our final score. Non-zero values support tie-breaking. */ private float tieBreakerMultiplier = 0.0f; /** Creates a new empty DisjunctionMaxQuery. Use add() to add the subqueries. * @param tieBreakerMultiplier the score of each non-maximum disjunct for a document is multiplied by this weight * and added into the final score. If non-zero, the value should be small, on the order of 0.1, which says that * 10 occurrences of word in a lower-scored field that is also in a higher scored field is just as good as a unique * word in the lower scored field (i.e., one that is not in any higher scored field. 
*/ public DisjunctionMaxQuery(float tieBreakerMultiplier) { this.tieBreakerMultiplier = tieBreakerMultiplier; } /** * Creates a new DisjunctionMaxQuery * @param disjuncts a Collection of all the disjuncts to add * @param tieBreakerMultiplier the weight to give to each matching non-maximum disjunct */ public DisjunctionMaxQuery(Collection disjuncts, float tieBreakerMultiplier) { this.tieBreakerMultiplier = tieBreakerMultiplier; add(disjuncts); } /** Add a subquery to this disjunction * @param query the disjunct added */ public void add(Query query) { disjuncts.add(query); } /** Add a collection of disjuncts to this disjunction * via Iterable */ public void add(Collection disjuncts) { this.disjuncts.addAll(disjuncts); } /** An Iterator over the disjuncts */ public Iterator iterator() { return disjuncts.iterator(); } /** * Expert: the Weight for DisjunctionMaxQuery, used to * normalize, score and explain these queries. * *

    NOTE: this API and implementation is subject to * change suddenly in the next release.

    */ protected class DisjunctionMaxWeight extends Weight { /** The Similarity implementation. */ protected Similarity similarity; /** The Weights for our subqueries, in 1-1 correspondence with disjuncts */ protected ArrayList weights = new ArrayList(); // The Weight's for our subqueries, in 1-1 correspondence with disjuncts /* Construct the Weight for this Query searched by searcher. Recursively construct subquery weights. */ public DisjunctionMaxWeight(Searcher searcher) throws IOException { this.similarity = searcher.getSimilarity(); for (Iterator iter = disjuncts.iterator(); iter.hasNext();) { weights.add(((Query) iter.next()).createWeight(searcher)); } } /* Return our associated DisjunctionMaxQuery */ public Query getQuery() { return DisjunctionMaxQuery.this; } /* Return our boost */ public float getValue() { return getBoost(); } /* Compute the sub of squared weights of us applied to our subqueries. Used for normalization. */ public float sumOfSquaredWeights() throws IOException { float max = 0.0f, sum = 0.0f; for (Iterator iter = weights.iterator(); iter.hasNext();) { float sub = ((Weight) iter.next()).sumOfSquaredWeights(); sum += sub; max = Math.max(max, sub); } float boost = getBoost(); return (((sum - max) * tieBreakerMultiplier * tieBreakerMultiplier) + max) * boost * boost; } /* Apply the computed normalization factor to our subqueries */ public void normalize(float norm) { norm *= getBoost(); // Incorporate our boost for (Iterator iter = weights.iterator(); iter.hasNext();) { ((Weight) iter.next()).normalize(norm); } } /* Create the scorer used to score our associated DisjunctionMaxQuery */ public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { Scorer[] scorers = new Scorer[weights.size()]; int idx = 0; for (Iterator iter = weights.iterator(); iter.hasNext();) { Weight w = (Weight) iter.next(); Scorer subScorer = w.scorer(reader, true, false); if (subScorer != null && subScorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { scorers[idx++] = subScorer; } } if (idx == 0) return null; // all scorers did not have documents DisjunctionMaxScorer result = new DisjunctionMaxScorer(tieBreakerMultiplier, similarity, scorers, idx); return result; } /* Explain the score we computed for doc */ public Explanation explain(IndexReader reader, int doc) throws IOException { if (disjuncts.size() == 1) return ((Weight) weights.get(0)).explain(reader,doc); ComplexExplanation result = new ComplexExplanation(); float max = 0.0f, sum = 0.0f; result.setDescription(tieBreakerMultiplier == 0.0f ? 
"max of:" : "max plus " + tieBreakerMultiplier + " times others of:"); for (Iterator iter = weights.iterator(); iter.hasNext();) { Explanation e = ((Weight) iter.next()).explain(reader, doc); if (e.isMatch()) { result.setMatch(Boolean.TRUE); result.addDetail(e); sum += e.getValue(); max = Math.max(max, e.getValue()); } } result.setValue(max + (sum - max) * tieBreakerMultiplier); return result; } } // end of DisjunctionMaxWeight inner class /* Create the Weight used to score us */ public Weight createWeight(Searcher searcher) throws IOException { return new DisjunctionMaxWeight(searcher); } /** Optimize our representation and our subqueries representations * @param reader the IndexReader we query * @return an optimized copy of us (which may not be a copy if there is nothing to optimize) */ public Query rewrite(IndexReader reader) throws IOException { int numDisjunctions = disjuncts.size(); if (numDisjunctions == 1) { Query singleton = (Query) disjuncts.get(0); Query result = singleton.rewrite(reader); if (getBoost() != 1.0f) { if (result == singleton) result = (Query)result.clone(); result.setBoost(getBoost() * result.getBoost()); } return result; } DisjunctionMaxQuery clone = null; for (int i = 0 ; i < numDisjunctions; i++) { Query clause = (Query) disjuncts.get(i); Query rewrite = clause.rewrite(reader); if (rewrite != clause) { if (clone == null) clone = (DisjunctionMaxQuery)this.clone(); clone.disjuncts.set(i, rewrite); } } if (clone != null) return clone; else return this; } /** Create a shallow copy of us -- used in rewriting if necessary * @return a copy of us (but reuse, don't copy, our subqueries) */ public Object clone() { DisjunctionMaxQuery clone = (DisjunctionMaxQuery)super.clone(); clone.disjuncts = (ArrayList)this.disjuncts.clone(); return clone; } // inherit javadoc public void extractTerms(Set terms) { for (Iterator iter = disjuncts.iterator(); iter.hasNext();) { ((Query) iter.next()).extractTerms(terms); } } /** Prettyprint us. * @param field the field to which we are applied * @return a string that shows what we do, of the form "(disjunct1 | disjunct2 | ... | disjunctn)^boost" */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); buffer.append("("); int numDisjunctions = disjuncts.size(); for (int i = 0 ; i < numDisjunctions; i++) { Query subquery = (Query) disjuncts.get(i); if (subquery instanceof BooleanQuery) { // wrap sub-bools in parens buffer.append("("); buffer.append(subquery.toString(field)); buffer.append(")"); } else buffer.append(subquery.toString(field)); if (i != numDisjunctions-1) buffer.append(" | "); } buffer.append(")"); if (tieBreakerMultiplier != 0.0f) { buffer.append("~"); buffer.append(tieBreakerMultiplier); } if (getBoost() != 1.0) { buffer.append("^"); buffer.append(getBoost()); } return buffer.toString(); } /** Return true iff we represent the same query as o * @param o another object * @return true iff o is a DisjunctionMaxQuery with the same boost and the same subqueries, in the same order, as us */ public boolean equals(Object o) { if (! 
(o instanceof DisjunctionMaxQuery) ) return false; DisjunctionMaxQuery other = (DisjunctionMaxQuery)o; return this.getBoost() == other.getBoost() && this.tieBreakerMultiplier == other.tieBreakerMultiplier && this.disjuncts.equals(other.disjuncts); } /** Compute a hash code for hashing us * @return the hash code */ public int hashCode() { return Float.floatToIntBits(getBoost()) + Float.floatToIntBits(tieBreakerMultiplier) + disjuncts.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/BooleanScorer2.java0000644000175000017500000003305011474320224025375 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.List; /* See the description in BooleanScorer.java, comparing * BooleanScorer & BooleanScorer2 */ /** An alternative to BooleanScorer that also allows a minimum number * of optional scorers that should match. *
    Implements skipTo(), and has no limitations on the numbers of added scorers. *
    Uses ConjunctionScorer, DisjunctionScorer, ReqOptScorer and ReqExclScorer. */ class BooleanScorer2 extends Scorer { private final List requiredScorers; private final List optionalScorers; private final List prohibitedScorers; private class Coordinator { float[] coordFactors = null; int maxCoord = 0; // to be increased for each non prohibited scorer int nrMatchers; // to be increased by score() of match counting scorers. void init() { // use after all scorers have been added. coordFactors = new float[maxCoord + 1]; Similarity sim = getSimilarity(); for (int i = 0; i <= maxCoord; i++) { coordFactors[i] = sim.coord(i, maxCoord); } } } private final Coordinator coordinator; /** The scorer to which all scoring will be delegated, * except for computing and using the coordination factor. */ private final Scorer countingSumScorer; /** The number of optionalScorers that need to match (if there are any) */ private final int minNrShouldMatch; private int doc = -1; /** * Creates a {@link Scorer} with the given similarity and lists of required, * prohibited and optional scorers. In no required scorers are added, at least * one of the optional scorers will have to match during the search. * * @param similarity * The similarity to be used. * @param minNrShouldMatch * The minimum number of optional added scorers that should match * during the search. In case no required scorers are added, at least * one of the optional scorers will have to match during the search. * @param required * the list of required scorers. * @param prohibited * the list of prohibited scorers. * @param optional * the list of optional scorers. */ public BooleanScorer2(Similarity similarity, int minNrShouldMatch, List required, List prohibited, List optional) throws IOException { super(similarity); if (minNrShouldMatch < 0) { throw new IllegalArgumentException("Minimum number of optional scorers should not be negative"); } coordinator = new Coordinator(); this.minNrShouldMatch = minNrShouldMatch; optionalScorers = optional; coordinator.maxCoord += optional.size(); requiredScorers = required; coordinator.maxCoord += required.size(); prohibitedScorers = prohibited; coordinator.init(); countingSumScorer = makeCountingSumScorer(); } /** Count a scorer as a single match. */ private class SingleMatchScorer extends Scorer { private Scorer scorer; private int lastScoredDoc = -1; // Save the score of lastScoredDoc, so that we don't compute it more than // once in score(). private float lastDocScore = Float.NaN; SingleMatchScorer(Scorer scorer) { super(scorer.getSimilarity()); this.scorer = scorer; } public float score() throws IOException { int doc = docID(); if (doc >= lastScoredDoc) { if (doc > lastScoredDoc) { lastDocScore = scorer.score(); lastScoredDoc = doc; } coordinator.nrMatchers++; } return lastDocScore; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return scorer.doc(); } public int docID() { return scorer.docID(); } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return scorer.nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { return scorer.nextDoc(); } /** @deprecated use {@link #advance(int)} instead. 
*/ public boolean skipTo(int docNr) throws IOException { return scorer.advance(docNr) != NO_MORE_DOCS; } public int advance(int target) throws IOException { return scorer.advance(target); } public Explanation explain(int docNr) throws IOException { return scorer.explain(docNr); } } private Scorer countingDisjunctionSumScorer(final List scorers, int minNrShouldMatch) throws IOException { // each scorer from the list counted as a single matcher return new DisjunctionSumScorer(scorers, minNrShouldMatch) { private int lastScoredDoc = -1; // Save the score of lastScoredDoc, so that we don't compute it more than // once in score(). private float lastDocScore = Float.NaN; public float score() throws IOException { int doc = docID(); if (doc >= lastScoredDoc) { if (doc > lastScoredDoc) { lastDocScore = super.score(); lastScoredDoc = doc; } coordinator.nrMatchers += super.nrMatchers; } return lastDocScore; } }; } private static final Similarity defaultSimilarity = Similarity.getDefault(); private Scorer countingConjunctionSumScorer(List requiredScorers) throws IOException { // each scorer from the list counted as a single matcher final int requiredNrMatchers = requiredScorers.size(); return new ConjunctionScorer(defaultSimilarity, requiredScorers) { private int lastScoredDoc = -1; // Save the score of lastScoredDoc, so that we don't compute it more than // once in score(). private float lastDocScore = Float.NaN; public float score() throws IOException { int doc = docID(); if (doc >= lastScoredDoc) { if (doc > lastScoredDoc) { lastDocScore = super.score(); lastScoredDoc = doc; } coordinator.nrMatchers += requiredNrMatchers; } // All scorers match, so defaultSimilarity super.score() always has 1 as // the coordination factor. // Therefore the sum of the scores of the requiredScorers // is used as score. return lastDocScore; } }; } private Scorer dualConjunctionSumScorer(Scorer req1, Scorer req2) throws IOException { // non counting. return new ConjunctionScorer(defaultSimilarity, new Scorer[]{req1, req2}); // All scorers match, so defaultSimilarity always has 1 as // the coordination factor. // Therefore the sum of the scores of two scorers // is used as score. } /** Returns the scorer to be used for match counting and score summing. * Uses requiredScorers, optionalScorers and prohibitedScorers. */ private Scorer makeCountingSumScorer() throws IOException { // each scorer counted as a single matcher return (requiredScorers.size() == 0) ? makeCountingSumScorerNoReq() : makeCountingSumScorerSomeReq(); } private Scorer makeCountingSumScorerNoReq() throws IOException { // No required scorers // minNrShouldMatch optional scorers are required, but at least 1 int nrOptRequired = (minNrShouldMatch < 1) ? 1 : minNrShouldMatch; Scorer requiredCountingSumScorer; if (optionalScorers.size() > nrOptRequired) requiredCountingSumScorer = countingDisjunctionSumScorer(optionalScorers, nrOptRequired); else if (optionalScorers.size() == 1) requiredCountingSumScorer = new SingleMatchScorer((Scorer) optionalScorers.get(0)); else requiredCountingSumScorer = countingConjunctionSumScorer(optionalScorers); return addProhibitedScorers(requiredCountingSumScorer); } private Scorer makeCountingSumScorerSomeReq() throws IOException { // At least one required scorer. if (optionalScorers.size() == minNrShouldMatch) { // all optional scorers also required. 
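      // Illustrative note (not part of the original source): when every
      // optional scorer must match anyway (optionalScorers.size() ==
      // minNrShouldMatch), required and optional clauses are equivalent, so
      // they are folded into a single counting conjunction below rather than
      // paying for a separate disjunction scorer.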
ArrayList allReq = new ArrayList(requiredScorers); allReq.addAll(optionalScorers); return addProhibitedScorers(countingConjunctionSumScorer(allReq)); } else { // optionalScorers.size() > minNrShouldMatch, and at least one required scorer Scorer requiredCountingSumScorer = requiredScorers.size() == 1 ? new SingleMatchScorer((Scorer) requiredScorers.get(0)) : countingConjunctionSumScorer(requiredScorers); if (minNrShouldMatch > 0) { // use a required disjunction scorer over the optional scorers return addProhibitedScorers( dualConjunctionSumScorer( // non counting requiredCountingSumScorer, countingDisjunctionSumScorer( optionalScorers, minNrShouldMatch))); } else { // minNrShouldMatch == 0 return new ReqOptSumScorer( addProhibitedScorers(requiredCountingSumScorer), optionalScorers.size() == 1 ? new SingleMatchScorer((Scorer) optionalScorers.get(0)) // require 1 in combined, optional scorer. : countingDisjunctionSumScorer(optionalScorers, 1)); } } } /** Returns the scorer to be used for match counting and score summing. * Uses the given required scorer and the prohibitedScorers. * @param requiredCountingSumScorer A required scorer already built. */ private Scorer addProhibitedScorers(Scorer requiredCountingSumScorer) throws IOException { return (prohibitedScorers.size() == 0) ? requiredCountingSumScorer // no prohibited : new ReqExclScorer(requiredCountingSumScorer, ((prohibitedScorers.size() == 1) ? (Scorer) prohibitedScorers.get(0) : new DisjunctionSumScorer(prohibitedScorers))); } /** Scores and collects all matching documents. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. *
    When this method is used the {@link #explain(int)} method should not be used. * @deprecated use {@link #score(Collector)} instead. */ public void score(HitCollector hc) throws IOException { score(new HitCollectorWrapper(hc)); } /** Scores and collects all matching documents. * @param collector The collector to which all matching documents are passed through. *
    When this method is used the {@link #explain(int)} method should not be used. */ public void score(Collector collector) throws IOException { collector.setScorer(this); while ((doc = countingSumScorer.nextDoc()) != NO_MORE_DOCS) { collector.collect(doc); } } /** Expert: Collects matching documents in a range. *
    Note that {@link #next()} must be called once before this method is * called for the first time. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. * @param max Do not score documents past this. * @return true if more matching documents may remain. * @deprecated use {@link #score(Collector, int, int)} instead. */ protected boolean score(HitCollector hc, int max) throws IOException { return score(new HitCollectorWrapper(hc), max, docID()); } protected boolean score(Collector collector, int max, int firstDocID) throws IOException { doc = firstDocID; collector.setScorer(this); while (doc < max) { collector.collect(doc); doc = countingSumScorer.nextDoc(); } return doc != NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return countingSumScorer.doc(); } public int docID() { return doc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { return doc = countingSumScorer.nextDoc(); } public float score() throws IOException { coordinator.nrMatchers = 0; float sum = countingSumScorer.score(); return sum * coordinator.coordFactors[coordinator.nrMatchers]; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { return doc = countingSumScorer.advance(target); } /** Throws an UnsupportedOperationException. * TODO: Implement an explanation of the coordination factor. * @param doc The document number for the explanation. * @throws UnsupportedOperationException */ public Explanation explain(int doc) { throw new UnsupportedOperationException(); /* How to explain the coordination factor? initCountingSumScorer(); return countingSumScorer.explain(doc); // misses coord factor. */ } } lucene-2.9.4/src/java/org/apache/lucene/search/PhraseQueue.java0000644000175000017500000000271211474320225025007 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.PriorityQueue; final class PhraseQueue extends PriorityQueue { PhraseQueue(int size) { initialize(size); } protected final boolean lessThan(Object o1, Object o2) { PhrasePositions pp1 = (PhrasePositions)o1; PhrasePositions pp2 = (PhrasePositions)o2; if (pp1.doc == pp2.doc) if (pp1.position == pp2.position) // same doc and pp.position, so decide by actual term positions. // rely on: pp.position == tp.position - offset. 
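        // Illustrative note (not part of the original source): comparing by
        // offset here orders PhrasePositions that tie on doc and position by
        // their place in the original phrase, keeping the ordering total and
        // deterministic.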
return pp1.offset < pp2.offset; else return pp1.position < pp2.position; else return pp1.doc < pp2.doc; } } lucene-2.9.4/src/java/org/apache/lucene/search/FilteredDocIdSet.java0000644000175000017500000000505111474320224025673 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * Abstract decorator class for a DocIdSet implementation * that provides on-demand filtering/validation * mechanism on a given DocIdSet. * *
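 * <p>Illustrative sketch (not part of the original source; <code>innerSet</code>
 * and <code>acceptDoc</code> are placeholders for application code):</p>
 * <pre>
 *   DocIdSet filtered = new FilteredDocIdSet(innerSet) {
 *     protected boolean match(int docid) {
 *       return acceptDoc(docid);   // application-specific test, evaluated lazily per docID
 *     }
 *   };
 * </pre>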

    * * Technically, this same functionality could be achieved * with ChainedFilter (under contrib/misc), however the * benefit of this class is it never materializes the full * bitset for the filter. Instead, the {@link #match} * method is invoked on-demand, per docID visited during * searching. If you know few docIDs will be visited, and * the logic behind {@link #match} is relatively costly, * this may be a better way to filter than ChainedFilter. * * @see DocIdSet */ public abstract class FilteredDocIdSet extends DocIdSet { private final DocIdSet _innerSet; /** * Constructor. * @param innerSet Underlying DocIdSet */ public FilteredDocIdSet(DocIdSet innerSet) { _innerSet = innerSet; } /** This DocIdSet implementation is cacheable if the inner set is cacheable. */ public boolean isCacheable() { return _innerSet.isCacheable(); } /** * Validation method to determine whether a docid should be in the result set. * @param docid docid to be tested * @return true if input docid should be in the result set, false otherwise. */ protected abstract boolean match(int docid) throws IOException; /** * Implementation of the contract to build a DocIdSetIterator. * @see DocIdSetIterator * @see FilteredDocIdSetIterator */ // @Override public DocIdSetIterator iterator() throws IOException { return new FilteredDocIdSetIterator(_innerSet.iterator()) { protected boolean match(int docid) throws IOException { return FilteredDocIdSet.this.match(docid); } }; } } lucene-2.9.4/src/java/org/apache/lucene/search/TopFieldCollector.java0000644000175000017500000007742511474320224026151 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldValueHitQueue.Entry; import org.apache.lucene.util.PriorityQueue; /** * A {@link Collector} that sorts by {@link SortField} using * {@link FieldComparator}s. *

    * See the {@link #create(org.apache.lucene.search.Sort, int, boolean, boolean, boolean, boolean)} method * for instantiating a TopFieldCollector. * *

    NOTE: This API is experimental and might change in * incompatible ways in the next release.

    */ public abstract class TopFieldCollector extends TopDocsCollector { // TODO: one optimization we could do is to pre-fill // the queue with sentinel value that guaranteed to // always compare lower than a real hit; this would // save having to check queueFull on each insert /* * Implements a TopFieldCollector over one SortField criteria, without * tracking document scores and maxScore. */ private static class OneComparatorNonScoringCollector extends TopFieldCollector { final FieldComparator comparator; final int reverseMul; public OneComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); comparator = queue.getComparators()[0]; reverseMul = queue.getReverseMul()[0]; } final void updateBottom(int doc) { // bottom.score is already set to Float.NaN in add(). bottom.docID = docBase + doc; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { if ((reverseMul * comparator.compareBottom(doc)) <= 0) { // since docs are visited in doc Id order, if compare is 0, it means // this document is largest than anything else in the queue, and // therefore not competitive. return; } // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc); comparator.setBottom(bottom.slot); } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, Float.NaN); if (queueFull) { comparator.setBottom(bottom.slot); } } } public void setNextReader(IndexReader reader, int docBase) throws IOException { this.docBase = docBase; comparator.setNextReader(reader, docBase); } public void setScorer(Scorer scorer) throws IOException { comparator.setScorer(scorer); } } /* * Implements a TopFieldCollector over one SortField criteria, without * tracking document scores and maxScore, and assumes out of orderness in doc * Ids collection. */ private static class OutOfOrderOneComparatorNonScoringCollector extends OneComparatorNonScoringCollector { public OutOfOrderOneComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive final int cmp = reverseMul * comparator.compareBottom(doc); if (cmp < 0 || (cmp == 0 && doc + docBase > bottom.docID)) { return; } // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc); comparator.setBottom(bottom.slot); } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, Float.NaN); if (queueFull) { comparator.setBottom(bottom.slot); } } } public boolean acceptsDocsOutOfOrder() { return true; } } /* * Implements a TopFieldCollector over one SortField criteria, while tracking * document scores but no maxScore. 
*/ private static class OneComparatorScoringNoMaxScoreCollector extends OneComparatorNonScoringCollector { Scorer scorer; public OneComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } final void updateBottom(int doc, float score) { bottom.docID = docBase + doc; bottom.score = score; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { if ((reverseMul * comparator.compareBottom(doc)) <= 0) { // since docs are visited in doc Id order, if compare is 0, it means // this document is largest than anything else in the queue, and // therefore not competitive. return; } // Compute the score only if the hit is competitive. final float score = scorer.score(); // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc, score); comparator.setBottom(bottom.slot); } else { // Compute the score only if the hit is competitive. final float score = scorer.score(); // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, score); if (queueFull) { comparator.setBottom(bottom.slot); } } } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; comparator.setScorer(scorer); } } /* * Implements a TopFieldCollector over one SortField criteria, while tracking * document scores but no maxScore, and assumes out of orderness in doc Ids * collection. */ private static class OutOfOrderOneComparatorScoringNoMaxScoreCollector extends OneComparatorScoringNoMaxScoreCollector { public OutOfOrderOneComparatorScoringNoMaxScoreCollector( FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive final int cmp = reverseMul * comparator.compareBottom(doc); if (cmp < 0 || (cmp == 0 && doc + docBase > bottom.docID)) { return; } // Compute the score only if the hit is competitive. final float score = scorer.score(); // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc, score); comparator.setBottom(bottom.slot); } else { // Compute the score only if the hit is competitive. final float score = scorer.score(); // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, score); if (queueFull) { comparator.setBottom(bottom.slot); } } } public boolean acceptsDocsOutOfOrder() { return true; } } /* * Implements a TopFieldCollector over one SortField criteria, with tracking * document scores and maxScore. */ private static class OneComparatorScoringMaxScoreCollector extends OneComparatorNonScoringCollector { Scorer scorer; public OneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); // Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN. 
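      // Illustrative note (not part of the original source): maxScore starts
      // out as Float.NaN (the "not tracked" marker), and every comparison
      // against NaN is false, so it must be reset here before the first
      // collected hit can update it.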
maxScore = Float.NEGATIVE_INFINITY; } final void updateBottom(int doc, float score) { bottom.docID = docBase + doc; bottom.score = score; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { final float score = scorer.score(); if (score > maxScore) { maxScore = score; } ++totalHits; if (queueFull) { if ((reverseMul * comparator.compareBottom(doc)) <= 0) { // since docs are visited in doc Id order, if compare is 0, it means // this document is largest than anything else in the queue, and // therefore not competitive. return; } // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc, score); comparator.setBottom(bottom.slot); } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, score); if (queueFull) { comparator.setBottom(bottom.slot); } } } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; super.setScorer(scorer); } } /* * Implements a TopFieldCollector over one SortField criteria, with tracking * document scores and maxScore, and assumes out of orderness in doc Ids * collection. */ private static class OutOfOrderOneComparatorScoringMaxScoreCollector extends OneComparatorScoringMaxScoreCollector { public OutOfOrderOneComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { final float score = scorer.score(); if (score > maxScore) { maxScore = score; } ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive final int cmp = reverseMul * comparator.compareBottom(doc); if (cmp < 0 || (cmp == 0 && doc + docBase > bottom.docID)) { return; } // This hit is competitive - replace bottom element in queue & adjustTop comparator.copy(bottom.slot, doc); updateBottom(doc, score); comparator.setBottom(bottom.slot); } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue comparator.copy(slot, doc); add(slot, doc, score); if (queueFull) { comparator.setBottom(bottom.slot); } } } public boolean acceptsDocsOutOfOrder() { return true; } } /* * Implements a TopFieldCollector over multiple SortField criteria, without * tracking document scores and maxScore. */ private static class MultiComparatorNonScoringCollector extends TopFieldCollector { final FieldComparator[] comparators; final int[] reverseMul; public MultiComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); comparators = queue.getComparators(); reverseMul = queue.getReverseMul(); } final void updateBottom(int doc) { // bottom.score is already set to Float.NaN in add(). bottom.docID = docBase + doc; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // Here c=0. If we're at the last comparator, this doc is not // competitive, since docs are visited in doc Id order, which means // this doc cannot compete with any other document in the queue. 
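            // Illustrative note (not part of the original source): with a
            // compound sort such as
            //   new Sort(new SortField[] {
            //       new SortField("price", SortField.FLOAT),
            //       new SortField("title", SortField.STRING) })
            // comparators[0] orders by price and comparators[1] breaks ties by
            // title; only a hit that ties on every field reaches this point,
            // where visiting order decides and the later doc loses.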
return; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } updateBottom(doc); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } add(slot, doc, Float.NaN); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public void setNextReader(IndexReader reader, int docBase) throws IOException { this.docBase = docBase; for (int i = 0; i < comparators.length; i++) { comparators[i].setNextReader(reader, docBase); } } public void setScorer(Scorer scorer) throws IOException { // set the scorer on all comparators for (int i = 0; i < comparators.length; i++) { comparators[i].setScorer(scorer); } } } /* * Implements a TopFieldCollector over multiple SortField criteria, without * tracking document scores and maxScore, and assumes out of orderness in doc * Ids collection. */ private static class OutOfOrderMultiComparatorNonScoringCollector extends MultiComparatorNonScoringCollector { public OutOfOrderMultiComparatorNonScoringCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // This is the equals case. if (doc + docBase > bottom.docID) { // Definitely not competitive return; } break; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } updateBottom(doc); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } add(slot, doc, Float.NaN); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public boolean acceptsDocsOutOfOrder() { return true; } } /* * Implements a TopFieldCollector over multiple SortField criteria, with * tracking document scores and maxScore. */ private static class MultiComparatorScoringMaxScoreCollector extends MultiComparatorNonScoringCollector { Scorer scorer; public MultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); // Must set maxScore to NEG_INF, or otherwise Math.max always returns NaN. 
maxScore = Float.NEGATIVE_INFINITY; } final void updateBottom(int doc, float score) { bottom.docID = docBase + doc; bottom.score = score; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { final float score = scorer.score(); if (score > maxScore) { maxScore = score; } ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // Here c=0. If we're at the last comparator, this doc is not // competitive, since docs are visited in doc Id order, which means // this doc cannot compete with any other document in the queue. return; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } updateBottom(doc, score); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } add(slot, doc, score); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; super.setScorer(scorer); } } /* * Implements a TopFieldCollector over multiple SortField criteria, with * tracking document scores and maxScore, and assumes out of orderness in doc * Ids collection. */ private final static class OutOfOrderMultiComparatorScoringMaxScoreCollector extends MultiComparatorScoringMaxScoreCollector { public OutOfOrderMultiComparatorScoringMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { final float score = scorer.score(); if (score > maxScore) { maxScore = score; } ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // This is the equals case. if (doc + docBase > bottom.docID) { // Definitely not competitive return; } break; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } updateBottom(doc, score); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } add(slot, doc, score); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public boolean acceptsDocsOutOfOrder() { return true; } } /* * Implements a TopFieldCollector over multiple SortField criteria, with * tracking document scores and maxScore. 
*/ private static class MultiComparatorScoringNoMaxScoreCollector extends MultiComparatorNonScoringCollector { Scorer scorer; public MultiComparatorScoringNoMaxScoreCollector(FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } final void updateBottom(int doc, float score) { bottom.docID = docBase + doc; bottom.score = score; bottom = (Entry) pq.updateTop(); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // Here c=0. If we're at the last comparator, this doc is not // competitive, since docs are visited in doc Id order, which means // this doc cannot compete with any other document in the queue. return; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } // Compute score only if it is competitive. final float score = scorer.score(); updateBottom(doc, score); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } // Compute score only if it is competitive. final float score = scorer.score(); add(slot, doc, score); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; super.setScorer(scorer); } } /* * Implements a TopFieldCollector over multiple SortField criteria, with * tracking document scores and maxScore, and assumes out of orderness in doc * Ids collection. */ private final static class OutOfOrderMultiComparatorScoringNoMaxScoreCollector extends MultiComparatorScoringNoMaxScoreCollector { public OutOfOrderMultiComparatorScoringNoMaxScoreCollector( FieldValueHitQueue queue, int numHits, boolean fillFields) throws IOException { super(queue, numHits, fillFields); } public void collect(int doc) throws IOException { ++totalHits; if (queueFull) { // Fastmatch: return if this hit is not competitive for (int i = 0;; i++) { final int c = reverseMul[i] * comparators[i].compareBottom(doc); if (c < 0) { // Definitely not competitive. return; } else if (c > 0) { // Definitely competitive. break; } else if (i == comparators.length - 1) { // This is the equals case. if (doc + docBase > bottom.docID) { // Definitely not competitive return; } break; } } // This hit is competitive - replace bottom element in queue & adjustTop for (int i = 0; i < comparators.length; i++) { comparators[i].copy(bottom.slot, doc); } // Compute score only if it is competitive. final float score = scorer.score(); updateBottom(doc, score); for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } else { // Startup transient: queue hasn't gathered numHits yet final int slot = totalHits - 1; // Copy hit into queue for (int i = 0; i < comparators.length; i++) { comparators[i].copy(slot, doc); } // Compute score only if it is competitive. 
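          // Illustrative note (not part of the original source): scorer.score()
          // is only invoked once the hit is known to be kept, so documents
          // rejected by the comparators never pay for score computation; this
          // is the cost that the trackDocScores flag controls.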
final float score = scorer.score(); add(slot, doc, score); if (queueFull) { for (int i = 0; i < comparators.length; i++) { comparators[i].setBottom(bottom.slot); } } } } public void setScorer(Scorer scorer) throws IOException { this.scorer = scorer; super.setScorer(scorer); } public boolean acceptsDocsOutOfOrder() { return true; } } private static final ScoreDoc[] EMPTY_SCOREDOCS = new ScoreDoc[0]; private final boolean fillFields; /* * Stores the maximum score value encountered, needed for normalizing. If * document scores are not tracked, this value is initialized to NaN. */ float maxScore = Float.NaN; final int numHits; FieldValueHitQueue.Entry bottom = null; boolean queueFull; int docBase; // Declaring the constructor private prevents extending this class by anyone // else. Note that the class cannot be final since it's extended by the // internal versions. If someone will define a constructor with any other // visibility, then anyone will be able to extend the class, which is not what // we want. private TopFieldCollector(PriorityQueue pq, int numHits, boolean fillFields) { super(pq); this.numHits = numHits; this.fillFields = fillFields; } /** * Creates a new {@link TopFieldCollector} from the given * arguments. * *
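 * <p>Illustrative usage sketch (not part of the original source; assumes an
 * <code>IndexSearcher</code> named <code>searcher</code> and a
 * <code>Query</code> named <code>query</code>):</p>
 * <pre>
 *   Sort sort = new Sort(new SortField("date", SortField.LONG, true));
 *   TopFieldCollector collector = TopFieldCollector.create(
 *       sort, 10, true, false, false, true);
 *   searcher.search(query, collector);
 *   TopDocs hits = collector.topDocs();
 * </pre>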

    NOTE: The instances returned by this method * pre-allocate a full array of length * numHits. * * @param sort * the sort criteria (SortFields). * @param numHits * the number of results to collect. * @param fillFields * specifies whether the actual field values should be returned on * the results (FieldDoc). * @param trackDocScores * specifies whether document scores should be tracked and set on the * results. Note that if set to false, then the results' scores will * be set to Float.NaN. Setting this to true affects performance, as * it incurs the score computation on each competitive result. * Therefore if document scores are not required by the application, * it is recommended to set it to false. * @param trackMaxScore * specifies whether the query's maxScore should be tracked and set * on the resulting {@link TopDocs}. Note that if set to false, * {@link TopDocs#getMaxScore()} returns Float.NaN. Setting this to * true affects performance as it incurs the score computation on * each result. Also, setting this true automatically sets * trackDocScores to true as well. * @param docsScoredInOrder * specifies whether documents are scored in doc Id order or not by * the given {@link Scorer} in {@link #setScorer(Scorer)}. * @return a {@link TopFieldCollector} instance which will sort the results by * the sort criteria. * @throws IOException */ public static TopFieldCollector create(Sort sort, int numHits, boolean fillFields, boolean trackDocScores, boolean trackMaxScore, boolean docsScoredInOrder) throws IOException { if (sort.fields.length == 0) { throw new IllegalArgumentException("Sort must contain at least one field"); } FieldValueHitQueue queue = FieldValueHitQueue.create(sort.fields, numHits); if (queue.getComparators().length == 1) { if (docsScoredInOrder) { if (trackMaxScore) { return new OneComparatorScoringMaxScoreCollector(queue, numHits, fillFields); } else if (trackDocScores) { return new OneComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields); } else { return new OneComparatorNonScoringCollector(queue, numHits, fillFields); } } else { if (trackMaxScore) { return new OutOfOrderOneComparatorScoringMaxScoreCollector(queue, numHits, fillFields); } else if (trackDocScores) { return new OutOfOrderOneComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields); } else { return new OutOfOrderOneComparatorNonScoringCollector(queue, numHits, fillFields); } } } // multiple comparators. if (docsScoredInOrder) { if (trackMaxScore) { return new MultiComparatorScoringMaxScoreCollector(queue, numHits, fillFields); } else if (trackDocScores) { return new MultiComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields); } else { return new MultiComparatorNonScoringCollector(queue, numHits, fillFields); } } else { if (trackMaxScore) { return new OutOfOrderMultiComparatorScoringMaxScoreCollector(queue, numHits, fillFields); } else if (trackDocScores) { return new OutOfOrderMultiComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields); } else { return new OutOfOrderMultiComparatorNonScoringCollector(queue, numHits, fillFields); } } } final void add(int slot, int doc, float score) { bottom = (Entry) pq.add(new Entry(slot, docBase + doc, score)); queueFull = totalHits == numHits; } /* * Only the following callback methods need to be overridden since * topDocs(int, int) calls them to return the results. */ protected void populateResults(ScoreDoc[] results, int howMany) { if (fillFields) { // avoid casting if unnecessary. 
FieldValueHitQueue queue = (FieldValueHitQueue) pq; for (int i = howMany - 1; i >= 0; i--) { results[i] = queue.fillFields((Entry) queue.pop()); } } else { for (int i = howMany - 1; i >= 0; i--) { Entry entry = (Entry) pq.pop(); results[i] = new FieldDoc(entry.docID, entry.score); } } } protected TopDocs newTopDocs(ScoreDoc[] results, int start) { if (results == null) { results = EMPTY_SCOREDOCS; // Set maxScore to NaN, in case this is a maxScore tracking collector. maxScore = Float.NaN; } // If this is a maxScoring tracking collector and there were no results, return new TopFieldDocs(totalHits, results, ((FieldValueHitQueue) pq).getFields(), maxScore); } public boolean acceptsDocsOutOfOrder() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/Filter.java0000644000175000017500000000572411474320224024012 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.BitSet; import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.DocIdBitSet; /** * Abstract base class for restricting which documents may * be returned during searching. *

    * Note: In Lucene 3.0 {@link #bits(IndexReader)} will be removed * and {@link #getDocIdSet(IndexReader)} will be defined as abstract. * All implementing classes must therefore implement {@link #getDocIdSet(IndexReader)} * in order to work with Lucene 3.0. */ public abstract class Filter implements java.io.Serializable { /** * @return A BitSet with true for documents which should be permitted in * search results, and false for those that should not. * *

    NOTE: See {@link #getDocIdSet(IndexReader)} for * handling of multi-segment indexes (which applies to * this method as well). * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { throw new UnsupportedOperationException(); } /** * Creates a {@link DocIdSet} enumerating the documents that should be * permitted in search results. NOTE: null can be * returned if no documents are accepted by this Filter. *
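 * <p>Illustrative sketch of an implementation (not part of the original
 * source; <code>HasTermFilter</code> is a made-up example class):</p>
 * <pre>
 *   public class HasTermFilter extends Filter {
 *     private final Term term;
 *     public HasTermFilter(Term term) { this.term = term; }
 *     public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
 *       OpenBitSet bits = new OpenBitSet(reader.maxDoc());
 *       TermDocs td = reader.termDocs(term);
 *       try {
 *         while (td.next()) {
 *           bits.set(td.doc());
 *         }
 *       } finally {
 *         td.close();
 *       }
 *       return bits;    // OpenBitSet is itself a DocIdSet
 *     }
 *   }
 * </pre>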

    * Note: This method will be called once per segment in * the index during searching. The returned {@link DocIdSet} * must refer to document IDs for that segment, not for * the top-level reader. * * @param reader a {@link IndexReader} instance opened on the index currently * searched on. Note, it is likely that the provided reader does not * represent the whole underlying index i.e. if the index has more than * one segment the given reader only represents a single segment. * * @return a DocIdSet that provides the documents which should be permitted or * prohibited in search results. NOTE: null can be returned if * no documents will be accepted by this Filter. * * @see DocIdBitSet */ public DocIdSet getDocIdSet(IndexReader reader) throws IOException { return new DocIdBitSet(bits(reader)); } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldCacheTermsFilter.java0000644000175000017500000001363611474320224026716 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.index.TermDocs; // for javadocs /** * A {@link Filter} that only accepts documents whose single * term value in the specified field is contained in the * provided set of allowed terms. * *

    * * This is the same functionality as TermsFilter (from * contrib/queries), except this filter requires that the * field contains only a single term for all documents. * Because of drastically different implementations, they * also have different performance characteristics, as * described below. * *

    * * The first invocation of this filter on a given field will * be slower, since a {@link FieldCache.StringIndex} must be * created. Subsequent invocations using the same field * will re-use this cache. However, as with all * functionality based on {@link FieldCache}, persistent RAM * is consumed to hold the cache, and is not freed until the * {@link IndexReader} is closed. In contrast, TermsFilter * has no persistent RAM consumption. * * *

    * * With each search, this filter translates the specified * set of Terms into a private {@link OpenBitSet} keyed by * term number per unique {@link IndexReader} (normally one * reader per segment). Then, during matching, the term * number for each docID is retrieved from the cache and * then checked for inclusion using the {@link OpenBitSet}. * Since all testing is done using RAM resident data * structures, performance should be very fast, most likely * fast enough to not require further caching of the * DocIdSet for each possible combination of terms. * However, because docIDs are simply scanned linearly, an * index with a great many small documents may find this * linear scan too costly. * *

    * * In contrast, TermsFilter builds up an {@link OpenBitSet}, * keyed by docID, every time it's created, by enumerating * through all matching docs using {@link TermDocs} to seek * and scan through each term's docID list. While there is * no linear scan of all docIDs, besides the allocation of * the underlying array in the {@link OpenBitSet}, this * approach requires a number of "disk seeks" in proportion * to the number of terms, which can be exceptionally costly * when there are cache misses in the OS's IO cache. * *

    * * Generally, this filter will be slower on the first * invocation for a given field, but subsequent invocations, * even if you change the allowed set of Terms, should be * faster than TermsFilter, especially as the number of * Terms being matched increases. If you are matching only * a very small number of terms, and those terms in turn * match a very small number of documents, TermsFilter may * perform faster. * *
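 * <p>Illustrative usage sketch (not part of the original source; the field
 * name and values are made up):</p>
 * <pre>
 *   Filter countryFilter = new FieldCacheTermsFilter("country", new String[] {"NL", "DE"});
 *   TopDocs hits = searcher.search(query, countryFilter, 10);
 * </pre>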

    * * Which filter is best is very application dependent. */ public class FieldCacheTermsFilter extends Filter { private String field; private String[] terms; public FieldCacheTermsFilter(String field, String[] terms) { this.field = field; this.terms = terms; } public FieldCache getFieldCache() { return FieldCache.DEFAULT; } public DocIdSet getDocIdSet(IndexReader reader) throws IOException { return new FieldCacheTermsFilterDocIdSet(getFieldCache().getStringIndex(reader, field)); } protected class FieldCacheTermsFilterDocIdSet extends DocIdSet { private FieldCache.StringIndex fcsi; private OpenBitSet openBitSet; public FieldCacheTermsFilterDocIdSet(FieldCache.StringIndex fcsi) { this.fcsi = fcsi; openBitSet = new OpenBitSet(this.fcsi.lookup.length); for (int i=0;i 0) { openBitSet.fastSet(termNumber); } } } public DocIdSetIterator iterator() { return new FieldCacheTermsFilterDocIdSetIterator(); } /** This DocIdSet implementation is cacheable. */ public boolean isCacheable() { return true; } protected class FieldCacheTermsFilterDocIdSetIterator extends DocIdSetIterator { private int doc = -1; /** @deprecated use {@link #docID()} instead. */ public int doc() { return doc; } public int docID() { return doc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() { try { while (!openBitSet.fastGet(fcsi.order[++doc])) {} } catch (ArrayIndexOutOfBoundsException e) { doc = NO_MORE_DOCS; } return doc; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) { return advance(target) != NO_MORE_DOCS; } public int advance(int target) { try { doc = target; while (!openBitSet.fastGet(fcsi.order[doc])) { doc++; } } catch (ArrayIndexOutOfBoundsException e) { doc = NO_MORE_DOCS; } return doc; } } } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldComparator.java0000644000175000017500000006222211474320224025634 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.text.Collator; import java.util.Locale; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache.DoubleParser; import org.apache.lucene.search.FieldCache.LongParser; import org.apache.lucene.search.FieldCache.ByteParser; import org.apache.lucene.search.FieldCache.FloatParser; import org.apache.lucene.search.FieldCache.IntParser; import org.apache.lucene.search.FieldCache.ShortParser; import org.apache.lucene.search.FieldCache.StringIndex; /** * Expert: a FieldComparator compares hits so as to determine their * sort order when collecting the top results with {@link * TopFieldCollector}. The concrete public FieldComparator * classes here correspond to the SortField types. * *

    This API is designed to achieve high performance * sorting, by exposing a tight interaction with {@link * FieldValueHitQueue} as it visits hits. Whenever a hit is * competitive, it's enrolled into a virtual slot, which is * an int ranging from 0 to numHits-1. The {@link * FieldComparator} is made aware of segment transitions * during searching in case any internal state it's tracking * needs to be recomputed during these transitions.

    * *

    A comparator must define these functions:

    * *
      * *
 *  • {@link #compare} Compare a hit at 'slot a'
 *    with hit 'slot b'.
 *
 *  • {@link #setBottom} This method is called by
 *    {@link FieldValueHitQueue} to notify the
 *    FieldComparator of the current weakest ("bottom")
 *    slot.  Note that this slot may not hold the weakest
 *    value according to your comparator, in cases where
 *    your comparator is not the primary one (ie, is only
 *    used to break ties from the comparators before it).
 *
 *  • {@link #compareBottom} Compare a new hit (docID)
 *    against the "weakest" (bottom) entry in the queue.
 *
 *  • {@link #copy} Installs a new hit into the
 *    priority queue.  The {@link FieldValueHitQueue}
 *    calls this method when a new hit is competitive.
 *
 *  • {@link #setNextReader} Invoked
 *    when the search is switching to the next segment.
 *    You may need to update internal state of the
 *    comparator, for example retrieving new values from
 *    the {@link FieldCache}.
 *
 *  • {@link #value} Return the sort value stored in
 *    the specified slot.  This is only called at the end
 *    of the search, in order to populate {@link
 *    FieldDoc#fields} when returning the top results.
 *
    * * NOTE: This API is experimental and might change in * incompatible ways in the next release. */ public abstract class FieldComparator { /** Parses field's values as byte (using {@link * FieldCache#getBytes} and sorts by ascending value */ public static final class ByteComparator extends FieldComparator { private final byte[] values; private byte[] currentReaderValues; private final String field; private ByteParser parser; private byte bottom; ByteComparator(int numHits, String field, FieldCache.Parser parser) { values = new byte[numHits]; this.field = field; this.parser = (ByteParser) parser; } public int compare(int slot1, int slot2) { return values[slot1] - values[slot2]; } public int compareBottom(int doc) { return bottom - currentReaderValues[doc]; } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getBytes(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Byte(values[slot]); } } /** Sorts by ascending docID */ public static final class DocComparator extends FieldComparator { private final int[] docIDs; private int docBase; private int bottom; DocComparator(int numHits) { docIDs = new int[numHits]; } public int compare(int slot1, int slot2) { // No overflow risk because docIDs are non-negative return docIDs[slot1] - docIDs[slot2]; } public int compareBottom(int doc) { // No overflow risk because docIDs are non-negative return bottom - (docBase + doc); } public void copy(int slot, int doc) { docIDs[slot] = docBase + doc; } public void setNextReader(IndexReader reader, int docBase) { // TODO: can we "map" our docIDs to the current // reader? 
saves having to then subtract on every // compare call this.docBase = docBase; } public void setBottom(final int bottom) { this.bottom = docIDs[bottom]; } public Comparable value(int slot) { return new Integer(docIDs[slot]); } } /** Parses field's values as double (using {@link * FieldCache#getDoubles} and sorts by ascending value */ public static final class DoubleComparator extends FieldComparator { private final double[] values; private double[] currentReaderValues; private final String field; private DoubleParser parser; private double bottom; DoubleComparator(int numHits, String field, FieldCache.Parser parser) { values = new double[numHits]; this.field = field; this.parser = (DoubleParser) parser; } public int compare(int slot1, int slot2) { final double v1 = values[slot1]; final double v2 = values[slot2]; if (v1 > v2) { return 1; } else if (v1 < v2) { return -1; } else { return 0; } } public int compareBottom(int doc) { final double v2 = currentReaderValues[doc]; if (bottom > v2) { return 1; } else if (bottom < v2) { return -1; } else { return 0; } } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getDoubles(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Double(values[slot]); } } /** Parses field's values as float (using {@link * FieldCache#getFloats} and sorts by ascending value */ public static final class FloatComparator extends FieldComparator { private final float[] values; private float[] currentReaderValues; private final String field; private FloatParser parser; private float bottom; FloatComparator(int numHits, String field, FieldCache.Parser parser) { values = new float[numHits]; this.field = field; this.parser = (FloatParser) parser; } public int compare(int slot1, int slot2) { // TODO: are there sneaky non-branch ways to compute // sign of float? final float v1 = values[slot1]; final float v2 = values[slot2]; if (v1 > v2) { return 1; } else if (v1 < v2) { return -1; } else { return 0; } } public int compareBottom(int doc) { // TODO: are there sneaky non-branch ways to compute // sign of float? 
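      // Illustrative note (not part of the original source): unlike the
      // byte/short comparators, a plain subtraction cannot be used for floats;
      // an (int) cast of the difference could collapse small non-zero
      // differences to 0, so explicit comparisons are used instead.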
final float v2 = currentReaderValues[doc]; if (bottom > v2) { return 1; } else if (bottom < v2) { return -1; } else { return 0; } } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getFloats(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Float(values[slot]); } } /** Parses field's values as int (using {@link * FieldCache#getInts} and sorts by ascending value */ public static final class IntComparator extends FieldComparator { private final int[] values; private int[] currentReaderValues; private final String field; private IntParser parser; private int bottom; // Value of bottom of queue IntComparator(int numHits, String field, FieldCache.Parser parser) { values = new int[numHits]; this.field = field; this.parser = (IntParser) parser; } public int compare(int slot1, int slot2) { // TODO: there are sneaky non-branch ways to compute // -1/+1/0 sign // Cannot return values[slot1] - values[slot2] because that // may overflow final int v1 = values[slot1]; final int v2 = values[slot2]; if (v1 > v2) { return 1; } else if (v1 < v2) { return -1; } else { return 0; } } public int compareBottom(int doc) { // TODO: there are sneaky non-branch ways to compute // -1/+1/0 sign // Cannot return bottom - values[slot2] because that // may overflow final int v2 = currentReaderValues[doc]; if (bottom > v2) { return 1; } else if (bottom < v2) { return -1; } else { return 0; } } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getInts(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Integer(values[slot]); } } /** Parses field's values as long (using {@link * FieldCache#getLongs} and sorts by ascending value */ public static final class LongComparator extends FieldComparator { private final long[] values; private long[] currentReaderValues; private final String field; private LongParser parser; private long bottom; LongComparator(int numHits, String field, FieldCache.Parser parser) { values = new long[numHits]; this.field = field; this.parser = (LongParser) parser; } public int compare(int slot1, int slot2) { // TODO: there are sneaky non-branch ways to compute // -1/+1/0 sign final long v1 = values[slot1]; final long v2 = values[slot2]; if (v1 > v2) { return 1; } else if (v1 < v2) { return -1; } else { return 0; } } public int compareBottom(int doc) { // TODO: there are sneaky non-branch ways to compute // -1/+1/0 sign final long v2 = currentReaderValues[doc]; if (bottom > v2) { return 1; } else if (bottom < v2) { return -1; } else { return 0; } } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getLongs(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Long(values[slot]); } } /** Sorts by descending relevance. 
NOTE: if you are * sorting only by descending relevance and then * secondarily by ascending docID, performance is faster * using {@link TopScoreDocCollector} directly (which {@link * IndexSearcher#search} uses when no {@link Sort} is * specified). */ public static final class RelevanceComparator extends FieldComparator { private final float[] scores; private float bottom; private Scorer scorer; RelevanceComparator(int numHits) { scores = new float[numHits]; } public int compare(int slot1, int slot2) { final float score1 = scores[slot1]; final float score2 = scores[slot2]; return score1 > score2 ? -1 : (score1 < score2 ? 1 : 0); } public int compareBottom(int doc) throws IOException { float score = scorer.score(); return bottom > score ? -1 : (bottom < score ? 1 : 0); } public void copy(int slot, int doc) throws IOException { scores[slot] = scorer.score(); } public void setNextReader(IndexReader reader, int docBase) { } public void setBottom(final int bottom) { this.bottom = scores[bottom]; } public void setScorer(Scorer scorer) { // wrap with a ScoreCachingWrappingScorer so that successive calls to // score() will not incur score computation over and over again. this.scorer = new ScoreCachingWrappingScorer(scorer); } public Comparable value(int slot) { return new Float(scores[slot]); } } /** Parses field's values as short (using {@link * FieldCache#getShorts} and sorts by ascending value */ public static final class ShortComparator extends FieldComparator { private final short[] values; private short[] currentReaderValues; private final String field; private ShortParser parser; private short bottom; ShortComparator(int numHits, String field, FieldCache.Parser parser) { values = new short[numHits]; this.field = field; this.parser = (ShortParser) parser; } public int compare(int slot1, int slot2) { return values[slot1] - values[slot2]; } public int compareBottom(int doc) { return bottom - currentReaderValues[doc]; } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getShorts(reader, field, parser); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return new Short(values[slot]); } } /** Sorts by a field's value using the Collator for a * given Locale.*/ public static final class StringComparatorLocale extends FieldComparator { private final String[] values; private String[] currentReaderValues; private final String field; final Collator collator; private String bottom; StringComparatorLocale(int numHits, String field, Locale locale) { values = new String[numHits]; this.field = field; collator = Collator.getInstance(locale); } public int compare(int slot1, int slot2) { final String val1 = values[slot1]; final String val2 = values[slot2]; if (val1 == null) { if (val2 == null) { return 0; } return -1; } else if (val2 == null) { return 1; } return collator.compare(val1, val2); } public int compareBottom(int doc) { final String val2 = currentReaderValues[doc]; if (bottom == null) { if (val2 == null) { return 0; } return -1; } else if (val2 == null) { return 1; } return collator.compare(bottom, val2); } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getStrings(reader, field); } public void setBottom(final int bottom) { this.bottom = 
values[bottom]; } public Comparable value(int slot) { return values[slot]; } } /** Sorts by field's natural String sort order, using * ordinals. This is functionally equivalent to {@link * StringValComparator}, but it first resolves the string * to their relative ordinal positions (using the index * returned by {@link FieldCache#getStringIndex}), and * does most comparisons using the ordinals. For medium * to large results, this comparator will be much faster * than {@link StringValComparator}. For very small * result sets it may be slower. */ public static final class StringOrdValComparator extends FieldComparator { private final int[] ords; private final String[] values; private final int[] readerGen; private int currentReaderGen = -1; private String[] lookup; private int[] order; private final String field; private int bottomSlot = -1; private int bottomOrd; private String bottomValue; private final boolean reversed; private final int sortPos; public StringOrdValComparator(int numHits, String field, int sortPos, boolean reversed) { ords = new int[numHits]; values = new String[numHits]; readerGen = new int[numHits]; this.sortPos = sortPos; this.reversed = reversed; this.field = field; } public int compare(int slot1, int slot2) { if (readerGen[slot1] == readerGen[slot2]) { int cmp = ords[slot1] - ords[slot2]; if (cmp != 0) { return cmp; } } final String val1 = values[slot1]; final String val2 = values[slot2]; if (val1 == null) { if (val2 == null) { return 0; } return -1; } else if (val2 == null) { return 1; } return val1.compareTo(val2); } public int compareBottom(int doc) { assert bottomSlot != -1; int order = this.order[doc]; final int cmp = bottomOrd - order; if (cmp != 0) { return cmp; } final String val2 = lookup[order]; if (bottomValue == null) { if (val2 == null) { return 0; } // bottom wins return -1; } else if (val2 == null) { // doc wins return 1; } return bottomValue.compareTo(val2); } private void convert(int slot) { readerGen[slot] = currentReaderGen; int index = 0; String value = values[slot]; if (value == null) { ords[slot] = 0; return; } if (sortPos == 0 && bottomSlot != -1 && bottomSlot != slot) { // Since we are the primary sort, the entries in the // queue are bounded by bottomOrd: assert bottomOrd < lookup.length; if (reversed) { index = binarySearch(lookup, value, bottomOrd, lookup.length-1); } else { index = binarySearch(lookup, value, 0, bottomOrd); } } else { // Full binary search index = binarySearch(lookup, value); } if (index < 0) { index = -index - 2; } ords[slot] = index; } public void copy(int slot, int doc) { final int ord = order[doc]; ords[slot] = ord; assert ord >= 0; values[slot] = lookup[ord]; readerGen[slot] = currentReaderGen; } public void setNextReader(IndexReader reader, int docBase) throws IOException { StringIndex currentReaderValues = FieldCache.DEFAULT.getStringIndex(reader, field); currentReaderGen++; order = currentReaderValues.order; lookup = currentReaderValues.lookup; assert lookup.length > 0; if (bottomSlot != -1) { convert(bottomSlot); bottomOrd = ords[bottomSlot]; } } public void setBottom(final int bottom) { bottomSlot = bottom; if (readerGen[bottom] != currentReaderGen) { convert(bottomSlot); } bottomOrd = ords[bottom]; assert bottomOrd >= 0; assert bottomOrd < lookup.length; bottomValue = values[bottom]; } public Comparable value(int slot) { return values[slot]; } public String[] getValues() { return values; } public int getBottomSlot() { return bottomSlot; } public String getField() { return field; } } /** Sorts by field's natural 
String sort order. All * comparisons are done using String.compareTo, which is * slow for medium to large result sets but possibly * very fast for very small results sets. */ public static final class StringValComparator extends FieldComparator { private String[] values; private String[] currentReaderValues; private final String field; private String bottom; StringValComparator(int numHits, String field) { values = new String[numHits]; this.field = field; } public int compare(int slot1, int slot2) { final String val1 = values[slot1]; final String val2 = values[slot2]; if (val1 == null) { if (val2 == null) { return 0; } return -1; } else if (val2 == null) { return 1; } return val1.compareTo(val2); } public int compareBottom(int doc) { final String val2 = currentReaderValues[doc]; if (bottom == null) { if (val2 == null) { return 0; } return -1; } else if (val2 == null) { return 1; } return bottom.compareTo(val2); } public void copy(int slot, int doc) { values[slot] = currentReaderValues[doc]; } public void setNextReader(IndexReader reader, int docBase) throws IOException { currentReaderValues = FieldCache.DEFAULT.getStrings(reader, field); } public void setBottom(final int bottom) { this.bottom = values[bottom]; } public Comparable value(int slot) { return values[slot]; } } final protected static int binarySearch(String[] a, String key) { return binarySearch(a, key, 0, a.length-1); } final protected static int binarySearch(String[] a, String key, int low, int high) { while (low <= high) { int mid = (low + high) >>> 1; String midVal = a[mid]; int cmp; if (midVal != null) { cmp = midVal.compareTo(key); } else { cmp = -1; } if (cmp < 0) low = mid + 1; else if (cmp > 0) high = mid - 1; else return mid; } return -(low + 1); } /** * Compare hit at slot1 with hit at slot2. * * @param slot1 first slot to compare * @param slot2 second slot to compare * @return any N < 0 if slot2's value is sorted after * slot1, any N > 0 if the slot2's value is sorted before * slot1 and 0 if they are equal */ public abstract int compare(int slot1, int slot2); /** * Set the bottom slot, ie the "weakest" (sorted last) * entry in the queue. When {@link #compareBottom} is * called, you should compare against this slot. This * will always be called before {@link #compareBottom}. * * @param slot the currently weakest (sorted last) slot in the queue */ public abstract void setBottom(final int slot); /** * Compare the bottom of the queue with doc. This will * only invoked after setBottom has been called. This * should return the same result as {@link * #compare(int,int)}} as if bottom were slot1 and the new * document were slot 2. * *

* For a search that hits many results, this method * will be the hotspot (invoked by far the most * frequently).
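// Illustrative sketch, not part of the Lucene sources: a minimal custom
// FieldComparator that sorts by the length of a string field, ascending.
// It shows the slot/bottom contract described above: copy() stores per-slot
// state, setBottom() remembers the weakest queued entry, and compareBottom()
// measures new hits against it. The class name and the choice of
// FieldCache.getStrings are assumptions made for this example only; the types
// used (IndexReader, FieldCache, IOException) are already imported by this file.
public class StringLengthComparator extends FieldComparator {
  private final int[] lengths;           // per-slot state for hits in the queue
  private String[] currentReaderValues;  // field values of the current segment
  private final String field;
  private int bottom;                    // length held by the bottom (weakest) slot

  public StringLengthComparator(int numHits, String field) {
    this.lengths = new int[numHits];
    this.field = field;
  }

  private static int lengthOf(String s) {
    return s == null ? 0 : s.length();
  }

  public int compare(int slot1, int slot2) {
    // both values are non-negative, so plain subtraction cannot overflow
    return lengths[slot1] - lengths[slot2];
  }

  public int compareBottom(int doc) {
    // compare the weakest queued hit against the candidate document
    return bottom - lengthOf(currentReaderValues[doc]);
  }

  public void copy(int slot, int doc) {
    lengths[slot] = lengthOf(currentReaderValues[doc]);
  }

  public void setNextReader(IndexReader reader, int docBase) throws IOException {
    currentReaderValues = FieldCache.DEFAULT.getStrings(reader, field);
  }

  public void setBottom(final int slot) {
    bottom = lengths[slot];
  }

  public Comparable value(int slot) {
    return new Integer(lengths[slot]);
  }
}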

    * * @param doc that was hit * @return any N < 0 if the doc's value is sorted after * the bottom entry (not competitive), any N > 0 if the * doc's value is sorted before the bottom entry and 0 if * they are equal. */ public abstract int compareBottom(int doc) throws IOException; /** * This method is called when a new hit is competitive. * You should copy any state associated with this document * that will be required for future comparisons, into the * specified slot. * * @param slot which slot to copy the hit to * @param doc docID relative to current reader */ public abstract void copy(int slot, int doc) throws IOException; /** * Set a new Reader. All doc correspond to the current Reader. * * @param reader current reader * @param docBase docBase of this reader * @throws IOException * @throws IOException */ public abstract void setNextReader(IndexReader reader, int docBase) throws IOException; /** Sets the Scorer to use in case a document's score is * needed. * * @param scorer Scorer instance that you should use to * obtain the current hit's score, if necessary. */ public void setScorer(Scorer scorer) { // Empty implementation since most comparators don't need the score. This // can be overridden by those that need it. } /** * Return the actual value in the slot. * * @param slot the value * @return value in this slot upgraded to Comparable */ public abstract Comparable value(int slot); } lucene-2.9.4/src/java/org/apache/lucene/search/FieldCache.java0000644000175000017500000005775511474320224024547 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.RamUsageEstimator; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import java.io.IOException; import java.io.Serializable; import java.io.PrintStream; import java.text.DecimalFormat; /** * Expert: Maintains caches of term values. * *

    Created: May 19, 2004 11:13:14 AM * * @since lucene 1.4 * @version $Id: FieldCache.java 950451 2010-06-02 09:33:57Z mikemccand $ * @see org.apache.lucene.util.FieldCacheSanityChecker */ public interface FieldCache { public static final class CreationPlaceholder { Object value; } /** Indicator for StringIndex values in the cache. */ // NOTE: the value assigned to this constant must not be // the same as any of those in SortField!! public static final int STRING_INDEX = -1; /** Expert: Stores term text values and document ordering data. */ public static class StringIndex { public int binarySearchLookup(String key) { // this special case is the reason that Arrays.binarySearch() isn't useful. if (key == null) return 0; int low = 1; int high = lookup.length-1; while (low <= high) { int mid = (low + high) >>> 1; int cmp = lookup[mid].compareTo(key); if (cmp < 0) low = mid + 1; else if (cmp > 0) high = mid - 1; else return mid; // key found } return -(low + 1); // key not found. } /** All the term values, in natural order. */ public final String[] lookup; /** For each document, an index into the lookup array. */ public final int[] order; /** Creates one of these objects */ public StringIndex (int[] values, String[] lookup) { this.order = values; this.lookup = lookup; } } /** * Marker interface as super-interface to all parsers. It * is used to specify a custom parser to {@link * SortField#SortField(String, FieldCache.Parser)}. */ public interface Parser extends Serializable { } /** Interface to parse bytes from document fields. * @see FieldCache#getBytes(IndexReader, String, FieldCache.ByteParser) */ public interface ByteParser extends Parser { /** Return a single Byte representation of this field's value. */ public byte parseByte(String string); } /** Interface to parse shorts from document fields. * @see FieldCache#getShorts(IndexReader, String, FieldCache.ShortParser) */ public interface ShortParser extends Parser { /** Return a short representation of this field's value. */ public short parseShort(String string); } /** Interface to parse ints from document fields. * @see FieldCache#getInts(IndexReader, String, FieldCache.IntParser) */ public interface IntParser extends Parser { /** Return an integer representation of this field's value. */ public int parseInt(String string); } /** Interface to parse floats from document fields. * @see FieldCache#getFloats(IndexReader, String, FieldCache.FloatParser) */ public interface FloatParser extends Parser { /** Return an float representation of this field's value. */ public float parseFloat(String string); } /** Interface to parse long from document fields. * @see FieldCache#getLongs(IndexReader, String, FieldCache.LongParser) */ public interface LongParser extends Parser { /** Return an long representation of this field's value. */ public long parseLong(String string); } /** Interface to parse doubles from document fields. * @see FieldCache#getDoubles(IndexReader, String, FieldCache.DoubleParser) */ public interface DoubleParser extends Parser { /** Return an long representation of this field's value. */ public double parseDouble(String string); } /** Expert: The cache used internally by sorting and range query classes. 
*/ public static FieldCache DEFAULT = new FieldCacheImpl(); /** The default parser for byte values, which are encoded by {@link Byte#toString(byte)} */ public static final ByteParser DEFAULT_BYTE_PARSER = new ByteParser() { public byte parseByte(String value) { return Byte.parseByte(value); } protected Object readResolve() { return DEFAULT_BYTE_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_BYTE_PARSER"; } }; /** The default parser for short values, which are encoded by {@link Short#toString(short)} */ public static final ShortParser DEFAULT_SHORT_PARSER = new ShortParser() { public short parseShort(String value) { return Short.parseShort(value); } protected Object readResolve() { return DEFAULT_SHORT_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_SHORT_PARSER"; } }; /** The default parser for int values, which are encoded by {@link Integer#toString(int)} */ public static final IntParser DEFAULT_INT_PARSER = new IntParser() { public int parseInt(String value) { return Integer.parseInt(value); } protected Object readResolve() { return DEFAULT_INT_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_INT_PARSER"; } }; /** The default parser for float values, which are encoded by {@link Float#toString(float)} */ public static final FloatParser DEFAULT_FLOAT_PARSER = new FloatParser() { public float parseFloat(String value) { return Float.parseFloat(value); } protected Object readResolve() { return DEFAULT_FLOAT_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_FLOAT_PARSER"; } }; /** The default parser for long values, which are encoded by {@link Long#toString(long)} */ public static final LongParser DEFAULT_LONG_PARSER = new LongParser() { public long parseLong(String value) { return Long.parseLong(value); } protected Object readResolve() { return DEFAULT_LONG_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_LONG_PARSER"; } }; /** The default parser for double values, which are encoded by {@link Double#toString(double)} */ public static final DoubleParser DEFAULT_DOUBLE_PARSER = new DoubleParser() { public double parseDouble(String value) { return Double.parseDouble(value); } protected Object readResolve() { return DEFAULT_DOUBLE_PARSER; } public String toString() { return FieldCache.class.getName()+".DEFAULT_DOUBLE_PARSER"; } }; /** * A parser instance for int values encoded by {@link NumericUtils#intToPrefixCoded(int)}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final IntParser NUMERIC_UTILS_INT_PARSER=new IntParser(){ public int parseInt(String val) { final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; if (shift>0 && shift<=31) throw new FieldCacheImpl.StopFillCacheException(); return NumericUtils.prefixCodedToInt(val); } protected Object readResolve() { return NUMERIC_UTILS_INT_PARSER; } public String toString() { return FieldCache.class.getName()+".NUMERIC_UTILS_INT_PARSER"; } }; /** * A parser instance for float values encoded with {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. 
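// Illustrative sketch, not part of the Lucene sources: a custom IntParser can
// be supplied when a field's terms are not encoded with Integer.toString. The
// "id-00042" term layout and the field name "id" are assumptions made for the
// example; only FieldCache.IntParser, FieldCache.getInts and
// SortField(String, FieldCache.Parser) come from the API shown in this file.
FieldCache.IntParser prefixedIdParser = new FieldCache.IntParser() {
  public int parseInt(String value) {
    // strip the hypothetical "id-" prefix before parsing the numeric part
    return Integer.parseInt(value.substring(3));
  }
};
int[] ids = FieldCache.DEFAULT.getInts(reader, "id", prefixedIdParser);
// The same parser could drive sorting: new SortField("id", prefixedIdParser).
// For serialized/remote sorts a named parser class with readResolve(), like
// the DEFAULT_*_PARSER constants above, is preferable to an anonymous class.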
*/ public static final FloatParser NUMERIC_UTILS_FLOAT_PARSER=new FloatParser(){ public float parseFloat(String val) { final int shift = val.charAt(0)-NumericUtils.SHIFT_START_INT; if (shift>0 && shift<=31) throw new FieldCacheImpl.StopFillCacheException(); return NumericUtils.sortableIntToFloat(NumericUtils.prefixCodedToInt(val)); } protected Object readResolve() { return NUMERIC_UTILS_FLOAT_PARSER; } public String toString() { return FieldCache.class.getName()+".NUMERIC_UTILS_FLOAT_PARSER"; } }; /** * A parser instance for long values encoded by {@link NumericUtils#longToPrefixCoded(long)}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final LongParser NUMERIC_UTILS_LONG_PARSER = new LongParser(){ public long parseLong(String val) { final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; if (shift>0 && shift<=63) throw new FieldCacheImpl.StopFillCacheException(); return NumericUtils.prefixCodedToLong(val); } protected Object readResolve() { return NUMERIC_UTILS_LONG_PARSER; } public String toString() { return FieldCache.class.getName()+".NUMERIC_UTILS_LONG_PARSER"; } }; /** * A parser instance for double values encoded with {@link NumericUtils}, e.g. when indexed * via {@link NumericField}/{@link NumericTokenStream}. */ public static final DoubleParser NUMERIC_UTILS_DOUBLE_PARSER = new DoubleParser(){ public double parseDouble(String val) { final int shift = val.charAt(0)-NumericUtils.SHIFT_START_LONG; if (shift>0 && shift<=63) throw new FieldCacheImpl.StopFillCacheException(); return NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(val)); } protected Object readResolve() { return NUMERIC_UTILS_DOUBLE_PARSER; } public String toString() { return FieldCache.class.getName()+".NUMERIC_UTILS_DOUBLE_PARSER"; } }; /** Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as a single byte and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the single byte values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public byte[] getBytes (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field as bytes and returns an array of * size reader.maxDoc() of the value each document has in the * given field. * @param reader Used to get field values. * @param field Which field contains the bytes. * @param parser Computes byte for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public byte[] getBytes (IndexReader reader, String field, ByteParser parser) throws IOException; /** Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as shorts and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the shorts. * @return The values in the given field for each document. * @throws IOException If any error occurs. 
*/ public short[] getShorts (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field as shorts and returns an array of * size reader.maxDoc() of the value each document has in the * given field. * @param reader Used to get field values. * @param field Which field contains the shorts. * @param parser Computes short for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public short[] getShorts (IndexReader reader, String field, ShortParser parser) throws IOException; /** Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as integers and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the integers. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public int[] getInts (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field as integers and returns an array of * size reader.maxDoc() of the value each document has in the * given field. * @param reader Used to get field values. * @param field Which field contains the integers. * @param parser Computes integer for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public int[] getInts (IndexReader reader, String field, IntParser parser) throws IOException; /** Checks the internal cache for an appropriate entry, and if * none is found, reads the terms in field as floats and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the floats. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public float[] getFloats (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if * none is found, reads the terms in field as floats and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the floats. * @param parser Computes float for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public float[] getFloats (IndexReader reader, String field, FloatParser parser) throws IOException; /** * Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as longs and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * * @param reader Used to get field values. * @param field Which field contains the longs. * @return The values in the given field for each document. * @throws java.io.IOException If any error occurs. */ public long[] getLongs(IndexReader reader, String field) throws IOException; /** * Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field as longs and returns an array of * size reader.maxDoc() of the value each document has in the * given field. * * @param reader Used to get field values. * @param field Which field contains the longs. 
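// Illustrative sketch, not part of the Lucene sources: typical use of the
// accessors above. The field name "year" and the idea of printing every
// document's value are assumptions for the example. The returned array is
// indexed by document number and has length reader.maxDoc(), so deleted
// documents still occupy a slot.
public static void dumpYears(IndexReader reader) throws IOException {
  int[] years = FieldCache.DEFAULT.getInts(reader, "year");
  for (int docID = 0; docID < years.length; docID++) {
    if (!reader.isDeleted(docID)) {
      System.out.println("doc " + docID + " year=" + years[docID]);
    }
  }
}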
* @param parser Computes integer for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public long[] getLongs(IndexReader reader, String field, LongParser parser) throws IOException; /** * Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as integers and returns an array * of size reader.maxDoc() of the value each document * has in the given field. * * @param reader Used to get field values. * @param field Which field contains the doubles. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public double[] getDoubles(IndexReader reader, String field) throws IOException; /** * Checks the internal cache for an appropriate entry, and if none is found, * reads the terms in field as doubles and returns an array of * size reader.maxDoc() of the value each document has in the * given field. * * @param reader Used to get field values. * @param field Which field contains the doubles. * @param parser Computes integer for string values. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public double[] getDoubles(IndexReader reader, String field, DoubleParser parser) throws IOException; /** Checks the internal cache for an appropriate entry, and if none * is found, reads the term values in field and returns an array * of size reader.maxDoc() containing the value each document * has in the given field. * @param reader Used to get field values. * @param field Which field contains the strings. * @return The values in the given field for each document. * @throws IOException If any error occurs. */ public String[] getStrings (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if none * is found reads the term values in field and returns * an array of them in natural order, along with an array telling * which element in the term array each document uses. * @param reader Used to get field values. * @param field Which field contains the strings. * @return Array of terms and index into the array for each document. * @throws IOException If any error occurs. */ public StringIndex getStringIndex (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if * none is found reads field to see if it contains integers, longs, floats * or strings, and then calls one of the other methods in this class to get the * values. For string values, a StringIndex is returned. After * calling this method, there is an entry in the cache for both * type AUTO and the actual found type. * @param reader Used to get field values. * @param field Which field contains the values. * @return int[], long[], float[] or StringIndex. * @throws IOException If any error occurs. * @deprecated Please specify the exact type, instead. * Especially, guessing does not work with the new * {@link NumericField} type. */ public Object getAuto (IndexReader reader, String field) throws IOException; /** Checks the internal cache for an appropriate entry, and if none * is found reads the terms out of field and calls the given SortComparator * to get the sort values. A hit in the cache will happen if reader, * field, and comparator are the same (using equals()) * as a previous call to this method. * @param reader Used to get field values. * @param field Which field contains the values. 
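// Illustrative sketch, not part of the Lucene sources: resolving a document's
// term through the order/lookup arrays returned by getStringIndex, as
// described for StringIndex above. The field name "category" is an assumption
// for the example; ordinal 0 maps to null for documents that have no value in
// the field (see StringIndex.binarySearchLookup above).
public static String categoryOf(IndexReader reader, int docID) throws IOException {
  FieldCache.StringIndex index = FieldCache.DEFAULT.getStringIndex(reader, "category");
  return index.lookup[index.order[docID]];  // may be null
}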
* @param comparator Used to convert terms into something to sort by. * @return Array of sort objects, one for each document. * @throws IOException If any error occurs. * @deprecated Please implement {@link * FieldComparatorSource} directly, instead. */ public Comparable[] getCustom (IndexReader reader, String field, SortComparator comparator) throws IOException; /** * EXPERT: A unique Identifier/Description for each item in the FieldCache. * Can be useful for logging/debugging. *

    * EXPERIMENTAL API: This API is considered extremely advanced * and experimental. It may be removed or altered w/o warning in future * releases * of Lucene. *

    */ public static abstract class CacheEntry { public abstract Object getReaderKey(); public abstract String getFieldName(); public abstract Class getCacheType(); public abstract Object getCustom(); public abstract Object getValue(); private String size = null; protected final void setEstimatedSize(String size) { this.size = size; } /** * @see #estimateSize(RamUsageEstimator) */ public void estimateSize() { estimateSize(new RamUsageEstimator(false)); // doesn't check for interned } /** * Computes (and stores) the estimated size of the cache Value * @see #getEstimatedSize */ public void estimateSize(RamUsageEstimator ramCalc) { long size = ramCalc.estimateRamUsage(getValue()); setEstimatedSize(RamUsageEstimator.humanReadableUnits (size, new DecimalFormat("0.#"))); } /** * The most recently estimated size of the value, null unless * estimateSize has been called. */ public final String getEstimatedSize() { return size; } public String toString() { StringBuffer b = new StringBuffer(); b.append("'").append(getReaderKey()).append("'=>"); b.append("'").append(getFieldName()).append("',"); b.append(getCacheType()).append(",").append(getCustom()); b.append("=>").append(getValue().getClass().getName()).append("#"); b.append(System.identityHashCode(getValue())); String s = getEstimatedSize(); if(null != s) { b.append(" (size =~ ").append(s).append(')'); } return b.toString(); } } /** * EXPERT: Generates an array of CacheEntry objects representing all items * currently in the FieldCache. *

* NOTE: These CacheEntry objects maintain a strong reference to the * Cached Values. Maintaining a reference to a CacheEntry whose associated * IndexReader has already been garbage collected will prevent the Value itself * from being garbage collected when the Cache drops its WeakReference. *

    *

    * EXPERIMENTAL API: This API is considered extremely advanced * and experimental. It may be removed or altered w/o warning in future * releases * of Lucene. *

    */ public abstract CacheEntry[] getCacheEntries(); /** *

    * EXPERT: Instructs the FieldCache to forcibly expunge all entries * from the underlying caches. This is intended only to be used for * test methods as a way to ensure a known base state of the Cache * (with out needing to rely on GC to free WeakReferences). * It should not be relied on for "Cache maintenance" in general * application code. *

    *

    * EXPERIMENTAL API: This API is considered extremely advanced * and experimental. It may be removed or altered w/o warning in future * releases * of Lucene. *

    */ public abstract void purgeAllCaches(); /** * Expert: drops all cache entries associated with this * reader. NOTE: this reader must precisely match the * reader that the cache entry is keyed on. If you pass a * top-level reader, it usually will have no effect as * Lucene now caches at the segment reader level. */ public abstract void purge(IndexReader r); /** * If non-null, FieldCacheImpl will warn whenever * entries are created that are not sane according to * {@link org.apache.lucene.util.FieldCacheSanityChecker}. */ public void setInfoStream(PrintStream stream); /** counterpart of {@link #setInfoStream(PrintStream)} */ public PrintStream getInfoStream(); } lucene-2.9.4/src/java/org/apache/lucene/search/PhrasePositions.java0000644000175000017500000000506111474320224025711 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.*; /** * Position of a term in a document that takes into account the term offset within the phrase. */ final class PhrasePositions { int doc; // current doc int position; // position in doc int count; // remaining pos in this doc int offset; // position in phrase TermPositions tp; // stream of positions PhrasePositions next; // used to make lists boolean repeats; // there's other pp for same term (e.g. query="1st word 2nd word"~1) PhrasePositions(TermPositions t, int o) { tp = t; offset = o; } final boolean next() throws IOException { // increments to next doc if (!tp.next()) { tp.close(); // close stream doc = Integer.MAX_VALUE; // sentinel value return false; } doc = tp.doc(); position = 0; return true; } final boolean skipTo(int target) throws IOException { if (!tp.skipTo(target)) { tp.close(); // close stream doc = Integer.MAX_VALUE; // sentinel value return false; } doc = tp.doc(); position = 0; return true; } final void firstPosition() throws IOException { count = tp.freq(); // read first pos nextPosition(); } /** * Go to next location of this term current document, and set * position as location - offset, so that a * matching exact phrase is easily identified when all PhrasePositions * have exactly the same position. */ final boolean nextPosition() throws IOException { if (count-- > 0) { // read subsequent pos's position = tp.nextPosition() - offset; return true; } else return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldComparatorSource.java0000644000175000017500000000302111474320225027006 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Serializable; /** * Provides a {@link FieldComparator} for custom field sorting. * * NOTE: This API is experimental and might change in * incompatible ways in the next release. * */ public abstract class FieldComparatorSource implements Serializable { /** * Creates a comparator for the field in the given index. * * @param fieldname * Name of the field to create comparator for. * @return FieldComparator. * @throws IOException * If an error occurs reading the index. */ public abstract FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/search/NumericRangeFilter.java0000644000175000017500000002175011474320224026307 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.util.NumericUtils; // for javadocs /** * A {@link Filter} that only accepts numeric values within * a specified range. To use this, you must first index the * numeric values using {@link NumericField} (expert: {@link * NumericTokenStream}). * *
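// Illustrative sketch, not part of the Lucene sources: a FieldComparatorSource
// is simply a factory that Sort/SortField invoke once per search. The class
// below wires in a hypothetical custom comparator (the StringLengthComparator
// sketch shown earlier in this archive); both names are assumptions for the
// example.
public class StringLengthComparatorSource extends FieldComparatorSource {
  public FieldComparator newComparator(String fieldname, int numHits,
                                       int sortPos, boolean reversed) throws IOException {
    return new StringLengthComparator(numHits, fieldname);
  }
}
// Usage: new Sort(new SortField("title", new StringLengthComparatorSource()))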

    You create a new NumericRangeFilter with the static * factory methods, eg: * *

     * Filter f = NumericRangeFilter.newFloatRange("weight",
 *                                             new Float(0.3f), new Float(1.0f),
     *                                             true, true);
     * 
* * accepts all documents whose float valued "weight" field * ranges from 0.3 to 1.0, inclusive. * See {@link NumericRangeQuery} for details on how Lucene * indexes and searches numeric valued fields. * *

    NOTE: This API is experimental and * might change in incompatible ways in the next * release. * * @since 2.9 **/ public final class NumericRangeFilter extends MultiTermQueryWrapperFilter { private NumericRangeFilter(final NumericRangeQuery query) { super(query); } /** * Factory that creates a NumericRangeFilter, that filters a long * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newLongRange(final String field, final int precisionStep, Long min, Long max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newLongRange(field, precisionStep, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that queries a long * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newLongRange(final String field, Long min, Long max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newLongRange(field, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that filters a int * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newIntRange(final String field, final int precisionStep, Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newIntRange(field, precisionStep, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that queries a int * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newIntRange(final String field, Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newIntRange(field, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that filters a double * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. 
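// Illustrative sketch, not part of the Lucene sources: applying one of the
// factory methods above in a filtered search. The field name "price", the
// bounds and the pre-built IndexSearcher are assumptions for the example; the
// field must have been indexed with NumericField (or NumericTokenStream) as
// the class javadoc explains.
Filter priceFilter = NumericRangeFilter.newIntRange("price",
    Integer.valueOf(10), Integer.valueOf(100), true, true);  // 10 <= price <= 100
TopDocs hits = searcher.search(new MatchAllDocsQuery(), priceFilter, 20);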
*/ public static NumericRangeFilter newDoubleRange(final String field, final int precisionStep, Double min, Double max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newDoubleRange(field, precisionStep, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that queries a double * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newDoubleRange(final String field, Double min, Double max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newDoubleRange(field, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that filters a float * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newFloatRange(final String field, final int precisionStep, Float min, Float max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newFloatRange(field, precisionStep, min, max, minInclusive, maxInclusive) ); } /** * Factory that creates a NumericRangeFilter, that queries a float * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeFilter newFloatRange(final String field, Float min, Float max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeFilter( NumericRangeQuery.newFloatRange(field, min, max, minInclusive, maxInclusive) ); } /** Returns the field name for this filter */ public String getField() { return ((NumericRangeQuery)query).getField(); } /** Returns true if the lower endpoint is inclusive */ public boolean includesMin() { return ((NumericRangeQuery)query).includesMin(); } /** Returns true if the upper endpoint is inclusive */ public boolean includesMax() { return ((NumericRangeQuery)query).includesMax(); } /** Returns the lower value of this range filter */ public Number getMin() { return ((NumericRangeQuery)query).getMin(); } /** Returns the upper value of this range filter */ public Number getMax() { return ((NumericRangeQuery)query).getMax(); } } lucene-2.9.4/src/java/org/apache/lucene/search/FilteredQuery.java0000644000175000017500000001775011474320224025353 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.Set; /** * A query that applies a filter to the results of another query. * *

    Note: the bits are retrieved from the filter each time this * query is used in a search - use a CachingWrapperFilter to avoid * regenerating the bits every time. * *

    Created: Apr 20, 2004 8:58:29 AM * * @since 1.4 * @version $Id: FilteredQuery.java 807821 2009-08-25 21:55:49Z mikemccand $ * @see CachingWrapperFilter */ public class FilteredQuery extends Query { Query query; Filter filter; /** * Constructs a new query which applies a filter to the results of the original query. * Filter.getDocIdSet() will be called every time this query is used in a search. * @param query Query to be filtered, cannot be null. * @param filter Filter to apply to query results, cannot be null. */ public FilteredQuery (Query query, Filter filter) { this.query = query; this.filter = filter; } /** * Returns a Weight that applies the filter to the enclosed query's Weight. * This is accomplished by overriding the Scorer returned by the Weight. */ public Weight createWeight(final Searcher searcher) throws IOException { final Weight weight = query.createWeight (searcher); final Similarity similarity = query.getSimilarity(searcher); return new Weight() { private float value; // pass these methods through to enclosed query's weight public float getValue() { return value; } public float sumOfSquaredWeights() throws IOException { return weight.sumOfSquaredWeights() * getBoost() * getBoost(); } public void normalize (float v) { weight.normalize(v); value = weight.getValue() * getBoost(); } public Explanation explain (IndexReader ir, int i) throws IOException { Explanation inner = weight.explain (ir, i); if (getBoost()!=1) { Explanation preBoost = inner; inner = new Explanation(inner.getValue()*getBoost(),"product of:"); inner.addDetail(new Explanation(getBoost(),"boost")); inner.addDetail(preBoost); } Filter f = FilteredQuery.this.filter; DocIdSet docIdSet = f.getDocIdSet(ir); DocIdSetIterator docIdSetIterator = docIdSet == null ? DocIdSet.EMPTY_DOCIDSET.iterator() : docIdSet.iterator(); if (docIdSetIterator == null) { docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator(); } if (docIdSetIterator.advance(i) == i) { return inner; } else { Explanation result = new Explanation (0.0f, "failure to match filter: " + f.toString()); result.addDetail(inner); return result; } } // return this query public Query getQuery() { return FilteredQuery.this; } // return a filtering scorer public Scorer scorer(IndexReader indexReader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { final Scorer scorer = weight.scorer(indexReader, true, false); if (scorer == null) { return null; } DocIdSet docIdSet = filter.getDocIdSet(indexReader); if (docIdSet == null) { return null; } final DocIdSetIterator docIdSetIterator = docIdSet.iterator(); if (docIdSetIterator == null) { return null; } return new Scorer(similarity) { private int doc = -1; private int advanceToCommon(int scorerDoc, int disiDoc) throws IOException { while (scorerDoc != disiDoc) { if (scorerDoc < disiDoc) { scorerDoc = scorer.advance(disiDoc); } else { disiDoc = docIdSetIterator.advance(scorerDoc); } } return scorerDoc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { int scorerDoc, disiDoc; return doc = (disiDoc = docIdSetIterator.nextDoc()) != NO_MORE_DOCS && (scorerDoc = scorer.nextDoc()) != NO_MORE_DOCS && advanceToCommon(scorerDoc, disiDoc) != NO_MORE_DOCS ? scorer.docID() : NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return scorer.doc(); } public int docID() { return doc; } /** @deprecated use {@link #advance(int)} instead. 
*/ public boolean skipTo(int i) throws IOException { return advance(i) != NO_MORE_DOCS; } public int advance(int target) throws IOException { int disiDoc, scorerDoc; return doc = (disiDoc = docIdSetIterator.advance(target)) != NO_MORE_DOCS && (scorerDoc = scorer.advance(disiDoc)) != NO_MORE_DOCS && advanceToCommon(scorerDoc, disiDoc) != NO_MORE_DOCS ? scorer.docID() : NO_MORE_DOCS; } public float score() throws IOException { return getBoost() * scorer.score(); } // add an explanation about whether the document was filtered public Explanation explain (int i) throws IOException { Explanation exp = scorer.explain(i); if (docIdSetIterator.advance(i) == i) { exp.setDescription ("allowed by filter: "+exp.getDescription()); exp.setValue(getBoost() * exp.getValue()); } else { exp.setDescription ("removed by filter: "+exp.getDescription()); exp.setValue(0.0f); } return exp; } }; } }; } /** Rewrites the wrapped query. */ public Query rewrite(IndexReader reader) throws IOException { Query rewritten = query.rewrite(reader); if (rewritten != query) { FilteredQuery clone = (FilteredQuery)this.clone(); clone.query = rewritten; return clone; } else { return this; } } public Query getQuery() { return query; } public Filter getFilter() { return filter; } // inherit javadoc public void extractTerms(Set terms) { getQuery().extractTerms(terms); } /** Prints a user-readable version of this query. */ public String toString (String s) { StringBuffer buffer = new StringBuffer(); buffer.append("filtered("); buffer.append(query.toString(s)); buffer.append(")->"); buffer.append(filter); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (o instanceof FilteredQuery) { FilteredQuery fq = (FilteredQuery) o; return (query.equals(fq.query) && filter.equals(fq.filter) && getBoost()==fq.getBoost()); } return false; } /** Returns a hash code value for this object. */ public int hashCode() { return query.hashCode() ^ filter.hashCode() + Float.floatToRawIntBits(getBoost()); } } lucene-2.9.4/src/java/org/apache/lucene/search/HitCollector.java0000644000175000017500000000453511474320224025157 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Lower-level search API.
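// Illustrative sketch, not part of the Lucene sources: combining FilteredQuery
// with a CachingWrapperFilter, following the note above about the filter bits
// being re-fetched on every search. The field and term names, the bound and
// the pre-built IndexSearcher are assumptions for the example.
Filter recentOnly = new CachingWrapperFilter(
    NumericRangeFilter.newIntRange("year", Integer.valueOf(2005), null, true, true));
Query filtered = new FilteredQuery(new TermQuery(new Term("body", "lucene")), recentOnly);
TopDocs top = searcher.search(filtered, 10);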
    * HitCollectors are primarily meant to be used to implement queries, sorting * and filtering. See {@link Collector} for a lower level and higher performance * (on a multi-segment index) API. * * @see Searcher#search(Query,HitCollector) * @version $Id: HitCollector.java 764551 2009-04-13 18:33:56Z mikemccand $ * @deprecated Please use {@link Collector} instead. */ public abstract class HitCollector { /** Called once for every document matching a query, with the document * number and its raw score. * *

    If, for example, an application wished to collect all of the hits for a * query in a BitSet, then it might:

       *   Searcher searcher = new IndexSearcher(indexReader);
       *   final BitSet bits = new BitSet(indexReader.maxDoc());
       *   searcher.search(query, new HitCollector() {
       *       public void collect(int doc, float score) {
       *         bits.set(doc);
       *       }
       *     });
       * 
    * *

    Note: This is called in an inner search loop. For good search * performance, implementations of this method should not call * {@link Searcher#doc(int)} or * {@link org.apache.lucene.index.IndexReader#document(int)} on every * document number encountered. Doing so can slow searches by an order * of magnitude or more. *
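// Illustrative sketch, not part of the Lucene sources: the Collector-based
// equivalent of the BitSet example above, for code migrating off this
// deprecated class. Unlike HitCollector, Collector.collect() receives
// per-segment document IDs, so the docBase handed to setNextReader() must be
// added back; searcher, query and indexReader are assumed to exist as in the
// example above, and java.util.BitSet is assumed to be imported.
final BitSet bits = new BitSet(indexReader.maxDoc());
searcher.search(query, new Collector() {
  private int docBase;
  public void setScorer(Scorer scorer) {
    // scores are not needed when merely marking matches
  }
  public void collect(int doc) {
    bits.set(doc + docBase);              // rebase to a top-level document ID
  }
  public void setNextReader(IndexReader reader, int docBase) {
    this.docBase = docBase;
  }
  public boolean acceptsDocsOutOfOrder() {
    return true;                          // order is irrelevant for a bit set
  }
});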

    Note: The score passed to this method is a raw score. * In other words, the score will not necessarily be a float whose value is * between 0 and 1. */ public abstract void collect(int doc, float score); } lucene-2.9.4/src/java/org/apache/lucene/search/ScoreDocComparator.java0000644000175000017500000000637211474320224026316 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Expert: Compares two ScoreDoc objects for sorting. * *

    Created: Feb 3, 2004 9:00:16 AM * * @since lucene 1.4 * @version $Id: ScoreDocComparator.java 738219 2009-01-27 20:15:21Z mikemccand $ * @deprecated use {@link FieldComparator} */ public interface ScoreDocComparator { /** Special comparator for sorting hits according to computed relevance (document score). */ static final ScoreDocComparator RELEVANCE = new ScoreDocComparator() { public int compare (ScoreDoc i, ScoreDoc j) { if (i.score > j.score) return -1; if (i.score < j.score) return 1; return 0; } public Comparable sortValue (ScoreDoc i) { return new Float (i.score); } public int sortType() { return SortField.SCORE; } }; /** Special comparator for sorting hits according to index order (document number). */ static final ScoreDocComparator INDEXORDER = new ScoreDocComparator() { public int compare (ScoreDoc i, ScoreDoc j) { if (i.doc < j.doc) return -1; if (i.doc > j.doc) return 1; return 0; } public Comparable sortValue (ScoreDoc i) { return new Integer (i.doc); } public int sortType() { return SortField.DOC; } }; /** * Compares two ScoreDoc objects and returns a result indicating their * sort order. * @param i First ScoreDoc * @param j Second ScoreDoc * @return a negative integer if i should come before j
    * a positive integer if i should come after j
    * 0 if they are equal * @see java.util.Comparator */ int compare (ScoreDoc i, ScoreDoc j); /** * Returns the value used to sort the given document. The * object returned must implement the java.io.Serializable * interface. This is used by multisearchers to determine how * to collate results from their searchers. * @see FieldDoc * @param i Document * @return Serializable object */ Comparable sortValue (ScoreDoc i); /** * Returns the type of sort. Should return SortField.SCORE, * SortField.DOC, SortField.STRING, * SortField.INTEGER, SortField.FLOAT or * SortField.CUSTOM. It is not valid to return * SortField.AUTO. * This is used by multisearchers to determine how to collate results * from their searchers. * @return One of the constants in SortField. * @see SortField */ int sortType(); } lucene-2.9.4/src/java/org/apache/lucene/search/SortField.java0000644000175000017500000004745711474320224024471 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Serializable; import java.util.Locale; import org.apache.lucene.document.NumericField; // javadocs import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.util.StringHelper; /** * Stores information about how to sort documents by terms in an individual * field. Fields must be indexed in order to sort by them. * *
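    For example, to sort hits on an indexed, untokenized "title" field in reverse lexicographic order (a minimal sketch; the searcher and query variables and the "title" field are assumptions for illustration):
     *   Sort sort = new Sort(new SortField("title", SortField.STRING, true));
     *   TopDocs hits = searcher.search(query, null, 10, sort);
     *   for (int i = 0; i < hits.scoreDocs.length; i++) {
     *     System.out.println(hits.scoreDocs[i].doc);
     *   }
     * 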

    Created: Feb 11, 2004 1:25:29 PM * * @since lucene 1.4 * @version $Id: SortField.java 833298 2009-11-06 04:32:24Z uschindler $ * @see Sort */ public class SortField implements Serializable { /** Sort by document score (relevancy). Sort values are Float and higher * values are at the front. */ public static final int SCORE = 0; /** Sort by document number (index order). Sort values are Integer and lower * values are at the front. */ public static final int DOC = 1; /** Guess type of sort based on field contents. A regular expression is used * to look at the first term indexed for the field and determine if it * represents an integer number, a floating point number, or just arbitrary * string characters. * @deprecated Please specify the exact type, instead. * Especially, guessing does not work with the new * {@link NumericField} type. */ public static final int AUTO = 2; /** Sort using term values as Strings. Sort values are String and lower * values are at the front. */ public static final int STRING = 3; /** Sort using term values as encoded Integers. Sort values are Integer and * lower values are at the front. */ public static final int INT = 4; /** Sort using term values as encoded Floats. Sort values are Float and * lower values are at the front. */ public static final int FLOAT = 5; /** Sort using term values as encoded Longs. Sort values are Long and * lower values are at the front. */ public static final int LONG = 6; /** Sort using term values as encoded Doubles. Sort values are Double and * lower values are at the front. */ public static final int DOUBLE = 7; /** Sort using term values as encoded Shorts. Sort values are Short and * lower values are at the front. */ public static final int SHORT = 8; /** Sort using a custom Comparator. Sort values are any Comparable and * sorting is done according to natural order. */ public static final int CUSTOM = 9; /** Sort using term values as encoded Bytes. Sort values are Byte and * lower values are at the front. */ public static final int BYTE = 10; /** Sort using term values as Strings, but comparing by * value (using String.compareTo) for all comparisons. * This is typically slower than {@link #STRING}, which * uses ordinals to do the sorting. */ public static final int STRING_VAL = 11; // IMPLEMENTATION NOTE: the FieldCache.STRING_INDEX is in the same "namespace" // as the above static int values. Any new values must not have the same value // as FieldCache.STRING_INDEX. /** Represents sorting by document score (relevancy). */ public static final SortField FIELD_SCORE = new SortField (null, SCORE); /** Represents sorting by document number (index order). */ public static final SortField FIELD_DOC = new SortField (null, DOC); private String field; private int type = AUTO; // defaults to determining type dynamically private Locale locale; // defaults to "natural order" (no Locale) boolean reverse = false; // defaults to natural order private SortComparatorSource factory; private FieldCache.Parser parser; // Used for CUSTOM sort private FieldComparatorSource comparatorSource; private boolean useLegacy = false; // remove in Lucene 3.0 /** Creates a sort by terms in the given field where the type of term value * is determined dynamically ({@link #AUTO AUTO}). * @param field Name of field to sort by, cannot be * null. * @deprecated Please specify the exact type instead. 
*/ public SortField (String field) { initFieldType(field, AUTO); } /** Creates a sort, possibly in reverse, by terms in the given field where * the type of term value is determined dynamically ({@link #AUTO AUTO}). * @param field Name of field to sort by, cannot be null. * @param reverse True if natural order should be reversed. * @deprecated Please specify the exact type instead. */ public SortField (String field, boolean reverse) { initFieldType(field, AUTO); this.reverse = reverse; } /** Creates a sort by terms in the given field with the type of term * values explicitly given. * @param field Name of field to sort by. Can be null if * type is SCORE or DOC. * @param type Type of values in the terms. */ public SortField (String field, int type) { initFieldType(field, type); } /** Creates a sort, possibly in reverse, by terms in the given field with the * type of term values explicitly given. * @param field Name of field to sort by. Can be null if * type is SCORE or DOC. * @param type Type of values in the terms. * @param reverse True if natural order should be reversed. */ public SortField (String field, int type, boolean reverse) { initFieldType(field, type); this.reverse = reverse; } /** Creates a sort by terms in the given field, parsed * to numeric values using a custom {@link FieldCache.Parser}. * @param field Name of field to sort by. Must not be null. * @param parser Instance of a {@link FieldCache.Parser}, * which must subclass one of the existing numeric * parsers from {@link FieldCache}. Sort type is inferred * by testing which numeric parser the parser subclasses. * @throws IllegalArgumentException if the parser fails to * subclass an existing numeric parser, or field is null */ public SortField (String field, FieldCache.Parser parser) { this(field, parser, false); } /** Creates a sort, possibly in reverse, by terms in the given field, parsed * to numeric values using a custom {@link FieldCache.Parser}. * @param field Name of field to sort by. Must not be null. * @param parser Instance of a {@link FieldCache.Parser}, * which must subclass one of the existing numeric * parsers from {@link FieldCache}. Sort type is inferred * by testing which numeric parser the parser subclasses. * @param reverse True if natural order should be reversed. * @throws IllegalArgumentException if the parser fails to * subclass an existing numeric parser, or field is null */ public SortField (String field, FieldCache.Parser parser, boolean reverse) { if (parser instanceof FieldCache.IntParser) initFieldType(field, INT); else if (parser instanceof FieldCache.FloatParser) initFieldType(field, FLOAT); else if (parser instanceof FieldCache.ShortParser) initFieldType(field, SHORT); else if (parser instanceof FieldCache.ByteParser) initFieldType(field, BYTE); else if (parser instanceof FieldCache.LongParser) initFieldType(field, LONG); else if (parser instanceof FieldCache.DoubleParser) initFieldType(field, DOUBLE); else throw new IllegalArgumentException("Parser instance does not subclass existing numeric parser from FieldCache (got " + parser + ")"); this.reverse = reverse; this.parser = parser; } /** Creates a sort by terms in the given field sorted * according to the given locale. * @param field Name of field to sort by, cannot be null. * @param locale Locale of values in the field. */ public SortField (String field, Locale locale) { initFieldType(field, STRING); this.locale = locale; } /** Creates a sort, possibly in reverse, by terms in the given field sorted * according to the given locale. 
* @param field Name of field to sort by, cannot be null. * @param locale Locale of values in the field. */ public SortField (String field, Locale locale, boolean reverse) { initFieldType(field, STRING); this.locale = locale; this.reverse = reverse; } /** Creates a sort with a custom comparison function. * @param field Name of field to sort by; cannot be null. * @param comparator Returns a comparator for sorting hits. * @deprecated use SortField (String field, FieldComparatorSource comparator) */ public SortField (String field, SortComparatorSource comparator) { initFieldType(field, CUSTOM); setUseLegacySearch(true); this.factory = comparator; } /** Creates a sort with a custom comparison function. * @param field Name of field to sort by; cannot be null. * @param comparator Returns a comparator for sorting hits. */ public SortField (String field, FieldComparatorSource comparator) { initFieldType(field, CUSTOM); this.comparatorSource = comparator; } /** Creates a sort, possibly in reverse, with a custom comparison function. * @param field Name of field to sort by; cannot be null. * @param comparator Returns a comparator for sorting hits. * @param reverse True if natural order should be reversed. * @deprecated use SortField (String field, FieldComparatorSource comparator, boolean reverse) */ public SortField (String field, SortComparatorSource comparator, boolean reverse) { initFieldType(field, CUSTOM); setUseLegacySearch(true); this.reverse = reverse; this.factory = comparator; } /** Creates a sort, possibly in reverse, with a custom comparison function. * @param field Name of field to sort by; cannot be null. * @param comparator Returns a comparator for sorting hits. * @param reverse True if natural order should be reversed. */ public SortField (String field, FieldComparatorSource comparator, boolean reverse) { initFieldType(field, CUSTOM); this.reverse = reverse; this.comparatorSource = comparator; } // Sets field & type, and ensures field is not NULL unless // type is SCORE or DOC private void initFieldType(String field, int type) { this.type = type; if (field == null) { if (type != SCORE && type != DOC) throw new IllegalArgumentException("field can only be null when type is SCORE or DOC"); } else { this.field = StringHelper.intern(field); } } /** Returns the name of the field. Could return null * if the sort is by SCORE or DOC. * @return Name of field, possibly null. */ public String getField() { return field; } /** Returns the type of contents in the field. * @return One of the constants SCORE, DOC, AUTO, STRING, INT or FLOAT. */ public int getType() { return type; } /** Returns the Locale by which term values are interpreted. * May return null if no Locale was specified. * @return Locale, or null. */ public Locale getLocale() { return locale; } /** Returns the instance of a {@link FieldCache} parser that fits to the given sort type. * May return null if no parser was specified. Sorting is using the default parser then. * @return An instance of a {@link FieldCache} parser, or null. */ public FieldCache.Parser getParser() { return parser; } /** Returns whether the sort should be reversed. * @return True if natural order should be reversed. 
*/ public boolean getReverse() { return reverse; } /** * @deprecated use {@link #getComparatorSource()} */ public SortComparatorSource getFactory() { return factory; } public FieldComparatorSource getComparatorSource() { return comparatorSource; } /** * Use legacy IndexSearch implementation: search with a DirectoryReader rather * than passing a single hit collector to multiple SegmentReaders. * * @param legacy true for legacy behavior * @deprecated will be removed in Lucene 3.0. */ public void setUseLegacySearch(boolean legacy) { this.useLegacy = legacy; } /** * @return if true, IndexSearch will use legacy sorting search implementation. * eg. multiple Priority Queues. * @deprecated will be removed in Lucene 3.0. */ public boolean getUseLegacySearch() { return this.useLegacy; } public String toString() { StringBuffer buffer = new StringBuffer(); switch (type) { case SCORE: buffer.append(""); break; case DOC: buffer.append(""); break; case AUTO: buffer.append(""); break; case STRING: buffer.append(""); break; case STRING_VAL: buffer.append(""); break; case BYTE: buffer.append(""); break; case SHORT: buffer.append(""); break; case INT: buffer.append(""); break; case LONG: buffer.append(""); break; case FLOAT: buffer.append(""); break; case DOUBLE: buffer.append(""); break; case CUSTOM: buffer.append("'); break; default: buffer.append(""); break; } if (locale != null) buffer.append('(').append(locale).append(')'); if (parser != null) buffer.append('(').append(parser).append(')'); if (reverse) buffer.append('!'); return buffer.toString(); } /** Returns true if o is equal to this. If a * {@link SortComparatorSource} (deprecated) or {@link * FieldCache.Parser} was provided, it must properly * implement equals (unless a singleton is always used). */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof SortField)) return false; final SortField other = (SortField)o; return ( other.field == this.field // field is always interned && other.type == this.type && other.reverse == this.reverse && (other.locale == null ? this.locale == null : other.locale.equals(this.locale)) && (other.factory == null ? this.factory == null : other.factory.equals(this.factory)) && (other.comparatorSource == null ? this.comparatorSource == null : other.comparatorSource.equals(this.comparatorSource)) && (other.parser == null ? this.parser == null : other.parser.equals(this.parser)) ); } /** Returns true if o is equal to this. If a * {@link SortComparatorSource} (deprecated) or {@link * FieldCache.Parser} was provided, it must properly * implement hashCode (unless a singleton is always * used). */ public int hashCode() { int hash=type^0x346565dd + Boolean.valueOf(reverse).hashCode()^0xaf5998bb; if (field != null) hash += field.hashCode()^0xff5685dd; if (locale != null) hash += locale.hashCode()^0x08150815; if (factory != null) hash += factory.hashCode()^0x34987555; if (comparatorSource != null) hash += comparatorSource.hashCode(); if (parser != null) hash += parser.hashCode()^0x3aaf56ff; return hash; } // field must be interned after reading from stream private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { in.defaultReadObject(); if (field != null) field = StringHelper.intern(field); } /** Returns the {@link FieldComparator} to use for * sorting. * * NOTE: This API is experimental and might change in * incompatible ways in the next release. 
* * @param numHits number of top hits the queue will store * @param sortPos position of this SortField within {@link * Sort}. The comparator is primary if sortPos==0, * secondary if sortPos==1, etc. Some comparators can * optimize themselves when they are the primary sort. * @return {@link FieldComparator} to use when sorting */ public FieldComparator getComparator(final int numHits, final int sortPos) throws IOException { if (locale != null) { // TODO: it'd be nice to allow FieldCache.getStringIndex // to optionally accept a Locale so sorting could then use // the faster StringComparator impls return new FieldComparator.StringComparatorLocale(numHits, field, locale); } switch (type) { case SortField.SCORE: return new FieldComparator.RelevanceComparator(numHits); case SortField.DOC: return new FieldComparator.DocComparator(numHits); case SortField.INT: return new FieldComparator.IntComparator(numHits, field, parser); case SortField.FLOAT: return new FieldComparator.FloatComparator(numHits, field, parser); case SortField.LONG: return new FieldComparator.LongComparator(numHits, field, parser); case SortField.DOUBLE: return new FieldComparator.DoubleComparator(numHits, field, parser); case SortField.BYTE: return new FieldComparator.ByteComparator(numHits, field, parser); case SortField.SHORT: return new FieldComparator.ShortComparator(numHits, field, parser); case SortField.CUSTOM: assert factory == null && comparatorSource != null; return comparatorSource.newComparator(field, numHits, sortPos, reverse); case SortField.STRING: return new FieldComparator.StringOrdValComparator(numHits, field, sortPos, reverse); case SortField.STRING_VAL: return new FieldComparator.StringValComparator(numHits, field); default: throw new IllegalStateException("Illegal sort type: " + type); } } /** * Attempts to detect the given field type for an IndexReader. * @deprecated */ static int detectFieldType(IndexReader reader, String fieldKey) throws IOException { String field = StringHelper.intern(fieldKey); TermEnum enumerator = reader.terms(new Term(field)); try { Term term = enumerator.term(); if (term == null) { throw new RuntimeException("no terms in field " + field + " - cannot determine sort type"); } int ret = 0; if (term.field() == field) { String termtext = term.text().trim(); try { Integer.parseInt (termtext); ret = SortField.INT; } catch (NumberFormatException nfe1) { try { Long.parseLong(termtext); ret = SortField.LONG; } catch (NumberFormatException nfe2) { try { Float.parseFloat (termtext); ret = SortField.FLOAT; } catch (NumberFormatException nfe3) { ret = SortField.STRING; } } } } else { throw new RuntimeException("field \"" + field + "\" does not appear to be indexed"); } return ret; } finally { enumerator.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/search/PhraseScorer.java0000644000175000017500000001471111474320224025161 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.TermPositions; /** Expert: Scoring functionality for phrase queries. *
    A document is considered matching if it contains the phrase-query terms * at "valid" positions. What "valid positions" are * depends on the type of the phrase query: for an exact phrase query terms are required * to appear in adjacent locations, while for a sloppy phrase query some distance between * the terms is allowed. The abstract method {@link #phraseFreq()} of extending classes * is invoked for each document containing all the phrase query terms, in order to * compute the frequency of the phrase query in that document. A non zero frequency * means a match. */ abstract class PhraseScorer extends Scorer { private Weight weight; protected byte[] norms; protected float value; private boolean firstTime = true; private boolean more = true; protected PhraseQueue pq; protected PhrasePositions first, last; private float freq; //phrase frequency in current doc as computed by phraseFreq(). PhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, byte[] norms) { super(similarity); this.norms = norms; this.weight = weight; this.value = weight.getValue(); // convert tps to a list of phrase positions. // note: phrase-position differs from term-position in that its position // reflects the phrase offset: pp.pos = tp.pos - offset. // this allows to easily identify a matching (exact) phrase // when all PhrasePositions have exactly the same position. for (int i = 0; i < tps.length; i++) { PhrasePositions pp = new PhrasePositions(tps[i], offsets[i]); if (last != null) { // add next to end of list last.next = pp; } else { first = pp; } last = pp; } pq = new PhraseQueue(tps.length); // construct empty pq first.doc = -1; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return first.doc; } public int docID() { return first.doc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (firstTime) { init(); firstTime = false; } else if (more) { more = last.next(); // trigger further scanning } if (!doNext()) { first.doc = NO_MORE_DOCS; } return first.doc; } // next without initial increment private boolean doNext() throws IOException { while (more) { while (more && first.doc < last.doc) { // find doc w/ all the terms more = first.skipTo(last.doc); // skip first upto last firstToLast(); // and move it to the end } if (more) { // found a doc with all of the terms freq = phraseFreq(); // check for phrase if (freq == 0.0f) // no match more = last.next(); // trigger further scanning else return true; // found a match } } return false; // no more matches } public float score() throws IOException { //System.out.println("scoring " + first.doc); float raw = getSimilarity().tf(freq) * value; // raw score return norms == null ? raw : raw * Similarity.decodeNorm(norms[first.doc]); // normalize } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { firstTime = false; for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { more = pp.skipTo(target); } if (more) { sort(); // re-sort } if (!doNext()) { first.doc = NO_MORE_DOCS; } return first.doc; } /** * For a document containing all the phrase query terms, compute the * frequency of the phrase in that document. * A non zero frequency means a match. *
    Note, that containing all phrase terms does not guarantee a match - they have to be found in matching locations. * @return frequency of the phrase in current doc, 0 if not found. */ protected abstract float phraseFreq() throws IOException; private void init() throws IOException { for (PhrasePositions pp = first; more && pp != null; pp = pp.next) { more = pp.next(); } if (more) { sort(); } } private void sort() { pq.clear(); for (PhrasePositions pp = first; pp != null; pp = pp.next) { pq.add(pp); } pqToList(); } protected final void pqToList() { last = first = null; while (pq.top() != null) { PhrasePositions pp = (PhrasePositions) pq.pop(); if (last != null) { // add next to end of list last.next = pp; } else first = pp; last = pp; pp.next = null; } } protected final void firstToLast() { last.next = first; // move first to end of list last = first; first = first.next; last.next = null; } public Explanation explain(final int doc) throws IOException { Explanation tfExplanation = new Explanation(); int d = advance(doc); float phraseFreq = (d == doc) ? freq : 0.0f; tfExplanation.setValue(getSimilarity().tf(phraseFreq)); tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")"); return tfExplanation; } public String toString() { return "scorer(" + weight + ")"; } } lucene-2.9.4/src/java/org/apache/lucene/search/DefaultSimilarity.java0000644000175000017500000000626711474320225026224 0ustar janpascaljanpascalpackage org.apache.lucene.search; import org.apache.lucene.index.FieldInvertState; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Expert: Default scoring implementation. */ public class DefaultSimilarity extends Similarity { /** Implemented as * state.getBoost()*lengthNorm(numTerms), where * numTerms is {@link FieldInvertState#getLength()} if {@link * #setDiscountOverlaps} is false, else it's {@link * FieldInvertState#getLength()} - {@link * FieldInvertState#getNumOverlap()}. * *

    WARNING: This API is new and experimental, and may suddenly * change.

    */ public float computeNorm(String field, FieldInvertState state) { final int numTerms; if (discountOverlaps) numTerms = state.getLength() - state.getNumOverlap(); else numTerms = state.getLength(); return (float) (state.getBoost() * lengthNorm(field, numTerms)); } /** Implemented as 1/sqrt(numTerms). */ public float lengthNorm(String fieldName, int numTerms) { return (float)(1.0 / Math.sqrt(numTerms)); } /** Implemented as 1/sqrt(sumOfSquaredWeights). */ public float queryNorm(float sumOfSquaredWeights) { return (float)(1.0 / Math.sqrt(sumOfSquaredWeights)); } /** Implemented as sqrt(freq). */ public float tf(float freq) { return (float)Math.sqrt(freq); } /** Implemented as 1 / (distance + 1). */ public float sloppyFreq(int distance) { return 1.0f / (distance + 1); } /** Implemented as log(numDocs/(docFreq+1)) + 1. */ public float idf(int docFreq, int numDocs) { return (float)(Math.log(numDocs/(double)(docFreq+1)) + 1.0); } /** Implemented as overlap / maxOverlap. */ public float coord(int overlap, int maxOverlap) { return overlap / (float)maxOverlap; } // Default false protected boolean discountOverlaps; /** Determines whether overlap tokens (Tokens with * 0 position increment) are ignored when computing * norm. By default this is false, meaning overlap * tokens are counted just like non-overlap tokens. * *

    WARNING: This API is new and experimental, and may suddenly * change.
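    For example, overlap tokens can be discounted during indexing by configuring the similarity on the writer (a sketch; the writer variable is assumed to be an existing IndexWriter):
     *   DefaultSimilarity sim = new DefaultSimilarity();
     *   sim.setDiscountOverlaps(true);
     *   writer.setSimilarity(sim);
     * 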

    * * @see #computeNorm */ public void setDiscountOverlaps(boolean v) { discountOverlaps = v; } /** @see #setDiscountOverlaps */ public boolean getDiscountOverlaps() { return discountOverlaps; } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldValueHitQueue.java0000644000175000017500000001672011474320224026255 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.util.PriorityQueue; /** * Expert: A hit queue for sorting by hits by terms in more than one field. * Uses FieldCache.DEFAULT for maintaining * internal term lookup tables. * * This class will not resolve SortField.AUTO types, and expects the type * of all SortFields used for construction to already have been resolved. * {@link SortField#detectFieldType(IndexReader, String)} is a utility method which * may be used for field type detection. * * NOTE: This API is experimental and might change in * incompatible ways in the next release. * * @since 2.9 * @version $Id: * @see Searcher#search(Query,Filter,int,Sort) * @see FieldCache */ public abstract class FieldValueHitQueue extends PriorityQueue { final static class Entry { int slot; int docID; float score; Entry(int slot, int docID, float score) { this.slot = slot; this.docID = docID; this.score = score; } public String toString() { return "slot:" + slot + " docID:" + docID + " score=" + score; } } /** * An implementation of {@link FieldValueHitQueue} which is optimized in case * there is just one comparator. */ private static final class OneComparatorFieldValueHitQueue extends FieldValueHitQueue { private final FieldComparator comparator; private final int oneReverseMul; public OneComparatorFieldValueHitQueue(SortField[] fields, int size) throws IOException { super(fields); if (fields.length == 0) { throw new IllegalArgumentException("Sort must contain at least one field"); } SortField field = fields[0]; // AUTO is resolved before we are called assert field.getType() != SortField.AUTO; comparator = field.getComparator(size, 0); oneReverseMul = field.reverse ? -1 : 1; comparators[0] = comparator; reverseMul[0] = oneReverseMul; initialize(size); } /** * Returns whether a is less relevant than b. * @param a ScoreDoc * @param b ScoreDoc * @return true if document a should be sorted after document b. 
*/ protected boolean lessThan(final Object a, final Object b) { final Entry hitA = (Entry) a; final Entry hitB = (Entry) b; assert hitA != hitB; assert hitA.slot != hitB.slot; final int c = oneReverseMul * comparator.compare(hitA.slot, hitB.slot); if (c != 0) { return c > 0; } // avoid random sort order that could lead to duplicates (bug #31241): return hitA.docID > hitB.docID; } } /** * An implementation of {@link FieldValueHitQueue} which is optimized in case * there is more than one comparator. */ private static final class MultiComparatorsFieldValueHitQueue extends FieldValueHitQueue { public MultiComparatorsFieldValueHitQueue(SortField[] fields, int size) throws IOException { super(fields); int numComparators = comparators.length; for (int i = 0; i < numComparators; ++i) { SortField field = fields[i]; // AUTO is resolved before we are called assert field.getType() != SortField.AUTO; reverseMul[i] = field.reverse ? -1 : 1; comparators[i] = field.getComparator(size, i); } initialize(size); } protected boolean lessThan(final Object a, final Object b) { final Entry hitA = (Entry) a; final Entry hitB = (Entry) b; assert hitA != hitB; assert hitA.slot != hitB.slot; int numComparators = comparators.length; for (int i = 0; i < numComparators; ++i) { final int c = reverseMul[i] * comparators[i].compare(hitA.slot, hitB.slot); if (c != 0) { // Short circuit return c > 0; } } // avoid random sort order that could lead to duplicates (bug #31241): return hitA.docID > hitB.docID; } } // prevent instantiation and extension. private FieldValueHitQueue(SortField[] fields) { // When we get here, fields.length is guaranteed to be > 0, therefore no // need to check it again. // All these are required by this class's API - need to return arrays. // Therefore even in the case of a single comparator, create an array // anyway. this.fields = fields; int numComparators = fields.length; comparators = new FieldComparator[numComparators]; reverseMul = new int[numComparators]; } /** * Creates a hit queue sorted by the given list of fields. * *

    NOTE: The instances returned by this method * pre-allocate a full array of length numHits. * * @param fields * SortField array we are sorting by in priority order (highest * priority first); cannot be null or empty * @param size * The number of hits to retain. Must be greater than zero. * @throws IOException */ public static FieldValueHitQueue create(SortField[] fields, int size) throws IOException { if (fields.length == 0) { throw new IllegalArgumentException("Sort must contain at least one field"); } if (fields.length == 1) { return new OneComparatorFieldValueHitQueue(fields, size); } else { return new MultiComparatorsFieldValueHitQueue(fields, size); } } FieldComparator[] getComparators() { return comparators; } int[] getReverseMul() { return reverseMul; } /** Stores the sort criteria being used. */ protected final SortField[] fields; protected final FieldComparator[] comparators; protected final int[] reverseMul; protected abstract boolean lessThan (final Object a, final Object b); /** * Given a queue Entry, creates a corresponding FieldDoc * that contains the values used to sort the given document. * These values are not the raw values out of the index, but the internal * representation of them. This is so the given search hit can be collated by * a MultiSearcher with other search hits. * * @param entry The Entry used to create a FieldDoc * @return The newly created FieldDoc * @see Searchable#search(Weight,Filter,int,Sort) */ FieldDoc fillFields(final Entry entry) { final int n = comparators.length; final Comparable[] fields = new Comparable[n]; for (int i = 0; i < n; ++i) { fields[i] = comparators[i].value(entry.slot); } //if (maxscore > 1.0f) doc.score /= maxscore; // normalize scores return new FieldDoc(entry.docID, entry.score, fields); } /** Returns the SortFields being used by this hit queue. */ SortField[] getFields() { return fields; } } lucene-2.9.4/src/java/org/apache/lucene/search/FilteredTermEnum.java0000644000175000017500000000676011474320224026001 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; /** Abstract class for enumerating a subset of all terms.

    Term enumerations are always ordered by Term.compareTo(). Each term in the enumeration is greater than all that precede it. */ public abstract class FilteredTermEnum extends TermEnum { /** the current term */ protected Term currentTerm = null; /** the delegate enum - to set this member use {@link #setEnum} */ protected TermEnum actualEnum = null; public FilteredTermEnum() {} /** Equality compare on the term */ protected abstract boolean termCompare(Term term); /** Equality measure on the term */ public abstract float difference(); /** Indicates the end of the enumeration has been reached */ protected abstract boolean endEnum(); /** * use this method to set the actual TermEnum (e.g. in ctor), * it will be automatically positioned on the first matching term. */ protected void setEnum(TermEnum actualEnum) throws IOException { this.actualEnum = actualEnum; // Find the first term that matches Term term = actualEnum.term(); if (term != null && termCompare(term)) currentTerm = term; else next(); } /** * Returns the docFreq of the current Term in the enumeration. * Returns -1 if no Term matches or all terms have been enumerated. */ public int docFreq() { if (currentTerm == null) return -1; assert actualEnum != null; return actualEnum.docFreq(); } /** Increments the enumeration to the next element. True if one exists. */ public boolean next() throws IOException { if (actualEnum == null) return false; // the actual enumerator is not initialized! currentTerm = null; while (currentTerm == null) { if (endEnum()) return false; if (actualEnum.next()) { Term term = actualEnum.term(); if (termCompare(term)) { currentTerm = term; return true; } } else return false; } currentTerm = null; return false; } /** Returns the current Term in the enumeration. * Returns null if no Term matches or all terms have been enumerated. */ public Term term() { return currentTerm; } /** Closes the enumeration to further activity, freeing resources. */ public void close() throws IOException { if (actualEnum != null) actualEnum.close(); currentTerm = null; actualEnum = null; } } lucene-2.9.4/src/java/org/apache/lucene/search/ExtendedFieldCache.java0000644000175000017500000000400411474320224026203 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; /** * This interface is obsolete, use {@link FieldCache} instead. 
* * @deprecated Use {@link FieldCache}, this will be removed in Lucene 3.0 **/ public interface ExtendedFieldCache extends FieldCache { /** @deprecated Use {@link FieldCache#DEFAULT}; this will be removed in Lucene 3.0 */ public static ExtendedFieldCache EXT_DEFAULT = (ExtendedFieldCache) FieldCache.DEFAULT; /** @deprecated Use {@link FieldCache.LongParser}, this will be removed in Lucene 3.0 */ public interface LongParser extends FieldCache.LongParser { } /** @deprecated Use {@link FieldCache.DoubleParser}, this will be removed in Lucene 3.0 */ public interface DoubleParser extends FieldCache.DoubleParser { } /** @deprecated Will be removed in 3.0, this is for binary compatibility only */ public long[] getLongs(IndexReader reader, String field, ExtendedFieldCache.LongParser parser) throws IOException; /** @deprecated Will be removed in 3.0, this is for binary compatibility only */ public double[] getDoubles(IndexReader reader, String field, ExtendedFieldCache.DoubleParser parser) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/search/PhraseQuery.java0000644000175000017500000002440711474320224025034 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Set; import java.util.ArrayList; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermPositions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.ToStringUtils; /** A Query that matches documents containing a particular sequence of terms. * A PhraseQuery is built by QueryParser for input like "new york". * *

    This query may be combined with other terms or queries with a {@link BooleanQuery}. */ public class PhraseQuery extends Query { private String field; private ArrayList terms = new ArrayList(4); private ArrayList positions = new ArrayList(4); private int maxPosition = 0; private int slop = 0; /** Constructs an empty phrase query. */ public PhraseQuery() {} /** Sets the number of other words permitted between words in query phrase. If zero, then this is an exact phrase search. For larger values this works like a WITHIN or NEAR operator.

    The slop is in fact an edit-distance, where the units correspond to moves of terms in the query phrase out of position. For example, to switch the order of two words requires two moves (the first move places the words atop one another), so to permit re-orderings of phrases, the slop must be at least two.

    More exact matches are scored higher than sloppier matches, thus search results are sorted by exactness.
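    For example, a phrase query for "new york" that also accepts the two terms up to two positions apart could be built as follows (a sketch; the "contents" field name is only an illustration):
     *   PhraseQuery pq = new PhraseQuery();
     *   pq.add(new Term("contents", "new"));
     *   pq.add(new Term("contents", "york"));
     *   pq.setSlop(2);
     * 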

    The slop is zero by default, requiring exact matches.*/ public void setSlop(int s) { slop = s; } /** Returns the slop. See setSlop(). */ public int getSlop() { return slop; } /** * Adds a term to the end of the query phrase. * The relative position of the term is the one immediately after the last term added. */ public void add(Term term) { int position = 0; if(positions.size() > 0) position = ((Integer) positions.get(positions.size()-1)).intValue() + 1; add(term, position); } /** * Adds a term to the end of the query phrase. * The relative position of the term within the phrase is specified explicitly. * This allows e.g. phrases with more than one term at the same position * or phrases with gaps (e.g. in connection with stopwords). * * @param term * @param position */ public void add(Term term, int position) { if (terms.size() == 0) field = term.field(); else if (term.field() != field) throw new IllegalArgumentException("All phrase terms must be in the same field: " + term); terms.add(term); positions.add(new Integer(position)); if (position > maxPosition) maxPosition = position; } /** Returns the set of terms in this phrase. */ public Term[] getTerms() { return (Term[])terms.toArray(new Term[0]); } /** * Returns the relative positions of terms in this phrase. */ public int[] getPositions() { int[] result = new int[positions.size()]; for(int i = 0; i < positions.size(); i++) result[i] = ((Integer) positions.get(i)).intValue(); return result; } private class PhraseWeight extends Weight { private Similarity similarity; private float value; private float idf; private float queryNorm; private float queryWeight; private IDFExplanation idfExp; public PhraseWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); idfExp = similarity.idfExplain(terms, searcher); idf = idfExp.getIdf(); } public String toString() { return "weight(" + PhraseQuery.this + ")"; } public Query getQuery() { return PhraseQuery.this; } public float getValue() { return value; } public float sumOfSquaredWeights() { queryWeight = idf * getBoost(); // compute query weight return queryWeight * queryWeight; // square it } public void normalize(float queryNorm) { this.queryNorm = queryNorm; queryWeight *= queryNorm; // normalize query weight value = queryWeight * idf; // idf for document } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { if (terms.size() == 0) // optimize zero-term case return null; TermPositions[] tps = new TermPositions[terms.size()]; for (int i = 0; i < terms.size(); i++) { TermPositions p = reader.termPositions((Term)terms.get(i)); if (p == null) return null; tps[i] = p; } if (slop == 0) // optimize exact case return new ExactPhraseScorer(this, tps, getPositions(), similarity, reader.norms(field)); else return new SloppyPhraseScorer(this, tps, getPositions(), similarity, slop, reader.norms(field)); } public Explanation explain(IndexReader reader, int doc) throws IOException { Explanation result = new Explanation(); result.setDescription("weight("+getQuery()+" in "+doc+"), product of:"); StringBuffer docFreqs = new StringBuffer(); StringBuffer query = new StringBuffer(); query.append('\"'); docFreqs.append(idfExp.explain()); for (int i = 0; i < terms.size(); i++) { if (i != 0) { query.append(" "); } Term term = (Term)terms.get(i); query.append(term.text()); } query.append('\"'); Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + docFreqs + ")"); // explain query weight Explanation queryExpl = new 
Explanation(); queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:"); Explanation boostExpl = new Explanation(getBoost(), "boost"); if (getBoost() != 1.0f) queryExpl.addDetail(boostExpl); queryExpl.addDetail(idfExpl); Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm"); queryExpl.addDetail(queryNormExpl); queryExpl.setValue(boostExpl.getValue() * idfExpl.getValue() * queryNormExpl.getValue()); result.addDetail(queryExpl); // explain field weight Explanation fieldExpl = new Explanation(); fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+ "), product of:"); Scorer scorer = scorer(reader, true, false); if (scorer == null) { return new Explanation(0.0f, "no matching docs"); } Explanation tfExpl = scorer.explain(doc); fieldExpl.addDetail(tfExpl); fieldExpl.addDetail(idfExpl); Explanation fieldNormExpl = new Explanation(); byte[] fieldNorms = reader.norms(field); float fieldNorm = fieldNorms!=null ? Similarity.decodeNorm(fieldNorms[doc]) : 1.0f; fieldNormExpl.setValue(fieldNorm); fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")"); fieldExpl.addDetail(fieldNormExpl); fieldExpl.setValue(tfExpl.getValue() * idfExpl.getValue() * fieldNormExpl.getValue()); result.addDetail(fieldExpl); // combine them result.setValue(queryExpl.getValue() * fieldExpl.getValue()); if (queryExpl.getValue() == 1.0f) return fieldExpl; return result; } } public Weight createWeight(Searcher searcher) throws IOException { if (terms.size() == 1) { // optimize one-term case Term term = (Term)terms.get(0); Query termQuery = new TermQuery(term); termQuery.setBoost(getBoost()); return termQuery.createWeight(searcher); } return new PhraseWeight(searcher); } /** * @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ public void extractTerms(Set queryTerms) { queryTerms.addAll(terms); } /** Prints a user-readable version of this query. */ public String toString(String f) { StringBuffer buffer = new StringBuffer(); if (field != null && !field.equals(f)) { buffer.append(field); buffer.append(":"); } buffer.append("\""); String[] pieces = new String[maxPosition + 1]; for (int i = 0; i < terms.size(); i++) { int pos = ((Integer)positions.get(i)).intValue(); String s = pieces[pos]; if (s == null) { s = ((Term)terms.get(i)).text(); } else { s = s + "|" + ((Term)terms.get(i)).text(); } pieces[pos] = s; } for (int i = 0; i < pieces.length; i++) { if (i > 0) { buffer.append(' '); } String s = pieces[i]; if (s == null) { buffer.append('?'); } else { buffer.append(s); } } buffer.append("\""); if (slop != 0) { buffer.append("~"); buffer.append(slop); } buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** Returns true iff o is equal to this. */ public boolean equals(Object o) { if (!(o instanceof PhraseQuery)) return false; PhraseQuery other = (PhraseQuery)o; return (this.getBoost() == other.getBoost()) && (this.slop == other.slop) && this.terms.equals(other.terms) && this.positions.equals(other.positions); } /** Returns a hash code value for this object.*/ public int hashCode() { return Float.floatToIntBits(getBoost()) ^ slop ^ terms.hashCode() ^ positions.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/search/ConstantScoreQuery.java0000644000175000017500000001324511474320224026375 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; import java.util.Set; /** * A query that wraps a filter and simply returns a constant score equal to the * query boost for every document in the filter. * * * @version $Id: ConstantScoreQuery.java 807180 2009-08-24 12:26:43Z markrmiller $ */ public class ConstantScoreQuery extends Query { protected final Filter filter; public ConstantScoreQuery(Filter filter) { this.filter=filter; } /** Returns the encapsulated filter */ public Filter getFilter() { return filter; } public Query rewrite(IndexReader reader) throws IOException { return this; } public void extractTerms(Set terms) { // OK to not add any terms when used for MultiSearcher, // but may not be OK for highlighting } protected class ConstantWeight extends Weight { private Similarity similarity; private float queryNorm; private float queryWeight; public ConstantWeight(Searcher searcher) { this.similarity = getSimilarity(searcher); } public Query getQuery() { return ConstantScoreQuery.this; } public float getValue() { return queryWeight; } public float sumOfSquaredWeights() throws IOException { queryWeight = getBoost(); return queryWeight * queryWeight; } public void normalize(float norm) { this.queryNorm = norm; queryWeight *= this.queryNorm; } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { return new ConstantScorer(similarity, reader, this); } public Explanation explain(IndexReader reader, int doc) throws IOException { ConstantScorer cs = new ConstantScorer(similarity, reader, this); boolean exists = cs.docIdSetIterator.advance(doc) == doc; ComplexExplanation result = new ComplexExplanation(); if (exists) { result.setDescription("ConstantScoreQuery(" + filter + "), product of:"); result.setValue(queryWeight); result.setMatch(Boolean.TRUE); result.addDetail(new Explanation(getBoost(), "boost")); result.addDetail(new Explanation(queryNorm,"queryNorm")); } else { result.setDescription("ConstantScoreQuery(" + filter + ") doesn't match id " + doc); result.setValue(0); result.setMatch(Boolean.FALSE); } return result; } } protected class ConstantScorer extends Scorer { final DocIdSetIterator docIdSetIterator; final float theScore; int doc = -1; public ConstantScorer(Similarity similarity, IndexReader reader, Weight w) throws IOException { super(similarity); theScore = w.getValue(); DocIdSet docIdSet = filter.getDocIdSet(reader); if (docIdSet == null) { docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator(); } else { DocIdSetIterator iter = docIdSet.iterator(); if (iter == null) { docIdSetIterator = DocIdSet.EMPTY_DOCIDSET.iterator(); } else { docIdSetIterator = iter; } } } /** @deprecated use {@link #nextDoc()} instead. 
*/ public boolean next() throws IOException { return docIdSetIterator.nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { return docIdSetIterator.nextDoc(); } /** @deprecated use {@link #docID()} instead. */ public int doc() { return docIdSetIterator.doc(); } public int docID() { return docIdSetIterator.docID(); } public float score() throws IOException { return theScore; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return docIdSetIterator.advance(target) != NO_MORE_DOCS; } public int advance(int target) throws IOException { return docIdSetIterator.advance(target); } public Explanation explain(int doc) throws IOException { throw new UnsupportedOperationException(); } } public Weight createWeight(Searcher searcher) { return new ConstantScoreQuery.ConstantWeight(searcher); } /** Prints a user-readable version of this query. */ public String toString(String field) { return "ConstantScore(" + filter.toString() + (getBoost()==1.0 ? ")" : "^" + getBoost()); } /** Returns true if o is equal to this. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof ConstantScoreQuery)) return false; ConstantScoreQuery other = (ConstantScoreQuery)o; return this.getBoost()==other.getBoost() && filter.equals(other.filter); } /** Returns a hash code value for this object. */ public int hashCode() { // Simple add is OK since no existing filter hashcode has a float component. return filter.hashCode() + Float.floatToIntBits(getBoost()); } } lucene-2.9.4/src/java/org/apache/lucene/search/NumericRangeQuery.java0000644000175000017500000006026111474320224026167 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.LinkedList; import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.util.NumericUtils; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.StringHelper; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; /** *

    A {@link Query} that matches numeric values within a * specified range. To use this, you must first index the * numeric values using {@link NumericField} (expert: {@link * NumericTokenStream}). If your terms are instead textual, * you should use {@link TermRangeQuery}. {@link * NumericRangeFilter} is the filter equivalent of this * query.

    * *

    You create a new NumericRangeQuery with the static * factory methods, eg: * *

     * Query q = NumericRangeQuery.newFloatRange("weight",
     *                                           new Float(0.03f), new Float(0.10f),
     *                                           true, true);
     * 
    * * matches all documents whose float valued "weight" field * ranges from 0.03 to 0.10, inclusive. * *

    The performance of NumericRangeQuery is much better * than the corresponding {@link TermRangeQuery} because the * number of terms that must be searched is usually far * fewer, thanks to trie indexing, described below.

    * *

    You can optionally specify a precisionStep * when creating this query. This is necessary if you've * changed this configuration from its default (4) during * indexing. Lower values consume more disk space but speed * up searching. Suitable values are between 1 and * 8. A good starting point to test is 4, * which is the default value for all Numeric* * classes. See below for * details. * *

    This query defaults to {@linkplain * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} for * 32 bit (int/float) ranges with precisionStep ≤8 and 64 * bit (long/double) ranges with precisionStep ≤6. * Otherwise it uses {@linkplain * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} as the * number of terms is likely to be high. With precision * steps of ≤4, this query can be run with one of the * BooleanQuery rewrite methods without changing * BooleanQuery's default max clause count. * *
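    If a different rewrite behavior is wanted, it can be set explicitly, since NumericRangeQuery extends {@link MultiTermQuery} (a sketch; the "price" field and its bounds are assumptions for illustration):
     *   NumericRangeQuery q = NumericRangeQuery.newIntRange("price",
     *                                                       new Integer(10), new Integer(20),
     *                                                       true, true);
     *   q.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
     * 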

    NOTE: This API is experimental and * might change in incompatible ways in the next release. * *

    How it works

    * *

    See the publication about panFMP, * where this algorithm was described (referred to as TrieRangeQuery): * *

    Schindler, U, Diepenbroek, M, 2008. * Generic XML-based Framework for Metadata Portals. * Computers & Geosciences 34 (12), 1947-1955. * doi:10.1016/j.cageo.2008.02.023
    * *

    A quote from this paper: Because Apache Lucene is a full-text * search engine and not a conventional database, it cannot handle numerical ranges * (e.g., field value is inside user defined bounds, even dates are numerical values). * We have developed an extension to Apache Lucene that stores * the numerical values in a special string-encoded format with variable precision * (all numerical values like doubles, longs, floats, and ints are converted to * lexicographic sortable string representations and stored with different precisions * (for a more detailed description of how the values are stored, * see {@link NumericUtils}). A range is then divided recursively into multiple intervals for searching: * The center of the range is searched only with the lowest possible precision in the trie, * while the boundaries are matched more exactly. This reduces the number of terms dramatically.

    * *

    For the variant that stores long values in 8 different precisions (each reduced by 8 bits) that * uses a lowest precision of 1 byte, the index contains only a maximum of 256 distinct values in the * lowest precision. Overall, a range could consist of a theoretical maximum of * 7*255*2 + 255 = 3825 distinct terms (when there is a term for every distinct value of an * 8-byte-number in the index and the range covers almost all of them; a maximum of 255 distinct values is used * because it would always be possible to reduce the full 256 values to one term with degraded precision). * In practice, we have seen up to 300 terms in most cases (index with 500,000 metadata records * and a uniform value distribution).

    * *

    Precision Step

    *

    You can choose any precisionStep when encoding values. * Lower step values mean more precision levels and so more terms in the index (and the index gets larger). * On the other hand, the maximum number of terms to match is reduced, which speeds up queries. * The formula to calculate the maximum term count is: *

     *  n = [ (bitsPerValue/precisionStep - 1) * (2^precisionStep - 1 ) * 2 ] + (2^precisionStep - 1 )
     * 
    *

    (This formula is only correct when bitsPerValue/precisionStep is an integer; * in other cases, the value must be rounded up and the last summand must use the remainder of the division as its * precision step.) * For longs stored using a precision step of 4, n = 15*15*2 + 15 = 465, and for a precision * step of 2, n = 31*3*2 + 3 = 189. However, the gain in search speed is partly offset by additional seeking * in the term enum of the index. Because of this, the ideal precisionStep value can only * be found by testing. Important: You can index with a lower precision step value and test search speed * using a multiple of the original step value.

    * *

    Good values for precisionStep depend on usage and data type: *

      *
    • The default for all data types is 4, which is used when no precisionStep is given. *
    • The ideal value in most cases for 64-bit data types (long, double) is 6 or 8. *
    • The ideal value in most cases for 32-bit data types (int, float) is 4. *
    • For low-cardinality fields, larger precision steps are good. If the cardinality is < 100, it is * fair to use {@link Integer#MAX_VALUE} (see below). *
    • Steps ≥64 for long/double and ≥32 for int/float produce one token * per value in the index, and querying is as slow as with a conventional {@link TermRangeQuery}. But this can be used * to produce fields that are solely used for sorting (in this case simply use {@link Integer#MAX_VALUE} as the * precisionStep; see the sketch after this list). Using {@link NumericField NumericFields} for sorting * is ideal, because building the field cache is much faster than with text-only numbers. * These fields have one term per value and therefore also work with term enumeration for building distinct lists * (e.g. facets / preselected values to search for). * Sorting is also possible with range-query-optimized fields using one of the above precisionSteps. *
    * *

    Comparisons of the different types of RangeQueries on an index with about 500,000 docs showed * that {@link TermRangeQuery} in boolean rewrite mode (with raised {@link BooleanQuery} clause count) * took about 30-40 secs to complete, {@link TermRangeQuery} in constant score filter rewrite mode took 5 secs * and executing this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit * precision step). This query type was developed for a geographic portal, where the performance for * e.g. bounding boxes or exact date/time stamps is important.
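    A small sketch of the half-open ranges described in the factory-method javadocs below; the field name and cutoff value are assumptions for illustration:

    // Half-open range: match documents whose "timestamp" is strictly greater than
    // 'cutoff'; passing null for a bound leaves that end of the range open.
    long cutoff = 1262304000000L;                     // illustrative epoch millis
    Query newerThanCutoff = NumericRangeQuery.newLongRange(
        "timestamp", new Long(cutoff), null, false, true);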

    * * @since 2.9 **/ public final class NumericRangeQuery extends MultiTermQuery { private NumericRangeQuery(final String field, final int precisionStep, final int valSize, Number min, Number max, final boolean minInclusive, final boolean maxInclusive ) { assert (valSize == 32 || valSize == 64); if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); this.field = StringHelper.intern(field); this.precisionStep = precisionStep; this.valSize = valSize; this.min = min; this.max = max; this.minInclusive = minInclusive; this.maxInclusive = maxInclusive; // For bigger precisionSteps this query likely // hits too many terms, so set to CONSTANT_SCORE_FILTER right off // (especially as the FilteredTermEnum is costly if wasted only for AUTO tests because it // creates new enums from IndexReader for each sub-range) switch (valSize) { case 64: setRewriteMethod( (precisionStep > 6) ? CONSTANT_SCORE_FILTER_REWRITE : CONSTANT_SCORE_AUTO_REWRITE_DEFAULT ); break; case 32: setRewriteMethod( (precisionStep > 8) ? CONSTANT_SCORE_FILTER_REWRITE : CONSTANT_SCORE_AUTO_REWRITE_DEFAULT ); break; default: // should never happen throw new IllegalArgumentException("valSize must be 32 or 64"); } // shortcut if upper bound == lower bound if (min != null && min.equals(max)) { setRewriteMethod(CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); } } /** * Factory that creates a NumericRangeQuery, that queries a long * range using the given
    precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newLongRange(final String field, final int precisionStep, Long min, Long max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a long * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newLongRange(final String field, Long min, Long max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 64, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a int * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newIntRange(final String field, final int precisionStep, Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a int * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newIntRange(final String field, Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a double * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newDoubleRange(final String field, final int precisionStep, Double min, Double max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, precisionStep, 64, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a double * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. 
*/ public static NumericRangeQuery newDoubleRange(final String field, Double min, Double max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 64, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a float * range using the given precisionStep. * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newFloatRange(final String field, final int precisionStep, Float min, Float max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, precisionStep, 32, min, max, minInclusive, maxInclusive); } /** * Factory that creates a NumericRangeQuery, that queries a float * range using the default precisionStep {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). * You can have half-open ranges (which are in fact </≤ or >/≥ queries) * by setting the min or max value to null. By setting inclusive to false, it will * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too. */ public static NumericRangeQuery newFloatRange(final String field, Float min, Float max, final boolean minInclusive, final boolean maxInclusive ) { return new NumericRangeQuery(field, NumericUtils.PRECISION_STEP_DEFAULT, 32, min, max, minInclusive, maxInclusive); } //@Override protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException { return new NumericRangeTermEnum(reader); } /** Returns the field name for this query */ public String getField() { return field; } /** Returns true if the lower endpoint is inclusive */ public boolean includesMin() { return minInclusive; } /** Returns true if the upper endpoint is inclusive */ public boolean includesMax() { return maxInclusive; } /** Returns the lower value of this range query */ public Number getMin() { return min; } /** Returns the upper value of this range query */ public Number getMax() { return max; } //@Override public String toString(final String field) { final StringBuffer sb = new StringBuffer(); if (!this.field.equals(field)) sb.append(this.field).append(':'); return sb.append(minInclusive ? '[' : '{') .append((min == null) ? "*" : min.toString()) .append(" TO ") .append((max == null) ? "*" : max.toString()) .append(maxInclusive ? ']' : '}') .append(ToStringUtils.boost(getBoost())) .toString(); } //@Override public final boolean equals(final Object o) { if (o==this) return true; if (!super.equals(o)) return false; if (o instanceof NumericRangeQuery) { final NumericRangeQuery q=(NumericRangeQuery)o; return ( field==q.field && (q.min == null ? min == null : q.min.equals(min)) && (q.max == null ? 
max == null : q.max.equals(max)) && minInclusive == q.minInclusive && maxInclusive == q.maxInclusive && precisionStep == q.precisionStep ); } return false; } //@Override public final int hashCode() { int hash = super.hashCode(); hash += field.hashCode()^0x4565fd66 + precisionStep^0x64365465; if (min != null) hash += min.hashCode()^0x14fa55fb; if (max != null) hash += max.hashCode()^0x733fa5fe; return hash + (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+ (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe); } // field must be interned after reading from stream private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { in.defaultReadObject(); field = StringHelper.intern(field); } // members (package private, to be also fast accessible by NumericRangeTermEnum) String field; final int precisionStep, valSize; final Number min, max; final boolean minInclusive,maxInclusive; /** * Subclass of FilteredTermEnum for enumerating all terms that match the * sub-ranges for trie range queries. *

    * WARNING: This term enumeration is not guaranteed to be always ordered by * {@link Term#compareTo}. * The ordering depends on how {@link NumericUtils#splitLongRange} and * {@link NumericUtils#splitIntRange} generates the sub-ranges. For * {@link MultiTermQuery} ordering is not relevant. */ private final class NumericRangeTermEnum extends FilteredTermEnum { private final IndexReader reader; private final LinkedList/**/ rangeBounds = new LinkedList/**/(); private String currentUpperBound = null; NumericRangeTermEnum(final IndexReader reader) throws IOException { this.reader = reader; switch (valSize) { case 64: { // lower long minBound = Long.MIN_VALUE; if (min instanceof Long) { minBound = min.longValue(); } else if (min instanceof Double) { minBound = NumericUtils.doubleToSortableLong(min.doubleValue()); } if (!minInclusive && min != null) { if (minBound == Long.MAX_VALUE) break; minBound++; } // upper long maxBound = Long.MAX_VALUE; if (max instanceof Long) { maxBound = max.longValue(); } else if (max instanceof Double) { maxBound = NumericUtils.doubleToSortableLong(max.doubleValue()); } if (!maxInclusive && max != null) { if (maxBound == Long.MIN_VALUE) break; maxBound--; } NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() { //@Override public final void addRange(String minPrefixCoded, String maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } }, precisionStep, minBound, maxBound); break; } case 32: { // lower int minBound = Integer.MIN_VALUE; if (min instanceof Integer) { minBound = min.intValue(); } else if (min instanceof Float) { minBound = NumericUtils.floatToSortableInt(min.floatValue()); } if (!minInclusive && min != null) { if (minBound == Integer.MAX_VALUE) break; minBound++; } // upper int maxBound = Integer.MAX_VALUE; if (max instanceof Integer) { maxBound = max.intValue(); } else if (max instanceof Float) { maxBound = NumericUtils.floatToSortableInt(max.floatValue()); } if (!maxInclusive && max != null) { if (maxBound == Integer.MIN_VALUE) break; maxBound--; } NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() { //@Override public final void addRange(String minPrefixCoded, String maxPrefixCoded) { rangeBounds.add(minPrefixCoded); rangeBounds.add(maxPrefixCoded); } }, precisionStep, minBound, maxBound); break; } default: // should never happen throw new IllegalArgumentException("valSize must be 32 or 64"); } // seek to first term next(); } //@Override public float difference() { return 1.0f; } /** this is a dummy, it is not used by this class. */ //@Override protected boolean endEnum() { assert false; // should never be called return (currentTerm != null); } /** * Compares if current upper bound is reached, * this also updates the term count for statistics. * In contrast to {@link FilteredTermEnum}, a return value * of false ends iterating the current enum * and forwards to the next sub-range. */ //@Override protected boolean termCompare(Term term) { return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0); } /** Increments the enumeration to the next element. True if one exists. 
*/ //@Override public boolean next() throws IOException { // if a current term exists, the actual enum is initialized: // try change to next term, if no such term exists, fall-through if (currentTerm != null) { assert actualEnum!=null; if (actualEnum.next()) { currentTerm = actualEnum.term(); if (termCompare(currentTerm)) return true; } } // if all above fails, we go forward to the next enum, // if one is available currentTerm = null; if (rangeBounds.size() < 2) return false; // close the current enum and read next bounds if (actualEnum != null) { actualEnum.close(); actualEnum = null; } final String lowerBound = (String)rangeBounds.removeFirst(); this.currentUpperBound = (String)rangeBounds.removeFirst(); // this call recursively uses next(), if no valid term in // next enum found. // if this behavior is changed/modified in the superclass, // this enum will not work anymore! setEnum(reader.terms(new Term(field, lowerBound))); return (currentTerm != null); } /** Closes the enumeration to further activity, freeing resources. */ //@Override public void close() throws IOException { rangeBounds.clear(); currentUpperBound = null; super.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/search/BooleanQuery.java0000644000175000017500000004352711474320224025175 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.search.BooleanClause.Occur; import java.io.IOException; import java.util.*; /** A Query that matches documents matching boolean combinations of other * queries, e.g. {@link TermQuery}s, {@link PhraseQuery}s or other * BooleanQuerys. */ public class BooleanQuery extends Query { private static int maxClauseCount = 1024; /** Thrown when an attempt is made to add more than {@link * #getMaxClauseCount()} clauses. This typically happens if * a PrefixQuery, FuzzyQuery, WildcardQuery, or TermRangeQuery * is expanded to many terms during search. */ public static class TooManyClauses extends RuntimeException { public TooManyClauses() {} public String getMessage() { return "maxClauseCount is set to " + maxClauseCount; } } /** Return the maximum number of clauses permitted, 1024 by default. * Attempts to add more than the permitted number of clauses cause {@link * TooManyClauses} to be thrown. * @see #setMaxClauseCount(int) */ public static int getMaxClauseCount() { return maxClauseCount; } /** * Set the maximum number of clauses permitted per BooleanQuery. * Default value is 1024. 
*/ public static void setMaxClauseCount(int maxClauseCount) { if (maxClauseCount < 1) throw new IllegalArgumentException("maxClauseCount must be >= 1"); BooleanQuery.maxClauseCount = maxClauseCount; } private ArrayList clauses = new ArrayList(); private boolean disableCoord; /** Constructs an empty boolean query. */ public BooleanQuery() {} /** Constructs an empty boolean query. * * {@link Similarity#coord(int,int)} may be disabled in scoring, as * appropriate. For example, this score factor does not make sense for most * automatically generated queries, like {@link WildcardQuery} and {@link * FuzzyQuery}. * * @param disableCoord disables {@link Similarity#coord(int,int)} in scoring. */ public BooleanQuery(boolean disableCoord) { this.disableCoord = disableCoord; } /** Returns true iff {@link Similarity#coord(int,int)} is disabled in * scoring for this query instance. * @see #BooleanQuery(boolean) */ public boolean isCoordDisabled() { return disableCoord; } // Implement coord disabling. // Inherit javadoc. public Similarity getSimilarity(Searcher searcher) { Similarity result = super.getSimilarity(searcher); if (disableCoord) { // disable coord as requested result = new SimilarityDelegator(result) { public float coord(int overlap, int maxOverlap) { return 1.0f; } }; } return result; } /** * Specifies a minimum number of the optional BooleanClauses * which must be satisfied. * *

    * By default no optional clauses are necessary for a match * (unless there are no required clauses). If this method is used, * then the specified number of clauses is required. *

    *

    * Use of this method is totally independent of specifying that * any specific clauses are required (or prohibited). This number will * only be compared against the number of matching optional clauses. *

    *

    * EXPERT NOTE: Using this method may force collecting docs in order, * regardless of whether setAllowDocsOutOfOrder(true) has been called. *

    * * @param min the number of optional clauses that must match * @see #setAllowDocsOutOfOrder */ public void setMinimumNumberShouldMatch(int min) { this.minNrShouldMatch = min; } protected int minNrShouldMatch = 0; /** * Gets the minimum number of the optional BooleanClauses * which must be satisfied. */ public int getMinimumNumberShouldMatch() { return minNrShouldMatch; } /** Adds a clause to a boolean query. * * @throws TooManyClauses if the new number of clauses exceeds the maximum clause number * @see #getMaxClauseCount() */ public void add(Query query, BooleanClause.Occur occur) { add(new BooleanClause(query, occur)); } /** Adds a clause to a boolean query. * @throws TooManyClauses if the new number of clauses exceeds the maximum clause number * @see #getMaxClauseCount() */ public void add(BooleanClause clause) { if (clauses.size() >= maxClauseCount) throw new TooManyClauses(); clauses.add(clause); } /** Returns the set of clauses in this query. */ public BooleanClause[] getClauses() { return (BooleanClause[])clauses.toArray(new BooleanClause[clauses.size()]); } /** Returns the list of clauses in this query. */ public List clauses() { return clauses; } /** * Expert: the Weight for BooleanQuery, used to * normalize, score and explain these queries. * *
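    A brief sketch of combining required and optional clauses with setMinimumNumberShouldMatch; the field and term names are assumptions for illustration:

    // Require the title term, and at least two of the three optional body terms.
    BooleanQuery bq = new BooleanQuery();
    bq.add(new TermQuery(new Term("title", "lucene")), BooleanClause.Occur.MUST);
    bq.add(new TermQuery(new Term("body", "index")), BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term("body", "search")), BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term("body", "query")), BooleanClause.Occur.SHOULD);
    bq.setMinimumNumberShouldMatch(2);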

    NOTE: this API and implementation are subject to * change suddenly in the next release.

    */ protected class BooleanWeight extends Weight { /** The Similarity implementation. */ protected Similarity similarity; protected ArrayList weights; public BooleanWeight(Searcher searcher) throws IOException { this.similarity = getSimilarity(searcher); weights = new ArrayList(clauses.size()); for (int i = 0 ; i < clauses.size(); i++) { BooleanClause c = (BooleanClause)clauses.get(i); weights.add(c.getQuery().createWeight(searcher)); } } public Query getQuery() { return BooleanQuery.this; } public float getValue() { return getBoost(); } public float sumOfSquaredWeights() throws IOException { float sum = 0.0f; for (int i = 0 ; i < weights.size(); i++) { BooleanClause c = (BooleanClause)clauses.get(i); Weight w = (Weight)weights.get(i); // call sumOfSquaredWeights for all clauses in case of side effects float s = w.sumOfSquaredWeights(); // sum sub weights if (!c.isProhibited()) // only add to sum for non-prohibited clauses sum += s; } sum *= getBoost() * getBoost(); // boost each sub-weight return sum ; } public void normalize(float norm) { norm *= getBoost(); // incorporate boost for (Iterator iter = weights.iterator(); iter.hasNext();) { Weight w = (Weight) iter.next(); // normalize all clauses, (even if prohibited in case of side affects) w.normalize(norm); } } public Explanation explain(IndexReader reader, int doc) throws IOException { final int minShouldMatch = BooleanQuery.this.getMinimumNumberShouldMatch(); ComplexExplanation sumExpl = new ComplexExplanation(); sumExpl.setDescription("sum of:"); int coord = 0; int maxCoord = 0; float sum = 0.0f; boolean fail = false; int shouldMatchCount = 0; for (Iterator wIter = weights.iterator(), cIter = clauses.iterator(); wIter.hasNext();) { Weight w = (Weight) wIter.next(); BooleanClause c = (BooleanClause) cIter.next(); if (w.scorer(reader, true, true) == null) { continue; } Explanation e = w.explain(reader, doc); if (!c.isProhibited()) maxCoord++; if (e.isMatch()) { if (!c.isProhibited()) { sumExpl.addDetail(e); sum += e.getValue(); coord++; } else { Explanation r = new Explanation(0.0f, "match on prohibited clause (" + c.getQuery().toString() + ")"); r.addDetail(e); sumExpl.addDetail(r); fail = true; } if (c.getOccur() == Occur.SHOULD) shouldMatchCount++; } else if (c.isRequired()) { Explanation r = new Explanation(0.0f, "no match on required clause (" + c.getQuery().toString() + ")"); r.addDetail(e); sumExpl.addDetail(r); fail = true; } } if (fail) { sumExpl.setMatch(Boolean.FALSE); sumExpl.setValue(0.0f); sumExpl.setDescription ("Failure to meet condition(s) of required/prohibited clause(s)"); return sumExpl; } else if (shouldMatchCount < minShouldMatch) { sumExpl.setMatch(Boolean.FALSE); sumExpl.setValue(0.0f); sumExpl.setDescription("Failure to match minimum number "+ "of optional clauses: " + minShouldMatch); return sumExpl; } sumExpl.setMatch(0 < coord ? 
Boolean.TRUE : Boolean.FALSE); sumExpl.setValue(sum); float coordFactor = similarity.coord(coord, maxCoord); if (coordFactor == 1.0f) // coord is no-op return sumExpl; // eliminate wrapper else { ComplexExplanation result = new ComplexExplanation(sumExpl.isMatch(), sum*coordFactor, "product of:"); result.addDetail(sumExpl); result.addDetail(new Explanation(coordFactor, "coord("+coord+"/"+maxCoord+")")); return result; } } public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException { List required = new ArrayList(); List prohibited = new ArrayList(); List optional = new ArrayList(); for (Iterator wIter = weights.iterator(), cIter = clauses.iterator(); wIter.hasNext();) { Weight w = (Weight) wIter.next(); BooleanClause c = (BooleanClause) cIter.next(); Scorer subScorer = w.scorer(reader, true, false); if (subScorer == null) { if (c.isRequired()) { return null; } } else if (c.isRequired()) { required.add(subScorer); } else if (c.isProhibited()) { prohibited.add(subScorer); } else { optional.add(subScorer); } } // Check if we can return a BooleanScorer scoreDocsInOrder |= !allowDocsOutOfOrder; // until it is removed, factor in the static setting. if (!scoreDocsInOrder && topScorer && required.size() == 0 && prohibited.size() < 32) { return new BooleanScorer(similarity, minNrShouldMatch, optional, prohibited); } if (required.size() == 0 && optional.size() == 0) { // no required and optional clauses. return null; } else if (optional.size() < minNrShouldMatch) { // either >1 req scorer, or there are 0 req scorers and at least 1 // optional scorer. Therefore if there are not enough optional scorers // no documents will be matched by the query return null; } // Return a BooleanScorer2 return new BooleanScorer2(similarity, minNrShouldMatch, required, prohibited, optional); } public boolean scoresDocsOutOfOrder() { int numProhibited = 0; for (Iterator cIter = clauses.iterator(); cIter.hasNext();) { BooleanClause c = (BooleanClause) cIter.next(); if (c.isRequired()) { return false; // BS2 (in-order) will be used by scorer() } else if (c.isProhibited()) { ++numProhibited; } } if (numProhibited > 32) { // cannot use BS return false; } // scorer() will return an out-of-order scorer if requested. return true; } } /** * Whether hit docs may be collected out of docid order. * * @deprecated this will not be needed anymore, as * {@link Weight#scoresDocsOutOfOrder()} is used. */ private static boolean allowDocsOutOfOrder = true; /** * Expert: Indicates whether hit docs may be collected out of docid order. * *

    * Background: although the contract of the Scorer class requires that * documents be iterated in order of doc id, this was not true in early * versions of Lucene. Many pieces of functionality in the current Lucene code * base have undefined behavior if this contract is not upheld, but in some * specific simple cases may be faster. (For example: disjunction queries with * less than 32 prohibited clauses; This setting has no effect for other * queries.) *

    * *

    * Specifics: By setting this option to true, docid N might be scored for a * single segment before docid N-1. Across multiple segments, docs may be * scored out of order regardless of this setting - it only applies to scoring * a single segment. * * Being static, this setting is system wide. *

    * * @deprecated this is not needed anymore, as * {@link Weight#scoresDocsOutOfOrder()} is used. */ public static void setAllowDocsOutOfOrder(boolean allow) { allowDocsOutOfOrder = allow; } /** * Whether hit docs may be collected out of docid order. * * @see #setAllowDocsOutOfOrder(boolean) * @deprecated this is not needed anymore, as * {@link Weight#scoresDocsOutOfOrder()} is used. */ public static boolean getAllowDocsOutOfOrder() { return allowDocsOutOfOrder; } /** * @deprecated Use {@link #setAllowDocsOutOfOrder(boolean)} instead. */ public static void setUseScorer14(boolean use14) { setAllowDocsOutOfOrder(use14); } /** * @deprecated Use {@link #getAllowDocsOutOfOrder()} instead. */ public static boolean getUseScorer14() { return getAllowDocsOutOfOrder(); } public Weight createWeight(Searcher searcher) throws IOException { return new BooleanWeight(searcher); } public Query rewrite(IndexReader reader) throws IOException { if (minNrShouldMatch == 0 && clauses.size() == 1) { // optimize 1-clause queries BooleanClause c = (BooleanClause)clauses.get(0); if (!c.isProhibited()) { // just return clause Query query = c.getQuery().rewrite(reader); // rewrite first if (getBoost() != 1.0f) { // incorporate boost if (query == c.getQuery()) // if rewrite was no-op query = (Query)query.clone(); // then clone before boost query.setBoost(getBoost() * query.getBoost()); } return query; } } BooleanQuery clone = null; // recursively rewrite for (int i = 0 ; i < clauses.size(); i++) { BooleanClause c = (BooleanClause)clauses.get(i); Query query = c.getQuery().rewrite(reader); if (query != c.getQuery()) { // clause rewrote: must clone if (clone == null) clone = (BooleanQuery)this.clone(); clone.clauses.set(i, new BooleanClause(query, c.getOccur())); } } if (clone != null) { return clone; // some clauses rewrote } else return this; // no clauses rewrote } // inherit javadoc public void extractTerms(Set terms) { for (Iterator i = clauses.iterator(); i.hasNext();) { BooleanClause clause = (BooleanClause) i.next(); clause.getQuery().extractTerms(terms); } } public Object clone() { BooleanQuery clone = (BooleanQuery)super.clone(); clone.clauses = (ArrayList)this.clauses.clone(); return clone; } /** Prints a user-readable version of this query. */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); boolean needParens=(getBoost() != 1.0) || (getMinimumNumberShouldMatch()>0) ; if (needParens) { buffer.append("("); } for (int i = 0 ; i < clauses.size(); i++) { BooleanClause c = (BooleanClause)clauses.get(i); if (c.isProhibited()) buffer.append("-"); else if (c.isRequired()) buffer.append("+"); Query subQuery = c.getQuery(); if (subQuery != null) { if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens buffer.append("("); buffer.append(subQuery.toString(field)); buffer.append(")"); } else { buffer.append(subQuery.toString(field)); } } else { buffer.append("null"); } if (i != clauses.size()-1) buffer.append(" "); } if (needParens) { buffer.append(")"); } if (getMinimumNumberShouldMatch()>0) { buffer.append('~'); buffer.append(getMinimumNumberShouldMatch()); } if (getBoost() != 1.0f) { buffer.append(ToStringUtils.boost(getBoost())); } return buffer.toString(); } /** Returns true iff o is equal to this. 
*/ public boolean equals(Object o) { if (!(o instanceof BooleanQuery)) return false; BooleanQuery other = (BooleanQuery)o; return (this.getBoost() == other.getBoost()) && this.clauses.equals(other.clauses) && this.getMinimumNumberShouldMatch() == other.getMinimumNumberShouldMatch() && this.disableCoord == other.disableCoord; } /** Returns a hash code value for this object.*/ public int hashCode() { return Float.floatToIntBits(getBoost()) ^ clauses.hashCode() + getMinimumNumberShouldMatch() + (disableCoord ? 17:0); } } lucene-2.9.4/src/java/org/apache/lucene/search/DisjunctionSumScorer.java0000644000175000017500000002701011474320224026711 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.List; import java.util.Iterator; import java.io.IOException; import org.apache.lucene.util.ScorerDocQueue; /** A Scorer for OR like queries, counterpart of ConjunctionScorer. * This Scorer implements {@link Scorer#skipTo(int)} and uses skipTo() on the given Scorers. * TODO: Implement score(HitCollector, int). */ class DisjunctionSumScorer extends Scorer { /** The number of subscorers. */ private final int nrScorers; /** The subscorers. */ protected final List subScorers; /** The minimum number of scorers that should match. */ private final int minimumNrMatchers; /** The scorerDocQueue contains all subscorers ordered by their current doc(), * with the minimum at the top. *
    The scorerDocQueue is initialized the first time next() or skipTo() is called. *
    An exhausted scorer is immediately removed from the scorerDocQueue. *
    If fewer than minimumNrMatchers scorers * remain in the scorerDocQueue, next() and skipTo() return false. *

    * After each to call to next() or skipTo() * currentSumScore is the total score of the current matching doc, * nrMatchers is the number of matching scorers, * and all scorers are after the matching doc, or are exhausted. */ private ScorerDocQueue scorerDocQueue; /** The document number of the current match. */ private int currentDoc = -1; /** The number of subscorers that provide the current match. */ protected int nrMatchers = -1; private float currentScore = Float.NaN; /** Construct a DisjunctionScorer. * @param subScorers A collection of at least two subscorers. * @param minimumNrMatchers The positive minimum number of subscorers that should * match to match this query. *
    When minimumNrMatchers is bigger than * the number of subScorers, * no matches will be produced. *
    When minimumNrMatchers equals the number of subScorers, * it more efficient to use ConjunctionScorer. */ public DisjunctionSumScorer( List subScorers, int minimumNrMatchers) throws IOException { super(null); nrScorers = subScorers.size(); if (minimumNrMatchers <= 0) { throw new IllegalArgumentException("Minimum nr of matchers must be positive"); } if (nrScorers <= 1) { throw new IllegalArgumentException("There must be at least 2 subScorers"); } this.minimumNrMatchers = minimumNrMatchers; this.subScorers = subScorers; initScorerDocQueue(); } /** Construct a DisjunctionScorer, using one as the minimum number * of matching subscorers. */ public DisjunctionSumScorer(List subScorers) throws IOException { this(subScorers, 1); } /** Called the first time next() or skipTo() is called to * initialize scorerDocQueue. */ private void initScorerDocQueue() throws IOException { Iterator si = subScorers.iterator(); scorerDocQueue = new ScorerDocQueue(nrScorers); while (si.hasNext()) { Scorer se = (Scorer) si.next(); if (se.nextDoc() != NO_MORE_DOCS) { // doc() method will be used in scorerDocQueue. scorerDocQueue.insert(se); } } } /** Scores and collects all matching documents. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. *
    When this method is used the {@link #explain(int)} method should not be used. * @deprecated use {@link #score(Collector)} instead. */ public void score(HitCollector hc) throws IOException { score(new HitCollectorWrapper(hc)); } /** Scores and collects all matching documents. * @param collector The collector to which all matching documents are passed through. *
    When this method is used the {@link #explain(int)} method should not be used. */ public void score(Collector collector) throws IOException { collector.setScorer(this); while (nextDoc() != NO_MORE_DOCS) { collector.collect(currentDoc); } } /** Expert: Collects matching documents in a range. Hook for optimization. * Note that {@link #next()} must be called once before this method is called * for the first time. * @param hc The collector to which all matching documents are passed through * {@link HitCollector#collect(int, float)}. * @param max Do not score documents past this. * @return true if more matching documents may remain. * @deprecated use {@link #score(Collector, int, int)} instead. */ protected boolean score(HitCollector hc, int max) throws IOException { return score(new HitCollectorWrapper(hc), max, docID()); } /** Expert: Collects matching documents in a range. Hook for optimization. * Note that {@link #next()} must be called once before this method is called * for the first time. * @param collector The collector to which all matching documents are passed through. * @param max Do not score documents past this. * @return true if more matching documents may remain. */ protected boolean score(Collector collector, int max, int firstDocID) throws IOException { // firstDocID is ignored since nextDoc() sets 'currentDoc' collector.setScorer(this); while (currentDoc < max) { collector.collect(currentDoc); if (nextDoc() == NO_MORE_DOCS) { return false; } } return true; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() throws IOException { if (scorerDocQueue.size() < minimumNrMatchers || !advanceAfterCurrent()) { currentDoc = NO_MORE_DOCS; } return currentDoc; } /** Advance all subscorers after the current document determined by the * top of the scorerDocQueue. * Repeat until at least the minimum number of subscorers match on the same * document and all subscorers are after that document or are exhausted. *
    On entry the scorerDocQueue has at least minimumNrMatchers * available. At least the scorer with the minimum document number will be advanced. * @return true iff there is a match. *
    In case there is a match,
    currentDoc
    , currentSumScore, * and nrMatchers describe the match. * * TODO: Investigate whether it is possible to use skipTo() when * the minimum number of matchers is bigger than one, ie. try and use the * character of ConjunctionScorer for the minimum number of matchers. * Also delay calling score() on the sub scorers until the minimum number of * matchers is reached. *
    For this, a Scorer array with minimumNrMatchers elements might * hold Scorers at currentDoc that are temporarily popped from scorerQueue. */ protected boolean advanceAfterCurrent() throws IOException { do { // repeat until minimum nr of matchers currentDoc = scorerDocQueue.topDoc(); currentScore = scorerDocQueue.topScore(); nrMatchers = 1; do { // Until all subscorers are after currentDoc if (!scorerDocQueue.topNextAndAdjustElsePop()) { if (scorerDocQueue.size() == 0) { break; // nothing more to advance, check for last match. } } if (scorerDocQueue.topDoc() != currentDoc) { break; // All remaining subscorers are after currentDoc. } currentScore += scorerDocQueue.topScore(); nrMatchers++; } while (true); if (nrMatchers >= minimumNrMatchers) { return true; } else if (scorerDocQueue.size() < minimumNrMatchers) { return false; } } while (true); } /** Returns the score of the current document matching the query. * Initially invalid, until {@link #next()} is called the first time. */ public float score() throws IOException { return currentScore; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return currentDoc; } public int docID() { return currentDoc; } /** Returns the number of subscorers matching the current document. * Initially invalid, until {@link #next()} is called the first time. */ public int nrMatchers() { return nrMatchers; } /** * Skips to the first match beyond the current whose document number is * greater than or equal to a given target.
    * When this method is used the {@link #explain(int)} method should not be * used.
    * The implementation uses the skipTo() method on the subscorers. * * @param target * The target document number. * @return true iff there is such a match. * @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } /** * Advances to the first match beyond the current whose document number is * greater than or equal to a given target.
    * When this method is used the {@link #explain(int)} method should not be * used.
    * The implementation uses the skipTo() method on the subscorers. * * @param target * The target document number. * @return the document whose number is greater than or equal to the given * target, or -1 if none exist. */ public int advance(int target) throws IOException { if (scorerDocQueue.size() < minimumNrMatchers) { return currentDoc = NO_MORE_DOCS; } if (target <= currentDoc) { return currentDoc; } do { if (scorerDocQueue.topDoc() >= target) { return advanceAfterCurrent() ? currentDoc : (currentDoc = NO_MORE_DOCS); } else if (!scorerDocQueue.topSkipToAndAdjustElsePop(target)) { if (scorerDocQueue.size() < minimumNrMatchers) { return currentDoc = NO_MORE_DOCS; } } } while (true); } /** @return An explanation for the score of a given document. */ public Explanation explain(int doc) throws IOException { Explanation res = new Explanation(); Iterator ssi = subScorers.iterator(); float sumScore = 0.0f; int nrMatches = 0; while (ssi.hasNext()) { Explanation es = ((Scorer) ssi.next()).explain(doc); if (es.getValue() > 0.0f) { // indicates match sumScore += es.getValue(); nrMatches++; } res.addDetail(es); } if (nrMatchers >= minimumNrMatchers) { res.setValue(sumScore); res.setDescription("sum over at least " + minimumNrMatchers + " of " + subScorers.size() + ":"); } else { res.setValue(0.0f); res.setDescription(nrMatches + " match(es) but at least " + minimumNrMatchers + " of " + subScorers.size() + " needed"); } return res; } } lucene-2.9.4/src/java/org/apache/lucene/search/TopDocs.java0000644000175000017500000000361211474320224024132 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Represents hits returned by {@link * Searcher#search(Query,Filter,int)} and {@link * Searcher#search(Query,int)}. */ public class TopDocs implements java.io.Serializable { /** The total number of hits for the query. */ public int totalHits; /** The top hits for the query. */ public ScoreDoc[] scoreDocs; /** Stores the maximum score value encountered, needed for normalizing. */ private float maxScore; /** * Returns the maximum score value encountered. Note that in case * scores are not tracked, this returns {@link Float#NaN}. */ public float getMaxScore() { return maxScore; } /** Sets the maximum score value encountered. */ public void setMaxScore(float maxScore) { this.maxScore=maxScore; } /** Constructs a TopDocs with a default maxScore=Float.NaN. 
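    A hedged usage sketch of consuming a TopDocs result; the directory, field names, and query term are assumptions, not part of this class:

    // Run a query and walk the returned TopDocs.
    IndexSearcher searcher = new IndexSearcher(directory, true);   // read-only
    TopDocs hits = searcher.search(new TermQuery(new Term("body", "lucene")), 10);
    System.out.println("total matches: " + hits.totalHits);
    for (int i = 0; i < hits.scoreDocs.length; i++) {
      ScoreDoc sd = hits.scoreDocs[i];
      Document d = searcher.doc(sd.doc);
      System.out.println(sd.score + "  " + d.get("title"));
    }
    searcher.close();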
*/ TopDocs(int totalHits, ScoreDoc[] scoreDocs) { this(totalHits, scoreDocs, Float.NaN); } public TopDocs(int totalHits, ScoreDoc[] scoreDocs, float maxScore) { this.totalHits = totalHits; this.scoreDocs = scoreDocs; this.maxScore = maxScore; } } lucene-2.9.4/src/java/org/apache/lucene/search/HitQueue.java0000644000175000017500000000624011474320225024311 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.PriorityQueue; final class HitQueue extends PriorityQueue { private boolean prePopulate; /** * Creates a new instance with size elements. If * prePopulate is set to true, the queue will pre-populate itself * with sentinel objects and set its {@link #size()} to size. In * that case, you should not rely on {@link #size()} to get the number of * actual elements that were added to the queue, but keep track yourself.
    * NOTE: in case prePopulate is true, you should pop * elements from the queue using the following code example: * *

       * PriorityQueue pq = new HitQueue(10, true); // pre-populate.
       * ScoreDoc top = pq.top();
       * 
       * // Add/Update one element.
       * top.score = 1.0f;
       * top.doc = 0;
       * top = (ScoreDoc) pq.updateTop();
       * int totalHits = 1;
       * 
       * // Now pop only the elements that were *truly* inserted.
       * // First, pop all the sentinel elements (there are pq.size() - totalHits).
       * for (int i = pq.size() - totalHits; i > 0; i--) pq.pop();
       * 
       * // Now pop the truly added elements.
       * ScoreDoc[] results = new ScoreDoc[totalHits];
       * for (int i = totalHits - 1; i >= 0; i--) {
       *   results[i] = (ScoreDoc) pq.pop();
       * }
       * 
    * *

    NOTE: This class pre-allocate a full array of * length size. * * @param size * the requested size of this queue. * @param prePopulate * specifies whether to pre-populate the queue with sentinel values. * @see #getSentinelObject() */ HitQueue(int size, boolean prePopulate) { this.prePopulate = prePopulate; initialize(size); } // Returns null if prePopulate is false. protected Object getSentinelObject() { // Always set the doc Id to MAX_VALUE so that it won't be favored by // lessThan. This generally should not happen since if score is not NEG_INF, // TopScoreDocCollector will always add the object to the queue. return !prePopulate ? null : new ScoreDoc(Integer.MAX_VALUE, Float.NEGATIVE_INFINITY); } protected final boolean lessThan(Object a, Object b) { ScoreDoc hitA = (ScoreDoc)a; ScoreDoc hitB = (ScoreDoc)b; if (hitA.score == hitB.score) return hitA.doc > hitB.doc; else return hitA.score < hitB.score; } } lucene-2.9.4/src/java/org/apache/lucene/search/QueryWrapperFilter.java0000644000175000017500000000572511474320224026402 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.BitSet; import org.apache.lucene.index.IndexReader; /** * Constrains search results to only match those which also match a provided * query. * *

    This could be used, for example, with a {@link TermRangeQuery} on a suitably * formatted date field to implement date filtering. One could re-use a single * QueryFilter that matches, e.g., only documents modified within the last * week. The QueryFilter and TermRangeQuery would only need to be reconstructed * once per day. * * @version $Id:$ */ public class QueryWrapperFilter extends Filter { private Query query; /** Constructs a filter which only matches documents matching * query. */ public QueryWrapperFilter(Query query) { this.query = query; } /** * @deprecated Use {@link #getDocIdSet(IndexReader)} instead. */ public BitSet bits(IndexReader reader) throws IOException { final BitSet bits = new BitSet(reader.maxDoc()); new IndexSearcher(reader).search(query, new Collector() { private int base = 0; public void setScorer(Scorer scorer) throws IOException { // score is not needed by this collector } public final void collect(int doc) { bits.set(doc + base); // set bit for hit } public void setNextReader(IndexReader reader, int docBase) { base = docBase; } public boolean acceptsDocsOutOfOrder() { return true; } }); return bits; } public DocIdSet getDocIdSet(final IndexReader reader) throws IOException { final Weight weight = query.weight(new IndexSearcher(reader)); return new DocIdSet() { public DocIdSetIterator iterator() throws IOException { return weight.scorer(reader, true, false); } public boolean isCacheable() { return false; } }; } public String toString() { return "QueryWrapperFilter(" + query + ")"; } public boolean equals(Object o) { if (!(o instanceof QueryWrapperFilter)) return false; return this.query.equals(((QueryWrapperFilter)o).query); } public int hashCode() { return query.hashCode() ^ 0x923F64B9; } } lucene-2.9.4/src/java/org/apache/lucene/search/TopFieldDocCollector.java0000644000175000017500000000544011474320224026563 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** A {@link HitCollector} implementation that collects the top-sorting * documents, returning them as a {@link TopFieldDocs}. This is used by {@link * IndexSearcher} to implement {@link TopFieldDocs}-based search. * *

    This may be extended, overriding the collect method to, e.g., * conditionally invoke super() in order to filter which * documents are collected. * * @deprecated Please use {@link TopFieldCollector} instead. */ public class TopFieldDocCollector extends TopDocCollector { private FieldDoc reusableFD; /** Construct to collect a given number of hits. * @param reader the index to be searched * @param sort the sort criteria * @param numHits the maximum number of hits to collect */ public TopFieldDocCollector(IndexReader reader, Sort sort, int numHits) throws IOException { super(new FieldSortedHitQueue(reader, sort.fields, numHits)); } // javadoc inherited public void collect(int doc, float score) { if (score > 0.0f) { totalHits++; if (reusableFD == null) reusableFD = new FieldDoc(doc, score); else { // Whereas TopScoreDocCollector can skip this if the // score is not competitive, we cannot because the // comparators in the FieldSortedHitQueue.lessThan // aren't in general congruent with "higher score // wins" reusableFD.score = score; reusableFD.doc = doc; } reusableFD = (FieldDoc) hq.insertWithOverflow(reusableFD); } } // javadoc inherited public TopDocs topDocs() { FieldSortedHitQueue fshq = (FieldSortedHitQueue)hq; ScoreDoc[] scoreDocs = new ScoreDoc[fshq.size()]; for (int i = fshq.size()-1; i >= 0; i--) // put docs in array scoreDocs[i] = fshq.fillFields ((FieldDoc) fshq.pop()); return new TopFieldDocs(totalHits, scoreDocs, fshq.getFields(), fshq.getMaxScore()); } } lucene-2.9.4/src/java/org/apache/lucene/search/Similarity.java0000644000175000017500000011757311474320224024721 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.Term; import org.apache.lucene.search.Explanation.IDFExplanation; import org.apache.lucene.util.SmallFloat; import java.io.IOException; import java.io.Serializable; import java.util.Collection; import java.util.IdentityHashMap; import java.util.Iterator; /** * Expert: Scoring API. * *

    Similarity defines the components of Lucene scoring. * Overriding computation of these components is a convenient * way to alter Lucene scoring. * *

    Suggested reading: * * Introduction To Information Retrieval, Chapter 6. * *

    The following describes how Lucene scoring evolves from * underlying information retrieval models to (efficient) implementation. * We first brief on VSM Score, * then derive from it Lucene's Conceptual Scoring Formula, * from which, finally, evolves Lucene's Practical Scoring Function * (the latter is connected directly with Lucene classes and methods). * *

    Lucene combines * * Boolean model (BM) of Information Retrieval * with * * Vector Space Model (VSM) of Information Retrieval - * documents "approved" by BM are scored by VSM. * *

    In VSM, documents and queries are represented as * weighted vectors in a multi-dimensional space, * where each distinct index term is a dimension, * and weights are * Tf-idf values. * *

    VSM does not require weights to be Tf-idf values, * but Tf-idf values are believed to produce search results of high quality, * and so Lucene uses Tf-idf. * Tf and Idf are described in more detail below, * but for now, for completeness, let's just say that * for a given term t and document (or query) x, * Tf(t,x) varies with the number of occurrences of term t in x * (when one increases so does the other) and * idf(t) similarly varies with the inverse of the * number of index documents containing term t. * *

    VSM score of document d for query q is the * * Cosine Similarity * of the weighted query vectors V(q) and V(d): * *
     
    *
    *          cosine-similarity(q,d)  =  V(q) · V(d) / ( |V(q)| |V(d)| )
    *
    *          (VSM Score)
    *
    * * * Where V(q) · V(d) is the * dot product * of the weighted vectors, * and |V(q)| and |V(d)| are their * Euclidean norms. * *
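    *
    * As a plain-Java illustration only (not Lucene API; q and d are assumed to be
    * java.util.Map<String,Float> term-to-weight vectors):
    *
    *   float dot = 0f, normQ = 0f, normD = 0f;
    *   for (Map.Entry<String,Float> e : q.entrySet()) {
    *     normQ += e.getValue() * e.getValue();                 // accumulates |V(q)|^2
    *     Float w = d.get(e.getKey());
    *     if (w != null) dot += e.getValue() * w;               // accumulates V(q) · V(d)
    *   }
    *   for (float w : d.values()) normD += w * w;              // accumulates |V(d)|^2
    *   float cosine = dot / (float) (Math.sqrt(normQ) * Math.sqrt(normD));
    *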

    Note: the above equation can be viewed as the dot product of * the normalized weighted vectors, in the sense that dividing * V(q) by its euclidean norm is normalizing it to a unit vector. * *

    Lucene refines VSM score for both search quality and usability: *

      *
    • Normalizing V(d) to the unit vector is known to be problematic in that * it removes all document length information. * For some documents removing this info is probably ok, * e.g. a document made by duplicating a certain paragraph 10 times, * especially if that paragraph is made of distinct terms. * But for a document which contains no duplicated paragraphs, * this might be wrong. * To avoid this problem, a different document length normalization * factor is used, which normalizes to a vector equal to or larger * than the unit vector: doc-len-norm(d). *
    • * *
    • At indexing, users can specify that certain documents are more * important than others, by assigning a document boost. * For this, the score of each document is also multiplied by its boost value * doc-boost(d). *
    • * *
    • Lucene is field based, hence each query term applies to a single * field, document length normalization is by the length of that field, * and in addition to document boost there are also document field boosts. *
    • * *
    • The same field can be added to a document during indexing several times, * and so the boost of that field is the multiplication of the boosts of * the separate additions (or parts) of that field within the document. *
    • * *
    • At search time users can specify boosts to each query, sub-query, and * each query term, hence the contribution of a query term to the score of * a document is multiplied by the boost of that query term query-boost(q). *
    • * *
    • A document may match a multi term query without containing all * the terms of that query (this is correct for some of the queries), * and users can further reward documents matching more query terms * through a coordination factor, which is usually larger when * more terms are matched: coord-factor(q,d). *
    • *
    * *

    Under the simplifying assumption of a single field in the index, * we get Lucene's Conceptual scoring formula: * *
     
    *
    *          score(q,d)  =  coord-factor(q,d) · query-boost(q) · ( V(q) · V(d) / |V(q)| )
    *                         · doc-len-norm(d) · doc-boost(d)
    *
    *          (Lucene Conceptual Scoring Formula)
    *

    The conceptual formula is a simplification in the sense that (1) terms and documents * are fielded and (2) boosts are usually per query term rather than per query. * *

    We now describe how Lucene implements this conceptual scoring formula, and * derive from it Lucene's Practical Scoring Function. * *

    For efficient score computation some scoring components * are computed and aggregated in advance: * *

      *
    • Query-boost for the query (actually for each query term) * is known when search starts. *
    • * *
    • Query Euclidean norm |V(q)| can be computed when search starts, * as it is independent of the document being scored. * From a search optimization perspective, it is a valid question * why bother to normalize the query at all, because all * scored documents will be multiplied by the same |V(q)|, * and hence document ranks (their order by score) will not * be affected by this normalization. * There are two good reasons to keep this normalization: *
        *
      • Recall that * * Cosine Similarity can be used to find how similar * two documents are. One can use Lucene for, e.g., * clustering, and use a document as a query to compute * its similarity to other documents. * In this use case it is important that the score of document d3 * for query d1 is comparable to the score of document d3 * for query d2. In other words, scores of a document for two * distinct queries should be comparable. * There are other applications that may require this. * And this is exactly what normalizing the query vector V(q) * provides: comparability (to a certain extent) of two or more queries. *
      • * *
      • Applying query normalization on the scores helps to keep the * scores around the unit vector, hence preventing loss of score data * because of floating point precision limitations. *
      • *
      *
    • * *
    • Document length norm doc-len-norm(d) and document * boost doc-boost(d) are known at indexing time. * They are computed in advance and their multiplication * is saved as a single value in the index: norm(d). * (In the equations below, norm(t in d) means norm(field(t) in doc d) * where field(t) is the field associated with term t.) *
    • *
    * *

    Lucene's Practical Scoring Function is derived from the above. * The color codes demonstrate how it relates * to those of the conceptual formula: * *

    *
    *          score(q,d)  =  coord(q,d) · queryNorm(q) · SUM over (t in q) of:
    *                         [ tf(t in d) · idf(t)^2 · t.getBoost() · norm(t,d) ]
    *
    *          (Lucene Practical Scoring Function)
    *

    where *

      *
    1. * * tf(t in d) * correlates to the term's frequency, * defined as the number of times term t appears in the currently scored document d. * Documents that have more occurrences of a given term receive a higher score. * Note that tf(t in q) is assumed to be 1 and therefore it does not appear in this equation. * However, if a query contains the same term twice, there will be * two term-queries with that same term and hence the computation would still be correct (although * not very efficient). * The default computation for tf(t in d) in * {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is: * *
       
      *
      *          {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)}  =  frequency^(1/2)
      *
    2. * *
    3. * * idf(t) stands for Inverse Document Frequency. This value * correlates to the inverse of docFreq * (the number of documents in which the term t appears). * This means rarer terms give higher contribution to the total score. * idf(t) appears for t in both the query and the document, * hence it is squared in the equation. * The default computation for idf(t) in * {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is: * *
       
      *
      *          {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}  =  1 + log( numDocs / (docFreq + 1) )
      *
    4. * *
    5. * * coord(q,d) * is a score factor based on how many of the query terms are found in the specified document. * Typically, a document that contains more of the query's terms will receive a higher score * than another document with fewer query terms. * This is a search time factor computed in * {@link #coord(int, int) coord(q,d)} * by the Similarity in effect at search time. *
       
      *
    6. * *
    7. * * queryNorm(q) * * is a normalizing factor used to make scores between queries comparable. * This factor does not affect document ranking (since all ranked documents are multiplied by the same factor), * but rather just attempts to make scores from different queries (or even different indexes) comparable. * This is a search time factor computed by the Similarity in effect at search time. * * The default computation in * {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) DefaultSimilarity} * produces a Euclidean norm: *
       
      *
      *          queryNorm(q)  =  {@link org.apache.lucene.search.DefaultSimilarity#queryNorm(float) queryNorm(sumOfSquaredWeights)}  =  1 / sumOfSquaredWeights^(1/2)
      *
      * * The sum of squared weights (of the query terms) is * computed by the query {@link org.apache.lucene.search.Weight} object. * For example, a {@link org.apache.lucene.search.BooleanQuery boolean query} * computes this value as: * *
       
      *
      *          {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights}  =
      *              {@link org.apache.lucene.search.Query#getBoost() q.getBoost()}^2 · SUM over (t in q) of [ idf(t) · t.getBoost() ]^2
      *
      * *
    8. * *
    9. * * t.getBoost() * is a search time boost of term t in the query q as * specified in the query text * (see query syntax), * or as set by application calls to * {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}. * Notice that there is really no direct API for accessing a boost of one term in a multi term query, * but rather multi terms are represented in a query as multi * {@link org.apache.lucene.search.TermQuery TermQuery} objects, * and so the boost of a term in the query is accessible by calling the sub-query * {@link org.apache.lucene.search.Query#getBoost() getBoost()}. *
       
      *
    10. * *
    11. * * norm(t,d) encapsulates a few (indexing time) boost and length factors: * *
        *
      • Document boost - set by calling * {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()} * before adding the document to the index. *
      • *
      • Field boost - set by calling * {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()} * before adding the field to a document. *
      • *
      • {@link #lengthNorm(String, int) lengthNorm(field)} - computed * when the document is added to the index in accordance with the number of tokens * of this field in the document, so that shorter fields contribute more to the score. * LengthNorm is computed by the Similarity class in effect at indexing. *
      • *
      * *

      * When a document is added to the index, all the above factors are multiplied. * If the document has multiple fields with the same name, all their boosts are multiplied together: * *
       
      *
      *          norm(t,d)  =  {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
      *                        · {@link #lengthNorm(String, int) lengthNorm(field)}
      *                        · PRODUCT over (field f in d named as t) of {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
      *
      * However, the resulting norm value is {@link #encodeNorm(float) encoded} as a single byte * before being stored. * At search time, the norm byte value is read from the index * {@link org.apache.lucene.store.Directory directory} and * {@link #decodeNorm(byte) decoded} back to a float norm value. * This encoding/decoding, while reducing index size, comes with the price of * precision loss - it is not guaranteed that decode(encode(x)) = x. * For instance, decode(encode(0.89)) = 0.75. *
       
      * Compression of norm values to a single byte saves memory at search time, * because once a field is referenced at search time, its norms - for * all documents - are maintained in memory. *
       
      * The rationale supporting such lossy compression of norm values is that, * given the difficulty (and inaccuracy) with which users express their true information * need in a query, only big differences matter. *
       
      * Last, note that search time is too late to modify this norm part of scoring, e.g. by * using a different {@link Similarity} for search. *
       
      *

    12. *
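    *
    * As an illustration only (the class name and constants below are made up,
    * not the built-in defaults), several of the factors above can be tweaked by
    * subclassing {@link DefaultSimilarity}:
    *
    *   public class SketchSimilarity extends DefaultSimilarity {
    *     public float tf(float freq) {                      // sublinear tf instead of sqrt
    *       return freq > 0 ? 1.0f + (float) Math.log(freq) : 0.0f;
    *     }
    *     public float lengthNorm(String fieldName, int numTerms) {
    *       return 1.0f;                                     // ignore field length
    *     }
    *     public float coord(int overlap, int maxOverlap) {
    *       return 1.0f;                                     // ignore coordination
    *     }
    *   }
    *
    * Such a class could then be installed with {@link #setDefault(Similarity)},
    * {@link org.apache.lucene.index.IndexWriter#setSimilarity(Similarity)} or
    * {@link Searcher#setSimilarity(Similarity)}.
    *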
    * * @see #setDefault(Similarity) * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity) * @see Searcher#setSimilarity(Similarity) */ public abstract class Similarity implements Serializable { public static final int NO_DOC_ID_PROVIDED = -1; /** Set the default Similarity implementation used by indexing and search * code. * * @see Searcher#setSimilarity(Similarity) * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity) */ public static void setDefault(Similarity similarity) { Similarity.defaultImpl = similarity; } /** Return the default Similarity implementation used by indexing and search * code. * *

    This is initially an instance of {@link DefaultSimilarity}. * * @see Searcher#setSimilarity(Similarity) * @see org.apache.lucene.index.IndexWriter#setSimilarity(Similarity) */ public static Similarity getDefault() { return Similarity.defaultImpl; } /** Cache of decoded bytes. */ private static final float[] NORM_TABLE = new float[256]; static { for (int i = 0; i < 256; i++) NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i); } /** Decodes a normalization factor stored in an index. * @see #encodeNorm(float) */ public static float decodeNorm(byte b) { return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127 } /** Returns a table for decoding normalization bytes. * @see #encodeNorm(float) */ public static float[] getNormDecoder() { return NORM_TABLE; } /** * Compute the normalization value for a field, given the accumulated * state of term processing for this field (see {@link FieldInvertState}). * *

    Implementations should calculate a float value based on the field * state and then return that value. * *
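    *
    * For example, a sketch equivalent to the default behavior (boost multiplied
    * by an inverse-square-root length norm; not the literal built-in code) is:
    *
    *   public float computeNorm(String field, FieldInvertState state) {
    *     return state.getBoost() * (float) (1.0 / Math.sqrt(state.getLength()));
    *   }
    *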

    For backward compatibility this method by default calls * {@link #lengthNorm(String, int)} passing * {@link FieldInvertState#getLength()} as the second argument, and * then multiplies this value by {@link FieldInvertState#getBoost()}.

    * *

    WARNING: This API is new and experimental and may * suddenly change.

    * * @param field field name * @param state current processing state for this field * @return the calculated float norm */ public float computeNorm(String field, FieldInvertState state) { return (float) (state.getBoost() * lengthNorm(field, state.getLength())); } /** Computes the normalization value for a field given the total number of * terms contained in a field. These values, together with field boosts, are * stored in an index and multiplied into scores for hits on each field by the * search code. * *

    Matches in longer fields are less precise, so implementations of this * method usually return smaller values when numTokens is large, * and larger values when numTokens is small. * *

    Note that the return values are computed under * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} * and then stored using * {@link #encodeNorm(float)}. * Thus they have limited precision, and documents * must be re-indexed if this method is altered. * * @param fieldName the name of the field * @param numTokens the total number of tokens contained in fields named * fieldName of doc. * @return a normalization factor for hits on this field of this document * * @see org.apache.lucene.document.Field#setBoost(float) */ public abstract float lengthNorm(String fieldName, int numTokens); /** Computes the normalization value for a query given the sum of the squared * weights of each of the query terms. This value is multiplied into the * weight of each query term. While the classic query normalization factor is * computed as 1/sqrt(sumOfSquaredWeights), other implementations might * completely ignore sumOfSquaredWeights (ie return 1). * *

    This does not affect ranking, but the default implementation does make scores * from different queries more comparable than they would be by eliminating the * magnitude of the Query vector as a factor in the score. * * @param sumOfSquaredWeights the sum of the squares of query term weights * @return a normalization factor for query weights */ public abstract float queryNorm(float sumOfSquaredWeights); /** Encodes a normalization factor for storage in an index. * *
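    *
    * For example, using the value from the norm(t,d) discussion above:
    *
    *   byte b  = Similarity.encodeNorm(0.89f);
    *   float v = Similarity.decodeNorm(b);   // yields 0.75f, not 0.89f - precision is lost
    *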

    The encoding uses a three-bit mantissa, a five-bit exponent, and * the zero-exponent point at 15, thus * representing values from around 7x10^9 to 2x10^-9 with about one * significant decimal digit of accuracy. Zero is also represented. * Negative numbers are rounded up to zero. Values too large to represent * are rounded down to the largest representable value. Positive values too * small to represent are rounded up to the smallest positive representable * value. * * @see org.apache.lucene.document.Field#setBoost(float) * @see org.apache.lucene.util.SmallFloat */ public static byte encodeNorm(float f) { return SmallFloat.floatToByte315(f); } /** Computes a score factor based on a term or phrase's frequency in a * document. This value is multiplied by the {@link #idf(Term, Searcher)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * *

    Terms and phrases repeated in a document indicate the topic of the * document, so implementations of this method usually return larger values * when freq is large, and smaller values when freq * is small. * *

    The default implementation calls {@link #tf(float)}. * * @param freq the frequency of a term within a document * @return a score factor based on a term's within-document frequency */ public float tf(int freq) { return tf((float)freq); } /** Computes the amount of a sloppy phrase match, based on an edit distance. * This value is summed for each sloppy phrase match in a document to form * the frequency that is passed to {@link #tf(float)}. * *

    A phrase match with a small edit distance to a document passage more * closely matches the document, so implementations of this method usually * return larger values when the edit distance is small and smaller values * when it is large. * * @see PhraseQuery#setSlop(int) * @param distance the edit distance of this sloppy phrase match * @return the frequency increment for this match */ public abstract float sloppyFreq(int distance); /** Computes a score factor based on a term or phrase's frequency in a * document. This value is multiplied by the {@link #idf(Term, Searcher)} * factor for each term in the query and these products are then summed to * form the initial score for a document. * *

    Terms and phrases repeated in a document indicate the topic of the * document, so implementations of this method usually return larger values * when freq is large, and smaller values when freq * is small. * * @param freq the frequency of a term within a document * @return a score factor based on a term's within-document frequency */ public abstract float tf(float freq); /** Computes a score factor for a simple term. * *

    The default implementation is:

       *   return idf(searcher.docFreq(term), searcher.maxDoc());
       * 
    * * Note that {@link Searcher#maxDoc()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link Searcher#docFreq(Term)} is used, and when the latter * is inaccurate, so is {@link Searcher#maxDoc()}, and in the same direction. * In addition, {@link Searcher#maxDoc()} is more efficient to compute * * @param term the term in question * @param searcher the document collection being searched * @return a score factor for the term * @deprecated see {@link #idfExplain(Term, Searcher)} */ public float idf(Term term, Searcher searcher) throws IOException { return idf(searcher.docFreq(term), searcher.maxDoc()); } /** * Computes a score factor for a simple term and returns an explanation * for that score factor. * *

    * The default implementation uses: * *

       * idf(searcher.docFreq(term), searcher.maxDoc());
       * 
    * * Note that {@link Searcher#maxDoc()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link Searcher#docFreq(Term)} is used, and when the latter * is inaccurate, so is {@link Searcher#maxDoc()}, and in the same direction. * In addition, {@link Searcher#maxDoc()} is more efficient to compute * * @param term the term in question * @param searcher the document collection being searched * @return an IDFExplain object that includes both an idf score factor and an explanation for the term. * @throws IOException */ public IDFExplanation idfExplain(final Term term, final Searcher searcher) throws IOException { if(supportedMethods.overridesTermIDF) { final float idf = idf(term, searcher); return new IDFExplanation() { //@Override public float getIdf() { return idf; } //@Override public String explain() { return "Inexplicable"; } }; } final int df = searcher.docFreq(term); final int max = searcher.maxDoc(); final float idf = idf(df, max); return new IDFExplanation() { //@Override public String explain() { return "idf(docFreq=" + df + ", maxDocs=" + max + ")"; } //@Override public float getIdf() { return idf; }}; } /** Computes a score factor for a phrase. * *

    The default implementation sums the {@link #idf(Term,Searcher)} factor * for each term in the phrase. * * @param terms the terms in the phrase * @param searcher the document collection being searched * @return idf score factor * @deprecated see {@link #idfExplain(Collection, Searcher)} */ public float idf(Collection terms, Searcher searcher) throws IOException { float idf = 0.0f; Iterator i = terms.iterator(); while (i.hasNext()) { idf += idf((Term)i.next(), searcher); } return idf; } /** * Computes a score factor for a phrase. * *

    * The default implementation sums the idf factor for * each term in the phrase. * * @param terms the terms in the phrase * @param searcher the document collection being searched * @return an IDFExplain object that includes both an idf * score factor for the phrase and an explanation * for each term. * @throws IOException */ public IDFExplanation idfExplain(Collection terms, Searcher searcher) throws IOException { if(supportedMethods.overridesCollectionIDF) { final float idf = idf(terms, searcher); return new IDFExplanation() { //@Override public float getIdf() { return idf; } //@Override public String explain() { return "Inexplicable"; } }; } final int max = searcher.maxDoc(); float idf = 0.0f; final StringBuffer exp = new StringBuffer(); Iterator i = terms.iterator(); while (i.hasNext()) { Term term = (Term)i.next(); final int df = searcher.docFreq(term); idf += idf(df, max); exp.append(" "); exp.append(term.text()); exp.append("="); exp.append(df); } final float fIdf = idf; return new IDFExplanation() { //@Override public float getIdf() { return fIdf; } //@Override public String explain() { return exp.toString(); } }; } /** Computes a score factor based on a term's document frequency (the number * of documents which contain the term). This value is multiplied by the * {@link #tf(int)} factor for each term in the query and these products are * then summed to form the initial score for a document. * *

    Terms that occur in fewer documents are better indicators of topic, so * implementations of this method usually return larger values for rare terms, * and smaller values for common terms. * * @param docFreq the number of documents which contain the term * @param numDocs the total number of documents in the collection * @return a score factor based on the term's document frequency */ public abstract float idf(int docFreq, int numDocs); /** Computes a score factor based on the fraction of all query terms that a * document contains. This value is multiplied into scores. * *

    The presence of a large portion of the query terms indicates a better * match with the query, so implementations of this method usually return * larger values when the ratio between these parameters is large and smaller * values when the ratio between them is small. * * @param overlap the number of query terms matched in the document * @param maxOverlap the total number of terms in the query * @return a score factor based on term overlap with the query */ public abstract float coord(int overlap, int maxOverlap); /** * Calculate a scoring factor based on the data in the payload. Overriding implementations * are responsible for interpreting what is in the payload. Lucene makes no assumptions about * what is in the byte array. *

    * The default implementation returns 1. * * @param fieldName The fieldName of the term this payload belongs to * @param payload The payload byte array to be scored * @param offset The offset into the payload array * @param length The length in the array * @return An implementation dependent float to be used as a scoring factor * * @deprecated See {@link #scorePayload(int, String, int, int, byte[], int, int)} */ //TODO: When removing this, set the default value below to return 1. public float scorePayload(String fieldName, byte [] payload, int offset, int length) { //Do nothing return 1; } /** * Calculate a scoring factor based on the data in the payload. Overriding implementations * are responsible for interpreting what is in the payload. Lucene makes no assumptions about * what is in the byte array. *

    * The default implementation returns 1. * * @param docId The docId currently being scored. If this value is {@link #NO_DOC_ID_PROVIDED}, then it should be assumed that the PayloadQuery implementation does not provide document information * @param fieldName The fieldName of the term this payload belongs to * @param start The start position of the payload * @param end The end position of the payload * @param payload The payload byte array to be scored * @param offset The offset into the payload array * @param length The length in the array * @return An implementation dependent float to be used as a scoring factor * */ public float scorePayload(int docId, String fieldName, int start, int end, byte [] payload, int offset, int length) { //TODO: When removing the deprecated scorePayload above, set this to return 1 return scorePayload(fieldName, payload, offset, length); } /** @deprecated Remove this when old API is removed! */ private final MethodSupport supportedMethods = getSupportedMethods(this.getClass()); /** @deprecated Remove this when old API is removed! */ private static final class MethodSupport implements Serializable { final boolean overridesCollectionIDF, overridesTermIDF; MethodSupport(Class clazz) { overridesCollectionIDF = isMethodOverridden(clazz, "idf", C_IDF_METHOD_PARAMS); overridesTermIDF = isMethodOverridden(clazz, "idf", T_IDF_METHOD_PARAMS); } private static boolean isMethodOverridden(Class clazz, String name, Class[] params) { try { return clazz.getMethod(name, params).getDeclaringClass() != Similarity.class; } catch (NoSuchMethodException e) { // should not happen throw new RuntimeException(e); } } /** @deprecated Remove this when old API is removed! */ private static final Class[] T_IDF_METHOD_PARAMS = new Class[]{Term.class, Searcher.class}; /** @deprecated Remove this when old API is removed! */ private static final Class[] C_IDF_METHOD_PARAMS = new Class[]{Collection.class, Searcher.class}; } /** @deprecated Remove this when old API is removed! */ private static final IdentityHashMap/*,MethodSupport>*/ knownMethodSupport = new IdentityHashMap(); /** @deprecated Remove this when old API is removed! */ private static MethodSupport getSupportedMethods(Class clazz) { MethodSupport supportedMethods; synchronized(knownMethodSupport) { supportedMethods = (MethodSupport) knownMethodSupport.get(clazz); if (supportedMethods == null) { knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz)); } } return supportedMethods; } /** The Similarity implementation used by default. * TODO: move back to top when old API is removed! **/ private static Similarity defaultImpl = new DefaultSimilarity(); } lucene-2.9.4/src/java/org/apache/lucene/search/TermRangeQuery.java0000644000175000017500000001677111474320224025503 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.text.Collator; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.ToStringUtils; /** * A Query that matches documents within a range of terms. * *

    This query matches documents containing terms that fall into the * supplied range according to {@link * String#compareTo(String)}, unless a Collator is provided. It is not intended * for numerical ranges; use {@link NumericRangeQuery} instead. * *
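    *
    * For example, assuming dates are indexed as yyyymmdd strings in a "date"
    * field (field name and values are illustrative):
    *
    *   TermRangeQuery query = new TermRangeQuery("date", "20020101", "20030101", true, true);
    *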

    This query uses the {@link * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * rewrite method. * @since 2.9 */ public class TermRangeQuery extends MultiTermQuery { private String lowerTerm; private String upperTerm; private Collator collator; private String field; private boolean includeLower; private boolean includeUpper; /** * Constructs a query selecting all terms greater/equal than lowerTerm * but less/equal than upperTerm. * *

    * If an endpoint is null, it is said * to be "open". Either or both endpoints may be open. Open endpoints may not * be exclusive (you can't select all but the first or last term without * explicitly specifying the term to exclude.) * * @param field The field that holds both lower and upper terms. * @param lowerTerm * The term text at the lower end of the range * @param upperTerm * The term text at the upper end of the range * @param includeLower * If true, the lowerTerm is * included in the range. * @param includeUpper * If true, the upperTerm is * included in the range. */ public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper) { this(field, lowerTerm, upperTerm, includeLower, includeUpper, null); } /** Constructs a query selecting all terms greater/equal than * lowerTerm but less/equal than upperTerm. *

    * If an endpoint is null, it is said * to be "open". Either or both endpoints may be open. Open endpoints may not * be exclusive (you can't select all but the first or last term without * explicitly specifying the term to exclude.) *

    * If collator is not null, it will be used to decide whether * index terms are within the given range, rather than using the Unicode code * point order in which index terms are stored. *
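    *
    * For example, to compare terms with a locale-aware collation (the locale,
    * field and bounds are illustrative assumptions):
    *
    *   Collator collator = Collator.getInstance(new Locale("da", "DK"));
    *   TermRangeQuery query = new TermRangeQuery("name", "aaa", "oee", true, true, collator);
    *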

    * WARNING: Using this constructor and supplying a non-null * value in the collator parameter will cause every single * index Term in the Field referenced by lowerTerm and/or upperTerm to be * examined. Depending on the number of index Terms in this Field, the * operation could be very slow. * * @param lowerTerm The Term text at the lower end of the range * @param upperTerm The Term text at the upper end of the range * @param includeLower * If true, the lowerTerm is * included in the range. * @param includeUpper * If true, the upperTerm is * included in the range. * @param collator The collator to use to collate index Terms, to determine * their membership in the range bounded by lowerTerm and * upperTerm. */ public TermRangeQuery(String field, String lowerTerm, String upperTerm, boolean includeLower, boolean includeUpper, Collator collator) { this.field = field; this.lowerTerm = lowerTerm; this.upperTerm = upperTerm; this.includeLower = includeLower; this.includeUpper = includeUpper; this.collator = collator; } /** Returns the field name for this query */ public String getField() { return field; } /** Returns the lower value of this range query */ public String getLowerTerm() { return lowerTerm; } /** Returns the upper value of this range query */ public String getUpperTerm() { return upperTerm; } /** Returns true if the lower endpoint is inclusive */ public boolean includesLower() { return includeLower; } /** Returns true if the upper endpoint is inclusive */ public boolean includesUpper() { return includeUpper; } /** Returns the collator used to determine range inclusion, if any. */ public Collator getCollator() { return collator; } protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { return new TermRangeTermEnum(reader, field, lowerTerm, upperTerm, includeLower, includeUpper, collator); } /** Prints a user-readable version of this query. */ public String toString(String field) { StringBuffer buffer = new StringBuffer(); if (!getField().equals(field)) { buffer.append(getField()); buffer.append(":"); } buffer.append(includeLower ? '[' : '{'); buffer.append(lowerTerm != null ? lowerTerm : "*"); buffer.append(" TO "); buffer.append(upperTerm != null ? upperTerm : "*"); buffer.append(includeUpper ? ']' : '}'); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } //@Override public int hashCode() { final int prime = 31; int result = super.hashCode(); result = prime * result + ((collator == null) ? 0 : collator.hashCode()); result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + (includeLower ? 1231 : 1237); result = prime * result + (includeUpper ? 1231 : 1237); result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode()); result = prime * result + ((upperTerm == null) ? 
0 : upperTerm.hashCode()); return result; } //@Override public boolean equals(Object obj) { if (this == obj) return true; if (!super.equals(obj)) return false; if (getClass() != obj.getClass()) return false; TermRangeQuery other = (TermRangeQuery) obj; if (collator == null) { if (other.collator != null) return false; } else if (!collator.equals(other.collator)) return false; if (field == null) { if (other.field != null) return false; } else if (!field.equals(other.field)) return false; if (includeLower != other.includeLower) return false; if (includeUpper != other.includeUpper) return false; if (lowerTerm == null) { if (other.lowerTerm != null) return false; } else if (!lowerTerm.equals(other.lowerTerm)) return false; if (upperTerm == null) { if (other.upperTerm != null) return false; } else if (!upperTerm.equals(other.upperTerm)) return false; return true; } } lucene-2.9.4/src/java/org/apache/lucene/search/SimilarityDelegator.java0000644000175000017500000000436611474320224026543 0ustar janpascaljanpascalpackage org.apache.lucene.search; import org.apache.lucene.index.FieldInvertState; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Expert: Delegating scoring implementation. Useful in {@link * Query#getSimilarity(Searcher)} implementations, to override only certain * methods of a Searcher's Similarity implementation.. */ public class SimilarityDelegator extends Similarity { private Similarity delegee; /** Construct a {@link Similarity} that delegates all methods to another. * * @param delegee the Similarity implementation to delegate to */ public SimilarityDelegator(Similarity delegee) { this.delegee = delegee; } public float computeNorm(String fieldName, FieldInvertState state) { return delegee.computeNorm(fieldName, state); } public float lengthNorm(String fieldName, int numTerms) { return delegee.lengthNorm(fieldName, numTerms); } public float queryNorm(float sumOfSquaredWeights) { return delegee.queryNorm(sumOfSquaredWeights); } public float tf(float freq) { return delegee.tf(freq); } public float sloppyFreq(int distance) { return delegee.sloppyFreq(distance); } public float idf(int docFreq, int numDocs) { return delegee.idf(docFreq, numDocs); } public float coord(int overlap, int maxOverlap) { return delegee.coord(overlap, maxOverlap); } public float scorePayload(String fieldName, byte[] payload, int offset, int length) { return delegee.scorePayload(fieldName, payload, offset, length); } } lucene-2.9.4/src/java/org/apache/lucene/search/TopDocsCollector.java0000644000175000017500000001441611474320224026005 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.PriorityQueue; /** * A base class for all collectors that return a {@link TopDocs} output. This * collector allows easy extension by providing a single constructor which * accepts a {@link PriorityQueue} as well as protected members for that * priority queue and a counter of the number of total hits.
    * Extending classes can override {@link #topDocs(int, int)} and * {@link #getTotalHits()} in order to provide their own implementation. */ public abstract class TopDocsCollector extends Collector { // This is used in case topDocs() is called with illegal parameters, or there // simply aren't (enough) results. protected static final TopDocs EMPTY_TOPDOCS = new TopDocs(0, new ScoreDoc[0], Float.NaN); /** * The priority queue which holds the top documents. Note that different * implementations of PriorityQueue give different meaning to 'top documents'. * HitQueue for example aggregates the top scoring documents, while other PQ * implementations may hold documents sorted by other criteria. */ protected PriorityQueue pq; /** The total number of documents that the collector encountered. */ protected int totalHits; protected TopDocsCollector(PriorityQueue pq) { this.pq = pq; } /** * Populates the results array with the ScoreDoc instaces. This can be * overridden in case a different ScoreDoc type should be returned. */ protected void populateResults(ScoreDoc[] results, int howMany) { for (int i = howMany - 1; i >= 0; i--) { results[i] = (ScoreDoc) pq.pop(); } } /** * Returns a {@link TopDocs} instance containing the given results. If * results is null it means there are no results to return, * either because there were 0 calls to collect() or because the arguments to * topDocs were invalid. */ protected TopDocs newTopDocs(ScoreDoc[] results, int start) { return results == null ? EMPTY_TOPDOCS : new TopDocs(totalHits, results); } /** The total number of documents that matched this query. */ public int getTotalHits() { return totalHits; } /** Returns the top docs that were collected by this collector. */ public final TopDocs topDocs() { // In case pq was populated with sentinel values, there might be less // results than pq.size(). Therefore return all results until either // pq.size() or totalHits. return topDocs(0, totalHits < pq.size() ? totalHits : pq.size()); } /** * Returns the documents in the rage [start .. pq.size()) that were collected * by this collector. Note that if start >= pq.size(), an empty TopDocs is * returned.
    * This method is convenient to call if the application always asks for the * last results, starting from the last 'page'.
    * NOTE: you cannot call this method more than once for each search * execution. If you need to call it more than once, passing each time a * different start, you should call {@link #topDocs()} and work * with the returned {@link TopDocs} object, which will contain all the * results this search execution collected. */ public final TopDocs topDocs(int start) { // In case pq was populated with sentinel values, there might be less // results than pq.size(). Therefore return all results until either // pq.size() or totalHits. return topDocs(start, totalHits < pq.size() ? totalHits : pq.size()); } /** * Returns the documents in the rage [start .. start+howMany) that were * collected by this collector. Note that if start >= pq.size(), an empty * TopDocs is returned, and if pq.size() - start < howMany, then only the * available documents in [start .. pq.size()) are returned.
    * This method is useful when the search application allows pagination of * search results; it also attempts to optimize the memory used by allocating * only as much as requested by howMany.
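    * For example, to fetch the third page of 10 hits (assuming the collector
    * gathered at least 30 documents):
    *
    *   TopDocs page3 = collector.topDocs(20, 10);   // hits 20 .. 29
    *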
    * NOTE: you cannot call this method more than once for each search * execution. If you need to call it more than once, passing each time a * different range, you should call {@link #topDocs()} and work with the * returned {@link TopDocs} object, which will contain all the results this * search execution collected. */ public final TopDocs topDocs(int start, int howMany) { // In case pq was populated with sentinel values, there might be less // results than pq.size(). Therefore return all results until either // pq.size() or totalHits. int size = totalHits < pq.size() ? totalHits : pq.size(); // Don't bother to throw an exception, just return an empty TopDocs in case // the parameters are invalid or out of range. if (start < 0 || start >= size || howMany <= 0) { return newTopDocs(null, start); } // We know that start < pqsize, so just fix howMany. howMany = Math.min(size - start, howMany); ScoreDoc[] results = new ScoreDoc[howMany]; // pq's pop() returns the 'least' element in the queue, therefore need // to discard the first ones, until we reach the requested range. // Note that this loop will usually not be executed, since the common usage // should be that the caller asks for the last howMany results. However it's // needed here for completeness. for (int i = pq.size() - start - howMany; i > 0; i--) { pq.pop(); } // Get the requested results from pq. populateResults(results, howMany); return newTopDocs(results, start); } } lucene-2.9.4/src/java/org/apache/lucene/search/TermScorer.java0000644000175000017500000001656011474320225024653 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.TermDocs; /** Expert: A Scorer for documents matching a Term. */ final class TermScorer extends Scorer { private static final float[] SIM_NORM_DECODER = Similarity.getNormDecoder(); private Weight weight; private TermDocs termDocs; private byte[] norms; private float weightValue; private int doc = -1; private final int[] docs = new int[32]; // buffered doc numbers private final int[] freqs = new int[32]; // buffered term freqs private int pointer; private int pointerMax; private static final int SCORE_CACHE_SIZE = 32; private float[] scoreCache = new float[SCORE_CACHE_SIZE]; /** * Construct a TermScorer. * * @param weight * The weight of the Term in the query. * @param td * An iterator over the documents matching the Term. * @param similarity * The Similarity implementation to be used for score * computations. * @param norms * The field norms of the document fields for the Term. 
*/ TermScorer(Weight weight, TermDocs td, Similarity similarity, byte[] norms) { super(similarity); this.weight = weight; this.termDocs = td; this.norms = norms; this.weightValue = weight.getValue(); for (int i = 0; i < SCORE_CACHE_SIZE; i++) scoreCache[i] = getSimilarity().tf(i) * weightValue; } /** @deprecated use {@link #score(Collector)} instead. */ public void score(HitCollector hc) throws IOException { score(new HitCollectorWrapper(hc)); } public void score(Collector c) throws IOException { score(c, Integer.MAX_VALUE, nextDoc()); } /** @deprecated use {@link #score(Collector, int, int)} instead. */ protected boolean score(HitCollector c, int end) throws IOException { return score(new HitCollectorWrapper(c), end, doc); } // firstDocID is ignored since nextDoc() sets 'doc' protected boolean score(Collector c, int end, int firstDocID) throws IOException { c.setScorer(this); while (doc < end) { // for docs in window c.collect(doc); // collect score if (++pointer >= pointerMax) { pointerMax = termDocs.read(docs, freqs); // refill buffers if (pointerMax != 0) { pointer = 0; } else { termDocs.close(); // close stream doc = Integer.MAX_VALUE; // set to sentinel value return false; } } doc = docs[pointer]; } return true; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return doc; } public int docID() { return doc; } /** * Advances to the next document matching the query.
    * The iterator over the matching documents is buffered using * {@link TermDocs#read(int[],int[])}. * * @return true iff there is another document matching the query. * @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } /** * Advances to the next document matching the query.
    * The iterator over the matching documents is buffered using * {@link TermDocs#read(int[],int[])}. * * @return the document matching the query or -1 if there are no more documents. */ public int nextDoc() throws IOException { pointer++; if (pointer >= pointerMax) { pointerMax = termDocs.read(docs, freqs); // refill buffer if (pointerMax != 0) { pointer = 0; } else { termDocs.close(); // close stream return doc = NO_MORE_DOCS; } } doc = docs[pointer]; return doc; } public float score() { assert doc != -1; int f = freqs[pointer]; float raw = // compute tf(f)*weight f < SCORE_CACHE_SIZE // check cache ? scoreCache[f] // cache hit : getSimilarity().tf(f)*weightValue; // cache miss return norms == null ? raw : raw * SIM_NORM_DECODER[norms[doc] & 0xFF]; // normalize for field } /** * Skips to the first match beyond the current whose document number is * greater than or equal to a given target.
    * The implementation uses {@link TermDocs#skipTo(int)}. * * @param target * The target document number. * @return true iff there is such a match. * @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } /** * Advances to the first match beyond the current whose document number is * greater than or equal to a given target.
    * The implementation uses {@link TermDocs#skipTo(int)}. * * @param target * The target document number. * @return the matching document or -1 if none exist. */ public int advance(int target) throws IOException { // first scan in cache for (pointer++; pointer < pointerMax; pointer++) { if (docs[pointer] >= target) { return doc = docs[pointer]; } } // not found in cache, seek underlying stream boolean result = termDocs.skipTo(target); if (result) { pointerMax = 1; pointer = 0; docs[pointer] = doc = termDocs.doc(); freqs[pointer] = termDocs.freq(); } else { doc = NO_MORE_DOCS; } return doc; } /** Returns an explanation of the score for a document. *
    When this method is used, the {@link #next()} method * and the {@link #score(HitCollector)} method should not be used. * @param doc The document number for the explanation. */ public Explanation explain(int doc) throws IOException { TermQuery query = (TermQuery) weight.getQuery(); Explanation tfExplanation = new Explanation(); int tf = 0; while (pointer < pointerMax) { if (docs[pointer] == doc) tf = freqs[pointer]; pointer++; } if (tf == 0) { if (termDocs.skipTo(doc)) { if (termDocs.doc() == doc) { tf = termDocs.freq(); } } } termDocs.close(); tfExplanation.setValue(getSimilarity().tf(tf)); tfExplanation.setDescription("tf(termFreq("+query.getTerm()+")="+tf+")"); return tfExplanation; } /** Returns a string representation of this TermScorer. */ public String toString() { return "scorer(" + weight + ")"; } } lucene-2.9.4/src/java/org/apache/lucene/search/FuzzyTermEnum.java0000644000175000017500000002663411474320224025374 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import java.io.IOException; /** Subclass of FilteredTermEnum for enumerating all terms that are similar * to the specified filter term. * *

    Term enumerations are always ordered by Term.compareTo(). Each term in * the enumeration is greater than all that precede it. */ public final class FuzzyTermEnum extends FilteredTermEnum { /* This should be somewhere around the average long word. * If it is longer, we waste time and space. If it is shorter, we waste a * little bit of time growing the array as we encounter longer words. */ private static final int TYPICAL_LONGEST_WORD_IN_INDEX = 19; /* Allows us save time required to create a new array * every time similarity is called. */ private int[][] d; private float similarity; private boolean endEnum = false; private Term searchTerm = null; private final String field; private final String text; private final String prefix; private final float minimumSimilarity; private final float scale_factor; private final int[] maxDistances = new int[TYPICAL_LONGEST_WORD_IN_INDEX]; /** * Creates a FuzzyTermEnum with an empty prefix and a minSimilarity of 0.5f. *
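    *
    * A minimal enumeration sketch (the reader, field and term text are
    * illustrative assumptions):
    *
    *   FuzzyTermEnum fuzzyEnum = new FuzzyTermEnum(reader, new Term("contents", "lucene"));
    *   try {
    *     do {
    *       Term t = fuzzyEnum.term();
    *       if (t != null) System.out.println(t.text() + " " + fuzzyEnum.difference());
    *     } while (fuzzyEnum.next());
    *   } finally {
    *     fuzzyEnum.close();
    *   }
    *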

    * After calling the constructor the enumeration is already pointing to the first * valid term if such a term exists. * * @param reader * @param term * @throws IOException * @see #FuzzyTermEnum(IndexReader, Term, float, int) */ public FuzzyTermEnum(IndexReader reader, Term term) throws IOException { this(reader, term, FuzzyQuery.defaultMinSimilarity, FuzzyQuery.defaultPrefixLength); } /** * Creates a FuzzyTermEnum with an empty prefix. *

    * After calling the constructor the enumeration is already pointing to the first * valid term if such a term exists. * * @param reader * @param term * @param minSimilarity * @throws IOException * @see #FuzzyTermEnum(IndexReader, Term, float, int) */ public FuzzyTermEnum(IndexReader reader, Term term, float minSimilarity) throws IOException { this(reader, term, minSimilarity, FuzzyQuery.defaultPrefixLength); } /** * Constructor for enumeration of all terms from specified reader which share a prefix of * length prefixLength with term and which have a fuzzy similarity > * minSimilarity. *

    * After calling the constructor the enumeration is already pointing to the first * valid term if such a term exists. * * @param reader Delivers terms. * @param term Pattern term. * @param minSimilarity Minimum required similarity for terms from the reader. Default value is 0.5f. * @param prefixLength Length of required common prefix. Default value is 0. * @throws IOException */ public FuzzyTermEnum(IndexReader reader, Term term, final float minSimilarity, final int prefixLength) throws IOException { super(); if (minSimilarity >= 1.0f) throw new IllegalArgumentException("minimumSimilarity cannot be greater than or equal to 1"); else if (minSimilarity < 0.0f) throw new IllegalArgumentException("minimumSimilarity cannot be less than 0"); if(prefixLength < 0) throw new IllegalArgumentException("prefixLength cannot be less than 0"); this.minimumSimilarity = minSimilarity; this.scale_factor = 1.0f / (1.0f - minimumSimilarity); this.searchTerm = term; this.field = searchTerm.field(); //The prefix could be longer than the word. //It's kind of silly though. It means we must match the entire word. final int fullSearchTermLength = searchTerm.text().length(); final int realPrefixLength = prefixLength > fullSearchTermLength ? fullSearchTermLength : prefixLength; this.text = searchTerm.text().substring(realPrefixLength); this.prefix = searchTerm.text().substring(0, realPrefixLength); initializeMaxDistances(); this.d = initDistanceArray(); setEnum(reader.terms(new Term(searchTerm.field(), prefix))); } /** * The termCompare method in FuzzyTermEnum uses Levenshtein distance to * calculate the distance between the given term and the comparing term. */ protected final boolean termCompare(Term term) { if (field == term.field() && term.text().startsWith(prefix)) { final String target = term.text().substring(prefix.length()); this.similarity = similarity(target); return (similarity > minimumSimilarity); } endEnum = true; return false; } public final float difference() { return (float)((similarity - minimumSimilarity) * scale_factor); } public final boolean endEnum() { return endEnum; } /****************************** * Compute Levenshtein distance ******************************/ /** * Finds and returns the smallest of three integers */ private static final int min(int a, int b, int c) { final int t = (a < b) ? a : b; return (t < c) ? t : c; } private final int[][] initDistanceArray(){ return new int[this.text.length() + 1][TYPICAL_LONGEST_WORD_IN_INDEX]; } /** *

    Similarity returns a number that is 1.0f or less (including negative numbers) * based on how similar the Term is compared to a target term. It returns * exactly 0.0f when *

       *    editDistance > maximumEditDistance
    * Otherwise it returns: *
       *    1 - (editDistance / length)
    * where length is the length of the shorter term (text or target), including the * identical prefix, and editDistance is the Levenshtein distance between * the two words.

    * *

    Embedded within this algorithm is a fail-fast Levenshtein distance * algorithm. The fail-fast algorithm differs from the standard Levenshtein * distance algorithm in that it is aborted if it is discovered that the * minimum distance between the words is greater than some threshold. * *

    To calculate the maximum distance threshold we use the following formula: *

       *     (1 - minimumSimilarity) * length
    * where length is the shortest term including any prefix that is not part of the * similarity comparison. This formula was derived by solving for what maximum value * of distance returns false for the following statements: *
       *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
       *   return (similarity > minimumSimilarity);
    * where distance is the Levenshtein distance for the two words. *
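    * As a hypothetical example: with minimumSimilarity = 0.5, an empty prefix and a 6-character search
    * term, the threshold is (1 - 0.5) * 6 = 3 edits. A candidate term whose length alone already differs
    * from the search term's length by more than 3 characters can therefore be rejected without computing
    * the full Levenshtein distance.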

    *

    Levenshtein distance (also known as edit distance) is a measure of similarity * between two strings where the distance is measured as the number of character * deletions, insertions or substitutions required to transform one string to * the other string. * @param target the target word or phrase * @return the similarity, 0.0 or less indicates that it matches less than the required * threshold and 1.0 indicates that the text and target are identical */ private float similarity(final String target) { final int m = target.length(); final int n = text.length(); if (n == 0) { //we don't have anything to compare. That means if we just add //the letters for m we get the new word return prefix.length() == 0 ? 0.0f : 1.0f - ((float) m / prefix.length()); } if (m == 0) { return prefix.length() == 0 ? 0.0f : 1.0f - ((float) n / prefix.length()); } final int maxDistance = getMaxDistance(m); if (maxDistance < Math.abs(m-n)) { //just adding the characters of m to n or vice-versa results in //too many edits //for example "pre" length is 3 and "prefixes" length is 8. We can see that //given this optimal circumstance, the edit distance cannot be less than 5. //which is 8-3 or more precisely Math.abs(3-8). //if our maximum edit distance is 4, then we can discard this word //without looking at it. return 0.0f; } //let's make sure we have enough room in our array to do the distance calculations. if (d[0].length <= m) { growDistanceArray(m); } // init matrix d for (int i = 0; i <= n; i++) d[i][0] = i; for (int j = 0; j <= m; j++) d[0][j] = j; // start computing edit distance for (int i = 1; i <= n; i++) { int bestPossibleEditDistance = m; final char s_i = text.charAt(i - 1); for (int j = 1; j <= m; j++) { if (s_i != target.charAt(j-1)) { d[i][j] = min(d[i-1][j], d[i][j-1], d[i-1][j-1])+1; } else { d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]); } bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i][j]); } //After calculating row i, the best possible edit distance //can be found by found by finding the smallest value in a given column. //If the bestPossibleEditDistance is greater than the max distance, abort. if (i > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater //the closest the target can be to the text is just too far away. //this target is leaving the party early. return 0.0f; } } // this will return less than 0.0 when the edit distance is // greater than the number of characters in the shorter word. // but this was the formula that was previously used in FuzzyTermEnum, // so it has not been changed (even though minimumSimilarity must be // greater than 0.0) return 1.0f - ((float)d[n][m] / (float) (prefix.length() + Math.min(n, m))); } /** * Grow the second dimension of the array, so that we can calculate the * Levenshtein difference. */ private void growDistanceArray(int m) { for (int i = 0; i < d.length; i++) { d[i] = new int[m+1]; } } /** * The max Distance is the maximum Levenshtein distance for the text * compared to some other value that results in score that is * better than the minimum similarity. * @param m the length of the "other value" * @return the maximum levenshtein distance that we care about */ private final int getMaxDistance(int m) { return (m < maxDistances.length) ? 
maxDistances[m] : calculateMaxDistance(m); } private void initializeMaxDistances() { for (int i = 0; i < maxDistances.length; i++) { maxDistances[i] = calculateMaxDistance(i); } } private int calculateMaxDistance(int m) { return (int) ((1-minimumSimilarity) * (Math.min(text.length(), m) + prefix.length())); } public void close() throws IOException { super.close(); //call super.close() and let the garbage collector do its work. } } lucene-2.9.4/src/java/org/apache/lucene/search/TimeLimitedCollector.java0000644000175000017500000001636711474320224026647 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** *

    * The TimeLimitedCollector is used to time out search requests that take longer * than the maximum allowed search time limit. After this time is exceeded, the * search thread is stopped by throwing a {@link TimeExceededException}. *
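    * A minimal usage sketch (the searcher, the query and the 1000 ms budget are illustrative
    * assumptions, not part of this class):
    *
    *   TopDocCollector baseCollector = new TopDocCollector(10);
    *   TimeLimitedCollector collector = new TimeLimitedCollector(baseCollector, 1000);
    *   try {
    *     searcher.search(query, collector);
    *   } catch (TimeLimitedCollector.TimeExceededException e) {
    *     // hits gathered before the timeout are still available in baseCollector
    *     System.out.println("stopped after " + e.getTimeElapsed() + " ms at doc " + e.getLastDocCollected());
    *   }
    *   TopDocs hits = baseCollector.topDocs();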

    * * @deprecated Use {@link TimeLimitingCollector} instead, which extends the new * {@link Collector}. This class will be removed in 3.0. */ public class TimeLimitedCollector extends HitCollector { /** * Default timer resolution. * @see #setResolution(long) */ public static final int DEFAULT_RESOLUTION = 20; /** * Default for {@link #isGreedy()}. * @see #isGreedy() */ public boolean DEFAULT_GREEDY = false; private static long resolution = DEFAULT_RESOLUTION; private boolean greedy = DEFAULT_GREEDY ; private static class TimerThread extends Thread { // NOTE: we can avoid explicit synchronization here for several reasons: // * updates to volatile long variables are atomic // * only single thread modifies this value // * use of volatile keyword ensures that it does not reside in // a register, but in main memory (so that changes are visible to // other threads). // * visibility of changes does not need to be instantaneous, we can // afford losing a tick or two. // // See section 17 of the Java Language Specification for details. private volatile long time = 0; /** * TimerThread provides a pseudo-clock service to all searching * threads, so that they can count elapsed time with less overhead * than repeatedly calling System.currentTimeMillis. A single * thread should be created to be used for all searches. */ private TimerThread() { super("TimeLimitedCollector timer thread"); this.setDaemon( true ); } public void run() { while( true ) { // TODO: Use System.nanoTime() when Lucene moves to Java SE 5. time += resolution; try { Thread.sleep( resolution ); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } } /** * Get the timer value in milliseconds. */ public long getMilliseconds() { return time; } } /** * Thrown when elapsed search time exceeds allowed search time. */ public static class TimeExceededException extends RuntimeException { private long timeAllowed; private long timeElapsed; private int lastDocCollected; private TimeExceededException(long timeAllowed, long timeElapsed, int lastDocCollected) { super("Elapsed time: " + timeElapsed + "Exceeded allowed search time: " + timeAllowed + " ms."); this.timeAllowed = timeAllowed; this.timeElapsed = timeElapsed; this.lastDocCollected = lastDocCollected; } /** * Returns allowed time (milliseconds). */ public long getTimeAllowed() { return timeAllowed; } /** * Returns elapsed time (milliseconds). */ public long getTimeElapsed() { return timeElapsed; } /** * Returns last doc that was collected when the search time exceeded. */ public int getLastDocCollected() { return lastDocCollected; } } // Declare and initialize a single static timer thread to be used by // all TimeLimitedCollector instances. The JVM assures that // this only happens once. private final static TimerThread TIMER_THREAD = new TimerThread(); static { TIMER_THREAD.start(); } private final long t0; private final long timeout; private final HitCollector hc; /** * Create a TimeLimitedCollector wrapper over another HitCollector with a specified timeout. * @param hc the wrapped HitCollector * @param timeAllowed max time allowed for collecting hits after which {@link TimeExceededException} is thrown */ public TimeLimitedCollector(final HitCollector hc, final long timeAllowed) { this.hc = hc; t0 = TIMER_THREAD.getMilliseconds(); this.timeout = t0 + timeAllowed; } /** * Calls collect() on the decorated HitCollector. * * @throws TimeExceededException if the time allowed has been exceeded. 
*/ public void collect( final int doc, final float score ) { long time = TIMER_THREAD.getMilliseconds(); if( timeout < time) { if (greedy) { //System.out.println(this+" greedy: before failing, collecting doc: "+doc+" "+(time-t0)); hc.collect( doc, score ); } //System.out.println(this+" failing on: "+doc+" "+(time-t0)); throw new TimeExceededException( timeout-t0, time-t0, doc ); } //System.out.println(this+" collecting: "+doc+" "+(time-t0)); hc.collect( doc, score ); } /** * Return the timer resolution. * @see #setResolution(long) */ public static long getResolution() { return resolution; } /** * Set the timer resolution. * The default timer resolution is 20 milliseconds. * This means that a search required to take no longer than * 800 milliseconds may be stopped after 780 to 820 milliseconds. *
    Note that: *
      • Finer (smaller) resolution is more accurate but less efficient. *
      • Setting resolution to less than 5 milliseconds will be silently raised to 5 milliseconds. *
      • Setting resolution smaller than the current resolution might take effect only after the current * resolution interval has elapsed. (If the current resolution of 20 milliseconds is changed to 5 milliseconds, * it can take up to 20 milliseconds for the change to take effect.) *
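      For example (a sketch; the chosen value is purely illustrative):
        TimeLimitedCollector.setResolution(10);   // tighter timeout accuracy at slightly higher polling cost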
    */ public static void setResolution(long newResolution) { resolution = Math.max(newResolution,5); // 5 milliseconds is about the minimum reasonable time for a Object.wait(long) call. } /** * Checks if this time limited collector is greedy in collecting the last hit. * A non greedy collector, upon a timeout, would throw a {@link TimeExceededException} * without allowing the wrapped collector to collect current doc. A greedy one would * first allow the wrapped hit collector to collect current doc and only then * throw a {@link TimeExceededException}. * @see #setGreedy(boolean) */ public boolean isGreedy() { return greedy; } /** * Sets whether this time limited collector is greedy. * @param greedy true to make this time limited greedy * @see #isGreedy() */ public void setGreedy(boolean greedy) { this.greedy = greedy; } } lucene-2.9.4/src/java/org/apache/lucene/search/ParallelMultiSearcher.java0000644000175000017500000002305611474320224027007 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.Term; import org.apache.lucene.index.IndexReader; import org.apache.lucene.util.PriorityQueue; /** Implements parallel search over a set of Searchables. * *

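    For example, searching two index shards in parallel might look like this sketch (the index
    paths, field name and query term are illustrative assumptions):

      Searchable[] shards = new Searchable[] {
          new IndexSearcher("/path/to/index1"),
          new IndexSearcher("/path/to/index2")
      };
      ParallelMultiSearcher searcher = new ParallelMultiSearcher(shards);
      TopDocs hits = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);
      searcher.close();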
    Applications usually need only call the inherited {@link #search(Query)} * or {@link #search(Query,Filter)} methods. */ public class ParallelMultiSearcher extends MultiSearcher { private Searchable[] searchables; private int[] starts; /** Creates a searchable which searches searchables. */ public ParallelMultiSearcher(Searchable[] searchables) throws IOException { super(searchables); this.searchables = searchables; this.starts = getStarts(); } /** * TODO: parallelize this one too */ public int docFreq(Term term) throws IOException { return super.docFreq(term); } /** * A search implementation which spans a new thread for each * Searchable, waits for each search to complete and merge * the results back together. */ public TopDocs search(Weight weight, Filter filter, int nDocs) throws IOException { HitQueue hq = new HitQueue(nDocs, false); int totalHits = 0; MultiSearcherThread[] msta = new MultiSearcherThread[searchables.length]; for (int i = 0; i < searchables.length; i++) { // search each searchable // Assume not too many searchables and cost of creating a thread is by far inferior to a search msta[i] = new MultiSearcherThread(searchables[i], weight, filter, nDocs, hq, i, starts, "MultiSearcher thread #" + (i + 1)); msta[i].start(); } for (int i = 0; i < searchables.length; i++) { try { msta[i].join(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } IOException ioe = msta[i].getIOException(); if (ioe == null) { totalHits += msta[i].hits(); } else { // if one search produced an IOException, rethrow it throw ioe; } } ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size() - 1; i >= 0; i--) // put docs in array scoreDocs[i] = (ScoreDoc) hq.pop(); float maxScore = (totalHits==0) ? Float.NEGATIVE_INFINITY : scoreDocs[0].score; return new TopDocs(totalHits, scoreDocs, maxScore); } /** * A search implementation allowing sorting which spans a new thread for each * Searchable, waits for each search to complete and merges * the results back together. */ public TopFieldDocs search(Weight weight, Filter filter, int nDocs, Sort sort) throws IOException { // don't specify the fields - we'll wait to do this until we get results FieldDocSortedHitQueue hq = new FieldDocSortedHitQueue (null, nDocs); int totalHits = 0; MultiSearcherThread[] msta = new MultiSearcherThread[searchables.length]; for (int i = 0; i < searchables.length; i++) { // search each searchable // Assume not too many searchables and cost of creating a thread is by far inferior to a search msta[i] = new MultiSearcherThread(searchables[i], weight, filter, nDocs, hq, sort, i, starts, "MultiSearcher thread #" + (i + 1)); msta[i].start(); } float maxScore=Float.NEGATIVE_INFINITY; for (int i = 0; i < searchables.length; i++) { try { msta[i].join(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } IOException ioe = msta[i].getIOException(); if (ioe == null) { totalHits += msta[i].hits(); maxScore=Math.max(maxScore, msta[i].getMaxScore()); } else { // if one search produced an IOException, rethrow it throw ioe; } } ScoreDoc[] scoreDocs = new ScoreDoc[hq.size()]; for (int i = hq.size() - 1; i >= 0; i--) // put docs in array scoreDocs[i] = (ScoreDoc) hq.pop(); return new TopFieldDocs(totalHits, scoreDocs, hq.getFields(), maxScore); } /** Lower-level search API. * *

    {@link Collector#collect(int)} is called for every matching document. * *

    Applications should only use this if they need all of the * matching documents. The high-level search API ({@link * Searcher#search(Query)}) is usually more efficient, as it skips * non-high-scoring hits. * * @param weight to match documents * @param filter if non-null, a bitset used to eliminate some documents * @param collector to receive hits * * TODO: parallelize this one too */ public void search(Weight weight, Filter filter, final Collector collector) throws IOException { for (int i = 0; i < searchables.length; i++) { final int start = starts[i]; final Collector hc = new Collector() { public void setScorer(Scorer scorer) throws IOException { collector.setScorer(scorer); } public void collect(int doc) throws IOException { collector.collect(doc); } public void setNextReader(IndexReader reader, int docBase) throws IOException { collector.setNextReader(reader, start + docBase); } public boolean acceptsDocsOutOfOrder() { return collector.acceptsDocsOutOfOrder(); } }; searchables[i].search(weight, filter, hc); } } /* * TODO: this one could be parallelized too * @see org.apache.lucene.search.Searchable#rewrite(org.apache.lucene.search.Query) */ public Query rewrite(Query original) throws IOException { return super.rewrite(original); } } /** * A thread subclass for searching a single searchable */ class MultiSearcherThread extends Thread { private Searchable searchable; private Weight weight; private Filter filter; private int nDocs; private TopDocs docs; private int i; private PriorityQueue hq; private int[] starts; private IOException ioe; private Sort sort; public MultiSearcherThread(Searchable searchable, Weight weight, Filter filter, int nDocs, HitQueue hq, int i, int[] starts, String name) { super(name); this.searchable = searchable; this.weight = weight; this.filter = filter; this.nDocs = nDocs; this.hq = hq; this.i = i; this.starts = starts; } public MultiSearcherThread(Searchable searchable, Weight weight, Filter filter, int nDocs, FieldDocSortedHitQueue hq, Sort sort, int i, int[] starts, String name) { super(name); this.searchable = searchable; this.weight = weight; this.filter = filter; this.nDocs = nDocs; this.hq = hq; this.i = i; this.starts = starts; this.sort = sort; } public void run() { try { docs = (sort == null) ? searchable.search (weight, filter, nDocs) : searchable.search (weight, filter, nDocs, sort); } // Store the IOException for later use by the caller of this thread catch (IOException ioe) { this.ioe = ioe; } if (ioe == null) { // if we are sorting by fields, we need to tell the field sorted hit queue // the actual type of fields, in case the original list contained AUTO. // if the searchable returns null for fields, we'll have problems. if (sort != null) { TopFieldDocs docsFields = (TopFieldDocs) docs; // If one of the Sort fields is FIELD_DOC, need to fix its values, so that // it will break ties by doc Id properly. Otherwise, it will compare to // 'relative' doc Ids, that belong to two different searchables. 
for (int j = 0; j < docsFields.fields.length; j++) { if (docsFields.fields[j].getType() == SortField.DOC) { // iterate over the score docs and change their fields value for (int j2 = 0; j2 < docs.scoreDocs.length; j2++) { FieldDoc fd = (FieldDoc) docs.scoreDocs[j2]; fd.fields[j] = new Integer(((Integer) fd.fields[j]).intValue() + starts[i]); } break; } } ((FieldDocSortedHitQueue) hq).setFields(docsFields.fields); } ScoreDoc[] scoreDocs = docs.scoreDocs; for (int j = 0; j < scoreDocs.length; j++) { // merge scoreDocs into hq ScoreDoc scoreDoc = scoreDocs[j]; scoreDoc.doc += starts[i]; // convert doc //it would be so nice if we had a thread-safe insert synchronized (hq) { if (!hq.insert(scoreDoc)) break; } // no more scores > minScore } } } public int hits() { return docs.totalHits; } public float getMaxScore() { return docs.getMaxScore(); } public IOException getIOException() { return ioe; } } lucene-2.9.4/src/java/org/apache/lucene/search/ScoreCachingWrappingScorer.java0000644000175000017500000000601211474320224027772 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * A {@link Scorer} which wraps another scorer and caches the score of the * current document. Successive calls to {@link #score()} will return the same * result and will not invoke the wrapped Scorer's score() method, unless the * current document has changed.
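    * For instance, a collector that reads the score more than once per document might wrap the
    * scorer it receives like this sketch (the surrounding Collector and its scorer field are assumed):
    *
    *   public void setScorer(Scorer scorer) {
    *     // cache each document's score so repeated score() calls don't recompute it
    *     this.scorer = new ScoreCachingWrappingScorer(scorer);
    *   }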
    * This class might be useful due to the changes done to the {@link Collector} * interface, in which the score is not computed for a document by default, only * if the collector requests it. Some collectors may need to use the score in * several places, however all they have in hand is a {@link Scorer} object, and * might end up computing the score of a document more than once. */ public class ScoreCachingWrappingScorer extends Scorer { private Scorer scorer; private int curDoc = -1; private float curScore; /** Creates a new instance by wrapping the given scorer. */ public ScoreCachingWrappingScorer(Scorer scorer) { super(scorer.getSimilarity()); this.scorer = scorer; } protected boolean score(Collector collector, int max, int firstDocID) throws IOException { return scorer.score(collector, max, firstDocID); } public Similarity getSimilarity() { return scorer.getSimilarity(); } public Explanation explain(int doc) throws IOException { return scorer.explain(doc); } public float score() throws IOException { int doc = scorer.docID(); if (doc != curDoc) { curScore = scorer.score(); curDoc = doc; } return curScore; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return scorer.doc(); } public int docID() { return scorer.docID(); } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return scorer.next(); } public int nextDoc() throws IOException { return scorer.nextDoc(); } public void score(Collector collector) throws IOException { scorer.score(collector); } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return scorer.skipTo(target); } public int advance(int target) throws IOException { return scorer.advance(target); } } lucene-2.9.4/src/java/org/apache/lucene/search/Collector.java0000644000175000017500000001541211474320224024506 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** *

    Expert: Collectors are primarily meant to be used to * gather raw results from a search, and implement sorting * or custom result filtering, collation, etc.

    * *

    As of 2.9, this class replaces the deprecated * HitCollector, and offers an API for efficient collection * of hits across sequential {@link IndexReader}s. {@link * IndexSearcher} advances the collector through each of the * sub readers, in an arbitrary order. This results in a * higher performance means of collection.

    * *

    Lucene's core collectors are derived from Collector. * Likely your application can use one of these classes, or * subclass {@link TopDocsCollector}, instead of * implementing Collector directly: * *

      * *
    • {@link TopDocsCollector} is an abstract base class * that assumes you will retrieve the top N docs, * according to some criteria, after collection is * done. *
    • {@link TopScoreDocCollector} is a concrete subclass of * {@link TopDocsCollector} and sorts according to score + * docID. This is used internally by the {@link * IndexSearcher} search methods that do not take an * explicit {@link Sort}. It is likely the most frequently * used collector. *
    • {@link TopFieldCollector} subclasses {@link * TopDocsCollector} and sorts according to a specified * {@link Sort} object (sort by field). This is used * internally by the {@link IndexSearcher} search methods * that take an explicit {@link Sort}. *
    • {@link TimeLimitingCollector}, which wraps any other * Collector and aborts the search if it's taken too much * time, will subclass Collector in 3.0 (presently it * subclasses the deprecated HitCollector). *
    • {@link PositiveScoresOnlyCollector} wraps any other * Collector and prevents collection of hits whose score * is <= 0.0. *
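    For instance, collecting the top 10 hits by score with {@link TopScoreDocCollector} might look
    like this sketch (the searcher and query are assumptions):

      TopScoreDocCollector collector = TopScoreDocCollector.create(10, true);
      searcher.search(query, collector);
      ScoreDoc[] hits = collector.topDocs().scoreDocs;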
    * *

    Collector decouples the score from the collected doc: * the score computation is skipped entirely if it's not * needed. Collectors that do need the score should * implement the {@link #setScorer} method, to hold onto the * passed {@link Scorer} instance, and call {@link * Scorer#score()} within the collect method to compute the * current hit's score. If your collector may request the * score for a single hit multiple times, you should use * {@link ScoreCachingWrappingScorer}.
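    For example, a collector that only tracks the best score might hold onto the scorer like this
    sketch (the searcher and query are assumptions):

      final float[] best = new float[] { Float.NEGATIVE_INFINITY };
      searcher.search(query, new Collector() {
        private Scorer scorer;
        public void setScorer(Scorer scorer) { this.scorer = scorer; }
        public void collect(int doc) throws IOException {
          best[0] = Math.max(best[0], scorer.score());   // score of the current hit
        }
        public void setNextReader(IndexReader reader, int docBase) { }
        public boolean acceptsDocsOutOfOrder() { return true; }
      });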

    * *

    NOTE: The doc that is passed to the collect * method is relative to the current reader. If your * collector needs to resolve this to the docID space of the * Multi*Reader, you must re-base it by recording the * docBase from the most recent setNextReader call. Here's * a simple example showing how to collect docIDs into a * BitSet:

    * *
     * Searcher searcher = new IndexSearcher(indexReader);
     * final BitSet bits = new BitSet(indexReader.maxDoc());
     * searcher.search(query, new Collector() {
     *   private int docBase;
     * 
     *   // ignore scorer
     *   public void setScorer(Scorer scorer) {
     *   }
     *
     *   // accept docs out of order (for a BitSet it doesn't matter)
     *   public boolean acceptsDocsOutOfOrder() {
     *     return true;
     *   }
     * 
     *   public void collect(int doc) {
     *     bits.set(doc + docBase);
     *   }
     * 
     *   public void setNextReader(IndexReader reader, int docBase) {
     *     this.docBase = docBase;
     *   }
     * });
     * 
    * *

    Not all collectors will need to rebase the docID. For * example, a collector that simply counts the total number * of hits would skip it.

    * *

    NOTE: Prior to 2.9, Lucene silently filtered * out hits with score <= 0. As of 2.9, the core Collectors * no longer do that. It's very unusual to have such hits * (a negative query boost, or function query returning * negative custom scores, could cause it to happen). If * you need that behavior, use {@link * PositiveScoresOnlyCollector}.
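    If that filtering is needed, a wrapper is enough (a sketch; "myCollector" is an assumed existing collector):

      Collector c = new PositiveScoresOnlyCollector(myCollector);   // drops hits whose score is <= 0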

    * *

    NOTE: This API is experimental and might change * in incompatible ways in the next release.

    * * @since 2.9 */ public abstract class Collector { /** * Called before successive calls to {@link #collect(int)}. Implementations * that need the score of the current document (passed-in to * {@link #collect(int)}), should save the passed-in Scorer and call * scorer.score() when needed. */ public abstract void setScorer(Scorer scorer) throws IOException; /** * Called once for every document matching a query, with the unbased document * number. * *

    * Note: This is called in an inner search loop. For good search performance, * implementations of this method should not call {@link Searcher#doc(int)} or * {@link org.apache.lucene.index.IndexReader#document(int)} on every hit. * Doing so can slow searches by an order of magnitude or more. */ public abstract void collect(int doc) throws IOException; /** * Called before collecting from each IndexReader. All doc ids in * {@link #collect(int)} will correspond to reader. * * Add docBase to the current IndexReaders internal document id to re-base ids * in {@link #collect(int)}. * * @param reader * next IndexReader * @param docBase */ public abstract void setNextReader(IndexReader reader, int docBase) throws IOException; /** * Return true if this collector does not * require the matching docIDs to be delivered in int sort * order (smallest to largest) to {@link #collect}. * *

    Most Lucene Query implementations will visit * matching docIDs in order. However, some queries * (currently limited to certain cases of {@link * BooleanQuery}) can achieve faster searching if the * Collector allows them to deliver the * docIDs out of order.

    * *

    Many collectors don't mind getting docIDs out of * order, so it's important to return true * here. */ public abstract boolean acceptsDocsOutOfOrder(); } lucene-2.9.4/src/java/org/apache/lucene/search/QueryTermVector.java0000644000175000017500000001061211474320224025675 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.index.TermFreqVector; /** * * **/ public class QueryTermVector implements TermFreqVector { private String [] terms = new String[0]; private int [] termFreqs = new int[0]; public String getField() { return null; } /** * * @param queryTerms The original list of terms from the query, can contain duplicates */ public QueryTermVector(String [] queryTerms) { processTerms(queryTerms); } public QueryTermVector(String queryString, Analyzer analyzer) { if (analyzer != null) { TokenStream stream = analyzer.tokenStream("", new StringReader(queryString)); if (stream != null) { List terms = new ArrayList(); try { boolean hasMoreTokens = false; stream.reset(); TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); hasMoreTokens = stream.incrementToken(); while (hasMoreTokens) { terms.add(termAtt.term()); hasMoreTokens = stream.incrementToken(); } processTerms((String[])terms.toArray(new String[terms.size()])); } catch (IOException e) { } } } } private void processTerms(String[] queryTerms) { if (queryTerms != null) { Arrays.sort(queryTerms); Map tmpSet = new HashMap(queryTerms.length); //filter out duplicates List tmpList = new ArrayList(queryTerms.length); List tmpFreqs = new ArrayList(queryTerms.length); int j = 0; for (int i = 0; i < queryTerms.length; i++) { String term = queryTerms[i]; Integer position = (Integer)tmpSet.get(term); if (position == null) { tmpSet.put(term, new Integer(j++)); tmpList.add(term); tmpFreqs.add(new Integer(1)); } else { Integer integer = (Integer)tmpFreqs.get(position.intValue()); tmpFreqs.set(position.intValue(), new Integer(integer.intValue() + 1)); } } terms = (String[])tmpList.toArray(terms); //termFreqs = (int[])tmpFreqs.toArray(termFreqs); termFreqs = new int[tmpFreqs.size()]; int i = 0; for (Iterator iter = tmpFreqs.iterator(); iter.hasNext();) { Integer integer = (Integer) iter.next(); termFreqs[i++] = integer.intValue(); } } } public final String toString() { StringBuffer sb = new StringBuffer(); sb.append('{'); for (int i=0; i0) sb.append(", "); 
sb.append(terms[i]).append('/').append(termFreqs[i]); } sb.append('}'); return sb.toString(); } public int size() { return terms.length; } public String[] getTerms() { return terms; } public int[] getTermFrequencies() { return termFreqs; } public int indexOf(String term) { int res = Arrays.binarySearch(terms, term); return res >= 0 ? res : -1; } public int[] indexesOf(String[] terms, int start, int len) { int res[] = new int[len]; for (int i=0; i < len; i++) { res[i] = indexOf(terms[i]); } return res; } } lucene-2.9.4/src/java/org/apache/lucene/search/TopFieldDocs.java0000644000175000017500000000274411474320224025103 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Represents hits returned by {@link * Searcher#search(Query,Filter,int,Sort)}. */ public class TopFieldDocs extends TopDocs { /** The fields which were used to sort results by. */ public SortField[] fields; /** Creates one of these objects. * @param totalHits Total number of hits for the query. * @param scoreDocs The top hits for the query. * @param fields The sort criteria used to find the top hits. * @param maxScore The maximum score encountered. */ public TopFieldDocs (int totalHits, ScoreDoc[] scoreDocs, SortField[] fields, float maxScore) { super (totalHits, scoreDocs, maxScore); this.fields = fields; } }lucene-2.9.4/src/java/org/apache/lucene/search/ComplexExplanation.java0000644000175000017500000000442411474320224026373 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Expert: Describes the score computation for document and query, and * can distinguish a match independent of a positive value. */ public class ComplexExplanation extends Explanation { private Boolean match; public ComplexExplanation() { super(); } public ComplexExplanation(boolean match, float value, String description) { // NOTE: use of "boolean" instead of "Boolean" in params is conscious // choice to encourage clients to be specific. 
super(value, description); this.match = Boolean.valueOf(match); } /** * The match status of this explanation node. * @return May be null if match status is unknown */ public Boolean getMatch() { return match; } /** * Sets the match status assigned to this explanation node. * @param match May be null if match status is unknown */ public void setMatch(Boolean match) { this.match = match; } /** * Indicates whether or not this Explanation models a good match. * *

    * If the match status is explicitly set (i.e.: not null) this method * uses it; otherwise it defers to the superclass. *
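    * A small sketch of the distinction (the value and description are illustrative):
    *
    *   ComplexExplanation e = new ComplexExplanation(true, 0.0f, "matched by filter");
    *   // e.isMatch() is true even though the value is not positive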

    * @see #getMatch */ public boolean isMatch() { Boolean m = getMatch(); return (null != m ? m.booleanValue() : super.isMatch()); } protected String getSummary() { if (null == getMatch()) return super.getSummary(); return getValue() + " = " + (isMatch() ? "(MATCH) " : "(NON-MATCH) ") + getDescription(); } } lucene-2.9.4/src/java/org/apache/lucene/search/SpanFilterResult.java0000644000175000017500000000606311474320224026030 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.BitSet; import java.util.List; /** * The results of a SpanQueryFilter. Wraps the BitSet and the position information from the SpanQuery * *

    * NOTE: This API is still experimental and subject to change. * **/ public class SpanFilterResult { /** @deprecated */ private BitSet bits; private DocIdSet docIdSet; private List positions;//Spans spans; /** * * @param bits The bits for the Filter * @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects * @deprecated Use {@link #SpanFilterResult(DocIdSet, List)} instead */ public SpanFilterResult(BitSet bits, List positions) { this.bits = bits; this.positions = positions; } /** * * @param docIdSet The DocIdSet for the Filter * @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects */ public SpanFilterResult(DocIdSet docIdSet, List positions) { this.docIdSet = docIdSet; this.positions = positions; } /** * The first entry in the array corresponds to the first "on" bit. * Entries are increasing by document order * @return A List of PositionInfo objects */ public List getPositions() { return positions; } /** * @deprecated Use {@link #getDocIdSet()} */ public BitSet getBits() { return bits; } /** Returns the docIdSet */ public DocIdSet getDocIdSet() { return docIdSet; } public static class PositionInfo { private int doc; private List positions; public PositionInfo(int doc) { this.doc = doc; positions = new ArrayList(); } public void addPosition(int start, int end) { positions.add(new StartEnd(start, end)); } public int getDoc() { return doc; } /** * * @return A List of {@link org.apache.lucene.search.SpanFilterResult.StartEnd} objects */ public List getPositions() { return positions; } } public static class StartEnd { private int start; private int end; public StartEnd(int start, int end) { this.start = start; this.end = end; } /** * * @return The end position of this match */ public int getEnd() { return end; } /** * The Start position * @return The start position of this match */ public int getStart() { return start; } } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldDocSortedHitQueue.java0000644000175000017500000001457111474320224027071 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.PriorityQueue; import java.text.Collator; import java.util.Locale; /** * Expert: Collects sorted results from Searchable's and collates them. * The elements put into this queue must be of type FieldDoc. * *

    Created: Feb 11, 2004 2:04:21 PM * * @since lucene 1.4 * @version $Id: FieldDocSortedHitQueue.java 695514 2008-09-15 15:42:11Z otis $ */ class FieldDocSortedHitQueue extends PriorityQueue { // this cannot contain AUTO fields - any AUTO fields should // have been resolved by the time this class is used. volatile SortField[] fields; // used in the case where the fields are sorted by locale // based strings volatile Collator[] collators; /** * Creates a hit queue sorted by the given list of fields. * @param fields Fieldable names, in priority order (highest priority first). * @param size The number of hits to retain. Must be greater than zero. */ FieldDocSortedHitQueue (SortField[] fields, int size) { this.fields = fields; this.collators = hasCollators (fields); initialize (size); } /** * Allows redefinition of sort fields if they are null. * This is to handle the case using ParallelMultiSearcher where the * original list contains AUTO and we don't know the actual sort * type until the values come back. The fields can only be set once. * This method is thread safe. * @param fields */ synchronized void setFields (SortField[] fields) { if (this.fields == null) { this.fields = fields; this.collators = hasCollators (fields); } } /** Returns the fields being used to sort. */ SortField[] getFields() { return fields; } /** Returns an array of collators, possibly null. The collators * correspond to any SortFields which were given a specific locale. * @param fields Array of sort fields. * @return Array, possibly null. */ private Collator[] hasCollators (final SortField[] fields) { if (fields == null) return null; Collator[] ret = new Collator[fields.length]; for (int i=0; ia is less relevant than b. * @param a ScoreDoc * @param b ScoreDoc * @return true if document a should be sorted after document b. */ protected final boolean lessThan (final Object a, final Object b) { final FieldDoc docA = (FieldDoc) a; final FieldDoc docB = (FieldDoc) b; final int n = fields.length; int c = 0; for (int i=0; i r2) c = -1; if (r1 < r2) c = 1; break; } case SortField.DOC: case SortField.INT:{ int i1 = ((Integer)docA.fields[i]).intValue(); int i2 = ((Integer)docB.fields[i]).intValue(); if (i1 < i2) c = -1; if (i1 > i2) c = 1; break; } case SortField.LONG:{ long l1 = ((Long)docA.fields[i]).longValue(); long l2 = ((Long)docB.fields[i]).longValue(); if (l1 < l2) c = -1; if (l1 > l2) c = 1; break; } case SortField.STRING:{ String s1 = (String) docA.fields[i]; String s2 = (String) docB.fields[i]; // null values need to be sorted first, because of how FieldCache.getStringIndex() // works - in that routine, any documents without a value in the given field are // put first. If both are null, the next SortField is used if (s1 == null) c = (s2==null) ? 
0 : -1; else if (s2 == null) c = 1; // else if (fields[i].getLocale() == null) { c = s1.compareTo(s2); } else { c = collators[i].compare (s1, s2); } break; } case SortField.FLOAT:{ float f1 = ((Float)docA.fields[i]).floatValue(); float f2 = ((Float)docB.fields[i]).floatValue(); if (f1 < f2) c = -1; if (f1 > f2) c = 1; break; } case SortField.DOUBLE:{ double d1 = ((Double)docA.fields[i]).doubleValue(); double d2 = ((Double)docB.fields[i]).doubleValue(); if (d1 < d2) c = -1; if (d1 > d2) c = 1; break; } case SortField.BYTE:{ int i1 = ((Byte)docA.fields[i]).byteValue(); int i2 = ((Byte)docB.fields[i]).byteValue(); if (i1 < i2) c = -1; if (i1 > i2) c = 1; break; } case SortField.SHORT:{ int i1 = ((Short)docA.fields[i]).shortValue(); int i2 = ((Short)docB.fields[i]).shortValue(); if (i1 < i2) c = -1; if (i1 > i2) c = 1; break; } case SortField.CUSTOM:{ c = docA.fields[i].compareTo (docB.fields[i]); break; } case SortField.AUTO:{ // we cannot handle this - even if we determine the type of object (Float or // Integer), we don't necessarily know how to compare them (both SCORE and // FLOAT contain floats, but are sorted opposite of each other). Before // we get here, each AUTO should have been replaced with its actual value. throw new RuntimeException ("FieldDocSortedHitQueue cannot use an AUTO SortField"); } default:{ throw new RuntimeException ("invalid SortField type: "+type); } } if (fields[i].getReverse()) { c = -c; } } // avoid random sort order that could lead to duplicates (bug #31241): if (c == 0) return docA.doc > docB.doc; return c > 0; } } lucene-2.9.4/src/java/org/apache/lucene/search/FieldCacheRangeFilter.java0000644000175000017500000006430011474320224026652 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.TermDocs; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.document.NumericField; // for javadocs /** * A range filter built on top of a cached single term field (in {@link FieldCache}). * *

    FieldCacheRangeFilter builds a single cache for the field the first time it is used. * Each subsequent FieldCacheRangeFilter on the same field then reuses this cache, * even if the range itself changes. * *

    This means that FieldCacheRangeFilter is much faster (sometimes more than 100x as fast) * than building a {@link TermRangeFilter} (or {@link ConstantScoreRangeQuery} on a {@link TermRangeFilter}) * for each query, if using {@link #newStringRange}. However, if the range never changes it * is slower (around 2x as slow) than building a CachingWrapperFilter on top of a single TermRangeFilter. * * For numeric data types, this filter may be significantly faster than {@link NumericRangeFilter}. * Furthermore, it does not need the numeric values encoded by {@link NumericField}. But * it has the limitation that it only works with exactly one value per document (see below).

    As with all {@link FieldCache} based functionality, FieldCacheRangeFilter is only valid for * fields which contain exactly one term for each document (except for {@link #newStringRange}, * where 0 terms are also allowed). Due to a restriction of {@link FieldCache}, for numeric ranges * documents that do not have a numeric value in the field are treated as having the value 0.

    Thus it works on dates, prices and other single value fields but will not work on * regular text fields. It is preferable to use a NOT_ANALYZED field to ensure that * there is only a single term. * *
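    A hypothetical usage sketch (the field names, searcher and query are assumptions; the factory
    methods used here are described below):

      Filter priceFilter = FieldCacheRangeFilter.newIntRange("price", Integer.valueOf(10), Integer.valueOf(100), true, true);
      Filter titleFilter = FieldCacheRangeFilter.newStringRange("title", "a", null, true, false);
      TopDocs hits = searcher.search(query, priceFilter, 10);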

    This class does not have an constructor, use one of the static factory methods available, * that create a correct instance for different data types supported by {@link FieldCache}. */ public abstract class FieldCacheRangeFilter extends Filter { final String field; final FieldCache.Parser parser; final Object lowerVal; final Object upperVal; final boolean includeLower; final boolean includeUpper; private FieldCacheRangeFilter(String field, FieldCache.Parser parser, Object lowerVal, Object upperVal, boolean includeLower, boolean includeUpper) { this.field = field; this.parser = parser; this.lowerVal = lowerVal; this.upperVal = upperVal; this.includeLower = includeLower; this.includeUpper = includeUpper; } /** This method is implemented for each data type */ public abstract DocIdSet getDocIdSet(IndexReader reader) throws IOException; /** * Creates a string range query using {@link FieldCache#getStringIndex}. This works with all * fields containing zero or one term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newStringRange(String field, String lowerVal, String upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, null, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final FieldCache.StringIndex fcsi = FieldCache.DEFAULT.getStringIndex(reader, field); final int lowerPoint = fcsi.binarySearchLookup((String) lowerVal); final int upperPoint = fcsi.binarySearchLookup((String) upperVal); final int inclusiveLowerPoint, inclusiveUpperPoint; // Hints: // * binarySearchLookup returns 0, if value was null. // * the value is <0 if no exact hit was found, the returned value // is (-(insertion point) - 1) if (lowerPoint == 0) { assert lowerVal == null; inclusiveLowerPoint = 1; } else if (includeLower && lowerPoint > 0) { inclusiveLowerPoint = lowerPoint; } else if (lowerPoint > 0) { inclusiveLowerPoint = lowerPoint + 1; } else { inclusiveLowerPoint = Math.max(1, -lowerPoint - 1); } if (upperPoint == 0) { assert upperVal == null; inclusiveUpperPoint = Integer.MAX_VALUE; } else if (includeUpper && upperPoint > 0) { inclusiveUpperPoint = upperPoint; } else if (upperPoint > 0) { inclusiveUpperPoint = upperPoint - 1; } else { inclusiveUpperPoint = -upperPoint - 2; } if (inclusiveUpperPoint <= 0 || inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; assert inclusiveLowerPoint > 0 && inclusiveUpperPoint > 0; // for this DocIdSet, we never need to use TermDocs, // because deleted docs have an order of 0 (null entry in StringIndex) return new FieldCacheDocIdSet(reader, false) { final boolean matchDoc(int doc) { return fcsi.order[doc] >= inclusiveLowerPoint && fcsi.order[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getBytes(IndexReader,String)}. This works with all * byte fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newByteRange(String field, Byte lowerVal, Byte upperVal, boolean includeLower, boolean includeUpper) { return newByteRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getBytes(IndexReader,String,FieldCache.ByteParser)}. This works with all * byte fields containing exactly one numeric term in the field. 
The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newByteRange(String field, FieldCache.ByteParser parser, Byte lowerVal, Byte upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final byte inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { final byte i = ((Number) lowerVal).byteValue(); if (!includeLower && i == Byte.MAX_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveLowerPoint = (byte) (includeLower ? i : (i + 1)); } else { inclusiveLowerPoint = Byte.MIN_VALUE; } if (upperVal != null) { final byte i = ((Number) upperVal).byteValue(); if (!includeUpper && i == Byte.MIN_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveUpperPoint = (byte) (includeUpper ? i : (i - 1)); } else { inclusiveUpperPoint = Byte.MAX_VALUE; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final byte[] values = FieldCache.DEFAULT.getBytes(reader, field, (FieldCache.ByteParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getShorts(IndexReader,String)}. This works with all * short fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newShortRange(String field, Short lowerVal, Short upperVal, boolean includeLower, boolean includeUpper) { return newShortRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getShorts(IndexReader,String,FieldCache.ShortParser)}. This works with all * short fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newShortRange(String field, FieldCache.ShortParser parser, Short lowerVal, Short upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final short inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { short i = ((Number) lowerVal).shortValue(); if (!includeLower && i == Short.MAX_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveLowerPoint = (short) (includeLower ? i : (i + 1)); } else { inclusiveLowerPoint = Short.MIN_VALUE; } if (upperVal != null) { short i = ((Number) upperVal).shortValue(); if (!includeUpper && i == Short.MIN_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveUpperPoint = (short) (includeUpper ? 
i : (i - 1)); } else { inclusiveUpperPoint = Short.MAX_VALUE; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final short[] values = FieldCache.DEFAULT.getShorts(reader, field, (FieldCache.ShortParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getInts(IndexReader,String)}. This works with all * int fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newIntRange(String field, Integer lowerVal, Integer upperVal, boolean includeLower, boolean includeUpper) { return newIntRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getInts(IndexReader,String,FieldCache.IntParser)}. This works with all * int fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newIntRange(String field, FieldCache.IntParser parser, Integer lowerVal, Integer upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final int inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { int i = ((Number) lowerVal).intValue(); if (!includeLower && i == Integer.MAX_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveLowerPoint = includeLower ? i : (i + 1); } else { inclusiveLowerPoint = Integer.MIN_VALUE; } if (upperVal != null) { int i = ((Number) upperVal).intValue(); if (!includeUpper && i == Integer.MIN_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveUpperPoint = includeUpper ? i : (i - 1); } else { inclusiveUpperPoint = Integer.MAX_VALUE; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final int[] values = FieldCache.DEFAULT.getInts(reader, field, (FieldCache.IntParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0 && inclusiveUpperPoint >= 0)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getLongs(IndexReader,String)}. This works with all * long fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newLongRange(String field, Long lowerVal, Long upperVal, boolean includeLower, boolean includeUpper) { return newLongRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getLongs(IndexReader,String,FieldCache.LongParser)}. This works with all * long fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. 
*/ public static FieldCacheRangeFilter newLongRange(String field, FieldCache.LongParser parser, Long lowerVal, Long upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { final long inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { long i = ((Number) lowerVal).longValue(); if (!includeLower && i == Long.MAX_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveLowerPoint = includeLower ? i : (i + 1L); } else { inclusiveLowerPoint = Long.MIN_VALUE; } if (upperVal != null) { long i = ((Number) upperVal).longValue(); if (!includeUpper && i == Long.MIN_VALUE) return DocIdSet.EMPTY_DOCIDSET; inclusiveUpperPoint = includeUpper ? i : (i - 1L); } else { inclusiveUpperPoint = Long.MAX_VALUE; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final long[] values = FieldCache.DEFAULT.getLongs(reader, field, (FieldCache.LongParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0L && inclusiveUpperPoint >= 0L)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getFloats(IndexReader,String)}. This works with all * float fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newFloatRange(String field, Float lowerVal, Float upperVal, boolean includeLower, boolean includeUpper) { return newFloatRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getFloats(IndexReader,String,FieldCache.FloatParser)}. This works with all * float fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newFloatRange(String field, FieldCache.FloatParser parser, Float lowerVal, Float upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { // we transform the floating point numbers to sortable integers // using NumericUtils to easier find the next bigger/lower value final float inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { float f = ((Number) lowerVal).floatValue(); if (!includeUpper && f > 0.0f && Float.isInfinite(f)) return DocIdSet.EMPTY_DOCIDSET; int i = NumericUtils.floatToSortableInt(f); inclusiveLowerPoint = NumericUtils.sortableIntToFloat( includeLower ? i : (i + 1) ); } else { inclusiveLowerPoint = Float.NEGATIVE_INFINITY; } if (upperVal != null) { float f = ((Number) upperVal).floatValue(); if (!includeUpper && f < 0.0f && Float.isInfinite(f)) return DocIdSet.EMPTY_DOCIDSET; int i = NumericUtils.floatToSortableInt(f); inclusiveUpperPoint = NumericUtils.sortableIntToFloat( includeUpper ? 
i : (i - 1) ); } else { inclusiveUpperPoint = Float.POSITIVE_INFINITY; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final float[] values = FieldCache.DEFAULT.getFloats(reader, field, (FieldCache.FloatParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0f && inclusiveUpperPoint >= 0.0f)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } /** * Creates a numeric range query using {@link FieldCache#getDoubles(IndexReader,String)}. This works with all * double fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newDoubleRange(String field, Double lowerVal, Double upperVal, boolean includeLower, boolean includeUpper) { return newDoubleRange(field, null, lowerVal, upperVal, includeLower, includeUpper); } /** * Creates a numeric range query using {@link FieldCache#getDoubles(IndexReader,String,FieldCache.DoubleParser)}. This works with all * double fields containing exactly one numeric term in the field. The range can be half-open by setting one * of the values to null. */ public static FieldCacheRangeFilter newDoubleRange(String field, FieldCache.DoubleParser parser, Double lowerVal, Double upperVal, boolean includeLower, boolean includeUpper) { return new FieldCacheRangeFilter(field, parser, lowerVal, upperVal, includeLower, includeUpper) { public DocIdSet getDocIdSet(IndexReader reader) throws IOException { // we transform the floating point numbers to sortable integers // using NumericUtils to easier find the next bigger/lower value final double inclusiveLowerPoint, inclusiveUpperPoint; if (lowerVal != null) { double f = ((Number) lowerVal).doubleValue(); if (!includeUpper && f > 0.0 && Double.isInfinite(f)) return DocIdSet.EMPTY_DOCIDSET; long i = NumericUtils.doubleToSortableLong(f); inclusiveLowerPoint = NumericUtils.sortableLongToDouble( includeLower ? i : (i + 1L) ); } else { inclusiveLowerPoint = Double.NEGATIVE_INFINITY; } if (upperVal != null) { double f = ((Number) upperVal).doubleValue(); if (!includeUpper && f < 0.0 && Double.isInfinite(f)) return DocIdSet.EMPTY_DOCIDSET; long i = NumericUtils.doubleToSortableLong(f); inclusiveUpperPoint = NumericUtils.sortableLongToDouble( includeUpper ? i : (i - 1L) ); } else { inclusiveUpperPoint = Double.POSITIVE_INFINITY; } if (inclusiveLowerPoint > inclusiveUpperPoint) return DocIdSet.EMPTY_DOCIDSET; final double[] values = FieldCache.DEFAULT.getDoubles(reader, field, (FieldCache.DoubleParser) parser); // we only request the usage of termDocs, if the range contains 0 return new FieldCacheDocIdSet(reader, (inclusiveLowerPoint <= 0.0 && inclusiveUpperPoint >= 0.0)) { boolean matchDoc(int doc) { return values[doc] >= inclusiveLowerPoint && values[doc] <= inclusiveUpperPoint; } }; } }; } public final String toString() { final StringBuffer sb = new StringBuffer(field).append(":"); return sb.append(includeLower ? '[' : '{') .append((lowerVal == null) ? "*" : lowerVal.toString()) .append(" TO ") .append((upperVal == null) ? "*" : upperVal.toString()) .append(includeUpper ? 
']' : '}') .toString(); } public final boolean equals(Object o) { if (this == o) return true; if (!(o instanceof FieldCacheRangeFilter)) return false; FieldCacheRangeFilter other = (FieldCacheRangeFilter) o; if (!this.field.equals(other.field) || this.includeLower != other.includeLower || this.includeUpper != other.includeUpper ) { return false; } if (this.lowerVal != null ? !this.lowerVal.equals(other.lowerVal) : other.lowerVal != null) return false; if (this.upperVal != null ? !this.upperVal.equals(other.upperVal) : other.upperVal != null) return false; if (this.parser != null ? !this.parser.equals(other.parser) : other.parser != null) return false; return true; } public final int hashCode() { int h = field.hashCode(); h ^= (lowerVal != null) ? lowerVal.hashCode() : 550356204; h = (h << 1) | (h >>> 31); // rotate to distinguish lower from upper h ^= (upperVal != null) ? upperVal.hashCode() : -1674416163; h ^= (parser != null) ? parser.hashCode() : -1572457324; h ^= (includeLower ? 1549299360 : -365038026) ^ (includeUpper ? 1721088258 : 1948649653); return h; } static abstract class FieldCacheDocIdSet extends DocIdSet { private final IndexReader reader; private boolean mayUseTermDocs; FieldCacheDocIdSet(IndexReader reader, boolean mayUseTermDocs) { this.reader = reader; this.mayUseTermDocs = mayUseTermDocs; } /** this method checks, if a doc is a hit, should throw AIOBE, when position invalid */ abstract boolean matchDoc(int doc) throws ArrayIndexOutOfBoundsException; /** this DocIdSet is cacheable, if it works solely with FieldCache and no TermDocs */ public boolean isCacheable() { return !(mayUseTermDocs && reader.hasDeletions()); } public DocIdSetIterator iterator() throws IOException { // Synchronization needed because deleted docs BitVector // can change after call to hasDeletions until TermDocs creation. // We only use an iterator with termDocs, when this was requested (e.g. range contains 0) // and the index has deletions final TermDocs termDocs; synchronized(reader) { termDocs = isCacheable() ? null : reader.termDocs(null); } if (termDocs != null) { // a DocIdSetIterator using TermDocs to iterate valid docIds return new DocIdSetIterator() { private int doc = -1; /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return termDocs.doc(); } public int docID() { return doc; } public int nextDoc() throws IOException { do { if (!termDocs.next()) return doc = NO_MORE_DOCS; } while (!matchDoc(doc = termDocs.doc())); return doc; } public int advance(int target) throws IOException { if (!termDocs.skipTo(target)) return doc = NO_MORE_DOCS; while (!matchDoc(doc = termDocs.doc())) { if (!termDocs.next()) return doc = NO_MORE_DOCS; } return doc; } }; } else { // a DocIdSetIterator generating docIds by incrementing a variable - // this one can be used if there are no deletions are on the index return new DocIdSetIterator() { private int doc = -1; /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() throws IOException { return nextDoc() != NO_MORE_DOCS; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) throws IOException { return advance(target) != NO_MORE_DOCS; } /** @deprecated use {@link #docID()} instead. 
*/ public int doc() { return doc; } public int docID() { return doc; } public int nextDoc() { try { do { doc++; } while (!matchDoc(doc)); return doc; } catch (ArrayIndexOutOfBoundsException e) { return doc = NO_MORE_DOCS; } } public int advance(int target) { try { doc = target; while (!matchDoc(doc)) { doc++; } return doc; } catch (ArrayIndexOutOfBoundsException e) { return doc = NO_MORE_DOCS; } } }; } } } } lucene-2.9.4/src/java/org/apache/lucene/search/PositiveScoresOnlyCollector.java0000644000175000017500000000346311474320224030255 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexReader; /** * A {@link Collector} implementation which wraps another * {@link Collector} and makes sure only documents with * scores > 0 are collected. */ public class PositiveScoresOnlyCollector extends Collector { final private Collector c; private Scorer scorer; public PositiveScoresOnlyCollector(Collector c) { this.c = c; } public void collect(int doc) throws IOException { if (scorer.score() > 0) { c.collect(doc); } } public void setNextReader(IndexReader reader, int docBase) throws IOException { c.setNextReader(reader, docBase); } public void setScorer(Scorer scorer) throws IOException { // Set a ScoreCachingWrappingScorer in case the wrapped Collector will call // score() also. this.scorer = new ScoreCachingWrappingScorer(scorer); c.setScorer(this.scorer); } public boolean acceptsDocsOutOfOrder() { return c.acceptsDocsOutOfOrder(); } } lucene-2.9.4/src/java/org/apache/lucene/search/ConstantScoreRangeQuery.java0000644000175000017500000000536211474320224027353 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.text.Collator; /** * A range query that returns a constant score equal to its boost for * all documents in the exclusive range of terms. * *

 * <p>It does not have an upper bound on the number of clauses covered in the range.
 *
 * <p>This query matches the documents looking for terms that fall into the
 * supplied range according to {@link String#compareTo(String)}. It is not intended
 * for numerical ranges, use {@link NumericRangeQuery} instead.
 *
 * <p>
    This query is hardwired to {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}. * If you want to change this, use {@link TermRangeQuery} instead. * * @deprecated Use {@link TermRangeQuery} for term ranges or * {@link NumericRangeQuery} for numeric ranges instead. * This class will be removed in Lucene 3.0. * @version $Id: ConstantScoreRangeQuery.java 797694 2009-07-25 00:03:33Z mikemccand $ */ public class ConstantScoreRangeQuery extends TermRangeQuery { public ConstantScoreRangeQuery(String fieldName, String lowerVal, String upperVal, boolean includeLower, boolean includeUpper) { super(fieldName, lowerVal, upperVal, includeLower, includeUpper); rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; } public ConstantScoreRangeQuery(String fieldName, String lowerVal, String upperVal, boolean includeLower, boolean includeUpper, Collator collator) { super(fieldName, lowerVal, upperVal, includeLower, includeUpper, collator); rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; } public String getLowerVal() { return getLowerTerm(); } public String getUpperVal() { return getUpperTerm(); } /** Changes of mode are not supported by this class (fixed to constant score rewrite mode) */ public void setRewriteMethod(RewriteMethod method) { throw new UnsupportedOperationException("Use TermRangeQuery instead to change the rewrite method."); } } lucene-2.9.4/src/java/org/apache/lucene/search/FilterManager.java0000644000175000017500000001551311474320224025302 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeSet; /** * Filter caching singleton. It can be used * to save filters locally for reuse. * This class makes it possible to cache Filters even when using RMI, as it * keeps the cache on the searcher side of the RMI connection. * * Also could be used as a persistent storage for any filter as long as the * filter provides a proper hashCode(), as that is used as the key in the cache. * * The cache is periodically cleaned up from a separate thread to ensure the * cache doesn't exceed the maximum size. 
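 *
 * <p>A usage sketch (illustrative only, not part of the original sources; the
 * field and term are arbitrary placeholders):
 * <pre>
 *   FilterManager manager = FilterManager.getInstance();
 *   Filter candidate = new QueryWrapperFilter(new TermQuery(new Term("category", "book")));
 *   // returns a previously cached filter with the same hashCode(), or caches this one
 *   Filter reusable = manager.getFilter(candidate);
 * </pre>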
*/ public class FilterManager { protected static FilterManager manager; /** The default maximum number of Filters in the cache */ protected static final int DEFAULT_CACHE_CLEAN_SIZE = 100; /** The default frequency of cache cleanup */ protected static final long DEFAULT_CACHE_SLEEP_TIME = 1000 * 60 * 10; /** The cache itself */ protected Map cache; /** Maximum allowed cache size */ protected int cacheCleanSize; /** Cache cleaning frequency */ protected long cleanSleepTime; /** Cache cleaner that runs in a separate thread */ protected FilterCleaner filterCleaner; public synchronized static FilterManager getInstance() { if (manager == null) { manager = new FilterManager(); } return manager; } /** * Sets up the FilterManager singleton. */ protected FilterManager() { cache = new HashMap(); cacheCleanSize = DEFAULT_CACHE_CLEAN_SIZE; // Let the cache get to 100 items cleanSleepTime = DEFAULT_CACHE_SLEEP_TIME; // 10 minutes between cleanings filterCleaner = new FilterCleaner(); Thread fcThread = new Thread(filterCleaner); // set to be a Daemon so it doesn't have to be stopped fcThread.setDaemon(true); fcThread.start(); } /** * Sets the max size that cache should reach before it is cleaned up * @param cacheCleanSize maximum allowed cache size */ public void setCacheSize(int cacheCleanSize) { this.cacheCleanSize = cacheCleanSize; } /** * Sets the cache cleaning frequency in milliseconds. * @param cleanSleepTime cleaning frequency in milliseconds */ public void setCleanThreadSleepTime(long cleanSleepTime) { this.cleanSleepTime = cleanSleepTime; } /** * Returns the cached version of the filter. Allows the caller to pass up * a small filter but this will keep a persistent version around and allow * the caching filter to do its job. * * @param filter The input filter * @return The cached version of the filter */ public Filter getFilter(Filter filter) { synchronized(cache) { FilterItem fi = null; fi = (FilterItem)cache.get(new Integer(filter.hashCode())); if (fi != null) { fi.timestamp = new Date().getTime(); return fi.filter; } cache.put(new Integer(filter.hashCode()), new FilterItem(filter)); return filter; } } /** * Holds the filter and the last time the filter was used, to make LRU-based * cache cleaning possible. * TODO: Clean this up when we switch to Java 1.5 */ protected class FilterItem { public Filter filter; public long timestamp; public FilterItem (Filter filter) { this.filter = filter; this.timestamp = new Date().getTime(); } } /** * Keeps the cache from getting too big. * If we were using Java 1.5, we could use LinkedHashMap and we would not need this thread * to clean out the cache. * * The SortedSet sortedFilterItems is used only to sort the items from the cache, * so when it's time to clean up we have the TreeSet sort the FilterItems by * timestamp. * * Removes 1.5 * the numbers of items to make the cache smaller. * For example: * If cache clean size is 10, and the cache is at 15, we would remove (15 - 10) * 1.5 = 7.5 round up to 8. * This way we clean the cache a bit more, and avoid having the cache cleaner having to do it frequently. 
*/ protected class FilterCleaner implements Runnable { private boolean running = true; private TreeSet sortedFilterItems; public FilterCleaner() { sortedFilterItems = new TreeSet(new Comparator() { public int compare(Object a, Object b) { if( a instanceof Map.Entry && b instanceof Map.Entry) { FilterItem fia = (FilterItem) ((Map.Entry)a).getValue(); FilterItem fib = (FilterItem) ((Map.Entry)b).getValue(); if ( fia.timestamp == fib.timestamp ) { return 0; } // smaller timestamp first if ( fia.timestamp < fib.timestamp ) { return -1; } // larger timestamp last return 1; } else { throw new ClassCastException("Objects are not Map.Entry"); } } }); } public void run () { while (running) { // sort items from oldest to newest // we delete the oldest filters if (cache.size() > cacheCleanSize) { // empty the temporary set sortedFilterItems.clear(); synchronized (cache) { sortedFilterItems.addAll(cache.entrySet()); Iterator it = sortedFilterItems.iterator(); int numToDelete = (int) ((cache.size() - cacheCleanSize) * 1.5); int counter = 0; // loop over the set and delete all of the cache entries not used in a while while (it.hasNext() && counter++ < numToDelete) { Map.Entry entry = (Map.Entry)it.next(); cache.remove(entry.getKey()); } } // empty the set so we don't tie up the memory sortedFilterItems.clear(); } // take a nap try { Thread.sleep(cleanSleepTime); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } } } } lucene-2.9.4/src/java/org/apache/lucene/search/Weight.java0000644000175000017500000001162511474320225024012 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Serializable; import org.apache.lucene.index.IndexReader; /** * Expert: Calculate query weights and build query scorers. *

 * <p>
 * The purpose of {@link Weight} is to ensure searching does not modify a
 * {@link Query}, so that a {@link Query} instance can be reused.<br>
 * {@link Searcher} dependent state of the query should reside in the {@link Weight}.<br>
 * {@link IndexReader} dependent state should reside in the {@link Scorer}.
 * <p>
 * A Weight is used in the following way:
 * <ol>
 * <li>A Weight is constructed by a top-level query, given a
 *     Searcher ({@link Query#createWeight(Searcher)}).
 * <li>The {@link #sumOfSquaredWeights()} method is called on the Weight to compute
 *     the query normalization factor {@link Similarity#queryNorm(float)} of the
 *     query clauses contained in the query.
 * <li>The query normalization factor is passed to {@link #normalize(float)}. At
 *     this point the weighting is complete.
 * <li>A Scorer is constructed by {@link #scorer(IndexReader,boolean,boolean)}
 *     (see the sketch that follows this list).
 * </ol>
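 *
 * <p>A compressed sketch of that workflow (illustrative only, not part of the
 * original sources; the query, searcher and reader are assumed to exist):
 * <pre>
 *   Weight weight = query.weight(searcher);        // performs steps 1-3 internally
 *   Scorer scorer = weight.scorer(reader, true, false);
 *   if (scorer != null) {                          // null means no documents will be scored
 *     int doc;
 *     while ((doc = scorer.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
 *       float score = scorer.score();              // consume matches in doc id order
 *     }
 *   }
 * </pre>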
    * * @since 2.9 */ public abstract class Weight implements Serializable { /** * An explanation of the score computation for the named document. * * @param reader sub-reader containing the give doc * @param doc * @return an Explanation for the score * @throws IOException */ public abstract Explanation explain(IndexReader reader, int doc) throws IOException; /** The query that this concerns. */ public abstract Query getQuery(); /** The weight for this query. */ public abstract float getValue(); /** Assigns the query normalization factor to this. */ public abstract void normalize(float norm); /** * Returns a {@link Scorer} which scores documents in/out-of order according * to scoreDocsInOrder. *

    * NOTE: even if scoreDocsInOrder is false, it is * recommended to check whether the returned Scorer indeed scores * documents out of order (i.e., call {@link #scoresDocsOutOfOrder()}), as * some Scorer implementations will always return documents * in-order.
    * NOTE: null can be returned if no documents will be scored by this * query. * * @param reader * the {@link IndexReader} for which to return the {@link Scorer}. * @param scoreDocsInOrder * specifies whether in-order scoring of documents is required. Note * that if set to false (i.e., out-of-order scoring is required), * this method can return whatever scoring mode it supports, as every * in-order scorer is also an out-of-order one. However, an * out-of-order scorer may not support {@link Scorer#nextDoc()} * and/or {@link Scorer#advance(int)}, therefore it is recommended to * request an in-order scorer if use of these methods is required. * @param topScorer * if true, {@link Scorer#score(Collector)} will be called; if false, * {@link Scorer#nextDoc()} and/or {@link Scorer#advance(int)} will * be called. * @return a {@link Scorer} which scores documents in/out-of order. * @throws IOException */ public abstract Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException; /** The sum of squared weights of contained query clauses. */ public abstract float sumOfSquaredWeights() throws IOException; /** * Returns true iff this implementation scores docs only out of order. This * method is used in conjunction with {@link Collector}'s * {@link Collector#acceptsDocsOutOfOrder() acceptsDocsOutOfOrder} and * {@link #scorer(org.apache.lucene.index.IndexReader, boolean, boolean)} to * create a matching {@link Scorer} instance for a given {@link Collector}, or * vice versa. *

    * NOTE: the default implementation returns false, i.e. * the Scorer scores documents in-order. */ public boolean scoresDocsOutOfOrder() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/search/SpanFilter.java0000644000175000017500000000344311474320224024630 0ustar janpascaljanpascalpackage org.apache.lucene.search; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.IndexReader; import java.io.IOException; /** Abstract base class providing a mechanism to restrict searches to a subset of an index and also maintains and returns position information. This is useful if you want to compare the positions from a SpanQuery with the positions of items in a filter. For instance, if you had a SpanFilter that marked all the occurrences of the word "foo" in documents, and then you entered a new SpanQuery containing bar, you could not only filter by the word foo, but you could then compare position information for post processing. */ public abstract class SpanFilter extends Filter{ /** Returns a SpanFilterResult with true for documents which should be permitted in search results, and false for those that should not and Spans for where the true docs match. * @param reader The {@link org.apache.lucene.index.IndexReader} to load position and DocIdSet information from * @return A {@link SpanFilterResult} * @throws java.io.IOException if there was an issue accessing the necessary information * */ public abstract SpanFilterResult bitSpans(IndexReader reader) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/LucenePackage.java0000644000175000017500000000216711474320232024004 0ustar janpascaljanpascalpackage org.apache.lucene; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Lucene's package information, including version. **/ public final class LucenePackage { private LucenePackage() {} // can't construct /** Return Lucene's package, including version information. 
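 *
 * <p>For example (a sketch; the implementation version is read from the jar
 * manifest and may be null when running from unpacked classes):
 * <pre>
 *   String version = LucenePackage.get().getImplementationVersion();
 * </pre>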
*/ public static Package get() { return LucenePackage.class.getPackage(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/0000755000175000017500000000000011554106561022275 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/analysis/StopAnalyzer.java0000644000175000017500000002304311474320222025566 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.Reader; import java.util.Arrays; import java.util.Set; import org.apache.lucene.util.Version; /** Filters {@link LetterTokenizer} with {@link * LowerCaseFilter} and {@link StopFilter}. * * *

 * <p>You must specify the required {@link Version}
 * compatibility when creating StopAnalyzer:
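 *
 * <p>A minimal sketch (not from the original file; {@link Version#LUCENE_29} is
 * only an example value, pick the version matching your index format):
 * <pre>
 *   Analyzer analyzer = new StopAnalyzer(Version.LUCENE_29);            // default English stop words
 *   Analyzer custom   = new StopAnalyzer(Version.LUCENE_29,
 *       StopFilter.makeStopSet(new String[] {"foo", "bar"}));           // caller-supplied stop words
 * </pre>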

    */ public final class StopAnalyzer extends Analyzer { private final Set/**/ stopWords; // @deprecated private final boolean useDefaultStopPositionIncrement; private final boolean enablePositionIncrements; /** An array containing some common English words that are not usually useful for searching. @deprecated Use {@link #ENGLISH_STOP_WORDS_SET} instead */ public static final String[] ENGLISH_STOP_WORDS = { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; /** An unmodifiable set containing some common English words that are not usually useful for searching.*/ public static final Set/**/ ENGLISH_STOP_WORDS_SET; static { final String[] stopWords = new String[]{ "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; final CharArraySet stopSet = new CharArraySet(stopWords.length, false); stopSet.addAll(Arrays.asList(stopWords)); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } /** Builds an analyzer which removes words in * ENGLISH_STOP_WORDS. * @deprecated Use {@link #StopAnalyzer(Version)} instead */ public StopAnalyzer() { stopWords = ENGLISH_STOP_WORDS_SET; useDefaultStopPositionIncrement = true; enablePositionIncrements = false; } /** Builds an analyzer which removes words in * ENGLISH_STOP_WORDS.*/ public StopAnalyzer(Version matchVersion) { stopWords = ENGLISH_STOP_WORDS_SET; useDefaultStopPositionIncrement = false; enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); } /** Builds an analyzer which removes words in * ENGLISH_STOP_WORDS. * @param enablePositionIncrements See {@link * StopFilter#setEnablePositionIncrements} * @deprecated Use {@link #StopAnalyzer(Version)} instead */ public StopAnalyzer(boolean enablePositionIncrements) { stopWords = ENGLISH_STOP_WORDS_SET; this.enablePositionIncrements = enablePositionIncrements; useDefaultStopPositionIncrement = false; } /** Builds an analyzer with the stop words from the given set. * @deprecated Use {@link #StopAnalyzer(Version, Set)} instead */ public StopAnalyzer(Set stopWords) { this.stopWords = stopWords; useDefaultStopPositionIncrement = true; enablePositionIncrements = false; } /** Builds an analyzer with the stop words from the given * set. */ public StopAnalyzer(Version matchVersion, Set stopWords) { this.stopWords = stopWords; useDefaultStopPositionIncrement = false; enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); } /** Builds an analyzer with the stop words from the given set. * @param stopWords Set of stop words * @param enablePositionIncrements See {@link * StopFilter#setEnablePositionIncrements} * @deprecated Use {@link #StopAnalyzer(Version, Set)} instead */ public StopAnalyzer(Set stopWords, boolean enablePositionIncrements) { this.stopWords = stopWords; this.enablePositionIncrements = enablePositionIncrements; useDefaultStopPositionIncrement = false; } /** Builds an analyzer which removes words in the provided array. 
* @deprecated Use {@link #StopAnalyzer(Version, Set)} instead */ public StopAnalyzer(String[] stopWords) { this.stopWords = StopFilter.makeStopSet(stopWords); useDefaultStopPositionIncrement = true; enablePositionIncrements = false; } /** Builds an analyzer which removes words in the provided array. * @param stopWords Array of stop words * @param enablePositionIncrements See {@link * StopFilter#setEnablePositionIncrements} * @deprecated Use {@link #StopAnalyzer(Version, Set)} instead*/ public StopAnalyzer(String[] stopWords, boolean enablePositionIncrements) { this.stopWords = StopFilter.makeStopSet(stopWords); this.enablePositionIncrements = enablePositionIncrements; useDefaultStopPositionIncrement = false; } /** Builds an analyzer with the stop words from the given file. * @see WordlistLoader#getWordSet(File) * @deprecated Use {@link #StopAnalyzer(Version, File)} instead */ public StopAnalyzer(File stopwordsFile) throws IOException { stopWords = WordlistLoader.getWordSet(stopwordsFile); useDefaultStopPositionIncrement = true; enablePositionIncrements = false; } /** Builds an analyzer with the stop words from the given file. * @see WordlistLoader#getWordSet(File) * @param stopwordsFile File to load stop words from * @param enablePositionIncrements See {@link * StopFilter#setEnablePositionIncrements} * @deprecated Use {@link #StopAnalyzer(Version, File)} instead */ public StopAnalyzer(File stopwordsFile, boolean enablePositionIncrements) throws IOException { stopWords = WordlistLoader.getWordSet(stopwordsFile); this.enablePositionIncrements = enablePositionIncrements; useDefaultStopPositionIncrement = false; } /** Builds an analyzer with the stop words from the given file. * @see WordlistLoader#getWordSet(File) * @param matchVersion See above * @param stopwordsFile File to load stop words from */ public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException { stopWords = WordlistLoader.getWordSet(stopwordsFile); this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); useDefaultStopPositionIncrement = false; } /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader) * @deprecated Use {@link #StopAnalyzer(Version, Reader)} instead */ public StopAnalyzer(Reader stopwords) throws IOException { stopWords = WordlistLoader.getWordSet(stopwords); useDefaultStopPositionIncrement = true; enablePositionIncrements = false; } /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader) * @param stopwords Reader to load stop words from * @param enablePositionIncrements See {@link * StopFilter#setEnablePositionIncrements} * @deprecated Use {@link #StopAnalyzer(Version, Reader)} instead */ public StopAnalyzer(Reader stopwords, boolean enablePositionIncrements) throws IOException { stopWords = WordlistLoader.getWordSet(stopwords); this.enablePositionIncrements = enablePositionIncrements; useDefaultStopPositionIncrement = false; } /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader) * @param matchVersion See above * @param stopwords Reader to load stop words from */ public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException { stopWords = WordlistLoader.getWordSet(stopwords); this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion); useDefaultStopPositionIncrement = false; } /** Filters LowerCaseTokenizer with StopFilter. 
*/ public TokenStream tokenStream(String fieldName, Reader reader) { if (useDefaultStopPositionIncrement) { return new StopFilter(new LowerCaseTokenizer(reader), stopWords); } else { return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords); } } /** Filters LowerCaseTokenizer with StopFilter. */ private class SavedStreams { Tokenizer source; TokenStream result; }; public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); streams.source = new LowerCaseTokenizer(reader); if (useDefaultStopPositionIncrement) { streams.result = new StopFilter(streams.source, stopWords); } else { streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords); } setPreviousTokenStream(streams); } else streams.source.reset(reader); return streams.result; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/KeywordAnalyzer.java0000644000175000017500000000365611474320222026275 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; /** * "Tokenizes" the entire stream as a single token. This is useful * for data like zip codes, ids, and some product names. */ public class KeywordAnalyzer extends Analyzer { public KeywordAnalyzer() { setOverridesTokenStreamMethod(KeywordAnalyzer.class); } public TokenStream tokenStream(String fieldName, final Reader reader) { return new KeywordTokenizer(reader); } public TokenStream reusableTokenStream(String fieldName, final Reader reader) throws IOException { if (overridesTokenStreamMethod) { // LUCENE-1678: force fallback to tokenStream() if we // have been subclassed and that subclass overrides // tokenStream but not reusableTokenStream return tokenStream(fieldName, reader); } Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); if (tokenizer == null) { tokenizer = new KeywordTokenizer(reader); setPreviousTokenStream(tokenizer); } else tokenizer.reset(reader); return tokenizer; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/StopFilter.java0000644000175000017500000002611511474320222025231 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import java.util.Set; import java.util.List; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Version; /** * Removes stop words from a token stream. */ public final class StopFilter extends TokenFilter { // deprecated private static boolean ENABLE_POSITION_INCREMENTS_DEFAULT = false; private final CharArraySet stopWords; private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT; private TermAttribute termAtt; private PositionIncrementAttribute posIncrAtt; /** * Construct a token stream filtering the given input. * @deprecated Use {@link #StopFilter(boolean, TokenStream, String[])} instead */ public StopFilter(TokenStream input, String [] stopWords) { this(ENABLE_POSITION_INCREMENTS_DEFAULT, input, stopWords, false); } /** * Construct a token stream filtering the given input. * @param enablePositionIncrements true if token positions should record the removed stop words * @param input input TokenStream * @param stopWords array of stop words * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead. */ public StopFilter(boolean enablePositionIncrements, TokenStream input, String [] stopWords) { this(enablePositionIncrements, input, stopWords, false); } /** * Constructs a filter which removes words from the input * TokenStream that are named in the array of words. * @deprecated Use {@link #StopFilter(boolean, TokenStream, String[], boolean)} instead */ public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) { this(ENABLE_POSITION_INCREMENTS_DEFAULT, in, stopWords, ignoreCase); } /** * Constructs a filter which removes words from the input * TokenStream that are named in the array of words. * @param enablePositionIncrements true if token positions should record the removed stop words * @param in input TokenStream * @param stopWords array of stop words * @param ignoreCase true if case is ignored * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead. */ public StopFilter(boolean enablePositionIncrements, TokenStream in, String[] stopWords, boolean ignoreCase) { super(in); this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase); this.enablePositionIncrements = enablePositionIncrements; init(); } /** * Construct a token stream filtering the given input. * If stopWords is an instance of {@link CharArraySet} (true if * makeStopSet() was used to construct the set) it will be directly used * and ignoreCase will be ignored since CharArraySet * directly controls case sensitivity. *

    * If stopWords is not an instance of {@link CharArraySet}, * a new CharArraySet will be constructed and ignoreCase will be * used to specify the case sensitivity of that set. * * @param input * @param stopWords The set of Stop Words. * @param ignoreCase -Ignore case when stopping. * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set, boolean)} instead */ public StopFilter(TokenStream input, Set stopWords, boolean ignoreCase) { this(ENABLE_POSITION_INCREMENTS_DEFAULT, input, stopWords, ignoreCase); } /** * Construct a token stream filtering the given input. * If stopWords is an instance of {@link CharArraySet} (true if * makeStopSet() was used to construct the set) it will be directly used * and ignoreCase will be ignored since CharArraySet * directly controls case sensitivity. *

    * If stopWords is not an instance of {@link CharArraySet}, * a new CharArraySet will be constructed and ignoreCase will be * used to specify the case sensitivity of that set. * * @param enablePositionIncrements true if token positions should record the removed stop words * @param input Input TokenStream * @param stopWords The set of Stop Words. * @param ignoreCase -Ignore case when stopping. */ public StopFilter(boolean enablePositionIncrements, TokenStream input, Set stopWords, boolean ignoreCase) { super(input); if (stopWords instanceof CharArraySet) { this.stopWords = (CharArraySet)stopWords; } else { this.stopWords = new CharArraySet(stopWords.size(), ignoreCase); this.stopWords.addAll(stopWords); } this.enablePositionIncrements = enablePositionIncrements; init(); } /** * Constructs a filter which removes words from the input * TokenStream that are named in the Set. * * @see #makeStopSet(java.lang.String[]) * @deprecated Use {@link #StopFilter(boolean, TokenStream, Set)} instead */ public StopFilter(TokenStream in, Set stopWords) { this(ENABLE_POSITION_INCREMENTS_DEFAULT, in, stopWords, false); } /** * Constructs a filter which removes words from the input * TokenStream that are named in the Set. * * @param enablePositionIncrements true if token positions should record the removed stop words * @param in Input stream * @param stopWords The set of Stop Words. * @see #makeStopSet(java.lang.String[]) */ public StopFilter(boolean enablePositionIncrements, TokenStream in, Set stopWords) { this(enablePositionIncrements, in, stopWords, false); } public void init() { termAtt = (TermAttribute) addAttribute(TermAttribute.class); posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); } /** * Builds a Set from an array of stop words, * appropriate for passing into the StopFilter constructor. * This permits this stopWords construction to be cached once when * an Analyzer is constructed. * * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase */ public static final Set makeStopSet(String[] stopWords) { return makeStopSet(stopWords, false); } /** * Builds a Set from an array of stop words, * appropriate for passing into the StopFilter constructor. * This permits this stopWords construction to be cached once when * an Analyzer is constructed. * * @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase */ public static final Set makeStopSet(List/**/ stopWords) { return makeStopSet(stopWords, false); } /** * * @param stopWords An array of stopwords * @param ignoreCase If true, all words are lower cased first. * @return a Set containing the words */ public static final Set makeStopSet(String[] stopWords, boolean ignoreCase) { CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase); stopSet.addAll(Arrays.asList(stopWords)); return stopSet; } /** * * @param stopWords A List of Strings representing the stopwords * @param ignoreCase if true, all words are lower cased first * @return A Set containing the words */ public static final Set makeStopSet(List/**/ stopWords, boolean ignoreCase){ CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase); stopSet.addAll(stopWords); return stopSet; } /** * Returns the next input Token whose term() is not a stop word. 
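 *
 * <p>Typical construction (a sketch, not part of the original file; the reader
 * and the stop word list are placeholders):
 * <pre>
 *   Set stopWords = StopFilter.makeStopSet(new String[] {"the", "a", "an"});
 *   TokenStream stream = new StopFilter(true,                       // preserve position increments
 *                                       new WhitespaceTokenizer(reader),
 *                                       stopWords);
 * </pre>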
*/ public final boolean incrementToken() throws IOException { // return the first non-stop word found int skippedPositions = 0; while (input.incrementToken()) { if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) { if (enablePositionIncrements) { posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); } return true; } skippedPositions += posIncrAtt.getPositionIncrement(); } // reached EOS -- return null return false; } /** * @see #setEnablePositionIncrementsDefault(boolean). * @deprecated Please specify this when you create the StopFilter */ public static boolean getEnablePositionIncrementsDefault() { return ENABLE_POSITION_INCREMENTS_DEFAULT; } /** * Returns version-dependent default for * enablePositionIncrements. Analyzers that embed * StopFilter use this method when creating the * StopFilter. Prior to 2.9, this returns {@link #getEnablePositionIncrementsDefault}. * On 2.9 or later, it returns true. */ public static boolean getEnablePositionIncrementsVersionDefault(Version matchVersion) { if (matchVersion.onOrAfter(Version.LUCENE_29)) { return true; } else { return ENABLE_POSITION_INCREMENTS_DEFAULT; } } /** * Set the default position increments behavior of every StopFilter created from now on. *

    * Note: behavior of a single StopFilter instance can be modified * with {@link #setEnablePositionIncrements(boolean)}. * This static method allows control over behavior of classes using StopFilters internally, * for example {@link org.apache.lucene.analysis.standard.StandardAnalyzer StandardAnalyzer} * if used with the no-arg ctor. *

    * Default : false. * @see #setEnablePositionIncrements(boolean). * @deprecated Please specify this when you create the StopFilter */ public static void setEnablePositionIncrementsDefault(boolean defaultValue) { ENABLE_POSITION_INCREMENTS_DEFAULT = defaultValue; } /** * @see #setEnablePositionIncrements(boolean). */ public boolean getEnablePositionIncrements() { return enablePositionIncrements; } /** * If true, this StopFilter will preserve * positions of the incoming tokens (ie, accumulate and * set position increments of the removed stop tokens). * Generally, true is best as it does not * lose information (positions of the original tokens) * during indexing. * *

    When set, when a token is stopped * (omitted), the position increment of the following * token is incremented. * *
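 *
 * <p>A worked example (hypothetical input, not from the original file): for the
 * text "the quick fox" with "the" as a stop word, the surviving token "quick" is
 * emitted with a position increment of 2 instead of 1, so phrase and span queries
 * still see the original gap left by the removed token.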

    NOTE: be sure to also * set {@link QueryParser#setEnablePositionIncrements} if * you use QueryParser to create queries. */ public void setEnablePositionIncrements(boolean enable) { this.enablePositionIncrements = enable; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/SimpleAnalyzer.java0000644000175000017500000000274611474320222026101 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import java.io.IOException; /** An {@link Analyzer} that filters {@link LetterTokenizer} * with {@link LowerCaseFilter} */ public final class SimpleAnalyzer extends Analyzer { public TokenStream tokenStream(String fieldName, Reader reader) { return new LowerCaseTokenizer(reader); } public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream(); if (tokenizer == null) { tokenizer = new LowerCaseTokenizer(reader); setPreviousTokenStream(tokenizer); } else tokenizer.reset(reader); return tokenizer; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/LetterTokenizer.java0000644000175000017500000000370011474505315026273 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import org.apache.lucene.util.AttributeSource; /** A LetterTokenizer is a tokenizer that divides text at non-letters. That's to say, it defines tokens as maximal strings of adjacent letters, as defined by java.lang.Character.isLetter() predicate. Note: this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not separated by spaces. */ public class LetterTokenizer extends CharTokenizer { /** Construct a new LetterTokenizer. */ public LetterTokenizer(Reader in) { super(in); } /** Construct a new LetterTokenizer using a given {@link AttributeSource}. 
*/ public LetterTokenizer(AttributeSource source, Reader in) { super(source, in); } /** Construct a new LetterTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ public LetterTokenizer(AttributeFactory factory, Reader in) { super(factory, in); } /** Collects only characters which satisfy * {@link Character#isLetter(char)}.*/ protected boolean isTokenChar(char c) { return Character.isLetter(c); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java0000644000175000017500000000341511474320222027123 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import org.apache.lucene.util.AttributeSource; /** A WhitespaceTokenizer is a tokenizer that divides text at whitespace. * Adjacent sequences of non-Whitespace characters form tokens. */ public class WhitespaceTokenizer extends CharTokenizer { /** Construct a new WhitespaceTokenizer. */ public WhitespaceTokenizer(Reader in) { super(in); } /** Construct a new WhitespaceTokenizer using a given {@link AttributeSource}. */ public WhitespaceTokenizer(AttributeSource source, Reader in) { super(source, in); } /** Construct a new WhitespaceTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ public WhitespaceTokenizer(AttributeFactory factory, Reader in) { super(factory, in); } /** Collects only characters which do not satisfy * {@link Character#isWhitespace(char)}.*/ protected boolean isTokenChar(char c) { return !Character.isWhitespace(c); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/TokenStream.java0000644000175000017500000004375311474320222025401 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.util.IdentityHashMap; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Payload; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; /** * A TokenStream enumerates the sequence of tokens, either from * {@link Field}s of a {@link Document} or from query text. *

 * This is an abstract class; concrete subclasses are:
 *   • {@link Tokenizer}, a TokenStream whose input is a Reader; and
 *   • {@link TokenFilter}, a TokenStream whose input is another TokenStream.
 *
 * A new TokenStream API has been introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 *
 * TokenStream now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the TokenStream.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 *
 * The workflow of the new TokenStream API is as follows:
 *   1. Instantiation of TokenStream/{@link TokenFilter}s which add/get
 *      attributes to/from the {@link AttributeSource}.
 *   2. The consumer calls {@link TokenStream#reset()}.
 *   3. The consumer retrieves attributes from the stream and stores local
 *      references to all attributes it wants to access.
 *   4. The consumer calls {@link #incrementToken()} until it returns false,
 *      consuming the attributes after each call.
 *   5. The consumer calls {@link #end()} so that any end-of-stream operations
 *      can be performed.
 *   6. The consumer calls {@link #close()} to release any resources when
 *      finished using the TokenStream.
 *
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 *
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 *
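 * As a rough illustration of the workflow above (this sketch is not part of the
 * original javadoc; the analyzer, the reader and the field name "content" are
 * placeholder assumptions), a consumer loop might look like:
 *
 *   TokenStream stream = analyzer.tokenStream("content", reader);
 *   TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
 *   OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
 *   stream.reset();
 *   while (stream.incrementToken()) {                  // step 4: advance token by token
 *     System.out.println(termAtt.term() + " [" + offsetAtt.startOffset()
 *         + "," + offsetAtt.endOffset() + "]");        // read the shared attributes
 *   }
 *   stream.end();                                      // step 5: end-of-stream operations
 *   stream.close();                                    // step 6: release resources
 *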

    * Sometimes it is desirable to capture a current state of a TokenStream, * e.g., for buffering purposes (see {@link CachingTokenFilter}, * {@link TeeSinkTokenFilter}). For this usecase * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState} * can be used. */ public abstract class TokenStream extends AttributeSource { /** @deprecated Remove this when old API is removed! */ private static final AttributeFactory DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY = new TokenWrapperAttributeFactory(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); /** @deprecated Remove this when old API is removed! */ private final TokenWrapper tokenWrapper; /** @deprecated Remove this when old API is removed! */ private static boolean onlyUseNewAPI = false; /** @deprecated Remove this when old API is removed! */ private final MethodSupport supportedMethods = getSupportedMethods(this.getClass()); /** @deprecated Remove this when old API is removed! */ private static final class MethodSupport { final boolean hasIncrementToken, hasReusableNext, hasNext; MethodSupport(Class clazz) { hasIncrementToken = isMethodOverridden(clazz, "incrementToken", METHOD_NO_PARAMS); hasReusableNext = isMethodOverridden(clazz, "next", METHOD_TOKEN_PARAM); hasNext = isMethodOverridden(clazz, "next", METHOD_NO_PARAMS); } private static boolean isMethodOverridden(Class clazz, String name, Class[] params) { try { return clazz.getMethod(name, params).getDeclaringClass() != TokenStream.class; } catch (NoSuchMethodException e) { // should not happen throw new RuntimeException(e); } } private static final Class[] METHOD_NO_PARAMS = new Class[0]; private static final Class[] METHOD_TOKEN_PARAM = new Class[]{Token.class}; } /** @deprecated Remove this when old API is removed! */ private static final IdentityHashMap/*,MethodSupport>*/ knownMethodSupport = new IdentityHashMap(); /** @deprecated Remove this when old API is removed! */ private static MethodSupport getSupportedMethods(Class clazz) { MethodSupport supportedMethods; synchronized(knownMethodSupport) { supportedMethods = (MethodSupport) knownMethodSupport.get(clazz); if (supportedMethods == null) { knownMethodSupport.put(clazz, supportedMethods = new MethodSupport(clazz)); } } return supportedMethods; } /** @deprecated Remove this when old API is removed! */ private static final class TokenWrapperAttributeFactory extends AttributeFactory { private final AttributeFactory delegate; private TokenWrapperAttributeFactory(AttributeFactory delegate) { this.delegate = delegate; } public AttributeImpl createAttributeInstance(Class attClass) { return attClass.isAssignableFrom(TokenWrapper.class) ? new TokenWrapper() : delegate.createAttributeInstance(attClass); } // this is needed for TeeSinkTokenStream's check for compatibility of AttributeSource, // so two TokenStreams using old API have the same AttributeFactory wrapped by this one. public boolean equals(Object other) { if (this == other) return true; if (other instanceof TokenWrapperAttributeFactory) { final TokenWrapperAttributeFactory af = (TokenWrapperAttributeFactory) other; return this.delegate.equals(af.delegate); } return false; } public int hashCode() { return delegate.hashCode() ^ 0x0a45ff31; } } /** * A TokenStream using the default attribute factory. */ protected TokenStream() { super(onlyUseNewAPI ? AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY : TokenStream.DEFAULT_TOKEN_WRAPPER_ATTRIBUTE_FACTORY ); tokenWrapper = initTokenWrapper(null); check(); } /** * A TokenStream that uses the same attributes as the supplied one. 
*/ protected TokenStream(AttributeSource input) { super(input); tokenWrapper = initTokenWrapper(input); check(); } /** * A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances. */ protected TokenStream(AttributeFactory factory) { super(onlyUseNewAPI ? factory : new TokenWrapperAttributeFactory(factory) ); tokenWrapper = initTokenWrapper(null); check(); } /** @deprecated Remove this when old API is removed! */ private TokenWrapper initTokenWrapper(AttributeSource input) { if (onlyUseNewAPI) { // no wrapper needed return null; } else { // if possible get the wrapper from the filter's input stream if (input instanceof TokenStream && ((TokenStream) input).tokenWrapper != null) { return ((TokenStream) input).tokenWrapper; } // check that all attributes are implemented by the same TokenWrapper instance final Attribute att = addAttribute(TermAttribute.class); if (att instanceof TokenWrapper && addAttribute(TypeAttribute.class) == att && addAttribute(PositionIncrementAttribute.class) == att && addAttribute(FlagsAttribute.class) == att && addAttribute(OffsetAttribute.class) == att && addAttribute(PayloadAttribute.class) == att ) { return (TokenWrapper) att; } else { throw new UnsupportedOperationException( "If onlyUseNewAPI is disabled, all basic Attributes must be implemented by the internal class "+ "TokenWrapper. Please make sure, that all TokenStreams/TokenFilters in this chain have been "+ "instantiated with this flag disabled and do not add any custom instances for the basic Attributes!" ); } } } /** @deprecated Remove this when old API is removed! */ private void check() { if (onlyUseNewAPI && !supportedMethods.hasIncrementToken) { throw new UnsupportedOperationException(getClass().getName()+" does not implement incrementToken() which is needed for onlyUseNewAPI."); } // a TokenStream subclass must at least implement one of the methods! if (!(supportedMethods.hasIncrementToken || supportedMethods.hasNext || supportedMethods.hasReusableNext)) { throw new UnsupportedOperationException(getClass().getName()+" does not implement any of incrementToken(), next(Token), next()."); } } /** * For extra performance you can globally enable the new * {@link #incrementToken} API using {@link Attribute}s. There will be a * small, but in most cases negligible performance increase by enabling this, * but it only works if all TokenStreams use the new API and * implement {@link #incrementToken}. This setting can only be enabled * globally. *

 * This setting only affects TokenStreams instantiated after this
 * call. All TokenStreams already created use the other setting.
 *
 * All core {@link Analyzer}s are compatible with this setting; if you have
 * your own TokenStreams that are also compatible, you should enable this.
 *
 * When enabled, tokenization may throw {@link UnsupportedOperationException}s
 * if the whole tokenizer chain is not compatible, e.g. one of the
 * TokenStreams does not implement the new TokenStream API.
 *
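 * As a hypothetical usage sketch (not part of the original javadoc), an application
 * whose entire tokenizer chains use the new API could enable this once at startup,
 * before any analyzer or TokenStream is instantiated:
 *
 *   TokenStream.setOnlyUseNewAPI(true);
 *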

    * The default is false, so there is the fallback to the old API * available. * * @deprecated This setting will no longer be needed in Lucene 3.0 as the old * API will be removed. */ public static void setOnlyUseNewAPI(boolean onlyUseNewAPI) { TokenStream.onlyUseNewAPI = onlyUseNewAPI; } /** * Returns if only the new API is used. * * @see #setOnlyUseNewAPI * @deprecated This setting will no longer be needed in Lucene 3.0 as * the old API will be removed. */ public static boolean getOnlyUseNewAPI() { return onlyUseNewAPI; } /** * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to * the next token. Implementing classes must implement this method and update * the appropriate {@link AttributeImpl}s with the attributes of the next * token. *

 * The producer must make no assumptions about the attributes after the method
 * has been returned: the caller may arbitrarily change it. If the producer
 * needs to preserve the state for subsequent calls, it can use
 * {@link #captureState} to create a copy of the current attribute state.
 *
 * This method is called for every token of a document, so an efficient
 * implementation is crucial for good performance. To avoid calls to
 * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} or downcasts,
 * references to all {@link AttributeImpl}s that this stream uses should be
 * retrieved during instantiation.
 *
 * To ensure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers
 * are not required to check for availability of attributes in
 * {@link #incrementToken()}.
 *
 * @return false for end of stream; true otherwise
 *
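 * As a minimal sketch of a filter's incrementToken() (a hypothetical example, not
 * part of the original javadoc), retrieving its attribute once in the constructor
 * and updating it in place for every token:
 *
 *   public final class LowerCasingExampleFilter extends TokenFilter {
 *     private final TermAttribute termAtt;                               // retrieved once
 *     public LowerCasingExampleFilter(TokenStream input) {
 *       super(input);
 *       termAtt = (TermAttribute) addAttribute(TermAttribute.class);
 *     }
 *     public boolean incrementToken() throws IOException {
 *       if (!input.incrementToken()) return false;                       // end of stream
 *       final char[] buffer = termAtt.termBuffer();
 *       final int length = termAtt.termLength();
 *       for (int i = 0; i < length; i++)
 *         buffer[i] = Character.toLowerCase(buffer[i]);                  // modify the shared attribute
 *       return true;
 *     }
 *   }
 *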

    * Note that this method will be defined abstract in Lucene * 3.0. */ public boolean incrementToken() throws IOException { assert tokenWrapper != null; final Token token; if (supportedMethods.hasReusableNext) { token = next(tokenWrapper.delegate); } else { assert supportedMethods.hasNext; token = next(); } if (token == null) return false; tokenWrapper.delegate = token; return true; } /** * This method is called by the consumer after the last token has been * consumed, after {@link #incrementToken()} returned false * (using the new TokenStream API). Streams implementing the old API * should upgrade to use this feature. *

    * This method can be used to perform any end-of-stream operations, such as * setting the final offset of a stream. The final offset of a stream might * differ from the offset of the last token eg in case one or more whitespaces * followed after the last token, but a {@link WhitespaceTokenizer} was used. * * @throws IOException */ public void end() throws IOException { // do nothing by default } /** * Returns the next token in the stream, or null at EOS. When possible, the * input Token should be used as the returned Token (this gives fastest * tokenization performance), but this is not required and a new Token may be * returned. Callers may re-use a single Token instance for successive calls * to this method. *

 * This implicitly defines a "contract" between consumers (callers of this
 * method) and producers (implementations of this method that are the source
 * for tokens):
 *   • A consumer must fully consume the previously returned {@link Token}
 *     before calling this method again.
 *   • A producer must call {@link Token#clear()} before setting the fields in
 *     it and returning it.
 *
    * Also, the producer must make no assumptions about a {@link Token} after it * has been returned: the caller may arbitrarily change it. If the producer * needs to hold onto the {@link Token} for subsequent calls, it must clone() * it before storing it. Note that a {@link TokenFilter} is considered a * consumer. * * @param reusableToken a {@link Token} that may or may not be used to return; * this parameter should never be null (the callee is not required to * check for null before using it, but it is a good idea to assert that * it is not null.) * @return next {@link Token} in the stream or null if end-of-stream was hit * @deprecated The new {@link #incrementToken()} and {@link AttributeSource} * APIs should be used instead. */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; if (tokenWrapper == null) throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API."); if (supportedMethods.hasIncrementToken) { tokenWrapper.delegate = reusableToken; return incrementToken() ? tokenWrapper.delegate : null; } else { assert supportedMethods.hasNext; return next(); } } /** * Returns the next {@link Token} in the stream, or null at EOS. * * @deprecated The returned Token is a "full private copy" (not re-used across * calls to {@link #next()}) but will be slower than calling * {@link #next(Token)} or using the new {@link #incrementToken()} * method with the new {@link AttributeSource} API. */ public Token next() throws IOException { if (tokenWrapper == null) throw new UnsupportedOperationException("This TokenStream only supports the new Attributes API."); final Token nextToken; if (supportedMethods.hasIncrementToken) { final Token savedDelegate = tokenWrapper.delegate; tokenWrapper.delegate = new Token(); nextToken = incrementToken() ? tokenWrapper.delegate : null; tokenWrapper.delegate = savedDelegate; } else { assert supportedMethods.hasReusableNext; nextToken = next(new Token()); } if (nextToken != null) { Payload p = nextToken.getPayload(); if (p != null) { nextToken.setPayload((Payload) p.clone()); } } return nextToken; } /** * Resets this stream to the beginning. This is an optional operation, so * subclasses may or may not implement this method. {@link #reset()} is not needed for * the standard indexing process. However, if the tokens of a * TokenStream are intended to be consumed more than once, it is * necessary to implement {@link #reset()}. Note that if your TokenStream * caches tokens and feeds them back again after a reset, it is imperative * that you clone the tokens when you store them away (on the first pass) as * well as when you return them (on future passes after {@link #reset()}). */ public void reset() throws IOException {} /** Releases resources associated with this stream. */ public void close() throws IOException {} } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharacterCache.java0000644000175000017500000000270611474320222025756 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; /** * Replacement for Java 1.5 Character.valueOf() * @deprecated Move to Character.valueOf() in 3.0 */ class CharacterCache { private static final Character cache[] = new Character[128]; static { for (int i = 0; i < cache.length; i++) { cache[i] = new Character((char) i); } } /** * Returns a Character instance representing the given char value * * @param c * a char value * @return a Character representation of the given char value. */ public static Character valueOf(char c) { if (c < cache.length) { return cache[(int) c]; } return new Character(c); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/TeeTokenFilter.java0000644000175000017500000000627111474320222026023 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; /** * Works in conjunction with the SinkTokenizer to provide the ability to set aside tokens * that have already been analyzed. This is useful in situations where multiple fields share * many common analysis steps and then go their separate ways. *

 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 *   SinkTokenizer sink1 = new SinkTokenizer();
 *   SinkTokenizer sink2 = new SinkTokenizer();
 *
 *   TokenStream source1 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader1), sink1), sink2);
 *   TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(reader2), sink1), sink2);
 *
 *   TokenStream final1 = new LowerCaseFilter(source1);
 *   TokenStream final2 = source2;
 *   TokenStream final3 = new EntityDetect(sink1);
 *   TokenStream final4 = new URLDetect(sink2);
 *
 *   d.add(new Field("f1", final1));
 *   d.add(new Field("f2", final2));
 *   d.add(new Field("f3", final3));
 *   d.add(new Field("f4", final4));
 *
 * In this example, sink1 and sink2 will both get tokens from both
 * reader1 and reader2 after the whitespace tokenizer, and now we can further wrap
 * any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the
 * tee field names must be less than the sink's field names).
 * Note, the EntityDetect and URLDetect TokenStreams are for the example and do not
 * currently exist in Lucene.
 *
 * See LUCENE-1058.
 *
    * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API. * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers * the same functionality. * @see SinkTokenizer * @deprecated Use {@link TeeSinkTokenFilter} instead **/ public class TeeTokenFilter extends TokenFilter { SinkTokenizer sink; public TeeTokenFilter(TokenStream input, SinkTokenizer sink) { super(input); this.sink = sink; } public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; Token nextToken = input.next(reusableToken); sink.add(nextToken); return nextToken; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/BaseCharFilter.java0000644000175000017500000000462011474320222025751 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import org.apache.lucene.util.ArrayUtil; /** * Base utility class for implementing a {@link CharFilter}. * You subclass this, and then record mappings by calling * {@link #addOffCorrectMap}, and then invoke the correct * method to correct an offset. */ public abstract class BaseCharFilter extends CharFilter { private int offsets[]; private int diffs[]; private int size = 0; public BaseCharFilter(CharStream in) { super(in); } /** Retrieve the corrected offset. */ //@Override protected int correct(int currentOff) { if (offsets == null || currentOff < offsets[0]) { return currentOff; } int hi = size - 1; if(currentOff >= offsets[hi]) return currentOff + diffs[hi]; int lo = 0; int mid = -1; while (hi >= lo) { mid = (lo + hi) >>> 1; if (currentOff < offsets[mid]) hi = mid - 1; else if (currentOff > offsets[mid]) lo = mid + 1; else return currentOff + diffs[mid]; } if (currentOff < offsets[mid]) return mid == 0 ? currentOff : currentOff + diffs[mid-1]; else return currentOff + diffs[mid]; } protected int getLastCumulativeDiff() { return offsets == null ? 0 : diffs[size-1]; } protected void addOffCorrectMap(int off, int cumulativeDiff) { if (offsets == null) { offsets = new int[64]; diffs = new int[64]; } else if (size == offsets.length) { offsets = ArrayUtil.grow(offsets); diffs = ArrayUtil.grow(diffs); } offsets[size] = off; diffs[size++] = cumulativeDiff; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java0000644000175000017500000000420211474505315026716 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import org.apache.lucene.util.AttributeSource; /** * LowerCaseTokenizer performs the function of LetterTokenizer * and LowerCaseFilter together. It divides text at non-letters and converts * them to lower case. While it is functionally equivalent to the combination * of LetterTokenizer and LowerCaseFilter, there is a performance advantage * to doing the two tasks at once, hence this (redundant) implementation. *
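 * As a hypothetical illustration (not part of the original javadoc; the reader is a
 * placeholder), the following two streams produce identical tokens:
 *
 *   TokenStream a = new LowerCaseTokenizer(reader);                       // single pass
 *   TokenStream b = new LowerCaseFilter(new LetterTokenizer(reader));     // equivalent two-stage chain
 *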

    * Note: this does a decent job for most European languages, but does a terrible * job for some Asian languages, where words are not separated by spaces. */ public final class LowerCaseTokenizer extends LetterTokenizer { /** Construct a new LowerCaseTokenizer. */ public LowerCaseTokenizer(Reader in) { super(in); } /** Construct a new LowerCaseTokenizer using a given {@link AttributeSource}. */ public LowerCaseTokenizer(AttributeSource source, Reader in) { super(source, in); } /** Construct a new LowerCaseTokenizer using a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. */ public LowerCaseTokenizer(AttributeFactory factory, Reader in) { super(factory, in); } /** Converts char to lower case * {@link Character#toLowerCase(char)}.*/ protected char normalize(char c) { return Character.toLowerCase(c); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharReader.java0000644000175000017500000000352511474320222025136 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; /** * CharReader is a Reader wrapper. It reads chars from * Reader and outputs {@link CharStream}, defining an * identify function {@link #correctOffset} method that * simply returns the provided offset. */ public final class CharReader extends CharStream { protected Reader input; public static CharStream get(Reader input) { return input instanceof CharStream ? (CharStream)input : new CharReader(input); } private CharReader(Reader in) { input = in; } public int correctOffset(int currentOff) { return currentOff; } public void close() throws IOException { input.close(); } public int read(char[] cbuf, int off, int len) throws IOException { return input.read(cbuf, off, len); } public boolean markSupported(){ return input.markSupported(); } public void mark( int readAheadLimit ) throws IOException { input.mark(readAheadLimit); } public void reset() throws IOException { input.reset(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharStream.java0000644000175000017500000000310111474320222025155 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.Reader; /** * CharStream adds {@link #correctOffset} * functionality over {@link Reader}. All Tokenizers accept a * CharStream instead of {@link Reader} as input, which enables * arbitrary character based filtering before tokenization. * The {@link #correctOffset} method fixed offsets to account for * removal or insertion of characters, so that the offsets * reported in the tokens match the character offsets of the * original Reader. */ public abstract class CharStream extends Reader { /** * Called by CharFilter(s) and Tokenizer to correct token offset. * * @param currentOff offset as seen in the output * @return corrected offset based on the input */ public abstract int correctOffset(int currentOff); } lucene-2.9.4/src/java/org/apache/lucene/analysis/NumericTokenStream.java0000644000175000017500000002255011474320222026714 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs import org.apache.lucene.search.SortField; // for javadocs import org.apache.lucene.search.FieldCache; // javadocs import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** * Expert: This class provides a {@link TokenStream} * for indexing numeric values that can be used by {@link * NumericRangeQuery} or {@link NumericRangeFilter}. * *

 * Note that for simple usage, {@link NumericField} is
 * recommended. {@link NumericField} disables norms and
 * term freqs, as they are not usually needed during
 * searching. If you need to change these settings, you
 * should use this class.
 *
 * See {@link NumericField} for capabilities of fields
 * indexed numerically.
 *
 * Here's an example usage, for an int field:
 *
 *   Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value));
 *   field.setOmitNorms(true);
 *   field.setOmitTermFreqAndPositions(true);
 *   document.add(field);
 *
 * For optimal performance, re-use the TokenStream and Field instance
 * for more than one document:
 *
 *   NumericTokenStream stream = new NumericTokenStream(precisionStep);
 *   Field field = new Field(name, stream);
 *   field.setOmitNorms(true);
 *   field.setOmitTermFreqAndPositions(true);
 *   Document document = new Document();
 *   document.add(field);
 *
 *   for(all documents) {
 *     stream.setIntValue(value)
 *     writer.addDocument(document);
 *   }
 *
 * This stream is not intended to be used in analyzers;
 * it's more for iterating the different precisions during
 * indexing a specific numeric value.
 *
 * NOTE: as token streams are only consumed once
 * the document is added to the index, if you index more
 * than one numeric field, use a separate NumericTokenStream
 * instance for each.
 *
 * See {@link NumericRangeQuery} for more details on the
 * precisionStep parameter as well as how numeric fields work under the hood.
 *
    NOTE: This API is experimental and * might change in incompatible ways in the next release. * * @since 2.9 */ public final class NumericTokenStream extends TokenStream { /** The full precision token gets this token type assigned. */ public static final String TOKEN_TYPE_FULL_PREC = "fullPrecNumeric"; /** The lower precision tokens gets this token type assigned. */ public static final String TOKEN_TYPE_LOWER_PREC = "lowerPrecNumeric"; /** * Creates a token stream for numeric values using the default precisionStep * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The stream is not yet initialized, * before using set a value using the various set???Value() methods. */ public NumericTokenStream() { this(NumericUtils.PRECISION_STEP_DEFAULT); } /** * Creates a token stream for numeric values with the specified * precisionStep. The stream is not yet initialized, * before using set a value using the various set???Value() methods. */ public NumericTokenStream(final int precisionStep) { super(); this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); } /** * Expert: Creates a token stream for numeric values with the specified * precisionStep using the given {@link AttributeSource}. * The stream is not yet initialized, * before using set a value using the various set???Value() methods. */ public NumericTokenStream(AttributeSource source, final int precisionStep) { super(source); this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); } /** * Expert: Creates a token stream for numeric values with the specified * precisionStep using the given * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. * The stream is not yet initialized, * before using set a value using the various set???Value() methods. */ public NumericTokenStream(AttributeFactory factory, final int precisionStep) { super(factory); this.precisionStep = precisionStep; if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); } /** * Initializes the token stream with the supplied long value. * @param value the value, for which this TokenStream should enumerate tokens. * @return this instance, because of this you can use it the following way: * new Field(name, new NumericTokenStream(precisionStep).setLongValue(value)) */ public NumericTokenStream setLongValue(final long value) { this.value = value; valSize = 64; shift = 0; return this; } /** * Initializes the token stream with the supplied int value. * @param value the value, for which this TokenStream should enumerate tokens. * @return this instance, because of this you can use it the following way: * new Field(name, new NumericTokenStream(precisionStep).setIntValue(value)) */ public NumericTokenStream setIntValue(final int value) { this.value = (long) value; valSize = 32; shift = 0; return this; } /** * Initializes the token stream with the supplied double value. * @param value the value, for which this TokenStream should enumerate tokens. * @return this instance, because of this you can use it the following way: * new Field(name, new NumericTokenStream(precisionStep).setDoubleValue(value)) */ public NumericTokenStream setDoubleValue(final double value) { this.value = NumericUtils.doubleToSortableLong(value); valSize = 64; shift = 0; return this; } /** * Initializes the token stream with the supplied float value. * @param value the value, for which this TokenStream should enumerate tokens. 
* @return this instance, because of this you can use it the following way: * new Field(name, new NumericTokenStream(precisionStep).setFloatValue(value)) */ public NumericTokenStream setFloatValue(final float value) { this.value = (long) NumericUtils.floatToSortableInt(value); valSize = 32; shift = 0; return this; } // @Override public void reset() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); shift = 0; } // @Override public boolean incrementToken() { if (valSize == 0) throw new IllegalStateException("call set???Value() before usage"); if (shift >= valSize) return false; clearAttributes(); final char[] buffer; switch (valSize) { case 64: buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG); termAtt.setTermLength(NumericUtils.longToPrefixCoded(value, shift, buffer)); break; case 32: buffer = termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_INT); termAtt.setTermLength(NumericUtils.intToPrefixCoded((int) value, shift, buffer)); break; default: // should not happen throw new IllegalArgumentException("valSize must be 32 or 64"); } typeAtt.setType((shift == 0) ? TOKEN_TYPE_FULL_PREC : TOKEN_TYPE_LOWER_PREC); posIncrAtt.setPositionIncrement((shift == 0) ? 1 : 0); shift += precisionStep; return true; } // @Override public String toString() { final StringBuffer sb = new StringBuffer("(numeric,valSize=").append(valSize); sb.append(",precisionStep=").append(precisionStep).append(')'); return sb.toString(); } // members private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class); private final TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); private final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); private int shift = 0, valSize = 0; // valSize==0 means not initialized private final int precisionStep; private long value = 0L; } lucene-2.9.4/src/java/org/apache/lucene/analysis/KeywordTokenizer.java0000644000175000017500000000654411474320222026461 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.AttributeSource; /** * Emits the entire input as a single token. 
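 * As a hypothetical sketch (not part of the original javadoc), tokenizing a string this
 * way yields exactly one token whose term text is the whole input:
 *
 *   TokenStream stream = new KeywordTokenizer(new StringReader("C-123 alpha"));
 *   TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
 *   stream.incrementToken();            // returns true once; termAtt.term() is "C-123 alpha"
 *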
*/ public class KeywordTokenizer extends Tokenizer { private static final int DEFAULT_BUFFER_SIZE = 256; private boolean done; private int finalOffset; private TermAttribute termAtt; private OffsetAttribute offsetAtt; public KeywordTokenizer(Reader input) { this(input, DEFAULT_BUFFER_SIZE); } public KeywordTokenizer(Reader input, int bufferSize) { super(input); init(bufferSize); } public KeywordTokenizer(AttributeSource source, Reader input, int bufferSize) { super(source, input); init(bufferSize); } public KeywordTokenizer(AttributeFactory factory, Reader input, int bufferSize) { super(factory, input); init(bufferSize); } private void init(int bufferSize) { this.done = false; termAtt = (TermAttribute) addAttribute(TermAttribute.class); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); termAtt.resizeTermBuffer(bufferSize); } public final boolean incrementToken() throws IOException { if (!done) { clearAttributes(); done = true; int upto = 0; char[] buffer = termAtt.termBuffer(); while (true) { final int length = input.read(buffer, upto, buffer.length-upto); if (length == -1) break; upto += length; if (upto == buffer.length) buffer = termAtt.resizeTermBuffer(1+buffer.length); } termAtt.setTermLength(upto); finalOffset = correctOffset(upto); offsetAtt.setOffset(correctOffset(0), finalOffset); return true; } return false; } public final void end() { // set final offset offsetAtt.setOffset(finalOffset, finalOffset); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next() throws IOException { return super.next(); } public void reset(Reader input) throws IOException { super.reset(input); this.done = false; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/LengthFilter.java0000644000175000017500000000365611474505315025542 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Removes words that are too long or too short from the stream. * * * @version $Id: LengthFilter.java 1039905 2010-11-28 16:58:14Z uschindler $ */ public final class LengthFilter extends TokenFilter { final int min; final int max; private TermAttribute termAtt; /** * Build a filter that removes words that are too long or too * short from the text. 
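 * As a hypothetical usage sketch (not part of the original javadoc; the reader is a
 * placeholder), keeping only terms between 3 and 7 characters long:
 *
 *   TokenStream stream = new LengthFilter(new WhitespaceTokenizer(reader), 3, 7);
 *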
*/ public LengthFilter(TokenStream in, int min, int max) { super(in); this.min = min; this.max = max; termAtt = (TermAttribute) addAttribute(TermAttribute.class); } /** * Returns the next input Token whose term() is the right len */ public final boolean incrementToken() throws IOException { // return the first non-stop word found while (input.incrementToken()) { int len = termAtt.termLength(); if (len >= min && len <= max) { return true; } // note: else we ignore it but should we index each part of it? } // reached EOS -- return null return false; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/Analyzer.java0000644000175000017500000001242111474320222024716 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import java.io.IOException; import java.lang.reflect.Method; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.document.Fieldable; /** An Analyzer builds TokenStreams, which analyze text. It thus represents a * policy for extracting index terms from text. *

    * Typical implementations first build a Tokenizer, which breaks the stream of * characters from the Reader into raw Tokens. One or more TokenFilters may * then be applied to the output of the Tokenizer. */ public abstract class Analyzer { /** Creates a TokenStream which tokenizes all the text in the provided * Reader. Must be able to handle null field name for * backward compatibility. */ public abstract TokenStream tokenStream(String fieldName, Reader reader); /** Creates a TokenStream that is allowed to be re-used * from the previous time that the same thread called * this method. Callers that do not need to use more * than one TokenStream at the same time from this * analyzer should use this method for better * performance. */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { return tokenStream(fieldName, reader); } private CloseableThreadLocal tokenStreams = new CloseableThreadLocal(); /** Used by Analyzers that implement reusableTokenStream * to retrieve previously saved TokenStreams for re-use * by the same thread. */ protected Object getPreviousTokenStream() { try { return tokenStreams.get(); } catch (NullPointerException npe) { if (tokenStreams == null) { throw new AlreadyClosedException("this Analyzer is closed"); } else { throw npe; } } } /** Used by Analyzers that implement reusableTokenStream * to save a TokenStream for later re-use by the same * thread. */ protected void setPreviousTokenStream(Object obj) { try { tokenStreams.set(obj); } catch (NullPointerException npe) { if (tokenStreams == null) { throw new AlreadyClosedException("this Analyzer is closed"); } else { throw npe; } } } protected boolean overridesTokenStreamMethod; /** @deprecated This is only present to preserve * back-compat of classes that subclass a core analyzer * and override tokenStream but not reusableTokenStream */ protected void setOverridesTokenStreamMethod(Class baseClass) { final Class[] params = new Class[2]; params[0] = String.class; params[1] = Reader.class; try { Method m = this.getClass().getMethod("tokenStream", params); if (m != null) { overridesTokenStreamMethod = m.getDeclaringClass() != baseClass; } else { overridesTokenStreamMethod = false; } } catch (NoSuchMethodException nsme) { overridesTokenStreamMethod = false; } } /** * Invoked before indexing a Fieldable instance if * terms have already been added to that field. This allows custom * analyzers to place an automatic position increment gap between * Fieldable instances using the same field name. The default value * position increment gap is 0. With a 0 position increment gap and * the typical default token position increment of 1, all terms in a field, * including across Fieldable instances, are in successive positions, allowing * exact PhraseQuery matches, for instance, across Fieldable instance boundaries. * * @param fieldName Fieldable name being indexed. * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)} */ public int getPositionIncrementGap(String fieldName) { return 0; } /** * Just like {@link #getPositionIncrementGap}, except for * Token offsets instead. By default this returns 1 for * tokenized fields and, as if the fields were joined * with an extra space character, and 0 for un-tokenized * fields. This method is only called if the field * produced at least one token for indexing. 
* * @param field the field just indexed * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)} */ public int getOffsetGap(Fieldable field) { if (field.isTokenized()) return 1; else return 0; } /** Frees persistent resources used by this Analyzer */ public void close() { tokenStreams.close(); tokenStreams = null; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharTokenizer.java0000644000175000017500000001133511474320222025704 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.AttributeSource; /** An abstract base class for simple, character-oriented tokenizers.*/ public abstract class CharTokenizer extends Tokenizer { public CharTokenizer(Reader input) { super(input); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public CharTokenizer(AttributeSource source, Reader input) { super(source, input); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public CharTokenizer(AttributeFactory factory, Reader input) { super(factory, input); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private int offset = 0, bufferIndex = 0, dataLen = 0; private static final int MAX_WORD_LEN = 255; private static final int IO_BUFFER_SIZE = 4096; private final char[] ioBuffer = new char[IO_BUFFER_SIZE]; private TermAttribute termAtt; private OffsetAttribute offsetAtt; /** Returns true iff a character should be included in a token. This * tokenizer generates as tokens adjacent sequences of characters which * satisfy this predicate. Characters for which this is false are used to * define token boundaries and are not included in tokens. */ protected abstract boolean isTokenChar(char c); /** Called on each token character to normalize it before it is added to the * token. The default implementation does nothing. Subclasses may use this * to, e.g., lowercase tokens. 
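 * As a hypothetical sketch (not part of the original javadoc), a subclass can combine
 * isTokenChar() and normalize() to define both token boundaries and per-character
 * normalization, for example a lower-casing alphanumeric tokenizer:
 *
 *   public final class AlphanumericExampleTokenizer extends CharTokenizer {
 *     public AlphanumericExampleTokenizer(Reader in) { super(in); }
 *     protected boolean isTokenChar(char c) { return Character.isLetterOrDigit(c); }   // token boundaries
 *     protected char normalize(char c) { return Character.toLowerCase(c); }            // per-char normalization
 *   }
 *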
*/ protected char normalize(char c) { return c; } public final boolean incrementToken() throws IOException { clearAttributes(); int length = 0; int start = bufferIndex; char[] buffer = termAtt.termBuffer(); while (true) { if (bufferIndex >= dataLen) { offset += dataLen; dataLen = input.read(ioBuffer); if (dataLen == -1) { dataLen = 0; // so next offset += dataLen won't decrement offset if (length > 0) break; else return false; } bufferIndex = 0; } final char c = ioBuffer[bufferIndex++]; if (isTokenChar(c)) { // if it's a token char if (length == 0) // start of token start = offset + bufferIndex - 1; else if (length == buffer.length) buffer = termAtt.resizeTermBuffer(1+length); buffer[length++] = normalize(c); // buffer it, normalized if (length == MAX_WORD_LEN) // buffer overflow! break; } else if (length > 0) // at non-Letter w/ chars break; // return 'em } termAtt.setTermLength(length); offsetAtt.setOffset(correctOffset(start), correctOffset(start+length)); return true; } public final void end() { // set final offset int finalOffset = correctOffset(offset); offsetAtt.setOffset(finalOffset, finalOffset); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next() throws IOException { return super.next(); } public void reset(Reader input) throws IOException { super.reset(input); bufferIndex = 0; offset = 0; dataLen = 0; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java0000644000175000017500000002040411474320222026642 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.lang.ref.WeakReference; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; /** * This TokenFilter provides the ability to set aside attribute states * that have already been analyzed. This is useful in situations where multiple fields share * many common analysis steps and then go their separate ways. *

 * It is also useful for doing things like entity extraction or proper noun analysis as
 * part of the analysis workflow and saving off those tokens for use in another field.
 *
 *   TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
 *   TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
 *   TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
 *
 *   TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
 *   source2.addSinkTokenStream(sink1);
 *   source2.addSinkTokenStream(sink2);
 *
 *   TokenStream final1 = new LowerCaseFilter(source1);
 *   TokenStream final2 = source2;
 *   TokenStream final3 = new EntityDetect(sink1);
 *   TokenStream final4 = new URLDetect(sink2);
 *
 *   d.add(new Field("f1", final1));
 *   d.add(new Field("f2", final2));
 *   d.add(new Field("f3", final3));
 *   d.add(new Field("f4", final4));
 *
 * In this example, sink1 and sink2 will both get tokens from both
 * reader1 and reader2 after the whitespace tokenizer, and now we can further wrap
 * any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important that tees are consumed before sinks (in the above example, the
 * tee field names must be less than the sink's field names). If you are not sure which
 * stream is consumed first, you can simply add another sink and then pass all tokens
 * to the sinks at once using {@link #consumeAllTokens}.
 * This TokenFilter is exhausted after this. In the above example, change
 * the example above to:
 *
 *   ...
 *   TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
 *   TokenStream final2 = source2.newSinkTokenStream();
 *   sink1.consumeAllTokens();
 *   sink2.consumeAllTokens();
 *   ...
 *
 * In this case, the fields can be added in any order, because the sources are not used
 * anymore and all sinks are ready.
 *
    Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene. */ public final class TeeSinkTokenFilter extends TokenFilter { private final List sinks = new LinkedList(); /** * Instantiates a new TeeSinkTokenFilter. */ public TeeSinkTokenFilter(TokenStream input) { super(input); } /** * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream. */ public SinkTokenStream newSinkTokenStream() { return newSinkTokenStream(ACCEPT_ALL_FILTER); } /** * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream * that pass the supplied filter. * @see SinkFilter */ public SinkTokenStream newSinkTokenStream(SinkFilter filter) { SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter); this.sinks.add(new WeakReference(sink)); return sink; } /** * Adds a {@link SinkTokenStream} created by another TeeSinkTokenFilter * to this one. The supplied stream will also receive all consumed tokens. * This method can be used to pass tokens from two different tees to one sink. */ public void addSinkTokenStream(final SinkTokenStream sink) { // check that sink has correct factory if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) { throw new IllegalArgumentException("The supplied sink is not compatible to this tee"); } // add eventually missing attribute impls to the existing sink for (Iterator it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) { sink.addAttributeImpl((AttributeImpl) it.next()); } this.sinks.add(new WeakReference(sink)); } /** * TeeSinkTokenFilter passes all tokens to the added sinks * when itself is consumed. To be sure, that all tokens from the input * stream are passed to the sinks, you can call this methods. * This instance is exhausted after this, but all sinks are instant available. */ public void consumeAllTokens() throws IOException { while (incrementToken()); } public boolean incrementToken() throws IOException { if (input.incrementToken()) { // capture state lazily - maybe no SinkFilter accepts this state AttributeSource.State state = null; for (Iterator it = sinks.iterator(); it.hasNext(); ) { final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get(); if (sink != null) { if (sink.accept(this)) { if (state == null) { state = this.captureState(); } sink.addState(state); } } } return true; } return false; } public final void end() throws IOException { super.end(); AttributeSource.State finalState = captureState(); for (Iterator it = sinks.iterator(); it.hasNext(); ) { final SinkTokenStream sink = (SinkTokenStream) ((WeakReference) it.next()).get(); if (sink != null) { sink.setFinalState(finalState); } } } /** * A filter that decides which {@link AttributeSource} states to store in the sink. */ public static abstract class SinkFilter { /** * Returns true, iff the current state of the passed-in {@link AttributeSource} shall be stored * in the sink. */ public abstract boolean accept(AttributeSource source); /** * Called by {@link SinkTokenStream#reset()}. This method does nothing by default * and can optionally be overridden. 
*/ public void reset() throws IOException { // nothing to do; can be overridden } } public static final class SinkTokenStream extends TokenStream { private final List cachedStates = new LinkedList(); private AttributeSource.State finalState; private Iterator it = null; private SinkFilter filter; private SinkTokenStream(AttributeSource source, SinkFilter filter) { super(source); this.filter = filter; } private boolean accept(AttributeSource source) { return filter.accept(source); } private void addState(AttributeSource.State state) { if (it != null) { throw new IllegalStateException("The tee must be consumed before sinks are consumed."); } cachedStates.add(state); } private void setFinalState(AttributeSource.State finalState) { this.finalState = finalState; } public final boolean incrementToken() throws IOException { // lazy init the iterator if (it == null) { it = cachedStates.iterator(); } if (!it.hasNext()) { return false; } AttributeSource.State state = (State) it.next(); restoreState(state); return true; } public final void end() throws IOException { if (finalState != null) { restoreState(finalState); } } public final void reset() { it = cachedStates.iterator(); } } private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() { public boolean accept(AttributeSource source) { return true; } }; } lucene-2.9.4/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java0000644000175000017500000002030611474320222027121 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A filter that replaces accented characters in the ISO Latin 1 character set * (ISO-8859-1) by their unaccented equivalent. The case will not be altered. *

    * For instance, 'à' will be replaced by 'a'. *
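    As a rough usage sketch (the input string and the WhitespaceTokenizer source are illustrative assumptions, not part of this class):

    TokenStream ts = new ISOLatin1AccentFilter(new WhitespaceTokenizer(new StringReader("déjà vu")));
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());         // prints "deja" then "vu"
    }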

    * * @deprecated in favor of {@link ASCIIFoldingFilter} which covers a superset * of Latin 1. This class will be removed in Lucene 3.0. */ public class ISOLatin1AccentFilter extends TokenFilter { public ISOLatin1AccentFilter(TokenStream input) { super(input); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private char[] output = new char[256]; private int outputPos; private TermAttribute termAtt; public final boolean incrementToken() throws java.io.IOException { if (input.incrementToken()) { final char[] buffer = termAtt.termBuffer(); final int length = termAtt.termLength(); // If no characters actually require rewriting then we // just return token as-is: for(int i=0;i= '\u00c0' && c <= '\uFB06') { removeAccents(buffer, length); termAtt.setTermBuffer(output, 0, outputPos); break; } } return true; } else return false; } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws java.io.IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next() throws java.io.IOException { return super.next(); } /** * To replace accented characters in a String by unaccented equivalents. */ public final void removeAccents(char[] input, int length) { // Worst-case length required: final int maxSizeNeeded = 2*length; int size = output.length; while (size < maxSizeNeeded) size *= 2; if (size != output.length) output = new char[size]; outputPos = 0; int pos = 0; for (int i=0; i '\uFB06') output[outputPos++] = c; else { switch (c) { case '\u00C0' : // À case '\u00C1' : // à case '\u00C2' : //  case '\u00C3' : // à case '\u00C4' : // Ä case '\u00C5' : // Ã… output[outputPos++] = 'A'; break; case '\u00C6' : // Æ output[outputPos++] = 'A'; output[outputPos++] = 'E'; break; case '\u00C7' : // Ç output[outputPos++] = 'C'; break; case '\u00C8' : // È case '\u00C9' : // É case '\u00CA' : // Ê case '\u00CB' : // Ë output[outputPos++] = 'E'; break; case '\u00CC' : // ÃŒ case '\u00CD' : // à case '\u00CE' : // ÃŽ case '\u00CF' : // à output[outputPos++] = 'I'; break; case '\u0132' : // IJ output[outputPos++] = 'I'; output[outputPos++] = 'J'; break; case '\u00D0' : // à output[outputPos++] = 'D'; break; case '\u00D1' : // Ñ output[outputPos++] = 'N'; break; case '\u00D2' : // Ã’ case '\u00D3' : // Ó case '\u00D4' : // Ô case '\u00D5' : // Õ case '\u00D6' : // Ö case '\u00D8' : // Ø output[outputPos++] = 'O'; break; case '\u0152' : // Å’ output[outputPos++] = 'O'; output[outputPos++] = 'E'; break; case '\u00DE' : // Þ output[outputPos++] = 'T'; output[outputPos++] = 'H'; break; case '\u00D9' : // Ù case '\u00DA' : // Ú case '\u00DB' : // Û case '\u00DC' : // Ü output[outputPos++] = 'U'; break; case '\u00DD' : // à case '\u0178' : // Ÿ output[outputPos++] = 'Y'; break; case '\u00E0' : // à case '\u00E1' : // á case '\u00E2' : // â case '\u00E3' : // ã case '\u00E4' : // ä case '\u00E5' : // Ã¥ output[outputPos++] = 'a'; break; case '\u00E6' : // æ output[outputPos++] = 'a'; output[outputPos++] = 'e'; break; case '\u00E7' : // ç output[outputPos++] = 'c'; break; case '\u00E8' : // è case '\u00E9' : // é case '\u00EA' : // ê case '\u00EB' : // ë output[outputPos++] = 'e'; break; case '\u00EC' : // ì case '\u00ED' : // í case '\u00EE' : // î case '\u00EF' : // ï output[outputPos++] = 'i'; 
break; case '\u0133' : // ij output[outputPos++] = 'i'; output[outputPos++] = 'j'; break; case '\u00F0' : // ð output[outputPos++] = 'd'; break; case '\u00F1' : // ñ output[outputPos++] = 'n'; break; case '\u00F2' : // ò case '\u00F3' : // ó case '\u00F4' : // ô case '\u00F5' : // õ case '\u00F6' : // ö case '\u00F8' : // ø output[outputPos++] = 'o'; break; case '\u0153' : // Å“ output[outputPos++] = 'o'; output[outputPos++] = 'e'; break; case '\u00DF' : // ß output[outputPos++] = 's'; output[outputPos++] = 's'; break; case '\u00FE' : // þ output[outputPos++] = 't'; output[outputPos++] = 'h'; break; case '\u00F9' : // ù case '\u00FA' : // ú case '\u00FB' : // û case '\u00FC' : // ü output[outputPos++] = 'u'; break; case '\u00FD' : // ý case '\u00FF' : // ÿ output[outputPos++] = 'y'; break; case '\uFB00': // ff output[outputPos++] = 'f'; output[outputPos++] = 'f'; break; case '\uFB01': // ï¬ output[outputPos++] = 'f'; output[outputPos++] = 'i'; break; case '\uFB02': // fl output[outputPos++] = 'f'; output[outputPos++] = 'l'; break; // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive) // case '\uFB03': // ffi // output[outputPos++] = 'f'; // output[outputPos++] = 'f'; // output[outputPos++] = 'i'; // break; // case '\uFB04': // ffl // output[outputPos++] = 'f'; // output[outputPos++] = 'f'; // output[outputPos++] = 'l'; // break; case '\uFB05': // ſt output[outputPos++] = 'f'; output[outputPos++] = 't'; break; case '\uFB06': // st output[outputPos++] = 's'; output[outputPos++] = 't'; break; default : output[outputPos++] = c; break; } } } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharFilter.java0000644000175000017500000000413511474320222025157 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; /** * Subclasses of CharFilter can be chained to filter CharStream. * They can be used as {@link java.io.Reader} with additional offset * correction. {@link Tokenizer}s will automatically use {@link #correctOffset} * if a CharFilter/CharStream subclass is used. * * @version $Id$ * */ public abstract class CharFilter extends CharStream { protected CharStream input; protected CharFilter(CharStream in) { input = in; } /** * Subclass may want to override to correct the current offset. * * @param currentOff current offset * @return corrected offset */ protected int correct(int currentOff) { return currentOff; } /** * Chains the corrected offset through the input * CharFilter. 
*/ public final int correctOffset(int currentOff) { return input.correctOffset(correct(currentOff)); } public void close() throws IOException { input.close(); } public int read(char[] cbuf, int off, int len) throws IOException { return input.read(cbuf, off, len); } public boolean markSupported(){ return input.markSupported(); } public void mark( int readAheadLimit ) throws IOException { input.mark(readAheadLimit); } public void reset() throws IOException { input.reset(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/PorterStemFilter.java0000644000175000017500000000434211474320222026406 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** Transforms the token stream as per the Porter stemming algorithm. Note: the input to the stemming filter must already be in lower case, so you will need to use LowerCaseFilter or LowerCaseTokenizer farther down the Tokenizer chain in order for this to work properly!

    To use this filter with other analyzers, you'll want to write an Analyzer class that sets up the TokenStream chain as you want it. To use this with LowerCaseTokenizer, for example, you'd write an analyzer like this:

        class MyAnalyzer extends Analyzer {
          public final TokenStream tokenStream(String fieldName, Reader reader) {
            return new PorterStemFilter(new LowerCaseTokenizer(reader));
          }
        }
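        A short, hypothetical sketch of driving such an analyzer by hand (the field name and sample text are placeholders):

        Analyzer analyzer = new MyAnalyzer();
        TokenStream ts = analyzer.tokenStream("body", new StringReader("Stemming FILTERS tokens"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        while (ts.incrementToken()) {
          System.out.println(term.term());     // "stem", "filter", "token"
        }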
        
    */ public final class PorterStemFilter extends TokenFilter { private PorterStemmer stemmer; private TermAttribute termAtt; public PorterStemFilter(TokenStream in) { super(in); stemmer = new PorterStemmer(); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } public final boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength())) termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength()); return true; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/CharArraySet.java0000644000175000017500000002243011474320222025462 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; import java.util.AbstractSet; import java.util.Collection; import java.util.Collections; import java.util.Iterator; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A simple class that stores Strings as char[]'s in a * hash table. Note that this is not a general purpose * class. For example, it cannot remove items from the * set, nor does it resize its hash table to be smaller, * etc. It is designed to be quick to test if a char[] * is in the set without the necessity of converting it * to a String first. 
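    For illustration, a minimal sketch of the membership test this class is optimized for (the words and the buffer contents are arbitrary assumptions):

    CharArraySet stopWords = new CharArraySet(16, true);    // ignoreCase = true
    stopWords.add("the");
    stopWords.add("and");
    char[] buffer = {'T', 'h', 'e', ' ', 'q'};
    boolean inSet = stopWords.contains(buffer, 0, 3);       // true, and no String is allocated
    boolean inSet2 = stopWords.contains("AND");             // true, comparison is case-insensitive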
*/ public class CharArraySet extends AbstractSet { private final static int INIT_SIZE = 8; private char[][] entries; private int count; private final boolean ignoreCase; /** Create set with enough capacity to hold startSize * terms */ public CharArraySet(int startSize, boolean ignoreCase) { this.ignoreCase = ignoreCase; int size = INIT_SIZE; while(startSize + (startSize>>2) > size) size <<= 1; entries = new char[size][]; } /** Create set from a Collection of char[] or String */ public CharArraySet(Collection c, boolean ignoreCase) { this(c.size(), ignoreCase); addAll(c); } /** Create set from entries */ private CharArraySet(char[][] entries, boolean ignoreCase, int count){ this.entries = entries; this.ignoreCase = ignoreCase; this.count = count; } /** true if the len chars of text starting at off * are in the set */ public boolean contains(char[] text, int off, int len) { return entries[getSlot(text, off, len)] != null; } /** true if the CharSequence is in the set */ public boolean contains(CharSequence cs) { return entries[getSlot(cs)] != null; } private int getSlot(char[] text, int off, int len) { int code = getHashCode(text, off, len); int pos = code & (entries.length-1); char[] text2 = entries[pos]; if (text2 != null && !equals(text, off, len, text2)) { final int inc = ((code>>8)+code)|1; do { code += inc; pos = code & (entries.length-1); text2 = entries[pos]; } while (text2 != null && !equals(text, off, len, text2)); } return pos; } /** Returns true if the String is in the set */ private int getSlot(CharSequence text) { int code = getHashCode(text); int pos = code & (entries.length-1); char[] text2 = entries[pos]; if (text2 != null && !equals(text, text2)) { final int inc = ((code>>8)+code)|1; do { code += inc; pos = code & (entries.length-1); text2 = entries[pos]; } while (text2 != null && !equals(text, text2)); } return pos; } /** Add this CharSequence into the set */ public boolean add(CharSequence text) { return add(text.toString()); // could be more efficient } /** Add this String into the set */ public boolean add(String text) { return add(text.toCharArray()); } /** Add this char[] directly to the set. * If ignoreCase is true for this Set, the text array will be directly modified. * The user should never modify this text array after calling this method. */ public boolean add(char[] text) { if (ignoreCase) for(int i=0;i>2) > entries.length) { rehash(); } return true; } private boolean equals(char[] text1, int off, int len, char[] text2) { if (len != text2.length) return false; if (ignoreCase) { for(int i=0;inull
    . */ public static CharArraySet unmodifiableSet(CharArraySet set) { if (set == null) throw new NullPointerException("Given set is null"); /* * Instead of delegating calls to the given set copy the low-level values to * the unmodifiable Subclass */ return new UnmodifiableCharArraySet(set.entries, set.ignoreCase, set.count); } /** The Iterator for this set. Strings are constructed on the fly, so * use nextCharArray for more efficient access. */ public class CharArraySetIterator implements Iterator { int pos=-1; char[] next; CharArraySetIterator() { goNext(); } private void goNext() { next = null; pos++; while (pos < entries.length && (next=entries[pos]) == null) pos++; } public boolean hasNext() { return next != null; } /** do not modify the returned char[] */ public char[] nextCharArray() { char[] ret = next; goNext(); return ret; } /** Returns the next String, as a Set would... * use nextCharArray() for better efficiency. */ public Object next() { return new String(nextCharArray()); } public void remove() { throw new UnsupportedOperationException(); } } public Iterator iterator() { return new CharArraySetIterator(); } /** * Efficient unmodifiable {@link CharArraySet}. This implementation does not * delegate calls to a give {@link CharArraySet} like * {@link Collections#unmodifiableSet(java.util.Set)} does. Instead is passes * the internal representation of a {@link CharArraySet} to a super * constructor and overrides all mutators. */ private static final class UnmodifiableCharArraySet extends CharArraySet { private UnmodifiableCharArraySet(char[][] entries, boolean ignoreCase, int count) { super(entries, ignoreCase, count); } public boolean add(Object o){ throw new UnsupportedOperationException(); } public boolean addAll(Collection coll) { throw new UnsupportedOperationException(); } public boolean add(char[] text) { throw new UnsupportedOperationException(); } public boolean add(CharSequence text) { throw new UnsupportedOperationException(); } public boolean add(String text) { throw new UnsupportedOperationException(); } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/standard/0000755000175000017500000000000011554106562024076 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java0000644000175000017500000005451611474320221031220 0ustar janpascaljanpascal/* The following code was generated by JFlex 1.4.1 on 9/4/08 6:49 PM */ package org.apache.lucene.analysis.standard; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* NOTE: if you change this file and need to regenerate the tokenizer, remember to use JRE 1.4 when running jflex (before Lucene 3.0). This grammar now uses constructs (eg :digit:) whose meaning can vary according to the JRE used to run jflex. 
See https://issues.apache.org/jira/browse/LUCENE-1126 for details */ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * This class is a scanner generated by * JFlex 1.4.1 * on 9/4/08 6:49 PM from the specification file * /tango/mike/src/lucene.standarddigit/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex */ class StandardTokenizerImpl { /** This character denotes the end of file */ public static final int YYEOF = -1; /** initial size of the lookahead buffer */ private static final int ZZ_BUFFERSIZE = 16384; /** lexical states */ public static final int YYINITIAL = 0; /** * Translates characters to character classes */ private static final String ZZ_CMAP_PACKED = "\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+ "\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+ "\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+ "\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12\34\0\136\12"+ "\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12\11\0\1\12"+ "\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12\1\0\24\12"+ "\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12\12\0\71\12"+ "\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12\67\0\46\12"+ "\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12\56\0\32\12"+ "\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12\17\0\2\12"+ "\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0\46\12\u015f\0"+ "\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2\25\0"+ "\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12\3\0"+ "\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12\23\0\6\12"+ "\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12\1\0\2\12"+ "\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2\2\0\3\12"+ "\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12\1\0\7\12"+ "\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0\1\12"+ "\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12"+ "\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12"+ "\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12\3\0\2\12"+ "\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12\3\0\10\12"+ "\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12\1\0\27\12"+ "\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2\25\0\10\12"+ "\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\44\0\1\12"+ "\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12"+ "\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12\3\0\30\12"+ "\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1\60\12\1\1"+ "\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0\1\12\2\0"+ "\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0\7\12\1\0"+ "\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0\4\12\1\0"+ "\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0\12\2\2\0"+ "\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0\42\12\35\0"+ "\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0\12\2\6\0"+ "\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0\104\12\5\0"+ "\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0\4\12\2\0"+ "\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0\1\12\1\0"+ "\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+ "\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0\27\12\1\0"+ "\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\47\12\1\0"+ "\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0"+ "\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0\12\2\6\0"+ "\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0\26\12\2\0"+ "\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0\1\12\1\0"+ "\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0\7\12\1\0"+ "\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0\6\12\4\0"+ "\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0\1\12\4\0"+ "\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0\1\12\1\0"+ "\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0\7\12\u0ecb\0"+ "\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13\2\13\132\13"+ "\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+ "\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+ 
"\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12\5\0\1\12"+ "\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12\1\0\2\12"+ "\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12\2\0\66\12"+ "\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12\23\0\12\2"+ "\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+ "\2\0\6\12\2\0\6\12\2\0\3\12\43\0"; /** * Translates characters to character classes */ private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED); /** * Translates DFA states to action switch labels. */ private static final int [] ZZ_ACTION = zzUnpackAction(); private static final String ZZ_ACTION_PACKED_0 = "\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+ "\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+ "\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+ "\1\4"; private static int [] zzUnpackAction() { int [] result = new int[51]; int offset = 0; offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result); return result; } private static int zzUnpackAction(String packed, int offset, int [] result) { int i = 0; /* index in packed string */ int j = offset; /* index in unpacked array */ int l = packed.length(); while (i < l) { int count = packed.charAt(i++); int value = packed.charAt(i++); do result[j++] = value; while (--count > 0); } return j; } /** * Translates a state to a row index in the transition table */ private static final int [] ZZ_ROWMAP = zzUnpackRowMap(); private static final String ZZ_ROWMAP_PACKED_0 = "\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+ "\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+ "\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+ "\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+ "\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+ "\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+ "\0\u0268\0\u0276\0\u0284"; private static int [] zzUnpackRowMap() { int [] result = new int[51]; int offset = 0; offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result); return result; } private static int zzUnpackRowMap(String packed, int offset, int [] result) { int i = 0; /* index in packed string */ int j = offset; /* index in unpacked array */ int l = packed.length(); while (i < l) { int high = packed.charAt(i++) << 16; result[j++] = high | packed.charAt(i++); } return j; } /** * The transition table of the DFA */ private static final int [] ZZ_TRANS = zzUnpackTrans(); private static final String ZZ_TRANS_PACKED_0 = "\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+ "\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+ "\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+ "\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+ "\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+ "\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+ "\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+ "\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+ "\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+ "\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+ "\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+ "\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+ "\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+ "\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+ "\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+ "\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+ "\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+ "\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+ "\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+ "\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+ "\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+ "\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+ "\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+ "\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+ "\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+ "\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+ "\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+ "\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+ 
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+ "\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+ "\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+ "\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+ "\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+ "\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+ "\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+ "\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+ "\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+ "\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+ "\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+ "\1\11\2\52\1\0\1\24\3\0"; private static int [] zzUnpackTrans() { int [] result = new int[658]; int offset = 0; offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result); return result; } private static int zzUnpackTrans(String packed, int offset, int [] result) { int i = 0; /* index in packed string */ int j = offset; /* index in unpacked array */ int l = packed.length(); while (i < l) { int count = packed.charAt(i++); int value = packed.charAt(i++); value--; do result[j++] = value; while (--count > 0); } return j; } /* error codes */ private static final int ZZ_UNKNOWN_ERROR = 0; private static final int ZZ_NO_MATCH = 1; private static final int ZZ_PUSHBACK_2BIG = 2; /* error messages for the codes above */ private static final String ZZ_ERROR_MSG[] = { "Unkown internal scanner error", "Error: could not match input", "Error: pushback value was too large" }; /** * ZZ_ATTRIBUTE[aState] contains the attributes of state aState */ private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute(); private static final String ZZ_ATTRIBUTE_PACKED_0 = "\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+ "\1\1\1\0\17\1\1\0\1\1\3\0\5\1"; private static int [] zzUnpackAttribute() { int [] result = new int[51]; int offset = 0; offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result); return result; } private static int zzUnpackAttribute(String packed, int offset, int [] result) { int i = 0; /* index in packed string */ int j = offset; /* index in unpacked array */ int l = packed.length(); while (i < l) { int count = packed.charAt(i++); int value = packed.charAt(i++); do result[j++] = value; while (--count > 0); } return j; } /** the input device */ private java.io.Reader zzReader; /** the current state of the DFA */ private int zzState; /** the current lexical state */ private int zzLexicalState = YYINITIAL; /** this buffer contains the current text to be matched and is the source of the yytext() string */ private char zzBuffer[] = new char[ZZ_BUFFERSIZE]; /** the textposition at the last accepting state */ private int zzMarkedPos; /** the textposition at the last state to be included in yytext */ private int zzPushbackPos; /** the current text position in the buffer */ private int zzCurrentPos; /** startRead marks the beginning of the yytext() string in the buffer */ private int zzStartRead; /** endRead marks the last character in the buffer, that has been read from input */ private int zzEndRead; /** number of newlines encountered up to the start of the matched text */ private int yyline; /** the number of characters up to the start of the matched text */ private int yychar; /** * the number of characters from the last newline up to the start of the * matched text */ private int yycolumn; /** * zzAtBOL == true <=> the scanner is currently at the beginning of a line */ private boolean zzAtBOL = true; /** zzAtEOF == true <=> the scanner is at the EOF */ private boolean zzAtEOF; /* user code: */ public static final int ALPHANUM = StandardTokenizer.ALPHANUM; public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE; public static final int ACRONYM = 
StandardTokenizer.ACRONYM; public static final int COMPANY = StandardTokenizer.COMPANY; public static final int EMAIL = StandardTokenizer.EMAIL; public static final int HOST = StandardTokenizer.HOST; public static final int NUM = StandardTokenizer.NUM; public static final int CJ = StandardTokenizer.CJ; /** * @deprecated this solves a bug where HOSTs that end with '.' are identified * as ACRONYMs. It is deprecated and will be removed in the next * release. */ public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; public final int yychar() { return yychar; } /** * Resets the Tokenizer to a new Reader. */ final void reset(java.io.Reader r) { // reset to default buffer size, if buffer has grown if (zzBuffer.length > ZZ_BUFFERSIZE) { zzBuffer = new char[ZZ_BUFFERSIZE]; } yyreset(r); } /** * Fills Lucene token with the current token text. */ final void getText(Token t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } /** * Fills TermAttribute with the current token text. */ final void getText(TermAttribute t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } /** * Creates a new scanner * There is also a java.io.InputStream version of this constructor. * * @param in the java.io.Reader to read input from. */ StandardTokenizerImpl(java.io.Reader in) { this.zzReader = in; } /** * Creates a new scanner. * There is also java.io.Reader version of this constructor. * * @param in the java.io.Inputstream to read input from. */ StandardTokenizerImpl(java.io.InputStream in) { this(new java.io.InputStreamReader(in)); } /** * Unpacks the compressed character translation table. * * @param packed the packed character translation table * @return the unpacked character translation table */ private static char [] zzUnpackCMap(String packed) { char [] map = new char[0x10000]; int i = 0; /* index in packed string */ int j = 0; /* index in unpacked array */ while (i < 1154) { int count = packed.charAt(i++); char value = packed.charAt(i++); do map[j++] = value; while (--count > 0); } return map; } /** * Refills the input buffer. * * @return false, iff there was new input. * * @exception java.io.IOException if any I/O-Error occurs */ private boolean zzRefill() throws java.io.IOException { /* first: make room (if you can) */ if (zzStartRead > 0) { System.arraycopy(zzBuffer, zzStartRead, zzBuffer, 0, zzEndRead-zzStartRead); /* translate stored positions */ zzEndRead-= zzStartRead; zzCurrentPos-= zzStartRead; zzMarkedPos-= zzStartRead; zzPushbackPos-= zzStartRead; zzStartRead = 0; } /* is the buffer big enough? */ if (zzCurrentPos >= zzBuffer.length) { /* if not: blow it up */ char newBuffer[] = new char[zzCurrentPos*2]; System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length); zzBuffer = newBuffer; } /* finally: fill the buffer with new input */ int numRead = zzReader.read(zzBuffer, zzEndRead, zzBuffer.length-zzEndRead); if (numRead < 0) { return true; } else { zzEndRead+= numRead; return false; } } /** * Closes the input stream. */ public final void yyclose() throws java.io.IOException { zzAtEOF = true; /* indicate end of file */ zzEndRead = zzStartRead; /* invalidate buffer */ if (zzReader != null) zzReader.close(); } /** * Resets the scanner to read from a new input stream. * Does not close the old reader. * * All internal variables are reset, the old input stream * cannot be reused (internal buffer is discarded and lost). * Lexical state is set to ZZ_INITIAL. 
* * @param reader the new input stream */ public final void yyreset(java.io.Reader reader) { zzReader = reader; zzAtBOL = true; zzAtEOF = false; zzEndRead = zzStartRead = 0; zzCurrentPos = zzMarkedPos = zzPushbackPos = 0; yyline = yychar = yycolumn = 0; zzLexicalState = YYINITIAL; } /** * Returns the current lexical state. */ public final int yystate() { return zzLexicalState; } /** * Enters a new lexical state * * @param newState the new lexical state */ public final void yybegin(int newState) { zzLexicalState = newState; } /** * Returns the text matched by the current regular expression. */ public final String yytext() { return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead ); } /** * Returns the character at position pos from the * matched text. * * It is equivalent to yytext().charAt(pos), but faster * * @param pos the position of the character to fetch. * A value from 0 to yylength()-1. * * @return the character at position pos */ public final char yycharat(int pos) { return zzBuffer[zzStartRead+pos]; } /** * Returns the length of the matched text region. */ public final int yylength() { return zzMarkedPos-zzStartRead; } /** * Reports an error that occured while scanning. * * In a wellformed scanner (no or only correct usage of * yypushback(int) and a match-all fallback rule) this method * will only be called with things that "Can't Possibly Happen". * If this method is called, something is seriously wrong * (e.g. a JFlex bug producing a faulty scanner etc.). * * Usual syntax/scanner level error handling should be done * in error fallback rules. * * @param errorCode the code of the errormessage to display */ private void zzScanError(int errorCode) { String message; try { message = ZZ_ERROR_MSG[errorCode]; } catch (ArrayIndexOutOfBoundsException e) { message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR]; } throw new Error(message); } /** * Pushes the specified amount of characters back into the input stream. * * They will be read again by then next call of the scanning method * * @param number the number of characters to be read again. * This number must not be greater than yylength()! */ public void yypushback(int number) { if ( number > yylength() ) zzScanError(ZZ_PUSHBACK_2BIG); zzMarkedPos -= number; } /** * Resumes scanning until the next regular expression is matched, * the end of input is encountered or an I/O-Error occurs. 
* * @return the next token * @exception java.io.IOException if any I/O-Error occurs */ public int getNextToken() throws java.io.IOException { int zzInput; int zzAction; // cached fields: int zzCurrentPosL; int zzMarkedPosL; int zzEndReadL = zzEndRead; char [] zzBufferL = zzBuffer; char [] zzCMapL = ZZ_CMAP; int [] zzTransL = ZZ_TRANS; int [] zzRowMapL = ZZ_ROWMAP; int [] zzAttrL = ZZ_ATTRIBUTE; while (true) { zzMarkedPosL = zzMarkedPos; yychar+= zzMarkedPosL-zzStartRead; zzAction = -1; zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL; zzState = zzLexicalState; zzForAction: { while (true) { if (zzCurrentPosL < zzEndReadL) zzInput = zzBufferL[zzCurrentPosL++]; else if (zzAtEOF) { zzInput = YYEOF; break zzForAction; } else { // store back cached positions zzCurrentPos = zzCurrentPosL; zzMarkedPos = zzMarkedPosL; boolean eof = zzRefill(); // get translated positions and possibly new buffer zzCurrentPosL = zzCurrentPos; zzMarkedPosL = zzMarkedPos; zzBufferL = zzBuffer; zzEndReadL = zzEndRead; if (eof) { zzInput = YYEOF; break zzForAction; } else { zzInput = zzBufferL[zzCurrentPosL++]; } } int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ]; if (zzNext == -1) break zzForAction; zzState = zzNext; int zzAttributes = zzAttrL[zzState]; if ( (zzAttributes & 1) == 1 ) { zzAction = zzState; zzMarkedPosL = zzCurrentPosL; if ( (zzAttributes & 8) == 8 ) break zzForAction; } } } // store back cached position zzMarkedPos = zzMarkedPosL; switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) { case 4: { return HOST; } case 11: break; case 9: { return ACRONYM; } case 12: break; case 8: { return ACRONYM_DEP; } case 13: break; case 1: { /* ignore */ } case 14: break; case 5: { return NUM; } case 15: break; case 3: { return CJ; } case 16: break; case 2: { return ALPHANUM; } case 17: break; case 7: { return COMPANY; } case 18: break; case 6: { return APOSTROPHE; } case 19: break; case 10: { return EMAIL; } case 20: break; default: if (zzInput == YYEOF && zzStartRead == zzCurrentPos) { zzAtEOF = true; return YYEOF; } else { zzScanError(ZZ_NO_MATCH); } } } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/standard/StandardFilter.java0000644000175000017500000000533211474320221027641 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.standard; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; /** Normalizes tokens extracted with {@link StandardTokenizer}. */ public final class StandardFilter extends TokenFilter { /** Construct filtering in. 
*/ public StandardFilter(TokenStream in) { super(in); termAtt = (TermAttribute) addAttribute(TermAttribute.class); typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE]; private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]; // this filters uses attribute type private TypeAttribute typeAtt; private TermAttribute termAtt; /** Returns the next token in the stream, or null at EOS. *

    Removes 's from the end of words. *

    Removes dots from acronyms. */ public final boolean incrementToken() throws java.io.IOException { if (!input.incrementToken()) { return false; } char[] buffer = termAtt.termBuffer(); final int bufferLength = termAtt.termLength(); final String type = typeAtt.type(); if (type == APOSTROPHE_TYPE && // remove 's bufferLength >= 2 && buffer[bufferLength-2] == '\'' && (buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) { // Strip last 2 characters off termAtt.setTermLength(bufferLength - 2); } else if (type == ACRONYM_TYPE) { // remove dots int upto = 0; for(int i=0;i *

    You must specify the required {@link Version} compatibility when creating StandardAnalyzer (a short construction sketch follows the list below):

      *
    • As of 2.9, StopFilter preserves position increments
    • As of 2.4, Tokens incorrectly identified as acronyms are corrected (see LUCENE-1608)
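    A minimal version-aware construction sketch (the field name and sample text are placeholders, assuming java.io.StringReader and the usual analysis imports):

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);
    TokenStream ts = analyzer.tokenStream("body", new StringReader("The Quick Brown Fox"));
    TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term());         // "quick", "brown", "fox" ("the" is a stop word)
    }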
    * * @version $Id: StandardAnalyzer.java 829134 2009-10-23 17:18:53Z mikemccand $ */ public class StandardAnalyzer extends Analyzer { private Set stopSet; /** * Specifies whether deprecated acronyms should be replaced with HOST type. * This is false by default to support backward compatibility. * * @deprecated this should be removed in the next release (3.0). * * See https://issues.apache.org/jira/browse/LUCENE-1068 */ private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym; private static boolean defaultReplaceInvalidAcronym; private boolean enableStopPositionIncrements; // @deprecated private boolean useDefaultStopPositionIncrements; // Default to true (fixed the bug), unless the system prop is set static { final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym"); if (v == null || v.equals("true")) defaultReplaceInvalidAcronym = true; else defaultReplaceInvalidAcronym = false; } /** * * @return true if new instances of StandardTokenizer will * replace mischaracterized acronyms * * See https://issues.apache.org/jira/browse/LUCENE-1068 * @deprecated This will be removed (hardwired to true) in 3.0 */ public static boolean getDefaultReplaceInvalidAcronym() { return defaultReplaceInvalidAcronym; } /** * * @param replaceInvalidAcronym Set to true to have new * instances of StandardTokenizer replace mischaracterized * acronyms by default. Set to false to preserve the * previous (before 2.4) buggy behavior. Alternatively, * set the system property * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym * to false. * * See https://issues.apache.org/jira/browse/LUCENE-1068 * @deprecated This will be removed (hardwired to true) in 3.0 */ public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) { defaultReplaceInvalidAcronym = replaceInvalidAcronym; } /** An array containing some common English words that are usually not useful for searching. @deprecated Use {@link #STOP_WORDS_SET} instead */ public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS; /** An unmodifiable set containing some common English words that are usually not useful for searching. */ public static final Set/**/ STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; /** Builds an analyzer with the default stop words ({@link * #STOP_WORDS_SET}). * @deprecated Use {@link #StandardAnalyzer(Version)} instead. */ public StandardAnalyzer() { this(Version.LUCENE_24, STOP_WORDS_SET); } /** Builds an analyzer with the default stop words ({@link * #STOP_WORDS}). * @param matchVersion Lucene version to match See {@link * above} */ public StandardAnalyzer(Version matchVersion) { this(matchVersion, STOP_WORDS_SET); } /** Builds an analyzer with the given stop words. * @deprecated Use {@link #StandardAnalyzer(Version, Set)} * instead */ public StandardAnalyzer(Set stopWords) { this(Version.LUCENE_24, stopWords); } /** Builds an analyzer with the given stop words. * @param matchVersion Lucene version to match See {@link * above} * @param stopWords stop words */ public StandardAnalyzer(Version matchVersion, Set stopWords) { stopSet = stopWords; init(matchVersion); } /** Builds an analyzer with the given stop words. * @deprecated Use {@link #StandardAnalyzer(Version, Set)} instead */ public StandardAnalyzer(String[] stopWords) { this(Version.LUCENE_24, StopFilter.makeStopSet(stopWords)); } /** Builds an analyzer with the stop words from the given file. 
* @see WordlistLoader#getWordSet(File) * @deprecated Use {@link #StandardAnalyzer(Version, File)} * instead */ public StandardAnalyzer(File stopwords) throws IOException { this(Version.LUCENE_24, stopwords); } /** Builds an analyzer with the stop words from the given file. * @see WordlistLoader#getWordSet(File) * @param matchVersion Lucene version to match See {@link * above} * @param stopwords File to read stop words from */ public StandardAnalyzer(Version matchVersion, File stopwords) throws IOException { stopSet = WordlistLoader.getWordSet(stopwords); init(matchVersion); } /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader) * @deprecated Use {@link #StandardAnalyzer(Version, Reader)} * instead */ public StandardAnalyzer(Reader stopwords) throws IOException { this(Version.LUCENE_24, stopwords); } /** Builds an analyzer with the stop words from the given reader. * @see WordlistLoader#getWordSet(Reader) * @param matchVersion Lucene version to match See {@link * above} * @param stopwords Reader to read stop words from */ public StandardAnalyzer(Version matchVersion, Reader stopwords) throws IOException { stopSet = WordlistLoader.getWordSet(stopwords); init(matchVersion); } /** * * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Remove in 3.X and make true the only valid value */ public StandardAnalyzer(boolean replaceInvalidAcronym) { this(Version.LUCENE_24, STOP_WORDS_SET); this.replaceInvalidAcronym = replaceInvalidAcronym; useDefaultStopPositionIncrements = true; } /** * @param stopwords The stopwords to use * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Remove in 3.X and make true the only valid value */ public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{ this(Version.LUCENE_24, stopwords); this.replaceInvalidAcronym = replaceInvalidAcronym; } /** * @param stopwords The stopwords to use * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Remove in 3.X and make true the only valid value */ public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{ this(Version.LUCENE_24, stopwords); this.replaceInvalidAcronym = replaceInvalidAcronym; } /** * * @param stopwords The stopwords to use * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Remove in 3.X and make true the only valid value */ public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{ this(Version.LUCENE_24, StopFilter.makeStopSet(stopwords)); this.replaceInvalidAcronym = replaceInvalidAcronym; } /** * @param stopwords The stopwords to use * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Remove in 3.X and make true the only valid value */ public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{ 
this(Version.LUCENE_24, stopwords); this.replaceInvalidAcronym = replaceInvalidAcronym; } private final void init(Version matchVersion) { setOverridesTokenStreamMethod(StandardAnalyzer.class); if (matchVersion.onOrAfter(Version.LUCENE_29)) { enableStopPositionIncrements = true; } else { useDefaultStopPositionIncrements = true; } if (matchVersion.onOrAfter(Version.LUCENE_24)) { replaceInvalidAcronym = defaultReplaceInvalidAcronym; } else { replaceInvalidAcronym = false; } } /** Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */ public TokenStream tokenStream(String fieldName, Reader reader) { StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym); tokenStream.setMaxTokenLength(maxTokenLength); TokenStream result = new StandardFilter(tokenStream); result = new LowerCaseFilter(result); if (useDefaultStopPositionIncrements) { result = new StopFilter(result, stopSet); } else { result = new StopFilter(enableStopPositionIncrements, result, stopSet); } return result; } private static final class SavedStreams { StandardTokenizer tokenStream; TokenStream filteredTokenStream; } /** Default maximum allowed token length */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; /** * Set maximum allowed token length. If a token is seen * that exceeds this length then it is discarded. This * setting only takes effect the next time tokenStream or * reusableTokenStream is called. */ public void setMaxTokenLength(int length) { maxTokenLength = length; } /** * @see #setMaxTokenLength */ public int getMaxTokenLength() { return maxTokenLength; } /** @deprecated Use {@link #tokenStream} instead */ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { if (overridesTokenStreamMethod) { // LUCENE-1678: force fallback to tokenStream() if we // have been subclassed and that subclass overrides // tokenStream but not reusableTokenStream return tokenStream(fieldName, reader); } SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); setPreviousTokenStream(streams); streams.tokenStream = new StandardTokenizer(reader); streams.filteredTokenStream = new StandardFilter(streams.tokenStream); streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream); if (useDefaultStopPositionIncrements) { streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet); } else { streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements, streams.filteredTokenStream, stopSet); } } else { streams.tokenStream.reset(reader); } streams.tokenStream.setMaxTokenLength(maxTokenLength); streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym); return streams.filteredTokenStream; } /** * * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * @deprecated This will be removed (hardwired to true) in 3.0 */ public boolean isReplaceInvalidAcronym() { return replaceInvalidAcronym; } /** * * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer * * See https://issues.apache.org/jira/browse/LUCENE-1068 * @deprecated This will be removed (hardwired to true) in 3.0 */ public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) { this.replaceInvalidAcronym = 
replaceInvalidAcronym; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/standard/package.html0000644000175000017500000000176711474320221026361 0ustar janpascaljanpascal A fast grammar-based tokenizer constructed with JFlex. lucene-2.9.4/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java0000644000175000017500000002571711474320221030377 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis.standard; import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; /** A grammar-based tokenizer constructed with JFlex * *

    This should be a good tokenizer for most European-language documents (a brief sketch of the resulting token types follows the list below):

      *
    • Splits words at punctuation characters, removing punctuation. However, a dot that's not followed by whitespace is considered part of a token.
    • Splits words at hyphens, unless there's a number in the token, in which case the whole token is interpreted as a product number and is not split.
    • Recognizes email addresses and internet hostnames as one token.
    * *
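    To make these rules concrete, a rough sketch (the sample input is purely illustrative):

    Tokenizer tok = new StandardTokenizer(Version.LUCENE_29,
        new StringReader("visit xyz.example.com or mail info@example.com about wi-fi"));
    TermAttribute term = (TermAttribute) tok.addAttribute(TermAttribute.class);
    TypeAttribute type = (TypeAttribute) tok.addAttribute(TypeAttribute.class);
    while (tok.incrementToken()) {
      // e.g. "xyz.example.com" is typed <HOST>, "info@example.com" is typed <EMAIL>,
      // and "wi-fi" is split into the two <ALPHANUM> tokens "wi" and "fi"
      System.out.println(term.term() + " / " + type.type());
    }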

    Many applications have specific tokenizer needs. If this tokenizer does not suit your application, please consider copying this source code directory to your project and maintaining your own grammar-based tokenizer.

    You must specify the required {@link Version} compatibility when creating StandardTokenizer:

    */ public class StandardTokenizer extends Tokenizer { /** A private instance of the JFlex-constructed scanner */ private final StandardTokenizerImpl scanner; public static final int ALPHANUM = 0; public static final int APOSTROPHE = 1; public static final int ACRONYM = 2; public static final int COMPANY = 3; public static final int EMAIL = 4; public static final int HOST = 5; public static final int NUM = 6; public static final int CJ = 7; /** * @deprecated this solves a bug where HOSTs that end with '.' are identified * as ACRONYMs. It is deprecated and will be removed in the next * release. */ public static final int ACRONYM_DEP = 8; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { "", "", "", "", "", "", "", "", "" }; /** @deprecated Please use {@link #TOKEN_TYPES} instead */ public static final String [] tokenImage = TOKEN_TYPES; /** * Specifies whether deprecated acronyms should be replaced with HOST type. * This is false by default to support backward compatibility. *

    * See http://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated this should be removed in the next release (3.0). */ private boolean replaceInvalidAcronym; private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH; /** Set the max allowed token length. Any token longer * than this is skipped. */ public void setMaxTokenLength(int length) { this.maxTokenLength = length; } /** @see #setMaxTokenLength */ public int getMaxTokenLength() { return maxTokenLength; } /** * Creates a new instance of the {@link StandardTokenizer}. Attaches the * input to a newly created JFlex scanner. * * @deprecated Use {@link #StandardTokenizer(Version, * Reader)} instead */ public StandardTokenizer(Reader input) { this(Version.LUCENE_24, input); } /** * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches * the input to the newly created JFlex scanner. * * @param input The input reader * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST. * * See http://issues.apache.org/jira/browse/LUCENE-1068 * * @deprecated Use {@link #StandardTokenizer(Version, Reader)} instead */ public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) { super(); this.scanner = new StandardTokenizerImpl(input); init(input, replaceInvalidAcronym); } /** * Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches * the input to the newly created JFlex scanner. * * @param input The input reader * * See http://issues.apache.org/jira/browse/LUCENE-1068 */ public StandardTokenizer(Version matchVersion, Reader input) { super(); this.scanner = new StandardTokenizerImpl(input); init(input, matchVersion); } /** * Creates a new StandardTokenizer with a given {@link AttributeSource}. * * @deprecated Use {@link #StandardTokenizer(Version, AttributeSource, Reader)} instead */ public StandardTokenizer(AttributeSource source, Reader input, boolean replaceInvalidAcronym) { super(source); this.scanner = new StandardTokenizerImpl(input); init(input, replaceInvalidAcronym); } /** * Creates a new StandardTokenizer with a given {@link AttributeSource}. 
*/ public StandardTokenizer(Version matchVersion, AttributeSource source, Reader input) { super(source); this.scanner = new StandardTokenizerImpl(input); init(input, matchVersion); } /** * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory} * * @deprecated Use {@link #StandardTokenizer(Version, org.apache.lucene.util.AttributeSource.AttributeFactory, Reader)} instead */ public StandardTokenizer(AttributeFactory factory, Reader input, boolean replaceInvalidAcronym) { super(factory); this.scanner = new StandardTokenizerImpl(input); init(input, replaceInvalidAcronym); } /** * Creates a new StandardTokenizer with a given {@link org.apache.lucene.util.AttributeSource.AttributeFactory} */ public StandardTokenizer(Version matchVersion, AttributeFactory factory, Reader input) { super(factory); this.scanner = new StandardTokenizerImpl(input); init(input, matchVersion); } private void init(Reader input, boolean replaceInvalidAcronym) { this.replaceInvalidAcronym = replaceInvalidAcronym; this.input = input; termAtt = (TermAttribute) addAttribute(TermAttribute.class); offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class); posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class); } private void init(Reader input, Version matchVersion) { if (matchVersion.onOrAfter(Version.LUCENE_24)) { init(input, true); } else { init(input, false); } } // this tokenizer generates three attributes: // offset, positionIncrement and type private TermAttribute termAtt; private OffsetAttribute offsetAtt; private PositionIncrementAttribute posIncrAtt; private TypeAttribute typeAtt; /* * (non-Javadoc) * * @see org.apache.lucene.analysis.TokenStream#next() */ public final boolean incrementToken() throws IOException { clearAttributes(); int posIncr = 1; while(true) { int tokenType = scanner.getNextToken(); if (tokenType == StandardTokenizerImpl.YYEOF) { return false; } if (scanner.yylength() <= maxTokenLength) { posIncrAtt.setPositionIncrement(posIncr); scanner.getText(termAtt); final int start = scanner.yychar(); offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.termLength())); // This 'if' should be removed in the next release. For now, it converts // invalid acronyms to HOST. When removed, only the 'else' part should // remain. if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) { if (replaceInvalidAcronym) { typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]); termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.' } else { typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]); } } else { typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]); } return true; } else // When we skip a too-long term, we still increment the // position increment posIncr++; } } public final void end() { // set final offset int finalOffset = correctOffset(scanner.yychar() + scanner.yylength()); offsetAtt.setOffset(finalOffset, finalOffset); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. 
*/ public final Token next() throws IOException { return super.next(); } public void reset(Reader reader) throws IOException { super.reset(reader); scanner.reset(reader); } /** * Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com * when they should have been labeled as hosts instead. * @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false * * @deprecated Remove in 3.X and make true the only valid value */ public boolean isReplaceInvalidAcronym() { return replaceInvalidAcronym; } /** * * @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST. * @deprecated Remove in 3.X and make true the only valid value * * See https://issues.apache.org/jira/browse/LUCENE-1068 */ public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) { this.replaceInvalidAcronym = replaceInvalidAcronym; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex0000644000175000017500000001262211474320221031377 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.standard; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate the tokenizer, remember to use JRE 1.4 to run jflex (before Lucene 3.0). This grammar now uses constructs (eg :digit:, :letter:) whose meaning can vary according to the JRE used to run jflex. See https://issues.apache.org/jira/browse/LUCENE-1126 for details. */ import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.tokenattributes.TermAttribute; %% %class StandardTokenizerImpl %unicode %integer %function getNextToken %pack %char %{ public static final int ALPHANUM = StandardTokenizer.ALPHANUM; public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE; public static final int ACRONYM = StandardTokenizer.ACRONYM; public static final int COMPANY = StandardTokenizer.COMPANY; public static final int EMAIL = StandardTokenizer.EMAIL; public static final int HOST = StandardTokenizer.HOST; public static final int NUM = StandardTokenizer.NUM; public static final int CJ = StandardTokenizer.CJ; /** * @deprecated this solves a bug where HOSTs that end with '.' are identified * as ACRONYMs. It is deprecated and will be removed in the next * release. */ public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; public final int yychar() { return yychar; } /** * Resets the Tokenizer to a new Reader. */ final void reset(java.io.Reader r) { // reset to default buffer size, if buffer has grown if (zzBuffer.length > ZZ_BUFFERSIZE) { zzBuffer = new char[ZZ_BUFFERSIZE]; } yyreset(r); } /** * Fills Lucene token with the current token text. 
*/ final void getText(Token t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } /** * Fills TermAttribute with the current token text. */ final void getText(TermAttribute t) { t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); } %} THAI = [\u0E00-\u0E59] // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function) ALPHANUM = ({LETTER}|{THAI}|[:digit:])+ // internal apostrophes: O'Reilly, you're, O'Reilly's // use a post-filter to remove possessives APOSTROPHE = {ALPHA} ("'" {ALPHA})+ // acronyms: U.S.A., I.B.M., etc. // use a post-filter to remove dots ACRONYM = {LETTER} "." ({LETTER} ".")+ ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+ // company names like AT&T and Excite@Home. COMPANY = {ALPHA} ("&"|"@") {ALPHA} // email addresses EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+ // hostname HOST = {ALPHANUM} ((".") {ALPHANUM})+ // floating point, serial, model numbers, ip addresses, etc. // every other segment must have at least one digit NUM = ({ALPHANUM} {P} {HAS_DIGIT} | {HAS_DIGIT} {P} {ALPHANUM} | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+) // punctuation P = ("_"|"-"|"/"|"."|",") // at least one digit HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])* ALPHA = ({LETTER})+ // From the JFlex manual: "the expression that matches everything of not matched by is !(!|)" LETTER = !(![:letter:]|{CJ}) // Chinese and Japanese (but NOT Korean, which is included in [:letter:]) CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f] WHITESPACE = \r\n | [ \r\n\t\f] %% {ALPHANUM} { return ALPHANUM; } {APOSTROPHE} { return APOSTROPHE; } {ACRONYM} { return ACRONYM; } {COMPANY} { return COMPANY; } {EMAIL} { return EMAIL; } {HOST} { return HOST; } {NUM} { return NUM; } {CJ} { return CJ; } {ACRONYM_DEP} { return ACRONYM_DEP; } /** Ignore the rest */ . | {WHITESPACE} { /* ignore */ } lucene-2.9.4/src/java/org/apache/lucene/analysis/package.html0000644000175000017500000007500211474320222024553 0ustar janpascaljanpascal

    API and code to convert text into indexable/searchable tokens. Covers {@link org.apache.lucene.analysis.Analyzer} and related classes.

    Parsing? Tokenization? Analysis!

    Lucene, an indexing and search library, accepts only plain text input.

    Parsing

    Applications that build their search capabilities upon Lucene may support documents in various formats – HTML, XML, PDF, Word – just to name a few. Lucene does not care about the Parsing of these and other document formats, and it is the responsibility of the application using Lucene to use an appropriate Parser to convert the original format into plain text before passing that plain text to Lucene.

    Tokenization

    Plain text passed to Lucene for indexing goes through a process generally called tokenization. Tokenization is the process of breaking input text into small indexing elements – tokens. The way input text is broken into tokens heavily influences how people will then be able to search for that text. For instance, sentence beginnings and endings can be identified to provide for more accurate phrase and proximity searches (though sentence identification is not provided by Lucene).

    In some cases simply breaking the input text into tokens is not enough – a deeper Analysis may be needed. There are many post tokenization steps that can be done, including (but not limited to):

    • Stemming – Replacing of words by their stems. For instance with English stemming "bikes" is replaced by "bike"; now query "bike" can find both documents containing "bike" and those containing "bikes".
    • Stop Words Filtering – Common words like "the", "and" and "a" rarely add any value to a search. Removing them shrinks the index size and increases performance. It may also reduce some "noise" and actually improve search quality.
    • Text Normalization – Stripping accents and other character markings can make for better searching.
    • Synonym Expansion – Adding in synonyms at the same token position as the current word can mean better matching when users search with words in the synonym set.
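
    For illustration, here is a minimal sketch (not part of the original documentation; the class name and the exact filter chain are only an example) of an Analyzer that combines several of the steps above using filters shipped with Lucene core: stop-word filtering, stemming, and accent stripping.

      public class EnglishFoldingAnalyzer extends Analyzer {
        public TokenStream tokenStream(String fieldName, Reader reader) {
          TokenStream stream = new StandardTokenizer(Version.LUCENE_29, reader);
          stream = new StandardFilter(stream);          // clean up acronyms and possessives
          stream = new LowerCaseFilter(stream);
          stream = new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); // stop words
          stream = new PorterStemFilter(stream);        // stemming
          stream = new ASCIIFoldingFilter(stream);      // accent/diacritic stripping
          return stream;
        }
      }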

    Core Analysis

    The analysis package provides the mechanism to convert Strings and Readers into tokens that can be indexed by Lucene. There are three main classes in the package from which all analysis processes are derived. These are:

    • {@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed by the indexing and searching processes. See below for more information on implementing your own Analyzer.
    • {@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in the analysis process.
    • {@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible for modifying tokens that have been created by the Tokenizer. Common modifications performed by a TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters.
    Lucene 2.9 introduces a new TokenStream API. Please see the section "New TokenStream API" below for more details.

    Hints, Tips and Traps

    The synergy between {@link org.apache.lucene.analysis.Analyzer} and {@link org.apache.lucene.analysis.Tokenizer} is sometimes confusing. To ease this confusion, here are some clarifications:

    • The {@link org.apache.lucene.analysis.Analyzer} is responsible for the entire task of creating tokens out of the input text, while the {@link org.apache.lucene.analysis.Tokenizer} is only responsible for breaking the input text into tokens. Very likely, tokens created by the {@link org.apache.lucene.analysis.Tokenizer} would be modified or even omitted by the {@link org.apache.lucene.analysis.Analyzer} (via one or more {@link org.apache.lucene.analysis.TokenFilter}s) before being returned.
    • {@link org.apache.lucene.analysis.Tokenizer} is a {@link org.apache.lucene.analysis.TokenStream}, but {@link org.apache.lucene.analysis.Analyzer} is not.
    • {@link org.apache.lucene.analysis.Analyzer} is "field aware", but {@link org.apache.lucene.analysis.Tokenizer} is not.

    Lucene Java provides a number of analysis capabilities, the most commonly used one being the {@link org.apache.lucene.analysis.standard.StandardAnalyzer}. Many applications will have a long and industrious life with nothing more than the StandardAnalyzer. However, there are a few other classes/packages that are worth mentioning:

    1. {@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper} – Most Analyzers perform the same operation on all {@link org.apache.lucene.document.Field}s. The PerFieldAnalyzerWrapper can be used to associate a different Analyzer with different {@link org.apache.lucene.document.Field}s (see the sketch after this list).
    2. The contrib/analyzers library located at the root of the Lucene distribution has a number of different Analyzer implementations to solve a variety of different problems related to searching. Many of the Analyzers are designed to analyze non-English languages.
    3. The contrib/snowball library located at the root of the Lucene distribution has Analyzer and TokenFilter implementations for a variety of Snowball stemmers. See http://snowball.tartarus.org for more information on Snowball stemmers.
    4. There are a variety of Tokenizer and TokenFilter implementations in this package. Take a look around, chances are someone has implemented what you need.
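
    As referenced in item 1 above, here is a minimal sketch (not from the original documentation; the field names and the directory variable are illustrative) of PerFieldAnalyzerWrapper: the "partnum" field is analyzed with a KeywordAnalyzer while every other field uses the StandardAnalyzer.

      PerFieldAnalyzerWrapper wrapper =
          new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_29));
      wrapper.addAnalyzer("partnum", new KeywordAnalyzer());   // exact-match field
      // the wrapper is then used like any other Analyzer:
      IndexWriter writer = new IndexWriter(directory, wrapper, IndexWriter.MaxFieldLength.UNLIMITED);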

    Analysis is one of the main causes of performance degradation during indexing. Simply put, the more you analyze the slower the indexing (in most cases). Perhaps your application would be just fine using the simple {@link org.apache.lucene.analysis.WhitespaceTokenizer} combined with a {@link org.apache.lucene.analysis.StopFilter}. The contrib/benchmark library can be useful for testing out the speed of the analysis process.

    Invoking the Analyzer

    Applications usually do not invoke analysis – Lucene does it for them:

    • At indexing, as a consequence of {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document) addDocument(doc)}, the Analyzer in effect for indexing is invoked for each indexed field of the added document.
    • At search, as a consequence of {@link org.apache.lucene.queryParser.QueryParser#parse(java.lang.String) QueryParser.parse(queryText)}, the QueryParser may invoke the Analyzer in effect. Note that for some queries analysis does not take place, e.g. wildcard queries.
    However, an application might invoke analysis of any text for testing or for any other purpose, something like:
          Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
          TokenStream ts = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
          TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
          while (ts.incrementToken()) {
            System.out.println("token: " + termAtt.term());
          }
      

    Indexing Analysis vs. Search Analysis

    Selecting the "correct" analyzer is crucial for search quality, and can also affect indexing and search performance. The "correct" analyzer differs between applications. Lucene java's wiki page AnalysisParalysis provides some data on "analyzing your analyzer". Here are some rules of thumb:

    1. Test test test... (did we say test?)
    2. Beware of over-analysis – it might hurt indexing performance.
    3. Start with the same analyzer for indexing and search; otherwise searches may not find what they are supposed to.
    4. In some cases a different analyzer is required for indexing and search, for instance:
      • Certain searches require more stop words to be filtered. (I.e. more than those that were filtered at indexing.)
      • Query expansion by synonyms, acronyms, auto spell correction, etc.
      This might sometimes require a modified analyzer – see the next section on how to do that.
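
    For example, a minimal sketch (not from the original documentation; MySynonymExpandingAnalyzer, the field name, and the directory variable are hypothetical) of rule 4 – one analyzer at indexing time and a different, synonym-expanding analyzer at search time:

      Analyzer indexAnalyzer  = new StandardAnalyzer(Version.LUCENE_29);
      Analyzer searchAnalyzer = new MySynonymExpandingAnalyzer(); // hypothetical analyzer that injects synonyms

      IndexWriter writer = new IndexWriter(directory, indexAnalyzer, IndexWriter.MaxFieldLength.UNLIMITED);
      // ... add documents, then writer.close() ...

      QueryParser parser = new QueryParser("contents", searchAnalyzer);
      Query query = parser.parse("blue sky");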

    Implementing your own Analyzer

    Creating your own Analyzer is straightforward. It usually involves either wrapping an existing Tokenizer and set of TokenFilters to create a new Analyzer or creating both the Analyzer and a Tokenizer or TokenFilter. Before pursuing this approach, you may find it worthwhile to explore the contrib/analyzers library and/or ask on the java-user@lucene.apache.org mailing list first to see if what you need already exists. If you are still committed to creating your own Analyzer or TokenStream derivation (Tokenizer or TokenFilter) have a look at the source code of any one of the many samples located in this package.

    The following sections discuss some aspects of implementing your own analyzer.

    Field Section Boundaries

    When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)} is called multiple times for the same field name, we could say that each such call creates a new section for that field in that document. In fact, a separate call to {@link org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader) tokenStream(field,reader)} would take place for each of these so called "sections". However, the default Analyzer behavior is to treat all these sections as one large section. This allows phrase search and proximity search to seamlessly cross boundaries between these "sections". In other words, if a certain field "f" is added like this:

          document.add(new Field("f","first ends",...);
          document.add(new Field("f","starts two",...);
          indexWriter.addDocument(document);
      
    Then, a phrase search for "ends starts" would find that document. Where desired, this behavior can be modified by introducing a "position gap" between consecutive field "sections", simply by overriding {@link org.apache.lucene.analysis.Analyzer#getPositionIncrementGap(java.lang.String) Analyzer.getPositionIncrementGap(fieldName)}:
          Analyzer myAnalyzer = new StandardAnalyzer() {
             public int getPositionIncrementGap(String fieldName) {
               return 10;
             }
          };
      

    Token Position Increments

    By default, all tokens created by Analyzers and Tokenizers have a {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one. This means that the position stored for that token in the index would be one more than that of the previous token. Recall that phrase and proximity searches rely on position info.

    If the selected analyzer filters the stop words "is" and "the", then for a document containing the string "blue is the sky", only the tokens "blue", "sky" are indexed, with position("sky") = 1 + position("blue"). Now, a phrase query "blue is the sky" would find that document, because the same analyzer filters the same stop words from that query. But also the phrase query "blue sky" would find that document.

    If this behavior does not fit the application needs, a modified analyzer can be used, that would increment further the positions of tokens following a removed stop word, using {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}. This can be done with something like:

          public TokenStream tokenStream(final String fieldName, Reader reader) {
            final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
        TokenStream res = new TokenStream(ts) { // pass ts so its attributes are shared with this stream
              TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
              PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
            
              public boolean incrementToken() throws IOException {
                int extraIncrement = 0;
                while (true) {
                  boolean hasNext = ts.incrementToken();
                  if (hasNext) {
                    if (stopWords.contains(termAtt.term())) {
                      extraIncrement++; // filter this word
                      continue;
                    } 
                    if (extraIncrement>0) {
                      posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
                    }
                  }
                  return hasNext;
                }
              }
            };
            return res;
          }
       
    Now, with this modified analyzer, the phrase query "blue sky" would find that document. But note that this is not yet a perfect solution, because any phrase query "blue w1 w2 sky" where both w1 and w2 are stop words would match that document.

    A few more use cases for modifying position increments are:

    1. Inhibiting phrase and proximity matches in sentence boundaries – for this, a tokenizer that identifies a new sentence can add 1 to the position increment of the first token of the new sentence.
    2. Injecting synonyms – here, synonyms of a token should be added after that token, and their position increment should be set to 0. As a result, all synonyms of a token would be considered to appear at exactly the same position as that token, and that is how phrase and proximity searches would see them. A minimal sketch of such a filter follows this list.
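
    Here is a minimal sketch of use case 2 (not part of the original documentation; the synonym Map and its contents are assumptions for illustration): a TokenFilter that injects at most one synonym per token at position increment 0, reusing the offsets and type of the original token via captureState()/restoreState().

      public class SimpleSynonymFilter extends TokenFilter {
        private final Map synonyms;                 // e.g. "quick" -> "fast" (illustrative)
        private final TermAttribute termAtt;
        private final PositionIncrementAttribute posIncrAtt;
        private AttributeSource.State savedState;   // state of the token we still owe a synonym for
        private String pendingSynonym;

        public SimpleSynonymFilter(TokenStream input, Map synonyms) {
          super(input);
          this.synonyms = synonyms;
          termAtt = (TermAttribute) addAttribute(TermAttribute.class);
          posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
        }

        public boolean incrementToken() throws IOException {
          if (pendingSynonym != null) {
            restoreState(savedState);               // same offsets/type as the original token
            termAtt.setTermBuffer(pendingSynonym);
            posIncrAtt.setPositionIncrement(0);     // same position as the original token
            pendingSynonym = null;
            return true;
          }
          if (!input.incrementToken()) {
            return false;
          }
          String synonym = (String) synonyms.get(termAtt.term());
          if (synonym != null) {
            pendingSynonym = synonym;
            savedState = captureState();
          }
          return true;                              // emit the original token first
        }
      }

    Such a filter would be added to an analyzer's chain just like any other TokenFilter.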

    New TokenStream API

    With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token has getter and setter methods for different properties like positionIncrement and termText. While this approach was sufficient for the default indexing format, it is not versatile enough for Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom index formats.

    A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API is necessary that can transport custom types of data from the documents to the indexer.

    Attribute and AttributeSource

    Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and {@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute} contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token. An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also AttributeSources.

    Lucene now provides six Attributes out of the box, which replace the variables the Token class has:

    • {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}

      The term text of a token.

    • {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}

      The start and end offset of a token in characters.

    • {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}

      See above for detailed information about position increment.

    • {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}

      The payload that a Token can optionally have.

    • {@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}

      The type of the token. Default is 'word'.

    • {@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}

      Optional flags a token can have.

    Using the new TokenStream API

    There are a few important things to know in order to use the new API efficiently which are summarized here. You may want to walk through the example below first and come back to this section afterwards.
    1. Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes with the TokenStream.

    2. Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in the Attribute instances.

    3. For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute() in incrementToken() will avoid expensive casting and attribute lookups for every token in the document.

    4. All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same result. This is especially important to know for addAttribute(). The method takes the type (Class) of an Attribute as an argument and returns an instance. If an Attribute of the same type was previously added, then the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters can safely call addAttribute() with the same Attribute type multiple times. Even consumers of TokenStreams should normally call addAttribute() instead of getAttribute(), because it would not fail if the TokenStream does not have this Attribute (getAttribute() would throw an IllegalArgumentException, if the Attribute is missing). More advanced code could simply check with hasAttribute(), if a TokenStream has it, and may conditionally leave out processing for extra performance.
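
    The following consumer sketch (not from the original documentation; it assumes an analyzer and input text as in the example below) illustrates points 3 and 4: attributes are looked up once with addAttribute(), and hasAttribute() guards processing of an optional attribute.

      TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
      TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class); // added if not present yet
      PayloadAttribute payloadAtt = null;
      if (stream.hasAttribute(PayloadAttribute.class)) {   // only process payloads if the stream provides them
        payloadAtt = (PayloadAttribute) stream.getAttribute(PayloadAttribute.class);
      }
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(termAtt.term());
        if (payloadAtt != null && payloadAtt.getPayload() != null) {
          System.out.println("  payload length: " + payloadAtt.getPayload().length());
        }
      }
      stream.end();
      stream.close();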

    Example

    In this example we will create a WhitespaceTokenizer and use a LengthFilter to suppress all words that have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained here to illustrate the usage of the new TokenStream API.
    Then we will develop a custom Attribute, PartOfSpeechAttribute, and add another filter to the chain that uses this new custom attribute; we will call it PartOfSpeechTaggingFilter.

    Whitespace tokenization

    public class MyAnalyzer extends Analyzer {
    
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new WhitespaceTokenizer(reader);
        return stream;
      }
      
      public static void main(String[] args) throws IOException {
        // text to tokenize
        final String text = "This is a demo of the new TokenStream API";
        
        MyAnalyzer analyzer = new MyAnalyzer();
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
        
        // get the TermAttribute from the TokenStream
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    
        stream.reset();
        
        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
          System.out.println(termAtt.term());
        }
        
        stream.end();
        stream.close();
      }
    }
    
    In this simple example, plain whitespace tokenization is performed. In main() a loop consumes the stream and prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides. Here is the output:
    This
    is
    a
    demo
    of
    the
    new
    TokenStream
    API
    

    Adding a LengthFilter

    We want to suppress all tokens that have two or fewer characters. We can do that easily by adding a LengthFilter to the chain. Only the tokenStream() method in our analyzer needs to be changed:
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new WhitespaceTokenizer(reader);
        stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
        return stream;
      }
    
    Note how now only words with 3 or more characters are contained in the output:
    This
    demo
    the
    new
    TokenStream
    API
    
    Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
    public final class LengthFilter extends TokenFilter {
    
      final int min;
      final int max;
      
      private TermAttribute termAtt;
    
      /**
       * Build a filter that removes words that are too long or too
       * short from the text.
       */
      public LengthFilter(TokenStream in, int min, int max)
      {
        super(in);
        this.min = min;
        this.max = max;
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }
      
      /**
       * Returns the next input token whose term() has the right length
       */
      public final boolean incrementToken() throws IOException
      {
        assert termAtt != null;
        // skip tokens whose length is outside the [min, max] range
        while (input.incrementToken()) {
          int len = termAtt.termLength();
          if (len >= min && len <= max) {
              return true;
          }
          // note: else we ignore it but should we index each part of it?
        }
        // reached EOS -- return false
        return false;
      }
    }
    
    The TermAttribute is added in the constructor and stored in the instance variable termAtt. Remember that there can only be a single instance of TermAttribute in the chain, so in our example the addAttribute() call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens are retrieved from the input stream in the incrementToken() method. By looking at the term text in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped. Note how incrementToken() can efficiently access the instance variable; no attribute lookup or downcasting is necessary. The same is true for the consumer, which can simply use local references to the Attributes.

    Adding a custom Attribute

    Now we're going to implement our own custom Attribute for part-of-speech tagging and consequently call it PartOfSpeechAttribute. First we need to define the interface of the new Attribute:
      public interface PartOfSpeechAttribute extends Attribute {
        public static enum PartOfSpeech {
          Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
        }
      
        public void setPartOfSpeech(PartOfSpeech pos);
      
        public PartOfSpeech getPartOfSpeech();
      }
    
    Now we also need to write the implementing class. The name of that class is important here: By default, Lucene checks if there is a class with the name of the Attribute with the postfix 'Impl'. In this example, we would consequently call the implementing class PartOfSpeechAttributeImpl.
    This should be the usual behavior. However, there is also an expert-API that allows changing these naming conventions: {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. The factory accepts an Attribute interface as argument and returns an actual instance. You can implement your own factory if you need to change the default behavior.
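
    A minimal sketch (not part of the original documentation; the class name and the special-cased attribute are assumptions) of such a custom factory, which bypasses the 'Impl' naming lookup for one attribute and delegates everything else to the default factory:

      public class MyAttributeFactory extends AttributeSource.AttributeFactory {
        private final AttributeSource.AttributeFactory delegate =
            AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;

        public AttributeImpl createAttributeInstance(Class attClass) {
          if (attClass == PartOfSpeechAttribute.class) {
            return new PartOfSpeechAttributeImpl();   // explicit mapping instead of the naming convention
          }
          return delegate.createAttributeInstance(attClass);
        }
      }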

    Now here is the actual class that implements our new Attribute. Notice that the class has to extend {@link org.apache.lucene.util.AttributeImpl}:
    public final class PartOfSpeechAttributeImpl extends AttributeImpl 
                                implements PartOfSpeechAttribute {
      
      private PartOfSpeech pos = PartOfSpeech.Unknown;
      
      public void setPartOfSpeech(PartOfSpeech pos) {
        this.pos = pos;
      }
      
      public PartOfSpeech getPartOfSpeech() {
        return pos;
      }
    
      public void clear() {
        pos = PartOfSpeech.Unknown;
      }
    
      public void copyTo(AttributeImpl target) {
        ((PartOfSpeechAttributeImpl) target).pos = pos;
      }
    
      public boolean equals(Object other) {
        if (other == this) {
          return true;
        }
        
        if (other instanceof PartOfSpeechAttributeImpl) {
          return pos == ((PartOfSpeechAttributeImpl) other).pos;
        }
     
        return false;
      }
    
      public int hashCode() {
        return pos.ordinal();
      }
    }
    
    This is a simple Attribute implementation that has only a single variable storing the part-of-speech of a token. It extends the new AttributeImpl class and therefore implements its abstract methods clear(), copyTo(), equals(), and hashCode(). Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
      public static class PartOfSpeechTaggingFilter extends TokenFilter {
        PartOfSpeechAttribute posAtt;
        TermAttribute termAtt;
        
        protected PartOfSpeechTaggingFilter(TokenStream input) {
          super(input);
          posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
          termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        }
        
        public boolean incrementToken() throws IOException {
          if (!input.incrementToken()) {return false;}
          posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
          return true;
        }
        
        // determine the part of speech for the given term
        protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
          // naive implementation that tags every uppercased word as noun
          if (length > 0 && Character.isUpperCase(term[0])) {
            return PartOfSpeech.Noun;
          }
          return PartOfSpeech.Unknown;
        }
      }
    
    Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and stores references in instance variables. Notice how you only need to pass in the interface of the new Attribute; instantiating the correct implementing class is taken care of automatically. Now we need to add the filter to the chain:
      public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream stream = new WhitespaceTokenizer(reader);
        stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
        stream = new PartOfSpeechTaggingFilter(stream);
        return stream;
      }
    
    Now let's look at the output:
    This
    demo
    the
    new
    TokenStream
    API
    
    Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer to make use of the new PartOfSpeechAttribute and print it out:
      public static void main(String[] args) throws IOException {
        // text to tokenize
        final String text = "This is a demo of the new TokenStream API";
        
        MyAnalyzer analyzer = new MyAnalyzer();
        TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
        
        // get the TermAttribute from the TokenStream
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        
        // get the PartOfSpeechAttribute from the TokenStream
        PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.addAttribute(PartOfSpeechAttribute.class);
        
        stream.reset();
    
        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
          System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
        }
        
        stream.end();
        stream.close();
      }
    
    The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in the while loop that consumes the stream. Here is the new output:
    This: Noun
    demo: Unknown
    the: Unknown
    new: Unknown
    TokenStream: Noun
    API: Noun
    
    Each word is now followed by its assigned PartOfSpeech tag. Of course this is naive part-of-speech tagging. The word 'This' should not even be tagged as a noun; it is only capitalized because it is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new API the reader could now write an Attribute and TokenFilter that can specify for each word whether it was the first token of a sentence or not. Then the PartOfSpeechTaggingFilter could make use of this knowledge and only tag capitalized words as nouns if they are not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise). As a small hint, this is how the new Attribute class could begin:
      public class FirstTokenOfSentenceAttributeImpl extends AttributeImpl
                       implements FirstTokenOfSentenceAttribute {
        
        private boolean firstToken;
        
        public void setFirstToken(boolean firstToken) {
          this.firstToken = firstToken;
        }
        
        public boolean getFirstToken() {
          return firstToken;
        }
    
        public void clear() {
          firstToken = false;
        }
    
      ...
    
    lucene-2.9.4/src/java/org/apache/lucene/analysis/MappingCharFilter.java0000644000175000017500000001023211474320222026466 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.io.Reader; import java.util.LinkedList; /** * Simplistic {@link CharFilter} that applies the mappings * contained in a {@link NormalizeCharMap} to the character * stream, and correcting the resulting changes to the * offsets. */ public class MappingCharFilter extends BaseCharFilter { private final NormalizeCharMap normMap; //private LinkedList buffer; private LinkedList buffer; private String replacement; private int charPointer; private int nextCharCounter; /** Default constructor that takes a {@link CharStream}. */ public MappingCharFilter(NormalizeCharMap normMap, CharStream in) { super(in); this.normMap = normMap; } /** Easy-use constructor that takes a {@link Reader}. */ public MappingCharFilter(NormalizeCharMap normMap, Reader in) { super(CharReader.get(in)); this.normMap = normMap; } public int read() throws IOException { while(true) { if (replacement != null && charPointer < replacement.length()) { return replacement.charAt(charPointer++); } int firstChar = nextChar(); if (firstChar == -1) return -1; NormalizeCharMap nm = normMap.submap != null ? 
(NormalizeCharMap)normMap.submap.get(CharacterCache.valueOf((char) firstChar)) : null; if (nm == null) return firstChar; NormalizeCharMap result = match(nm); if (result == null) return firstChar; replacement = result.normStr; charPointer = 0; if (result.diff != 0) { int prevCumulativeDiff = getLastCumulativeDiff(); if (result.diff < 0) { for(int i = 0; i < -result.diff ; i++) addOffCorrectMap(nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i); } else { addOffCorrectMap(nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff); } } } } private int nextChar() throws IOException { nextCharCounter++; if (buffer != null && !buffer.isEmpty()) { return ((Character)buffer.removeFirst()).charValue(); } return input.read(); } private void pushChar(int c) { nextCharCounter--; if(buffer == null) buffer = new LinkedList(); buffer.addFirst(new Character((char) c)); } private void pushLastChar(int c) { if (buffer == null) { buffer = new LinkedList(); } buffer.addLast(new Character((char) c)); } private NormalizeCharMap match(NormalizeCharMap map) throws IOException { NormalizeCharMap result = null; if (map.submap != null) { int chr = nextChar(); if (chr != -1) { NormalizeCharMap subMap = (NormalizeCharMap) map.submap.get(CharacterCache.valueOf((char) chr)); if (subMap != null) { result = match(subMap); } if (result == null) { pushChar(chr); } } } if (result == null && map.normStr != null) { result = map; } return result; } public int read(char[] cbuf, int off, int len) throws IOException { char[] tmp = new char[len]; int l = input.read(tmp, 0, len); if (l != -1) { for(int i = 0; i < l; i++) pushLastChar(tmp[i]); } l = 0; for(int i = off; i < off + len; i++) { int c = read(); if (c == -1) break; cbuf[i] = (char) c; l++; } return l == 0 ? -1 : l; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/ASCIIFoldingFilter.java0000644000175000017500000033441311474320222026442 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.ArrayUtil; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * This class converts alphabetic, numeric, and symbolic Unicode characters * which are not in the first 127 ASCII characters (the "Basic Latin" Unicode * block) into their ASCII equivalents, if one exists. * * Characters from the following Unicode blocks are converted; however, only * those characters with reasonable ASCII alternatives are converted: * * * * See: http://en.wikipedia.org/wiki/Latin_characters_in_Unicode * * The set of character conversions supported by this class is a superset of * those supported by Lucene's {@link ISOLatin1AccentFilter} which strips * accents from Latin1 characters. 
For example, 'à' will be replaced by * 'a'. */ public final class ASCIIFoldingFilter extends TokenFilter { public ASCIIFoldingFilter(TokenStream input) { super(input); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private char[] output = new char[512]; private int outputPos; private TermAttribute termAtt; public boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.termBuffer(); final int length = termAtt.termLength(); // If no characters actually require rewriting then we // just return token as-is: for(int i = 0 ; i < length ; ++i) { final char c = buffer[i]; if (c >= '\u0080') { foldToASCII(buffer, length); termAtt.setTermBuffer(output, 0, outputPos); break; } } return true; } else { return false; } } /** * Converts characters above ASCII to their ASCII equivalents. For example, * accents are removed from accented characters. * @param input The string to fold * @param length The number of characters in the input string */ public void foldToASCII(char[] input, int length) { // Worst-case length required: final int maxSizeNeeded = 4 * length; if (output.length < maxSizeNeeded) { output = new char[ArrayUtil.getNextSize(maxSizeNeeded)]; } outputPos = 0; for (int pos = 0 ; pos < length ; ++pos) { final char c = input[pos]; // Quick test: if it's not in range then just keep current character if (c < '\u0080') { output[outputPos++] = c; } else { switch (c) { case '\u00C0': // À [LATIN CAPITAL LETTER A WITH GRAVE] case '\u00C1': // à [LATIN CAPITAL LETTER A WITH ACUTE] case '\u00C2': //  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX] case '\u00C3': // à [LATIN CAPITAL LETTER A WITH TILDE] case '\u00C4': // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS] case '\u00C5': // Ã… [LATIN CAPITAL LETTER A WITH RING ABOVE] case '\u0100': // Ä€ [LATIN CAPITAL LETTER A WITH MACRON] case '\u0102': // Ä‚ [LATIN CAPITAL LETTER A WITH BREVE] case '\u0104': // Ä„ [LATIN CAPITAL LETTER A WITH OGONEK] case '\u018F': // Æ http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA] case '\u01CD': // Ç [LATIN CAPITAL LETTER A WITH CARON] case '\u01DE': // Çž [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON] case '\u01E0': // Ç  [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON] case '\u01FA': // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE] case '\u0200': // È€ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE] case '\u0202': // È‚ [LATIN CAPITAL LETTER A WITH INVERTED BREVE] case '\u0226': // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE] case '\u023A': // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE] case '\u1D00': // á´€ [LATIN LETTER SMALL CAPITAL A] case '\u1E00': // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW] case '\u1EA0': // Ạ [LATIN CAPITAL LETTER A WITH DOT BELOW] case '\u1EA2': // Ả [LATIN CAPITAL LETTER A WITH HOOK ABOVE] case '\u1EA4': // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE] case '\u1EA6': // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE] case '\u1EA8': // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1EAA': // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE] case '\u1EAC': // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW] case '\u1EAE': // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE] case '\u1EB0': // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE] case '\u1EB2': // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE] case '\u1EB4': // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE] case '\u1EB6': // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW] case '\u24B6': // â’¶ [CIRCLED LATIN CAPITAL 
LETTER A] case '\uFF21': // A [FULLWIDTH LATIN CAPITAL LETTER A] output[outputPos++] = 'A'; break; case '\u00E0': // à [LATIN SMALL LETTER A WITH GRAVE] case '\u00E1': // á [LATIN SMALL LETTER A WITH ACUTE] case '\u00E2': // â [LATIN SMALL LETTER A WITH CIRCUMFLEX] case '\u00E3': // ã [LATIN SMALL LETTER A WITH TILDE] case '\u00E4': // ä [LATIN SMALL LETTER A WITH DIAERESIS] case '\u00E5': // Ã¥ [LATIN SMALL LETTER A WITH RING ABOVE] case '\u0101': // Ä [LATIN SMALL LETTER A WITH MACRON] case '\u0103': // ă [LATIN SMALL LETTER A WITH BREVE] case '\u0105': // Ä… [LATIN SMALL LETTER A WITH OGONEK] case '\u01CE': // ÇŽ [LATIN SMALL LETTER A WITH CARON] case '\u01DF': // ÇŸ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON] case '\u01E1': // Ç¡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON] case '\u01FB': // Ç» [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE] case '\u0201': // È [LATIN SMALL LETTER A WITH DOUBLE GRAVE] case '\u0203': // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE] case '\u0227': // ȧ [LATIN SMALL LETTER A WITH DOT ABOVE] case '\u0250': // É [LATIN SMALL LETTER TURNED A] case '\u0259': // É™ [LATIN SMALL LETTER SCHWA] case '\u025A': // Éš [LATIN SMALL LETTER SCHWA WITH HOOK] case '\u1D8F': // á¶ [LATIN SMALL LETTER A WITH RETROFLEX HOOK] case '\u1D95': // á¶• [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK] case '\u1E01': // ạ [LATIN SMALL LETTER A WITH RING BELOW] case '\u1E9A': // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING] case '\u1EA1': // ạ [LATIN SMALL LETTER A WITH DOT BELOW] case '\u1EA3': // ả [LATIN SMALL LETTER A WITH HOOK ABOVE] case '\u1EA5': // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE] case '\u1EA7': // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE] case '\u1EA9': // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1EAB': // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE] case '\u1EAD': // ậ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW] case '\u1EAF': // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE] case '\u1EB1': // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE] case '\u1EB3': // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE] case '\u1EB5': // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE] case '\u1EB7': // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW] case '\u2090': // â‚ [LATIN SUBSCRIPT SMALL LETTER A] case '\u2094': // â‚” [LATIN SUBSCRIPT SMALL LETTER SCHWA] case '\u24D0': // â“ [CIRCLED LATIN SMALL LETTER A] case '\u2C65': // â±¥ [LATIN SMALL LETTER A WITH STROKE] case '\u2C6F': // Ɐ [LATIN CAPITAL LETTER TURNED A] case '\uFF41': // ï½ [FULLWIDTH LATIN SMALL LETTER A] output[outputPos++] = 'a'; break; case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA] output[outputPos++] = 'A'; output[outputPos++] = 'A'; break; case '\u00C6': // Æ [LATIN CAPITAL LETTER AE] case '\u01E2': // Ç¢ [LATIN CAPITAL LETTER AE WITH MACRON] case '\u01FC': // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE] case '\u1D01': // á´ [LATIN LETTER SMALL CAPITAL AE] output[outputPos++] = 'A'; output[outputPos++] = 'E'; break; case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO] output[outputPos++] = 'A'; output[outputPos++] = 'O'; break; case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU] output[outputPos++] = 'A'; output[outputPos++] = 'U'; break; case '\uA738': // Ꜹ [LATIN CAPITAL LETTER AV] case '\uA73A': // Ꜻ [LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR] output[outputPos++] = 'A'; output[outputPos++] = 'V'; break; case '\uA73C': // Ꜽ [LATIN CAPITAL LETTER AY] output[outputPos++] = 'A'; output[outputPos++] = 'Y'; break; case '\u249C': // â’œ [PARENTHESIZED LATIN SMALL 
LETTER A] output[outputPos++] = '('; output[outputPos++] = 'a'; output[outputPos++] = ')'; break; case '\uA733': // ꜳ [LATIN SMALL LETTER AA] output[outputPos++] = 'a'; output[outputPos++] = 'a'; break; case '\u00E6': // æ [LATIN SMALL LETTER AE] case '\u01E3': // Ç£ [LATIN SMALL LETTER AE WITH MACRON] case '\u01FD': // ǽ [LATIN SMALL LETTER AE WITH ACUTE] case '\u1D02': // á´‚ [LATIN SMALL LETTER TURNED AE] output[outputPos++] = 'a'; output[outputPos++] = 'e'; break; case '\uA735': // ꜵ [LATIN SMALL LETTER AO] output[outputPos++] = 'a'; output[outputPos++] = 'o'; break; case '\uA737': // ꜷ [LATIN SMALL LETTER AU] output[outputPos++] = 'a'; output[outputPos++] = 'u'; break; case '\uA739': // ꜹ [LATIN SMALL LETTER AV] case '\uA73B': // ꜻ [LATIN SMALL LETTER AV WITH HORIZONTAL BAR] output[outputPos++] = 'a'; output[outputPos++] = 'v'; break; case '\uA73D': // ꜽ [LATIN SMALL LETTER AY] output[outputPos++] = 'a'; output[outputPos++] = 'y'; break; case '\u0181': // Æ [LATIN CAPITAL LETTER B WITH HOOK] case '\u0182': // Æ‚ [LATIN CAPITAL LETTER B WITH TOPBAR] case '\u0243': // Ƀ [LATIN CAPITAL LETTER B WITH STROKE] case '\u0299': // Ê™ [LATIN LETTER SMALL CAPITAL B] case '\u1D03': // á´ƒ [LATIN LETTER SMALL CAPITAL BARRED B] case '\u1E02': // Ḃ [LATIN CAPITAL LETTER B WITH DOT ABOVE] case '\u1E04': // Ḅ [LATIN CAPITAL LETTER B WITH DOT BELOW] case '\u1E06': // Ḇ [LATIN CAPITAL LETTER B WITH LINE BELOW] case '\u24B7': // â’· [CIRCLED LATIN CAPITAL LETTER B] case '\uFF22': // ï¼¢ [FULLWIDTH LATIN CAPITAL LETTER B] output[outputPos++] = 'B'; break; case '\u0180': // Æ€ [LATIN SMALL LETTER B WITH STROKE] case '\u0183': // ƃ [LATIN SMALL LETTER B WITH TOPBAR] case '\u0253': // É“ [LATIN SMALL LETTER B WITH HOOK] case '\u1D6C': // ᵬ [LATIN SMALL LETTER B WITH MIDDLE TILDE] case '\u1D80': // á¶€ [LATIN SMALL LETTER B WITH PALATAL HOOK] case '\u1E03': // ḃ [LATIN SMALL LETTER B WITH DOT ABOVE] case '\u1E05': // ḅ [LATIN SMALL LETTER B WITH DOT BELOW] case '\u1E07': // ḇ [LATIN SMALL LETTER B WITH LINE BELOW] case '\u24D1': // â“‘ [CIRCLED LATIN SMALL LETTER B] case '\uFF42': // b [FULLWIDTH LATIN SMALL LETTER B] output[outputPos++] = 'b'; break; case '\u249D': // â’ [PARENTHESIZED LATIN SMALL LETTER B] output[outputPos++] = '('; output[outputPos++] = 'b'; output[outputPos++] = ')'; break; case '\u00C7': // Ç [LATIN CAPITAL LETTER C WITH CEDILLA] case '\u0106': // Ć [LATIN CAPITAL LETTER C WITH ACUTE] case '\u0108': // Ĉ [LATIN CAPITAL LETTER C WITH CIRCUMFLEX] case '\u010A': // ÄŠ [LATIN CAPITAL LETTER C WITH DOT ABOVE] case '\u010C': // ÄŒ [LATIN CAPITAL LETTER C WITH CARON] case '\u0187': // Ƈ [LATIN CAPITAL LETTER C WITH HOOK] case '\u023B': // È» [LATIN CAPITAL LETTER C WITH STROKE] case '\u0297': // Ê— [LATIN LETTER STRETCHED C] case '\u1D04': // á´„ [LATIN LETTER SMALL CAPITAL C] case '\u1E08': // Ḉ [LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE] case '\u24B8': // â’¸ [CIRCLED LATIN CAPITAL LETTER C] case '\uFF23': // ï¼£ [FULLWIDTH LATIN CAPITAL LETTER C] output[outputPos++] = 'C'; break; case '\u00E7': // ç [LATIN SMALL LETTER C WITH CEDILLA] case '\u0107': // ć [LATIN SMALL LETTER C WITH ACUTE] case '\u0109': // ĉ [LATIN SMALL LETTER C WITH CIRCUMFLEX] case '\u010B': // Ä‹ [LATIN SMALL LETTER C WITH DOT ABOVE] case '\u010D': // Ä [LATIN SMALL LETTER C WITH CARON] case '\u0188': // ƈ [LATIN SMALL LETTER C WITH HOOK] case '\u023C': // ȼ [LATIN SMALL LETTER C WITH STROKE] case '\u0255': // É• [LATIN SMALL LETTER C WITH CURL] case '\u1E09': // ḉ [LATIN SMALL LETTER C WITH CEDILLA AND ACUTE] case 
'\u2184': // ↄ [LATIN SMALL LETTER REVERSED C] case '\u24D2': // â“’ [CIRCLED LATIN SMALL LETTER C] case '\uA73E': // Ꜿ [LATIN CAPITAL LETTER REVERSED C WITH DOT] case '\uA73F': // ꜿ [LATIN SMALL LETTER REVERSED C WITH DOT] case '\uFF43': // c [FULLWIDTH LATIN SMALL LETTER C] output[outputPos++] = 'c'; break; case '\u249E': // â’ž [PARENTHESIZED LATIN SMALL LETTER C] output[outputPos++] = '('; output[outputPos++] = 'c'; output[outputPos++] = ')'; break; case '\u00D0': // à [LATIN CAPITAL LETTER ETH] case '\u010E': // ÄŽ [LATIN CAPITAL LETTER D WITH CARON] case '\u0110': // Ä [LATIN CAPITAL LETTER D WITH STROKE] case '\u0189': // Ɖ [LATIN CAPITAL LETTER AFRICAN D] case '\u018A': // ÆŠ [LATIN CAPITAL LETTER D WITH HOOK] case '\u018B': // Æ‹ [LATIN CAPITAL LETTER D WITH TOPBAR] case '\u1D05': // á´… [LATIN LETTER SMALL CAPITAL D] case '\u1D06': // á´† [LATIN LETTER SMALL CAPITAL ETH] case '\u1E0A': // Ḋ [LATIN CAPITAL LETTER D WITH DOT ABOVE] case '\u1E0C': // Ḍ [LATIN CAPITAL LETTER D WITH DOT BELOW] case '\u1E0E': // Ḏ [LATIN CAPITAL LETTER D WITH LINE BELOW] case '\u1E10': // Ḡ[LATIN CAPITAL LETTER D WITH CEDILLA] case '\u1E12': // Ḓ [LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW] case '\u24B9': // â’¹ [CIRCLED LATIN CAPITAL LETTER D] case '\uA779': // ê¹ [LATIN CAPITAL LETTER INSULAR D] case '\uFF24': // D [FULLWIDTH LATIN CAPITAL LETTER D] output[outputPos++] = 'D'; break; case '\u00F0': // ð [LATIN SMALL LETTER ETH] case '\u010F': // Ä [LATIN SMALL LETTER D WITH CARON] case '\u0111': // Ä‘ [LATIN SMALL LETTER D WITH STROKE] case '\u018C': // ÆŒ [LATIN SMALL LETTER D WITH TOPBAR] case '\u0221': // È¡ [LATIN SMALL LETTER D WITH CURL] case '\u0256': // É– [LATIN SMALL LETTER D WITH TAIL] case '\u0257': // É— [LATIN SMALL LETTER D WITH HOOK] case '\u1D6D': // áµ­ [LATIN SMALL LETTER D WITH MIDDLE TILDE] case '\u1D81': // á¶ [LATIN SMALL LETTER D WITH PALATAL HOOK] case '\u1D91': // á¶‘ [LATIN SMALL LETTER D WITH HOOK AND TAIL] case '\u1E0B': // ḋ [LATIN SMALL LETTER D WITH DOT ABOVE] case '\u1E0D': // Ḡ[LATIN SMALL LETTER D WITH DOT BELOW] case '\u1E0F': // Ḡ[LATIN SMALL LETTER D WITH LINE BELOW] case '\u1E11': // ḑ [LATIN SMALL LETTER D WITH CEDILLA] case '\u1E13': // ḓ [LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW] case '\u24D3': // â““ [CIRCLED LATIN SMALL LETTER D] case '\uA77A': // êº [LATIN SMALL LETTER INSULAR D] case '\uFF44': // d [FULLWIDTH LATIN SMALL LETTER D] output[outputPos++] = 'd'; break; case '\u01C4': // Ç„ [LATIN CAPITAL LETTER DZ WITH CARON] case '\u01F1': // DZ [LATIN CAPITAL LETTER DZ] output[outputPos++] = 'D'; output[outputPos++] = 'Z'; break; case '\u01C5': // Ç… [LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON] case '\u01F2': // Dz [LATIN CAPITAL LETTER D WITH SMALL LETTER Z] output[outputPos++] = 'D'; output[outputPos++] = 'z'; break; case '\u249F': // â’Ÿ [PARENTHESIZED LATIN SMALL LETTER D] output[outputPos++] = '('; output[outputPos++] = 'd'; output[outputPos++] = ')'; break; case '\u0238': // ȸ [LATIN SMALL LETTER DB DIGRAPH] output[outputPos++] = 'd'; output[outputPos++] = 'b'; break; case '\u01C6': // dž [LATIN SMALL LETTER DZ WITH CARON] case '\u01F3': // dz [LATIN SMALL LETTER DZ] case '\u02A3': // Ê£ [LATIN SMALL LETTER DZ DIGRAPH] case '\u02A5': // Ê¥ [LATIN SMALL LETTER DZ DIGRAPH WITH CURL] output[outputPos++] = 'd'; output[outputPos++] = 'z'; break; case '\u00C8': // È [LATIN CAPITAL LETTER E WITH GRAVE] case '\u00C9': // É [LATIN CAPITAL LETTER E WITH ACUTE] case '\u00CA': // Ê [LATIN CAPITAL LETTER E WITH CIRCUMFLEX] case '\u00CB': // Ë [LATIN 
CAPITAL LETTER E WITH DIAERESIS] case '\u0112': // Ä’ [LATIN CAPITAL LETTER E WITH MACRON] case '\u0114': // Ä” [LATIN CAPITAL LETTER E WITH BREVE] case '\u0116': // Ä– [LATIN CAPITAL LETTER E WITH DOT ABOVE] case '\u0118': // Ę [LATIN CAPITAL LETTER E WITH OGONEK] case '\u011A': // Äš [LATIN CAPITAL LETTER E WITH CARON] case '\u018E': // ÆŽ [LATIN CAPITAL LETTER REVERSED E] case '\u0190': // Æ [LATIN CAPITAL LETTER OPEN E] case '\u0204': // È„ [LATIN CAPITAL LETTER E WITH DOUBLE GRAVE] case '\u0206': // Ȇ [LATIN CAPITAL LETTER E WITH INVERTED BREVE] case '\u0228': // Ȩ [LATIN CAPITAL LETTER E WITH CEDILLA] case '\u0246': // Ɇ [LATIN CAPITAL LETTER E WITH STROKE] case '\u1D07': // á´‡ [LATIN LETTER SMALL CAPITAL E] case '\u1E14': // Ḕ [LATIN CAPITAL LETTER E WITH MACRON AND GRAVE] case '\u1E16': // Ḗ [LATIN CAPITAL LETTER E WITH MACRON AND ACUTE] case '\u1E18': // Ḙ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW] case '\u1E1A': // Ḛ [LATIN CAPITAL LETTER E WITH TILDE BELOW] case '\u1E1C': // Ḝ [LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE] case '\u1EB8': // Ẹ [LATIN CAPITAL LETTER E WITH DOT BELOW] case '\u1EBA': // Ẻ [LATIN CAPITAL LETTER E WITH HOOK ABOVE] case '\u1EBC': // Ẽ [LATIN CAPITAL LETTER E WITH TILDE] case '\u1EBE': // Ế [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE] case '\u1EC0': // Ề [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE] case '\u1EC2': // Ể [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1EC4': // Ễ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE] case '\u1EC6': // Ệ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW] case '\u24BA': // â’º [CIRCLED LATIN CAPITAL LETTER E] case '\u2C7B': // â±» [LATIN LETTER SMALL CAPITAL TURNED E] case '\uFF25': // ï¼¥ [FULLWIDTH LATIN CAPITAL LETTER E] output[outputPos++] = 'E'; break; case '\u00E8': // è [LATIN SMALL LETTER E WITH GRAVE] case '\u00E9': // é [LATIN SMALL LETTER E WITH ACUTE] case '\u00EA': // ê [LATIN SMALL LETTER E WITH CIRCUMFLEX] case '\u00EB': // ë [LATIN SMALL LETTER E WITH DIAERESIS] case '\u0113': // Ä“ [LATIN SMALL LETTER E WITH MACRON] case '\u0115': // Ä• [LATIN SMALL LETTER E WITH BREVE] case '\u0117': // Ä— [LATIN SMALL LETTER E WITH DOT ABOVE] case '\u0119': // Ä™ [LATIN SMALL LETTER E WITH OGONEK] case '\u011B': // Ä› [LATIN SMALL LETTER E WITH CARON] case '\u01DD': // Ç [LATIN SMALL LETTER TURNED E] case '\u0205': // È… [LATIN SMALL LETTER E WITH DOUBLE GRAVE] case '\u0207': // ȇ [LATIN SMALL LETTER E WITH INVERTED BREVE] case '\u0229': // È© [LATIN SMALL LETTER E WITH CEDILLA] case '\u0247': // ɇ [LATIN SMALL LETTER E WITH STROKE] case '\u0258': // ɘ [LATIN SMALL LETTER REVERSED E] case '\u025B': // É› [LATIN SMALL LETTER OPEN E] case '\u025C': // Éœ [LATIN SMALL LETTER REVERSED OPEN E] case '\u025D': // É [LATIN SMALL LETTER REVERSED OPEN E WITH HOOK] case '\u025E': // Éž [LATIN SMALL LETTER CLOSED REVERSED OPEN E] case '\u029A': // Êš [LATIN SMALL LETTER CLOSED OPEN E] case '\u1D08': // á´ˆ [LATIN SMALL LETTER TURNED OPEN E] case '\u1D92': // á¶’ [LATIN SMALL LETTER E WITH RETROFLEX HOOK] case '\u1D93': // á¶“ [LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK] case '\u1D94': // á¶” [LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK] case '\u1E15': // ḕ [LATIN SMALL LETTER E WITH MACRON AND GRAVE] case '\u1E17': // ḗ [LATIN SMALL LETTER E WITH MACRON AND ACUTE] case '\u1E19': // ḙ [LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW] case '\u1E1B': // ḛ [LATIN SMALL LETTER E WITH TILDE BELOW] case '\u1E1D': // Ḡ[LATIN SMALL LETTER E WITH CEDILLA AND BREVE] case '\u1EB9': // ẹ 
[LATIN SMALL LETTER E WITH DOT BELOW] case '\u1EBB': // ẻ [LATIN SMALL LETTER E WITH HOOK ABOVE] case '\u1EBD': // ẽ [LATIN SMALL LETTER E WITH TILDE] case '\u1EBF': // ế [LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE] case '\u1EC1': // á» [LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE] case '\u1EC3': // ể [LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1EC5': // á»… [LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE] case '\u1EC7': // ệ [LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW] case '\u2091': // â‚‘ [LATIN SUBSCRIPT SMALL LETTER E] case '\u24D4': // â“” [CIRCLED LATIN SMALL LETTER E] case '\u2C78': // ⱸ [LATIN SMALL LETTER E WITH NOTCH] case '\uFF45': // ï½… [FULLWIDTH LATIN SMALL LETTER E] output[outputPos++] = 'e'; break; case '\u24A0': // â’  [PARENTHESIZED LATIN SMALL LETTER E] output[outputPos++] = '('; output[outputPos++] = 'e'; output[outputPos++] = ')'; break; case '\u0191': // Æ‘ [LATIN CAPITAL LETTER F WITH HOOK] case '\u1E1E': // Ḟ [LATIN CAPITAL LETTER F WITH DOT ABOVE] case '\u24BB': // â’» [CIRCLED LATIN CAPITAL LETTER F] case '\uA730': // ꜰ [LATIN LETTER SMALL CAPITAL F] case '\uA77B': // ê» [LATIN CAPITAL LETTER INSULAR F] case '\uA7FB': // ꟻ [LATIN EPIGRAPHIC LETTER REVERSED F] case '\uFF26': // F [FULLWIDTH LATIN CAPITAL LETTER F] output[outputPos++] = 'F'; break; case '\u0192': // Æ’ [LATIN SMALL LETTER F WITH HOOK] case '\u1D6E': // áµ® [LATIN SMALL LETTER F WITH MIDDLE TILDE] case '\u1D82': // á¶‚ [LATIN SMALL LETTER F WITH PALATAL HOOK] case '\u1E1F': // ḟ [LATIN SMALL LETTER F WITH DOT ABOVE] case '\u1E9B': // ẛ [LATIN SMALL LETTER LONG S WITH DOT ABOVE] case '\u24D5': // â“• [CIRCLED LATIN SMALL LETTER F] case '\uA77C': // ê¼ [LATIN SMALL LETTER INSULAR F] case '\uFF46': // f [FULLWIDTH LATIN SMALL LETTER F] output[outputPos++] = 'f'; break; case '\u24A1': // â’¡ [PARENTHESIZED LATIN SMALL LETTER F] output[outputPos++] = '('; output[outputPos++] = 'f'; output[outputPos++] = ')'; break; case '\uFB00': // ff [LATIN SMALL LIGATURE FF] output[outputPos++] = 'f'; output[outputPos++] = 'f'; break; case '\uFB03': // ffi [LATIN SMALL LIGATURE FFI] output[outputPos++] = 'f'; output[outputPos++] = 'f'; output[outputPos++] = 'i'; break; case '\uFB04': // ffl [LATIN SMALL LIGATURE FFL] output[outputPos++] = 'f'; output[outputPos++] = 'f'; output[outputPos++] = 'l'; break; case '\uFB01': // ï¬ [LATIN SMALL LIGATURE FI] output[outputPos++] = 'f'; output[outputPos++] = 'i'; break; case '\uFB02': // fl [LATIN SMALL LIGATURE FL] output[outputPos++] = 'f'; output[outputPos++] = 'l'; break; case '\u011C': // Äœ [LATIN CAPITAL LETTER G WITH CIRCUMFLEX] case '\u011E': // Äž [LATIN CAPITAL LETTER G WITH BREVE] case '\u0120': // Ä  [LATIN CAPITAL LETTER G WITH DOT ABOVE] case '\u0122': // Ä¢ [LATIN CAPITAL LETTER G WITH CEDILLA] case '\u0193': // Æ“ [LATIN CAPITAL LETTER G WITH HOOK] case '\u01E4': // Ǥ [LATIN CAPITAL LETTER G WITH STROKE] case '\u01E5': // Ç¥ [LATIN SMALL LETTER G WITH STROKE] case '\u01E6': // Ǧ [LATIN CAPITAL LETTER G WITH CARON] case '\u01E7': // ǧ [LATIN SMALL LETTER G WITH CARON] case '\u01F4': // Ç´ [LATIN CAPITAL LETTER G WITH ACUTE] case '\u0262': // É¢ [LATIN LETTER SMALL CAPITAL G] case '\u029B': // Ê› [LATIN LETTER SMALL CAPITAL G WITH HOOK] case '\u1E20': // Ḡ [LATIN CAPITAL LETTER G WITH MACRON] case '\u24BC': // â’¼ [CIRCLED LATIN CAPITAL LETTER G] case '\uA77D': // ê½ [LATIN CAPITAL LETTER INSULAR G] case '\uA77E': // ê¾ [LATIN CAPITAL LETTER TURNED INSULAR G] case '\uFF27': // ï¼§ [FULLWIDTH LATIN CAPITAL LETTER G] 
output[outputPos++] = 'G'; break; case '\u011D': // Ä [LATIN SMALL LETTER G WITH CIRCUMFLEX] case '\u011F': // ÄŸ [LATIN SMALL LETTER G WITH BREVE] case '\u0121': // Ä¡ [LATIN SMALL LETTER G WITH DOT ABOVE] case '\u0123': // Ä£ [LATIN SMALL LETTER G WITH CEDILLA] case '\u01F5': // ǵ [LATIN SMALL LETTER G WITH ACUTE] case '\u0260': // É  [LATIN SMALL LETTER G WITH HOOK] case '\u0261': // É¡ [LATIN SMALL LETTER SCRIPT G] case '\u1D77': // áµ· [LATIN SMALL LETTER TURNED G] case '\u1D79': // áµ¹ [LATIN SMALL LETTER INSULAR G] case '\u1D83': // ᶃ [LATIN SMALL LETTER G WITH PALATAL HOOK] case '\u1E21': // ḡ [LATIN SMALL LETTER G WITH MACRON] case '\u24D6': // â“– [CIRCLED LATIN SMALL LETTER G] case '\uA77F': // ê¿ [LATIN SMALL LETTER TURNED INSULAR G] case '\uFF47': // g [FULLWIDTH LATIN SMALL LETTER G] output[outputPos++] = 'g'; break; case '\u24A2': // â’¢ [PARENTHESIZED LATIN SMALL LETTER G] output[outputPos++] = '('; output[outputPos++] = 'g'; output[outputPos++] = ')'; break; case '\u0124': // Ĥ [LATIN CAPITAL LETTER H WITH CIRCUMFLEX] case '\u0126': // Ħ [LATIN CAPITAL LETTER H WITH STROKE] case '\u021E': // Èž [LATIN CAPITAL LETTER H WITH CARON] case '\u029C': // Êœ [LATIN LETTER SMALL CAPITAL H] case '\u1E22': // Ḣ [LATIN CAPITAL LETTER H WITH DOT ABOVE] case '\u1E24': // Ḥ [LATIN CAPITAL LETTER H WITH DOT BELOW] case '\u1E26': // Ḧ [LATIN CAPITAL LETTER H WITH DIAERESIS] case '\u1E28': // Ḩ [LATIN CAPITAL LETTER H WITH CEDILLA] case '\u1E2A': // Ḫ [LATIN CAPITAL LETTER H WITH BREVE BELOW] case '\u24BD': // â’½ [CIRCLED LATIN CAPITAL LETTER H] case '\u2C67': // â±§ [LATIN CAPITAL LETTER H WITH DESCENDER] case '\u2C75': // â±µ [LATIN CAPITAL LETTER HALF H] case '\uFF28': // H [FULLWIDTH LATIN CAPITAL LETTER H] output[outputPos++] = 'H'; break; case '\u0125': // Ä¥ [LATIN SMALL LETTER H WITH CIRCUMFLEX] case '\u0127': // ħ [LATIN SMALL LETTER H WITH STROKE] case '\u021F': // ÈŸ [LATIN SMALL LETTER H WITH CARON] case '\u0265': // É¥ [LATIN SMALL LETTER TURNED H] case '\u0266': // ɦ [LATIN SMALL LETTER H WITH HOOK] case '\u02AE': // Ê® [LATIN SMALL LETTER TURNED H WITH FISHHOOK] case '\u02AF': // ʯ [LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL] case '\u1E23': // ḣ [LATIN SMALL LETTER H WITH DOT ABOVE] case '\u1E25': // ḥ [LATIN SMALL LETTER H WITH DOT BELOW] case '\u1E27': // ḧ [LATIN SMALL LETTER H WITH DIAERESIS] case '\u1E29': // ḩ [LATIN SMALL LETTER H WITH CEDILLA] case '\u1E2B': // ḫ [LATIN SMALL LETTER H WITH BREVE BELOW] case '\u1E96': // ẖ [LATIN SMALL LETTER H WITH LINE BELOW] case '\u24D7': // â“— [CIRCLED LATIN SMALL LETTER H] case '\u2C68': // ⱨ [LATIN SMALL LETTER H WITH DESCENDER] case '\u2C76': // â±¶ [LATIN SMALL LETTER HALF H] case '\uFF48': // h [FULLWIDTH LATIN SMALL LETTER H] output[outputPos++] = 'h'; break; case '\u01F6': // Ƕ http://en.wikipedia.org/wiki/Hwair [LATIN CAPITAL LETTER HWAIR] output[outputPos++] = 'H'; output[outputPos++] = 'V'; break; case '\u24A3': // â’£ [PARENTHESIZED LATIN SMALL LETTER H] output[outputPos++] = '('; output[outputPos++] = 'h'; output[outputPos++] = ')'; break; case '\u0195': // Æ• [LATIN SMALL LETTER HV] output[outputPos++] = 'h'; output[outputPos++] = 'v'; break; case '\u00CC': // ÃŒ [LATIN CAPITAL LETTER I WITH GRAVE] case '\u00CD': // à [LATIN CAPITAL LETTER I WITH ACUTE] case '\u00CE': // ÃŽ [LATIN CAPITAL LETTER I WITH CIRCUMFLEX] case '\u00CF': // à [LATIN CAPITAL LETTER I WITH DIAERESIS] case '\u0128': // Ĩ [LATIN CAPITAL LETTER I WITH TILDE] case '\u012A': // Ī [LATIN CAPITAL LETTER I WITH MACRON] case '\u012C': // Ĭ 
[LATIN CAPITAL LETTER I WITH BREVE] case '\u012E': // Ä® [LATIN CAPITAL LETTER I WITH OGONEK] case '\u0130': // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] case '\u0196': // Æ– [LATIN CAPITAL LETTER IOTA] case '\u0197': // Æ— [LATIN CAPITAL LETTER I WITH STROKE] case '\u01CF': // Ç [LATIN CAPITAL LETTER I WITH CARON] case '\u0208': // Ȉ [LATIN CAPITAL LETTER I WITH DOUBLE GRAVE] case '\u020A': // ÈŠ [LATIN CAPITAL LETTER I WITH INVERTED BREVE] case '\u026A': // ɪ [LATIN LETTER SMALL CAPITAL I] case '\u1D7B': // áµ» [LATIN SMALL CAPITAL LETTER I WITH STROKE] case '\u1E2C': // Ḭ [LATIN CAPITAL LETTER I WITH TILDE BELOW] case '\u1E2E': // Ḯ [LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE] case '\u1EC8': // Ỉ [LATIN CAPITAL LETTER I WITH HOOK ABOVE] case '\u1ECA': // Ị [LATIN CAPITAL LETTER I WITH DOT BELOW] case '\u24BE': // â’¾ [CIRCLED LATIN CAPITAL LETTER I] case '\uA7FE': // ꟾ [LATIN EPIGRAPHIC LETTER I LONGA] case '\uFF29': // I [FULLWIDTH LATIN CAPITAL LETTER I] output[outputPos++] = 'I'; break; case '\u00EC': // ì [LATIN SMALL LETTER I WITH GRAVE] case '\u00ED': // í [LATIN SMALL LETTER I WITH ACUTE] case '\u00EE': // î [LATIN SMALL LETTER I WITH CIRCUMFLEX] case '\u00EF': // ï [LATIN SMALL LETTER I WITH DIAERESIS] case '\u0129': // Ä© [LATIN SMALL LETTER I WITH TILDE] case '\u012B': // Ä« [LATIN SMALL LETTER I WITH MACRON] case '\u012D': // Ä­ [LATIN SMALL LETTER I WITH BREVE] case '\u012F': // į [LATIN SMALL LETTER I WITH OGONEK] case '\u0131': // ı [LATIN SMALL LETTER DOTLESS I] case '\u01D0': // Ç [LATIN SMALL LETTER I WITH CARON] case '\u0209': // ȉ [LATIN SMALL LETTER I WITH DOUBLE GRAVE] case '\u020B': // È‹ [LATIN SMALL LETTER I WITH INVERTED BREVE] case '\u0268': // ɨ [LATIN SMALL LETTER I WITH STROKE] case '\u1D09': // á´‰ [LATIN SMALL LETTER TURNED I] case '\u1D62': // áµ¢ [LATIN SUBSCRIPT SMALL LETTER I] case '\u1D7C': // áµ¼ [LATIN SMALL LETTER IOTA WITH STROKE] case '\u1D96': // á¶– [LATIN SMALL LETTER I WITH RETROFLEX HOOK] case '\u1E2D': // ḭ [LATIN SMALL LETTER I WITH TILDE BELOW] case '\u1E2F': // ḯ [LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE] case '\u1EC9': // ỉ [LATIN SMALL LETTER I WITH HOOK ABOVE] case '\u1ECB': // ị [LATIN SMALL LETTER I WITH DOT BELOW] case '\u2071': // â± [SUPERSCRIPT LATIN SMALL LETTER I] case '\u24D8': // ⓘ [CIRCLED LATIN SMALL LETTER I] case '\uFF49': // i [FULLWIDTH LATIN SMALL LETTER I] output[outputPos++] = 'i'; break; case '\u0132': // IJ [LATIN CAPITAL LIGATURE IJ] output[outputPos++] = 'I'; output[outputPos++] = 'J'; break; case '\u24A4': // â’¤ [PARENTHESIZED LATIN SMALL LETTER I] output[outputPos++] = '('; output[outputPos++] = 'i'; output[outputPos++] = ')'; break; case '\u0133': // ij [LATIN SMALL LIGATURE IJ] output[outputPos++] = 'i'; output[outputPos++] = 'j'; break; case '\u0134': // Ä´ [LATIN CAPITAL LETTER J WITH CIRCUMFLEX] case '\u0248': // Ɉ [LATIN CAPITAL LETTER J WITH STROKE] case '\u1D0A': // á´Š [LATIN LETTER SMALL CAPITAL J] case '\u24BF': // â’¿ [CIRCLED LATIN CAPITAL LETTER J] case '\uFF2A': // J [FULLWIDTH LATIN CAPITAL LETTER J] output[outputPos++] = 'J'; break; case '\u0135': // ĵ [LATIN SMALL LETTER J WITH CIRCUMFLEX] case '\u01F0': // ǰ [LATIN SMALL LETTER J WITH CARON] case '\u0237': // È· [LATIN SMALL LETTER DOTLESS J] case '\u0249': // ɉ [LATIN SMALL LETTER J WITH STROKE] case '\u025F': // ÉŸ [LATIN SMALL LETTER DOTLESS J WITH STROKE] case '\u0284': // Ê„ [LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK] case '\u029D': // Ê [LATIN SMALL LETTER J WITH CROSSED-TAIL] case '\u24D9': // â“™ [CIRCLED LATIN 
SMALL LETTER J] case '\u2C7C': // â±¼ [LATIN SUBSCRIPT SMALL LETTER J] case '\uFF4A': // j [FULLWIDTH LATIN SMALL LETTER J] output[outputPos++] = 'j'; break; case '\u24A5': // â’¥ [PARENTHESIZED LATIN SMALL LETTER J] output[outputPos++] = '('; output[outputPos++] = 'j'; output[outputPos++] = ')'; break; case '\u0136': // Ķ [LATIN CAPITAL LETTER K WITH CEDILLA] case '\u0198': // Ƙ [LATIN CAPITAL LETTER K WITH HOOK] case '\u01E8': // Ǩ [LATIN CAPITAL LETTER K WITH CARON] case '\u1D0B': // á´‹ [LATIN LETTER SMALL CAPITAL K] case '\u1E30': // Ḱ [LATIN CAPITAL LETTER K WITH ACUTE] case '\u1E32': // Ḳ [LATIN CAPITAL LETTER K WITH DOT BELOW] case '\u1E34': // Ḵ [LATIN CAPITAL LETTER K WITH LINE BELOW] case '\u24C0': // â“€ [CIRCLED LATIN CAPITAL LETTER K] case '\u2C69': // Ⱪ [LATIN CAPITAL LETTER K WITH DESCENDER] case '\uA740': // ê€ [LATIN CAPITAL LETTER K WITH STROKE] case '\uA742': // ê‚ [LATIN CAPITAL LETTER K WITH DIAGONAL STROKE] case '\uA744': // ê„ [LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE] case '\uFF2B': // K [FULLWIDTH LATIN CAPITAL LETTER K] output[outputPos++] = 'K'; break; case '\u0137': // Ä· [LATIN SMALL LETTER K WITH CEDILLA] case '\u0199': // Æ™ [LATIN SMALL LETTER K WITH HOOK] case '\u01E9': // Ç© [LATIN SMALL LETTER K WITH CARON] case '\u029E': // Êž [LATIN SMALL LETTER TURNED K] case '\u1D84': // á¶„ [LATIN SMALL LETTER K WITH PALATAL HOOK] case '\u1E31': // ḱ [LATIN SMALL LETTER K WITH ACUTE] case '\u1E33': // ḳ [LATIN SMALL LETTER K WITH DOT BELOW] case '\u1E35': // ḵ [LATIN SMALL LETTER K WITH LINE BELOW] case '\u24DA': // ⓚ [CIRCLED LATIN SMALL LETTER K] case '\u2C6A': // ⱪ [LATIN SMALL LETTER K WITH DESCENDER] case '\uA741': // ê [LATIN SMALL LETTER K WITH STROKE] case '\uA743': // êƒ [LATIN SMALL LETTER K WITH DIAGONAL STROKE] case '\uA745': // ê… [LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE] case '\uFF4B': // k [FULLWIDTH LATIN SMALL LETTER K] output[outputPos++] = 'k'; break; case '\u24A6': // â’¦ [PARENTHESIZED LATIN SMALL LETTER K] output[outputPos++] = '('; output[outputPos++] = 'k'; output[outputPos++] = ')'; break; case '\u0139': // Ĺ [LATIN CAPITAL LETTER L WITH ACUTE] case '\u013B': // Ä» [LATIN CAPITAL LETTER L WITH CEDILLA] case '\u013D': // Ľ [LATIN CAPITAL LETTER L WITH CARON] case '\u013F': // Ä¿ [LATIN CAPITAL LETTER L WITH MIDDLE DOT] case '\u0141': // Å [LATIN CAPITAL LETTER L WITH STROKE] case '\u023D': // Ƚ [LATIN CAPITAL LETTER L WITH BAR] case '\u029F': // ÊŸ [LATIN LETTER SMALL CAPITAL L] case '\u1D0C': // á´Œ [LATIN LETTER SMALL CAPITAL L WITH STROKE] case '\u1E36': // Ḷ [LATIN CAPITAL LETTER L WITH DOT BELOW] case '\u1E38': // Ḹ [LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON] case '\u1E3A': // Ḻ [LATIN CAPITAL LETTER L WITH LINE BELOW] case '\u1E3C': // Ḽ [LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW] case '\u24C1': // â“ [CIRCLED LATIN CAPITAL LETTER L] case '\u2C60': // â±  [LATIN CAPITAL LETTER L WITH DOUBLE BAR] case '\u2C62': // â±¢ [LATIN CAPITAL LETTER L WITH MIDDLE TILDE] case '\uA746': // ê† [LATIN CAPITAL LETTER BROKEN L] case '\uA748': // êˆ [LATIN CAPITAL LETTER L WITH HIGH STROKE] case '\uA780': // Ꞁ [LATIN CAPITAL LETTER TURNED L] case '\uFF2C': // L [FULLWIDTH LATIN CAPITAL LETTER L] output[outputPos++] = 'L'; break; case '\u013A': // ĺ [LATIN SMALL LETTER L WITH ACUTE] case '\u013C': // ļ [LATIN SMALL LETTER L WITH CEDILLA] case '\u013E': // ľ [LATIN SMALL LETTER L WITH CARON] case '\u0140': // Å€ [LATIN SMALL LETTER L WITH MIDDLE DOT] case '\u0142': // Å‚ [LATIN SMALL LETTER L WITH STROKE] case '\u019A': 
// Æš [LATIN SMALL LETTER L WITH BAR] case '\u0234': // È´ [LATIN SMALL LETTER L WITH CURL] case '\u026B': // É« [LATIN SMALL LETTER L WITH MIDDLE TILDE] case '\u026C': // ɬ [LATIN SMALL LETTER L WITH BELT] case '\u026D': // É­ [LATIN SMALL LETTER L WITH RETROFLEX HOOK] case '\u1D85': // á¶… [LATIN SMALL LETTER L WITH PALATAL HOOK] case '\u1E37': // ḷ [LATIN SMALL LETTER L WITH DOT BELOW] case '\u1E39': // ḹ [LATIN SMALL LETTER L WITH DOT BELOW AND MACRON] case '\u1E3B': // ḻ [LATIN SMALL LETTER L WITH LINE BELOW] case '\u1E3D': // ḽ [LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW] case '\u24DB': // â“› [CIRCLED LATIN SMALL LETTER L] case '\u2C61': // ⱡ [LATIN SMALL LETTER L WITH DOUBLE BAR] case '\uA747': // ê‡ [LATIN SMALL LETTER BROKEN L] case '\uA749': // ê‰ [LATIN SMALL LETTER L WITH HIGH STROKE] case '\uA781': // êž [LATIN SMALL LETTER TURNED L] case '\uFF4C': // l [FULLWIDTH LATIN SMALL LETTER L] output[outputPos++] = 'l'; break; case '\u01C7': // LJ [LATIN CAPITAL LETTER LJ] output[outputPos++] = 'L'; output[outputPos++] = 'J'; break; case '\u1EFA': // Ỻ [LATIN CAPITAL LETTER MIDDLE-WELSH LL] output[outputPos++] = 'L'; output[outputPos++] = 'L'; break; case '\u01C8': // Lj [LATIN CAPITAL LETTER L WITH SMALL LETTER J] output[outputPos++] = 'L'; output[outputPos++] = 'j'; break; case '\u24A7': // â’§ [PARENTHESIZED LATIN SMALL LETTER L] output[outputPos++] = '('; output[outputPos++] = 'l'; output[outputPos++] = ')'; break; case '\u01C9': // lj [LATIN SMALL LETTER LJ] output[outputPos++] = 'l'; output[outputPos++] = 'j'; break; case '\u1EFB': // á»» [LATIN SMALL LETTER MIDDLE-WELSH LL] output[outputPos++] = 'l'; output[outputPos++] = 'l'; break; case '\u02AA': // ʪ [LATIN SMALL LETTER LS DIGRAPH] output[outputPos++] = 'l'; output[outputPos++] = 's'; break; case '\u02AB': // Ê« [LATIN SMALL LETTER LZ DIGRAPH] output[outputPos++] = 'l'; output[outputPos++] = 'z'; break; case '\u019C': // Æœ [LATIN CAPITAL LETTER TURNED M] case '\u1D0D': // á´ [LATIN LETTER SMALL CAPITAL M] case '\u1E3E': // Ḿ [LATIN CAPITAL LETTER M WITH ACUTE] case '\u1E40': // á¹€ [LATIN CAPITAL LETTER M WITH DOT ABOVE] case '\u1E42': // Ṃ [LATIN CAPITAL LETTER M WITH DOT BELOW] case '\u24C2': // â“‚ [CIRCLED LATIN CAPITAL LETTER M] case '\u2C6E': // â±® [LATIN CAPITAL LETTER M WITH HOOK] case '\uA7FD': // ꟽ [LATIN EPIGRAPHIC LETTER INVERTED M] case '\uA7FF': // ꟿ [LATIN EPIGRAPHIC LETTER ARCHAIC M] case '\uFF2D': // ï¼­ [FULLWIDTH LATIN CAPITAL LETTER M] output[outputPos++] = 'M'; break; case '\u026F': // ɯ [LATIN SMALL LETTER TURNED M] case '\u0270': // ɰ [LATIN SMALL LETTER TURNED M WITH LONG LEG] case '\u0271': // ɱ [LATIN SMALL LETTER M WITH HOOK] case '\u1D6F': // ᵯ [LATIN SMALL LETTER M WITH MIDDLE TILDE] case '\u1D86': // ᶆ [LATIN SMALL LETTER M WITH PALATAL HOOK] case '\u1E3F': // ḿ [LATIN SMALL LETTER M WITH ACUTE] case '\u1E41': // á¹ [LATIN SMALL LETTER M WITH DOT ABOVE] case '\u1E43': // ṃ [LATIN SMALL LETTER M WITH DOT BELOW] case '\u24DC': // ⓜ [CIRCLED LATIN SMALL LETTER M] case '\uFF4D': // ï½ [FULLWIDTH LATIN SMALL LETTER M] output[outputPos++] = 'm'; break; case '\u24A8': // â’¨ [PARENTHESIZED LATIN SMALL LETTER M] output[outputPos++] = '('; output[outputPos++] = 'm'; output[outputPos++] = ')'; break; case '\u00D1': // Ñ [LATIN CAPITAL LETTER N WITH TILDE] case '\u0143': // Ń [LATIN CAPITAL LETTER N WITH ACUTE] case '\u0145': // Å… [LATIN CAPITAL LETTER N WITH CEDILLA] case '\u0147': // Ň [LATIN CAPITAL LETTER N WITH CARON] case '\u014A': // ÅŠ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN CAPITAL 
LETTER ENG] case '\u019D': // Æ [LATIN CAPITAL LETTER N WITH LEFT HOOK] case '\u01F8': // Ǹ [LATIN CAPITAL LETTER N WITH GRAVE] case '\u0220': // È  [LATIN CAPITAL LETTER N WITH LONG RIGHT LEG] case '\u0274': // É´ [LATIN LETTER SMALL CAPITAL N] case '\u1D0E': // á´Ž [LATIN LETTER SMALL CAPITAL REVERSED N] case '\u1E44': // Ṅ [LATIN CAPITAL LETTER N WITH DOT ABOVE] case '\u1E46': // Ṇ [LATIN CAPITAL LETTER N WITH DOT BELOW] case '\u1E48': // Ṉ [LATIN CAPITAL LETTER N WITH LINE BELOW] case '\u1E4A': // Ṋ [LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW] case '\u24C3': // Ⓝ [CIRCLED LATIN CAPITAL LETTER N] case '\uFF2E': // ï¼® [FULLWIDTH LATIN CAPITAL LETTER N] output[outputPos++] = 'N'; break; case '\u00F1': // ñ [LATIN SMALL LETTER N WITH TILDE] case '\u0144': // Å„ [LATIN SMALL LETTER N WITH ACUTE] case '\u0146': // ņ [LATIN SMALL LETTER N WITH CEDILLA] case '\u0148': // ň [LATIN SMALL LETTER N WITH CARON] case '\u0149': // ʼn [LATIN SMALL LETTER N PRECEDED BY APOSTROPHE] case '\u014B': // Å‹ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN SMALL LETTER ENG] case '\u019E': // Æž [LATIN SMALL LETTER N WITH LONG RIGHT LEG] case '\u01F9': // ǹ [LATIN SMALL LETTER N WITH GRAVE] case '\u0235': // ȵ [LATIN SMALL LETTER N WITH CURL] case '\u0272': // ɲ [LATIN SMALL LETTER N WITH LEFT HOOK] case '\u0273': // ɳ [LATIN SMALL LETTER N WITH RETROFLEX HOOK] case '\u1D70': // áµ° [LATIN SMALL LETTER N WITH MIDDLE TILDE] case '\u1D87': // ᶇ [LATIN SMALL LETTER N WITH PALATAL HOOK] case '\u1E45': // á¹… [LATIN SMALL LETTER N WITH DOT ABOVE] case '\u1E47': // ṇ [LATIN SMALL LETTER N WITH DOT BELOW] case '\u1E49': // ṉ [LATIN SMALL LETTER N WITH LINE BELOW] case '\u1E4B': // ṋ [LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW] case '\u207F': // â¿ [SUPERSCRIPT LATIN SMALL LETTER N] case '\u24DD': // â“ [CIRCLED LATIN SMALL LETTER N] case '\uFF4E': // n [FULLWIDTH LATIN SMALL LETTER N] output[outputPos++] = 'n'; break; case '\u01CA': // ÇŠ [LATIN CAPITAL LETTER NJ] output[outputPos++] = 'N'; output[outputPos++] = 'J'; break; case '\u01CB': // Ç‹ [LATIN CAPITAL LETTER N WITH SMALL LETTER J] output[outputPos++] = 'N'; output[outputPos++] = 'j'; break; case '\u24A9': // â’© [PARENTHESIZED LATIN SMALL LETTER N] output[outputPos++] = '('; output[outputPos++] = 'n'; output[outputPos++] = ')'; break; case '\u01CC': // ÇŒ [LATIN SMALL LETTER NJ] output[outputPos++] = 'n'; output[outputPos++] = 'j'; break; case '\u00D2': // Ã’ [LATIN CAPITAL LETTER O WITH GRAVE] case '\u00D3': // Ó [LATIN CAPITAL LETTER O WITH ACUTE] case '\u00D4': // Ô [LATIN CAPITAL LETTER O WITH CIRCUMFLEX] case '\u00D5': // Õ [LATIN CAPITAL LETTER O WITH TILDE] case '\u00D6': // Ö [LATIN CAPITAL LETTER O WITH DIAERESIS] case '\u00D8': // Ø [LATIN CAPITAL LETTER O WITH STROKE] case '\u014C': // ÅŒ [LATIN CAPITAL LETTER O WITH MACRON] case '\u014E': // ÅŽ [LATIN CAPITAL LETTER O WITH BREVE] case '\u0150': // Å [LATIN CAPITAL LETTER O WITH DOUBLE ACUTE] case '\u0186': // Ɔ [LATIN CAPITAL LETTER OPEN O] case '\u019F': // ÆŸ [LATIN CAPITAL LETTER O WITH MIDDLE TILDE] case '\u01A0': // Æ  [LATIN CAPITAL LETTER O WITH HORN] case '\u01D1': // Ç‘ [LATIN CAPITAL LETTER O WITH CARON] case '\u01EA': // Ǫ [LATIN CAPITAL LETTER O WITH OGONEK] case '\u01EC': // Ǭ [LATIN CAPITAL LETTER O WITH OGONEK AND MACRON] case '\u01FE': // Ǿ [LATIN CAPITAL LETTER O WITH STROKE AND ACUTE] case '\u020C': // ÈŒ [LATIN CAPITAL LETTER O WITH DOUBLE GRAVE] case '\u020E': // ÈŽ [LATIN CAPITAL LETTER O WITH INVERTED BREVE] case '\u022A': // Ȫ [LATIN CAPITAL LETTER O WITH DIAERESIS AND 
MACRON] case '\u022C': // Ȭ [LATIN CAPITAL LETTER O WITH TILDE AND MACRON] case '\u022E': // È® [LATIN CAPITAL LETTER O WITH DOT ABOVE] case '\u0230': // Ȱ [LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON] case '\u1D0F': // á´ [LATIN LETTER SMALL CAPITAL O] case '\u1D10': // á´ [LATIN LETTER SMALL CAPITAL OPEN O] case '\u1E4C': // Ṍ [LATIN CAPITAL LETTER O WITH TILDE AND ACUTE] case '\u1E4E': // Ṏ [LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS] case '\u1E50': // á¹ [LATIN CAPITAL LETTER O WITH MACRON AND GRAVE] case '\u1E52': // á¹’ [LATIN CAPITAL LETTER O WITH MACRON AND ACUTE] case '\u1ECC': // Ọ [LATIN CAPITAL LETTER O WITH DOT BELOW] case '\u1ECE': // Ỏ [LATIN CAPITAL LETTER O WITH HOOK ABOVE] case '\u1ED0': // á» [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE] case '\u1ED2': // á»’ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE] case '\u1ED4': // á»” [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1ED6': // á»– [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE] case '\u1ED8': // Ộ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW] case '\u1EDA': // Ớ [LATIN CAPITAL LETTER O WITH HORN AND ACUTE] case '\u1EDC': // Ờ [LATIN CAPITAL LETTER O WITH HORN AND GRAVE] case '\u1EDE': // Ở [LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE] case '\u1EE0': // á»  [LATIN CAPITAL LETTER O WITH HORN AND TILDE] case '\u1EE2': // Ợ [LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW] case '\u24C4': // â“„ [CIRCLED LATIN CAPITAL LETTER O] case '\uA74A': // êŠ [LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY] case '\uA74C': // êŒ [LATIN CAPITAL LETTER O WITH LOOP] case '\uFF2F': // O [FULLWIDTH LATIN CAPITAL LETTER O] output[outputPos++] = 'O'; break; case '\u00F2': // ò [LATIN SMALL LETTER O WITH GRAVE] case '\u00F3': // ó [LATIN SMALL LETTER O WITH ACUTE] case '\u00F4': // ô [LATIN SMALL LETTER O WITH CIRCUMFLEX] case '\u00F5': // õ [LATIN SMALL LETTER O WITH TILDE] case '\u00F6': // ö [LATIN SMALL LETTER O WITH DIAERESIS] case '\u00F8': // ø [LATIN SMALL LETTER O WITH STROKE] case '\u014D': // Å [LATIN SMALL LETTER O WITH MACRON] case '\u014F': // Å [LATIN SMALL LETTER O WITH BREVE] case '\u0151': // Å‘ [LATIN SMALL LETTER O WITH DOUBLE ACUTE] case '\u01A1': // Æ¡ [LATIN SMALL LETTER O WITH HORN] case '\u01D2': // Ç’ [LATIN SMALL LETTER O WITH CARON] case '\u01EB': // Ç« [LATIN SMALL LETTER O WITH OGONEK] case '\u01ED': // Ç­ [LATIN SMALL LETTER O WITH OGONEK AND MACRON] case '\u01FF': // Ç¿ [LATIN SMALL LETTER O WITH STROKE AND ACUTE] case '\u020D': // È [LATIN SMALL LETTER O WITH DOUBLE GRAVE] case '\u020F': // È [LATIN SMALL LETTER O WITH INVERTED BREVE] case '\u022B': // È« [LATIN SMALL LETTER O WITH DIAERESIS AND MACRON] case '\u022D': // È­ [LATIN SMALL LETTER O WITH TILDE AND MACRON] case '\u022F': // ȯ [LATIN SMALL LETTER O WITH DOT ABOVE] case '\u0231': // ȱ [LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON] case '\u0254': // É” [LATIN SMALL LETTER OPEN O] case '\u0275': // ɵ [LATIN SMALL LETTER BARRED O] case '\u1D16': // á´– [LATIN SMALL LETTER TOP HALF O] case '\u1D17': // á´— [LATIN SMALL LETTER BOTTOM HALF O] case '\u1D97': // á¶— [LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK] case '\u1E4D': // á¹ [LATIN SMALL LETTER O WITH TILDE AND ACUTE] case '\u1E4F': // á¹ [LATIN SMALL LETTER O WITH TILDE AND DIAERESIS] case '\u1E51': // ṑ [LATIN SMALL LETTER O WITH MACRON AND GRAVE] case '\u1E53': // ṓ [LATIN SMALL LETTER O WITH MACRON AND ACUTE] case '\u1ECD': // á» [LATIN SMALL LETTER O WITH DOT BELOW] case '\u1ECF': // á» [LATIN SMALL LETTER O WITH HOOK ABOVE] case 
'\u1ED1': // ố [LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE] case '\u1ED3': // ồ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE] case '\u1ED5': // ổ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] case '\u1ED7': // á»— [LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE] case '\u1ED9': // á»™ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW] case '\u1EDB': // á»› [LATIN SMALL LETTER O WITH HORN AND ACUTE] case '\u1EDD': // á» [LATIN SMALL LETTER O WITH HORN AND GRAVE] case '\u1EDF': // ở [LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE] case '\u1EE1': // ỡ [LATIN SMALL LETTER O WITH HORN AND TILDE] case '\u1EE3': // ợ [LATIN SMALL LETTER O WITH HORN AND DOT BELOW] case '\u2092': // â‚’ [LATIN SUBSCRIPT SMALL LETTER O] case '\u24DE': // ⓞ [CIRCLED LATIN SMALL LETTER O] case '\u2C7A': // ⱺ [LATIN SMALL LETTER O WITH LOW RING INSIDE] case '\uA74B': // ê‹ [LATIN SMALL LETTER O WITH LONG STROKE OVERLAY] case '\uA74D': // ê [LATIN SMALL LETTER O WITH LOOP] case '\uFF4F': // ï½ [FULLWIDTH LATIN SMALL LETTER O] output[outputPos++] = 'o'; break; case '\u0152': // Å’ [LATIN CAPITAL LIGATURE OE] case '\u0276': // ɶ [LATIN LETTER SMALL CAPITAL OE] output[outputPos++] = 'O'; output[outputPos++] = 'E'; break; case '\uA74E': // êŽ [LATIN CAPITAL LETTER OO] output[outputPos++] = 'O'; output[outputPos++] = 'O'; break; case '\u0222': // È¢ http://en.wikipedia.org/wiki/OU [LATIN CAPITAL LETTER OU] case '\u1D15': // á´• [LATIN LETTER SMALL CAPITAL OU] output[outputPos++] = 'O'; output[outputPos++] = 'U'; break; case '\u24AA': // â’ª [PARENTHESIZED LATIN SMALL LETTER O] output[outputPos++] = '('; output[outputPos++] = 'o'; output[outputPos++] = ')'; break; case '\u0153': // Å“ [LATIN SMALL LIGATURE OE] case '\u1D14': // á´” [LATIN SMALL LETTER TURNED OE] output[outputPos++] = 'o'; output[outputPos++] = 'e'; break; case '\uA74F': // ê [LATIN SMALL LETTER OO] output[outputPos++] = 'o'; output[outputPos++] = 'o'; break; case '\u0223': // È£ http://en.wikipedia.org/wiki/OU [LATIN SMALL LETTER OU] output[outputPos++] = 'o'; output[outputPos++] = 'u'; break; case '\u01A4': // Ƥ [LATIN CAPITAL LETTER P WITH HOOK] case '\u1D18': // á´˜ [LATIN LETTER SMALL CAPITAL P] case '\u1E54': // á¹” [LATIN CAPITAL LETTER P WITH ACUTE] case '\u1E56': // á¹– [LATIN CAPITAL LETTER P WITH DOT ABOVE] case '\u24C5': // â“… [CIRCLED LATIN CAPITAL LETTER P] case '\u2C63': // â±£ [LATIN CAPITAL LETTER P WITH STROKE] case '\uA750': // ê [LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER] case '\uA752': // ê’ [LATIN CAPITAL LETTER P WITH FLOURISH] case '\uA754': // ê” [LATIN CAPITAL LETTER P WITH SQUIRREL TAIL] case '\uFF30': // ï¼° [FULLWIDTH LATIN CAPITAL LETTER P] output[outputPos++] = 'P'; break; case '\u01A5': // Æ¥ [LATIN SMALL LETTER P WITH HOOK] case '\u1D71': // áµ± [LATIN SMALL LETTER P WITH MIDDLE TILDE] case '\u1D7D': // áµ½ [LATIN SMALL LETTER P WITH STROKE] case '\u1D88': // ᶈ [LATIN SMALL LETTER P WITH PALATAL HOOK] case '\u1E55': // ṕ [LATIN SMALL LETTER P WITH ACUTE] case '\u1E57': // á¹— [LATIN SMALL LETTER P WITH DOT ABOVE] case '\u24DF': // ⓟ [CIRCLED LATIN SMALL LETTER P] case '\uA751': // ê‘ [LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER] case '\uA753': // ê“ [LATIN SMALL LETTER P WITH FLOURISH] case '\uA755': // ê• [LATIN SMALL LETTER P WITH SQUIRREL TAIL] case '\uA7FC': // ꟼ [LATIN EPIGRAPHIC LETTER REVERSED P] case '\uFF50': // ï½ [FULLWIDTH LATIN SMALL LETTER P] output[outputPos++] = 'p'; break; case '\u24AB': // â’« [PARENTHESIZED LATIN SMALL LETTER P] output[outputPos++] = '('; output[outputPos++] 
= 'p'; output[outputPos++] = ')'; break; case '\u024A': // ÉŠ [LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL] case '\u24C6': // Ⓠ [CIRCLED LATIN CAPITAL LETTER Q] case '\uA756': // ê– [LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER] case '\uA758': // ê˜ [LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE] case '\uFF31': // ï¼± [FULLWIDTH LATIN CAPITAL LETTER Q] output[outputPos++] = 'Q'; break; case '\u0138': // ĸ http://en.wikipedia.org/wiki/Kra_(letter) [LATIN SMALL LETTER KRA] case '\u024B': // É‹ [LATIN SMALL LETTER Q WITH HOOK TAIL] case '\u02A0': // Ê  [LATIN SMALL LETTER Q WITH HOOK] case '\u24E0': // â“  [CIRCLED LATIN SMALL LETTER Q] case '\uA757': // ê— [LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER] case '\uA759': // ê™ [LATIN SMALL LETTER Q WITH DIAGONAL STROKE] case '\uFF51': // q [FULLWIDTH LATIN SMALL LETTER Q] output[outputPos++] = 'q'; break; case '\u24AC': // â’¬ [PARENTHESIZED LATIN SMALL LETTER Q] output[outputPos++] = '('; output[outputPos++] = 'q'; output[outputPos++] = ')'; break; case '\u0239': // ȹ [LATIN SMALL LETTER QP DIGRAPH] output[outputPos++] = 'q'; output[outputPos++] = 'p'; break; case '\u0154': // Å” [LATIN CAPITAL LETTER R WITH ACUTE] case '\u0156': // Å– [LATIN CAPITAL LETTER R WITH CEDILLA] case '\u0158': // Ř [LATIN CAPITAL LETTER R WITH CARON] case '\u0210': // È’ [LATIN CAPITAL LETTER R WITH DOUBLE GRAVE] case '\u0212': // È’ [LATIN CAPITAL LETTER R WITH INVERTED BREVE] case '\u024C': // ÉŒ [LATIN CAPITAL LETTER R WITH STROKE] case '\u0280': // Ê€ [LATIN LETTER SMALL CAPITAL R] case '\u0281': // Ê [LATIN LETTER SMALL CAPITAL INVERTED R] case '\u1D19': // á´™ [LATIN LETTER SMALL CAPITAL REVERSED R] case '\u1D1A': // á´š [LATIN LETTER SMALL CAPITAL TURNED R] case '\u1E58': // Ṙ [LATIN CAPITAL LETTER R WITH DOT ABOVE] case '\u1E5A': // Ṛ [LATIN CAPITAL LETTER R WITH DOT BELOW] case '\u1E5C': // Ṝ [LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON] case '\u1E5E': // Ṟ [LATIN CAPITAL LETTER R WITH LINE BELOW] case '\u24C7': // Ⓡ [CIRCLED LATIN CAPITAL LETTER R] case '\u2C64': // Ɽ [LATIN CAPITAL LETTER R WITH TAIL] case '\uA75A': // êš [LATIN CAPITAL LETTER R ROTUNDA] case '\uA782': // êž‚ [LATIN CAPITAL LETTER INSULAR R] case '\uFF32': // ï¼² [FULLWIDTH LATIN CAPITAL LETTER R] output[outputPos++] = 'R'; break; case '\u0155': // Å• [LATIN SMALL LETTER R WITH ACUTE] case '\u0157': // Å— [LATIN SMALL LETTER R WITH CEDILLA] case '\u0159': // Å™ [LATIN SMALL LETTER R WITH CARON] case '\u0211': // È‘ [LATIN SMALL LETTER R WITH DOUBLE GRAVE] case '\u0213': // È“ [LATIN SMALL LETTER R WITH INVERTED BREVE] case '\u024D': // É [LATIN SMALL LETTER R WITH STROKE] case '\u027C': // ɼ [LATIN SMALL LETTER R WITH LONG LEG] case '\u027D': // ɽ [LATIN SMALL LETTER R WITH TAIL] case '\u027E': // ɾ [LATIN SMALL LETTER R WITH FISHHOOK] case '\u027F': // É¿ [LATIN SMALL LETTER REVERSED R WITH FISHHOOK] case '\u1D63': // áµ£ [LATIN SUBSCRIPT SMALL LETTER R] case '\u1D72': // áµ² [LATIN SMALL LETTER R WITH MIDDLE TILDE] case '\u1D73': // áµ³ [LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE] case '\u1D89': // ᶉ [LATIN SMALL LETTER R WITH PALATAL HOOK] case '\u1E59': // á¹™ [LATIN SMALL LETTER R WITH DOT ABOVE] case '\u1E5B': // á¹› [LATIN SMALL LETTER R WITH DOT BELOW] case '\u1E5D': // á¹ [LATIN SMALL LETTER R WITH DOT BELOW AND MACRON] case '\u1E5F': // ṟ [LATIN SMALL LETTER R WITH LINE BELOW] case '\u24E1': // â“¡ [CIRCLED LATIN SMALL LETTER R] case '\uA75B': // ê› [LATIN SMALL LETTER R ROTUNDA] case '\uA783': // ꞃ [LATIN SMALL LETTER INSULAR R] case '\uFF52': // ï½’ 
[FULLWIDTH LATIN SMALL LETTER R] output[outputPos++] = 'r'; break; case '\u24AD': // â’­ [PARENTHESIZED LATIN SMALL LETTER R] output[outputPos++] = '('; output[outputPos++] = 'r'; output[outputPos++] = ')'; break; case '\u015A': // Åš [LATIN CAPITAL LETTER S WITH ACUTE] case '\u015C': // Åœ [LATIN CAPITAL LETTER S WITH CIRCUMFLEX] case '\u015E': // Åž [LATIN CAPITAL LETTER S WITH CEDILLA] case '\u0160': // Å  [LATIN CAPITAL LETTER S WITH CARON] case '\u0218': // Ș [LATIN CAPITAL LETTER S WITH COMMA BELOW] case '\u1E60': // á¹  [LATIN CAPITAL LETTER S WITH DOT ABOVE] case '\u1E62': // á¹¢ [LATIN CAPITAL LETTER S WITH DOT BELOW] case '\u1E64': // Ṥ [LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE] case '\u1E66': // Ṧ [LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE] case '\u1E68': // Ṩ [LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE] case '\u24C8': // Ⓢ [CIRCLED LATIN CAPITAL LETTER S] case '\uA731': // ꜱ [LATIN LETTER SMALL CAPITAL S] case '\uA785': // êž… [LATIN SMALL LETTER INSULAR S] case '\uFF33': // ï¼³ [FULLWIDTH LATIN CAPITAL LETTER S] output[outputPos++] = 'S'; break; case '\u015B': // Å› [LATIN SMALL LETTER S WITH ACUTE] case '\u015D': // Å [LATIN SMALL LETTER S WITH CIRCUMFLEX] case '\u015F': // ÅŸ [LATIN SMALL LETTER S WITH CEDILLA] case '\u0161': // Å¡ [LATIN SMALL LETTER S WITH CARON] case '\u017F': // Å¿ http://en.wikipedia.org/wiki/Long_S [LATIN SMALL LETTER LONG S] case '\u0219': // È™ [LATIN SMALL LETTER S WITH COMMA BELOW] case '\u023F': // È¿ [LATIN SMALL LETTER S WITH SWASH TAIL] case '\u0282': // Ê‚ [LATIN SMALL LETTER S WITH HOOK] case '\u1D74': // áµ´ [LATIN SMALL LETTER S WITH MIDDLE TILDE] case '\u1D8A': // á¶Š [LATIN SMALL LETTER S WITH PALATAL HOOK] case '\u1E61': // ṡ [LATIN SMALL LETTER S WITH DOT ABOVE] case '\u1E63': // á¹£ [LATIN SMALL LETTER S WITH DOT BELOW] case '\u1E65': // á¹¥ [LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE] case '\u1E67': // á¹§ [LATIN SMALL LETTER S WITH CARON AND DOT ABOVE] case '\u1E69': // ṩ [LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE] case '\u1E9C': // ẜ [LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE] case '\u1E9D': // Ạ[LATIN SMALL LETTER LONG S WITH HIGH STROKE] case '\u24E2': // â“¢ [CIRCLED LATIN SMALL LETTER S] case '\uA784': // êž„ [LATIN CAPITAL LETTER INSULAR S] case '\uFF53': // s [FULLWIDTH LATIN SMALL LETTER S] output[outputPos++] = 's'; break; case '\u1E9E': // ẞ [LATIN CAPITAL LETTER SHARP S] output[outputPos++] = 'S'; output[outputPos++] = 'S'; break; case '\u24AE': // â’® [PARENTHESIZED LATIN SMALL LETTER S] output[outputPos++] = '('; output[outputPos++] = 's'; output[outputPos++] = ')'; break; case '\u00DF': // ß [LATIN SMALL LETTER SHARP S] output[outputPos++] = 's'; output[outputPos++] = 's'; break; case '\uFB06': // st [LATIN SMALL LIGATURE ST] output[outputPos++] = 's'; output[outputPos++] = 't'; break; case '\u0162': // Å¢ [LATIN CAPITAL LETTER T WITH CEDILLA] case '\u0164': // Ť [LATIN CAPITAL LETTER T WITH CARON] case '\u0166': // Ŧ [LATIN CAPITAL LETTER T WITH STROKE] case '\u01AC': // Ƭ [LATIN CAPITAL LETTER T WITH HOOK] case '\u01AE': // Æ® [LATIN CAPITAL LETTER T WITH RETROFLEX HOOK] case '\u021A': // Èš [LATIN CAPITAL LETTER T WITH COMMA BELOW] case '\u023E': // Ⱦ [LATIN CAPITAL LETTER T WITH DIAGONAL STROKE] case '\u1D1B': // á´› [LATIN LETTER SMALL CAPITAL T] case '\u1E6A': // Ṫ [LATIN CAPITAL LETTER T WITH DOT ABOVE] case '\u1E6C': // Ṭ [LATIN CAPITAL LETTER T WITH DOT BELOW] case '\u1E6E': // á¹® [LATIN CAPITAL LETTER T WITH LINE BELOW] case '\u1E70': // á¹° [LATIN CAPITAL LETTER T WITH 
CIRCUMFLEX BELOW] case '\u24C9': // Ⓣ [CIRCLED LATIN CAPITAL LETTER T] case '\uA786': // Ꞇ [LATIN CAPITAL LETTER INSULAR T] case '\uFF34': // ï¼´ [FULLWIDTH LATIN CAPITAL LETTER T] output[outputPos++] = 'T'; break; case '\u0163': // Å£ [LATIN SMALL LETTER T WITH CEDILLA] case '\u0165': // Å¥ [LATIN SMALL LETTER T WITH CARON] case '\u0167': // ŧ [LATIN SMALL LETTER T WITH STROKE] case '\u01AB': // Æ« [LATIN SMALL LETTER T WITH PALATAL HOOK] case '\u01AD': // Æ­ [LATIN SMALL LETTER T WITH HOOK] case '\u021B': // È› [LATIN SMALL LETTER T WITH COMMA BELOW] case '\u0236': // ȶ [LATIN SMALL LETTER T WITH CURL] case '\u0287': // ʇ [LATIN SMALL LETTER TURNED T] case '\u0288': // ʈ [LATIN SMALL LETTER T WITH RETROFLEX HOOK] case '\u1D75': // áµµ [LATIN SMALL LETTER T WITH MIDDLE TILDE] case '\u1E6B': // ṫ [LATIN SMALL LETTER T WITH DOT ABOVE] case '\u1E6D': // á¹­ [LATIN SMALL LETTER T WITH DOT BELOW] case '\u1E6F': // ṯ [LATIN SMALL LETTER T WITH LINE BELOW] case '\u1E71': // á¹± [LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW] case '\u1E97': // ẗ [LATIN SMALL LETTER T WITH DIAERESIS] case '\u24E3': // â“£ [CIRCLED LATIN SMALL LETTER T] case '\u2C66': // ⱦ [LATIN SMALL LETTER T WITH DIAGONAL STROKE] case '\uFF54': // ï½” [FULLWIDTH LATIN SMALL LETTER T] output[outputPos++] = 't'; break; case '\u00DE': // Þ [LATIN CAPITAL LETTER THORN] case '\uA766': // ê¦ [LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER] output[outputPos++] = 'T'; output[outputPos++] = 'H'; break; case '\uA728': // Ꜩ [LATIN CAPITAL LETTER TZ] output[outputPos++] = 'T'; output[outputPos++] = 'Z'; break; case '\u24AF': // â’¯ [PARENTHESIZED LATIN SMALL LETTER T] output[outputPos++] = '('; output[outputPos++] = 't'; output[outputPos++] = ')'; break; case '\u02A8': // ʨ [LATIN SMALL LETTER TC DIGRAPH WITH CURL] output[outputPos++] = 't'; output[outputPos++] = 'c'; break; case '\u00FE': // þ [LATIN SMALL LETTER THORN] case '\u1D7A': // ᵺ [LATIN SMALL LETTER TH WITH STRIKETHROUGH] case '\uA767': // ê§ [LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER] output[outputPos++] = 't'; output[outputPos++] = 'h'; break; case '\u02A6': // ʦ [LATIN SMALL LETTER TS DIGRAPH] output[outputPos++] = 't'; output[outputPos++] = 's'; break; case '\uA729': // ꜩ [LATIN SMALL LETTER TZ] output[outputPos++] = 't'; output[outputPos++] = 'z'; break; case '\u00D9': // Ù [LATIN CAPITAL LETTER U WITH GRAVE] case '\u00DA': // Ú [LATIN CAPITAL LETTER U WITH ACUTE] case '\u00DB': // Û [LATIN CAPITAL LETTER U WITH CIRCUMFLEX] case '\u00DC': // Ü [LATIN CAPITAL LETTER U WITH DIAERESIS] case '\u0168': // Ũ [LATIN CAPITAL LETTER U WITH TILDE] case '\u016A': // Ū [LATIN CAPITAL LETTER U WITH MACRON] case '\u016C': // Ŭ [LATIN CAPITAL LETTER U WITH BREVE] case '\u016E': // Å® [LATIN CAPITAL LETTER U WITH RING ABOVE] case '\u0170': // Ű [LATIN CAPITAL LETTER U WITH DOUBLE ACUTE] case '\u0172': // Ų [LATIN CAPITAL LETTER U WITH OGONEK] case '\u01AF': // Ư [LATIN CAPITAL LETTER U WITH HORN] case '\u01D3': // Ç“ [LATIN CAPITAL LETTER U WITH CARON] case '\u01D5': // Ç• [LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON] case '\u01D7': // Ç— [LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE] case '\u01D9': // Ç™ [LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON] case '\u01DB': // Ç› [LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE] case '\u0214': // È” [LATIN CAPITAL LETTER U WITH DOUBLE GRAVE] case '\u0216': // È– [LATIN CAPITAL LETTER U WITH INVERTED BREVE] case '\u0244': // É„ [LATIN CAPITAL LETTER U BAR] case '\u1D1C': // á´œ [LATIN LETTER SMALL CAPITAL U] case 
'\u1D7E': // áµ¾ [LATIN SMALL CAPITAL LETTER U WITH STROKE] case '\u1E72': // á¹² [LATIN CAPITAL LETTER U WITH DIAERESIS BELOW] case '\u1E74': // á¹´ [LATIN CAPITAL LETTER U WITH TILDE BELOW] case '\u1E76': // á¹¶ [LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW] case '\u1E78': // Ṹ [LATIN CAPITAL LETTER U WITH TILDE AND ACUTE] case '\u1E7A': // Ṻ [LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS] case '\u1EE4': // Ụ [LATIN CAPITAL LETTER U WITH DOT BELOW] case '\u1EE6': // Ủ [LATIN CAPITAL LETTER U WITH HOOK ABOVE] case '\u1EE8': // Ứ [LATIN CAPITAL LETTER U WITH HORN AND ACUTE] case '\u1EEA': // Ừ [LATIN CAPITAL LETTER U WITH HORN AND GRAVE] case '\u1EEC': // Ử [LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE] case '\u1EEE': // á»® [LATIN CAPITAL LETTER U WITH HORN AND TILDE] case '\u1EF0': // á»° [LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW] case '\u24CA': // Ⓤ [CIRCLED LATIN CAPITAL LETTER U] case '\uFF35': // ï¼µ [FULLWIDTH LATIN CAPITAL LETTER U] output[outputPos++] = 'U'; break; case '\u00F9': // ù [LATIN SMALL LETTER U WITH GRAVE] case '\u00FA': // ú [LATIN SMALL LETTER U WITH ACUTE] case '\u00FB': // û [LATIN SMALL LETTER U WITH CIRCUMFLEX] case '\u00FC': // ü [LATIN SMALL LETTER U WITH DIAERESIS] case '\u0169': // Å© [LATIN SMALL LETTER U WITH TILDE] case '\u016B': // Å« [LATIN SMALL LETTER U WITH MACRON] case '\u016D': // Å­ [LATIN SMALL LETTER U WITH BREVE] case '\u016F': // ů [LATIN SMALL LETTER U WITH RING ABOVE] case '\u0171': // ű [LATIN SMALL LETTER U WITH DOUBLE ACUTE] case '\u0173': // ų [LATIN SMALL LETTER U WITH OGONEK] case '\u01B0': // ư [LATIN SMALL LETTER U WITH HORN] case '\u01D4': // Ç” [LATIN SMALL LETTER U WITH CARON] case '\u01D6': // Ç– [LATIN SMALL LETTER U WITH DIAERESIS AND MACRON] case '\u01D8': // ǘ [LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE] case '\u01DA': // Çš [LATIN SMALL LETTER U WITH DIAERESIS AND CARON] case '\u01DC': // Çœ [LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE] case '\u0215': // È• [LATIN SMALL LETTER U WITH DOUBLE GRAVE] case '\u0217': // È— [LATIN SMALL LETTER U WITH INVERTED BREVE] case '\u0289': // ʉ [LATIN SMALL LETTER U BAR] case '\u1D64': // ᵤ [LATIN SUBSCRIPT SMALL LETTER U] case '\u1D99': // á¶™ [LATIN SMALL LETTER U WITH RETROFLEX HOOK] case '\u1E73': // á¹³ [LATIN SMALL LETTER U WITH DIAERESIS BELOW] case '\u1E75': // á¹µ [LATIN SMALL LETTER U WITH TILDE BELOW] case '\u1E77': // á¹· [LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW] case '\u1E79': // á¹¹ [LATIN SMALL LETTER U WITH TILDE AND ACUTE] case '\u1E7B': // á¹» [LATIN SMALL LETTER U WITH MACRON AND DIAERESIS] case '\u1EE5': // ụ [LATIN SMALL LETTER U WITH DOT BELOW] case '\u1EE7': // á»§ [LATIN SMALL LETTER U WITH HOOK ABOVE] case '\u1EE9': // ứ [LATIN SMALL LETTER U WITH HORN AND ACUTE] case '\u1EEB': // ừ [LATIN SMALL LETTER U WITH HORN AND GRAVE] case '\u1EED': // á»­ [LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE] case '\u1EEF': // ữ [LATIN SMALL LETTER U WITH HORN AND TILDE] case '\u1EF1': // á»± [LATIN SMALL LETTER U WITH HORN AND DOT BELOW] case '\u24E4': // ⓤ [CIRCLED LATIN SMALL LETTER U] case '\uFF55': // u [FULLWIDTH LATIN SMALL LETTER U] output[outputPos++] = 'u'; break; case '\u24B0': // â’° [PARENTHESIZED LATIN SMALL LETTER U] output[outputPos++] = '('; output[outputPos++] = 'u'; output[outputPos++] = ')'; break; case '\u1D6B': // ᵫ [LATIN SMALL LETTER UE] output[outputPos++] = 'u'; output[outputPos++] = 'e'; break; case '\u01B2': // Ʋ [LATIN CAPITAL LETTER V WITH HOOK] case '\u0245': // É… [LATIN CAPITAL LETTER TURNED V] case '\u1D20': // á´  [LATIN LETTER 
SMALL CAPITAL V] case '\u1E7C': // á¹¼ [LATIN CAPITAL LETTER V WITH TILDE] case '\u1E7E': // á¹¾ [LATIN CAPITAL LETTER V WITH DOT BELOW] case '\u1EFC': // Ỽ [LATIN CAPITAL LETTER MIDDLE-WELSH V] case '\u24CB': // â“‹ [CIRCLED LATIN CAPITAL LETTER V] case '\uA75E': // êž [LATIN CAPITAL LETTER V WITH DIAGONAL STROKE] case '\uA768': // ê¨ [LATIN CAPITAL LETTER VEND] case '\uFF36': // ï¼¶ [FULLWIDTH LATIN CAPITAL LETTER V] output[outputPos++] = 'V'; break; case '\u028B': // Ê‹ [LATIN SMALL LETTER V WITH HOOK] case '\u028C': // ÊŒ [LATIN SMALL LETTER TURNED V] case '\u1D65': // áµ¥ [LATIN SUBSCRIPT SMALL LETTER V] case '\u1D8C': // á¶Œ [LATIN SMALL LETTER V WITH PALATAL HOOK] case '\u1E7D': // á¹½ [LATIN SMALL LETTER V WITH TILDE] case '\u1E7F': // ṿ [LATIN SMALL LETTER V WITH DOT BELOW] case '\u24E5': // â“¥ [CIRCLED LATIN SMALL LETTER V] case '\u2C71': // â±± [LATIN SMALL LETTER V WITH RIGHT HOOK] case '\u2C74': // â±´ [LATIN SMALL LETTER V WITH CURL] case '\uA75F': // êŸ [LATIN SMALL LETTER V WITH DIAGONAL STROKE] case '\uFF56': // ï½– [FULLWIDTH LATIN SMALL LETTER V] output[outputPos++] = 'v'; break; case '\uA760': // ê  [LATIN CAPITAL LETTER VY] output[outputPos++] = 'V'; output[outputPos++] = 'Y'; break; case '\u24B1': // â’± [PARENTHESIZED LATIN SMALL LETTER V] output[outputPos++] = '('; output[outputPos++] = 'v'; output[outputPos++] = ')'; break; case '\uA761': // ê¡ [LATIN SMALL LETTER VY] output[outputPos++] = 'v'; output[outputPos++] = 'y'; break; case '\u0174': // Å´ [LATIN CAPITAL LETTER W WITH CIRCUMFLEX] case '\u01F7': // Ç· http://en.wikipedia.org/wiki/Wynn [LATIN CAPITAL LETTER WYNN] case '\u1D21': // á´¡ [LATIN LETTER SMALL CAPITAL W] case '\u1E80': // Ẁ [LATIN CAPITAL LETTER W WITH GRAVE] case '\u1E82': // Ẃ [LATIN CAPITAL LETTER W WITH ACUTE] case '\u1E84': // Ẅ [LATIN CAPITAL LETTER W WITH DIAERESIS] case '\u1E86': // Ẇ [LATIN CAPITAL LETTER W WITH DOT ABOVE] case '\u1E88': // Ẉ [LATIN CAPITAL LETTER W WITH DOT BELOW] case '\u24CC': // Ⓦ [CIRCLED LATIN CAPITAL LETTER W] case '\u2C72': // â±² [LATIN CAPITAL LETTER W WITH HOOK] case '\uFF37': // ï¼· [FULLWIDTH LATIN CAPITAL LETTER W] output[outputPos++] = 'W'; break; case '\u0175': // ŵ [LATIN SMALL LETTER W WITH CIRCUMFLEX] case '\u01BF': // Æ¿ http://en.wikipedia.org/wiki/Wynn [LATIN LETTER WYNN] case '\u028D': // Ê [LATIN SMALL LETTER TURNED W] case '\u1E81': // Ạ[LATIN SMALL LETTER W WITH GRAVE] case '\u1E83': // ẃ [LATIN SMALL LETTER W WITH ACUTE] case '\u1E85': // ẅ [LATIN SMALL LETTER W WITH DIAERESIS] case '\u1E87': // ẇ [LATIN SMALL LETTER W WITH DOT ABOVE] case '\u1E89': // ẉ [LATIN SMALL LETTER W WITH DOT BELOW] case '\u1E98': // ẘ [LATIN SMALL LETTER W WITH RING ABOVE] case '\u24E6': // ⓦ [CIRCLED LATIN SMALL LETTER W] case '\u2C73': // â±³ [LATIN SMALL LETTER W WITH HOOK] case '\uFF57': // ï½— [FULLWIDTH LATIN SMALL LETTER W] output[outputPos++] = 'w'; break; case '\u24B2': // â’² [PARENTHESIZED LATIN SMALL LETTER W] output[outputPos++] = '('; output[outputPos++] = 'w'; output[outputPos++] = ')'; break; case '\u1E8A': // Ẋ [LATIN CAPITAL LETTER X WITH DOT ABOVE] case '\u1E8C': // Ẍ [LATIN CAPITAL LETTER X WITH DIAERESIS] case '\u24CD': // â“ [CIRCLED LATIN CAPITAL LETTER X] case '\uFF38': // X [FULLWIDTH LATIN CAPITAL LETTER X] output[outputPos++] = 'X'; break; case '\u1D8D': // á¶ [LATIN SMALL LETTER X WITH PALATAL HOOK] case '\u1E8B': // ẋ [LATIN SMALL LETTER X WITH DOT ABOVE] case '\u1E8D': // Ạ[LATIN SMALL LETTER X WITH DIAERESIS] case '\u2093': // â‚“ [LATIN SUBSCRIPT SMALL LETTER X] case '\u24E7': // â“§ 
[CIRCLED LATIN SMALL LETTER X] case '\uFF58': // x [FULLWIDTH LATIN SMALL LETTER X] output[outputPos++] = 'x'; break; case '\u24B3': // â’³ [PARENTHESIZED LATIN SMALL LETTER X] output[outputPos++] = '('; output[outputPos++] = 'x'; output[outputPos++] = ')'; break; case '\u00DD': // à [LATIN CAPITAL LETTER Y WITH ACUTE] case '\u0176': // Ŷ [LATIN CAPITAL LETTER Y WITH CIRCUMFLEX] case '\u0178': // Ÿ [LATIN CAPITAL LETTER Y WITH DIAERESIS] case '\u01B3': // Ƴ [LATIN CAPITAL LETTER Y WITH HOOK] case '\u0232': // Ȳ [LATIN CAPITAL LETTER Y WITH MACRON] case '\u024E': // ÉŽ [LATIN CAPITAL LETTER Y WITH STROKE] case '\u028F': // Ê [LATIN LETTER SMALL CAPITAL Y] case '\u1E8E': // Ẏ [LATIN CAPITAL LETTER Y WITH DOT ABOVE] case '\u1EF2': // Ỳ [LATIN CAPITAL LETTER Y WITH GRAVE] case '\u1EF4': // á»´ [LATIN CAPITAL LETTER Y WITH DOT BELOW] case '\u1EF6': // á»¶ [LATIN CAPITAL LETTER Y WITH HOOK ABOVE] case '\u1EF8': // Ỹ [LATIN CAPITAL LETTER Y WITH TILDE] case '\u1EFE': // Ỿ [LATIN CAPITAL LETTER Y WITH LOOP] case '\u24CE': // Ⓨ [CIRCLED LATIN CAPITAL LETTER Y] case '\uFF39': // ï¼¹ [FULLWIDTH LATIN CAPITAL LETTER Y] output[outputPos++] = 'Y'; break; case '\u00FD': // ý [LATIN SMALL LETTER Y WITH ACUTE] case '\u00FF': // ÿ [LATIN SMALL LETTER Y WITH DIAERESIS] case '\u0177': // Å· [LATIN SMALL LETTER Y WITH CIRCUMFLEX] case '\u01B4': // Æ´ [LATIN SMALL LETTER Y WITH HOOK] case '\u0233': // ȳ [LATIN SMALL LETTER Y WITH MACRON] case '\u024F': // É [LATIN SMALL LETTER Y WITH STROKE] case '\u028E': // ÊŽ [LATIN SMALL LETTER TURNED Y] case '\u1E8F': // Ạ[LATIN SMALL LETTER Y WITH DOT ABOVE] case '\u1E99': // ẙ [LATIN SMALL LETTER Y WITH RING ABOVE] case '\u1EF3': // ỳ [LATIN SMALL LETTER Y WITH GRAVE] case '\u1EF5': // ỵ [LATIN SMALL LETTER Y WITH DOT BELOW] case '\u1EF7': // á»· [LATIN SMALL LETTER Y WITH HOOK ABOVE] case '\u1EF9': // ỹ [LATIN SMALL LETTER Y WITH TILDE] case '\u1EFF': // ỿ [LATIN SMALL LETTER Y WITH LOOP] case '\u24E8': // ⓨ [CIRCLED LATIN SMALL LETTER Y] case '\uFF59': // ï½™ [FULLWIDTH LATIN SMALL LETTER Y] output[outputPos++] = 'y'; break; case '\u24B4': // â’´ [PARENTHESIZED LATIN SMALL LETTER Y] output[outputPos++] = '('; output[outputPos++] = 'y'; output[outputPos++] = ')'; break; case '\u0179': // Ź [LATIN CAPITAL LETTER Z WITH ACUTE] case '\u017B': // Å» [LATIN CAPITAL LETTER Z WITH DOT ABOVE] case '\u017D': // Ž [LATIN CAPITAL LETTER Z WITH CARON] case '\u01B5': // Ƶ [LATIN CAPITAL LETTER Z WITH STROKE] case '\u021C': // Èœ http://en.wikipedia.org/wiki/Yogh [LATIN CAPITAL LETTER YOGH] case '\u0224': // Ȥ [LATIN CAPITAL LETTER Z WITH HOOK] case '\u1D22': // á´¢ [LATIN LETTER SMALL CAPITAL Z] case '\u1E90': // Ạ[LATIN CAPITAL LETTER Z WITH CIRCUMFLEX] case '\u1E92': // Ẓ [LATIN CAPITAL LETTER Z WITH DOT BELOW] case '\u1E94': // Ẕ [LATIN CAPITAL LETTER Z WITH LINE BELOW] case '\u24CF': // â“ [CIRCLED LATIN CAPITAL LETTER Z] case '\u2C6B': // Ⱬ [LATIN CAPITAL LETTER Z WITH DESCENDER] case '\uA762': // ê¢ [LATIN CAPITAL LETTER VISIGOTHIC Z] case '\uFF3A': // Z [FULLWIDTH LATIN CAPITAL LETTER Z] output[outputPos++] = 'Z'; break; case '\u017A': // ź [LATIN SMALL LETTER Z WITH ACUTE] case '\u017C': // ż [LATIN SMALL LETTER Z WITH DOT ABOVE] case '\u017E': // ž [LATIN SMALL LETTER Z WITH CARON] case '\u01B6': // ƶ [LATIN SMALL LETTER Z WITH STROKE] case '\u021D': // È http://en.wikipedia.org/wiki/Yogh [LATIN SMALL LETTER YOGH] case '\u0225': // È¥ [LATIN SMALL LETTER Z WITH HOOK] case '\u0240': // É€ [LATIN SMALL LETTER Z WITH SWASH TAIL] case '\u0290': // Ê [LATIN SMALL LETTER Z WITH 
RETROFLEX HOOK] case '\u0291': // Ê‘ [LATIN SMALL LETTER Z WITH CURL] case '\u1D76': // áµ¶ [LATIN SMALL LETTER Z WITH MIDDLE TILDE] case '\u1D8E': // á¶Ž [LATIN SMALL LETTER Z WITH PALATAL HOOK] case '\u1E91': // ẑ [LATIN SMALL LETTER Z WITH CIRCUMFLEX] case '\u1E93': // ẓ [LATIN SMALL LETTER Z WITH DOT BELOW] case '\u1E95': // ẕ [LATIN SMALL LETTER Z WITH LINE BELOW] case '\u24E9': // â“© [CIRCLED LATIN SMALL LETTER Z] case '\u2C6C': // ⱬ [LATIN SMALL LETTER Z WITH DESCENDER] case '\uA763': // ê£ [LATIN SMALL LETTER VISIGOTHIC Z] case '\uFF5A': // z [FULLWIDTH LATIN SMALL LETTER Z] output[outputPos++] = 'z'; break; case '\u24B5': // â’µ [PARENTHESIZED LATIN SMALL LETTER Z] output[outputPos++] = '('; output[outputPos++] = 'z'; output[outputPos++] = ')'; break; case '\u2070': // â° [SUPERSCRIPT ZERO] case '\u2080': // â‚€ [SUBSCRIPT ZERO] case '\u24EA': // ⓪ [CIRCLED DIGIT ZERO] case '\u24FF': // â“¿ [NEGATIVE CIRCLED DIGIT ZERO] case '\uFF10': // ï¼ [FULLWIDTH DIGIT ZERO] output[outputPos++] = '0'; break; case '\u00B9': // ¹ [SUPERSCRIPT ONE] case '\u2081': // â‚ [SUBSCRIPT ONE] case '\u2460': // â‘  [CIRCLED DIGIT ONE] case '\u24F5': // ⓵ [DOUBLE CIRCLED DIGIT ONE] case '\u2776': // â¶ [DINGBAT NEGATIVE CIRCLED DIGIT ONE] case '\u2780': // ➀ [DINGBAT CIRCLED SANS-SERIF DIGIT ONE] case '\u278A': // ➊ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE] case '\uFF11': // 1 [FULLWIDTH DIGIT ONE] output[outputPos++] = '1'; break; case '\u2488': // â’ˆ [DIGIT ONE FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '.'; break; case '\u2474': // â‘´ [PARENTHESIZED DIGIT ONE] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = ')'; break; case '\u00B2': // ² [SUPERSCRIPT TWO] case '\u2082': // â‚‚ [SUBSCRIPT TWO] case '\u2461': // â‘¡ [CIRCLED DIGIT TWO] case '\u24F6': // â“¶ [DOUBLE CIRCLED DIGIT TWO] case '\u2777': // â· [DINGBAT NEGATIVE CIRCLED DIGIT TWO] case '\u2781': // âž [DINGBAT CIRCLED SANS-SERIF DIGIT TWO] case '\u278B': // âž‹ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO] case '\uFF12': // ï¼’ [FULLWIDTH DIGIT TWO] output[outputPos++] = '2'; break; case '\u2489': // â’‰ [DIGIT TWO FULL STOP] output[outputPos++] = '2'; output[outputPos++] = '.'; break; case '\u2475': // ⑵ [PARENTHESIZED DIGIT TWO] output[outputPos++] = '('; output[outputPos++] = '2'; output[outputPos++] = ')'; break; case '\u00B3': // ³ [SUPERSCRIPT THREE] case '\u2083': // ₃ [SUBSCRIPT THREE] case '\u2462': // â‘¢ [CIRCLED DIGIT THREE] case '\u24F7': // â“· [DOUBLE CIRCLED DIGIT THREE] case '\u2778': // ⸠[DINGBAT NEGATIVE CIRCLED DIGIT THREE] case '\u2782': // âž‚ [DINGBAT CIRCLED SANS-SERIF DIGIT THREE] case '\u278C': // ➌ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE] case '\uFF13': // 3 [FULLWIDTH DIGIT THREE] output[outputPos++] = '3'; break; case '\u248A': // â’Š [DIGIT THREE FULL STOP] output[outputPos++] = '3'; output[outputPos++] = '.'; break; case '\u2476': // â‘¶ [PARENTHESIZED DIGIT THREE] output[outputPos++] = '('; output[outputPos++] = '3'; output[outputPos++] = ')'; break; case '\u2074': // â´ [SUPERSCRIPT FOUR] case '\u2084': // â‚„ [SUBSCRIPT FOUR] case '\u2463': // â‘£ [CIRCLED DIGIT FOUR] case '\u24F8': // ⓸ [DOUBLE CIRCLED DIGIT FOUR] case '\u2779': // â¹ [DINGBAT NEGATIVE CIRCLED DIGIT FOUR] case '\u2783': // ➃ [DINGBAT CIRCLED SANS-SERIF DIGIT FOUR] case '\u278D': // âž [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR] case '\uFF14': // ï¼” [FULLWIDTH DIGIT FOUR] output[outputPos++] = '4'; break; case '\u248B': // â’‹ [DIGIT FOUR FULL STOP] output[outputPos++] = 
'4'; output[outputPos++] = '.'; break; case '\u2477': // â‘· [PARENTHESIZED DIGIT FOUR] output[outputPos++] = '('; output[outputPos++] = '4'; output[outputPos++] = ')'; break; case '\u2075': // âµ [SUPERSCRIPT FIVE] case '\u2085': // â‚… [SUBSCRIPT FIVE] case '\u2464': // ⑤ [CIRCLED DIGIT FIVE] case '\u24F9': // ⓹ [DOUBLE CIRCLED DIGIT FIVE] case '\u277A': // ⺠[DINGBAT NEGATIVE CIRCLED DIGIT FIVE] case '\u2784': // âž„ [DINGBAT CIRCLED SANS-SERIF DIGIT FIVE] case '\u278E': // ➎ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE] case '\uFF15': // 5 [FULLWIDTH DIGIT FIVE] output[outputPos++] = '5'; break; case '\u248C': // â’Œ [DIGIT FIVE FULL STOP] output[outputPos++] = '5'; output[outputPos++] = '.'; break; case '\u2478': // ⑸ [PARENTHESIZED DIGIT FIVE] output[outputPos++] = '('; output[outputPos++] = '5'; output[outputPos++] = ')'; break; case '\u2076': // â¶ [SUPERSCRIPT SIX] case '\u2086': // ₆ [SUBSCRIPT SIX] case '\u2465': // â‘¥ [CIRCLED DIGIT SIX] case '\u24FA': // ⓺ [DOUBLE CIRCLED DIGIT SIX] case '\u277B': // â» [DINGBAT NEGATIVE CIRCLED DIGIT SIX] case '\u2785': // âž… [DINGBAT CIRCLED SANS-SERIF DIGIT SIX] case '\u278F': // âž [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX] case '\uFF16': // ï¼– [FULLWIDTH DIGIT SIX] output[outputPos++] = '6'; break; case '\u248D': // â’ [DIGIT SIX FULL STOP] output[outputPos++] = '6'; output[outputPos++] = '.'; break; case '\u2479': // ⑹ [PARENTHESIZED DIGIT SIX] output[outputPos++] = '('; output[outputPos++] = '6'; output[outputPos++] = ')'; break; case '\u2077': // â· [SUPERSCRIPT SEVEN] case '\u2087': // ₇ [SUBSCRIPT SEVEN] case '\u2466': // ⑦ [CIRCLED DIGIT SEVEN] case '\u24FB': // â“» [DOUBLE CIRCLED DIGIT SEVEN] case '\u277C': // â¼ [DINGBAT NEGATIVE CIRCLED DIGIT SEVEN] case '\u2786': // ➆ [DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN] case '\u2790': // âž [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN] case '\uFF17': // ï¼— [FULLWIDTH DIGIT SEVEN] output[outputPos++] = '7'; break; case '\u248E': // â’Ž [DIGIT SEVEN FULL STOP] output[outputPos++] = '7'; output[outputPos++] = '.'; break; case '\u247A': // ⑺ [PARENTHESIZED DIGIT SEVEN] output[outputPos++] = '('; output[outputPos++] = '7'; output[outputPos++] = ')'; break; case '\u2078': // ⸠[SUPERSCRIPT EIGHT] case '\u2088': // ₈ [SUBSCRIPT EIGHT] case '\u2467': // â‘§ [CIRCLED DIGIT EIGHT] case '\u24FC': // ⓼ [DOUBLE CIRCLED DIGIT EIGHT] case '\u277D': // â½ [DINGBAT NEGATIVE CIRCLED DIGIT EIGHT] case '\u2787': // ➇ [DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT] case '\u2791': // âž‘ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT] case '\uFF18': // 8 [FULLWIDTH DIGIT EIGHT] output[outputPos++] = '8'; break; case '\u248F': // â’ [DIGIT EIGHT FULL STOP] output[outputPos++] = '8'; output[outputPos++] = '.'; break; case '\u247B': // â‘» [PARENTHESIZED DIGIT EIGHT] output[outputPos++] = '('; output[outputPos++] = '8'; output[outputPos++] = ')'; break; case '\u2079': // â¹ [SUPERSCRIPT NINE] case '\u2089': // ₉ [SUBSCRIPT NINE] case '\u2468': // ⑨ [CIRCLED DIGIT NINE] case '\u24FD': // ⓽ [DOUBLE CIRCLED DIGIT NINE] case '\u277E': // â¾ [DINGBAT NEGATIVE CIRCLED DIGIT NINE] case '\u2788': // ➈ [DINGBAT CIRCLED SANS-SERIF DIGIT NINE] case '\u2792': // âž’ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE] case '\uFF19': // ï¼™ [FULLWIDTH DIGIT NINE] output[outputPos++] = '9'; break; case '\u2490': // â’ [DIGIT NINE FULL STOP] output[outputPos++] = '9'; output[outputPos++] = '.'; break; case '\u247C': // ⑼ [PARENTHESIZED DIGIT NINE] output[outputPos++] = '('; output[outputPos++] = '9'; output[outputPos++] = 
')'; break; case '\u2469': // â‘© [CIRCLED NUMBER TEN] case '\u24FE': // ⓾ [DOUBLE CIRCLED NUMBER TEN] case '\u277F': // â¿ [DINGBAT NEGATIVE CIRCLED NUMBER TEN] case '\u2789': // ➉ [DINGBAT CIRCLED SANS-SERIF NUMBER TEN] case '\u2793': // âž“ [DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN] output[outputPos++] = '1'; output[outputPos++] = '0'; break; case '\u2491': // â’‘ [NUMBER TEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '0'; output[outputPos++] = '.'; break; case '\u247D': // ⑽ [PARENTHESIZED NUMBER TEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '0'; output[outputPos++] = ')'; break; case '\u246A': // ⑪ [CIRCLED NUMBER ELEVEN] case '\u24EB': // â“« [NEGATIVE CIRCLED NUMBER ELEVEN] output[outputPos++] = '1'; output[outputPos++] = '1'; break; case '\u2492': // â’’ [NUMBER ELEVEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '1'; output[outputPos++] = '.'; break; case '\u247E': // ⑾ [PARENTHESIZED NUMBER ELEVEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '1'; output[outputPos++] = ')'; break; case '\u246B': // â‘« [CIRCLED NUMBER TWELVE] case '\u24EC': // ⓬ [NEGATIVE CIRCLED NUMBER TWELVE] output[outputPos++] = '1'; output[outputPos++] = '2'; break; case '\u2493': // â’“ [NUMBER TWELVE FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '2'; output[outputPos++] = '.'; break; case '\u247F': // â‘¿ [PARENTHESIZED NUMBER TWELVE] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '2'; output[outputPos++] = ')'; break; case '\u246C': // ⑬ [CIRCLED NUMBER THIRTEEN] case '\u24ED': // â“­ [NEGATIVE CIRCLED NUMBER THIRTEEN] output[outputPos++] = '1'; output[outputPos++] = '3'; break; case '\u2494': // â’” [NUMBER THIRTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '3'; output[outputPos++] = '.'; break; case '\u2480': // â’€ [PARENTHESIZED NUMBER THIRTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '3'; output[outputPos++] = ')'; break; case '\u246D': // â‘­ [CIRCLED NUMBER FOURTEEN] case '\u24EE': // â“® [NEGATIVE CIRCLED NUMBER FOURTEEN] output[outputPos++] = '1'; output[outputPos++] = '4'; break; case '\u2495': // â’• [NUMBER FOURTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '4'; output[outputPos++] = '.'; break; case '\u2481': // â’ [PARENTHESIZED NUMBER FOURTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '4'; output[outputPos++] = ')'; break; case '\u246E': // â‘® [CIRCLED NUMBER FIFTEEN] case '\u24EF': // ⓯ [NEGATIVE CIRCLED NUMBER FIFTEEN] output[outputPos++] = '1'; output[outputPos++] = '5'; break; case '\u2496': // â’– [NUMBER FIFTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '5'; output[outputPos++] = '.'; break; case '\u2482': // â’‚ [PARENTHESIZED NUMBER FIFTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '5'; output[outputPos++] = ')'; break; case '\u246F': // ⑯ [CIRCLED NUMBER SIXTEEN] case '\u24F0': // â“° [NEGATIVE CIRCLED NUMBER SIXTEEN] output[outputPos++] = '1'; output[outputPos++] = '6'; break; case '\u2497': // â’— [NUMBER SIXTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '6'; output[outputPos++] = '.'; break; case '\u2483': // â’ƒ [PARENTHESIZED NUMBER SIXTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '6'; output[outputPos++] = ')'; break; case '\u2470': // â‘° [CIRCLED NUMBER SEVENTEEN] case '\u24F1': // ⓱ [NEGATIVE CIRCLED NUMBER SEVENTEEN] 
output[outputPos++] = '1'; output[outputPos++] = '7'; break; case '\u2498': // â’˜ [NUMBER SEVENTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '7'; output[outputPos++] = '.'; break; case '\u2484': // â’„ [PARENTHESIZED NUMBER SEVENTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '7'; output[outputPos++] = ')'; break; case '\u2471': // ⑱ [CIRCLED NUMBER EIGHTEEN] case '\u24F2': // ⓲ [NEGATIVE CIRCLED NUMBER EIGHTEEN] output[outputPos++] = '1'; output[outputPos++] = '8'; break; case '\u2499': // â’™ [NUMBER EIGHTEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '8'; output[outputPos++] = '.'; break; case '\u2485': // â’… [PARENTHESIZED NUMBER EIGHTEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '8'; output[outputPos++] = ')'; break; case '\u2472': // ⑲ [CIRCLED NUMBER NINETEEN] case '\u24F3': // ⓳ [NEGATIVE CIRCLED NUMBER NINETEEN] output[outputPos++] = '1'; output[outputPos++] = '9'; break; case '\u249A': // â’š [NUMBER NINETEEN FULL STOP] output[outputPos++] = '1'; output[outputPos++] = '9'; output[outputPos++] = '.'; break; case '\u2486': // â’† [PARENTHESIZED NUMBER NINETEEN] output[outputPos++] = '('; output[outputPos++] = '1'; output[outputPos++] = '9'; output[outputPos++] = ')'; break; case '\u2473': // ⑳ [CIRCLED NUMBER TWENTY] case '\u24F4': // â“´ [NEGATIVE CIRCLED NUMBER TWENTY] output[outputPos++] = '2'; output[outputPos++] = '0'; break; case '\u249B': // â’› [NUMBER TWENTY FULL STOP] output[outputPos++] = '2'; output[outputPos++] = '0'; output[outputPos++] = '.'; break; case '\u2487': // â’‡ [PARENTHESIZED NUMBER TWENTY] output[outputPos++] = '('; output[outputPos++] = '2'; output[outputPos++] = '0'; output[outputPos++] = ')'; break; case '\u00AB': // « [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK] case '\u00BB': // » [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK] case '\u201C': // “ [LEFT DOUBLE QUOTATION MARK] case '\u201D': // †[RIGHT DOUBLE QUOTATION MARK] case '\u201E': // „ [DOUBLE LOW-9 QUOTATION MARK] case '\u2033': // ″ [DOUBLE PRIME] case '\u2036': // ‶ [REVERSED DOUBLE PRIME] case '\u275D': // â [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT] case '\u275E': // âž [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT] case '\u276E': // â® [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT] case '\u276F': // ⯠[HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT] case '\uFF02': // " [FULLWIDTH QUOTATION MARK] output[outputPos++] = '"'; break; case '\u2018': // ‘ [LEFT SINGLE QUOTATION MARK] case '\u2019': // ’ [RIGHT SINGLE QUOTATION MARK] case '\u201A': // ‚ [SINGLE LOW-9 QUOTATION MARK] case '\u201B': // ‛ [SINGLE HIGH-REVERSED-9 QUOTATION MARK] case '\u2032': // ′ [PRIME] case '\u2035': // ‵ [REVERSED PRIME] case '\u2039': // ‹ [SINGLE LEFT-POINTING ANGLE QUOTATION MARK] case '\u203A': // › [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK] case '\u275B': // â› [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT] case '\u275C': // ✠[HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT] case '\uFF07': // ' [FULLWIDTH APOSTROPHE] output[outputPos++] = '\''; break; case '\u2010': // †[HYPHEN] case '\u2011': // ‑ [NON-BREAKING HYPHEN] case '\u2012': // ‒ [FIGURE DASH] case '\u2013': // – [EN DASH] case '\u2014': // — [EM DASH] case '\u207B': // â» [SUPERSCRIPT MINUS] case '\u208B': // â‚‹ [SUBSCRIPT MINUS] case '\uFF0D': // ï¼ [FULLWIDTH HYPHEN-MINUS] output[outputPos++] = '-'; break; case '\u2045': // â… [LEFT SQUARE BRACKET WITH QUILL] case '\u2772': // â² [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT] 
case '\uFF3B': // ï¼» [FULLWIDTH LEFT SQUARE BRACKET] output[outputPos++] = '['; break; case '\u2046': // ↠[RIGHT SQUARE BRACKET WITH QUILL] case '\u2773': // â³ [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT] case '\uFF3D': // ï¼½ [FULLWIDTH RIGHT SQUARE BRACKET] output[outputPos++] = ']'; break; case '\u207D': // â½ [SUPERSCRIPT LEFT PARENTHESIS] case '\u208D': // â‚ [SUBSCRIPT LEFT PARENTHESIS] case '\u2768': // ⨠[MEDIUM LEFT PARENTHESIS ORNAMENT] case '\u276A': // ⪠[MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT] case '\uFF08': // ( [FULLWIDTH LEFT PARENTHESIS] output[outputPos++] = '('; break; case '\u2E28': // ⸨ [LEFT DOUBLE PARENTHESIS] output[outputPos++] = '('; output[outputPos++] = '('; break; case '\u207E': // â¾ [SUPERSCRIPT RIGHT PARENTHESIS] case '\u208E': // ₎ [SUBSCRIPT RIGHT PARENTHESIS] case '\u2769': // â© [MEDIUM RIGHT PARENTHESIS ORNAMENT] case '\u276B': // â« [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT] case '\uFF09': // ) [FULLWIDTH RIGHT PARENTHESIS] output[outputPos++] = ')'; break; case '\u2E29': // ⸩ [RIGHT DOUBLE PARENTHESIS] output[outputPos++] = ')'; output[outputPos++] = ')'; break; case '\u276C': // ⬠[MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT] case '\u2770': // â° [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT] case '\uFF1C': // < [FULLWIDTH LESS-THAN SIGN] output[outputPos++] = '<'; break; case '\u276D': // â­ [MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT] case '\u2771': // â± [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT] case '\uFF1E': // > [FULLWIDTH GREATER-THAN SIGN] output[outputPos++] = '>'; break; case '\u2774': // â´ [MEDIUM LEFT CURLY BRACKET ORNAMENT] case '\uFF5B': // ï½› [FULLWIDTH LEFT CURLY BRACKET] output[outputPos++] = '{'; break; case '\u2775': // âµ [MEDIUM RIGHT CURLY BRACKET ORNAMENT] case '\uFF5D': // ï½ [FULLWIDTH RIGHT CURLY BRACKET] output[outputPos++] = '}'; break; case '\u207A': // ⺠[SUPERSCRIPT PLUS SIGN] case '\u208A': // ₊ [SUBSCRIPT PLUS SIGN] case '\uFF0B': // + [FULLWIDTH PLUS SIGN] output[outputPos++] = '+'; break; case '\u207C': // â¼ [SUPERSCRIPT EQUALS SIGN] case '\u208C': // ₌ [SUBSCRIPT EQUALS SIGN] case '\uFF1D': // ï¼ [FULLWIDTH EQUALS SIGN] output[outputPos++] = '='; break; case '\uFF01': // ï¼ [FULLWIDTH EXCLAMATION MARK] output[outputPos++] = '!'; break; case '\u203C': // ‼ [DOUBLE EXCLAMATION MARK] output[outputPos++] = '!'; output[outputPos++] = '!'; break; case '\u2049': // ≠[EXCLAMATION QUESTION MARK] output[outputPos++] = '!'; output[outputPos++] = '?'; break; case '\uFF03': // # [FULLWIDTH NUMBER SIGN] output[outputPos++] = '#'; break; case '\uFF04': // $ [FULLWIDTH DOLLAR SIGN] output[outputPos++] = '$'; break; case '\u2052': // â’ [COMMERCIAL MINUS SIGN] case '\uFF05': // ï¼… [FULLWIDTH PERCENT SIGN] output[outputPos++] = '%'; break; case '\uFF06': // & [FULLWIDTH AMPERSAND] output[outputPos++] = '&'; break; case '\u204E': // ⎠[LOW ASTERISK] case '\uFF0A': // * [FULLWIDTH ASTERISK] output[outputPos++] = '*'; break; case '\uFF0C': // , [FULLWIDTH COMMA] output[outputPos++] = ','; break; case '\uFF0E': // . [FULLWIDTH FULL STOP] output[outputPos++] = '.'; break; case '\u2044': // â„ [FRACTION SLASH] case '\uFF0F': // ï¼ [FULLWIDTH SOLIDUS] output[outputPos++] = '/'; break; case '\uFF1A': // : [FULLWIDTH COLON] output[outputPos++] = ':'; break; case '\u204F': // â [REVERSED SEMICOLON] case '\uFF1B': // ï¼› [FULLWIDTH SEMICOLON] output[outputPos++] = ';'; break; case '\uFF1F': // ? 
[FULLWIDTH QUESTION MARK] output[outputPos++] = '?'; break; case '\u2047': // ⇠[DOUBLE QUESTION MARK] output[outputPos++] = '?'; output[outputPos++] = '?'; break; case '\u2048': // ∠[QUESTION EXCLAMATION MARK] output[outputPos++] = '?'; output[outputPos++] = '!'; break; case '\uFF20': // ï¼  [FULLWIDTH COMMERCIAL AT] output[outputPos++] = '@'; break; case '\uFF3C': // ï¼¼ [FULLWIDTH REVERSE SOLIDUS] output[outputPos++] = '\\'; break; case '\u2038': // ‸ [CARET] case '\uFF3E': // ï¼¾ [FULLWIDTH CIRCUMFLEX ACCENT] output[outputPos++] = '^'; break; case '\uFF3F': // _ [FULLWIDTH LOW LINE] output[outputPos++] = '_'; break; case '\u2053': // â“ [SWUNG DASH] case '\uFF5E': // ~ [FULLWIDTH TILDE] output[outputPos++] = '~'; break; default: output[outputPos++] = c; break; } } } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/PorterStemmer.java0000644000175000017500000003364311474320222025752 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Porter stemmer in Java. The original paper is in Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, pp 130-137, See also http://www.tartarus.org/~martin/PorterStemmer/index.html Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] is then out outside the bounds of b. Similarly, Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and b[j] is then outside the bounds of b. Release 3. [ This version is derived from Release 3, modified by Brian Goetz to optimize for fewer object creations. ] */ import java.io.*; /** * * Stemmer, implementing the Porter Stemming Algorithm * * The Stemmer class transforms a word into its root form. The input * word can be provided a character at time (by calling add()), or at once * by calling one of the various stem(something) methods. */ class PorterStemmer { private char[] b; private int i, /* offset into b */ j, k, k0; private boolean dirty = false; private static final int INC = 50; /* unit of size whereby b is increased */ private static final int EXTRA = 1; public PorterStemmer() { b = new char[INC]; i = 0; } /** * reset() resets the stemmer so it can stem another word. If you invoke * the stemmer by calling add(char) and then stem(), you must call reset() * before starting another word. */ public void reset() { i = 0; dirty = false; } /** * Add a character to the word being stemmed. When you are finished * adding characters, you can call stem(void) to process the word. 
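    * As an illustrative sketch only (the sample word is arbitrary and not part of this class),
    * the character-at-a-time protocol looks roughly like:
    *
    *   String word = "meetings";
    *   PorterStemmer stemmer = new PorterStemmer();
    *   for (int i = 0; i < word.length(); i++) {
    *     stemmer.add(word.charAt(i));        // feed the word one character at a time
    *   }
    *   stemmer.stem();                       // run the Porter algorithm over the buffered word
    *   String root = stemmer.toString();     // "meetings" -> "meet", per the step1() examples below
    *   stemmer.reset();                      // required before stemming another word via add()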
*/ public void add(char ch) { if (b.length <= i + EXTRA) { char[] new_b = new char[b.length+INC]; System.arraycopy(b, 0, new_b, 0, b.length); b = new_b; } b[i++] = ch; } /** * After a word has been stemmed, it can be retrieved by toString(), * or a reference to the internal buffer can be retrieved by getResultBuffer * and getResultLength (which is generally more efficient.) */ public String toString() { return new String(b,0,i); } /** * Returns the length of the word resulting from the stemming process. */ public int getResultLength() { return i; } /** * Returns a reference to a character buffer containing the results of * the stemming process. You also need to consult getResultLength() * to determine the length of the result. */ public char[] getResultBuffer() { return b; } /* cons(i) is true <=> b[i] is a consonant. */ private final boolean cons(int i) { switch (b[i]) { case 'a': case 'e': case 'i': case 'o': case 'u': return false; case 'y': return (i==k0) ? true : !cons(i-1); default: return true; } } /* m() measures the number of consonant sequences between k0 and j. if c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence, gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 .... */ private final int m() { int n = 0; int i = k0; while(true) { if (i > j) return n; if (! cons(i)) break; i++; } i++; while(true) { while(true) { if (i > j) return n; if (cons(i)) break; i++; } i++; n++; while(true) { if (i > j) return n; if (! cons(i)) break; i++; } i++; } } /* vowelinstem() is true <=> k0,...j contains a vowel */ private final boolean vowelinstem() { int i; for (i = k0; i <= j; i++) if (! cons(i)) return true; return false; } /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ private final boolean doublec(int j) { if (j < k0+1) return false; if (b[j] != b[j-1]) return false; return cons(j); } /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant and also if the second c is not w,x or y. this is used when trying to restore an e at the end of a short word. e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray. */ private final boolean cvc(int i) { if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; else { int ch = b[i]; if (ch == 'w' || ch == 'x' || ch == 'y') return false; } return true; } private final boolean ends(String s) { int l = s.length(); int o = k-l+1; if (o < k0) return false; for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; j = k-l; return true; } /* setto(s) sets (j+1),...k to the characters in the string s, readjusting k. */ void setto(String s) { int l = s.length(); int o = j+1; for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); k = j+l; dirty = true; } /* r(s) is used further down. */ void r(String s) { if (m() > 0) setto(s); } /* step1() gets rid of plurals and -ed or -ing. e.g. 
caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat feed -> feed agreed -> agree disabled -> disable matting -> mat mating -> mate meeting -> meet milling -> mill messing -> mess meetings -> meet */ private final void step1() { if (b[k] == 's') { if (ends("sses")) k -= 2; else if (ends("ies")) setto("i"); else if (b[k-1] != 's') k--; } if (ends("eed")) { if (m() > 0) k--; } else if ((ends("ed") || ends("ing")) && vowelinstem()) { k = j; if (ends("at")) setto("ate"); else if (ends("bl")) setto("ble"); else if (ends("iz")) setto("ize"); else if (doublec(k)) { int ch = b[k--]; if (ch == 'l' || ch == 's' || ch == 'z') k++; } else if (m() == 1 && cvc(k)) setto("e"); } } /* step2() turns terminal y to i when there is another vowel in the stem. */ private final void step2() { if (ends("y") && vowelinstem()) { b[k] = 'i'; dirty = true; } } /* step3() maps double suffices to single ones. so -ization ( = -ize plus -ation) maps to -ize etc. note that the string before the suffix must give m() > 0. */ private final void step3() { if (k == k0) return; /* For Bug 1 */ switch (b[k-1]) { case 'a': if (ends("ational")) { r("ate"); break; } if (ends("tional")) { r("tion"); break; } break; case 'c': if (ends("enci")) { r("ence"); break; } if (ends("anci")) { r("ance"); break; } break; case 'e': if (ends("izer")) { r("ize"); break; } break; case 'l': if (ends("bli")) { r("ble"); break; } if (ends("alli")) { r("al"); break; } if (ends("entli")) { r("ent"); break; } if (ends("eli")) { r("e"); break; } if (ends("ousli")) { r("ous"); break; } break; case 'o': if (ends("ization")) { r("ize"); break; } if (ends("ation")) { r("ate"); break; } if (ends("ator")) { r("ate"); break; } break; case 's': if (ends("alism")) { r("al"); break; } if (ends("iveness")) { r("ive"); break; } if (ends("fulness")) { r("ful"); break; } if (ends("ousness")) { r("ous"); break; } break; case 't': if (ends("aliti")) { r("al"); break; } if (ends("iviti")) { r("ive"); break; } if (ends("biliti")) { r("ble"); break; } break; case 'g': if (ends("logi")) { r("log"); break; } } } /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ private final void step4() { switch (b[k]) { case 'e': if (ends("icate")) { r("ic"); break; } if (ends("ative")) { r(""); break; } if (ends("alize")) { r("al"); break; } break; case 'i': if (ends("iciti")) { r("ic"); break; } break; case 'l': if (ends("ical")) { r("ic"); break; } if (ends("ful")) { r(""); break; } break; case 's': if (ends("ness")) { r(""); break; } break; } } /* step5() takes off -ant, -ence etc., in context vcvc. */ private final void step5() { if (k == k0) return; /* for Bug 1 */ switch (b[k-1]) { case 'a': if (ends("al")) break; return; case 'c': if (ends("ance")) break; if (ends("ence")) break; return; case 'e': if (ends("er")) break; return; case 'i': if (ends("ic")) break; return; case 'l': if (ends("able")) break; if (ends("ible")) break; return; case 'n': if (ends("ant")) break; if (ends("ement")) break; if (ends("ment")) break; /* element etc. 
not stripped before the m */ if (ends("ent")) break; return; case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; /* j >= 0 fixes Bug 2 */ if (ends("ou")) break; return; /* takes care of -ous */ case 's': if (ends("ism")) break; return; case 't': if (ends("ate")) break; if (ends("iti")) break; return; case 'u': if (ends("ous")) break; return; case 'v': if (ends("ive")) break; return; case 'z': if (ends("ize")) break; return; default: return; } if (m() > 1) k = j; } /* step6() removes a final -e if m() > 1. */ private final void step6() { j = k; if (b[k] == 'e') { int a = m(); if (a > 1 || a == 1 && !cvc(k-1)) k--; } if (b[k] == 'l' && doublec(k) && m() > 1) k--; } /** * Stem a word provided as a String. Returns the result as a String. */ public String stem(String s) { if (stem(s.toCharArray(), s.length())) return toString(); else return s; } /** Stem a word contained in a char[]. Returns true if the stemming process * resulted in a word different from the input. You can retrieve the * result with getResultLength()/getResultBuffer() or toString(). */ public boolean stem(char[] word) { return stem(word, word.length); } /** Stem a word contained in a portion of a char[] array. Returns * true if the stemming process resulted in a word different from * the input. You can retrieve the result with * getResultLength()/getResultBuffer() or toString(). */ public boolean stem(char[] wordBuffer, int offset, int wordLen) { reset(); if (b.length < wordLen) { char[] new_b = new char[wordLen + EXTRA]; b = new_b; } System.arraycopy(wordBuffer, offset, b, 0, wordLen); i = wordLen; return stem(0); } /** Stem a word contained in a leading portion of a char[] array. * Returns true if the stemming process resulted in a word different * from the input. You can retrieve the result with * getResultLength()/getResultBuffer() or toString(). */ public boolean stem(char[] word, int wordLen) { return stem(word, 0, wordLen); } /** Stem the word placed into the Stemmer buffer through calls to add(). * Returns true if the stemming process resulted in a word different * from the input. You can retrieve the result with * getResultLength()/getResultBuffer() or toString(). */ public boolean stem() { return stem(0); } public boolean stem(int i0) { k = i - 1; k0 = i0; if (k > k0+1) { step1(); step2(); step3(); step4(); step5(); step6(); } // Also, a word is considered dirty if we lopped off letters // Thanks to Ifigenia Vairelles for pointing this out. if (i != k+1) dirty = true; i = k+1; return dirty; } /** Test program for demonstrating the Stemmer. It reads a file and * stems each word, writing the result to standard out. 
* Usage: Stemmer file-name */ public static void main(String[] args) { PorterStemmer s = new PorterStemmer(); for (int i = 0; i < args.length; i++) { try { InputStream in = new FileInputStream(args[i]); byte[] buffer = new byte[1024]; int bufferLen, offset, ch; bufferLen = in.read(buffer); offset = 0; s.reset(); while(true) { if (offset < bufferLen) ch = buffer[offset++]; else { bufferLen = in.read(buffer); offset = 0; if (bufferLen < 0) ch = -1; else ch = buffer[offset++]; } if (Character.isLetter((char) ch)) { s.add(Character.toLowerCase((char) ch)); } else { s.stem(); System.out.print(s.toString()); s.reset(); if (ch < 0) break; else { System.out.print((char) ch); } } } in.close(); } catch (IOException e) { System.out.println("error reading " + args[i]); } } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/SinkTokenizer.java0000644000175000017500000000722311474320222025734 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; /** * A SinkTokenizer can be used to cache Tokens for use in an Analyzer *
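    * A rough usage sketch of the tee/sink pattern (the reader variable and the choice of
    * WhitespaceTokenizer are placeholders, not prescribed by this class):
    *
    *   SinkTokenizer sink = new SinkTokenizer();
    *   TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(reader), sink);
    *   // index one field from "source"; every token it produces is also cached in the sink
    *   // ... consume "source" ...
    *   sink.reset();                         // make the cached tokens available for replay
    *   TokenStream replay = sink;            // re-use the cached tokens for another field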

    * WARNING: {@link TeeTokenFilter} and {@link SinkTokenizer} only work with the old TokenStream API. * If you switch to the new API, you need to use {@link TeeSinkTokenFilter} instead, which offers * the same functionality. * @see TeeTokenFilter * @deprecated Use {@link TeeSinkTokenFilter} instead * **/ public class SinkTokenizer extends Tokenizer { protected List/**/ lst = new ArrayList/**/(); protected Iterator/**/ iter; public SinkTokenizer(List/**/ input) { this.lst = input; if (this.lst == null) this.lst = new ArrayList/**/(); } public SinkTokenizer() { this.lst = new ArrayList/**/(); } public SinkTokenizer(int initCap){ this.lst = new ArrayList/**/(initCap); } /** * Get the tokens in the internal List. *

    * WARNING: Adding tokens to this list requires the {@link #reset()} method to be called in order for them * to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s * in the case of adds happening while {@link #next(org.apache.lucene.analysis.Token)} is being called. *

    * WARNING: Since this SinkTokenizer can be reset and the cached tokens made available again, do not modify them. Modify clones instead. * * @return A List of {@link org.apache.lucene.analysis.Token}s */ public List/**/ getTokens() { return lst; } /** * Returns the next token out of the list of cached tokens * @return The next {@link org.apache.lucene.analysis.Token} in the Sink. * @throws IOException */ public Token next(final Token reusableToken) throws IOException { assert reusableToken != null; if (iter == null) iter = lst.iterator(); // Since this TokenStream can be reset we have to maintain the tokens as immutable if (iter.hasNext()) { Token nextToken = (Token) iter.next(); return (Token) nextToken.clone(); } return null; } /** * Override this method to cache only certain tokens, or new tokens based * on the old tokens. * * @param t The {@link org.apache.lucene.analysis.Token} to add to the sink */ public void add(Token t) { if (t == null) return; lst.add((Token) t.clone()); } public void close() throws IOException { //nothing to close input = null; lst = null; } /** * Reset the internal data structures to the start at the front of the list of tokens. Should be called * if tokens were added to the list after an invocation of {@link #next(Token)} * @throws IOException */ public void reset() throws IOException { iter = lst.iterator(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/0000755000175000017500000000000011554106562025525 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttributeImpl.java0000644000175000017500000000430511474320222032124 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.util.AttributeImpl; /** * This attribute can be used to pass different flags down the tokenizer chain, * eg from one TokenFilter to another one. */ public class FlagsAttributeImpl extends AttributeImpl implements FlagsAttribute, Cloneable, Serializable { private int flags = 0; /** * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. *

    * * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes. * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. * * * @return The bits */ public int getFlags() { return flags; } /** * @see #getFlags() */ public void setFlags(int flags) { this.flags = flags; } public void clear() { flags = 0; } public boolean equals(Object other) { if (this == other) { return true; } if (other instanceof FlagsAttributeImpl) { return ((FlagsAttributeImpl) other).flags == flags; } return false; } public int hashCode() { return flags; } public void copyTo(AttributeImpl target) { FlagsAttribute t = (FlagsAttribute) target; t.setFlags(flags); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java0000644000175000017500000000725511474320222031164 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.Attribute; /** * The term text of a Token. */ public interface TermAttribute extends Attribute { /** Returns the Token's term text. * * This method has a performance penalty * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a * String, use this method, which is nothing more than * a convenience call to new String(token.termBuffer(), 0, token.termLength()) */ public String term(); /** Copies the contents of buffer, starting at offset for * length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public void setTermBuffer(char[] buffer, int offset, int length); /** Copies the contents of buffer into the termBuffer array. * @param buffer the buffer to copy */ public void setTermBuffer(String buffer); /** Copies the contents of buffer, starting at offset and continuing * for length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public void setTermBuffer(String buffer, int offset, int length); /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link * #resizeTermBuffer(int)} to increase it. After * altering the buffer be sure to call {@link * #setTermLength} to record the number of valid * characters that were placed into the termBuffer. 
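    * For example, a filter that upper-cases the term in place might do roughly the following
    * (a sketch; "termAtt" is assumed to be a TermAttribute obtained from the stream):
    *
    *   char[] buffer = termAtt.termBuffer();
    *   int length = termAtt.termLength();
    *   for (int i = 0; i < length; i++) {
    *     buffer[i] = Character.toUpperCase(buffer[i]);   // alter the buffer directly
    *   }
    *   termAtt.setTermLength(length);                     // record the number of valid characters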
*/ public char[] termBuffer(); /** Grows the termBuffer to at least size newSize, preserving the * existing content. Note: If the next operation is to change * the contents of the term buffer use * {@link #setTermBuffer(char[], int, int)}, * {@link #setTermBuffer(String)}, or * {@link #setTermBuffer(String, int, int)} * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize); /** Return number of valid characters (length of the term) * in the termBuffer array. */ public int termLength(); /** Set number of valid characters (length of the term) in * the termBuffer array. Use this to truncate the termBuffer * or to synchronize with external manipulation of the termBuffer. * Note: to grow the size of the array, * use {@link #resizeTermBuffer(int)} first. * @param length the truncated length */ public void setTermLength(int length); } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/TermAttributeImpl.java0000644000175000017500000001655011474320222032004 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.AttributeImpl; /** * The term text of a Token. */ public class TermAttributeImpl extends AttributeImpl implements TermAttribute, Cloneable, Serializable { private static int MIN_BUFFER_SIZE = 10; private char[] termBuffer; private int termLength; /** Returns the Token's term text. * * This method has a performance penalty * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a * String, use this method, which is nothing more than * a convenience call to new String(token.termBuffer(), 0, token.termLength()) */ public String term() { initTermBuffer(); return new String(termBuffer, 0, termLength); } /** Copies the contents of buffer, starting at offset for * length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public void setTermBuffer(char[] buffer, int offset, int length) { growTermBuffer(length); System.arraycopy(buffer, offset, termBuffer, 0, length); termLength = length; } /** Copies the contents of buffer into the termBuffer array. 
* @param buffer the buffer to copy */ public void setTermBuffer(String buffer) { int length = buffer.length(); growTermBuffer(length); buffer.getChars(0, length, termBuffer, 0); termLength = length; } /** Copies the contents of buffer, starting at offset and continuing * for length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public void setTermBuffer(String buffer, int offset, int length) { assert offset <= buffer.length(); assert offset + length <= buffer.length(); growTermBuffer(length); buffer.getChars(offset, offset + length, termBuffer, 0); termLength = length; } /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link * #resizeTermBuffer(int)} to increase it. After * altering the buffer be sure to call {@link * #setTermLength} to record the number of valid * characters that were placed into the termBuffer. */ public char[] termBuffer() { initTermBuffer(); return termBuffer; } /** Grows the termBuffer to at least size newSize, preserving the * existing content. Note: If the next operation is to change * the contents of the term buffer use * {@link #setTermBuffer(char[], int, int)}, * {@link #setTermBuffer(String)}, or * {@link #setTermBuffer(String, int, int)} * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize) { if (termBuffer == null) { // The buffer is always at least MIN_BUFFER_SIZE termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)]; } else { if(termBuffer.length < newSize){ // Not big enough; create a new array with slight // over allocation and preserve content final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)]; System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); termBuffer = newCharBuffer; } } return termBuffer; } /** Allocates a buffer char[] of at least newSize, without preserving the existing content. * its always used in places that set the content * @param newSize minimum size of the buffer */ private void growTermBuffer(int newSize) { if (termBuffer == null) { // The buffer is always at least MIN_BUFFER_SIZE termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)]; } else { if(termBuffer.length < newSize){ // Not big enough; create a new array with slight // over allocation: termBuffer = new char[ArrayUtil.getNextSize(newSize)]; } } } private void initTermBuffer() { if (termBuffer == null) { termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)]; termLength = 0; } } /** Return number of valid characters (length of the term) * in the termBuffer array. */ public int termLength() { return termLength; } /** Set number of valid characters (length of the term) in * the termBuffer array. Use this to truncate the termBuffer * or to synchronize with external manipulation of the termBuffer. * Note: to grow the size of the array, * use {@link #resizeTermBuffer(int)} first. 
* @param length the truncated length */ public void setTermLength(int length) { initTermBuffer(); if (length > termBuffer.length) throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); termLength = length; } public int hashCode() { initTermBuffer(); int code = termLength; code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength); return code; } public void clear() { termLength = 0; } public Object clone() { TermAttributeImpl t = (TermAttributeImpl)super.clone(); // Do a deep clone if (termBuffer != null) { t.termBuffer = new char[this.termLength]; System.arraycopy(this.termBuffer, 0, t.termBuffer, 0, this.termLength); } return t; } public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof TermAttribute) { initTermBuffer(); TermAttributeImpl o = ((TermAttributeImpl) other); o.initTermBuffer(); if (termLength != o.termLength) return false; for(int i=0;iThe default value is one. * *

    Some common uses for this are:

      * *
    • Set it to zero to put multiple terms in the same position. This is * useful if, e.g., a word has multiple stems. Searches for phrases * including either stem will match. In this case, all but the first stem's * increment should be set to zero: the increment of the first instance * should be one. Repeating a token with an increment of zero can also be * used to boost the scores of matches on that token. * *
    • Set it to values greater than one to inhibit exact phrase matches. * If, for example, one does not want phrases to match across removed stop * words, then one could build a stop word filter that removes stop words and * also sets the increment to the number of stop words removed before each * non-stop word. Then exact phrase queries will only match when the terms * occur with no intervening stop words. * *
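    * As a sketch of the stop-word case above (the isStopWord() test and the enclosing filter are
    * hypothetical, and the cast-based addAttribute call mirrors the 2.9-era AttributeSource API):
    *
    *   PositionIncrementAttribute posIncrAtt =
    *       (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    *   ...
    *   int skippedGap = 0;
    *   while (input.incrementToken()) {
    *     if (isStopWord()) {
    *       skippedGap += posIncrAtt.getPositionIncrement();   // drop the token, remember the gap
    *       continue;
    *     }
    *     posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedGap);
    *     return true;                                         // emit the kept token with the gap
    *   }
    *   return false;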
    * * @see org.apache.lucene.index.TermPositions */ public interface PositionIncrementAttribute extends Attribute { /** Set the position increment. The default value is one. * * @param positionIncrement the distance from the prior term */ public void setPositionIncrement(int positionIncrement); /** Returns the position increment of this Token. * @see #setPositionIncrement */ public int getPositionIncrement(); } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java0000644000175000017500000000225111474320222031165 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.Attribute; /** * A Token's lexical type. The Default value is "word". */ public interface TypeAttribute extends Attribute { /** Returns this Token's lexical type. Defaults to "word". */ public String type(); /** Set the lexical type. @see #type() */ public void setType(String type); } ././@LongLink0000000000000000000000000000014500000000000011565 Lustar rootrootlucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.javalucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttributeImpl.java0000644000175000017500000000645711474320222034553 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.AttributeImpl; /** The positionIncrement determines the position of this token * relative to the previous Token in a {@link TokenStream}, used in phrase * searching. * *

    The default value is one. * *

    Some common uses for this are:

      * *
    • Set it to zero to put multiple terms in the same position. This is * useful if, e.g., a word has multiple stems. Searches for phrases * including either stem will match. In this case, all but the first stem's * increment should be set to zero: the increment of the first instance * should be one. Repeating a token with an increment of zero can also be * used to boost the scores of matches on that token. * *
    • Set it to values greater than one to inhibit exact phrase matches. * If, for example, one does not want phrases to match across removed stop * words, then one could build a stop word filter that removes stop words and * also sets the increment to the number of stop words removed before each * non-stop word. Then exact phrase queries will only match when the terms * occur with no intervening stop words. * *
    */ public class PositionIncrementAttributeImpl extends AttributeImpl implements PositionIncrementAttribute, Cloneable, Serializable { private int positionIncrement = 1; /** Set the position increment. The default value is one. * * @param positionIncrement the distance from the prior term */ public void setPositionIncrement(int positionIncrement) { if (positionIncrement < 0) throw new IllegalArgumentException ("Increment must be zero or greater: " + positionIncrement); this.positionIncrement = positionIncrement; } /** Returns the position increment of this Token. * @see #setPositionIncrement */ public int getPositionIncrement() { return positionIncrement; } public void clear() { this.positionIncrement = 1; } public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof PositionIncrementAttributeImpl) { return positionIncrement == ((PositionIncrementAttributeImpl) other).positionIncrement; } return false; } public int hashCode() { return positionIncrement; } public void copyTo(AttributeImpl target) { PositionIncrementAttribute t = (PositionIncrementAttribute) target; t.setPositionIncrement(positionIncrement); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java0000644000175000017500000000321711474320222031303 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.Attribute; /** * This attribute can be used to pass different flags down the {@link Tokenizer} chain, * eg from one TokenFilter to another one. */ public interface FlagsAttribute extends Attribute { /** * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. *

    * * Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes. * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. * * * @return The bits */ public int getFlags(); /** * @see #getFlags() */ public void setFlags(int flags); } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/TypeAttributeImpl.java0000644000175000017500000000373511474320222032017 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.util.AttributeImpl; /** * A Token's lexical type. The Default value is "word". */ public class TypeAttributeImpl extends AttributeImpl implements TypeAttribute, Cloneable, Serializable { private String type; public static final String DEFAULT_TYPE = "word"; public TypeAttributeImpl() { this(DEFAULT_TYPE); } public TypeAttributeImpl(String type) { this.type = type; } /** Returns this Token's lexical type. Defaults to "word". */ public String type() { return type; } /** Set the lexical type. @see #type() */ public void setType(String type) { this.type = type; } public void clear() { type = DEFAULT_TYPE; } public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof TypeAttributeImpl) { return type.equals(((TypeAttributeImpl) other).type); } return false; } public int hashCode() { return type.hashCode(); } public void copyTo(AttributeImpl target) { TypeAttribute t = (TypeAttribute) target; t.setType(type); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttributeImpl.java0000644000175000017500000000501511474320222032460 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.index.Payload; import org.apache.lucene.util.AttributeImpl; /** * The payload of a Token. 
See also {@link Payload}. */ public class PayloadAttributeImpl extends AttributeImpl implements PayloadAttribute, Cloneable, Serializable { private Payload payload; /** * Initialize this attribute with no payload. */ public PayloadAttributeImpl() {} /** * Initialize this attribute with the given payload. */ public PayloadAttributeImpl(Payload payload) { this.payload = payload; } /** * Returns this Token's payload. */ public Payload getPayload() { return this.payload; } /** * Sets this Token's payload. */ public void setPayload(Payload payload) { this.payload = payload; } public void clear() { payload = null; } public Object clone() { PayloadAttributeImpl clone = (PayloadAttributeImpl) super.clone(); if (payload != null) { clone.payload = (Payload) payload.clone(); } return clone; } public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof PayloadAttribute) { PayloadAttributeImpl o = (PayloadAttributeImpl) other; if (o.payload == null || payload == null) { return o.payload == null && payload == null; } return o.payload.equals(payload); } return false; } public int hashCode() { return (payload == null) ? 0 : payload.hashCode(); } public void copyTo(AttributeImpl target) { PayloadAttribute t = (PayloadAttribute) target; t.setPayload((payload == null) ? null : (Payload) payload.clone()); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java0000644000175000017500000000333311474320222031474 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.Attribute; /** * The start and end character offset of a Token. */ public interface OffsetAttribute extends Attribute { /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ public int startOffset(); /** Set the starting and ending offset. @see #startOffset() and #endOffset()*/ public void setOffset(int startOffset, int endOffset); /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. The length of the token in the source text is (endOffset - startOffset). */ public int endOffset(); } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java0000644000175000017500000000231011474320222031631 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.Payload; import org.apache.lucene.util.Attribute; /** * The payload of a Token. See also {@link Payload}. */ public interface PayloadAttribute extends Attribute { /** * Returns this Token's payload. */ public Payload getPayload(); /** * Sets this Token's payload. */ public void setPayload(Payload payload); } lucene-2.9.4/src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttributeImpl.java0000644000175000017500000000514611474320222032322 0ustar janpascaljanpascalpackage org.apache.lucene.analysis.tokenattributes; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import org.apache.lucene.util.AttributeImpl; /** * The start and end character offset of a Token. */ public class OffsetAttributeImpl extends AttributeImpl implements OffsetAttribute, Cloneable, Serializable { private int startOffset; private int endOffset; /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ public int startOffset() { return startOffset; } /** Set the starting and ending offset. @see #startOffset() and #endOffset()*/ public void setOffset(int startOffset, int endOffset) { this.startOffset = startOffset; this.endOffset = endOffset; } /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. The length of the token in the source text is (endOffset - startOffset). 
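      For example (a sketch; "fieldText" is a hypothetical String holding the analyzed source text
      and "offsetAtt" an OffsetAttribute obtained from the stream):

        String slice = fieldText.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        int lengthInSource = offsetAtt.endOffset() - offsetAtt.startOffset();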
*/ public int endOffset() { return endOffset; } public void clear() { startOffset = 0; endOffset = 0; } public boolean equals(Object other) { if (other == this) { return true; } if (other instanceof OffsetAttributeImpl) { OffsetAttributeImpl o = (OffsetAttributeImpl) other; return o.startOffset == startOffset && o.endOffset == endOffset; } return false; } public int hashCode() { int code = startOffset; code = code * 31 + endOffset; return code; } public void copyTo(AttributeImpl target) { OffsetAttribute t = (OffsetAttribute) target; t.setOffset(startOffset, endOffset); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/Token.java0000644000175000017500000007470611474320222024227 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.index.TermPositions; // for javadoc import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; /** A Token is an occurrence of a term from the text of a field. It consists of a term's text, the start and end offset of the term in the text of the field, and a type string.

    The start and end offsets permit applications to re-associate a token with its source text, e.g., to display highlighted query terms in a document browser, or to show matching text fragments in a KWIC display, etc.

The type is a string, assigned by a lexical analyzer (a.k.a. tokenizer), naming the lexical or syntactic class that the token belongs to. For example, an end-of-sentence marker token might be implemented with type "eos". The default token type is "word".

    A Token can optionally have metadata (a.k.a. Payload) in the form of a variable length byte array. Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} to retrieve the payloads from the index.

NOTE: As of 2.9, Token implements all {@link Attribute} interfaces that are part of core Lucene and can be found in the {@code tokenattributes} subpackage. Even though it is not necessary to use Token anymore, with the new TokenStream API it can be used as a convenience class that implements all {@link Attribute}s, which is especially useful for switching easily from the old to the new TokenStream API.

    NOTE: As of 2.3, Token stores the term text internally as a malleable char[] termBuffer instead of String termText. The indexing code and core tokenizers have been changed to re-use a single Token instance, changing its buffer and other fields in-place as the Token is processed. This provides substantially better indexing performance as it saves the GC cost of new'ing a Token and String for every term. The APIs that accept String termText are still available but a warning about the associated performance cost has been added (below). The {@link #termText()} method has been deprecated.

    Tokenizers and TokenFilters should try to re-use a Token instance when possible for best performance, by implementing the {@link TokenStream#incrementToken()} API. Failing that, to create a new Token you should first use one of the constructors that starts with null text. To load the token from a char[] use {@link #setTermBuffer(char[], int, int)}. To load from a String use {@link #setTermBuffer(String)} or {@link #setTermBuffer(String, int, int)}. Alternatively you can get the Token's termBuffer by calling either {@link #termBuffer()}, if you know that your text is shorter than the capacity of the termBuffer or {@link #resizeTermBuffer(int)}, if there is any possibility that you may need to grow the buffer. Fill in the characters of your term into this buffer, with {@link String#getChars(int, int, char[], int)} if loading from a string, or with {@link System#arraycopy(Object, int, Object, int, int)}, and finally call {@link #setTermLength(int)} to set the length of the term text. See LUCENE-969 for details.
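    For illustration only, the buffer-filling sequence described above might look like this (a minimal sketch; src is assumed to be an existing char[] holding len valid characters, and token an existing Token instance):
          char[] buffer = token.resizeTermBuffer(len);   // grow the termBuffer if needed
          System.arraycopy(src, 0, buffer, 0, len);      // fill in the term characters
          token.setTermLength(len);                      // record the number of valid characters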

    Typical Token reuse patterns:

    • Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
          return reusableToken.reinit(string, startOffset, endOffset[, type]);
        
    • Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
          return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
        
    • Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
          return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
        
    • Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
          return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
        
    • Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
          return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
        
    A few things to note:
    • clear() initializes all of the fields to default values. This is a change from Lucene 2.4, but should affect no one.
    • Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
    • The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.
    • When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
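    Putting the reuse pattern above into context, a rough sketch of a next(Token) implementation might look like the following (the helper readNextWord() and the fields wordChars, wordLen and wordStart are hypothetical, not part of this class):
          public Token next(final Token reusableToken) throws IOException {
            assert reusableToken != null;
            if (!readNextWord()) {          // hypothetical: advance to the next word in the input
              return null;                  // end of stream
            }
            // re-initialize the caller-supplied Token rather than allocating a new one
            return reusableToken.reinit(wordChars, 0, wordLen, wordStart, wordStart + wordLen);
          }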

    @see org.apache.lucene.index.Payload */ public class Token extends AttributeImpl implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, FlagsAttribute, OffsetAttribute, PayloadAttribute { public static final String DEFAULT_TYPE = "word"; private static int MIN_BUFFER_SIZE = 10; /** @deprecated We will remove this when we remove the * deprecated APIs */ private String termText; /** * Characters for the term text. * @deprecated This will be made private. Instead, use: * {@link #termBuffer()}, * {@link #setTermBuffer(char[], int, int)}, * {@link #setTermBuffer(String)}, or * {@link #setTermBuffer(String, int, int)} */ char[] termBuffer; /** * Length of term text in the buffer. * @deprecated This will be made private. Instead, use: * {@link #termLength()}, or @{link setTermLength(int)}. */ int termLength; /** * Start in source text. * @deprecated This will be made private. Instead, use: * {@link #startOffset()}, or @{link setStartOffset(int)}. */ int startOffset; /** * End in source text. * @deprecated This will be made private. Instead, use: * {@link #endOffset()}, or @{link setEndOffset(int)}. */ int endOffset; /** * The lexical type of the token. * @deprecated This will be made private. Instead, use: * {@link #type()}, or @{link setType(String)}. */ String type = DEFAULT_TYPE; private int flags; /** * @deprecated This will be made private. Instead, use: * {@link #getPayload()}, or @{link setPayload(Payload)}. */ Payload payload; /** * @deprecated This will be made private. Instead, use: * {@link #getPositionIncrement()}, or @{link setPositionIncrement(String)}. */ int positionIncrement = 1; /** Constructs a Token will null text. */ public Token() { } /** Constructs a Token with null text and start & end * offsets. * @param start start offset in the source text * @param end end offset in the source text */ public Token(int start, int end) { startOffset = start; endOffset = end; } /** Constructs a Token with null text and start & end * offsets plus the Token type. * @param start start offset in the source text * @param end end offset in the source text * @param typ the lexical type of this Token */ public Token(int start, int end, String typ) { startOffset = start; endOffset = end; type = typ; } /** * Constructs a Token with null text and start & end * offsets plus flags. NOTE: flags is EXPERIMENTAL. * @param start start offset in the source text * @param end end offset in the source text * @param flags The bits to set for this token */ public Token(int start, int end, int flags) { startOffset = start; endOffset = end; this.flags = flags; } /** Constructs a Token with the given term text, and start * & end offsets. The type defaults to "word." * NOTE: for better indexing speed you should * instead use the char[] termBuffer methods to set the * term text. * @param text term text * @param start start offset * @param end end offset */ public Token(String text, int start, int end) { termText = text; startOffset = start; endOffset = end; } /** Constructs a Token with the given text, start and end * offsets, & type. NOTE: for better indexing * speed you should instead use the char[] termBuffer * methods to set the term text. * @param text term text * @param start start offset * @param end end offset * @param typ token type */ public Token(String text, int start, int end, String typ) { termText = text; startOffset = start; endOffset = end; type = typ; } /** * Constructs a Token with the given text, start and end * offsets, & type. 
NOTE: for better indexing * speed you should instead use the char[] termBuffer * methods to set the term text. * @param text * @param start * @param end * @param flags token type bits */ public Token(String text, int start, int end, int flags) { termText = text; startOffset = start; endOffset = end; this.flags = flags; } /** * Constructs a Token with the given term buffer (offset * & length), start and end * offsets * @param startTermBuffer * @param termBufferOffset * @param termBufferLength * @param start * @param end */ public Token(char[] startTermBuffer, int termBufferOffset, int termBufferLength, int start, int end) { setTermBuffer(startTermBuffer, termBufferOffset, termBufferLength); startOffset = start; endOffset = end; } /** Set the position increment. This determines the position of this token * relative to the previous Token in a {@link TokenStream}, used in phrase * searching. * *

    The default value is one. * *

    Some common uses for this are:

      * *
    • Set it to zero to put multiple terms in the same position. This is * useful if, e.g., a word has multiple stems. Searches for phrases * including either stem will match. In this case, all but the first stem's * increment should be set to zero: the increment of the first instance * should be one. Repeating a token with an increment of zero can also be * used to boost the scores of matches on that token. * *
    • Set it to values greater than one to inhibit exact phrase matches. * If, for example, one does not want phrases to match across removed stop * words, then one could build a stop word filter that removes stop words and * also sets the increment to the number of stop words removed before each * non-stop word. Then exact phrase queries will only match when the terms * occur with no intervening stop words. * *
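    A minimal sketch of the stop-word case above, using the attribute-based API (a hypothetical filter; the java.util.Set of stop words and the tokenattributes imports are assumed to be supplied by the surrounding code; the core StopFilter offers equivalent behavior when position increments are enabled):
          final class PositionSkippingStopFilter extends TokenFilter {   // hypothetical example
            private final Set stopWords;
            private final TermAttribute termAtt;
            private final PositionIncrementAttribute posIncrAtt;

            PositionSkippingStopFilter(TokenStream input, Set stopWords) {
              super(input);
              this.stopWords = stopWords;
              termAtt = (TermAttribute) addAttribute(TermAttribute.class);
              posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
            }

            public boolean incrementToken() throws IOException {
              int skipped = 0;
              while (input.incrementToken()) {
                if (!stopWords.contains(termAtt.term())) {
                  // add the increments of the stop words removed since the last emitted token
                  posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skipped);
                  return true;
                }
                skipped += posIncrAtt.getPositionIncrement();
              }
              return false;
            }
          }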
    * @param positionIncrement the distance from the prior term * @see org.apache.lucene.index.TermPositions */ public void setPositionIncrement(int positionIncrement) { if (positionIncrement < 0) throw new IllegalArgumentException ("Increment must be zero or greater: " + positionIncrement); this.positionIncrement = positionIncrement; } /** Returns the position increment of this Token. * @see #setPositionIncrement */ public int getPositionIncrement() { return positionIncrement; } /** Sets the Token's term text. NOTE: for better * indexing speed you should instead use the char[] * termBuffer methods to set the term text. * @deprecated use {@link #setTermBuffer(char[], int, int)} or * {@link #setTermBuffer(String)} or * {@link #setTermBuffer(String, int, int)}. */ public void setTermText(String text) { termText = text; termBuffer = null; } /** Returns the Token's term text. * * @deprecated This method now has a performance penalty * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a * String, use {@link #term()}
    */ public final String termText() { if (termText == null && termBuffer != null) termText = new String(termBuffer, 0, termLength); return termText; } /** Returns the Token's term text. * * This method has a performance penalty * because the text is stored internally in a char[]. If * possible, use {@link #termBuffer()} and {@link * #termLength()} directly instead. If you really need a * String, use this method, which is nothing more than * a convenience call to new String(token.termBuffer(), 0, token.termLength()) */ public final String term() { if (termText != null) return termText; initTermBuffer(); return new String(termBuffer, 0, termLength); } /** Copies the contents of buffer, starting at offset for * length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public final void setTermBuffer(char[] buffer, int offset, int length) { termText = null; growTermBuffer(length); System.arraycopy(buffer, offset, termBuffer, 0, length); termLength = length; } /** Copies the contents of buffer into the termBuffer array. * @param buffer the buffer to copy */ public final void setTermBuffer(String buffer) { termText = null; final int length = buffer.length(); growTermBuffer(length); buffer.getChars(0, length, termBuffer, 0); termLength = length; } /** Copies the contents of buffer, starting at offset and continuing * for length characters, into the termBuffer array. * @param buffer the buffer to copy * @param offset the index in the buffer of the first character to copy * @param length the number of characters to copy */ public final void setTermBuffer(String buffer, int offset, int length) { assert offset <= buffer.length(); assert offset + length <= buffer.length(); termText = null; growTermBuffer(length); buffer.getChars(offset, offset + length, termBuffer, 0); termLength = length; } /** Returns the internal termBuffer character array which * you can then directly alter. If the array is too * small for your token, use {@link * #resizeTermBuffer(int)} to increase it. After * altering the buffer be sure to call {@link * #setTermLength} to record the number of valid * characters that were placed into the termBuffer. */ public final char[] termBuffer() { initTermBuffer(); return termBuffer; } /** Grows the termBuffer to at least size newSize, preserving the * existing content. Note: If the next operation is to change * the contents of the term buffer use * {@link #setTermBuffer(char[], int, int)}, * {@link #setTermBuffer(String)}, or * {@link #setTermBuffer(String, int, int)} * to optimally combine the resize with the setting of the termBuffer. * @param newSize minimum size of the new termBuffer * @return newly created termBuffer with length >= newSize */ public char[] resizeTermBuffer(int newSize) { if (termBuffer == null) { // The buffer is always at least MIN_BUFFER_SIZE newSize = newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize; //Preserve termText if (termText != null) { final int ttLen = termText.length(); newSize = newSize < ttLen ? 
ttLen : newSize; termBuffer = new char[ArrayUtil.getNextSize(newSize)]; termText.getChars(0, termText.length(), termBuffer, 0); termText = null; } else { // no term Text, the first allocation termBuffer = new char[ArrayUtil.getNextSize(newSize)]; } } else { if(termBuffer.length < newSize){ // Not big enough; create a new array with slight // over allocation and preserve content final char[] newCharBuffer = new char[ArrayUtil.getNextSize(newSize)]; System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length); termBuffer = newCharBuffer; } } return termBuffer; } /** Allocates a buffer char[] of at least newSize, without preserving the existing content. * its always used in places that set the content * @param newSize minimum size of the buffer */ private void growTermBuffer(int newSize) { if (termBuffer == null) { // The buffer is always at least MIN_BUFFER_SIZE termBuffer = new char[ArrayUtil.getNextSize(newSize < MIN_BUFFER_SIZE ? MIN_BUFFER_SIZE : newSize)]; } else { if(termBuffer.length < newSize){ // Not big enough; create a new array with slight // over allocation: termBuffer = new char[ArrayUtil.getNextSize(newSize)]; } } } // TODO: once we remove the deprecated termText() method // and switch entirely to char[] termBuffer we don't need // to use this method anymore, only for late init of the buffer private void initTermBuffer() { if (termBuffer == null) { if (termText == null) { termBuffer = new char[ArrayUtil.getNextSize(MIN_BUFFER_SIZE)]; termLength = 0; } else { int length = termText.length(); if (length < MIN_BUFFER_SIZE) length = MIN_BUFFER_SIZE; termBuffer = new char[ArrayUtil.getNextSize(length)]; termLength = termText.length(); termText.getChars(0, termText.length(), termBuffer, 0); termText = null; } } else { termText = null; } } /** Return number of valid characters (length of the term) * in the termBuffer array. */ public final int termLength() { initTermBuffer(); return termLength; } /** Set number of valid characters (length of the term) in * the termBuffer array. Use this to truncate the termBuffer * or to synchronize with external manipulation of the termBuffer. * Note: to grow the size of the array, * use {@link #resizeTermBuffer(int)} first. * @param length the truncated length */ public final void setTermLength(int length) { initTermBuffer(); if (length > termBuffer.length) throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")"); termLength = length; } /** Returns this Token's starting offset, the position of the first character corresponding to this token in the source text. Note that the difference between endOffset() and startOffset() may not be equal to termText.length(), as the term text may have been altered by a stemmer or some other filter. */ public final int startOffset() { return startOffset; } /** Set the starting offset. @see #startOffset() */ public void setStartOffset(int offset) { this.startOffset = offset; } /** Returns this Token's ending offset, one greater than the position of the last character corresponding to this token in the source text. The length of the token in the source text is (endOffset - startOffset). */ public final int endOffset() { return endOffset; } /** Set the ending offset. @see #endOffset() */ public void setEndOffset(int offset) { this.endOffset = offset; } /** Set the starting and ending offset. 
@see #startOffset() and #endOffset()*/ public void setOffset(int startOffset, int endOffset) { this.startOffset = startOffset; this.endOffset = endOffset; } /** Returns this Token's lexical type. Defaults to "word". */ public final String type() { return type; } /** Set the lexical type. @see #type() */ public final void setType(String type) { this.type = type; } /** * EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long. *

    * * Get the bitset for any bits that have been set. This is completely distinct from {@link #type()}, although they do share similar purposes. * The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s. * * * @return The bits */ public int getFlags() { return flags; } /** * @see #getFlags() */ public void setFlags(int flags) { this.flags = flags; } /** * Returns this Token's payload. */ public Payload getPayload() { return this.payload; } /** * Sets this Token's payload. */ public void setPayload(Payload payload) { this.payload = payload; } public String toString() { StringBuffer sb = new StringBuffer(); sb.append('('); initTermBuffer(); if (termBuffer == null) sb.append("null"); else sb.append(termBuffer, 0, termLength); sb.append(',').append(startOffset).append(',').append(endOffset); if (!type.equals("word")) sb.append(",type=").append(type); if (positionIncrement != 1) sb.append(",posIncr=").append(positionIncrement); sb.append(')'); return sb.toString(); } /** Resets the term text, payload, flags, and positionIncrement, * startOffset, endOffset and token type to default. */ public void clear() { payload = null; // Leave termBuffer to allow re-use termLength = 0; termText = null; positionIncrement = 1; flags = 0; startOffset = endOffset = 0; type = DEFAULT_TYPE; } public Object clone() { Token t = (Token)super.clone(); // Do a deep clone if (termBuffer != null) { t.termBuffer = new char[this.termLength]; System.arraycopy(this.termBuffer, 0, t.termBuffer, 0, this.termLength); } if (payload != null) { t.payload = (Payload) payload.clone(); } return t; } /** Makes a clone, but replaces the term buffer & * start/end offset in the process. This is more * efficient than doing a full clone (and then calling * setTermBuffer) because it saves a wasted copy of the old * termBuffer. */ public Token clone(char[] newTermBuffer, int newTermOffset, int newTermLength, int newStartOffset, int newEndOffset) { final Token t = new Token(newTermBuffer, newTermOffset, newTermLength, newStartOffset, newEndOffset); t.positionIncrement = positionIncrement; t.flags = flags; t.type = type; if (payload != null) t.payload = (Payload) payload.clone(); return t; } public boolean equals(Object obj) { if (obj == this) return true; if (obj instanceof Token) { Token other = (Token) obj; initTermBuffer(); other.initTermBuffer(); if (termLength == other.termLength && startOffset == other.startOffset && endOffset == other.endOffset && flags == other.flags && positionIncrement == other.positionIncrement && subEqual(type, other.type) && subEqual(payload, other.payload)) { for(int i=0;iCachingTokenFilter implements the optional method * {@link TokenStream#reset()}, which repositions the * stream to the first Token. */ public class CachingTokenFilter extends TokenFilter { private List cache = null; private Iterator iterator = null; private AttributeSource.State finalState; public CachingTokenFilter(TokenStream input) { super(input); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. */ public final Token next(final Token reusableToken) throws IOException { return super.next(reusableToken); } /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should * not be overridden. Delegates to the backwards compatibility layer. 
*/ public final Token next() throws IOException { return super.next(); } public final boolean incrementToken() throws IOException { if (cache == null) { // fill cache lazily cache = new LinkedList(); fillCache(); iterator = cache.iterator(); } if (!iterator.hasNext()) { // the cache is exhausted, return false return false; } // Since the TokenFilter can be reset, the tokens need to be preserved as immutable. restoreState((AttributeSource.State) iterator.next()); return true; } public final void end() throws IOException { if (finalState != null) { restoreState(finalState); } } public void reset() throws IOException { if(cache != null) { iterator = cache.iterator(); } } private void fillCache() throws IOException { while(input.incrementToken()) { cache.add(captureState()); } // capture final state input.end(); finalState = captureState(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/Tokenizer.java0000644000175000017500000000662111474505315025120 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.AttributeSource; import java.io.Reader; import java.io.IOException; /** A Tokenizer is a TokenStream whose input is a Reader.

    This is an abstract class; subclasses must override {@link #incrementToken()}
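    For orientation, a bare-bones subclass that returns the entire input as a single token could look like this (a hypothetical sketch, not shipped with Lucene; imports of Reader, IOException and the tokenattributes classes are assumed):
          public final class WholeInputTokenizer extends Tokenizer {
            private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
            private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
            private boolean done = false;

            public WholeInputTokenizer(Reader input) {
              super(input);
            }

            public boolean incrementToken() throws IOException {
              if (done) return false;
              clearAttributes();                         // see the NOTE below
              StringBuffer text = new StringBuffer();
              char[] buf = new char[256];
              int len;
              while ((len = input.read(buf)) != -1) {
                text.append(buf, 0, len);
              }
              termAtt.setTermBuffer(text.toString());
              offsetAtt.setOffset(correctOffset(0), correctOffset(text.length()));
              done = true;
              return text.length() > 0;
            }

            public void reset(Reader input) throws IOException {
              super.reset(input);
              done = false;
            }
          }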

    NOTE: Subclasses overriding {@link #incrementToken()} must call {@link AttributeSource#clearAttributes()} before setting attributes. Subclasses overriding {@link #next(Token)} must call {@link Token#clear()} before setting Token attributes. */ public abstract class Tokenizer extends TokenStream { /** The text source for this Tokenizer. */ protected Reader input; /** Construct a tokenizer with null input. */ protected Tokenizer() {} /** Construct a token stream processing the given input. */ protected Tokenizer(Reader input) { this.input = CharReader.get(input); } /** Construct a tokenizer with null input using the given AttributeFactory. */ protected Tokenizer(AttributeFactory factory) { super(factory); } /** Construct a token stream processing the given input using the given AttributeFactory. */ protected Tokenizer(AttributeFactory factory, Reader input) { super(factory); this.input = CharReader.get(input); } /** Construct a token stream processing the given input using the given AttributeSource. */ protected Tokenizer(AttributeSource source) { super(source); } /** Construct a token stream processing the given input using the given AttributeSource. */ protected Tokenizer(AttributeSource source, Reader input) { super(source); this.input = CharReader.get(input); } /** By default, closes the input Reader. */ public void close() throws IOException { if (input != null) { input.close(); // LUCENE-2387: don't hold onto Reader after close, so // GC can reclaim input = null; } } /** Return the corrected offset. If {@link #input} is a {@link CharStream} subclass * this method calls {@link CharStream#correctOffset}, else returns currentOff. * @param currentOff offset as seen in the output * @return corrected offset based on the input * @see CharStream#correctOffset */ protected final int correctOffset(int currentOff) { return (input instanceof CharStream) ? ((CharStream) input).correctOffset(currentOff) : currentOff; } /** Expert: Reset the tokenizer to a new reader. Typically, an * analyzer (in its reusableTokenStream method) will use * this to re-use a previously created tokenizer. */ public void reset(Reader input) throws IOException { this.input = input; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/LowerCaseFilter.java0000644000175000017500000000312411474320222026163 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** * Normalizes token text to lower case. 
* * @version $Id: LowerCaseFilter.java 797665 2009-07-24 21:45:48Z buschmi $ */ public final class LowerCaseFilter extends TokenFilter { public LowerCaseFilter(TokenStream in) { super(in); termAtt = (TermAttribute) addAttribute(TermAttribute.class); } private TermAttribute termAtt; public final boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.termBuffer(); final int length = termAtt.termLength(); for(int i=0;i This is an abstract class; subclasses must override {@link #incrementToken()}. @see TokenStream */ public abstract class TokenFilter extends TokenStream { /** The source of tokens for this filter. */ protected final TokenStream input; /** Construct a token stream filtering the given input. */ protected TokenFilter(TokenStream input) { super(input); this.input = input; } /** Performs end-of-stream operations, if any, and calls then end() on the * input TokenStream.

    * NOTE: Be sure to call super.end() first when overriding this method.*/ public void end() throws IOException { input.end(); } /** Close the input TokenStream. */ public void close() throws IOException { input.close(); } /** Reset the filter as well as the input TokenStream. */ public void reset() throws IOException { input.reset(); } } lucene-2.9.4/src/java/org/apache/lucene/analysis/TokenWrapper.java0000644000175000017500000001031511474320222025552 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.FlagsAttribute; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.index.Payload; import org.apache.lucene.util.AttributeImpl; /** * This class wraps a Token and supplies a single attribute instance * where the delegate token can be replaced. * @deprecated Will be removed, when old TokenStream API is removed. 
*/ final class TokenWrapper extends AttributeImpl implements Cloneable, TermAttribute, TypeAttribute, PositionIncrementAttribute, FlagsAttribute, OffsetAttribute, PayloadAttribute { Token delegate; TokenWrapper() { this(new Token()); } TokenWrapper(Token delegate) { this.delegate = delegate; } // TermAttribute: public String term() { return delegate.term(); } public void setTermBuffer(char[] buffer, int offset, int length) { delegate.setTermBuffer(buffer, offset, length); } public void setTermBuffer(String buffer) { delegate.setTermBuffer(buffer); } public void setTermBuffer(String buffer, int offset, int length) { delegate.setTermBuffer(buffer, offset, length); } public char[] termBuffer() { return delegate.termBuffer(); } public char[] resizeTermBuffer(int newSize) { return delegate.resizeTermBuffer(newSize); } public int termLength() { return delegate.termLength(); } public void setTermLength(int length) { delegate.setTermLength(length); } // TypeAttribute: public String type() { return delegate.type(); } public void setType(String type) { delegate.setType(type); } public void setPositionIncrement(int positionIncrement) { delegate.setPositionIncrement(positionIncrement); } public int getPositionIncrement() { return delegate.getPositionIncrement(); } // FlagsAttribute public int getFlags() { return delegate.getFlags(); } public void setFlags(int flags) { delegate.setFlags(flags); } // OffsetAttribute public int startOffset() { return delegate.startOffset(); } public void setOffset(int startOffset, int endOffset) { delegate.setOffset(startOffset, endOffset); } public int endOffset() { return delegate.endOffset(); } // PayloadAttribute public Payload getPayload() { return delegate.getPayload(); } public void setPayload(Payload payload) { delegate.setPayload(payload); } // AttributeImpl public void clear() { delegate.clear(); } public String toString() { return delegate.toString(); } public int hashCode() { return delegate.hashCode(); } public boolean equals(Object other) { if (other instanceof TokenWrapper) { return ((TokenWrapper) other).delegate.equals(this.delegate); } return false; } public Object clone() { return new TokenWrapper((Token) delegate.clone()); } public void copyTo(AttributeImpl target) { if (target instanceof TokenWrapper) { ((TokenWrapper) target).delegate = (Token) this.delegate.clone(); } else { this.delegate.copyTo(target); } } } lucene-2.9.4/src/java/org/apache/lucene/analysis/WordlistLoader.java0000644000175000017500000001277711474320222026105 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.HashMap; import java.util.HashSet; /** * Loader for text files that represent a list of stopwords. * * * @version $Id: WordlistLoader.java 706342 2008-10-20 17:19:29Z gsingers $ */ public class WordlistLoader { /** * Loads a text file and adds every line as an entry to a HashSet (omitting * leading and trailing whitespace). Every line of the file should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param wordfile File containing the wordlist * @return A HashSet with the file's words */ public static HashSet getWordSet(File wordfile) throws IOException { HashSet result = new HashSet(); FileReader reader = null; try { reader = new FileReader(wordfile); result = getWordSet(reader); } finally { if (reader != null) reader.close(); } return result; } /** * Loads a text file and adds every non-comment line as an entry to a HashSet (omitting * leading and trailing whitespace). Every line of the file should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param wordfile File containing the wordlist * @param comment The comment string to ignore * @return A HashSet with the file's words */ public static HashSet getWordSet(File wordfile, String comment) throws IOException { HashSet result = new HashSet(); FileReader reader = null; try { reader = new FileReader(wordfile); result = getWordSet(reader, comment); } finally { if (reader != null) reader.close(); } return result; } /** * Reads lines from a Reader and adds every line as an entry to a HashSet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @return A HashSet with the reader's words */ public static HashSet getWordSet(Reader reader) throws IOException { HashSet result = new HashSet(); BufferedReader br = null; try { if (reader instanceof BufferedReader) { br = (BufferedReader) reader; } else { br = new BufferedReader(reader); } String word = null; while ((word = br.readLine()) != null) { result.add(word.trim()); } } finally { if (br != null) br.close(); } return result; } /** * Reads lines from a Reader and adds every non-comment line as an entry to a HashSet (omitting * leading and trailing whitespace). Every line of the Reader should contain only * one word. The words need to be in lowercase if you make use of an * Analyzer which uses LowerCaseFilter (like StandardAnalyzer). * * @param reader Reader containing the wordlist * @param comment The string representing a comment. * @return A HashSet with the reader's words */ public static HashSet getWordSet(Reader reader, String comment) throws IOException { HashSet result = new HashSet(); BufferedReader br = null; try { if (reader instanceof BufferedReader) { br = (BufferedReader) reader; } else { br = new BufferedReader(reader); } String word = null; while ((word = br.readLine()) != null) { if (word.startsWith(comment) == false){ result.add(word.trim()); } } } finally { if (br != null) br.close(); } return result; } /** * Reads a stem dictionary. Each line contains: *

    word\tstem
    * (i.e. two tab seperated words) * * @return stem dictionary that overrules the stemming algorithm * @throws IOException */ public static HashMap getStemDict(File wordstemfile) throws IOException { if (wordstemfile == null) throw new NullPointerException("wordstemfile may not be null"); HashMap result = new HashMap(); BufferedReader br = null; FileReader fr = null; try { fr = new FileReader(wordstemfile); br = new BufferedReader(fr); String line; while ((line = br.readLine()) != null) { String[] wordstem = line.split("\t", 2); result.put(wordstem[0], wordstem[1]); } } finally { if (fr != null) fr.close(); if (br != null) br.close(); } return result; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/PerFieldAnalyzerWrapper.java0000644000175000017500000001105511474320222027674 0ustar janpascaljanpascalpackage org.apache.lucene.analysis; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Fieldable; import java.io.Reader; import java.io.IOException; import java.util.Map; import java.util.HashMap; /** * This analyzer is used to facilitate scenarios where different * fields require different analysis techniques. Use {@link #addAnalyzer} * to add a non-default analyzer on a field name basis. * *

    Example usage: * *

     *   PerFieldAnalyzerWrapper aWrapper =
     *      new PerFieldAnalyzerWrapper(new StandardAnalyzer());
     *   aWrapper.addAnalyzer("firstname", new KeywordAnalyzer());
     *   aWrapper.addAnalyzer("lastname", new KeywordAnalyzer());
     * 
    * *

    In this example, StandardAnalyzer will be used for all fields except "firstname" * and "lastname", for which KeywordAnalyzer will be used. * *
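    Continuing the example above, the same wrapper can then be handed to the indexing and query-parsing components (a sketch; dir is assumed to be an existing Directory and "content" an arbitrary default field):

 *   IndexWriter writer = new IndexWriter(dir, aWrapper, IndexWriter.MaxFieldLength.LIMITED);
 *   QueryParser parser = new QueryParser(Version.LUCENE_29, "content", aWrapper);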

    A PerFieldAnalyzerWrapper can be used like any other analyzer, for both indexing * and query parsing. */ public class PerFieldAnalyzerWrapper extends Analyzer { private Analyzer defaultAnalyzer; private Map analyzerMap = new HashMap(); /** * Constructs with default analyzer. * * @param defaultAnalyzer Any fields not specifically * defined to use a different analyzer will use the one provided here. */ public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer) { this(defaultAnalyzer, null); } /** * Constructs with default analyzer and a map of analyzers to use for * specific fields. * * @param defaultAnalyzer Any fields not specifically * defined to use a different analyzer will use the one provided here. * @param fieldAnalyzers a Map (String field name to the Analyzer) to be * used for those fields */ public PerFieldAnalyzerWrapper(Analyzer defaultAnalyzer, Map /**/ fieldAnalyzers) { this.defaultAnalyzer = defaultAnalyzer; if (fieldAnalyzers != null) { analyzerMap.putAll(fieldAnalyzers); } setOverridesTokenStreamMethod(PerFieldAnalyzerWrapper.class); } /** * Defines an analyzer to use for the specified field. * * @param fieldName field name requiring a non-default analyzer * @param analyzer non-default analyzer to use for field */ public void addAnalyzer(String fieldName, Analyzer analyzer) { analyzerMap.put(fieldName, analyzer); } public TokenStream tokenStream(String fieldName, Reader reader) { Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); if (analyzer == null) { analyzer = defaultAnalyzer; } return analyzer.tokenStream(fieldName, reader); } public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { if (overridesTokenStreamMethod) { // LUCENE-1678: force fallback to tokenStream() if we // have been subclassed and that subclass overrides // tokenStream but not reusableTokenStream return tokenStream(fieldName, reader); } Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); if (analyzer == null) analyzer = defaultAnalyzer; return analyzer.reusableTokenStream(fieldName, reader); } /** Return the positionIncrementGap from the analyzer assigned to fieldName */ public int getPositionIncrementGap(String fieldName) { Analyzer analyzer = (Analyzer) analyzerMap.get(fieldName); if (analyzer == null) analyzer = defaultAnalyzer; return analyzer.getPositionIncrementGap(fieldName); } /** Return the offsetGap from the analyzer assigned to field */ public int getOffsetGap(Fieldable field) { Analyzer analyzer = (Analyzer) analyzerMap.get(field.name()); if (analyzer == null) analyzer = defaultAnalyzer; return analyzer.getOffsetGap(field); } public String toString() { return "PerFieldAnalyzerWrapper(" + analyzerMap + ", default=" + defaultAnalyzer + ")"; } } lucene-2.9.4/src/java/org/apache/lucene/analysis/NormalizeCharMap.java0000644000175000017500000000416611474320222026334 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.analysis; import java.util.HashMap; import java.util.Map; /** * Holds a map of String input to String output, to be used * with {@link MappingCharFilter}. */ public class NormalizeCharMap { //Map submap; Map submap; String normStr; int diff; /** Records a replacement to be applied to the inputs * stream. Whenever singleMatch occurs in * the input, it will be replaced with * replacement. * * @param singleMatch input String to be replaced * @param replacement output String */ public void add(String singleMatch, String replacement) { NormalizeCharMap currMap = this; for(int i = 0; i < singleMatch.length(); i++) { char c = singleMatch.charAt(i); if (currMap.submap == null) { currMap.submap = new HashMap(1); } NormalizeCharMap map = (NormalizeCharMap) currMap.submap.get(CharacterCache.valueOf(c)); if (map == null) { map = new NormalizeCharMap(); currMap.submap.put(new Character(c), map); } currMap = map; } if (currMap.normStr != null) { throw new RuntimeException("MappingCharFilter: there is already a mapping for " + singleMatch); } currMap.normStr = replacement; currMap.diff = singleMatch.length() - replacement.length(); } } lucene-2.9.4/src/java/org/apache/lucene/store/0000755000175000017500000000000011554106562021607 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/store/NativeFSLockFactory.java0000644000175000017500000002346411474320230026272 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.nio.channels.FileChannel; import java.nio.channels.FileLock; import java.io.File; import java.io.RandomAccessFile; import java.io.IOException; import java.util.HashSet; /** *

    Implements {@link LockFactory} using native OS file * locks. Note that because this LockFactory relies on * java.nio.* APIs for locking, any problems with those APIs * will cause locking to fail. Specifically, on certain NFS * environments the java.nio.* locks will fail (the lock can * incorrectly be double acquired) whereas {@link * SimpleFSLockFactory} worked perfectly in those same * environments. For NFS based access to an index, it's * recommended that you try {@link SimpleFSLockFactory} * first and work around the one limitation that a lock file * could be left when the JVM exits abnormally.

    * *

    The primary benefit of {@link NativeFSLockFactory} is * that lock files will be properly removed (by the OS) if * the JVM has an abnormal exit.

    * *

    Note that, unlike {@link SimpleFSLockFactory}, the existence of * leftover lock files in the filesystem on exiting the JVM * is fine because the OS will free the locks held against * these files even though the files still remain.

    * *

    If you suspect that this or any other LockFactory is * not working properly in your environment, you can easily * test it by using {@link VerifyingLockFactory}, {@link * LockVerifyServer} and {@link LockStressTest}.
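    A typical way to use this factory (a sketch; the index location is an assumption) is to pass it to {@link FSDirectory#open(File, LockFactory)}, which then keeps the lock files inside the index directory itself:
          File indexDir = new File("/path/to/index");
          Directory dir = FSDirectory.open(indexDir, new NativeFSLockFactory());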

    * * @see LockFactory */ public class NativeFSLockFactory extends FSLockFactory { /** * Create a NativeFSLockFactory instance, with null (unset) * lock directory. When you pass this factory to a {@link FSDirectory} * subclass, the lock directory is automatically set to the * directory itsself. Be sure to create one instance for each directory * your create! */ public NativeFSLockFactory() throws IOException { this((File) null); } /** * Create a NativeFSLockFactory instance, storing lock * files into the specified lockDirName: * * @param lockDirName where lock files are created. */ public NativeFSLockFactory(String lockDirName) throws IOException { this(new File(lockDirName)); } /** * Create a NativeFSLockFactory instance, storing lock * files into the specified lockDir: * * @param lockDir where lock files are created. */ public NativeFSLockFactory(File lockDir) throws IOException { setLockDir(lockDir); } public synchronized Lock makeLock(String lockName) { if (lockPrefix != null) lockName = lockPrefix + "-" + lockName; return new NativeFSLock(lockDir, lockName); } public void clearLock(String lockName) throws IOException { // Note that this isn't strictly required anymore // because the existence of these files does not mean // they are locked, but, still do this in case people // really want to see the files go away: if (lockDir.exists()) { if (lockPrefix != null) { lockName = lockPrefix + "-" + lockName; } File lockFile = new File(lockDir, lockName); if (lockFile.exists() && !lockFile.delete()) { throw new IOException("Cannot delete " + lockFile); } } } } class NativeFSLock extends Lock { private RandomAccessFile f; private FileChannel channel; private FileLock lock; private File path; private File lockDir; /* * The javadocs for FileChannel state that you should have * a single instance of a FileChannel (per JVM) for all * locking against a given file. To ensure this, we have * a single (static) HashSet that contains the file paths * of all currently locked locks. This protects against * possible cases where different Directory instances in * one JVM (each with their own NativeFSLockFactory * instance) have set the same lock dir and lock prefix. */ private static HashSet LOCK_HELD = new HashSet(); public NativeFSLock(File lockDir, String lockFileName) { this.lockDir = lockDir; path = new File(lockDir, lockFileName); } private synchronized boolean lockExists() { return lock != null; } public synchronized boolean obtain() throws IOException { if (lockExists()) { // Our instance is already locked: return false; } // Ensure that lockDir exists and is a directory. if (!lockDir.exists()) { if (!lockDir.mkdirs()) throw new IOException("Cannot create directory: " + lockDir.getAbsolutePath()); } else if (!lockDir.isDirectory()) { throw new IOException("Found regular file where directory expected: " + lockDir.getAbsolutePath()); } String canonicalPath = path.getCanonicalPath(); boolean markedHeld = false; try { // Make sure nobody else in-process has this lock held // already, and, mark it held if not: synchronized(LOCK_HELD) { if (LOCK_HELD.contains(canonicalPath)) { // Someone else in this JVM already has the lock: return false; } else { // This "reserves" the fact that we are the one // thread trying to obtain this lock, so we own // the only instance of a channel against this // file: LOCK_HELD.add(canonicalPath); markedHeld = true; } } try { f = new RandomAccessFile(path, "rw"); } catch (IOException e) { // On Windows, we can get intermittent "Access // Denied" here. 
So, we treat this as failure to // acquire the lock, but, store the reason in case // there is in fact a real error case. failureReason = e; f = null; } if (f != null) { try { channel = f.getChannel(); try { lock = channel.tryLock(); } catch (IOException e) { // At least on OS X, we will sometimes get an // intermittent "Permission Denied" IOException, // which seems to simply mean "you failed to get // the lock". But other IOExceptions could be // "permanent" (eg, locking is not supported via // the filesystem). So, we record the failure // reason here; the timeout obtain (usually the // one calling us) will use this as "root cause" // if it fails to get the lock. failureReason = e; } finally { if (lock == null) { try { channel.close(); } finally { channel = null; } } } } finally { if (channel == null) { try { f.close(); } finally { f = null; } } } } } finally { if (markedHeld && !lockExists()) { synchronized(LOCK_HELD) { if (LOCK_HELD.contains(canonicalPath)) { LOCK_HELD.remove(canonicalPath); } } } } return lockExists(); } public synchronized void release() throws IOException { if (lockExists()) { try { lock.release(); } finally { lock = null; try { channel.close(); } finally { channel = null; try { f.close(); } finally { f = null; synchronized(LOCK_HELD) { LOCK_HELD.remove(path.getCanonicalPath()); } } } } // LUCENE-2421: we don't care anymore if the file cannot be deleted // because it's held up by another process (e.g. AntiVirus). NativeFSLock // does not depend on the existence/absence of the lock file path.delete(); } else { // if we don't hold the lock, and somebody still called release(), for // example as a result of calling IndexWriter.unlock(), we should attempt // to obtain the lock and release it. If the obtain fails, it means the // lock cannot be released, and we should throw a proper exception rather // than silently failing/not doing anything. boolean obtained = false; try { if (!(obtained = obtain())) { throw new LockReleaseFailedException( "Cannot forcefully unlock a NativeFSLock which is held by another indexer component: " + path); } } finally { if (obtained) { release(); } } } } public synchronized boolean isLocked() { // The test for is isLocked is not directly possible with native file locks: // First a shortcut, if a lock reference in this instance is available if (lockExists()) return true; // Look if lock file is present; if not, there can definitely be no lock! if (!path.exists()) return false; // Try to obtain and release (if was locked) the lock try { boolean obtained = obtain(); if (obtained) release(); return !obtained; } catch (IOException ioe) { return false; } } public String toString() { return "NativeFSLock@" + path; } } lucene-2.9.4/src/java/org/apache/lucene/store/FSDirectory.java0000644000175000017500000007262311474320230024650 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.FilenameFilter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.RandomAccessFile; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.HashMap; import java.util.Map; import org.apache.lucene.index.IndexFileNameFilter; import org.apache.lucene.util.Constants; // Used only for WRITE_LOCK_NAME in deprecated create=true case: import org.apache.lucene.index.IndexWriter; /** * * Base class for Directory implementations that store index * files in the file system. There are currently three core * subclasses: * * * * Unfortunately, because of system peculiarities, there is * no single overall best implementation. Therefore, we've * added the {@link #open} method, to allow Lucene to choose * the best FSDirectory implementation given your * environment, and the known limitations of each * implementation. For users who have no reason to prefer a * specific implementation, it's best to simply use {@link * #open}. For all others, you should instantiate the * desired implementation directly. * *
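    For example (a minimal sketch; the index path and the read-only flag are assumptions):
          Directory dir = FSDirectory.open(new File("/path/to/index"));  // picks a suitable implementation
          IndexSearcher searcher = new IndexSearcher(dir, true);         // read-only searcher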

    The locking implementation is by default {@link * NativeFSLockFactory}, but can be changed by * passing in a custom {@link LockFactory} instance. * The deprecated getDirectory methods default to use * {@link SimpleFSLockFactory} for backwards compatibility. * The system properties * org.apache.lucene.store.FSDirectoryLockFactoryClass * and org.apache.lucene.FSDirectory.class * are deprecated and only used by the deprecated * getDirectory methods. The system property * org.apache.lucene.lockDir is ignored completely. If you really want to store locks * elsewhere, you can create your own {@link * SimpleFSLockFactory} (or {@link NativeFSLockFactory}, * etc.) passing in your preferred lock directory. * *
    In 3.0 this class will become abstract. * * @see Directory */ // TODO: in 3.0 this will become an abstract base class public class FSDirectory extends Directory { /** This cache of directories ensures that there is a unique Directory * instance per path, so that synchronization on the Directory can be used to * synchronize access between readers and writers. We use * refcounts to ensure when the last use of an FSDirectory * instance for a given canonical path is closed, we remove the * instance from the cache. See LUCENE-776 * for some relevant discussion. * @deprecated Not used by any non-deprecated methods anymore */ private static final Map DIRECTORIES = new HashMap(); private static boolean disableLocks = false; // TODO: should this move up to the Directory base class? Also: should we // make a per-instance (in addition to the static "default") version? /** * Set whether Lucene's use of lock files is disabled. By default, * lock files are enabled. They should only be disabled if the index * is on a read-only medium like a CD-ROM. * @deprecated Use a {@link #open(File, LockFactory)} or a constructor * that takes a {@link LockFactory} and supply * {@link NoLockFactory#getNoLockFactory}. This setting does not work * with {@link #open(File)} only the deprecated getDirectory * respect this setting. */ public static void setDisableLocks(boolean doDisableLocks) { FSDirectory.disableLocks = doDisableLocks; } /** * Returns whether Lucene's use of lock files is disabled. * @return true if locks are disabled, false if locks are enabled. * @see #setDisableLocks * @deprecated Use a constructor that takes a {@link LockFactory} and * supply {@link NoLockFactory#getNoLockFactory}. */ public static boolean getDisableLocks() { return FSDirectory.disableLocks; } /** * Directory specified by org.apache.lucene.lockDir * or java.io.tmpdir system property. * @deprecated As of 2.1, LOCK_DIR is unused * because the write.lock is now stored by default in the * index directory. If you really want to store locks * elsewhere, you can create your own {@link * SimpleFSLockFactory} (or {@link NativeFSLockFactory}, * etc.) passing in your preferred lock directory. Then, * pass this LockFactory instance to one of * the open methods that take a * lockFactory (for example, {@link #open(File, LockFactory)}). */ public static final String LOCK_DIR = System.getProperty("org.apache.lucene.lockDir", System.getProperty("java.io.tmpdir")); /** The default class which implements filesystem-based directories. */ // deprecated private static Class IMPL; static { try { String name = System.getProperty("org.apache.lucene.FSDirectory.class", SimpleFSDirectory.class.getName()); if (FSDirectory.class.getName().equals(name)) { // FSDirectory will be abstract, so we replace it by the correct class IMPL = SimpleFSDirectory.class; } else { IMPL = Class.forName(name); } } catch (ClassNotFoundException e) { throw new RuntimeException("cannot load FSDirectory class: " + e.toString(), e); } catch (SecurityException se) { IMPL = SimpleFSDirectory.class; } } private static MessageDigest DIGESTER; static { try { DIGESTER = MessageDigest.getInstance("MD5"); } catch (NoSuchAlgorithmException e) { throw new RuntimeException(e.toString(), e); } } /** A buffer optionally used in renameTo method */ private byte[] buffer = null; /** Returns the directory instance for the named location. * * @deprecated Use {@link #open(File)} * * @param path the path to the directory. * @return the FSDirectory for the named file. 
*/ public static FSDirectory getDirectory(String path) throws IOException { return getDirectory(new File(path), null); } /** Returns the directory instance for the named location. * * @deprecated Use {@link #open(File, LockFactory)} * * @param path the path to the directory. * @param lockFactory instance of {@link LockFactory} providing the * locking implementation. * @return the FSDirectory for the named file. */ public static FSDirectory getDirectory(String path, LockFactory lockFactory) throws IOException { return getDirectory(new File(path), lockFactory); } /** Returns the directory instance for the named location. * * @deprecated Use {@link #open(File)} * * @param file the path to the directory. * @return the FSDirectory for the named file. */ public static FSDirectory getDirectory(File file) throws IOException { return getDirectory(file, null); } /** Returns the directory instance for the named location. * * @deprecated Use {@link #open(File, LockFactory)} * * @param file the path to the directory. * @param lockFactory instance of {@link LockFactory} providing the * locking implementation. * @return the FSDirectory for the named file. */ public static FSDirectory getDirectory(File file, LockFactory lockFactory) throws IOException { file = getCanonicalPath(file); FSDirectory dir; synchronized (DIRECTORIES) { dir = (FSDirectory)DIRECTORIES.get(file); if (dir == null) { try { dir = (FSDirectory)IMPL.newInstance(); } catch (Exception e) { throw new RuntimeException("cannot load FSDirectory class: " + e.toString(), e); } dir.init(file, lockFactory); DIRECTORIES.put(file, dir); } else { // Catch the case where a Directory is pulled from the cache, but has a // different LockFactory instance. if (lockFactory != null && lockFactory != dir.getLockFactory()) { throw new IOException("Directory was previously created with a different LockFactory instance; please pass null as the lockFactory instance and use setLockFactory to change it"); } dir.checked = false; } } synchronized (dir) { dir.refCount++; } return dir; } /** Returns the directory instance for the named location. * * @deprecated Use IndexWriter's create flag, instead, to * create a new index. * * @param path the path to the directory. * @param create if true, create, or erase any existing contents. * @return the FSDirectory for the named file. */ public static FSDirectory getDirectory(String path, boolean create) throws IOException { return getDirectory(new File(path), create); } /** Returns the directory instance for the named location. * * @deprecated Use IndexWriter's create flag, instead, to * create a new index. * * @param file the path to the directory. * @param create if true, create, or erase any existing contents. * @return the FSDirectory for the named file. 
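 * <p>A sketch of the suggested replacement: create a new index through
 * {@link IndexWriter}'s create flag rather than through this method (the path and
 * analyzer choice below are illustrative only):
 * <pre>
 *   IndexWriter writer = new IndexWriter(FSDirectory.open(new File("/path/to/index")),
 *       new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.LIMITED);
 * </pre>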
*/ public static FSDirectory getDirectory(File file, boolean create) throws IOException { FSDirectory dir = getDirectory(file, null); // This is now deprecated (creation should only be done // by IndexWriter): if (create) { dir.create(); } return dir; } /** @deprecated */ private void create() throws IOException { if (directory.exists()) { String[] files = directory.list(IndexFileNameFilter.getFilter()); // clear old files if (files == null) throw new IOException("cannot read directory " + directory.getAbsolutePath() + ": list() returned null"); for (int i = 0; i < files.length; i++) { File file = new File(directory, files[i]); if (!file.delete()) throw new IOException("Cannot delete " + file); } } lockFactory.clearLock(IndexWriter.WRITE_LOCK_NAME); } // returns the canonical version of the directory, creating it if it doesn't exist. private static File getCanonicalPath(File file) throws IOException { return new File(file.getCanonicalPath()); } private boolean checked; final void createDir() throws IOException { if (!checked) { if (!directory.exists()) if (!directory.mkdirs()) throw new IOException("Cannot create directory: " + directory); checked = true; } } /** Initializes the directory to create a new file with the given name. * This method should be used in {@link #createOutput}. */ protected final void initOutput(String name) throws IOException { ensureOpen(); createDir(); File file = new File(directory, name); if (file.exists() && !file.delete()) // delete existing, if any throw new IOException("Cannot overwrite: " + file); } /** The underlying filesystem directory */ protected File directory = null; /** @deprecated */ private int refCount = 0; /** @deprecated */ protected FSDirectory() {}; // permit subclassing /** Create a new FSDirectory for the named location (ctor for subclasses). * @param path the path of the directory * @param lockFactory the lock factory to use, or null for the default * ({@link NativeFSLockFactory}); * @throws IOException */ protected FSDirectory(File path, LockFactory lockFactory) throws IOException { path = getCanonicalPath(path); // new ctors use always NativeFSLockFactory as default: if (lockFactory == null) { lockFactory = new NativeFSLockFactory(); } init(path, lockFactory); refCount = 1; } /** Creates an FSDirectory instance, trying to pick the * best implementation given the current environment. * The directory returned uses the {@link NativeFSLockFactory}. * *
    Currently this returns {@link NIOFSDirectory} * on non-Windows JREs and {@link SimpleFSDirectory} * on Windows. It is highly recommended that you consult the * implementation's documentation for your platform before * using this method. * *
     NOTE: this method may suddenly change which * implementation is returned from release to release, in * the event that higher performance defaults become * possible; if the precise implementation is important to * your application, please instantiate it directly, * instead. On 64 bit systems, it may also be good to * return {@link MMapDirectory}, but this is disabled * because of officially missing unmap support in Java. * For optimal performance you should consider using * this implementation on 64 bit JVMs. * *
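 * <p>A minimal usage sketch (the path is illustrative; error handling omitted):
 * <pre>
 *   Directory dir = FSDirectory.open(new File("/path/to/index"));
 *   try {
 *     // use dir with an IndexReader or IndexWriter
 *   } finally {
 *     dir.close();
 *   }
 * </pre>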
    See above */ public static FSDirectory open(File path) throws IOException { return open(path, null); } /** Just like {@link #open(File)}, but allows you to * also specify a custom {@link LockFactory}. */ public static FSDirectory open(File path, LockFactory lockFactory) throws IOException { /* For testing: MMapDirectory dir=new MMapDirectory(path, lockFactory); dir.setUseUnmap(true); return dir; */ if (Constants.WINDOWS) { return new SimpleFSDirectory(path, lockFactory); } else { return new NIOFSDirectory(path, lockFactory); } } /* will move to ctor, when reflection is removed in 3.0 */ private void init(File path, LockFactory lockFactory) throws IOException { // Set up lockFactory with cascaded defaults: if an instance was passed in, // use that; else if locks are disabled, use NoLockFactory; else if the // system property org.apache.lucene.store.FSDirectoryLockFactoryClass is set, // instantiate that; else, use SimpleFSLockFactory: directory = path; if (directory.exists() && !directory.isDirectory()) throw new NoSuchDirectoryException("file '" + directory + "' exists but is not a directory"); if (lockFactory == null) { if (disableLocks) { // Locks are disabled: lockFactory = NoLockFactory.getNoLockFactory(); } else { String lockClassName = System.getProperty("org.apache.lucene.store.FSDirectoryLockFactoryClass"); if (lockClassName != null && !lockClassName.equals("")) { Class c; try { c = Class.forName(lockClassName); } catch (ClassNotFoundException e) { throw new IOException("unable to find LockClass " + lockClassName); } try { lockFactory = (LockFactory) c.newInstance(); } catch (IllegalAccessException e) { throw new IOException("IllegalAccessException when instantiating LockClass " + lockClassName); } catch (InstantiationException e) { throw new IOException("InstantiationException when instantiating LockClass " + lockClassName); } catch (ClassCastException e) { throw new IOException("unable to cast LockClass " + lockClassName + " instance to a LockFactory"); } } else { // Our default lock is SimpleFSLockFactory; // default lockDir is our index directory: lockFactory = new SimpleFSLockFactory(); } } } setLockFactory(lockFactory); // for filesystem based LockFactory, delete the lockPrefix, if the locks are placed // in index dir. If no index dir is given, set ourselves if (lockFactory instanceof FSLockFactory) { final FSLockFactory lf = (FSLockFactory) lockFactory; final File dir = lf.getLockDir(); // if the lock factory has no lockDir set, use the this directory as lockDir if (dir == null) { lf.setLockDir(this.directory); lf.setLockPrefix(null); } else if (dir.getCanonicalPath().equals(this.directory.getCanonicalPath())) { lf.setLockPrefix(null); } } } /** Lists all files (not subdirectories) in the * directory. This method never returns null (throws * {@link IOException} instead). * * @throws NoSuchDirectoryException if the directory * does not exist, or does exist but is not a * directory. 
* @throws IOException if list() returns null */ public static String[] listAll(File dir) throws IOException { if (!dir.exists()) throw new NoSuchDirectoryException("directory '" + dir + "' does not exist"); else if (!dir.isDirectory()) throw new NoSuchDirectoryException("file '" + dir + "' exists but is not a directory"); // Exclude subdirs String[] result = dir.list(new FilenameFilter() { public boolean accept(File dir, String file) { return !new File(dir, file).isDirectory(); } }); if (result == null) throw new IOException("directory '" + dir + "' exists and is a directory, but cannot be listed: list() returned null"); return result; } public String[] list() { ensureOpen(); return directory.list(IndexFileNameFilter.getFilter()); } /** Lists all files (not subdirectories) in the * directory. * @see #listAll(File) */ public String[] listAll() throws IOException { ensureOpen(); return listAll(directory); } /** Returns true iff a file with the given name exists. */ public boolean fileExists(String name) { ensureOpen(); File file = new File(directory, name); return file.exists(); } /** Returns the time the named file was last modified. */ public long fileModified(String name) { ensureOpen(); File file = new File(directory, name); return file.lastModified(); } /** Returns the time the named file was last modified. */ public static long fileModified(File directory, String name) { File file = new File(directory, name); return file.lastModified(); } /** Set the modified time of an existing file to now. */ public void touchFile(String name) { ensureOpen(); File file = new File(directory, name); file.setLastModified(System.currentTimeMillis()); } /** Returns the length in bytes of a file in the directory. */ public long fileLength(String name) { ensureOpen(); File file = new File(directory, name); return file.length(); } /** Removes an existing file in the directory. */ public void deleteFile(String name) throws IOException { ensureOpen(); File file = new File(directory, name); if (!file.delete()) throw new IOException("Cannot delete " + file); } /** Renames an existing file in the directory. * Warning: This is not atomic. * @deprecated */ public synchronized void renameFile(String from, String to) throws IOException { ensureOpen(); File old = new File(directory, from); File nu = new File(directory, to); /* This is not atomic. If the program crashes between the call to delete() and the call to renameTo() then we're screwed, but I've been unable to figure out how else to do this... */ if (nu.exists()) if (!nu.delete()) throw new IOException("Cannot delete " + nu); // Rename the old file to the new one. Unfortunately, the renameTo() // method does not work reliably under some JVMs. Therefore, if the // rename fails, we manually rename by copying the old file to the new one if (!old.renameTo(nu)) { java.io.InputStream in = null; java.io.OutputStream out = null; try { in = new FileInputStream(old); out = new FileOutputStream(nu); // see if the buffer needs to be initialized. Initialization is // only done on-demand since many VM's will never run into the renameTo // bug and hence shouldn't waste 1K of mem for no reason. if (buffer == null) { buffer = new byte[1024]; } int len; while ((len = in.read(buffer)) >= 0) { out.write(buffer, 0, len); } // delete the old file. 
old.delete(); } catch (IOException ioe) { IOException newExc = new IOException("Cannot rename " + old + " to " + nu); newExc.initCause(ioe); throw newExc; } finally { try { if (in != null) { try { in.close(); } catch (IOException e) { throw new RuntimeException("Cannot close input stream: " + e.toString(), e); } } } finally { if (out != null) { try { out.close(); } catch (IOException e) { throw new RuntimeException("Cannot close output stream: " + e.toString(), e); } } } } } } /** Creates an IndexOutput for the file with the given name. * In 3.0 this method will become abstract. */ public IndexOutput createOutput(String name) throws IOException { initOutput(name); return new FSIndexOutput(new File(directory, name)); } public void sync(String name) throws IOException { ensureOpen(); File fullFile = new File(directory, name); boolean success = false; int retryCount = 0; IOException exc = null; while(!success && retryCount < 5) { retryCount++; RandomAccessFile file = null; try { try { file = new RandomAccessFile(fullFile, "rw"); file.getFD().sync(); success = true; } finally { if (file != null) file.close(); } } catch (IOException ioe) { if (exc == null) exc = ioe; try { // Pause 5 msec Thread.sleep(5); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } } if (!success) // Throw original exception throw exc; } // Inherit javadoc public IndexInput openInput(String name) throws IOException { ensureOpen(); return openInput(name, BufferedIndexInput.BUFFER_SIZE); } /** Creates an IndexInput for the file with the given name. * In 3.0 this method will become abstract. */ public IndexInput openInput(String name, int bufferSize) throws IOException { ensureOpen(); return new FSIndexInput(new File(directory, name), bufferSize); } /** * So we can do some byte-to-hexchar conversion below */ private static final char[] HEX_DIGITS = {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; public String getLockID() { ensureOpen(); String dirName; // name to be hashed try { dirName = directory.getCanonicalPath(); } catch (IOException e) { throw new RuntimeException(e.toString(), e); } byte digest[]; synchronized (DIGESTER) { digest = DIGESTER.digest(dirName.getBytes()); } StringBuffer buf = new StringBuffer(); buf.append("lucene-"); for (int i = 0; i < digest.length; i++) { int b = digest[i]; buf.append(HEX_DIGITS[(b >> 4) & 0xf]); buf.append(HEX_DIGITS[b & 0xf]); } return buf.toString(); } /** Closes the store to future operations. */ public synchronized void close() { if (isOpen && --refCount <= 0) { isOpen = false; synchronized (DIRECTORIES) { DIRECTORIES.remove(directory); } } } public File getFile() { ensureOpen(); return directory; } /** For debug output. */ public String toString() { return this.getClass().getName() + "@" + directory + " lockFactory=" + getLockFactory(); } /** * Default read chunk size. This is a conditional * default: on 32bit JVMs, it defaults to 100 MB. On * 64bit JVMs, it's Integer.MAX_VALUE. * @see #setReadChunkSize */ public static final int DEFAULT_READ_CHUNK_SIZE = Constants.JRE_IS_64BIT ? Integer.MAX_VALUE: 100 * 1024 * 1024; // LUCENE-1566 private int chunkSize = DEFAULT_READ_CHUNK_SIZE; /** * Sets the maximum number of bytes read at once from the * underlying file during {@link IndexInput#readBytes}. * The default value is {@link #DEFAULT_READ_CHUNK_SIZE}; * *
    This was introduced due to Sun * JVM Bug 6478546, which throws an incorrect * OutOfMemoryError when attempting to read too many bytes * at once. It only happens on 32bit JVMs with a large * maximum heap size.
 *
    Changes to this value will not impact any * already-opened {@link IndexInput}s. You should call * this before attempting to open an index on the * directory.
 *
     NOTE: This value should be as large as * possible to reduce any possible performance impact. If * you still encounter an incorrect OutOfMemoryError, * try lowering the chunk size.
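 * <p>A minimal sketch of lowering the chunk size on a 32bit JVM (the path and the
 * 8 MB figure are illustrative examples, not recommendations):
 * <pre>
 *   FSDirectory dir = FSDirectory.open(new File("/path/to/index"));
 *   dir.setReadChunkSize(8 * 1024 * 1024); // call before opening any IndexInput
 * </pre>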
    */ public final void setReadChunkSize(int chunkSize) { // LUCENE-1566 if (chunkSize <= 0) { throw new IllegalArgumentException("chunkSize must be positive"); } if (!Constants.JRE_IS_64BIT) { this.chunkSize = chunkSize; } } /** * The maximum number of bytes to read at once from the * underlying file during {@link IndexInput#readBytes}. * @see #setReadChunkSize */ public final int getReadChunkSize() { // LUCENE-1566 return chunkSize; } /** @deprecated Use SimpleFSDirectory.SimpleFSIndexInput instead */ protected static class FSIndexInput extends SimpleFSDirectory.SimpleFSIndexInput { /** @deprecated */ protected static class Descriptor extends SimpleFSDirectory.SimpleFSIndexInput.Descriptor { /** @deprecated */ public Descriptor(File file, String mode) throws IOException { super(file, mode); } } /** @deprecated */ public FSIndexInput(File path) throws IOException { super(path); } /** @deprecated */ public FSIndexInput(File path, int bufferSize) throws IOException { super(path, bufferSize); } } /** @deprecated Use SimpleFSDirectory.SimpleFSIndexOutput instead */ protected static class FSIndexOutput extends SimpleFSDirectory.SimpleFSIndexOutput { /** @deprecated */ public FSIndexOutput(File path) throws IOException { super(path); } } } lucene-2.9.4/src/java/org/apache/lucene/store/ChecksumIndexInput.java0000644000175000017500000000351111474320230026213 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.zip.CRC32; import java.util.zip.Checksum; /** Writes bytes through to a primary IndexOutput, computing * checksum as it goes. Note that you cannot use seek(). */ public class ChecksumIndexInput extends IndexInput { IndexInput main; Checksum digest; public ChecksumIndexInput(IndexInput main) { this.main = main; digest = new CRC32(); } public byte readByte() throws IOException { final byte b = main.readByte(); digest.update(b); return b; } public void readBytes(byte[] b, int offset, int len) throws IOException { main.readBytes(b, offset, len); digest.update(b, offset, len); } public long getChecksum() { return digest.getValue(); } public void close() throws IOException { main.close(); } public long getFilePointer() { return main.getFilePointer(); } public void seek(long pos) { throw new RuntimeException("not allowed"); } public long length() { return main.length(); } } lucene-2.9.4/src/java/org/apache/lucene/store/RAMInputStream.java0000644000175000017500000000703511474320230025261 0ustar janpascaljanpascalpackage org.apache.lucene.store; import java.io.IOException; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * A memory-resident {@link IndexInput} implementation. * * @version $Id: RAMInputStream.java 632120 2008-02-28 21:13:59Z mikemccand $ */ class RAMInputStream extends IndexInput implements Cloneable { static final int BUFFER_SIZE = RAMOutputStream.BUFFER_SIZE; private RAMFile file; private long length; private byte[] currentBuffer; private int currentBufferIndex; private int bufferPosition; private long bufferStart; private int bufferLength; RAMInputStream(RAMFile f) throws IOException { file = f; length = file.length; if (length/BUFFER_SIZE >= Integer.MAX_VALUE) { throw new IOException("Too large RAMFile! "+length); } // make sure that we switch to the // first needed buffer lazily currentBufferIndex = -1; currentBuffer = null; } public void close() { // nothing to do here } public long length() { return length; } public byte readByte() throws IOException { if (bufferPosition >= bufferLength) { currentBufferIndex++; switchCurrentBuffer(true); } return currentBuffer[bufferPosition++]; } public void readBytes(byte[] b, int offset, int len) throws IOException { while (len > 0) { if (bufferPosition >= bufferLength) { currentBufferIndex++; switchCurrentBuffer(true); } int remainInBuffer = bufferLength - bufferPosition; int bytesToCopy = len < remainInBuffer ? len : remainInBuffer; System.arraycopy(currentBuffer, bufferPosition, b, offset, bytesToCopy); offset += bytesToCopy; len -= bytesToCopy; bufferPosition += bytesToCopy; } } private final void switchCurrentBuffer(boolean enforceEOF) throws IOException { if (currentBufferIndex >= file.numBuffers()) { // end of file reached, no more buffers left if (enforceEOF) throw new IOException("Read past EOF"); else { // Force EOF if a read takes place at this position currentBufferIndex--; bufferPosition = BUFFER_SIZE; } } else { currentBuffer = (byte[]) file.getBuffer(currentBufferIndex); bufferPosition = 0; bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex; long buflen = length - bufferStart; bufferLength = buflen > BUFFER_SIZE ? BUFFER_SIZE : (int) buflen; } } public long getFilePointer() { return currentBufferIndex < 0 ? 0 : bufferStart + bufferPosition; } public void seek(long pos) throws IOException { if (currentBuffer==null || pos < bufferStart || pos >= bufferStart + BUFFER_SIZE) { currentBufferIndex = (int) (pos / BUFFER_SIZE); switchCurrentBuffer(false); } bufferPosition = (int) (pos % BUFFER_SIZE); } } lucene-2.9.4/src/java/org/apache/lucene/store/Directory.java0000644000175000017500000002112111474320230024402 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.index.IndexFileNameFilter; /** A Directory is a flat list of files. Files may be written once, when they * are created. Once a file is created it may only be opened for read, or * deleted. Random access is permitted both when reading and writing. * *
     Java's i/o APIs are not used directly, but rather all i/o is * through this API. This permits things such as:
 * <ul>
 * <li> implementation of RAM-based indices;
 * <li> implementation of indices stored in a database, via JDBC;
 * <li> implementation of an index as a single file;
 * </ul>
    * * Directory locking is implemented by an instance of {@link * LockFactory}, and can be changed for each Directory * instance using {@link #setLockFactory}. * */ public abstract class Directory { volatile protected boolean isOpen = true; /** Holds the LockFactory instance (implements locking for * this Directory instance). */ protected LockFactory lockFactory; /** List the files in the directory. * * @deprecated For some Directory implementations ({@link * FSDirectory}, and its subclasses), this method * silently filters its results to include only index * files. Please use {@link #listAll} instead, which * does no filtering. */ public abstract String[] list() throws IOException; /** Returns an array of strings, one for each file in the * directory. Unlike {@link #list} this method does no * filtering of the contents in a directory, and it will * never return null (throws IOException instead). * * Currently this method simply falls back to {@link * #list} for Directory impls outside of Lucene's core & * contrib, but in 3.0 that method will be removed and * this method will become abstract. */ public String[] listAll() throws IOException { return list(); } /** Returns true iff a file with the given name exists. */ public abstract boolean fileExists(String name) throws IOException; /** Returns the time the named file was last modified. */ public abstract long fileModified(String name) throws IOException; /** Set the modified time of an existing file to now. */ public abstract void touchFile(String name) throws IOException; /** Removes an existing file in the directory. */ public abstract void deleteFile(String name) throws IOException; /** Renames an existing file in the directory. * If a file already exists with the new name, then it is replaced. * This replacement is not guaranteed to be atomic. * @deprecated */ public abstract void renameFile(String from, String to) throws IOException; /** Returns the length of a file in the directory. */ public abstract long fileLength(String name) throws IOException; /** Creates a new, empty file in the directory with the given name. Returns a stream writing this file. */ public abstract IndexOutput createOutput(String name) throws IOException; /** Ensure that any writes to this file are moved to * stable storage. Lucene uses this to properly commit * changes to the index, to prevent a machine/OS crash * from corrupting the index. */ public void sync(String name) throws IOException {} /** Returns a stream reading an existing file. */ public abstract IndexInput openInput(String name) throws IOException; /** Returns a stream reading an existing file, with the * specified read buffer size. The particular Directory * implementation may ignore the buffer size. Currently * the only Directory implementations that respect this * parameter are {@link FSDirectory} and {@link * org.apache.lucene.index.CompoundFileReader}. */ public IndexInput openInput(String name, int bufferSize) throws IOException { return openInput(name); } /** Construct a {@link Lock}. * @param name the name of the lock file */ public Lock makeLock(String name) { return lockFactory.makeLock(name); } /** * Attempt to clear (forcefully unlock and remove) the * specified lock. Only call this at a time when you are * certain this lock is no longer in use. * @param name name of the lock to be cleared. */ public void clearLock(String name) throws IOException { if (lockFactory != null) { lockFactory.clearLock(name); } } /** Closes the store. 
*/ public abstract void close() throws IOException; /** * Set the LockFactory that this Directory instance should * use for its locking implementation. Each * instance of * LockFactory should only be used for one directory (ie, * do not share a single instance across multiple * Directories). * * @param lockFactory instance of {@link LockFactory}. */ public void setLockFactory(LockFactory lockFactory) { this.lockFactory = lockFactory; lockFactory.setLockPrefix(this.getLockID()); } /** * Get the LockFactory that this Directory instance is * using for its locking implementation. Note that this * may be null for Directory implementations that provide * their own locking implementation. */ public LockFactory getLockFactory() { return this.lockFactory; } /** * Return a string identifier that uniquely differentiates * this Directory instance from other Directory instances. * This ID should be the same if two Directory instances * (even in different JVMs and/or on different machines) * are considered "the same index". This is how locking * "scopes" to the right index. */ public String getLockID() { return this.toString(); } public String toString() { return super.toString() + " lockFactory=" + getLockFactory(); } /** * Copy contents of a directory src to a directory dest. * If a file in src already exists in dest then the * one in dest will be blindly overwritten. * *
    NOTE: the source directory cannot change * while this method is running. Otherwise the results * are undefined and you could easily hit a * FileNotFoundException. * *
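 * <p>A minimal sketch of copying an on-disk index into memory with this method
 * (the source path is illustrative only):
 * <pre>
 *   Directory src = FSDirectory.open(new File("/path/to/index"));
 *   Directory dest = new RAMDirectory();
 *   Directory.copy(src, dest, true); // closeDirSrc=true also closes src
 * </pre>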
    NOTE: this method only copies files that look * like index files (ie, have extensions matching the * known extensions of index files). * * @param src source directory * @param dest destination directory * @param closeDirSrc if true, call {@link #close()} method on source directory * @throws IOException */ public static void copy(Directory src, Directory dest, boolean closeDirSrc) throws IOException { final String[] files = src.listAll(); IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); byte[] buf = new byte[BufferedIndexOutput.BUFFER_SIZE]; for (int i = 0; i < files.length; i++) { if (!filter.accept(null, files[i])) continue; IndexOutput os = null; IndexInput is = null; try { // create file in dest directory os = dest.createOutput(files[i]); // read current file is = src.openInput(files[i]); // and copy to dest directory long len = is.length(); long readCount = 0; while (readCount < len) { int toRead = readCount + BufferedIndexOutput.BUFFER_SIZE > len ? (int)(len - readCount) : BufferedIndexOutput.BUFFER_SIZE; is.readBytes(buf, 0, toRead); os.writeBytes(buf, toRead); readCount += toRead; } } finally { // graceful cleanup try { if (os != null) os.close(); } finally { if (is != null) is.close(); } } } if(closeDirSrc) src.close(); } /** * @throws AlreadyClosedException if this Directory is closed */ protected final void ensureOpen() throws AlreadyClosedException { if (!isOpen) throw new AlreadyClosedException("this Directory is closed"); } } lucene-2.9.4/src/java/org/apache/lucene/store/LockVerifyServer.java0000644000175000017500000000547511474320230025720 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.net.ServerSocket; import java.net.Socket; import java.io.OutputStream; import java.io.InputStream; import java.io.IOException; /** * Simple standalone server that must be running when you * use {@link VerifyingLockFactory}. This server simply * verifies at most one process holds the lock at a time. * Run without any args to see usage. 
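 * <p>A sketch of a typical invocation from the command line (the port number is
 * illustrative only):
 * <pre>
 *   java org.apache.lucene.store.LockVerifyServer 7777
 * </pre>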
* * @see VerifyingLockFactory * @see LockStressTest */ public class LockVerifyServer { private static String getTime(long startTime) { return "[" + ((System.currentTimeMillis()-startTime)/1000) + "s] "; } public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("\nUsage: java org.apache.lucene.store.LockVerifyServer port\n"); System.exit(1); } final int port = Integer.parseInt(args[0]); ServerSocket s = new ServerSocket(port); s.setReuseAddress(true); System.out.println("\nReady on port " + port + "..."); int lockedID = 0; long startTime = System.currentTimeMillis(); while(true) { Socket cs = s.accept(); OutputStream out = cs.getOutputStream(); InputStream in = cs.getInputStream(); int id = in.read(); int command = in.read(); boolean err = false; if (command == 1) { // Locked if (lockedID != 0) { err = true; System.out.println(getTime(startTime) + " ERROR: id " + id + " got lock, but " + lockedID + " already holds the lock"); } lockedID = id; } else if (command == 0) { if (lockedID != id) { err = true; System.out.println(getTime(startTime) + " ERROR: id " + id + " released the lock, but " + lockedID + " is the one holding the lock"); } lockedID = 0; } else throw new RuntimeException("unrecognized command " + command); System.out.print("."); if (err) out.write(1); else out.write(0); out.close(); in.close(); cs.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/store/LockReleaseFailedException.java0000644000175000017500000000215011474320230027614 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.store; import java.io.IOException; /** * This exception is thrown when the write.lock * could not be released. * @see Lock#release(). */ public class LockReleaseFailedException extends IOException { public LockReleaseFailedException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/store/NoLockFactory.java0000644000175000017500000000343311474320230025161 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * Use this {@link LockFactory} to disable locking entirely. * This LockFactory is used when you call {@link FSDirectory#setDisableLocks}. * Only one instance of this lock is created. You should call {@link * #getNoLockFactory()} to get the instance. * * @see LockFactory */ public class NoLockFactory extends LockFactory { // Single instance returned whenever makeLock is called. private static NoLock singletonLock = new NoLock(); private static NoLockFactory singleton = new NoLockFactory(); public static NoLockFactory getNoLockFactory() { return singleton; } public Lock makeLock(String lockName) { return singletonLock; } public void clearLock(String lockName) {}; }; class NoLock extends Lock { public boolean obtain() throws IOException { return true; } public void release() { } public boolean isLocked() { return false; } public String toString() { return "NoLock"; } } lucene-2.9.4/src/java/org/apache/lucene/store/IndexOutput.java0000644000175000017500000001657311474320230024745 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Map; import java.util.Iterator; import org.apache.lucene.util.UnicodeUtil; /** Abstract base class for output to a file in a Directory. A random-access * output stream. Used for all Lucene index output operations. * @see Directory * @see IndexInput */ public abstract class IndexOutput { /** Writes a single byte. * @see IndexInput#readByte() */ public abstract void writeByte(byte b) throws IOException; /** Writes an array of bytes. * @param b the bytes to write * @param length the number of bytes to write * @see IndexInput#readBytes(byte[],int,int) */ public void writeBytes(byte[] b, int length) throws IOException { writeBytes(b, 0, length); } /** Writes an array of bytes. * @param b the bytes to write * @param offset the offset in the byte array * @param length the number of bytes to write * @see IndexInput#readBytes(byte[],int,int) */ public abstract void writeBytes(byte[] b, int offset, int length) throws IOException; /** Writes an int as four bytes. * @see IndexInput#readInt() */ public void writeInt(int i) throws IOException { writeByte((byte)(i >> 24)); writeByte((byte)(i >> 16)); writeByte((byte)(i >> 8)); writeByte((byte) i); } /** Writes an int in a variable-length format. Writes between one and * five bytes. Smaller values take fewer bytes. Negative numbers are not * supported. * @see IndexInput#readVInt() */ public void writeVInt(int i) throws IOException { while ((i & ~0x7F) != 0) { writeByte((byte)((i & 0x7f) | 0x80)); i >>>= 7; } writeByte((byte)i); } /** Writes a long as eight bytes. 
* @see IndexInput#readLong() */ public void writeLong(long i) throws IOException { writeInt((int) (i >> 32)); writeInt((int) i); } /** Writes an long in a variable-length format. Writes between one and five * bytes. Smaller values take fewer bytes. Negative numbers are not * supported. * @see IndexInput#readVLong() */ public void writeVLong(long i) throws IOException { while ((i & ~0x7F) != 0) { writeByte((byte)((i & 0x7f) | 0x80)); i >>>= 7; } writeByte((byte)i); } /** Writes a string. * @see IndexInput#readString() */ public void writeString(String s) throws IOException { final UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8Result); writeVInt(utf8Result.length); writeBytes(utf8Result.result, 0, utf8Result.length); } /** Writes a sub sequence of characters from s as the old * format (modified UTF-8 encoded bytes). * @param s the source of the characters * @param start the first character in the sequence * @param length the number of characters in the sequence * @deprecated -- please pre-convert to utf8 bytes * instead or use {@link #writeString} */ public void writeChars(String s, int start, int length) throws IOException { final int end = start + length; for (int i = start; i < end; i++) { final int code = (int)s.charAt(i); if (code >= 0x01 && code <= 0x7F) writeByte((byte)code); else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { writeByte((byte)(0xC0 | (code >> 6))); writeByte((byte)(0x80 | (code & 0x3F))); } else { writeByte((byte)(0xE0 | (code >>> 12))); writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); writeByte((byte)(0x80 | (code & 0x3F))); } } } /** Writes a sub sequence of characters from char[] as * the old format (modified UTF-8 encoded bytes). * @param s the source of the characters * @param start the first character in the sequence * @param length the number of characters in the sequence * @deprecated -- please pre-convert to utf8 bytes instead or use {@link #writeString} */ public void writeChars(char[] s, int start, int length) throws IOException { final int end = start + length; for (int i = start; i < end; i++) { final int code = (int)s[i]; if (code >= 0x01 && code <= 0x7F) writeByte((byte)code); else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { writeByte((byte)(0xC0 | (code >> 6))); writeByte((byte)(0x80 | (code & 0x3F))); } else { writeByte((byte)(0xE0 | (code >>> 12))); writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); writeByte((byte)(0x80 | (code & 0x3F))); } } } private static int COPY_BUFFER_SIZE = 16384; private byte[] copyBuffer; /** Copy numBytes bytes from input to ourself. */ public void copyBytes(IndexInput input, long numBytes) throws IOException { assert numBytes >= 0: "numBytes=" + numBytes; long left = numBytes; if (copyBuffer == null) copyBuffer = new byte[COPY_BUFFER_SIZE]; while(left > 0) { final int toCopy; if (left > COPY_BUFFER_SIZE) toCopy = COPY_BUFFER_SIZE; else toCopy = (int) left; input.readBytes(copyBuffer, 0, toCopy); writeBytes(copyBuffer, 0, toCopy); left -= toCopy; } } /** Forces any buffered output to be written. */ public abstract void flush() throws IOException; /** Closes this stream to further operations. */ public abstract void close() throws IOException; /** Returns the current position in this file, where the next write will * occur. * @see #seek(long) */ public abstract long getFilePointer(); /** Sets current position in this file, where the next write will occur. 
* @see #getFilePointer() */ public abstract void seek(long pos) throws IOException; /** The number of bytes in the file. */ public abstract long length() throws IOException; /** Set the file length. By default, this method does * nothing (it's optional for a Directory to implement * it). But, certain Directory implementations (for * example @see FSDirectory) can use this to inform the * underlying IO system to pre-allocate the file to the * specified size. If the length is longer than the * current file length, the bytes added to the file are * undefined. Otherwise the file is truncated. * @param length file length */ public void setLength(long length) throws IOException {}; // map must be Map public void writeStringStringMap(Map map) throws IOException { if (map == null) { writeInt(0); } else { writeInt(map.size()); final Iterator it = map.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); writeString((String) entry.getKey()); writeString((String) entry.getValue()); } } } } lucene-2.9.4/src/java/org/apache/lucene/store/RAMDirectory.java0000644000175000017500000001724111474320230024752 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.FileNotFoundException; import java.io.File; import java.io.Serializable; import java.util.HashMap; import java.util.Iterator; import java.util.Set; /** * A memory-resident {@link Directory} implementation. Locking * implementation is by default the {@link SingleInstanceLockFactory} * but can be changed with {@link #setLockFactory}. * * @version $Id: RAMDirectory.java 886288 2009-12-02 19:43:22Z mikemccand $ */ public class RAMDirectory extends Directory implements Serializable { private static final long serialVersionUID = 1l; HashMap fileMap = new HashMap(); long sizeInBytes; // ***** // Lock acquisition sequence: RAMDirectory, then RAMFile // ***** /** Constructs an empty {@link Directory}. */ public RAMDirectory() { setLockFactory(new SingleInstanceLockFactory()); } /** * Creates a new RAMDirectory instance from a different * Directory implementation. This can be used to load * a disk-based index into memory. *
    * This should be used only with indices that can fit into memory. *
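 * <p>A minimal usage sketch (the path is illustrative only):
 * <pre>
 *   Directory onDisk = FSDirectory.open(new File("/path/to/index"));
 *   Directory inMemory = new RAMDirectory(onDisk);
 *   onDisk.close(); // the RAMDirectory holds a complete copy
 * </pre>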
    * Note that the resulting RAMDirectory instance is fully * independent from the original Directory (it is a * complete copy). Any subsequent changes to the * original Directory will not be visible in the * RAMDirectory instance. * * @param dir a Directory value * @exception IOException if an error occurs */ public RAMDirectory(Directory dir) throws IOException { this(dir, false); } private RAMDirectory(Directory dir, boolean closeDir) throws IOException { this(); Directory.copy(dir, this, closeDir); } /** * Creates a new RAMDirectory instance from the {@link FSDirectory}. * * @param dir a File specifying the index directory * * @see #RAMDirectory(Directory) * @deprecated Use {@link #RAMDirectory(Directory)} instead */ public RAMDirectory(File dir) throws IOException { this(FSDirectory.getDirectory(dir), true); } /** * Creates a new RAMDirectory instance from the {@link FSDirectory}. * * @param dir a String specifying the full index directory path * * @see #RAMDirectory(Directory) * @deprecated Use {@link #RAMDirectory(Directory)} instead */ public RAMDirectory(String dir) throws IOException { this(FSDirectory.getDirectory(dir), true); } public synchronized final String[] list() { return listAll(); } public synchronized final String[] listAll() { ensureOpen(); Set fileNames = fileMap.keySet(); String[] result = new String[fileNames.size()]; int i = 0; Iterator it = fileNames.iterator(); while (it.hasNext()) result[i++] = (String)it.next(); return result; } /** Returns true iff the named file exists in this directory. */ public final boolean fileExists(String name) { ensureOpen(); RAMFile file; synchronized (this) { file = (RAMFile)fileMap.get(name); } return file != null; } /** Returns the time the named file was last modified. * @throws IOException if the file does not exist */ public final long fileModified(String name) throws IOException { ensureOpen(); RAMFile file; synchronized (this) { file = (RAMFile)fileMap.get(name); } if (file==null) throw new FileNotFoundException(name); return file.getLastModified(); } /** Set the modified time of an existing file to now. * @throws IOException if the file does not exist */ public void touchFile(String name) throws IOException { ensureOpen(); RAMFile file; synchronized (this) { file = (RAMFile)fileMap.get(name); } if (file==null) throw new FileNotFoundException(name); long ts2, ts1 = System.currentTimeMillis(); do { try { Thread.sleep(0, 1); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } ts2 = System.currentTimeMillis(); } while(ts1 == ts2); file.setLastModified(ts2); } /** Returns the length in bytes of a file in the directory. * @throws IOException if the file does not exist */ public final long fileLength(String name) throws IOException { ensureOpen(); RAMFile file; synchronized (this) { file = (RAMFile)fileMap.get(name); } if (file==null) throw new FileNotFoundException(name); return file.getLength(); } /** Return total size in bytes of all files in this * directory. This is currently quantized to * RAMOutputStream.BUFFER_SIZE. */ public synchronized final long sizeInBytes() { ensureOpen(); return sizeInBytes; } /** Removes an existing file in the directory. 
* @throws IOException if the file does not exist */ public synchronized void deleteFile(String name) throws IOException { ensureOpen(); RAMFile file = (RAMFile)fileMap.get(name); if (file!=null) { fileMap.remove(name); file.directory = null; sizeInBytes -= file.sizeInBytes; } else throw new FileNotFoundException(name); } /** Renames an existing file in the directory. * @throws FileNotFoundException if from does not exist * @deprecated */ public synchronized final void renameFile(String from, String to) throws IOException { ensureOpen(); RAMFile fromFile = (RAMFile)fileMap.get(from); if (fromFile==null) throw new FileNotFoundException(from); RAMFile toFile = (RAMFile)fileMap.get(to); if (toFile!=null) { sizeInBytes -= toFile.sizeInBytes; // updates to RAMFile.sizeInBytes synchronized on directory toFile.directory = null; } fileMap.remove(from); fileMap.put(to, fromFile); } /** Creates a new, empty file in the directory with the given name. Returns a stream writing this file. */ public IndexOutput createOutput(String name) throws IOException { ensureOpen(); RAMFile file = new RAMFile(this); synchronized (this) { RAMFile existing = (RAMFile)fileMap.get(name); if (existing!=null) { sizeInBytes -= existing.sizeInBytes; existing.directory = null; } fileMap.put(name, file); } return new RAMOutputStream(file); } /** Returns a stream reading an existing file. */ public IndexInput openInput(String name) throws IOException { ensureOpen(); RAMFile file; synchronized (this) { file = (RAMFile)fileMap.get(name); } if (file == null) throw new FileNotFoundException(name); return new RAMInputStream(file); } /** Closes the store to future operations, releasing associated memory. */ public void close() { isOpen = false; fileMap = null; } } lucene-2.9.4/src/java/org/apache/lucene/store/NIOFSDirectory.java0000644000175000017500000001525511474320230025214 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to You under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ import java.io.File; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.ClosedChannelException; // javadoc @link import java.nio.channels.FileChannel; /** * An {@link FSDirectory} implementation that uses * java.nio's FileChannel's positional read, which allows * multiple threads to read from the same file without * synchronizing. * *
    This class only uses FileChannel when reading; writing * is achieved with {@link SimpleFSDirectory.SimpleFSIndexOutput}. * *
    NOTE: NIOFSDirectory is not recommended on Windows because of a bug * in how FileChannel.read is implemented in Sun's JRE. * Inside of the implementation the position is apparently * synchronized. See here * for details. *
 *
    * NOTE: Accessing this class either directly or * indirectly from a thread while it's interrupted can close the * underlying file descriptor immediately if at the same time the thread is * blocked on IO. The file descriptor will remain closed and subsequent access * to {@link NIOFSDirectory} will throw a {@link ClosedChannelException}. If * your application uses either {@link Thread#interrupt()} or * Future#cancel(boolean) (on Java 1.5) you should use * {@link SimpleFSDirectory} in favor of {@link NIOFSDirectory}. *
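 * <p>A minimal sketch of instantiating this class directly instead of relying on
 * {@link FSDirectory#open} (the path is illustrative only):
 * <pre>
 *   Directory dir = new NIOFSDirectory(new File("/path/to/index"));
 * </pre>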
    */ public class NIOFSDirectory extends FSDirectory { /** Create a new NIOFSDirectory for the named location. * * @param path the path of the directory * @param lockFactory the lock factory to use, or null for the default * ({@link NativeFSLockFactory}); * @throws IOException */ public NIOFSDirectory(File path, LockFactory lockFactory) throws IOException { super(path, lockFactory); } /** Create a new NIOFSDirectory for the named location and {@link NativeFSLockFactory}. * * @param path the path of the directory * @throws IOException */ public NIOFSDirectory(File path) throws IOException { super(path, null); } // back compatibility so FSDirectory can instantiate via reflection /** @deprecated */ NIOFSDirectory() {} /** Creates an IndexInput for the file with the given name. */ public IndexInput openInput(String name, int bufferSize) throws IOException { ensureOpen(); return new NIOFSIndexInput(new File(getFile(), name), bufferSize, getReadChunkSize()); } /** Creates an IndexOutput for the file with the given name. */ public IndexOutput createOutput(String name) throws IOException { initOutput(name); return new SimpleFSDirectory.SimpleFSIndexOutput(new File(directory, name)); } protected static class NIOFSIndexInput extends SimpleFSDirectory.SimpleFSIndexInput { private ByteBuffer byteBuf; // wraps the buffer for NIO private byte[] otherBuffer; private ByteBuffer otherByteBuf; final FileChannel channel; /** @deprecated Please use ctor taking chunkSize */ public NIOFSIndexInput(File path, int bufferSize) throws IOException { this(path, bufferSize, FSDirectory.DEFAULT_READ_CHUNK_SIZE); } public NIOFSIndexInput(File path, int bufferSize, int chunkSize) throws IOException { super(path, bufferSize, chunkSize); channel = file.getChannel(); } protected void newBuffer(byte[] newBuffer) { super.newBuffer(newBuffer); byteBuf = ByteBuffer.wrap(newBuffer); } public void close() throws IOException { if (!isClone && file.isOpen) { // Close the channel & file try { channel.close(); } finally { file.close(); } } } protected void readInternal(byte[] b, int offset, int len) throws IOException { final ByteBuffer bb; // Determine the ByteBuffer we should use if (b == buffer && 0 == offset) { // Use our own pre-wrapped byteBuf: assert byteBuf != null; byteBuf.clear(); byteBuf.limit(len); bb = byteBuf; } else { if (offset == 0) { if (otherBuffer != b) { // Now wrap this other buffer; with compound // file, we are repeatedly called with its // buffer, so we wrap it once and then re-use it // on subsequent calls otherBuffer = b; otherByteBuf = ByteBuffer.wrap(b); } else otherByteBuf.clear(); otherByteBuf.limit(len); bb = otherByteBuf; } else { // Always wrap when offset != 0 bb = ByteBuffer.wrap(b, offset, len); } } int readOffset = bb.position(); int readLength = bb.limit() - readOffset; assert readLength == len; long pos = getFilePointer(); try { while (readLength > 0) { final int limit; if (readLength > chunkSize) { // LUCENE-1566 - work around JVM Bug by breaking // very large reads into chunks limit = readOffset + chunkSize; } else { limit = readOffset + readLength; } bb.limit(limit); int i = channel.read(bb, pos); if (i == -1) { throw new IOException("read past EOF"); } pos += i; readOffset += i; readLength -= i; } } catch (OutOfMemoryError e) { // propagate OOM up and add a hint for 32bit VM Users hitting the bug // with a large chunk size in the fast path. 
final OutOfMemoryError outOfMemoryError = new OutOfMemoryError( "OutOfMemoryError likely caused by the Sun VM Bug described in " + "https://issues.apache.org/jira/browse/LUCENE-1566; try calling FSDirectory.setReadChunkSize " + "with a value smaller than the current chunk size (" + chunkSize + ")"); outOfMemoryError.initCause(e); throw outOfMemoryError; } } } } lucene-2.9.4/src/java/org/apache/lucene/store/SimpleFSDirectory.java0000644000175000017500000001654111474320230026017 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; /** A straightforward implementation of {@link FSDirectory} * using java.io.RandomAccessFile. However, this class has * poor concurrent performance (multiple threads will * bottleneck) as it synchronizes when multiple threads * read from the same file. It's usually better to use * {@link NIOFSDirectory} or {@link MMapDirectory} instead. */ public class SimpleFSDirectory extends FSDirectory { /** Create a new SimpleFSDirectory for the named location. * * @param path the path of the directory * @param lockFactory the lock factory to use, or null for the default * ({@link NativeFSLockFactory}); * @throws IOException */ public SimpleFSDirectory(File path, LockFactory lockFactory) throws IOException { super(path, lockFactory); } /** Create a new SimpleFSDirectory for the named location and {@link NativeFSLockFactory}. * * @param path the path of the directory * @throws IOException */ public SimpleFSDirectory(File path) throws IOException { super(path, null); } // back compatibility so FSDirectory can instantiate via reflection /** @deprecated */ SimpleFSDirectory() {} /** Creates an IndexOutput for the file with the given name. */ public IndexOutput createOutput(String name) throws IOException { initOutput(name); return new SimpleFSIndexOutput(new File(directory, name)); } /** Creates an IndexInput for the file with the given name. 
*/ public IndexInput openInput(String name, int bufferSize) throws IOException { ensureOpen(); return new SimpleFSIndexInput(new File(directory, name), bufferSize, getReadChunkSize()); } protected static class SimpleFSIndexInput extends BufferedIndexInput { protected static class Descriptor extends RandomAccessFile { // remember if the file is open, so that we don't try to close it // more than once protected volatile boolean isOpen; long position; final long length; public Descriptor(File file, String mode) throws IOException { super(file, mode); isOpen=true; length=length(); } public void close() throws IOException { if (isOpen) { isOpen=false; super.close(); } } } protected final Descriptor file; boolean isClone; // LUCENE-1566 - maximum read length on a 32bit JVM to prevent incorrect OOM protected final int chunkSize; /** @deprecated Please use ctor taking chunkSize */ public SimpleFSIndexInput(File path) throws IOException { this(path, BufferedIndexInput.BUFFER_SIZE, SimpleFSDirectory.DEFAULT_READ_CHUNK_SIZE); } /** @deprecated Please use ctor taking chunkSize */ public SimpleFSIndexInput(File path, int bufferSize) throws IOException { this(path, bufferSize, SimpleFSDirectory.DEFAULT_READ_CHUNK_SIZE); } public SimpleFSIndexInput(File path, int bufferSize, int chunkSize) throws IOException { super(bufferSize); file = new Descriptor(path, "r"); this.chunkSize = chunkSize; } /** IndexInput methods */ protected void readInternal(byte[] b, int offset, int len) throws IOException { synchronized (file) { long position = getFilePointer(); if (position != file.position) { file.seek(position); file.position = position; } int total = 0; try { do { final int readLength; if (total + chunkSize > len) { readLength = len - total; } else { // LUCENE-1566 - work around JVM Bug by breaking very large reads into chunks readLength = chunkSize; } final int i = file.read(b, offset + total, readLength); if (i == -1) { throw new IOException("read past EOF"); } file.position += i; total += i; } while (total < len); } catch (OutOfMemoryError e) { // propagate OOM up and add a hint for 32bit VM Users hitting the bug // with a large chunk size in the fast path. final OutOfMemoryError outOfMemoryError = new OutOfMemoryError( "OutOfMemoryError likely caused by the Sun VM Bug described in " + "https://issues.apache.org/jira/browse/LUCENE-1566; try calling FSDirectory.setReadChunkSize " + "with a value smaller than the current chunk size (" + chunkSize + ")"); outOfMemoryError.initCause(e); throw outOfMemoryError; } } } public void close() throws IOException { // only close the file if this is not a clone if (!isClone) file.close(); } protected void seekInternal(long position) { } public long length() { return file.length; } public Object clone() { SimpleFSIndexInput clone = (SimpleFSIndexInput)super.clone(); clone.isClone = true; return clone; } /** Method used for testing. Returns true if the underlying * file descriptor is valid. 
*/ boolean isFDValid() throws IOException { return file.getFD().valid(); } } protected static class SimpleFSIndexOutput extends BufferedIndexOutput { RandomAccessFile file = null; // remember if the file is open, so that we don't try to close it // more than once private volatile boolean isOpen; public SimpleFSIndexOutput(File path) throws IOException { file = new RandomAccessFile(path, "rw"); isOpen = true; } /** output methods: */ public void flushBuffer(byte[] b, int offset, int size) throws IOException { file.write(b, offset, size); } public void close() throws IOException { // only close the file if it has not been closed yet if (isOpen) { boolean success = false; try { super.close(); success = true; } finally { isOpen = false; if (!success) { try { file.close(); } catch (Throwable t) { // Suppress so we don't mask original exception } } else file.close(); } } } /** Random-access methods */ public void seek(long pos) throws IOException { super.seek(pos); file.seek(pos); } public long length() throws IOException { return file.length(); } public void setLength(long length) throws IOException { file.setLength(length); } } } lucene-2.9.4/src/java/org/apache/lucene/store/ChecksumIndexOutput.java0000644000175000017500000000533211474320230026417 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.zip.CRC32; import java.util.zip.Checksum; /** Writes bytes through to a primary IndexOutput, computing * checksum. Note that you cannot use seek().*/ public class ChecksumIndexOutput extends IndexOutput { IndexOutput main; Checksum digest; public ChecksumIndexOutput(IndexOutput main) { this.main = main; digest = new CRC32(); } public void writeByte(byte b) throws IOException { digest.update(b); main.writeByte(b); } public void writeBytes(byte[] b, int offset, int length) throws IOException { digest.update(b, offset, length); main.writeBytes(b, offset, length); } public long getChecksum() { return digest.getValue(); } public void flush() throws IOException { main.flush(); } public void close() throws IOException { main.close(); } public long getFilePointer() { return main.getFilePointer(); } public void seek(long pos) { throw new RuntimeException("not allowed"); } /** * Starts but does not complete the commit of this file (= * writing of the final checksum at the end). After this * is called must call {@link #finishCommit} and the * {@link #close} to complete the commit. */ public void prepareCommit() throws IOException { final long checksum = getChecksum(); // Intentionally write a mismatched checksum. This is // because we want to 1) test, as best we can, that we // are able to write a long to the file, but 2) not // actually "commit" the file yet. 
This (prepare // commit) is phase 1 of a two-phase commit. final long pos = main.getFilePointer(); main.writeLong(checksum-1); main.flush(); main.seek(pos); } /** See {@link #prepareCommit} */ public void finishCommit() throws IOException { main.writeLong(getChecksum()); } public long length() throws IOException { return main.length(); } } lucene-2.9.4/src/java/org/apache/lucene/store/MMapDirectory.java0000644000175000017500000003530211474320230025163 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.File; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.BufferUnderflowException; import java.nio.channels.ClosedChannelException; // javadoc @link import java.nio.channels.FileChannel; import java.nio.channels.FileChannel.MapMode; import java.security.AccessController; import java.security.PrivilegedExceptionAction; import java.security.PrivilegedActionException; import java.lang.reflect.Method; import org.apache.lucene.util.Constants; /** File-based {@link Directory} implementation that uses * mmap for reading, and {@link * SimpleFSDirectory.SimpleFSIndexOutput} for writing. * *

    NOTE: memory mapping uses up a portion of the * virtual memory address space in your process equal to the * size of the file being mapped. Before using this class, * be sure your have plenty of virtual address space, e.g. by * using a 64 bit JRE, or a 32 bit JRE with indexes that are * guaranteed to fit within the address space. * On 32 bit platforms also consult {@link #setMaxChunkSize} * if you have problems with mmap failing because of fragmented * address space. If you get an OutOfMemoryException, it is recommended * to reduce the chunk size, until it works. * *
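     *
     * As a rough sketch (the index path and the 64 MiB figure are examples only,
     * not recommendations made by this javadoc):
     *
     *   MMapDirectory dir = new MMapDirectory(new File("/path/to/index"));
     *   if (!Constants.JRE_IS_64BIT) {
     *     dir.setMaxChunkSize(64 * 1024 * 1024); // smaller mappings fit a fragmented 32 bit address space more easily
     *   }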

    Due to * this bug in Sun's JRE, MMapDirectory's {@link IndexInput#close} * is unable to close the underlying OS file handle. Only when GC * finally collects the underlying objects, which could be quite * some time later, will the file handle be closed. * *

    This will consume additional transient disk usage: on Windows, * attempts to delete or overwrite the files will result in an * exception; on other platforms, which typically have a "delete on * last close" semantics, while such operations will succeed, the bytes * are still consuming space on disk. For many applications this * limitation is not a problem (e.g. if you have plenty of disk space, * and you don't rely on overwriting files on Windows) but it's still * an important limitation to be aware of. * *

    This class supplies the workaround mentioned in the bug report * (disabled by default, see {@link #setUseUnmap}), which may fail on * non-Sun JVMs. It forcefully unmaps the buffer on close by using * an undocumented internal cleanup functionality. * {@link #UNMAP_SUPPORTED} is true, if the workaround * can be enabled (with no guarantees). *
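     *
     * A short sketch of enabling the workaround only where it is reported as
     * available (dir is assumed to be an MMapDirectory created as above):
     *
     *   if (MMapDirectory.UNMAP_SUPPORTED) {
     *     dir.setUseUnmap(true); // forcefully unmap buffers on close; see the warning on setUseUnmap
     *   }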

    * NOTE: Accessing this class either directly or * indirectly from a thread while it's interrupted can close the * underlying channel immediately if at the same time the thread is * blocked on IO. The channel will remain closed and subsequent access * to {@link MMapDirectory} will throw a {@link ClosedChannelException}. *

    */ public class MMapDirectory extends FSDirectory { /** Create a new MMapDirectory for the named location. * * @param path the path of the directory * @param lockFactory the lock factory to use, or null for the default * ({@link NativeFSLockFactory}); * @throws IOException */ public MMapDirectory(File path, LockFactory lockFactory) throws IOException { super(path, lockFactory); } /** Create a new MMapDirectory for the named location and {@link NativeFSLockFactory}. * * @param path the path of the directory * @throws IOException */ public MMapDirectory(File path) throws IOException { super(path, null); } // back compatibility so FSDirectory can instantiate via reflection /** @deprecated */ MMapDirectory() {} static final Class[] NO_PARAM_TYPES = new Class[0]; static final Object[] NO_PARAMS = new Object[0]; private boolean useUnmapHack = false; private int maxBBuf = Constants.JRE_IS_64BIT ? Integer.MAX_VALUE : (256*1024*1024); /** * true, if this platform supports unmapping mmapped files. */ public static final boolean UNMAP_SUPPORTED; static { boolean v; try { Class.forName("sun.misc.Cleaner"); Class.forName("java.nio.DirectByteBuffer") .getMethod("cleaner", NO_PARAM_TYPES); v = true; } catch (Exception e) { v = false; } UNMAP_SUPPORTED = v; } /** * This method enables the workaround for unmapping the buffers * from address space after closing {@link IndexInput}, that is * mentioned in the bug report. This hack may fail on non-Sun JVMs. * It forcefully unmaps the buffer on close by using * an undocumented internal cleanup functionality. *

    NOTE: Enabling this is completely unsupported * by Java and may lead to JVM crashes if IndexInput * is closed while another thread is still accessing it (SIGSEGV). * @throws IllegalArgumentException if {@link #UNMAP_SUPPORTED} * is false and the workaround cannot be enabled. */ public void setUseUnmap(final boolean useUnmapHack) { if (useUnmapHack && !UNMAP_SUPPORTED) throw new IllegalArgumentException("Unmap hack not supported on this platform!"); this.useUnmapHack=useUnmapHack; } /** * Returns true, if the unmap workaround is enabled. * @see #setUseUnmap */ public boolean getUseUnmap() { return useUnmapHack; } /** * Try to unmap the buffer, this method silently fails if no support * for that in the JVM. On Windows, this leads to the fact, * that mmapped files cannot be modified or deleted. */ final void cleanMapping(final ByteBuffer buffer) throws IOException { if (useUnmapHack) { try { AccessController.doPrivileged(new PrivilegedExceptionAction() { public Object run() throws Exception { final Method getCleanerMethod = buffer.getClass() .getMethod("cleaner", NO_PARAM_TYPES); getCleanerMethod.setAccessible(true); final Object cleaner = getCleanerMethod.invoke(buffer, NO_PARAMS); if (cleaner != null) { cleaner.getClass().getMethod("clean", NO_PARAM_TYPES) .invoke(cleaner, NO_PARAMS); } return null; } }); } catch (PrivilegedActionException e) { final IOException ioe = new IOException("unable to unmap the mapped buffer"); ioe.initCause(e.getCause()); throw ioe; } } } /** * Sets the maximum chunk size (default is {@link Integer#MAX_VALUE} for * 64 bit JVMs and 256 MiBytes for 32 bit JVMs) used for memory mapping. * Especially on 32 bit platform, the address space can be very fragmented, * so large index files cannot be mapped. * Using a lower chunk size makes the directory implementation a little * bit slower (as the correct chunk must be resolved on each seek) * but the chance is higher that mmap does not fail. On 64 bit * Java platforms, this parameter should always be {@link Integer#MAX_VALUE}, * as the address space is big enough. */ public void setMaxChunkSize(final int maxBBuf) { if (maxBBuf<=0) throw new IllegalArgumentException("Maximum chunk size for mmap must be >0"); this.maxBBuf=maxBBuf; } /** * Returns the current mmap chunk size. 
* @see #setMaxChunkSize */ public int getMaxChunkSize() { return maxBBuf; } private class MMapIndexInput extends IndexInput { private ByteBuffer buffer; private final long length; private boolean isClone = false; private MMapIndexInput(RandomAccessFile raf) throws IOException { this.length = raf.length(); this.buffer = raf.getChannel().map(MapMode.READ_ONLY, 0, length); } public byte readByte() throws IOException { try { return buffer.get(); } catch (BufferUnderflowException e) { throw new IOException("read past EOF"); } } public void readBytes(byte[] b, int offset, int len) throws IOException { try { buffer.get(b, offset, len); } catch (BufferUnderflowException e) { throw new IOException("read past EOF"); } } public long getFilePointer() { return buffer.position(); } public void seek(long pos) throws IOException { buffer.position((int)pos); } public long length() { return length; } public Object clone() { if (buffer == null) throw new AlreadyClosedException("MMapIndexInput already closed"); MMapIndexInput clone = (MMapIndexInput)super.clone(); clone.isClone = true; clone.buffer = buffer.duplicate(); return clone; } public void close() throws IOException { // unmap the buffer (if enabled) and at least unset it for GC try { if (isClone || buffer == null) return; cleanMapping(buffer); } finally { buffer = null; } } } // Because Java's ByteBuffer uses an int to address the // values, it's necessary to access a file > // Integer.MAX_VALUE in size using multiple byte buffers. private class MultiMMapIndexInput extends IndexInput { private ByteBuffer[] buffers; private int[] bufSizes; // keep here, ByteBuffer.size() method is optional private final long length; private int curBufIndex; private final int maxBufSize; private ByteBuffer curBuf; // redundant for speed: buffers[curBufIndex] private int curAvail; // redundant for speed: (bufSizes[curBufIndex] - curBuf.position()) private boolean isClone = false; public MultiMMapIndexInput(RandomAccessFile raf, int maxBufSize) throws IOException { this.length = raf.length(); this.maxBufSize = maxBufSize; if (maxBufSize <= 0) throw new IllegalArgumentException("Non positive maxBufSize: " + maxBufSize); if ((length / maxBufSize) > Integer.MAX_VALUE) throw new IllegalArgumentException ("RandomAccessFile too big for maximum buffer size: " + raf.toString()); int nrBuffers = (int) (length / maxBufSize); if (((long) nrBuffers * maxBufSize) <= length) nrBuffers++; this.buffers = new ByteBuffer[nrBuffers]; this.bufSizes = new int[nrBuffers]; long bufferStart = 0; FileChannel rafc = raf.getChannel(); for (int bufNr = 0; bufNr < nrBuffers; bufNr++) { int bufSize = (length > (bufferStart + maxBufSize)) ? maxBufSize : (int) (length - bufferStart); this.buffers[bufNr] = rafc.map(MapMode.READ_ONLY,bufferStart,bufSize); this.bufSizes[bufNr] = bufSize; bufferStart += bufSize; } seek(0L); } public byte readByte() throws IOException { // Performance might be improved by reading ahead into an array of // e.g. 128 bytes and readByte() from there. 
if (curAvail == 0) { curBufIndex++; if (curBufIndex >= buffers.length) throw new IOException("read past EOF"); curBuf = buffers[curBufIndex]; curBuf.position(0); curAvail = bufSizes[curBufIndex]; } curAvail--; return curBuf.get(); } public void readBytes(byte[] b, int offset, int len) throws IOException { while (len > curAvail) { curBuf.get(b, offset, curAvail); len -= curAvail; offset += curAvail; curBufIndex++; if (curBufIndex >= buffers.length) throw new IOException("read past EOF"); curBuf = buffers[curBufIndex]; curBuf.position(0); curAvail = bufSizes[curBufIndex]; } curBuf.get(b, offset, len); curAvail -= len; } public long getFilePointer() { return ((long) curBufIndex * maxBufSize) + curBuf.position(); } public void seek(long pos) throws IOException { curBufIndex = (int) (pos / maxBufSize); curBuf = buffers[curBufIndex]; int bufOffset = (int) (pos - ((long) curBufIndex * maxBufSize)); curBuf.position(bufOffset); curAvail = bufSizes[curBufIndex] - bufOffset; } public long length() { return length; } public Object clone() { if (buffers == null) throw new AlreadyClosedException("MultiMMapIndexInput already closed"); MultiMMapIndexInput clone = (MultiMMapIndexInput)super.clone(); clone.isClone = true; clone.buffers = new ByteBuffer[buffers.length]; // No need to clone bufSizes. // Since most clones will use only one buffer, duplicate() could also be // done lazy in clones, e.g. when adapting curBuf. for (int bufNr = 0; bufNr < buffers.length; bufNr++) { clone.buffers[bufNr] = buffers[bufNr].duplicate(); } try { clone.seek(getFilePointer()); } catch(IOException ioe) { RuntimeException newException = new RuntimeException(ioe); newException.initCause(ioe); throw newException; }; return clone; } public void close() throws IOException { try { if (isClone || buffers == null) return; for (int bufNr = 0; bufNr < buffers.length; bufNr++) { // unmap the buffer (if enabled) and at least unset it for GC try { cleanMapping(buffers[bufNr]); } finally { buffers[bufNr] = null; } } } finally { buffers = null; } } } /** Creates an IndexInput for the file with the given name. */ public IndexInput openInput(String name, int bufferSize) throws IOException { ensureOpen(); File f = new File(getFile(), name); RandomAccessFile raf = new RandomAccessFile(f, "r"); try { return (raf.length() <= (long) maxBBuf) ? (IndexInput) new MMapIndexInput(raf) : (IndexInput) new MultiMMapIndexInput(raf, maxBBuf); } finally { raf.close(); } } /** Creates an IndexOutput for the file with the given name. */ public IndexOutput createOutput(String name) throws IOException { initOutput(name); return new SimpleFSDirectory.SimpleFSIndexOutput(new File(directory, name)); } } lucene-2.9.4/src/java/org/apache/lucene/store/AlreadyClosedException.java0000644000175000017500000000211011474320230027025 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ /** * This exception is thrown when there is an attempt to * access something that has already been closed. */ public class AlreadyClosedException extends IllegalStateException { public AlreadyClosedException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/store/LockStressTest.java0000644000175000017500000001055211474320230025400 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.File; /** * Simple standalone tool that forever acquires & releases a * lock using a specific LockFactory. Run without any args * to see usage. * * @see VerifyingLockFactory * @see LockVerifyServer */ public class LockStressTest { public static void main(String[] args) throws Exception { if (args.length != 6) { System.out.println("\nUsage: java org.apache.lucene.store.LockStressTest myID verifierHostOrIP verifierPort lockFactoryClassName lockDirName sleepTime\n" + "\n" + " myID = int from 0 .. 
255 (should be unique for test process)\n" + " verifierHostOrIP = host name or IP address where LockVerifyServer is running\n" + " verifierPort = port that LockVerifyServer is listening on\n" + " lockFactoryClassName = primary LockFactory class that we will use\n" + " lockDirName = path to the lock directory (only set for Simple/NativeFSLockFactory)\n" + " sleepTimeMS = milliseconds to pause between each lock obtain/release\n" + "\n" + "You should run multiple instances of this process, each with its own\n" + "unique ID, and each pointing to the same lock directory, to verify\n" + "that locking is working correctly.\n" + "\n" + "Make sure you are first running LockVerifyServer.\n" + "\n"); System.exit(1); } final int myID = Integer.parseInt(args[0]); if (myID < 0 || myID > 255) { System.out.println("myID must be a unique int 0..255"); System.exit(1); } final String verifierHost = args[1]; final int verifierPort = Integer.parseInt(args[2]); final String lockFactoryClassName = args[3]; final String lockDirName = args[4]; final int sleepTimeMS = Integer.parseInt(args[5]); Class c; try { c = Class.forName(lockFactoryClassName); } catch (ClassNotFoundException e) { throw new IOException("unable to find LockClass " + lockFactoryClassName); } LockFactory lockFactory; try { lockFactory = (LockFactory) c.newInstance(); } catch (IllegalAccessException e) { throw new IOException("IllegalAccessException when instantiating LockClass " + lockFactoryClassName); } catch (InstantiationException e) { throw new IOException("InstantiationException when instantiating LockClass " + lockFactoryClassName); } catch (ClassCastException e) { throw new IOException("unable to cast LockClass " + lockFactoryClassName + " instance to a LockFactory"); } File lockDir = new File(lockDirName); if (lockFactory instanceof NativeFSLockFactory) { ((NativeFSLockFactory) lockFactory).setLockDir(lockDir); } else if (lockFactory instanceof SimpleFSLockFactory) { ((SimpleFSLockFactory) lockFactory).setLockDir(lockDir); } lockFactory.setLockPrefix("test"); LockFactory verifyLF = new VerifyingLockFactory((byte) myID, lockFactory, verifierHost, verifierPort); Lock l = verifyLF.makeLock("test.lock"); while(true) { boolean obtained = false; try { obtained = l.obtain(10); } catch (LockObtainFailedException e) { System.out.print("x"); } if (obtained) { System.out.print("l"); l.release(); } Thread.sleep(sleepTimeMS); } } } lucene-2.9.4/src/java/org/apache/lucene/store/RAMFile.java0000644000175000017500000000527711474320230023673 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.util.ArrayList; import java.io.Serializable; /** For Lucene internal use */ public class RAMFile implements Serializable { private static final long serialVersionUID = 1l; protected ArrayList buffers = new ArrayList(); long length; RAMDirectory directory; protected long sizeInBytes; // This is publicly modifiable via Directory.touchFile(), so direct access not supported private long lastModified = System.currentTimeMillis(); // File used as buffer, in no RAMDirectory protected RAMFile() {} RAMFile(RAMDirectory directory) { this.directory = directory; } // For non-stream access from thread that might be concurrent with writing public synchronized long getLength() { return length; } protected synchronized void setLength(long length) { this.length = length; } // For non-stream access from thread that might be concurrent with writing public synchronized long getLastModified() { return lastModified; } protected synchronized void setLastModified(long lastModified) { this.lastModified = lastModified; } protected final byte[] addBuffer(int size) { byte[] buffer = newBuffer(size); synchronized(this) { buffers.add(buffer); sizeInBytes += size; } if (directory != null) { synchronized(directory) { directory.sizeInBytes += size; } } return buffer; } protected final synchronized byte[] getBuffer(int index) { return (byte[]) buffers.get(index); } protected final synchronized int numBuffers() { return buffers.size(); } /** * Expert: allocate a new buffer. * Subclasses can allocate differently. * @param size size of allocated buffer. * @return allocated buffer. */ protected byte[] newBuffer(int size) { return new byte[size]; } public synchronized long getSizeInBytes() { return sizeInBytes; } } lucene-2.9.4/src/java/org/apache/lucene/store/RAMOutputStream.java0000644000175000017500000001060411474320230025456 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * A memory-resident {@link IndexOutput} implementation. * *

    For Lucene internal use
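     *
     * A small sketch of the typical flow (the target directory, the byte array
     * and the file name are illustrative assumptions):
     *
     *   RAMOutputStream buffer = new RAMOutputStream();
     *   buffer.writeBytes(someBytes, 0, someBytes.length);            // stage the data in memory
     *   IndexOutput out = targetDirectory.createOutput("example.dat");
     *   buffer.writeTo(out);                                          // then copy it out in one pass
     *   out.close();
     *   buffer.close();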

    * @version $Id: RAMOutputStream.java 941125 2010-05-05 00:44:15Z mikemccand $ */ public class RAMOutputStream extends IndexOutput { static final int BUFFER_SIZE = 1024; private RAMFile file; private byte[] currentBuffer; private int currentBufferIndex; private int bufferPosition; private long bufferStart; private int bufferLength; /** Construct an empty output buffer. */ public RAMOutputStream() { this(new RAMFile()); } public RAMOutputStream(RAMFile f) { file = f; // make sure that we switch to the // first needed buffer lazily currentBufferIndex = -1; currentBuffer = null; } /** Copy the current contents of this buffer to the named output. */ public void writeTo(IndexOutput out) throws IOException { flush(); final long end = file.length; long pos = 0; int buffer = 0; while (pos < end) { int length = BUFFER_SIZE; long nextPos = pos + length; if (nextPos > end) { // at the last buffer length = (int)(end - pos); } out.writeBytes((byte[])file.getBuffer(buffer++), length); pos = nextPos; } } /** Resets this to an empty file. */ public void reset() { currentBuffer = null; currentBufferIndex = -1; bufferPosition = 0; bufferStart = 0; bufferLength = 0; file.setLength(0); } public void close() throws IOException { flush(); } public void seek(long pos) throws IOException { // set the file length in case we seek back // and flush() has not been called yet setFileLength(); if (pos < bufferStart || pos >= bufferStart + bufferLength) { currentBufferIndex = (int) (pos / BUFFER_SIZE); switchCurrentBuffer(); } bufferPosition = (int) (pos % BUFFER_SIZE); } public long length() { return file.length; } public void writeByte(byte b) throws IOException { if (bufferPosition == bufferLength) { currentBufferIndex++; switchCurrentBuffer(); } currentBuffer[bufferPosition++] = b; } public void writeBytes(byte[] b, int offset, int len) throws IOException { assert b != null; while (len > 0) { if (bufferPosition == bufferLength) { currentBufferIndex++; switchCurrentBuffer(); } int remainInBuffer = currentBuffer.length - bufferPosition; int bytesToCopy = len < remainInBuffer ? len : remainInBuffer; System.arraycopy(b, offset, currentBuffer, bufferPosition, bytesToCopy); offset += bytesToCopy; len -= bytesToCopy; bufferPosition += bytesToCopy; } } private final void switchCurrentBuffer() throws IOException { if (currentBufferIndex == file.numBuffers()) { currentBuffer = file.addBuffer(BUFFER_SIZE); } else { currentBuffer = (byte[]) file.getBuffer(currentBufferIndex); } bufferPosition = 0; bufferStart = (long) BUFFER_SIZE * (long) currentBufferIndex; bufferLength = currentBuffer.length; } private void setFileLength() { long pointer = bufferStart + bufferPosition; if (pointer > file.length) { file.setLength(pointer); } } public void flush() throws IOException { file.setLastModified(System.currentTimeMillis()); setFileLength(); } public long getFilePointer() { return currentBufferIndex < 0 ? 0 : bufferStart + bufferPosition; } /** Returns byte usage of all buffers. */ public long sizeInBytes() { return file.numBuffers() * BUFFER_SIZE; } } lucene-2.9.4/src/java/org/apache/lucene/store/BufferedIndexOutput.java0000644000175000017500000001044511474320230026400 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** Base implementation class for buffered {@link IndexOutput}. */ public abstract class BufferedIndexOutput extends IndexOutput { static final int BUFFER_SIZE = 16384; private final byte[] buffer = new byte[BUFFER_SIZE]; private long bufferStart = 0; // position in file of buffer private int bufferPosition = 0; // position in buffer /** Writes a single byte. * @see IndexInput#readByte() */ public void writeByte(byte b) throws IOException { if (bufferPosition >= BUFFER_SIZE) flush(); buffer[bufferPosition++] = b; } /** Writes an array of bytes. * @param b the bytes to write * @param length the number of bytes to write * @see IndexInput#readBytes(byte[],int,int) */ public void writeBytes(byte[] b, int offset, int length) throws IOException { int bytesLeft = BUFFER_SIZE - bufferPosition; // is there enough space in the buffer? if (bytesLeft >= length) { // we add the data to the end of the buffer System.arraycopy(b, offset, buffer, bufferPosition, length); bufferPosition += length; // if the buffer is full, flush it if (BUFFER_SIZE - bufferPosition == 0) flush(); } else { // is data larger then buffer? if (length > BUFFER_SIZE) { // we flush the buffer if (bufferPosition > 0) flush(); // and write data at once flushBuffer(b, offset, length); bufferStart += length; } else { // we fill/flush the buffer (until the input is written) int pos = 0; // position in the input data int pieceLength; while (pos < length) { pieceLength = (length - pos < bytesLeft) ? length - pos : bytesLeft; System.arraycopy(b, pos + offset, buffer, bufferPosition, pieceLength); pos += pieceLength; bufferPosition += pieceLength; // if the buffer is full, flush it bytesLeft = BUFFER_SIZE - bufferPosition; if (bytesLeft == 0) { flush(); bytesLeft = BUFFER_SIZE; } } } } } /** Forces any buffered output to be written. */ public void flush() throws IOException { flushBuffer(buffer, bufferPosition); bufferStart += bufferPosition; bufferPosition = 0; } /** Expert: implements buffer write. Writes bytes at the current position in * the output. * @param b the bytes to write * @param len the number of bytes to write */ private void flushBuffer(byte[] b, int len) throws IOException { flushBuffer(b, 0, len); } /** Expert: implements buffer write. Writes bytes at the current position in * the output. * @param b the bytes to write * @param offset the offset in the byte array * @param len the number of bytes to write */ protected abstract void flushBuffer(byte[] b, int offset, int len) throws IOException; /** Closes this stream to further operations. */ public void close() throws IOException { flush(); } /** Returns the current position in this file, where the next write will * occur. * @see #seek(long) */ public long getFilePointer() { return bufferStart + bufferPosition; } /** Sets current position in this file, where the next write will occur. 
* @see #getFilePointer() */ public void seek(long pos) throws IOException { flush(); bufferStart = pos; } /** The number of bytes in the file. */ public abstract long length() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/store/package.html0000644000175000017500000000175111474320230024063 0ustar janpascaljanpascal Binary i/o API, used for all index data. lucene-2.9.4/src/java/org/apache/lucene/store/LockObtainFailedException.java0000644000175000017500000000231011474320230027446 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.store; import java.io.IOException; /** * This exception is thrown when the write.lock * could not be acquired. This * happens when a writer tries to open an index * that another writer already has open. * @see Lock#obtain(long). */ public class LockObtainFailedException extends IOException { public LockObtainFailedException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/store/SingleInstanceLockFactory.java0000644000175000017500000000462411474320230027516 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.HashSet; /** * Implements {@link LockFactory} for a single in-process instance, * meaning all locking will take place through this one instance. * Only use this {@link LockFactory} when you are certain all * IndexReaders and IndexWriters for a given index are running * against a single shared in-process Directory instance. This is * currently the default locking for RAMDirectory. * * @see LockFactory */ public class SingleInstanceLockFactory extends LockFactory { private HashSet locks = new HashSet(); public Lock makeLock(String lockName) { // We do not use the LockPrefix at all, because the private // HashSet instance effectively scopes the locking to this // single Directory instance. 
return new SingleInstanceLock(locks, lockName); } public void clearLock(String lockName) throws IOException { synchronized(locks) { if (locks.contains(lockName)) { locks.remove(lockName); } } } }; class SingleInstanceLock extends Lock { String lockName; private HashSet locks; public SingleInstanceLock(HashSet locks, String lockName) { this.locks = locks; this.lockName = lockName; } public boolean obtain() throws IOException { synchronized(locks) { return locks.add(lockName); } } public void release() { synchronized(locks) { locks.remove(lockName); } } public boolean isLocked() { synchronized(locks) { return locks.contains(lockName); } } public String toString() { return super.toString() + ": " + lockName; } } lucene-2.9.4/src/java/org/apache/lucene/store/NoSuchDirectoryException.java0000644000175000017500000000213011474320230027400 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.FileNotFoundException; /** * This exception is thrown when you try to list a * non-existent directory. */ public class NoSuchDirectoryException extends FileNotFoundException { public NoSuchDirectoryException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/store/LockFactory.java0000644000175000017500000000466311474320230024672 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** *

    Base class for Locking implementation. {@link Directory} uses * instances of this class to implement locking.

    * *

    Note that there are some useful tools to verify that * your LockFactory is working correctly: {@link * VerifyingLockFactory}, {@link LockStressTest}, {@link * LockVerifyServer}.
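     *
     * A minimal sketch of supplying a LockFactory when creating a Directory (the
     * path is illustrative; passing null instead selects the default
     * {@link NativeFSLockFactory}):
     *
     *   File path = new File("/path/to/index");
     *   LockFactory lf = new SimpleFSLockFactory(path);
     *   Directory dir = new SimpleFSDirectory(path, lf);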

    * * @see LockVerifyServer * @see LockStressTest * @see VerifyingLockFactory */ public abstract class LockFactory { protected String lockPrefix = null; /** * Set the prefix in use for all locks created in this * LockFactory. This is normally called once, when a * Directory gets this LockFactory instance. However, you * can also call this (after this instance is assigned to * a Directory) to override the prefix in use. This * is helpful if you're running Lucene on machines that * have different mount points for the same shared * directory. */ public void setLockPrefix(String lockPrefix) { this.lockPrefix = lockPrefix; } /** * Get the prefix in use for all locks created in this LockFactory. */ public String getLockPrefix() { return this.lockPrefix; } /** * Return a new Lock instance identified by lockName. * @param lockName name of the lock to be created. */ public abstract Lock makeLock(String lockName); /** * Attempt to clear (forcefully unlock and remove) the * specified lock. Only call this at a time when you are * certain this lock is no longer in use. * @param lockName name of the lock to be cleared. */ abstract public void clearLock(String lockName) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/store/SimpleFSLockFactory.java0000644000175000017500000001114411474320230026265 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.IOException; /** *

    Implements {@link LockFactory} using {@link * File#createNewFile()}.

    * *

    NOTE: the javadocs * for File.createNewFile contain a vague * yet spooky warning about not using the API for file * locking. This warning was added due to this * bug, and in fact the only known problem with using * this API for locking is that the Lucene write lock may * not be released when the JVM exits abnormally.

    *

    When this happens, a {@link LockObtainFailedException} * is hit when trying to create a writer, in which case you * need to explicitly clear the lock file first. You can * either manually remove the file, or use the {@link * org.apache.lucene.index.IndexReader#unlock(Directory)} * API. But, first be certain that no writer is in fact * writing to the index otherwise you can easily corrupt * your index.
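     *
     * For example, assuming dir refers to the affected index directory and you
     * are certain no writer is still running against it:
     *
     *   org.apache.lucene.index.IndexReader.unlock(dir);  // removes the stale write lock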

    * *

    If you suspect that this or any other LockFactory is * not working properly in your environment, you can easily * test it by using {@link VerifyingLockFactory}, {@link * LockVerifyServer} and {@link LockStressTest}.

    * @see LockFactory */ public class SimpleFSLockFactory extends FSLockFactory { /** * Create a SimpleFSLockFactory instance, with null (unset) * lock directory. When you pass this factory to a {@link FSDirectory} * subclass, the lock directory is automatically set to the * directory itself. Be sure to create one instance for each directory * you create! */ public SimpleFSLockFactory() throws IOException { this((File) null); } /** * Instantiate using the provided directory (as a File instance). * @param lockDir where lock files should be created. */ public SimpleFSLockFactory(File lockDir) throws IOException { setLockDir(lockDir); } /** * Instantiate using the provided directory name (String). * @param lockDirName where lock files should be created. */ public SimpleFSLockFactory(String lockDirName) throws IOException { setLockDir(new File(lockDirName)); } public Lock makeLock(String lockName) { if (lockPrefix != null) { lockName = lockPrefix + "-" + lockName; } return new SimpleFSLock(lockDir, lockName); } public void clearLock(String lockName) throws IOException { if (lockDir.exists()) { if (lockPrefix != null) { lockName = lockPrefix + "-" + lockName; } File lockFile = new File(lockDir, lockName); if (lockFile.exists() && !lockFile.delete()) { throw new IOException("Cannot delete " + lockFile); } } } }; class SimpleFSLock extends Lock { File lockFile; File lockDir; public SimpleFSLock(File lockDir, String lockFileName) { this.lockDir = lockDir; lockFile = new File(lockDir, lockFileName); } public boolean obtain() throws IOException { // Ensure that lockDir exists and is a directory: if (!lockDir.exists()) { if (!lockDir.mkdirs()) throw new IOException("Cannot create directory: " + lockDir.getAbsolutePath()); } else if (!lockDir.isDirectory()) { throw new IOException("Found regular file where directory expected: " + lockDir.getAbsolutePath()); } return lockFile.createNewFile(); } public void release() throws LockReleaseFailedException { if (lockFile.exists() && !lockFile.delete()) throw new LockReleaseFailedException("failed to delete " + lockFile); } public boolean isLocked() { return lockFile.exists(); } public String toString() { return "SimpleFSLock@" + lockFile; } } lucene-2.9.4/src/java/org/apache/lucene/store/FSLockFactory.java0000644000175000017500000000320311474320230025110 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; /** * Base class for file system based locking implementation. */ public abstract class FSLockFactory extends LockFactory { /** * Directory for the lock files. */ protected File lockDir = null; /** * Set the lock directory. This method can only be called * once to initialize the lock directory. 
It is used by {@link FSDirectory} * to set the lock directory to itself. * Subclasses can also use this method to set the directory * in the constructor. */ protected final void setLockDir(File lockDir) { if (this.lockDir != null) throw new IllegalStateException("You can set the lock directory for this factory only once."); this.lockDir = lockDir; } /** * Retrieve the lock directory. */ public File getLockDir() { return lockDir; } } lucene-2.9.4/src/java/org/apache/lucene/store/IndexInput.java0000644000175000017500000001751611474320230024542 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Map; import java.util.HashMap; /** Abstract base class for input from a file in a {@link Directory}. A * random-access input stream. Used for all Lucene index input operations. * @see Directory */ public abstract class IndexInput implements Cloneable { private boolean preUTF8Strings; // true if we are reading old (modified UTF8) string format /** Reads and returns a single byte. * @see IndexOutput#writeByte(byte) */ public abstract byte readByte() throws IOException; /** Reads a specified number of bytes into an array at the specified offset. * @param b the array to read bytes into * @param offset the offset in the array to start storing bytes * @param len the number of bytes to read * @see IndexOutput#writeBytes(byte[],int) */ public abstract void readBytes(byte[] b, int offset, int len) throws IOException; /** Reads a specified number of bytes into an array at the * specified offset with control over whether the read * should be buffered (callers who have their own buffer * should pass in "false" for useBuffer). Currently only * {@link BufferedIndexInput} respects this parameter. * @param b the array to read bytes into * @param offset the offset in the array to start storing bytes * @param len the number of bytes to read * @param useBuffer set to false if the caller will handle * buffering. * @see IndexOutput#writeBytes(byte[],int) */ public void readBytes(byte[] b, int offset, int len, boolean useBuffer) throws IOException { // Default to ignoring useBuffer entirely readBytes(b, offset, len); } /** Reads four bytes and returns an int. * @see IndexOutput#writeInt(int) */ public int readInt() throws IOException { return ((readByte() & 0xFF) << 24) | ((readByte() & 0xFF) << 16) | ((readByte() & 0xFF) << 8) | (readByte() & 0xFF); } /** Reads an int stored in variable-length format. Reads between one and * five bytes. Smaller values take fewer bytes. Negative numbers are not * supported. 
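     * As an illustration, the value 300 is stored in two bytes under this format:
     * 0xAC (the low seven bits, 0x2C, with the high bit set to signal that more
     * bytes follow) and then 0x02 (the remaining bits, 300 >>> 7).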
* @see IndexOutput#writeVInt(int) */ public int readVInt() throws IOException { byte b = readByte(); int i = b & 0x7F; for (int shift = 7; (b & 0x80) != 0; shift += 7) { b = readByte(); i |= (b & 0x7F) << shift; } return i; } /** Reads eight bytes and returns a long. * @see IndexOutput#writeLong(long) */ public long readLong() throws IOException { return (((long)readInt()) << 32) | (readInt() & 0xFFFFFFFFL); } /** Reads a long stored in variable-length format. Reads between one and * nine bytes. Smaller values take fewer bytes. Negative numbers are not * supported. */ public long readVLong() throws IOException { byte b = readByte(); long i = b & 0x7F; for (int shift = 7; (b & 0x80) != 0; shift += 7) { b = readByte(); i |= (b & 0x7FL) << shift; } return i; } /** Call this if readString should read characters stored * in the old modified UTF8 format (length in java chars * and java's modified UTF8 encoding). This is used for * indices written pre-2.4 See LUCENE-510 for details. */ public void setModifiedUTF8StringsMode() { preUTF8Strings = true; } /** Reads a string. * @see IndexOutput#writeString(String) */ public String readString() throws IOException { if (preUTF8Strings) return readModifiedUTF8String(); int length = readVInt(); final byte[] bytes = new byte[length]; readBytes(bytes, 0, length); return new String(bytes, 0, length, "UTF-8"); } private String readModifiedUTF8String() throws IOException { int length = readVInt(); final char[] chars = new char[length]; readChars(chars, 0, length); return new String(chars, 0, length); } /** Reads Lucene's old "modified UTF-8" encoded * characters into an array. * @param buffer the array to read characters into * @param start the offset in the array to start storing characters * @param length the number of characters to read * @see IndexOutput#writeChars(String,int,int) * @deprecated -- please use readString or readBytes * instead, and construct the string * from those utf8 bytes */ public void readChars(char[] buffer, int start, int length) throws IOException { final int end = start + length; for (int i = start; i < end; i++) { byte b = readByte(); if ((b & 0x80) == 0) buffer[i] = (char)(b & 0x7F); else if ((b & 0xE0) != 0xE0) { buffer[i] = (char)(((b & 0x1F) << 6) | (readByte() & 0x3F)); } else buffer[i] = (char)(((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6) | (readByte() & 0x3F)); } } /** * Expert * * Similar to {@link #readChars(char[], int, int)} but does not do any conversion operations on the bytes it is reading in. It still * has to invoke {@link #readByte()} just as {@link #readChars(char[], int, int)} does, but it does not need a buffer to store anything * and it does not have to do any of the bitwise operations, since we don't actually care what is in the byte except to determine * how many more bytes to read * @param length The number of chars to read * @deprecated this method operates on old "modified utf8" encoded * strings */ public void skipChars(int length) throws IOException{ for (int i = 0; i < length; i++) { byte b = readByte(); if ((b & 0x80) == 0){ //do nothing, we only need one byte } else if ((b & 0xE0) != 0xE0) { readByte();//read an additional byte } else{ //read two additional bytes. readByte(); readByte(); } } } /** Closes the stream to further operations. */ public abstract void close() throws IOException; /** Returns the current position in this file, where the next read will * occur. 
* @see #seek(long) */ public abstract long getFilePointer(); /** Sets current position in this file, where the next read will occur. * @see #getFilePointer() */ public abstract void seek(long pos) throws IOException; /** The number of bytes in the file. */ public abstract long length(); /** Returns a clone of this stream. * *

 * Clones of a stream access the same data, and are positioned at the same
 * point as the stream they were cloned from.
 *

    Expert: Subclasses must ensure that clones may be positioned at * different points in the input from each other and from the stream they * were cloned from. */ public Object clone() { IndexInput clone = null; try { clone = (IndexInput)super.clone(); } catch (CloneNotSupportedException e) {} return clone; } // returns Map public Map readStringStringMap() throws IOException { final Map map = new HashMap(); final int count = readInt(); for(int i=0;iTypical use might look like:

     * new Lock.With(directory.makeLock("my.lock")) {
     *     public Object doBody() {
     *       ... code to execute while locked ...
     *     }
     *   }.run();
     * 
    * * * @version $Id: Lock.java 769409 2009-04-28 14:05:43Z mikemccand $ * @see Directory#makeLock(String) */ public abstract class Lock { /** How long {@link #obtain(long)} waits, in milliseconds, * in between attempts to acquire the lock. */ public static long LOCK_POLL_INTERVAL = 1000; /** Pass this value to {@link #obtain(long)} to try * forever to obtain the lock. */ public static final long LOCK_OBTAIN_WAIT_FOREVER = -1; /** Attempts to obtain exclusive access and immediately return * upon success or failure. * @return true iff exclusive access is obtained */ public abstract boolean obtain() throws IOException; /** * If a lock obtain called, this failureReason may be set * with the "root cause" Exception as to why the lock was * not obtained. */ protected Throwable failureReason; /** Attempts to obtain an exclusive lock within amount of * time given. Polls once per {@link #LOCK_POLL_INTERVAL} * (currently 1000) milliseconds until lockWaitTimeout is * passed. * @param lockWaitTimeout length of time to wait in * milliseconds or {@link * #LOCK_OBTAIN_WAIT_FOREVER} to retry forever * @return true if lock was obtained * @throws LockObtainFailedException if lock wait times out * @throws IllegalArgumentException if lockWaitTimeout is * out of bounds * @throws IOException if obtain() throws IOException */ public boolean obtain(long lockWaitTimeout) throws LockObtainFailedException, IOException { failureReason = null; boolean locked = obtain(); if (lockWaitTimeout < 0 && lockWaitTimeout != LOCK_OBTAIN_WAIT_FOREVER) throw new IllegalArgumentException("lockWaitTimeout should be LOCK_OBTAIN_WAIT_FOREVER or a non-negative number (got " + lockWaitTimeout + ")"); long maxSleepCount = lockWaitTimeout / LOCK_POLL_INTERVAL; long sleepCount = 0; while (!locked) { if (lockWaitTimeout != LOCK_OBTAIN_WAIT_FOREVER && sleepCount++ >= maxSleepCount) { String reason = "Lock obtain timed out: " + this.toString(); if (failureReason != null) { reason += ": " + failureReason; } LockObtainFailedException e = new LockObtainFailedException(reason); if (failureReason != null) { e.initCause(failureReason); } throw e; } try { Thread.sleep(LOCK_POLL_INTERVAL); } catch (InterruptedException e) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new IOException(e.toString()); } locked = obtain(); } return locked; } /** Releases exclusive access. */ public abstract void release() throws IOException; /** Returns true if the resource is currently locked. Note that one must * still call {@link #obtain()} before using the resource. */ public abstract boolean isLocked(); /** Utility class for executing code with exclusive access. */ public abstract static class With { private Lock lock; private long lockWaitTimeout; /** Constructs an executor that will grab the named lock. */ public With(Lock lock, long lockWaitTimeout) { this.lock = lock; this.lockWaitTimeout = lockWaitTimeout; } /** Code to execute with exclusive access. */ protected abstract Object doBody() throws IOException; /** Calls {@link #doBody} while lock is obtained. Blocks if lock * cannot be obtained immediately. Retries to obtain lock once per second * until it is obtained, or until it has tried ten times. Lock is released when * {@link #doBody} exits. 
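 * A rough usage sketch (illustrative only, not from the original javadoc;
 * the lock name "commit.lock" is made up):
 *
 *   new Lock.With(directory.makeLock("commit.lock"), 10 * Lock.LOCK_POLL_INTERVAL) {
 *     protected Object doBody() throws IOException {
 *       // ... work that requires exclusive access ...
 *       return null;
 *     }
 *   }.run();
 *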
* @throws LockObtainFailedException if lock could not * be obtained * @throws IOException if {@link Lock#obtain} throws IOException */ public Object run() throws LockObtainFailedException, IOException { boolean locked = false; try { locked = lock.obtain(lockWaitTimeout); return doBody(); } finally { if (locked) lock.release(); } } } } lucene-2.9.4/src/java/org/apache/lucene/store/BufferedIndexInput.java0000644000175000017500000001566211474320230026205 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** Base implementation class for buffered {@link IndexInput}. */ public abstract class BufferedIndexInput extends IndexInput { /** Default buffer size */ public static final int BUFFER_SIZE = 1024; private int bufferSize = BUFFER_SIZE; protected byte[] buffer; private long bufferStart = 0; // position in file of buffer private int bufferLength = 0; // end of valid bytes private int bufferPosition = 0; // next byte to read public byte readByte() throws IOException { if (bufferPosition >= bufferLength) refill(); return buffer[bufferPosition++]; } public BufferedIndexInput() {} /** Inits BufferedIndexInput with a specific bufferSize */ public BufferedIndexInput(int bufferSize) { checkBufferSize(bufferSize); this.bufferSize = bufferSize; } /** Change the buffer size used by this IndexInput */ public void setBufferSize(int newSize) { assert buffer == null || bufferSize == buffer.length: "buffer=" + buffer + " bufferSize=" + bufferSize + " buffer.length=" + (buffer != null ? buffer.length : 0); if (newSize != bufferSize) { checkBufferSize(newSize); bufferSize = newSize; if (buffer != null) { // Resize the existing buffer and carefully save as // many bytes as possible starting from the current // bufferPosition byte[] newBuffer = new byte[newSize]; final int leftInBuffer = bufferLength-bufferPosition; final int numToCopy; if (leftInBuffer > newSize) numToCopy = newSize; else numToCopy = leftInBuffer; System.arraycopy(buffer, bufferPosition, newBuffer, 0, numToCopy); bufferStart += bufferPosition; bufferPosition = 0; bufferLength = numToCopy; newBuffer(newBuffer); } } } protected void newBuffer(byte[] newBuffer) { // Subclasses can do something here buffer = newBuffer; } /** Returns buffer size. 
@see #setBufferSize */ public int getBufferSize() { return bufferSize; } private void checkBufferSize(int bufferSize) { if (bufferSize <= 0) throw new IllegalArgumentException("bufferSize must be greater than 0 (got " + bufferSize + ")"); } public void readBytes(byte[] b, int offset, int len) throws IOException { readBytes(b, offset, len, true); } public void readBytes(byte[] b, int offset, int len, boolean useBuffer) throws IOException { if(len <= (bufferLength-bufferPosition)){ // the buffer contains enough data to satisfy this request if(len>0) // to allow b to be null if len is 0... System.arraycopy(buffer, bufferPosition, b, offset, len); bufferPosition+=len; } else { // the buffer does not have enough data. First serve all we've got. int available = bufferLength - bufferPosition; if(available > 0){ System.arraycopy(buffer, bufferPosition, b, offset, available); offset += available; len -= available; bufferPosition += available; } // and now, read the remaining 'len' bytes: if (useBuffer && len length()) throw new IOException("read past EOF"); readInternal(b, offset, len); bufferStart = after; bufferPosition = 0; bufferLength = 0; // trigger refill() on read } } } private void refill() throws IOException { long start = bufferStart + bufferPosition; long end = start + bufferSize; if (end > length()) // don't read past EOF end = length(); int newLength = (int)(end - start); if (newLength <= 0) throw new IOException("read past EOF"); if (buffer == null) { newBuffer(new byte[bufferSize]); // allocate buffer lazily seekInternal(bufferStart); } readInternal(buffer, 0, newLength); bufferLength = newLength; bufferStart = start; bufferPosition = 0; } /** Expert: implements buffer refill. Reads bytes from the current position * in the input. * @param b the array to read bytes into * @param offset the offset in the array to start storing bytes * @param length the number of bytes to read */ protected abstract void readInternal(byte[] b, int offset, int length) throws IOException; public long getFilePointer() { return bufferStart + bufferPosition; } public void seek(long pos) throws IOException { if (pos >= bufferStart && pos < (bufferStart + bufferLength)) bufferPosition = (int)(pos - bufferStart); // seek within buffer else { bufferStart = pos; bufferPosition = 0; bufferLength = 0; // trigger refill() on read() seekInternal(pos); } } /** Expert: implements seek. Sets current position in this file, where the * next {@link #readInternal(byte[],int,int)} will occur. * @see #readInternal(byte[],int,int) */ protected abstract void seekInternal(long pos) throws IOException; public Object clone() { BufferedIndexInput clone = (BufferedIndexInput)super.clone(); clone.buffer = null; clone.bufferLength = 0; clone.bufferPosition = 0; clone.bufferStart = getFilePointer(); return clone; } } lucene-2.9.4/src/java/org/apache/lucene/store/FileSwitchDirectory.java0000644000175000017500000001012011474320230026361 0ustar janpascaljanpascalpackage org.apache.lucene.store; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Set; import java.util.HashSet; /** * Expert: A Directory instance that switches files between * two other Directory instances. *

 * Files with the specified extensions are placed in the
 * primary directory; others are placed in the secondary
 * directory. The provided Set must not change once passed
 * to this class, and must allow multiple threads to call
 * contains at once.
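 * An illustrative sketch (not from the original javadoc; the extension set,
 * directories and path are made up):
 *
 *   Set exts = Collections.singleton("frq");
 *   Directory dir = new FileSwitchDirectory(exts,
 *       new RAMDirectory(),                        // primary: holds the *.frq files
 *       FSDirectory.open(new File("/tmp/index")),  // secondary: everything else
 *       true);                                     // close both when dir is closed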

 *

    NOTE: this API is new and experimental and is * subject to suddenly change in the next release. */ public class FileSwitchDirectory extends Directory { private final Directory secondaryDir; private final Directory primaryDir; private final Set primaryExtensions; private boolean doClose; public FileSwitchDirectory(Set primaryExtensions, Directory primaryDir, Directory secondaryDir, boolean doClose) { this.primaryExtensions = primaryExtensions; this.primaryDir = primaryDir; this.secondaryDir = secondaryDir; this.doClose = doClose; this.lockFactory = primaryDir.getLockFactory(); } /** Return the primary directory */ public Directory getPrimaryDir() { return primaryDir; } /** Return the secondary directory */ public Directory getSecondaryDir() { return secondaryDir; } public void close() throws IOException { if (doClose) { try { secondaryDir.close(); } finally { primaryDir.close(); } doClose = false; } } public String[] listAll() throws IOException { Set files = new HashSet(); files.addAll(Arrays.asList(primaryDir.listAll())); files.addAll(Arrays.asList(secondaryDir.listAll())); return (String[]) files.toArray(new String[files.size()]); } public String[] list() throws IOException { return listAll(); } /** Utility method to return a file's extension. */ public static String getExtension(String name) { int i = name.lastIndexOf('.'); if (i == -1) { return ""; } return name.substring(i+1, name.length()); } private Directory getDirectory(String name) { String ext = getExtension(name); if (primaryExtensions.contains(ext)) { return primaryDir; } else { return secondaryDir; } } public boolean fileExists(String name) throws IOException { return getDirectory(name).fileExists(name); } public long fileModified(String name) throws IOException { return getDirectory(name).fileModified(name); } public void touchFile(String name) throws IOException { getDirectory(name).touchFile(name); } public void deleteFile(String name) throws IOException { getDirectory(name).deleteFile(name); } public void renameFile(String from, String to) throws IOException { getDirectory(from).renameFile(from, to); } public long fileLength(String name) throws IOException { return getDirectory(name).fileLength(name); } public IndexOutput createOutput(String name) throws IOException { return getDirectory(name).createOutput(name); } public void sync(String name) throws IOException { getDirectory(name).sync(name); } public IndexInput openInput(String name) throws IOException { return getDirectory(name).openInput(name); } } lucene-2.9.4/src/java/org/apache/lucene/util/0000755000175000017500000000000011554106562021430 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/util/Constants.java0000644000175000017500000000645211474320230024245 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.LucenePackage; /** * Some useful constants. * * * @version $Id: Constants.java 1029226 2010-10-30 23:56:08Z uschindler $ **/ public final class Constants { private Constants() {} // can't construct /** The value of System.getProperty("java.version"). **/ public static final String JAVA_VERSION = System.getProperty("java.version"); /** True iff this is Java version 1.1. */ public static final boolean JAVA_1_1 = JAVA_VERSION.startsWith("1.1."); /** True iff this is Java version 1.2. */ public static final boolean JAVA_1_2 = JAVA_VERSION.startsWith("1.2."); /** True iff this is Java version 1.3. */ public static final boolean JAVA_1_3 = JAVA_VERSION.startsWith("1.3."); /** The value of System.getProperty("os.name"). **/ public static final String OS_NAME = System.getProperty("os.name"); /** True iff running on Linux. */ public static final boolean LINUX = OS_NAME.startsWith("Linux"); /** True iff running on Windows. */ public static final boolean WINDOWS = OS_NAME.startsWith("Windows"); /** True iff running on SunOS. */ public static final boolean SUN_OS = OS_NAME.startsWith("SunOS"); public static final String OS_ARCH = System.getProperty("os.arch"); public static final String OS_VERSION = System.getProperty("os.version"); public static final String JAVA_VENDOR = System.getProperty("java.vendor"); // NOTE: this logic may not be correct; if you know of a // more reliable approach please raise it on java-dev! public static final boolean JRE_IS_64BIT; static { String x = System.getProperty("sun.arch.data.model"); if (x != null) { JRE_IS_64BIT = x.indexOf("64") != -1; } else { if (OS_ARCH != null && OS_ARCH.indexOf("64") != -1) { JRE_IS_64BIT = true; } else { JRE_IS_64BIT = false; } } } // this method prevents inlining the final version constant in compiled classes, // see: http://www.javaworld.com/community/node/3400 private static String ident(final String s) { return s.toString(); } public static final String LUCENE_MAIN_VERSION = ident("2.9.4"); public static final String LUCENE_VERSION; static { Package pkg = LucenePackage.get(); String v = (pkg == null) ? null : pkg.getImplementationVersion(); if (v == null) { v = LUCENE_MAIN_VERSION + "-dev"; } else if (v.indexOf(LUCENE_MAIN_VERSION) == -1) { v = v + " [" + LUCENE_MAIN_VERSION + "]"; } LUCENE_VERSION = ident(v); } } lucene-2.9.4/src/java/org/apache/lucene/util/cache/0000755000175000017500000000000011554106562022473 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/util/cache/SimpleMapCache.java0000644000175000017500000000477511474320230026155 0ustar janpascaljanpascalpackage org.apache.lucene.util.cache; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.util.HashMap; import java.util.Map; import java.util.Set; /** * Simple cache implementation that uses a HashMap to store (key, value) pairs. * This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)} * if needed. */ public class SimpleMapCache extends Cache { Map map; public SimpleMapCache() { this(new HashMap()); } public SimpleMapCache(Map map) { this.map = map; } public Object get(Object key) { return map.get(key); } public void put(Object key, Object value) { map.put(key, value); } public void close() { // NOOP } public boolean containsKey(Object key) { return map.containsKey(key); } /** * Returns a Set containing all keys in this cache. */ public Set keySet() { return map.keySet(); } Cache getSynchronizedCache() { return new SynchronizedSimpleMapCache(this); } private static class SynchronizedSimpleMapCache extends SimpleMapCache { Object mutex; SimpleMapCache cache; SynchronizedSimpleMapCache(SimpleMapCache cache) { this.cache = cache; this.mutex = this; } public void put(Object key, Object value) { synchronized(mutex) {cache.put(key, value);} } public Object get(Object key) { synchronized(mutex) {return cache.get(key);} } public boolean containsKey(Object key) { synchronized(mutex) {return cache.containsKey(key);} } public void close() { synchronized(mutex) {cache.close();} } public Set keySet() { synchronized(mutex) {return cache.keySet();} } Cache getSynchronizedCache() { return this; } } } lucene-2.9.4/src/java/org/apache/lucene/util/cache/Cache.java0000644000175000017500000000551411474320230024335 0ustar janpascaljanpascalpackage org.apache.lucene.util.cache; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Base class for cache implementations. */ public abstract class Cache { /** * Simple Cache wrapper that synchronizes all * calls that access the cache. */ static class SynchronizedCache extends Cache { Object mutex; Cache cache; SynchronizedCache(Cache cache) { this.cache = cache; this.mutex = this; } SynchronizedCache(Cache cache, Object mutex) { this.cache = cache; this.mutex = mutex; } public void put(Object key, Object value) { synchronized(mutex) {cache.put(key, value);} } public Object get(Object key) { synchronized(mutex) {return cache.get(key);} } public boolean containsKey(Object key) { synchronized(mutex) {return cache.containsKey(key);} } public void close() { synchronized(mutex) {cache.close();} } Cache getSynchronizedCache() { return this; } } /** * Returns a thread-safe cache backed by the specified cache. * In order to guarantee thread-safety, all access to the backed cache must * be accomplished through the returned cache. */ public static Cache synchronizedCache(Cache cache) { return cache.getSynchronizedCache(); } /** * Called by {@link #synchronizedCache(Cache)}. 
This method * returns a {@link SynchronizedCache} instance that wraps * this instance by default and can be overridden to return * e. g. subclasses of {@link SynchronizedCache} or this * in case this cache is already synchronized. */ Cache getSynchronizedCache() { return new SynchronizedCache(this); } /** * Puts a (key, value)-pair into the cache. */ public abstract void put(Object key, Object value); /** * Returns the value for the given key. */ public abstract Object get(Object key); /** * Returns whether the given key is in this cache. */ public abstract boolean containsKey(Object key); /** * Closes the cache. */ public abstract void close(); } lucene-2.9.4/src/java/org/apache/lucene/util/cache/SimpleLRUCache.java0000644000175000017500000000313511474320230026067 0ustar janpascaljanpascalpackage org.apache.lucene.util.cache; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.LinkedHashMap; import java.util.Map; /** * Simple LRU cache implementation that uses a LinkedHashMap. * This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)} * if needed. * */ public class SimpleLRUCache extends SimpleMapCache { private final static float LOADFACTOR = 0.75f; private int cacheSize; /** * Creates a last-recently-used cache with the specified size. */ public SimpleLRUCache(int cacheSize) { super(null); this.cacheSize = cacheSize; int capacity = (int) Math.ceil(cacheSize / LOADFACTOR) + 1; super.map = new LinkedHashMap(capacity, LOADFACTOR, true) { protected boolean removeEldestEntry(Map.Entry eldest) { return size() > SimpleLRUCache.this.cacheSize; } }; } } lucene-2.9.4/src/java/org/apache/lucene/util/ReaderUtil.java0000644000175000017500000000703211474320230024324 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.List; import org.apache.lucene.index.IndexReader; /** * Common util methods for dealing with {@link IndexReader}s. * */ public class ReaderUtil { /** * Gathers sub-readers from reader into a List. 
* * @param allSubReaders * @param reader */ public static void gatherSubReaders(List allSubReaders, IndexReader reader) { IndexReader[] subReaders = reader.getSequentialSubReaders(); if (subReaders == null) { // Add the reader itself, and do not recurse allSubReaders.add(reader); } else { for (int i = 0; i < subReaders.length; i++) { gatherSubReaders(allSubReaders, subReaders[i]); } } } /** * Returns sub IndexReader that contains the given document id. * * @param doc id of document * @param reader parent reader * @return sub reader of parent which contains the specified doc id */ public static IndexReader subReader(int doc, IndexReader reader) { List subReadersList = new ArrayList(); ReaderUtil.gatherSubReaders(subReadersList, reader); IndexReader[] subReaders = (IndexReader[]) subReadersList .toArray(new IndexReader[subReadersList.size()]); int[] docStarts = new int[subReaders.length]; int maxDoc = 0; for (int i = 0; i < subReaders.length; i++) { docStarts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); } return subReaders[ReaderUtil.subIndex(doc, docStarts)]; } /** * Returns sub-reader subIndex from reader. * * @param reader parent reader * @param subIndex index of desired sub reader * @return the subreader at subIndex */ public static IndexReader subReader(IndexReader reader, int subIndex) { List subReadersList = new ArrayList(); ReaderUtil.gatherSubReaders(subReadersList, reader); IndexReader[] subReaders = (IndexReader[]) subReadersList .toArray(new IndexReader[subReadersList.size()]); return subReaders[subIndex]; } /** * Returns index of the searcher/reader for document n in the * array used to construct this searcher/reader. */ public static int subIndex(int n, int[] docStarts) { // find // searcher/reader for doc n: int size = docStarts.length; int lo = 0; // search starts array int hi = size - 1; // for first element less than n, return its index while (hi >= lo) { int mid = (lo + hi) >>> 1; int midValue = docStarts[mid]; if (n < midValue) hi = mid - 1; else if (n > midValue) lo = mid + 1; else { // found a match while (mid + 1 < size && docStarts[mid + 1] == midValue) { mid++; // scan to last match } return mid; } } return hi; } } lucene-2.9.4/src/java/org/apache/lucene/util/SimpleStringInterner.java0000644000175000017500000000507711474320230026422 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Simple lockless and memory barrier free String intern cache that is guaranteed * to return the same String instance as String.intern() does. 
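 * An illustrative sketch (not part of the original javadoc; the table size,
 * chain length and interned value are arbitrary):
 *
 *   SimpleStringInterner interner = new SimpleStringInterner(1024, 8);
 *   String a = interner.intern(new String("title"));
 *   String b = interner.intern(new String("title"));
 *   // a == b, and both are the canonical String.intern() instance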
*/ public class SimpleStringInterner extends StringInterner { private static class Entry { final private String str; final private int hash; private Entry next; private Entry(String str, int hash, Entry next) { this.str = str; this.hash = hash; this.next = next; } } private final Entry[] cache; private final int maxChainLength; /** * @param tableSize Size of the hash table, should be a power of two. * @param maxChainLength Maximum length of each bucket, after which the oldest item inserted is dropped. */ public SimpleStringInterner(int tableSize, int maxChainLength) { cache = new Entry[Math.max(1,BitUtil.nextHighestPowerOfTwo(tableSize))]; this.maxChainLength = Math.max(2,maxChainLength); } // @Override public String intern(String s) { int h = s.hashCode(); // In the future, it may be worth augmenting the string hash // if the lower bits need better distribution. int slot = h & (cache.length-1); Entry first = this.cache[slot]; Entry nextToLast = null; int chainLength = 0; for(Entry e=first; e!=null; e=e.next) { if (e.hash == h && (e.str == s || e.str.compareTo(s)==0)) { // if (e.str == s || (e.hash == h && e.str.compareTo(s)==0)) { return e.str; } chainLength++; if (e.next != null) { nextToLast = e; } } // insertion-order cache: add new entry at head s = s.intern(); this.cache[slot] = new Entry(s, h, first); if (chainLength >= maxChainLength) { // prune last entry nextToLast.next = null; } return s; } }lucene-2.9.4/src/java/org/apache/lucene/util/AttributeSource.java0000644000175000017500000004266211474320231025421 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.lang.ref.WeakReference; import java.util.Collections; import java.util.NoSuchElementException; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.WeakHashMap; import java.util.LinkedList; import java.util.Map; import java.util.Map.Entry; import org.apache.lucene.analysis.TokenStream; // for javadocs /** * An AttributeSource contains a list of different {@link AttributeImpl}s, * and methods to add and get them. There can only be a single instance * of an attribute in the same AttributeSource instance. This is ensured * by passing in the actual type of the Attribute (Class<Attribute>) to * the {@link #addAttribute(Class)}, which then checks if an instance of * that type is already present. If yes, it returns the instance, otherwise * it creates a new instance and returns it. */ public class AttributeSource { /** * An AttributeFactory creates instances of {@link AttributeImpl}s. */ public static abstract class AttributeFactory { /** * returns an {@link AttributeImpl} for the supplied {@link Attribute} interface class. *

    Signature for Java 1.5: public AttributeImpl createAttributeInstance(Class%lt;? extends Attribute> attClass) */ public abstract AttributeImpl createAttributeInstance(Class attClass); /** * This is the default factory that creates {@link AttributeImpl}s using the * class name of the supplied {@link Attribute} interface class by appending Impl to it. */ public static final AttributeFactory DEFAULT_ATTRIBUTE_FACTORY = new DefaultAttributeFactory(); private static final class DefaultAttributeFactory extends AttributeFactory { private static final WeakHashMap/*, WeakReference>>*/ attClassImplMap = new WeakHashMap(); private DefaultAttributeFactory() {} public AttributeImpl createAttributeInstance(Class attClass) { try { return (AttributeImpl) getClassForInterface(attClass).newInstance(); } catch (InstantiationException e) { throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName()); } catch (IllegalAccessException e) { throw new IllegalArgumentException("Could not instantiate implementing class for " + attClass.getName()); } } private static Class getClassForInterface(Class attClass) { synchronized(attClassImplMap) { final WeakReference ref = (WeakReference) attClassImplMap.get(attClass); Class clazz = (ref == null) ? null : ((Class) ref.get()); if (clazz == null) { try { attClassImplMap.put(attClass, new WeakReference( clazz = Class.forName(attClass.getName() + "Impl", true, attClass.getClassLoader()) )); } catch (ClassNotFoundException e) { throw new IllegalArgumentException("Could not find implementing class for " + attClass.getName()); } } return clazz; } } } } // These two maps must always be in sync!!! // So they are private, final and read-only from the outside (read-only iterators) private final Map/*,AttributeImpl>*/ attributes; private final Map/*,AttributeImpl>*/ attributeImpls; private AttributeFactory factory; /** * An AttributeSource using the default attribute factory {@link AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY}. */ public AttributeSource() { this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY); } /** * An AttributeSource that uses the same attributes as the supplied one. */ public AttributeSource(AttributeSource input) { if (input == null) { throw new IllegalArgumentException("input AttributeSource must not be null"); } this.attributes = input.attributes; this.attributeImpls = input.attributeImpls; this.factory = input.factory; } /** * An AttributeSource using the supplied {@link AttributeFactory} for creating new {@link Attribute} instances. */ public AttributeSource(AttributeFactory factory) { this.attributes = new LinkedHashMap(); this.attributeImpls = new LinkedHashMap(); this.factory = factory; } /** * returns the used AttributeFactory. */ public AttributeFactory getAttributeFactory() { return this.factory; } /** Returns a new iterator that iterates the attribute classes * in the same order they were added in. *

    Signature for Java 1.5: public Iterator<Class<? extends Attribute>> getAttributeClassesIterator() */ public Iterator getAttributeClassesIterator() { return Collections.unmodifiableSet(attributes.keySet()).iterator(); } /** Returns a new iterator that iterates all unique Attribute implementations. * This iterator may contain less entries that {@link #getAttributeClassesIterator}, * if one instance implements more than one Attribute interface. *

    Signature for Java 1.5: public Iterator<AttributeImpl> getAttributeImplsIterator() */ public Iterator getAttributeImplsIterator() { if (hasAttributes()) { if (currentState == null) { computeCurrentState(); } final State initState = currentState; return new Iterator() { private State state = initState; public void remove() { throw new UnsupportedOperationException(); } public Object next() { if (state == null) throw new NoSuchElementException(); final AttributeImpl att = state.attribute; state = state.next; return att; } public boolean hasNext() { return state != null; } }; } else { return Collections.EMPTY_SET.iterator(); } } /** a cache that stores all interfaces for known implementation classes for performance (slow reflection) */ private static final WeakHashMap/*,LinkedList>>>*/ knownImplClasses = new WeakHashMap(); /** Adds a custom AttributeImpl instance with one or more Attribute interfaces. */ public void addAttributeImpl(final AttributeImpl att) { final Class clazz = att.getClass(); if (attributeImpls.containsKey(clazz)) return; LinkedList foundInterfaces; synchronized(knownImplClasses) { foundInterfaces = (LinkedList) knownImplClasses.get(clazz); if (foundInterfaces == null) { // we have a strong reference to the class instance holding all interfaces in the list (parameter "att"), // so all WeakReferences are never evicted by GC knownImplClasses.put(clazz, foundInterfaces=new LinkedList()); // find all interfaces that this attribute instance implements // and that extend the Attribute interface Class actClazz = clazz; do { Class[] interfaces = actClazz.getInterfaces(); for (int i = 0; i < interfaces.length; i++) { final Class curInterface = interfaces[i]; if (curInterface != Attribute.class && Attribute.class.isAssignableFrom(curInterface)) { foundInterfaces.add(new WeakReference(curInterface)); } } actClazz = actClazz.getSuperclass(); } while (actClazz != null); } } // add all interfaces of this AttributeImpl to the maps for (Iterator it = foundInterfaces.iterator(); it.hasNext(); ) { final WeakReference curInterfaceRef = (WeakReference) it.next(); final Class curInterface = (Class) curInterfaceRef.get(); assert (curInterface != null) : "We have a strong reference on the class holding the interfaces, so they should never get evicted"; // Attribute is a superclass of this interface if (!attributes.containsKey(curInterface)) { // invalidate state to force recomputation in captureState() this.currentState = null; attributes.put(curInterface, att); attributeImpls.put(clazz, att); } } } /** * The caller must pass in a Class<? extends Attribute> value. * This method first checks if an instance of that class is * already in this AttributeSource and returns it. Otherwise a * new instance is created, added to this AttributeSource and returned. *
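 * An illustrative (Java 1.4 style) sketch, not from the original javadoc;
 * TermAttribute is the interface from org.apache.lucene.analysis.tokenattributes:
 *
 *   AttributeSource src = new AttributeSource();
 *   TermAttribute termAtt = (TermAttribute) src.addAttribute(TermAttribute.class);
 *   // a second call with the same interface returns the very same instance
 *   assert termAtt == src.addAttribute(TermAttribute.class);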

    Signature for Java 1.5: public <T extends Attribute> T addAttribute(Class<T>) */ public Attribute addAttribute(Class attClass) { final Attribute att = (Attribute) attributes.get(attClass); if (att == null) { if (!(attClass.isInterface() && Attribute.class.isAssignableFrom(attClass))) { throw new IllegalArgumentException( "addAttribute() only accepts an interface that extends Attribute, but " + attClass.getName() + " does not fulfil this contract." ); } final AttributeImpl attImpl = this.factory.createAttributeInstance(attClass); addAttributeImpl(attImpl); return attImpl; } else { return att; } } /** Returns true, iff this AttributeSource has any attributes */ public boolean hasAttributes() { return !this.attributes.isEmpty(); } /** * The caller must pass in a Class<? extends Attribute> value. * Returns true, iff this AttributeSource contains the passed-in Attribute. *

    Signature for Java 1.5: public boolean hasAttribute(Class<? extends Attribute>) */ public boolean hasAttribute(Class attClass) { return this.attributes.containsKey(attClass); } /** * The caller must pass in a Class<? extends Attribute> value. * Returns the instance of the passed in Attribute contained in this AttributeSource *

    Signature for Java 1.5: public <T extends Attribute> T getAttribute(Class<T>) * * @throws IllegalArgumentException if this AttributeSource does not contain the * Attribute. It is recommended to always use {@link #addAttribute} even in consumers * of TokenStreams, because you cannot know if a specific TokenStream really uses * a specific Attribute. {@link #addAttribute} will automatically make the attribute * available. If you want to only use the attribute, if it is available (to optimize * consuming), use {@link #hasAttribute}. */ public Attribute getAttribute(Class attClass) { final Attribute att = (Attribute) this.attributes.get(attClass); if (att == null) { throw new IllegalArgumentException("This AttributeSource does not have the attribute '" + attClass.getName() + "'."); } return att; } /** * This class holds the state of an AttributeSource. * @see #captureState * @see #restoreState */ public static final class State implements Cloneable { private AttributeImpl attribute; private State next; public Object clone() { State clone = new State(); clone.attribute = (AttributeImpl) attribute.clone(); if (next != null) { clone.next = (State) next.clone(); } return clone; } } private State currentState = null; private void computeCurrentState() { currentState = new State(); State c = currentState; Iterator it = attributeImpls.values().iterator(); c.attribute = (AttributeImpl) it.next(); while (it.hasNext()) { c.next = new State(); c = c.next; c.attribute = (AttributeImpl) it.next(); } } /** * Resets all Attributes in this AttributeSource by calling * {@link AttributeImpl#clear()} on each Attribute implementation. */ public void clearAttributes() { if (hasAttributes()) { if (currentState == null) { computeCurrentState(); } for (State state = currentState; state != null; state = state.next) { state.attribute.clear(); } } } /** * Captures the state of all Attributes. The return value can be passed to * {@link #restoreState} to restore the state of this or another AttributeSource. */ public State captureState() { if (!hasAttributes()) { return null; } if (currentState == null) { computeCurrentState(); } return (State) this.currentState.clone(); } /** * Restores this state by copying the values of all attribute implementations * that this state contains into the attributes implementations of the targetStream. * The targetStream must contain a corresponding instance for each argument * contained in this state (e.g. it is not possible to restore the state of * an AttributeSource containing a TermAttribute into a AttributeSource using * a Token instance as implementation). *
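 * An illustrative capture/restore sketch (not part of the original javadoc):
 *
 *   AttributeSource.State saved = source.captureState();
 *   // ... the attribute values are mutated while the stream is consumed ...
 *   source.restoreState(saved);   // copies the saved values back into the attributes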

    * Note that this method does not affect attributes of the targetStream * that are not contained in this state. In other words, if for example * the targetStream contains an OffsetAttribute, but this state doesn't, then * the value of the OffsetAttribute remains unchanged. It might be desirable to * reset its value to the default, in which case the caller should first * call {@link TokenStream#clearAttributes()} on the targetStream. */ public void restoreState(State state) { if (state == null) return; do { AttributeImpl targetImpl = (AttributeImpl) attributeImpls.get(state.attribute.getClass()); if (targetImpl == null) throw new IllegalArgumentException("State contains an AttributeImpl that is not in this AttributeSource"); state.attribute.copyTo(targetImpl); state = state.next; } while (state != null); } public int hashCode() { int code = 0; if (hasAttributes()) { if (currentState == null) { computeCurrentState(); } for (State state = currentState; state != null; state = state.next) { code = code * 31 + state.attribute.hashCode(); } } return code; } public boolean equals(Object obj) { if (obj == this) { return true; } if (obj instanceof AttributeSource) { AttributeSource other = (AttributeSource) obj; if (hasAttributes()) { if (!other.hasAttributes()) { return false; } if (this.attributeImpls.size() != other.attributeImpls.size()) { return false; } // it is only equal if all attribute impls are the same in the same order if (this.currentState == null) { this.computeCurrentState(); } State thisState = this.currentState; if (other.currentState == null) { other.computeCurrentState(); } State otherState = other.currentState; while (thisState != null && otherState != null) { if (otherState.attribute.getClass() != thisState.attribute.getClass() || !otherState.attribute.equals(thisState.attribute)) { return false; } thisState = thisState.next; otherState = otherState.next; } return true; } else { return !other.hasAttributes(); } } else return false; } public String toString() { StringBuffer sb = new StringBuffer(); sb.append('('); if (hasAttributes()) { if (currentState == null) { computeCurrentState(); } for (State state = currentState; state != null; state = state.next) { if (state != currentState) sb.append(','); sb.append(state.attribute.toString()); } } sb.append(')'); return sb.toString(); } /** * Performs a clone of all {@link AttributeImpl} instances returned in a new * AttributeSource instance. This method can be used to e.g. create another TokenStream * with exactly the same attributes (using {@link #AttributeSource(AttributeSource)}) */ public AttributeSource cloneAttributes() { AttributeSource clone = new AttributeSource(this.factory); // first clone the impls if (hasAttributes()) { if (currentState == null) { computeCurrentState(); } for (State state = currentState; state != null; state = state.next) { clone.attributeImpls.put(state.attribute.getClass(), state.attribute.clone()); } } // now the interfaces Iterator/*, AttributeImpl>>*/ attIt = this.attributes.entrySet().iterator(); while (attIt.hasNext()) { Entry/*, AttributeImpl>*/ entry = (Entry/*, AttributeImpl>*/) attIt.next(); clone.attributes.put(entry.getKey(), clone.attributeImpls.get(entry.getValue().getClass())); } return clone; } } lucene-2.9.4/src/java/org/apache/lucene/util/OpenBitSetDISI.java0000644000175000017500000000655411474320230024761 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.search.DocIdSetIterator; public class OpenBitSetDISI extends OpenBitSet { /** Construct an OpenBitSetDISI with its bits set * from the doc ids of the given DocIdSetIterator. * Also give a maximum size one larger than the largest doc id for which a * bit may ever be set on this OpenBitSetDISI. */ public OpenBitSetDISI(DocIdSetIterator disi, int maxSize) throws IOException { super(maxSize); inPlaceOr(disi); } /** Construct an OpenBitSetDISI with no bits set, and a given maximum size * one larger than the largest doc id for which a bit may ever be set * on this OpenBitSetDISI. */ public OpenBitSetDISI(int maxSize) { super(maxSize); } /** * Perform an inplace OR with the doc ids from a given DocIdSetIterator, * setting the bit for each such doc id. * These doc ids should be smaller than the maximum size passed to the * constructor. */ public void inPlaceOr(DocIdSetIterator disi) throws IOException { int doc; long size = size(); while ((doc = disi.nextDoc()) < size) { fastSet(doc); } } /** * Perform an inplace AND with the doc ids from a given DocIdSetIterator, * leaving only the bits set for which the doc ids are in common. * These doc ids should be smaller than the maximum size passed to the * constructor. */ public void inPlaceAnd(DocIdSetIterator disi) throws IOException { int bitSetDoc = nextSetBit(0); int disiDoc; while (bitSetDoc != -1 && (disiDoc = disi.advance(bitSetDoc)) != DocIdSetIterator.NO_MORE_DOCS) { clear(bitSetDoc, disiDoc); bitSetDoc = nextSetBit(disiDoc + 1); } if (bitSetDoc != -1) { clear(bitSetDoc, size()); } } /** * Perform an inplace NOT with the doc ids from a given DocIdSetIterator, * clearing all the bits for each such doc id. * These doc ids should be smaller than the maximum size passed to the * constructor. */ public void inPlaceNot(DocIdSetIterator disi) throws IOException { int doc; long size = size(); while ((doc = disi.nextDoc()) < size) { fastClear(doc); } } /** * Perform an inplace XOR with the doc ids from a given DocIdSetIterator, * flipping all the bits for each such doc id. * These doc ids should be smaller than the maximum size passed to the * constructor. */ public void inPlaceXor(DocIdSetIterator disi) throws IOException { int doc; long size = size(); while ((doc = disi.nextDoc()) < size) { fastFlip(doc); } } } lucene-2.9.4/src/java/org/apache/lucene/util/Attribute.java0000644000175000017500000000161611474320230024231 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Base interface for attributes. */ public interface Attribute { } lucene-2.9.4/src/java/org/apache/lucene/util/MemoryModel.java0000644000175000017500000000263511474320230024521 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with this * work for additional information regarding copyright ownership. The ASF * licenses this file to You under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ /** * Returns primitive memory sizes for estimating RAM usage. * */ public abstract class MemoryModel { /** * @return size of array beyond contents */ public abstract int getArraySize(); /** * @return Class size overhead */ public abstract int getClassSize(); /** * @param clazz a primitive Class - bool, byte, char, short, long, float, * short, double, int * @return the size in bytes of given primitive Class */ public abstract int getPrimitiveSize(Class clazz); /** * @return size of reference */ public abstract int getReferenceSize(); } lucene-2.9.4/src/java/org/apache/lucene/util/SmallFloat.java0000644000175000017500000001042511474320231024323 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Floating point numbers smaller than 32 bits. * * @version $Id$ */ public class SmallFloat { /** Converts a 32 bit float to an 8 bit float. *
 * Values less than zero are all mapped to zero.
 * Values are truncated (rounded down) to the nearest 8 bit value.
    Values between zero and the smallest representable value * are rounded up. * * @param f the 32 bit float to be converted to an 8 bit float (byte) * @param numMantissaBits the number of mantissa bits to use in the byte, with the remainder to be used in the exponent * @param zeroExp the zero-point in the range of exponent values * @return the 8 bit float representation */ public static byte floatToByte(float f, int numMantissaBits, int zeroExp) { // Adjustment from a float zero exponent to our zero exponent, // shifted over to our exponent position. int fzero = (63-zeroExp)<> (24-numMantissaBits); if (smallfloat < fzero) { return (bits<=0) ? (byte)0 // negative numbers and zero both map to 0 byte :(byte)1; // underflow is mapped to smallest non-zero number. } else if (smallfloat >= fzero + 0x100) { return -1; // overflow maps to largest number } else { return (byte)(smallfloat - fzero); } } /** Converts an 8 bit float to a 32 bit float. */ public static float byteToFloat(byte b, int numMantissaBits, int zeroExp) { // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup // is only a little bit faster (anywhere from 0% to 7%) if (b == 0) return 0.0f; int bits = (b&0xff) << (24-numMantissaBits); bits += (63-zeroExp) << 24; return Float.intBitsToFloat(bits); } // // Some specializations of the generic functions follow. // The generic functions are just as fast with current (1.5) // -server JVMs, but still slower with client JVMs. // /** floatToByte(b, mantissaBits=3, zeroExponent=15) *
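 * (Illustrative worked example, not part of the original javadoc:
 *  floatToByte315(1.0f) returns 124, and byte315ToFloat((byte) 124)
 *  recovers exactly 1.0f.)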
 * smallest non-zero value = 5.820766E-10
 * largest value = 7.5161928E9
    epsilon = 0.125 */ public static byte floatToByte315(float f) { int bits = Float.floatToRawIntBits(f); int smallfloat = bits >> (24-3); if (smallfloat < (63-15)<<3) { return (bits<=0) ? (byte)0 : (byte)1; } if (smallfloat >= ((63-15)<<3) + 0x100) { return -1; } return (byte)(smallfloat - ((63-15)<<3)); } /** byteToFloat(b, mantissaBits=3, zeroExponent=15) */ public static float byte315ToFloat(byte b) { // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup // is only a little bit faster (anywhere from 0% to 7%) if (b == 0) return 0.0f; int bits = (b&0xff) << (24-3); bits += (63-15) << 24; return Float.intBitsToFloat(bits); } /** floatToByte(b, mantissaBits=5, zeroExponent=2) *
 * smallest nonzero value = 0.033203125
 * largest value = 1984.0
    epsilon = 0.03125 */ public static byte floatToByte52(float f) { int bits = Float.floatToRawIntBits(f); int smallfloat = bits >> (24-5); if (smallfloat < (63-2)<<5) { return (bits<=0) ? (byte)0 : (byte)1; } if (smallfloat >= ((63-2)<<5) + 0x100) { return -1; } return (byte)(smallfloat - ((63-2)<<5)); } /** byteToFloat(b, mantissaBits=5, zeroExponent=2) */ public static float byte52ToFloat(byte b) { // on Java1.5 & 1.6 JVMs, prebuilding a decoding array and doing a lookup // is only a little bit faster (anywhere from 0% to 7%) if (b == 0) return 0.0f; int bits = (b&0xff) << (24-5); bits += (63-2) << 24; return Float.intBitsToFloat(bits); } } lucene-2.9.4/src/java/org/apache/lucene/util/UnicodeUtil.java0000644000175000017500000003533211474320231024515 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * Some of this code came from the excellent Unicode * conversion examples from: * * http://www.unicode.org/Public/PROGRAMS/CVTUTF * * Full Copyright for that code follows: */ /* * Copyright 2001-2004 Unicode, Inc. * * Disclaimer * * This source code is provided as is by Unicode, Inc. No claims are * made as to fitness for any particular purpose. No warranties of any * kind are expressed or implied. The recipient agrees to determine * applicability of information provided. If this file has been * purchased on magnetic or optical media from Unicode, Inc., the * sole remedy for any claim will be exchange of defective media * within 90 days of receipt. * * Limitations on Rights to Redistribute This Code * * Unicode, Inc. hereby grants the right to freely use the information * supplied in this file in the creation of products supporting the * Unicode Standard, and to make copies of this file in any form * for internal or external distribution as long as this notice * remains attached. */ /** * Class to encode java's UTF16 char[] into UTF8 byte[] * without always allocating a new byte[] as * String.getBytes("UTF-8") does. * *

    WARNING: This API is new and experimental and * may suddenly change.
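     *
     * A minimal round-trip sketch, added for illustration (the text and
     * variable names below are hypothetical; the result objects are intended
     * to be reused across calls to avoid allocations):
     *
     *   char[] text = "abc".toCharArray();
     *   UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result();
     *   UnicodeUtil.UTF16toUTF8(text, 0, text.length, utf8);
     *   // utf8.result now holds utf8.length encoded bytes
     *   UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
     *   UnicodeUtil.UTF8toUTF16(utf8.result, 0, utf8.length, utf16);
     *   String roundTripped = new String(utf16.result, 0, utf16.length);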

    */ final public class UnicodeUtil { public static final int UNI_SUR_HIGH_START = 0xD800; public static final int UNI_SUR_HIGH_END = 0xDBFF; public static final int UNI_SUR_LOW_START = 0xDC00; public static final int UNI_SUR_LOW_END = 0xDFFF; public static final int UNI_REPLACEMENT_CHAR = 0xFFFD; private static final long UNI_MAX_BMP = 0x0000FFFF; private static final int HALF_BASE = 0x0010000; private static final long HALF_SHIFT = 10; private static final long HALF_MASK = 0x3FFL; public static final class UTF8Result { public byte[] result = new byte[10]; public int length; public void setLength(int newLength) { if (result.length < newLength) { byte[] newArray = new byte[(int) (1.5*newLength)]; System.arraycopy(result, 0, newArray, 0, length); result = newArray; } length = newLength; } } public static final class UTF16Result { public char[] result = new char[10]; public int[] offsets = new int[10]; public int length; public void setLength(int newLength) { if (result.length < newLength) { char[] newArray = new char[(int) (1.5*newLength)]; System.arraycopy(result, 0, newArray, 0, length); result = newArray; } length = newLength; } public void copyText(UTF16Result other) { setLength(other.length); System.arraycopy(other.result, 0, result, 0, length); } } /** Encode characters from a char[] source, starting at * offset and stopping when the character 0xffff is seen. * Returns the number of bytes written to bytesOut. */ public static void UTF16toUTF8(final char[] source, final int offset, UTF8Result result) { int upto = 0; int i = offset; byte[] out = result.result; while(true) { final int code = (int) source[i++]; if (upto+4 > out.length) { byte[] newOut = new byte[2*out.length]; assert newOut.length >= upto+4; System.arraycopy(out, 0, newOut, 0, upto); result.result = out = newOut; } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { out[upto++] = (byte) (0xC0 | (code >> 6)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else if (code < 0xD800 || code > 0xDFFF) { if (code == 0xffff) // END break; out[upto++] = (byte)(0xE0 | (code >> 12)); out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else { // surrogate pair // confirm valid high surrogate if (code < 0xDC00 && source[i] != 0xffff) { int utf32 = (int) source[i]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); i++; out[upto++] = (byte)(0xF0 | (utf32 >> 18)); out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); continue; } } // replace unpaired surrogate or out-of-order low surrogate // with substitution character out[upto++] = (byte) 0xEF; out[upto++] = (byte) 0xBF; out[upto++] = (byte) 0xBD; } } //assert matches(source, offset, i-offset-1, out, upto); result.length = upto; } /** Encode characters from a char[] source, starting at * offset for length chars. Returns the number of bytes * written to bytesOut. 
*/ public static void UTF16toUTF8(final char[] source, final int offset, final int length, UTF8Result result) { int upto = 0; int i = offset; final int end = offset + length; byte[] out = result.result; while(i < end) { final int code = (int) source[i++]; if (upto+4 > out.length) { byte[] newOut = new byte[2*out.length]; assert newOut.length >= upto+4; System.arraycopy(out, 0, newOut, 0, upto); result.result = out = newOut; } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { out[upto++] = (byte) (0xC0 | (code >> 6)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else if (code < 0xD800 || code > 0xDFFF) { out[upto++] = (byte)(0xE0 | (code >> 12)); out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else { // surrogate pair // confirm valid high surrogate if (code < 0xDC00 && i < end && source[i] != 0xffff) { int utf32 = (int) source[i]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); i++; out[upto++] = (byte)(0xF0 | (utf32 >> 18)); out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); continue; } } // replace unpaired surrogate or out-of-order low surrogate // with substitution character out[upto++] = (byte) 0xEF; out[upto++] = (byte) 0xBF; out[upto++] = (byte) 0xBD; } } //assert matches(source, offset, length, out, upto); result.length = upto; } /** Encode characters from this String, starting at offset * for length characters. Returns the number of bytes * written to bytesOut. */ public static void UTF16toUTF8(final String s, final int offset, final int length, UTF8Result result) { final int end = offset + length; byte[] out = result.result; int upto = 0; for(int i=offset;i out.length) { byte[] newOut = new byte[2*out.length]; assert newOut.length >= upto+4; System.arraycopy(out, 0, newOut, 0, upto); result.result = out = newOut; } if (code < 0x80) out[upto++] = (byte) code; else if (code < 0x800) { out[upto++] = (byte) (0xC0 | (code >> 6)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else if (code < 0xD800 || code > 0xDFFF) { out[upto++] = (byte)(0xE0 | (code >> 12)); out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (code & 0x3F)); } else { // surrogate pair // confirm valid high surrogate if (code < 0xDC00 && (i < end-1)) { int utf32 = (int) s.charAt(i+1); // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); i++; out[upto++] = (byte)(0xF0 | (utf32 >> 18)); out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); continue; } } // replace unpaired surrogate or out-of-order low surrogate // with substitution character out[upto++] = (byte) 0xEF; out[upto++] = (byte) 0xBF; out[upto++] = (byte) 0xBD; } } //assert matches(s, offset, length, out, upto); result.length = upto; } /** Convert UTF8 bytes into UTF16 characters. If offset * is non-zero, conversion starts at that starting point * in utf8, re-using the results from the previous call * up until offset. 
*/ public static void UTF8toUTF16(final byte[] utf8, final int offset, final int length, final UTF16Result result) { final int end = offset + length; char[] out = result.result; if (result.offsets.length <= end) { int[] newOffsets = new int[2*end]; System.arraycopy(result.offsets, 0, newOffsets, 0, result.offsets.length); result.offsets = newOffsets; } final int[] offsets = result.offsets; // If incremental decoding fell in the middle of a // single unicode character, rollback to its start: int upto = offset; while(offsets[upto] == -1) upto--; int outUpto = offsets[upto]; // Pre-allocate for worst case 1-for-1 if (outUpto+length >= out.length) { char[] newOut = new char[2*(outUpto+length)]; System.arraycopy(out, 0, newOut, 0, outUpto); result.result = out = newOut; } while (upto < end) { final int b = utf8[upto]&0xff; final int ch; offsets[upto++] = outUpto; if (b < 0xc0) { assert b < 0x80; ch = b; } else if (b < 0xe0) { ch = ((b&0x1f)<<6) + (utf8[upto]&0x3f); offsets[upto++] = -1; } else if (b < 0xf0) { ch = ((b&0xf)<<12) + ((utf8[upto]&0x3f)<<6) + (utf8[upto+1]&0x3f); offsets[upto++] = -1; offsets[upto++] = -1; } else { assert b < 0xf8; ch = ((b&0x7)<<18) + ((utf8[upto]&0x3f)<<12) + ((utf8[upto+1]&0x3f)<<6) + (utf8[upto+2]&0x3f); offsets[upto++] = -1; offsets[upto++] = -1; offsets[upto++] = -1; } if (ch <= UNI_MAX_BMP) { // target is a character <= 0xFFFF out[outUpto++] = (char) ch; } else { // target is a character in range 0xFFFF - 0x10FFFF final int chHalf = ch - HALF_BASE; out[outUpto++] = (char) ((chHalf >> HALF_SHIFT) + UNI_SUR_HIGH_START); out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START); } } offsets[upto] = outUpto; result.length = outUpto; } // Only called from assert /* private static boolean matches(char[] source, int offset, int length, byte[] result, int upto) { try { String s1 = new String(source, offset, length); String s2 = new String(result, 0, upto, "UTF-8"); if (!s1.equals(s2)) { //System.out.println("DIFF: s1 len=" + s1.length()); //for(int i=0;i= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { if (i < size-1) { i++; char nextCH = s.charAt(i); if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) { // Valid surrogate pair } else // Unmatched high surrogate return false; } else // Unmatched high surrogate return false; } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) // Unmatched low surrogate return false; } return true; } public static final boolean validUTF16String(char[] s, int size) { for(int i=0;i= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { if (i < size-1) { i++; char nextCH = s[i]; if (nextCH >= UNI_SUR_LOW_START && nextCH <= UNI_SUR_LOW_END) { // Valid surrogate pair } else return false; } else return false; } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) // Unmatched low surrogate return false; } return true; } */ } lucene-2.9.4/src/java/org/apache/lucene/util/CloseableThreadLocal.java0000644000175000017500000000566211474320230026267 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Map; import java.util.HashMap; import java.util.Iterator; import java.lang.ref.WeakReference; /** Java's builtin ThreadLocal has a serious flaw: * it can take an arbitrarily long amount of time to * dereference the things you had stored in it, even once the * ThreadLocal instance itself is no longer referenced. * This is because there is single, master map stored for * each thread, which all ThreadLocals share, and that * master map only periodically purges "stale" entries. * * While not technically a memory leak, because eventually * the memory will be reclaimed, it can take a long time * and you can easily hit OutOfMemoryError because from the * GC's standpoint the stale entries are not reclaimable. * * This class works around that, by only enrolling * WeakReference values into the ThreadLocal, and * separately holding a hard reference to each stored * value. When you call {@link #close}, these hard * references are cleared and then GC is freely able to * reclaim space by objects stored in it. */ public class CloseableThreadLocal { private ThreadLocal t = new ThreadLocal(); private Map hardRefs = new HashMap(); protected Object initialValue() { return null; } public Object get() { WeakReference weakRef = (WeakReference) t.get(); if (weakRef == null) { Object iv = initialValue(); if (iv != null) { set(iv); return iv; } else return null; } else { return weakRef.get(); } } public void set(Object object) { t.set(new WeakReference(object)); synchronized(hardRefs) { hardRefs.put(Thread.currentThread(), object); // Purge dead threads Iterator it = hardRefs.keySet().iterator(); while(it.hasNext()) { Thread t = (Thread) it.next(); if (!t.isAlive()) it.remove(); } } } public void close() { // Clear the hard refs; then, the only remaining refs to // all values we were storing are weak (unless somewhere // else is still using them) and so GC may reclaim them: hardRefs = null; t = null; } } lucene-2.9.4/src/java/org/apache/lucene/util/SorterTemplate.java0000644000175000017500000001160211474320230025234 0ustar janpascaljanpascalpackage org.apache.lucene.util; /* * Copyright 2003 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Borrowed from Cglib. Allows custom swap so that two arrays can be sorted * at the same time. 
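 *
 * An illustrative sketch (the arrays and names here are hypothetical, not part
 * of the library) of sorting two parallel arrays together by overriding
 * swap and compare; quickSort takes inclusive bounds:
 *
 *   final int[] keys = { 3, 1, 2 };
 *   final String[] tags = { "c", "a", "b" };
 *   new SorterTemplate() {
 *     protected void swap(int i, int j) {
 *       int k = keys[i]; keys[i] = keys[j]; keys[j] = k;
 *       String t = tags[i]; tags[i] = tags[j]; tags[j] = t;
 *     }
 *     protected int compare(int i, int j) { return keys[i] - keys[j]; }
 *   }.quickSort(0, keys.length - 1);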
*/ public abstract class SorterTemplate { private static final int MERGESORT_THRESHOLD = 12; private static final int QUICKSORT_THRESHOLD = 7; abstract protected void swap(int i, int j); abstract protected int compare(int i, int j); public void quickSort(int lo, int hi) { quickSortHelper(lo, hi); insertionSort(lo, hi); } private void quickSortHelper(int lo, int hi) { for (;;) { int diff = hi - lo; if (diff <= QUICKSORT_THRESHOLD) { break; } int i = (hi + lo) / 2; if (compare(lo, i) > 0) { swap(lo, i); } if (compare(lo, hi) > 0) { swap(lo, hi); } if (compare(i, hi) > 0) { swap(i, hi); } int j = hi - 1; swap(i, j); i = lo; int v = j; for (;;) { while (compare(++i, v) < 0) { /* nothing */; } while (compare(--j, v) > 0) { /* nothing */; } if (j < i) { break; } swap(i, j); } swap(i, hi - 1); if (j - lo <= hi - i + 1) { quickSortHelper(lo, j); lo = i + 1; } else { quickSortHelper(i + 1, hi); hi = j; } } } private void insertionSort(int lo, int hi) { for (int i = lo + 1 ; i <= hi; i++) { for (int j = i; j > lo; j--) { if (compare(j - 1, j) > 0) { swap(j - 1, j); } else { break; } } } } protected void mergeSort(int lo, int hi) { int diff = hi - lo; if (diff <= MERGESORT_THRESHOLD) { insertionSort(lo, hi); return; } int mid = lo + diff / 2; mergeSort(lo, mid); mergeSort(mid, hi); merge(lo, mid, hi, mid - lo, hi - mid); } private void merge(int lo, int pivot, int hi, int len1, int len2) { if (len1 == 0 || len2 == 0) { return; } if (len1 + len2 == 2) { if (compare(pivot, lo) < 0) { swap(pivot, lo); } return; } int first_cut, second_cut; int len11, len22; if (len1 > len2) { len11 = len1 / 2; first_cut = lo + len11; second_cut = lower(pivot, hi, first_cut); len22 = second_cut - pivot; } else { len22 = len2 / 2; second_cut = pivot + len22; first_cut = upper(lo, pivot, second_cut); len11 = first_cut - lo; } rotate(first_cut, pivot, second_cut); int new_mid = first_cut + len22; merge(lo, first_cut, new_mid, len11, len22); merge(new_mid, second_cut, hi, len1 - len11, len2 - len22); } private void rotate(int lo, int mid, int hi) { int lot = lo; int hit = mid - 1; while (lot < hit) { swap(lot++, hit--); } lot = mid; hit = hi - 1; while (lot < hit) { swap(lot++, hit--); } lot = lo; hit = hi - 1; while (lot < hit) { swap(lot++, hit--); } } private int lower(int lo, int hi, int val) { int len = hi - lo; while (len > 0) { int half = len / 2; int mid= lo + half; if (compare(mid, val) < 0) { lo = mid + 1; len = len - half -1; } else { len = half; } } return lo; } private int upper(int lo, int hi, int val) { int len = hi - lo; while (len > 0) { int half = len / 2; int mid = lo + half; if (compare(val, mid) < 0) { len = half; } else { lo = mid + 1; len = len - half -1; } } return lo; } } lucene-2.9.4/src/java/org/apache/lucene/util/OpenBitSetIterator.java0000644000175000017500000001635311474320231026021 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util; import org.apache.lucene.search.DocIdSetIterator; /** An iterator to iterate over set bits in an OpenBitSet. * This is faster than nextSetBit() for iterating over the complete set of bits, * especially when the density of the bits set is high. * * @version $Id$ */ public class OpenBitSetIterator extends DocIdSetIterator { // The General Idea: instead of having an array per byte that has // the offsets of the next set bit, that array could be // packed inside a 32 bit integer (8 4 bit numbers). That // should be faster than accessing an array for each index, and // the total array size is kept smaller (256*sizeof(int))=1K protected final static int[] bitlist={ 0x0, 0x1, 0x2, 0x21, 0x3, 0x31, 0x32, 0x321, 0x4, 0x41, 0x42, 0x421, 0x43, 0x431, 0x432, 0x4321, 0x5, 0x51, 0x52, 0x521, 0x53, 0x531, 0x532, 0x5321, 0x54, 0x541, 0x542, 0x5421, 0x543, 0x5431, 0x5432, 0x54321, 0x6, 0x61, 0x62, 0x621, 0x63, 0x631, 0x632, 0x6321, 0x64, 0x641, 0x642, 0x6421, 0x643, 0x6431, 0x6432, 0x64321, 0x65, 0x651, 0x652, 0x6521, 0x653, 0x6531, 0x6532, 0x65321, 0x654, 0x6541, 0x6542, 0x65421, 0x6543, 0x65431, 0x65432, 0x654321, 0x7, 0x71, 0x72, 0x721, 0x73, 0x731, 0x732, 0x7321, 0x74, 0x741, 0x742, 0x7421, 0x743, 0x7431, 0x7432, 0x74321, 0x75, 0x751, 0x752, 0x7521, 0x753, 0x7531, 0x7532, 0x75321, 0x754, 0x7541, 0x7542, 0x75421, 0x7543, 0x75431, 0x75432, 0x754321, 0x76, 0x761, 0x762, 0x7621, 0x763, 0x7631, 0x7632, 0x76321, 0x764, 0x7641, 0x7642, 0x76421, 0x7643, 0x76431, 0x76432, 0x764321, 0x765, 0x7651, 0x7652, 0x76521, 0x7653, 0x76531, 0x76532, 0x765321, 0x7654, 0x76541, 0x76542, 0x765421, 0x76543, 0x765431, 0x765432, 0x7654321, 0x8, 0x81, 0x82, 0x821, 0x83, 0x831, 0x832, 0x8321, 0x84, 0x841, 0x842, 0x8421, 0x843, 0x8431, 0x8432, 0x84321, 0x85, 0x851, 0x852, 0x8521, 0x853, 0x8531, 0x8532, 0x85321, 0x854, 0x8541, 0x8542, 0x85421, 0x8543, 0x85431, 0x85432, 0x854321, 0x86, 0x861, 0x862, 0x8621, 0x863, 0x8631, 0x8632, 0x86321, 0x864, 0x8641, 0x8642, 0x86421, 0x8643, 0x86431, 0x86432, 0x864321, 0x865, 0x8651, 0x8652, 0x86521, 0x8653, 0x86531, 0x86532, 0x865321, 0x8654, 0x86541, 0x86542, 0x865421, 0x86543, 0x865431, 0x865432, 0x8654321, 0x87, 0x871, 0x872, 0x8721, 0x873, 0x8731, 0x8732, 0x87321, 0x874, 0x8741, 0x8742, 0x87421, 0x8743, 0x87431, 0x87432, 0x874321, 0x875, 0x8751, 0x8752, 0x87521, 0x8753, 0x87531, 0x87532, 0x875321, 0x8754, 0x87541, 0x87542, 0x875421, 0x87543, 0x875431, 0x875432, 0x8754321, 0x876, 0x8761, 0x8762, 0x87621, 0x8763, 0x87631, 0x87632, 0x876321, 0x8764, 0x87641, 0x87642, 0x876421, 0x87643, 0x876431, 0x876432, 0x8764321, 0x8765, 0x87651, 0x87652, 0x876521, 0x87653, 0x876531, 0x876532, 0x8765321, 0x87654, 0x876541, 0x876542, 0x8765421, 0x876543, 0x8765431, 0x8765432, 0x87654321 }; /***** the python code that generated bitlist def bits2int(val): arr=0 for shift in range(8,0,-1): if val & 0x80: arr = (arr << 4) | shift val = val << 1 return arr def int_table(): tbl = [ hex(bits2int(val)).strip('L') for val in range(256) ] return ','.join(tbl) ******/ // hmmm, what about an iterator that finds zeros though, // or a reverse iterator... should they be separate classes // for efficiency, or have a common root interface? (or // maybe both? could ask for a SetBitsIterator, etc... 
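  /* Worked example, added for illustration: the byte 0x92 has bits 1, 4 and 7
   * set, so bitlist[0x92] == 0x852 -- the 1-based bit positions 2, 5 and 8
   * packed into nibbles, with the lowest set bit in the lowest nibble.
   * nextDoc() peels off one nibble at a time via (indexArray & 0x0f) and adds
   * wordShift (initialized to -1 to undo the 1-based encoding), e.g.:
   *
   *   OpenBitSet bits = new OpenBitSet(64);
   *   bits.set(1); bits.set(4); bits.set(7);
   *   OpenBitSetIterator it = new OpenBitSetIterator(bits);
   *   // it.nextDoc() returns 1, then 4, then 7, then NO_MORE_DOCS
   */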
private final long[] arr; private final int words; private int i=-1; private long word; private int wordShift; private int indexArray; private int curDocId = -1; public OpenBitSetIterator(OpenBitSet obs) { this(obs.getBits(), obs.getNumWords()); } public OpenBitSetIterator(long[] bits, int numWords) { arr = bits; words = numWords; } // 64 bit shifts private void shift() { if ((int)word ==0) {wordShift +=32; word = word >>>32; } if ((word & 0x0000FFFF) == 0) { wordShift +=16; word >>>=16; } if ((word & 0x000000FF) == 0) { wordShift +=8; word >>>=8; } indexArray = bitlist[(int)word & 0xff]; } /***** alternate shift implementations // 32 bit shifts, but a long shift needed at the end private void shift2() { int y = (int)word; if (y==0) {wordShift +=32; y = (int)(word >>>32); } if ((y & 0x0000FFFF) == 0) { wordShift +=16; y>>>=16; } if ((y & 0x000000FF) == 0) { wordShift +=8; y>>>=8; } indexArray = bitlist[y & 0xff]; word >>>= (wordShift +1); } private void shift3() { int lower = (int)word; int lowByte = lower & 0xff; if (lowByte != 0) { indexArray=bitlist[lowByte]; return; } shift(); } ******/ /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() { if (indexArray == 0) { if (word != 0) { word >>>= 8; wordShift += 8; } while (word == 0) { if (++i >= words) { return curDocId = NO_MORE_DOCS; } word = arr[i]; wordShift = -1; // loop invariant code motion should move this } // after the first time, should I go with a linear search, or // stick with the binary search in shift? shift(); } int bitIndex = (indexArray & 0x0f) + wordShift; indexArray >>>= 4; // should i<<6 be cached as a separate variable? // it would only save one cycle in the best circumstances. return curDocId = (i<<6) + bitIndex; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int target) { return advance(target) != NO_MORE_DOCS; } public int advance(int target) { indexArray = 0; i = target >> 6; if (i >= words) { word = 0; // setup so next() will also return -1 return curDocId = NO_MORE_DOCS; } wordShift = target & 0x3f; word = arr[i] >>> wordShift; if (word != 0) { wordShift--; // compensate for 1 based arrIndex } else { while (word == 0) { if (++i >= words) { return curDocId = NO_MORE_DOCS; } word = arr[i]; } wordShift = -1; } shift(); int bitIndex = (indexArray & 0x0f) + wordShift; indexArray >>>= 4; // should i<<6 be cached as a separate variable? // it would only save one cycle in the best circumstances. return curDocId = (i<<6) + bitIndex; } /** @deprecated use {@link #docID()} instead. */ public int doc() { return curDocId; } public int docID() { return curDocId; } } lucene-2.9.4/src/java/org/apache/lucene/util/MapOfSets.java0000644000175000017500000000446511474320230024134 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Set; import java.util.Collection; import java.util.HashSet; import java.util.Map; /** * Helper class for keeping Lists of Objects associated with keys. WARNING: THIS CLASS IS NOT THREAD SAFE */ public class MapOfSets { private final Map theMap; /** * @param m the backing store for this object */ public MapOfSets(Map m) { theMap = m; } /** * @return direct access to the map backing this object. */ public Map getMap() { return theMap; } /** * Adds val to the Set associated with key in the Map. If key is not * already in the map, a new Set will first be created. * @return the size of the Set associated with key once val is added to it. */ public int put(Object key, Object val) { final Set theSet; if (theMap.containsKey(key)) { theSet = (Set)theMap.get(key); } else { theSet = new HashSet(23); theMap.put(key, theSet); } theSet.add(val); return theSet.size(); } /** * Adds multiple vals to the Set associated with key in the Map. * If key is not * already in the map, a new Set will first be created. * @return the size of the Set associated with key once val is added to it. */ public int putAll(Object key, Collection vals) { final Set theSet; if (theMap.containsKey(key)) { theSet = (Set)theMap.get(key); } else { theSet = new HashSet(23); theMap.put(key, theSet); } theSet.addAll(vals); return theSet.size(); } } lucene-2.9.4/src/java/org/apache/lucene/util/PriorityQueue.java0000644000175000017500000002236011474320231025114 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** A PriorityQueue maintains a partial ordering of its elements such that the * least element can always be found in constant time. Put()'s and pop()'s * require log(size) time. * *
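 * A minimal subclass sketch, added for illustration (the class and variable
 * names are hypothetical):
 *
 *   class IntQueue extends PriorityQueue {
 *     IntQueue(int maxSize) { initialize(maxSize); }
 *     protected boolean lessThan(Object a, Object b) {
 *       return ((Integer) a).intValue() < ((Integer) b).intValue();
 *     }
 *   }
 *   PriorityQueue pq = new IntQueue(10);
 *   pq.add(new Integer(42));
 *   Integer smallest = (Integer) pq.top();
 *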

    NOTE: This class pre-allocates a full array of * length maxSize+1, in {@link #initialize}. * */ public abstract class PriorityQueue { private int size; private int maxSize; protected Object[] heap; /** Determines the ordering of objects in this priority queue. Subclasses must define this one method. */ protected abstract boolean lessThan(Object a, Object b); /** * This method can be overridden by extending classes to return a sentinel * object which will be used by {@link #initialize(int)} to fill the queue, so * that the code which uses that queue can always assume it's full and only * change the top without attempting to insert any new object.
    * * Those sentinel values should always compare worse than any non-sentinel * value (i.e., {@link #lessThan(Object, Object)} should always favor the * non-sentinel values).
    * * By default, this method returns null, which means the queue will not be * filled with sentinel values. Otherwise, the value returned will be used to * pre-populate the queue with sentinel values.
    * * If this method is extended to return a non-null value, then the following * usage pattern is recommended: * *

       * // extends getSentinelObject() to return a non-null value.
       * PriorityQueue pq = new MyQueue(numHits);
       * // save the 'top' element, which is guaranteed to not be null.
       * MyObject pqTop = (MyObject) pq.top();
       * <...>
       * // now in order to add a new element, which is 'better' than top (after 
       * // you've verified it is better), it is as simple as:
       * pqTop.change();
       * pqTop = pq.updateTop();
       * 
    * * NOTE: if this method returns a non-null value, it will be called by * {@link #initialize(int)} {@link #size()} times, relying on a new object to * be returned and will not check if it's null again. Therefore you should * ensure any call to this method creates a new instance and behaves * consistently, e.g., it cannot return null if it previously returned * non-null. * * @return the sentinel object to use to pre-populate the queue, or null if * sentinel objects are not supported. */ protected Object getSentinelObject() { return null; } /** Subclass constructors must call this. */ protected final void initialize(int maxSize) { size = 0; int heapSize; if (0 == maxSize) // We allocate 1 extra to avoid if statement in top() heapSize = 2; else { if (maxSize == Integer.MAX_VALUE) { // Don't wrap heapSize to -1, in this case, which // causes a confusing NegativeArraySizeException. // Note that very likely this will simply then hit // an OOME, but at least that's more indicative to // caller that this values is too big. We don't +1 // in this case, but it's very unlikely in practice // one will actually insert this many objects into // the PQ: heapSize = Integer.MAX_VALUE; } else { // NOTE: we add +1 because all access to heap is // 1-based not 0-based. heap[0] is unused. heapSize = maxSize + 1; } } heap = new Object[heapSize]; this.maxSize = maxSize; // If sentinel objects are supported, populate the queue with them Object sentinel = getSentinelObject(); if (sentinel != null) { heap[1] = sentinel; for (int i = 2; i < heap.length; i++) { heap[i] = getSentinelObject(); } size = maxSize; } } /** * Adds an Object to a PriorityQueue in log(size) time. If one tries to add * more objects than maxSize from initialize a RuntimeException * (ArrayIndexOutOfBound) is thrown. * * @deprecated use {@link #add(Object)} which returns the new top object, * saving an additional call to {@link #top()}. */ public final void put(Object element) { size++; heap[size] = element; upHeap(); } /** * Adds an Object to a PriorityQueue in log(size) time. If one tries to add * more objects than maxSize from initialize an * {@link ArrayIndexOutOfBoundsException} is thrown. * * @return the new 'top' element in the queue. */ public final Object add(Object element) { size++; heap[size] = element; upHeap(); return heap[1]; } /** * Adds element to the PriorityQueue in log(size) time if either the * PriorityQueue is not full, or not lessThan(element, top()). * * @param element * @return true if element is added, false otherwise. * @deprecated use {@link #insertWithOverflow(Object)} instead, which * encourages objects reuse. */ public boolean insert(Object element) { return insertWithOverflow(element) != element; } /** * insertWithOverflow() is the same as insert() except its * return value: it returns the object (if any) that was * dropped off the heap because it was full. This can be * the given parameter (in case it is smaller than the * full heap's minimum, and couldn't be added), or another * object that was previously the smallest value in the * heap and now has been replaced by a larger one, or null * if the queue wasn't yet full with maxSize elements. */ public Object insertWithOverflow(Object element) { if (size < maxSize) { put(element); return null; } else if (size > 0 && !lessThan(element, heap[1])) { Object ret = heap[1]; heap[1] = element; adjustTop(); return ret; } else { return element; } } /** Returns the least element of the PriorityQueue in constant time. 
*/ public final Object top() { // We don't need to check size here: if maxSize is 0, // then heap is length 2 array with both entries null. // If size is 0 then heap[1] is already null. return heap[1]; } /** Removes and returns the least element of the PriorityQueue in log(size) time. */ public final Object pop() { if (size > 0) { Object result = heap[1]; // save first value heap[1] = heap[size]; // move last to first heap[size] = null; // permit GC of objects size--; downHeap(); // adjust heap return result; } else return null; } /** * Should be called when the Object at top changes values. Still log(n) worst * case, but it's at least twice as fast to * *
       * pq.top().change();
       * pq.adjustTop();
       * 
    * * instead of * *
       * o = pq.pop();
       * o.change();
       * pq.push(o);
       * 
    * * @deprecated use {@link #updateTop()} which returns the new top element and * saves an additional call to {@link #top()}. */ public final void adjustTop() { downHeap(); } /** * Should be called when the Object at top changes values. Still log(n) worst * case, but it's at least twice as fast to * *
       * pq.top().change();
       * pq.updateTop();
       * 
    * * instead of * *
       * o = pq.pop();
       * o.change();
       * pq.push(o);
       * 
    * * @return the new 'top' element. */ public final Object updateTop() { downHeap(); return heap[1]; } /** Returns the number of elements currently stored in the PriorityQueue. */ public final int size() { return size; } /** Removes all entries from the PriorityQueue. */ public final void clear() { for (int i = 0; i <= size; i++) { heap[i] = null; } size = 0; } private final void upHeap() { int i = size; Object node = heap[i]; // save bottom node int j = i >>> 1; while (j > 0 && lessThan(node, heap[j])) { heap[i] = heap[j]; // shift parents down i = j; j = j >>> 1; } heap[i] = node; // install saved node } private final void downHeap() { int i = 1; Object node = heap[i]; // save top node int j = i << 1; // find smaller child int k = j + 1; if (k <= size && lessThan(heap[k], heap[j])) { j = k; } while (j <= size && lessThan(heap[j], node)) { heap[i] = heap[j]; // shift up child i = j; j = i << 1; k = j + 1; if (k <= size && lessThan(heap[k], heap[j])) { j = k; } } heap[i] = node; // install saved node } } lucene-2.9.4/src/java/org/apache/lucene/util/ToStringUtils.java0000644000175000017500000000214511474320230025056 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Helper methods to ease implementing {@link Object#toString()}. */ public class ToStringUtils { /** for printing boost only if not 1.0 */ public static String boost(float boost) { if (boost != 1.0f) { return "^" + Float.toString(boost); } else return ""; } } lucene-2.9.4/src/java/org/apache/lucene/util/AverageGuessMemoryModel.java0000644000175000017500000000407011474320230027016 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.IdentityHashMap; import java.util.Map; /** * An average, best guess, MemoryModel that should work okay on most systems. 
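 *
 * As a worked estimate (for illustration only): under this model a bare
 * object is costed at getClassSize() = 8 bytes, and an int[10] at
 * getArraySize() + 10 * getPrimitiveSize(int.class) = 16 + 10 * 4 = 56 bytes.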
* */ public class AverageGuessMemoryModel extends MemoryModel { // best guess primitive sizes private final Map sizes = new IdentityHashMap() { { put(boolean.class, new Integer(1)); put(byte.class, new Integer(1)); put(char.class, new Integer(2)); put(short.class, new Integer(2)); put(int.class, new Integer(4)); put(float.class, new Integer(4)); put(double.class, new Integer(8)); put(long.class, new Integer(8)); } }; /* * (non-Javadoc) * * @see org.apache.lucene.util.MemoryModel#getArraySize() */ public int getArraySize() { return 16; } /* * (non-Javadoc) * * @see org.apache.lucene.util.MemoryModel#getClassSize() */ public int getClassSize() { return 8; } /* (non-Javadoc) * @see org.apache.lucene.util.MemoryModel#getPrimitiveSize(java.lang.Class) */ public int getPrimitiveSize(Class clazz) { return ((Integer) sizes.get(clazz)).intValue(); } /* (non-Javadoc) * @see org.apache.lucene.util.MemoryModel#getReferenceSize() */ public int getReferenceSize() { return 4; } } lucene-2.9.4/src/java/org/apache/lucene/util/IndexableBinaryStringTools.java0000644000175000017500000003310211474320230027531 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.nio.CharBuffer; import java.nio.ByteBuffer; /** * Provides support for converting byte sequences to Strings and back again. * The resulting Strings preserve the original byte sequences' sort order. * * The Strings are constructed using a Base 8000h encoding of the original * binary data - each char of an encoded String represents a 15-bit chunk * from the byte sequence. Base 8000h was chosen because it allows for all * lower 15 bits of char to be used without restriction; the surrogate range * [U+D8000-U+DFFF] does not represent valid chars, and would require * complicated handling to avoid them and allow use of char's high bit. * * Although unset bits are used as padding in the final char, the original * byte sequence could contain trailing bytes with no set bits (null bytes): * padding is indistinguishable from valid information. To overcome this * problem, a char is appended, indicating the number of encoded bytes in the * final content char. * * This class's operations are defined over CharBuffers and ByteBuffers, to * allow for wrapped arrays to be reused, reducing memory allocation costs for * repeated operations. Note that this class calls array() and arrayOffset() * on the CharBuffers and ByteBuffers it uses, so only wrapped arrays may be * used. This class interprets the arrayOffset() and limit() values returned by * its input buffers as beginning and end+1 positions on the wrapped array, * respectively; similarly, on the output buffer, arrayOffset() is the first * position written to, and limit() is set to one past the final output array * position. 
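 *
 * A round-trip usage sketch, added for illustration (the byte values are
 * arbitrary):
 *
 *   ByteBuffer original = ByteBuffer.wrap(new byte[] { 0x12, 0x34, 0x56 });
 *   CharBuffer encoded = IndexableBinaryStringTools.encode(original);
 *   String indexable = new String(encoded.array(), 0, encoded.limit());
 *   ByteBuffer decoded = IndexableBinaryStringTools.decode(encoded);
 *   // decoded.array() holds 0x12, 0x34, 0x56 again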
*/ public class IndexableBinaryStringTools { private static final CodingCase[] CODING_CASES = { // CodingCase(int initialShift, int finalShift) new CodingCase( 7, 1 ), // CodingCase(int initialShift, int middleShift, int finalShift) new CodingCase(14, 6, 2), new CodingCase(13, 5, 3), new CodingCase(12, 4, 4), new CodingCase(11, 3, 5), new CodingCase(10, 2, 6), new CodingCase( 9, 1, 7), new CodingCase( 8, 0 ) }; // Export only static methods private IndexableBinaryStringTools() {} /** * Returns the number of chars required to encode the given byte sequence. * * @param original The byte sequence to be encoded. Must be backed by an array. * @return The number of chars required to encode the given byte sequence * @throws IllegalArgumentException If the given ByteBuffer is not backed by an array */ public static int getEncodedLength(ByteBuffer original) throws IllegalArgumentException { if (original.hasArray()) { // Use long for intermediaries to protect against overflow long length = (long)(original.limit() - original.arrayOffset()); return (int)((length * 8L + 14L) / 15L) + 1; } else { throw new IllegalArgumentException("original argument must have a backing array"); } } /** * Returns the number of bytes required to decode the given char sequence. * * @param encoded The char sequence to be encoded. Must be backed by an array. * @return The number of bytes required to decode the given char sequence * @throws IllegalArgumentException If the given CharBuffer is not backed by an array */ public static int getDecodedLength(CharBuffer encoded) throws IllegalArgumentException { if (encoded.hasArray()) { int numChars = encoded.limit() - encoded.arrayOffset() - 1; if (numChars <= 0) { return 0; } else { int numFullBytesInFinalChar = encoded.charAt(encoded.limit() - 1); int numEncodedChars = numChars - 1; return (numEncodedChars * 15 + 7) / 8 + numFullBytesInFinalChar; } } else { throw new IllegalArgumentException("encoded argument must have a backing array"); } } /** * Encodes the input byte sequence into the output char sequence. Before * calling this method, ensure that the output CharBuffer has sufficient * capacity by calling {@link #getEncodedLength(java.nio.ByteBuffer)}. * * @param input The byte sequence to encode * @param output Where the char sequence encoding result will go. The limit * is set to one past the position of the final char. 
* @throws IllegalArgumentException If either the input or the output buffer * is not backed by an array */ public static void encode(ByteBuffer input, CharBuffer output) { if (input.hasArray() && output.hasArray()) { byte[] inputArray = input.array(); int inputOffset = input.arrayOffset(); int inputLength = input.limit() - inputOffset; char[] outputArray = output.array(); int outputOffset = output.arrayOffset(); int outputLength = getEncodedLength(input); output.limit(outputOffset + outputLength); // Set output final pos + 1 output.position(0); if (inputLength > 0) { int inputByteNum = inputOffset; int caseNum = 0; int outputCharNum = outputOffset; CodingCase codingCase; for ( ; inputByteNum + CODING_CASES[caseNum].numBytes <= inputLength ; ++outputCharNum ) { codingCase = CODING_CASES[caseNum]; if (2 == codingCase.numBytes) { outputArray[outputCharNum] = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + (((inputArray[inputByteNum + 1] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short)0x7FFF); } else { // numBytes is 3 outputArray[outputCharNum] = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift) + (((inputArray[inputByteNum + 2] & 0xFF) >>> codingCase.finalShift) & codingCase.finalMask) & (short)0x7FFF); } inputByteNum += codingCase.advanceBytes; if (++caseNum == CODING_CASES.length) { caseNum = 0; } } // Produce final char (if any) and trailing count chars. codingCase = CODING_CASES[caseNum]; if (inputByteNum + 1 < inputLength) { // codingCase.numBytes must be 3 outputArray[outputCharNum++] = (char)((((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) + ((inputArray[inputByteNum + 1] & 0xFF) << codingCase.middleShift)) & (short)0x7FFF); // Add trailing char containing the number of full bytes in final char outputArray[outputCharNum++] = (char)1; } else if (inputByteNum < inputLength) { outputArray[outputCharNum++] = (char)(((inputArray[inputByteNum] & 0xFF) << codingCase.initialShift) & (short)0x7FFF); // Add trailing char containing the number of full bytes in final char outputArray[outputCharNum++] = caseNum == 0 ? (char)1 : (char)0; } else { // No left over bits - last char is completely filled. // Add trailing char containing the number of full bytes in final char outputArray[outputCharNum++] = (char)1; } } } else { throw new IllegalArgumentException("Arguments must have backing arrays"); } } /** * Decodes the input char sequence into the output byte sequence. Before * calling this method, ensure that the output ByteBuffer has sufficient * capacity by calling {@link #getDecodedLength(java.nio.CharBuffer)}. * * @param input The char sequence to decode * @param output Where the byte sequence decoding result will go. The limit * is set to one past the position of the final char. 
* @throws IllegalArgumentException If either the input or the output buffer * is not backed by an array */ public static void decode(CharBuffer input, ByteBuffer output) { if (input.hasArray() && output.hasArray()) { int numInputChars = input.limit() - input.arrayOffset() - 1; int numOutputBytes = getDecodedLength(input); output.limit(numOutputBytes + output.arrayOffset()); // Set output final pos + 1 output.position(0); byte[] outputArray = output.array(); char[] inputArray = input.array(); if (numOutputBytes > 0) { int caseNum = 0; int outputByteNum = output.arrayOffset(); int inputCharNum = input.arrayOffset(); short inputChar; CodingCase codingCase; for ( ; inputCharNum < numInputChars - 1 ; ++inputCharNum) { codingCase = CODING_CASES[caseNum]; inputChar = (short)inputArray[inputCharNum]; if (2 == codingCase.numBytes) { if (0 == caseNum) { outputArray[outputByteNum] = (byte)(inputChar >>> codingCase.initialShift); } else { outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); } outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) << codingCase.finalShift); } else { // numBytes is 3 outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) >>> codingCase.middleShift); outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) << codingCase.finalShift); } outputByteNum += codingCase.advanceBytes; if (++caseNum == CODING_CASES.length) { caseNum = 0; } } // Handle final char inputChar = (short)inputArray[inputCharNum]; codingCase = CODING_CASES[caseNum]; if (0 == caseNum) { outputArray[outputByteNum] = 0; } outputArray[outputByteNum] += (byte)(inputChar >>> codingCase.initialShift); int bytesLeft = numOutputBytes - outputByteNum; if (bytesLeft > 1) { if (2 == codingCase.numBytes) { outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.finalMask) >>> codingCase.finalShift); } else { // numBytes is 3 outputArray[outputByteNum + 1] = (byte)((inputChar & codingCase.middleMask) >>> codingCase.middleShift); if (bytesLeft > 2) { outputArray[outputByteNum + 2] = (byte)((inputChar & codingCase.finalMask) << codingCase.finalShift); } } } } } else { throw new IllegalArgumentException("Arguments must have backing arrays"); } } /** * Decodes the given char sequence, which must have been encoded by * {@link #encode(java.nio.ByteBuffer)} or * {@link #encode(java.nio.ByteBuffer, java.nio.CharBuffer)}. * * @param input The char sequence to decode * @return A byte sequence containing the decoding result. The limit * is set to one past the position of the final char. * @throws IllegalArgumentException If the input buffer is not backed by an * array */ public static ByteBuffer decode(CharBuffer input) { byte[] outputArray = new byte[getDecodedLength(input)]; ByteBuffer output = ByteBuffer.wrap(outputArray); decode(input, output); return output; } /** * Encodes the input byte sequence. * * @param input The byte sequence to encode * @return A char sequence containing the encoding result. The limit is set * to one past the position of the final char. 
* @throws IllegalArgumentException If the input buffer is not backed by an * array */ public static CharBuffer encode(ByteBuffer input) { char[] outputArray = new char[getEncodedLength(input)]; CharBuffer output = CharBuffer.wrap(outputArray); encode(input, output); return output; } static class CodingCase { int numBytes, initialShift, middleShift, finalShift, advanceBytes = 2; short middleMask, finalMask; CodingCase(int initialShift, int middleShift, int finalShift) { this.numBytes = 3; this.initialShift = initialShift; this.middleShift = middleShift; this.finalShift = finalShift; this.finalMask = (short)((short)0xFF >>> finalShift); this.middleMask = (short)((short)0xFF << middleShift); } CodingCase(int initialShift, int finalShift) { this.numBytes = 2; this.initialShift = initialShift; this.finalShift = finalShift; this.finalMask = (short)((short)0xFF >>> finalShift); if (finalShift != 0) { advanceBytes = 1; } } } } lucene-2.9.4/src/java/org/apache/lucene/util/DocIdBitSet.java0000644000175000017500000000525311474320231024365 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.BitSet; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; /** Simple DocIdSet and DocIdSetIterator backed by a BitSet */ public class DocIdBitSet extends DocIdSet { private BitSet bitSet; public DocIdBitSet(BitSet bitSet) { this.bitSet = bitSet; } public DocIdSetIterator iterator() { return new DocIdBitSetIterator(bitSet); } /** This DocIdSet implementation is cacheable. */ public boolean isCacheable() { return true; } /** * Returns the underlying BitSet. */ public BitSet getBitSet() { return this.bitSet; } private static class DocIdBitSetIterator extends DocIdSetIterator { private int docId; private BitSet bitSet; DocIdBitSetIterator(BitSet bitSet) { this.bitSet = bitSet; this.docId = -1; } /** @deprecated use {@link #docID()} instead. */ public int doc() { assert docId != -1; return docId; } public int docID() { return docId; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() { // (docId + 1) on next line requires -1 initial value for docNr: return nextDoc() != NO_MORE_DOCS; } public int nextDoc() { // (docId + 1) on next line requires -1 initial value for docNr: int d = bitSet.nextSetBit(docId + 1); // -1 returned by BitSet.nextSetBit() when exhausted docId = d == -1 ? NO_MORE_DOCS : d; return docId; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int skipDocNr) { return advance(skipDocNr) != NO_MORE_DOCS; } public int advance(int target) { int d = bitSet.nextSetBit(target); // -1 returned by BitSet.nextSetBit() when exhausted docId = d == -1 ? 
NO_MORE_DOCS : d; return docId; } } } lucene-2.9.4/src/java/org/apache/lucene/util/NumericUtils.java0000644000175000017500000004405511474320231024716 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs /** * This is a helper class to generate prefix-encoded representations for numerical values * and supplies converters to represent float/double values as sortable integers/longs. * *

    To quickly execute range queries in Apache Lucene, a range is divided recursively * into multiple intervals for searching: The center of the range is searched only with * the lowest possible precision in the trie, while the boundaries are matched * more exactly. This reduces the number of terms dramatically. * *

    This class generates terms to achieve this: First the numerical integer values need to * be converted to strings. For that, integer values (32 bit or 64 bit) are made unsigned * and the bits are converted to ASCII chars, 7 bits per char. The resulting string is * sortable like the original integer value. Each value is also prefixed * (in the first char) by the shift value (number of bits removed) used * during encoding. * *

    To also index floating point numbers, this class supplies two methods to convert them * to integer values by changing their bit layout: {@link #doubleToSortableLong}, * {@link #floatToSortableInt}. You will have no precision loss by * converting floating point numbers to integers and back (only that the integer form * is not directly usable). Other data types like dates can easily be converted to longs or ints (e.g. * date to long: {@link java.util.Date#getTime}). * *

    For easy usage, the trie algorithm is implemented for indexing inside * {@link NumericTokenStream} that can index int, long, * float, and double. For querying, * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part * for the same data types. * *

    This class can also be used to generate lexicographically sortable (according to * {@link String#compareTo(String)}) representations of numeric data types for other * usages (e.g. sorting). * *
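 * A small sketch of the full-precision round trip (the values are
 * illustrative):
 *
 *   String coded = NumericUtils.longToPrefixCoded(1234L);
 *   long restored = NumericUtils.prefixCodedToLong(coded);       // == 1234L
 *   // doubles are first mapped onto sortable longs:
 *   String codedD = NumericUtils.doubleToPrefixCoded(2.5);
 *   double restoredD = NumericUtils.prefixCodedToDouble(codedD); // == 2.5
 *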

    NOTE: This API is experimental and * might change in incompatible ways in the next release. * * @since 2.9 */ public final class NumericUtils { private NumericUtils() {} // no instance! /** * The default precision step used by {@link NumericField}, {@link NumericTokenStream}, * {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default */ public static final int PRECISION_STEP_DEFAULT = 4; /** * Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is * stored as SHIFT_START_LONG+shift in the first character */ public static final char SHIFT_START_LONG = (char)0x20; /** * Expert: The maximum term length (used for char[] buffer size) * for encoding long values. * @see #longToPrefixCoded(long,int,char[]) */ public static final int BUF_SIZE_LONG = 63/7 + 2; /** * Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is * stored as SHIFT_START_INT+shift in the first character */ public static final char SHIFT_START_INT = (char)0x60; /** * Expert: The maximum term length (used for char[] buffer size) * for encoding int values. * @see #intToPrefixCoded(int,int,char[]) */ public static final int BUF_SIZE_INT = 31/7 + 2; /** * Expert: Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG} * length * @return number of chars written to buffer */ public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) { if (shift>63 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..63"); int nChars = (63-shift)/7 + 1, len = nChars+1; buffer[0] = (char)(SHIFT_START_LONG + shift); long sortableBits = val ^ 0x8000000000000000L; sortableBits >>>= shift; while (nChars>=1) { // Store 7 bits per character for good efficiency when UTF-8 encoding. // The whole number is right-justified so that lucene can prefix-encode // the terms more efficiently. buffer[nChars--] = (char)(sortableBits & 0x7f); sortableBits >>>= 7; } return len; } /** * Expert: Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link LongRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right */ public static String longToPrefixCoded(final long val, final int shift) { final char[] buffer = new char[BUF_SIZE_LONG]; final int len = longToPrefixCoded(val, shift, buffer); return new String(buffer, 0, len); } /** * This is a convenience method, that returns prefix coded bits of a long without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToLong}. */ public static String longToPrefixCoded(final long val) { return longToPrefixCoded(val, 0); } /** * Expert: Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link NumericTokenStream}. * @param val the numeric value * @param shift how many bits to strip from the right * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT} * length * @return number of chars written to buffer */ public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) { if (shift>31 || shift<0) throw new IllegalArgumentException("Illegal shift value, must be 0..31"); int nChars = (31-shift)/7 + 1, len = nChars+1; buffer[0] = (char)(SHIFT_START_INT + shift); int sortableBits = val ^ 0x80000000; sortableBits >>>= shift; while (nChars>=1) { // Store 7 bits per character for good efficiency when UTF-8 encoding. // The whole number is right-justified so that lucene can prefix-encode // the terms more efficiently. buffer[nChars--] = (char)(sortableBits & 0x7f); sortableBits >>>= 7; } return len; } /** * Expert: Returns prefix coded bits after reducing the precision by shift bits. * This is method is used by {@link IntRangeBuilder}. * @param val the numeric value * @param shift how many bits to strip from the right */ public static String intToPrefixCoded(final int val, final int shift) { final char[] buffer = new char[BUF_SIZE_INT]; final int len = intToPrefixCoded(val, shift, buffer); return new String(buffer, 0, len); } /** * This is a convenience method, that returns prefix coded bits of an int without * reducing the precision. It can be used to store the full precision value as a * stored field in index. *

    To decode, use {@link #prefixCodedToInt}. */ public static String intToPrefixCoded(final int val) { return intToPrefixCoded(val, 0); } /** * Returns a long from prefixCoded characters. * Rightmost bits will be zero for lower precision codes. * This method can be used to decode e.g. a stored field. * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #longToPrefixCoded(long) */ public static long prefixCodedToLong(final String prefixCoded) { final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG; if (shift>63 || shift<0) throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)"); long sortableBits = 0L; for (int i=1, len=prefixCoded.length(); i0x7f) { throw new NumberFormatException( "Invalid prefixCoded numerical value representation (char "+ Integer.toHexString((int)ch)+" at position "+i+" is invalid)" ); } sortableBits |= (long)ch; } return (sortableBits << shift) ^ 0x8000000000000000L; } /** * Returns an int from prefixCoded characters. * Rightmost bits will be zero for lower precision codes. * This method can be used to decode e.g. a stored field. * @throws NumberFormatException if the supplied string is * not correctly prefix encoded. * @see #intToPrefixCoded(int) */ public static int prefixCodedToInt(final String prefixCoded) { final int shift = prefixCoded.charAt(0)-SHIFT_START_INT; if (shift>31 || shift<0) throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)"); int sortableBits = 0; for (int i=1, len=prefixCoded.length(); i0x7f) { throw new NumberFormatException( "Invalid prefixCoded numerical value representation (char "+ Integer.toHexString((int)ch)+" at position "+i+" is invalid)" ); } sortableBits |= (int)ch; } return (sortableBits << shift) ^ 0x80000000; } /** * Converts a double value to a sortable signed long. * The value is converted by getting their IEEE 754 floating-point "double format" * bit layout and then some bits are swapped, to be able to compare the result as long. * By this the precision is not reduced, but the value can easily used as a long. * @see #sortableLongToDouble */ public static long doubleToSortableLong(double val) { long f = Double.doubleToRawLongBits(val); if (f<0) f ^= 0x7fffffffffffffffL; return f; } /** * Convenience method: this just returns: * longToPrefixCoded(doubleToSortableLong(val)) */ public static String doubleToPrefixCoded(double val) { return longToPrefixCoded(doubleToSortableLong(val)); } /** * Converts a sortable long back to a double. * @see #doubleToSortableLong */ public static double sortableLongToDouble(long val) { if (val<0) val ^= 0x7fffffffffffffffL; return Double.longBitsToDouble(val); } /** * Convenience method: this just returns: * sortableLongToDouble(prefixCodedToLong(val)) */ public static double prefixCodedToDouble(String val) { return sortableLongToDouble(prefixCodedToLong(val)); } /** * Converts a float value to a sortable signed int. * The value is converted by getting their IEEE 754 floating-point "float format" * bit layout and then some bits are swapped, to be able to compare the result as int. * By this the precision is not reduced, but the value can easily used as an int. 
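 * For example (illustrative): floatToSortableInt(-5.0f) < floatToSortableInt(0.0f)
 * < floatToSortableInt(5.0f), i.e. the resulting ints compare in the same order as
 * the original float values.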
* @see #sortableIntToFloat */ public static int floatToSortableInt(float val) { int f = Float.floatToRawIntBits(val); if (f<0) f ^= 0x7fffffff; return f; } /** * Convenience method: this just returns: * intToPrefixCoded(floatToSortableInt(val)) */ public static String floatToPrefixCoded(float val) { return intToPrefixCoded(floatToSortableInt(val)); } /** * Converts a sortable int back to a float. * @see #floatToSortableInt */ public static float sortableIntToFloat(int val) { if (val<0) val ^= 0x7fffffff; return Float.intBitsToFloat(val); } /** * Convenience method: this just returns: * sortableIntToFloat(prefixCodedToInt(val)) */ public static float prefixCodedToFloat(String val) { return sortableIntToFloat(prefixCodedToInt(val)); } /** * Expert: Splits a long range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its * {@link LongRangeBuilder#addRange(String,String)} * method. *

    This method is used by {@link NumericRangeQuery}. */ public static void splitLongRange(final LongRangeBuilder builder, final int precisionStep, final long minBound, final long maxBound ) { splitRange(builder, 64, precisionStep, minBound, maxBound); } /** * Expert: Splits an int range recursively. * You may implement a builder that adds clauses to a * {@link org.apache.lucene.search.BooleanQuery} for each call to its * {@link IntRangeBuilder#addRange(String,String)} * method. *

    This method is used by {@link NumericRangeQuery}. */ public static void splitIntRange(final IntRangeBuilder builder, final int precisionStep, final int minBound, final int maxBound ) { splitRange(builder, 32, precisionStep, (long)minBound, (long)maxBound); } /** This helper does the splitting for both 32 and 64 bit. */ private static void splitRange( final Object builder, final int valSize, final int precisionStep, long minBound, long maxBound ) { if (precisionStep < 1) throw new IllegalArgumentException("precisionStep must be >=1"); if (minBound > maxBound) return; for (int shift=0; ; shift += precisionStep) { // calculate new bounds for inner precision final long diff = 1L << (shift+precisionStep), mask = ((1L< maxBound; if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped) { // We are in the lowest precision or the next precision is not available. addRange(builder, valSize, minBound, maxBound, shift); // exit the split recursion loop break; } if (hasLower) addRange(builder, valSize, minBound, minBound | mask, shift); if (hasUpper) addRange(builder, valSize, maxBound & ~mask, maxBound, shift); // recurse to next precision minBound = nextMinBound; maxBound = nextMaxBound; } } /** Helper that delegates to correct range builder */ private static void addRange( final Object builder, final int valSize, long minBound, long maxBound, final int shift ) { // for the max bound set all lower bits (that were shifted away): // this is important for testing or other usages of the splitted range // (e.g. to reconstruct the full range). The prefixEncoding will remove // the bits anyway, so they do not hurt! maxBound |= (1L << shift) - 1L; // delegate to correct range builder switch(valSize) { case 64: ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift); break; case 32: ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift); break; default: // Should not happen! throw new IllegalArgumentException("valSize must be 32 or 64."); } } /** * Expert: Callback for {@link #splitLongRange}. * You need to overwrite only one of the methods. *
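 *
 * A hedged sketch of how the callback might be used (the bounds and precision step
 * are illustrative):
 *
 *   NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() {
 *     public void addRange(long min, long max, int shift) {
 *       // receives the raw bounds of each generated sub-range
 *       System.out.println(shift + ": " + min + ".." + max);
 *     }
 *   }, 4, 1000L, 5000L);   // precisionStep 4, range 1000..5000
 *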

    NOTE: This is a very low-level interface, * the method signatures may change in later versions. */ public static abstract class LongRangeBuilder { /** * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical (inclusive) range queries from them. */ public void addRange(String minPrefixCoded, String maxPrefixCoded) { throw new UnsupportedOperationException(); } /** * Overwrite this method, if you like to receive the raw long range bounds. * You can use this for e.g. debugging purposes (print out range bounds). */ public void addRange(final long min, final long max, final int shift) { addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift)); } } /** * Expert: Callback for {@link #splitIntRange}. * You need to overwrite only one of the methods. *

    NOTE: This is a very low-level interface, * the method signatures may change in later versions. */ public static abstract class IntRangeBuilder { /** * Overwrite this method, if you like to receive the already prefix encoded range bounds. * You can directly build classical range (inclusive) queries from them. */ public void addRange(String minPrefixCoded, String maxPrefixCoded) { throw new UnsupportedOperationException(); } /** * Overwrite this method, if you like to receive the raw int range bounds. * You can use this for e.g. debugging purposes (print out range bounds). */ public void addRange(final int min, final int max, final int shift) { addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift)); } } } lucene-2.9.4/src/java/org/apache/lucene/util/BitUtil.java0000644000175000017500000005353711474320230023653 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util; // from org.apache.solr.util rev 555343 /** A variety of high efficiency bit twiddling routines. * * @version $Id$ */ public class BitUtil { /** Returns the number of bits set in the long */ public static int pop(long x) { /* Hacker's Delight 32 bit pop function: * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc * int pop(unsigned x) { x = x - ((x >> 1) & 0x55555555); x = (x & 0x33333333) + ((x >> 2) & 0x33333333); x = (x + (x >> 4)) & 0x0F0F0F0F; x = x + (x >> 8); x = x + (x >> 16); return x & 0x0000003F; } ***/ // 64 bit java version of the C function from above x = x - ((x >>> 1) & 0x5555555555555555L); x = (x & 0x3333333333333333L) + ((x >>>2 ) & 0x3333333333333333L); x = (x + (x >>> 4)) & 0x0F0F0F0F0F0F0F0FL; x = x + (x >>> 8); x = x + (x >>> 16); x = x + (x >>> 32); return ((int)x) & 0x7F; } /*** Returns the number of set bits in an array of longs. */ public static long pop_array(long A[], int wordOffset, int numWords) { /* * Robert Harley and David Seal's bit counting algorithm, as documented * in the revisions of Hacker's Delight * http://www.hackersdelight.org/revisions.pdf * http://www.hackersdelight.org/HDcode/newCode/pop_arrayHS.cc * * This function was adapted to Java, and extended to use 64 bit words. * if only we had access to wider registers like SSE from java... 
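 *
 * A small illustrative use of pop() and pop_array() (the values are hypothetical):
 *
 *   long[] words = new long[] { 0xFFL, 0x1L, 0x0L };
 *   int perWord = BitUtil.pop(words[0]);            // 8 bits set in the first word
 *   long total = BitUtil.pop_array(words, 0, 3);    // 9 bits set in the whole array
 *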
* * This function can be transformed to compute the popcount of other functions * on bitsets via something like this: * sed 's/A\[\([^]]*\)\]/\(A[\1] \& B[\1]\)/g' * */ int n = wordOffset+numWords; long tot=0, tot8=0; long ones=0, twos=0, fours=0; int i; for (i = wordOffset; i <= n - 8; i+=8) { /*** C macro from Hacker's Delight #define CSA(h,l, a,b,c) \ {unsigned u = a ^ b; unsigned v = c; \ h = (a & b) | (u & v); l = u ^ v;} ***/ long twosA,twosB,foursA,foursB,eights; // CSA(twosA, ones, ones, A[i], A[i+1]) { long b=A[i], c=A[i+1]; long u=ones ^ b; twosA=(ones & b)|( u & c); ones=u^c; } // CSA(twosB, ones, ones, A[i+2], A[i+3]) { long b=A[i+2], c=A[i+3]; long u=ones^b; twosB =(ones&b)|(u&c); ones=u^c; } //CSA(foursA, twos, twos, twosA, twosB) { long u=twos^twosA; foursA=(twos&twosA)|(u&twosB); twos=u^twosB; } //CSA(twosA, ones, ones, A[i+4], A[i+5]) { long b=A[i+4], c=A[i+5]; long u=ones^b; twosA=(ones&b)|(u&c); ones=u^c; } // CSA(twosB, ones, ones, A[i+6], A[i+7]) { long b=A[i+6], c=A[i+7]; long u=ones^b; twosB=(ones&b)|(u&c); ones=u^c; } //CSA(foursB, twos, twos, twosA, twosB) { long u=twos^twosA; foursB=(twos&twosA)|(u&twosB); twos=u^twosB; } //CSA(eights, fours, fours, foursA, foursB) { long u=fours^foursA; eights=(fours&foursA)|(u&foursB); fours=u^foursB; } tot8 += pop(eights); } // handle trailing words in a binary-search manner... // derived from the loop above by setting specific elements to 0. // the original method in Hackers Delight used a simple for loop: // for (i = i; i < n; i++) // Add in the last elements // tot = tot + pop(A[i]); if (i<=n-4) { long twosA, twosB, foursA, eights; { long b=A[i], c=A[i+1]; long u=ones ^ b; twosA=(ones & b)|( u & c); ones=u^c; } { long b=A[i+2], c=A[i+3]; long u=ones^b; twosB =(ones&b)|(u&c); ones=u^c; } { long u=twos^twosA; foursA=(twos&twosA)|(u&twosB); twos=u^twosB; } eights=fours&foursA; fours=fours^foursA; tot8 += pop(eights); i+=4; } if (i<=n-2) { long b=A[i], c=A[i+1]; long u=ones ^ b; long twosA=(ones & b)|( u & c); ones=u^c; long foursA=twos&twosA; twos=twos^twosA; long eights=fours&foursA; fours=fours^foursA; tot8 += pop(eights); i+=2; } if (i>= 1 return i print ','.join([ str(ntz(i)) for i in range(256) ]) ***/ /** table of number of trailing zeros in a byte */ public static final byte[] ntzTable = {8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0}; /** Returns number of trailing zeros in a 64 bit long value. */ public static int ntz(long val) { // A full binary search to determine the low byte was slower than // a linear search for nextSetBit(). This is most likely because // the implementation of nextSetBit() shifts bits to the right, increasing // the probability that the first non-zero byte is in the rhs. // // This implementation does a single binary search at the top level only // so that all other bit shifting can be done on ints instead of longs to // remain friendly to 32 bit architectures. In addition, the case of a // non-zero first byte is checked for first because it is the most common // in dense bit arrays. 
int lower = (int)val; int lowByte = lower & 0xff; if (lowByte != 0) return ntzTable[lowByte]; if (lower!=0) { lowByte = (lower>>>8) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 8; lowByte = (lower>>>16) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 16; // no need to mask off low byte for the last byte in the 32 bit word // no need to check for zero on the last byte either. return ntzTable[lower>>>24] + 24; } else { // grab upper 32 bits int upper=(int)(val>>32); lowByte = upper & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 32; lowByte = (upper>>>8) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 40; lowByte = (upper>>>16) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 48; // no need to mask off low byte for the last byte in the 32 bit word // no need to check for zero on the last byte either. return ntzTable[upper>>>24] + 56; } } /** Returns number of trailing zeros in a 32 bit int value. */ public static int ntz(int val) { // This implementation does a single binary search at the top level only. // In addition, the case of a non-zero first byte is checked for first // because it is the most common in dense bit arrays. int lowByte = val & 0xff; if (lowByte != 0) return ntzTable[lowByte]; lowByte = (val>>>8) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 8; lowByte = (val>>>16) & 0xff; if (lowByte != 0) return ntzTable[lowByte] + 16; // no need to mask off low byte for the last byte. // no need to check for zero on the last byte either. return ntzTable[val>>>24] + 24; } /** returns 0 based index of first set bit * (only works for x!=0) *
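 * For example: ntz(1L) == 0, ntz(0x8L) == 3 (binary 1000), and
 * ntz(Long.MIN_VALUE) == 63.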
    This is an alternate implementation of ntz() */ public static int ntz2(long x) { int n = 0; int y = (int)x; if (y==0) {n+=32; y = (int)(x>>>32); } // the only 64 bit shift necessary if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; } if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; } return (ntzTable[ y & 0xff ]) + n; } /** returns 0 based index of first set bit *
    This is an alternate implementation of ntz() */ public static int ntz3(long x) { // another implementation taken from Hackers Delight, extended to 64 bits // and converted to Java. // Many 32 bit ntz algorithms are at http://www.hackersdelight.org/HDcode/ntz.cc int n = 1; // do the first step as a long, all others as ints. int y = (int)x; if (y==0) {n+=32; y = (int)(x>>>32); } if ((y & 0x0000FFFF) == 0) { n+=16; y>>>=16; } if ((y & 0x000000FF) == 0) { n+=8; y>>>=8; } if ((y & 0x0000000F) == 0) { n+=4; y>>>=4; } if ((y & 0x00000003) == 0) { n+=2; y>>>=2; } return n - (y & 1); } /** returns true if v is a power of two or zero*/ public static boolean isPowerOfTwo(int v) { return ((v & (v-1)) == 0); } /** returns true if v is a power of two or zero*/ public static boolean isPowerOfTwo(long v) { return ((v & (v-1)) == 0); } /** returns the next highest power of two, or the current value if it's already a power of two or zero*/ public static int nextHighestPowerOfTwo(int v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; } /** returns the next highest power of two, or the current value if it's already a power of two or zero*/ public static long nextHighestPowerOfTwo(long v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v |= v >> 32; v++; return v; } } lucene-2.9.4/src/java/org/apache/lucene/util/package.html0000644000175000017500000000172611474320231023707 0ustar janpascaljanpascal Some utility classes. lucene-2.9.4/src/java/org/apache/lucene/util/ArrayUtil.java0000644000175000017500000001660011474320230024201 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at *

    * http://www.apache.org/licenses/LICENSE-2.0 *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Methods for manipulating arrays. */ public final class ArrayUtil { /* Begin Apache Harmony code Revision taken on Friday, June 12. https://svn.apache.org/repos/asf/harmony/enhanced/classlib/archive/java6/modules/luni/src/main/java/java/lang/Integer.java */ /** * Parses the string argument as if it was an int value and returns the * result. Throws NumberFormatException if the string does not represent an * int quantity. * * @param chars a string representation of an int quantity. * @return int the value represented by the argument * @throws NumberFormatException if the argument could not be parsed as an int quantity. */ public static int parseInt(char[] chars) throws NumberFormatException { return parseInt(chars, 0, chars.length, 10); } /** * Parses a char array into an int. * @param chars the character array * @param offset The offset into the array * @param len The length * @return the int * @throws NumberFormatException if it can't parse */ public static int parseInt(char[] chars, int offset, int len) throws NumberFormatException { return parseInt(chars, offset, len, 10); } /** * Parses the string argument as if it was an int value and returns the * result. Throws NumberFormatException if the string does not represent an * int quantity. The second argument specifies the radix to use when parsing * the value. * * @param chars a string representation of an int quantity. * @param radix the base to use for conversion. * @return int the value represented by the argument * @throws NumberFormatException if the argument could not be parsed as an int quantity. */ public static int parseInt(char[] chars, int offset, int len, int radix) throws NumberFormatException { if (chars == null || radix < Character.MIN_RADIX || radix > Character.MAX_RADIX) { throw new NumberFormatException(); } int i = 0; if (len == 0) { throw new NumberFormatException("chars length is 0"); } boolean negative = chars[offset + i] == '-'; if (negative && ++i == len) { throw new NumberFormatException("can't convert to an int"); } if (negative == true){ offset++; len--; } return parse(chars, offset, len, radix, negative); } private static int parse(char[] chars, int offset, int len, int radix, boolean negative) throws NumberFormatException { int max = Integer.MIN_VALUE / radix; int result = 0; for (int i = 0; i < len; i++){ int digit = Character.digit(chars[i + offset], radix); if (digit == -1) { throw new NumberFormatException("Unable to parse"); } if (max > result) { throw new NumberFormatException("Unable to parse"); } int next = result * radix - digit; if (next > result) { throw new NumberFormatException("Unable to parse"); } result = next; } /*while (offset < len) { }*/ if (!negative) { result = -result; if (result < 0) { throw new NumberFormatException("Unable to parse"); } } return result; } /* END APACHE HARMONY CODE */ public static int getNextSize(int targetSize) { /* This over-allocates proportional to the list size, making room * for additional growth. The over-allocation is mild, but is * enough to give linear-time amortized behavior over a long * sequence of appends() in the presence of a poorly-performing * system realloc(). * The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ... 
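     * A few worked values of the formula below: getNextSize(1) -> 4, getNextSize(5) -> 8,
     * getNextSize(9) -> 16, getNextSize(17) -> 25, getNextSize(26) -> 35.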
*/ return (targetSize >> 3) + (targetSize < 9 ? 3 : 6) + targetSize; } public static int getShrinkSize(int currentSize, int targetSize) { final int newSize = getNextSize(targetSize); // Only reallocate if we are "substantially" smaller. // This saves us from "running hot" (constantly making a // bit bigger then a bit smaller, over and over): if (newSize < currentSize / 2) return newSize; else return currentSize; } public static int[] grow(int[] array, int minSize) { if (array.length < minSize) { int[] newArray = new int[getNextSize(minSize)]; System.arraycopy(array, 0, newArray, 0, array.length); return newArray; } else return array; } public static int[] grow(int[] array) { return grow(array, 1 + array.length); } public static int[] shrink(int[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize); if (newSize != array.length) { int[] newArray = new int[newSize]; System.arraycopy(array, 0, newArray, 0, newSize); return newArray; } else return array; } public static long[] grow(long[] array, int minSize) { if (array.length < minSize) { long[] newArray = new long[getNextSize(minSize)]; System.arraycopy(array, 0, newArray, 0, array.length); return newArray; } else return array; } public static long[] grow(long[] array) { return grow(array, 1 + array.length); } public static long[] shrink(long[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize); if (newSize != array.length) { long[] newArray = new long[newSize]; System.arraycopy(array, 0, newArray, 0, newSize); return newArray; } else return array; } public static byte[] grow(byte[] array, int minSize) { if (array.length < minSize) { byte[] newArray = new byte[getNextSize(minSize)]; System.arraycopy(array, 0, newArray, 0, array.length); return newArray; } else return array; } public static byte[] grow(byte[] array) { return grow(array, 1 + array.length); } public static byte[] shrink(byte[] array, int targetSize) { final int newSize = getShrinkSize(array.length, targetSize); if (newSize != array.length) { byte[] newArray = new byte[newSize]; System.arraycopy(array, 0, newArray, 0, newSize); return newArray; } else return array; } /** * Returns hash of chars in range start (inclusive) to * end (inclusive) */ public static int hashCode(char[] array, int start, int end) { int code = 0; for (int i = end - 1; i >= start; i--) code = code * 31 + array[i]; return code; } /** * Returns hash of chars in range start (inclusive) to * end (inclusive) */ public static int hashCode(byte[] array, int start, int end) { int code = 0; for (int i = end - 1; i >= start; i--) code = code * 31 + array[i]; return code; } } lucene-2.9.4/src/java/org/apache/lucene/util/StringInterner.java0000644000175000017500000000267111474320230025245 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ /** * Subclasses of StringInterner are required to * return the same single String object for all equal strings. * Depending on the implementation, this may not be * the same object returned as String.intern(). * * This StringInterner base class simply delegates to String.intern(). */ public class StringInterner { /** Returns a single object instance for each equal string. */ public String intern(String s) { return s.intern(); } /** Returns a single object instance for each equal string. */ public String intern(char[] arr, int offset, int len) { return intern(new String(arr, offset, len)); } } lucene-2.9.4/src/java/org/apache/lucene/util/OpenBitSet.java0000644000175000017500000006037711474320230024313 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.util; import java.util.Arrays; import java.io.Serializable; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; /** An "open" BitSet implementation that allows direct access to the array of words * storing the bits. *

    * Unlike java.util.BitSet, the fact that bits are packed into an array of longs * is part of the interface. This allows efficient implementation of other algorithms * by someone other than the author. It also allows one to efficiently implement * alternate serialization or interchange formats. *

    * OpenBitSet is faster than java.util.BitSet in most operations * and *much* faster at calculating cardinality of sets and results of set operations. * It can also handle sets of larger cardinality (up to 64 * 2**32-1) *

    * The goals of OpenBitSet are the fastest implementation possible, and * maximum code reuse. Extra safety and encapsulation * may always be built on top, but if that's built in, the cost can never be removed (and * hence people re-implement their own version in order to get better performance). * If you want a "safe", totally encapsulated (and slower and limited) BitSet * class, use java.util.BitSet. *
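 *
 * A minimal usage sketch (illustrative; the variable names are hypothetical):
 *
 *   OpenBitSet bits = new OpenBitSet(1000);
 *   bits.fastSet(3);
 *   bits.fastSet(64);
 *   long count = bits.cardinality();                        // == 2
 *   for (int i = bits.nextSetBit(0); i >= 0; i = bits.nextSetBit(i + 1)) {
 *     System.out.println(i);                                // prints 3, then 64
 *   }
 *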

    *

    Performance Results

    Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M
    BitSet size = 1,000,000
    Results are java.util.BitSet time divided by OpenBitSet time.

               cardinality  intersect_count  union  nextSetBit  get   iterator
      50% full    3.36           3.96         1.44     1.46     1.99    1.58
       1% full    3.31           3.90                  1.04             0.99

    Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M
    BitSet size = 1,000,000
    Results are java.util.BitSet time divided by OpenBitSet time.

               cardinality  intersect_count  union  nextSetBit  get   iterator
      50% full    2.50           3.50         1.00     1.03     1.12    1.25
       1% full    2.51           3.49                  1.00             1.02
    * @version $Id$ */ public class OpenBitSet extends DocIdSet implements Cloneable, Serializable { protected long[] bits; protected int wlen; // number of words (elements) used in the array /** Constructs an OpenBitSet large enough to hold numBits. * * @param numBits */ public OpenBitSet(long numBits) { bits = new long[bits2words(numBits)]; wlen = bits.length; } public OpenBitSet() { this(64); } /** Constructs an OpenBitSet from an existing long[]. *
    * The first 64 bits are in long[0], * with bit index 0 at the least significant bit, and bit index 63 at the most significant. * Given a bit index, * the word containing it is long[index/64], and it is at bit number index%64 within that word. *
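 * For example, bit index 100 lives in long[100/64] = long[1], at bit
 * number 100%64 = 36 within that word.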

    * numWords are the number of elements in the array that contain * set bits (non-zero longs). * numWords should be <= bits.length, and * any existing words in the array at position >= numWords should be zero. * */ public OpenBitSet(long[] bits, int numWords) { this.bits = bits; this.wlen = numWords; } public DocIdSetIterator iterator() { return new OpenBitSetIterator(bits, wlen); } /** This DocIdSet implementation is cacheable. */ public boolean isCacheable() { return true; } /** Returns the current capacity in bits (1 greater than the index of the last bit) */ public long capacity() { return bits.length << 6; } /** * Returns the current capacity of this set. Included for * compatibility. This is *not* equal to {@link #cardinality} */ public long size() { return capacity(); } /** Returns true if there are no set bits */ public boolean isEmpty() { return cardinality()==0; } /** Expert: returns the long[] storing the bits */ public long[] getBits() { return bits; } /** Expert: sets a new long[] to use as the bit storage */ public void setBits(long[] bits) { this.bits = bits; } /** Expert: gets the number of longs in the array that are in use */ public int getNumWords() { return wlen; } /** Expert: sets the number of longs in the array that are in use */ public void setNumWords(int nWords) { this.wlen=nWords; } /** Returns true or false for the specified bit index. */ public boolean get(int index) { int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. if (i>=bits.length) return false; int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; return (bits[i] & bitmask) != 0; } /** Returns true or false for the specified bit index. * The index should be less than the OpenBitSet size */ public boolean fastGet(int index) { int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; return (bits[i] & bitmask) != 0; } /** Returns true or false for the specified bit index */ public boolean get(long index) { int i = (int)(index >> 6); // div 64 if (i>=bits.length) return false; int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; return (bits[i] & bitmask) != 0; } /** Returns true or false for the specified bit index. * The index should be less than the OpenBitSet size. */ public boolean fastGet(long index) { int i = (int)(index >> 6); // div 64 int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; return (bits[i] & bitmask) != 0; } /* // alternate implementation of get() public boolean get1(int index) { int i = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 return ((bits[i]>>>bit) & 0x01) != 0; // this does a long shift and a bittest (on x86) vs // a long shift, and a long AND, (the test for zero is prob a no-op) // testing on a P4 indicates this is slower than (bits[i] & bitmask) != 0; } */ /** returns 1 if the bit is set, 0 if not. * The index should be less than the OpenBitSet size */ public int getBit(int index) { int i = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 return ((int)(bits[i]>>>bit)) & 0x01; } /* public boolean get2(int index) { int word = index >> 6; // div 64 int bit = index & 0x0000003f; // mod 64 return (bits[word] << bit) < 0; // hmmm, this would work if bit order were reversed // we could right shift and check for parity bit, if it was available to us. 
} */ /** sets a bit, expanding the set size if necessary */ public void set(long index) { int wordNum = expandingWordNum(index); int bit = (int)index & 0x3f; long bitmask = 1L << bit; bits[wordNum] |= bitmask; } /** Sets the bit at the specified index. * The index should be less than the OpenBitSet size. */ public void fastSet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] |= bitmask; } /** Sets the bit at the specified index. * The index should be less than the OpenBitSet size. */ public void fastSet(long index) { int wordNum = (int)(index >> 6); int bit = (int)index & 0x3f; long bitmask = 1L << bit; bits[wordNum] |= bitmask; } /** Sets a range of bits, expanding the set size if necessary * * @param startIndex lower index * @param endIndex one-past the last bit to set */ public void set(long startIndex, long endIndex) { if (endIndex <= startIndex) return; int startWord = (int)(startIndex>>6); // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = expandingWordNum(endIndex-1); long startmask = -1L << startIndex; long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap if (startWord == endWord) { bits[startWord] |= (startmask & endmask); return; } bits[startWord] |= startmask; Arrays.fill(bits, startWord+1, endWord, -1L); bits[endWord] |= endmask; } protected int expandingWordNum(long index) { int wordNum = (int)(index >> 6); if (wordNum>=wlen) { ensureCapacity(index+1); wlen = wordNum+1; } return wordNum; } /** clears a bit. * The index should be less than the OpenBitSet size. */ public void fastClear(int index) { int wordNum = index >> 6; int bit = index & 0x03f; long bitmask = 1L << bit; bits[wordNum] &= ~bitmask; // hmmm, it takes one more instruction to clear than it does to set... any // way to work around this? If there were only 63 bits per word, we could // use a right shift of 10111111...111 in binary to position the 0 in the // correct place (using sign extension). // Could also use Long.rotateRight() or rotateLeft() *if* they were converted // by the JVM into a native instruction. // bits[word] &= Long.rotateLeft(0xfffffffe,bit); } /** clears a bit. * The index should be less than the OpenBitSet size. */ public void fastClear(long index) { int wordNum = (int)(index >> 6); // div 64 int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] &= ~bitmask; } /** clears a bit, allowing access beyond the current set size without changing the size.*/ public void clear(long index) { int wordNum = (int)(index >> 6); // div 64 if (wordNum>=wlen) return; int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] &= ~bitmask; } /** Clears a range of bits. Clearing past the end does not change the size of the set. * * @param startIndex lower index * @param endIndex one-past the last bit to clear */ public void clear(int startIndex, int endIndex) { if (endIndex <= startIndex) return; int startWord = (startIndex>>6); if (startWord >= wlen) return; // since endIndex is one past the end, this is index of the last // word to be changed. 
int endWord = ((endIndex-1)>>6); long startmask = -1L << startIndex; long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap // invert masks since we are clearing startmask = ~startmask; endmask = ~endmask; if (startWord == endWord) { bits[startWord] &= (startmask | endmask); return; } bits[startWord] &= startmask; int middle = Math.min(wlen, endWord); Arrays.fill(bits, startWord+1, middle, 0L); if (endWord < wlen) { bits[endWord] &= endmask; } } /** Clears a range of bits. Clearing past the end does not change the size of the set. * * @param startIndex lower index * @param endIndex one-past the last bit to clear */ public void clear(long startIndex, long endIndex) { if (endIndex <= startIndex) return; int startWord = (int)(startIndex>>6); if (startWord >= wlen) return; // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = (int)((endIndex-1)>>6); long startmask = -1L << startIndex; long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap // invert masks since we are clearing startmask = ~startmask; endmask = ~endmask; if (startWord == endWord) { bits[startWord] &= (startmask | endmask); return; } bits[startWord] &= startmask; int middle = Math.min(wlen, endWord); Arrays.fill(bits, startWord+1, middle, 0L); if (endWord < wlen) { bits[endWord] &= endmask; } } /** Sets a bit and returns the previous value. * The index should be less than the OpenBitSet size. */ public boolean getAndSet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; boolean val = (bits[wordNum] & bitmask) != 0; bits[wordNum] |= bitmask; return val; } /** Sets a bit and returns the previous value. * The index should be less than the OpenBitSet size. */ public boolean getAndSet(long index) { int wordNum = (int)(index >> 6); // div 64 int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; boolean val = (bits[wordNum] & bitmask) != 0; bits[wordNum] |= bitmask; return val; } /** flips a bit. * The index should be less than the OpenBitSet size. */ public void fastFlip(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] ^= bitmask; } /** flips a bit. * The index should be less than the OpenBitSet size. */ public void fastFlip(long index) { int wordNum = (int)(index >> 6); // div 64 int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] ^= bitmask; } /** flips a bit, expanding the set size if necessary */ public void flip(long index) { int wordNum = expandingWordNum(index); int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] ^= bitmask; } /** flips a bit and returns the resulting bit value. * The index should be less than the OpenBitSet size. */ public boolean flipAndGet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] ^= bitmask; return (bits[wordNum] & bitmask) != 0; } /** flips a bit and returns the resulting bit value. * The index should be less than the OpenBitSet size. 
*/ public boolean flipAndGet(long index) { int wordNum = (int)(index >> 6); // div 64 int bit = (int)index & 0x3f; // mod 64 long bitmask = 1L << bit; bits[wordNum] ^= bitmask; return (bits[wordNum] & bitmask) != 0; } /** Flips a range of bits, expanding the set size if necessary * * @param startIndex lower index * @param endIndex one-past the last bit to flip */ public void flip(long startIndex, long endIndex) { if (endIndex <= startIndex) return; int startWord = (int)(startIndex>>6); // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = expandingWordNum(endIndex-1); /*** Grrr, java shifting wraps around so -1L>>>64 == -1 * for that reason, make sure not to use endmask if the bits to flip will * be zero in the last word (redefine endWord to be the last changed...) long startmask = -1L << (startIndex & 0x3f); // example: 11111...111000 long endmask = -1L >>> (64-(endIndex & 0x3f)); // example: 00111...111111 ***/ long startmask = -1L << startIndex; long endmask = -1L >>> -endIndex; // 64-(endIndex&0x3f) is the same as -endIndex due to wrap if (startWord == endWord) { bits[startWord] ^= (startmask & endmask); return; } bits[startWord] ^= startmask; for (int i=startWord+1; i b.wlen) { tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); } return tot; } /** Returns the popcount or cardinality of "a and not b" * or "intersection(a, not(b))". * Neither set is modified. */ public static long andNotCount(OpenBitSet a, OpenBitSet b) { long tot = BitUtil.pop_andnot(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen)); if (a.wlen > b.wlen) { tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); } return tot; } /** Returns the popcount or cardinality of the exclusive-or of the two sets. * Neither set is modified. */ public static long xorCount(OpenBitSet a, OpenBitSet b) { long tot = BitUtil.pop_xor(a.bits, b.bits, 0, Math.min(a.wlen, b.wlen)); if (a.wlen < b.wlen) { tot += BitUtil.pop_array(b.bits, a.wlen, b.wlen-a.wlen); } else if (a.wlen > b.wlen) { tot += BitUtil.pop_array(a.bits, b.wlen, a.wlen-b.wlen); } return tot; } /** Returns the index of the first set bit starting at the index specified. * -1 is returned if there are no more set bits. */ public int nextSetBit(int index) { int i = index>>6; if (i>=wlen) return -1; int subIndex = index & 0x3f; // index within the word long word = bits[i] >> subIndex; // skip all the bits to the right of index if (word!=0) { return (i<<6) + subIndex + BitUtil.ntz(word); } while(++i < wlen) { word = bits[i]; if (word!=0) return (i<<6) + BitUtil.ntz(word); } return -1; } /** Returns the index of the first set bit starting at the index specified. * -1 is returned if there are no more set bits. 
*/ public long nextSetBit(long index) { int i = (int)(index>>>6); if (i>=wlen) return -1; int subIndex = (int)index & 0x3f; // index within the word long word = bits[i] >>> subIndex; // skip all the bits to the right of index if (word!=0) { return (((long)i)<<6) + (subIndex + BitUtil.ntz(word)); } while(++i < wlen) { word = bits[i]; if (word!=0) return (((long)i)<<6) + BitUtil.ntz(word); } return -1; } public Object clone() { try { OpenBitSet obs = (OpenBitSet)super.clone(); obs.bits = (long[]) obs.bits.clone(); // hopefully an array clone is as fast(er) than arraycopy return obs; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); } } /** this = this AND other */ public void intersect(OpenBitSet other) { int newLen= Math.min(this.wlen,other.wlen); long[] thisArr = this.bits; long[] otherArr = other.bits; // testing against zero can be more efficient int pos=newLen; while(--pos>=0) { thisArr[pos] &= otherArr[pos]; } if (this.wlen > newLen) { // fill zeros from the new shorter length to the old length Arrays.fill(bits,newLen,this.wlen,0); } this.wlen = newLen; } /** this = this OR other */ public void union(OpenBitSet other) { int newLen = Math.max(wlen,other.wlen); ensureCapacityWords(newLen); long[] thisArr = this.bits; long[] otherArr = other.bits; int pos=Math.min(wlen,other.wlen); while(--pos>=0) { thisArr[pos] |= otherArr[pos]; } if (this.wlen < newLen) { System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen); } this.wlen = newLen; } /** Remove all elements set in other. this = this AND_NOT other */ public void remove(OpenBitSet other) { int idx = Math.min(wlen,other.wlen); long[] thisArr = this.bits; long[] otherArr = other.bits; while(--idx>=0) { thisArr[idx] &= ~otherArr[idx]; } } /** this = this XOR other */ public void xor(OpenBitSet other) { int newLen = Math.max(wlen,other.wlen); ensureCapacityWords(newLen); long[] thisArr = this.bits; long[] otherArr = other.bits; int pos=Math.min(wlen,other.wlen); while(--pos>=0) { thisArr[pos] ^= otherArr[pos]; } if (this.wlen < newLen) { System.arraycopy(otherArr, this.wlen, thisArr, this.wlen, newLen-this.wlen); } this.wlen = newLen; } // some BitSet compatability methods //** see {@link intersect} */ public void and(OpenBitSet other) { intersect(other); } //** see {@link union} */ public void or(OpenBitSet other) { union(other); } //** see {@link andNot} */ public void andNot(OpenBitSet other) { remove(other); } /** returns true if the sets have any elements in common */ public boolean intersects(OpenBitSet other) { int pos = Math.min(this.wlen, other.wlen); long[] thisArr = this.bits; long[] otherArr = other.bits; while (--pos>=0) { if ((thisArr[pos] & otherArr[pos])!=0) return true; } return false; } /** Expand the long[] with the size given as a number of words (64 bit longs). * getNumWords() is unchanged by this call. */ public void ensureCapacityWords(int numWords) { if (bits.length < numWords) { bits = ArrayUtil.grow(bits, numWords); } } /** Ensure that the long[] is big enough to hold numBits, expanding it if necessary. * getNumWords() is unchanged by this call. */ public void ensureCapacity(long numBits) { ensureCapacityWords(bits2words(numBits)); } /** Lowers numWords, the number of words in use, * by checking for trailing zero words. 
*/ public void trimTrailingZeros() { int idx = wlen-1; while (idx>=0 && bits[idx]==0) idx--; wlen = idx+1; } /** returns the number of 64 bit words it would take to hold numBits */ public static int bits2words(long numBits) { return (int)(((numBits-1)>>>6)+1); } /** returns true if both sets have the same bits set */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof OpenBitSet)) return false; OpenBitSet a; OpenBitSet b = (OpenBitSet)o; // make a the larger set. if (b.wlen > this.wlen) { a = b; b=this; } else { a=this; } // check for any set bits out of the range of b for (int i=a.wlen-1; i>=b.wlen; i--) { if (a.bits[i]!=0) return false; } for (int i=b.wlen-1; i>=0; i--) { if (a.bits[i] != b.bits[i]) return false; } return true; } public int hashCode() { // Start with a zero hash and use a mix that results in zero if the input is zero. // This effectively truncates trailing zeros without an explicit check. long h = 0; for (int i = bits.length; --i>=0;) { h ^= bits[i]; h = (h << 1) | (h >>> 63); // rotate left } // fold leftmost bits into right and add a constant to prevent // empty sets from returning 0, which is too common. return (int)((h>>32) ^ h) + 0x98761234; } } lucene-2.9.4/src/java/org/apache/lucene/util/BitVector.java0000644000175000017500000002267211474320230024174 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; /** Optimized implementation of a vector of bits. This is more-or-less like java.util.BitSet, but also includes the following:

    • a count() method, which efficiently computes the number of one bits;
    • optimized read from and write to disk;
    • inlinable get() method;
    • store and load, as bit set or d-gaps, depending on sparseness;
    @version $Id: BitVector.java 950306 2010-06-01 23:33:34Z mikemccand $ */ public final class BitVector implements Cloneable { private byte[] bits; private int size; private int count; /** Constructs a vector capable of holding n bits. */ public BitVector(int n) { size = n; bits = new byte[(size >> 3) + 1]; count = 0; } BitVector(byte[] bits, int size) { this.bits = bits; this.size = size; count = -1; } public Object clone() { byte[] copyBits = new byte[bits.length]; System.arraycopy(bits, 0, copyBits, 0, bits.length); BitVector clone = new BitVector(copyBits, size); clone.count = count; return clone; } /** Sets the value of bit to one. */ public final void set(int bit) { if (bit >= size) { throw new ArrayIndexOutOfBoundsException(bit); } bits[bit >> 3] |= 1 << (bit & 7); count = -1; } /** Sets the value of bit to true, and * returns true if bit was already set */ public final boolean getAndSet(int bit) { if (bit >= size) { throw new ArrayIndexOutOfBoundsException(bit); } final int pos = bit >> 3; final int v = bits[pos]; final int flag = 1 << (bit & 7); if ((flag & v) != 0) return true; else { bits[pos] = (byte) (v | flag); if (count != -1) count++; return false; } } /** Sets the value of bit to zero. */ public final void clear(int bit) { if (bit >= size) { throw new ArrayIndexOutOfBoundsException(bit); } bits[bit >> 3] &= ~(1 << (bit & 7)); count = -1; } /** Returns true if bit is one and false if it is zero. */ public final boolean get(int bit) { assert bit >= 0 && bit < size: "bit " + bit + " is out of bounds 0.." + (size-1); return (bits[bit >> 3] & (1 << (bit & 7))) != 0; } /** Returns the number of bits in this vector. This is also one greater than the number of the largest valid bit number. */ public final int size() { return size; } /** Returns the total number of one bits in this vector. This is efficiently computed and cached, so that, if the vector is not changed, no recomputation is done for repeated calls. */ public final int count() { // if the vector has been modified if (count == -1) { int c = 0; int end = bits.length; for (int i = 0; i < end; i++) c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte count = c; } return count; } /** For testing */ public final int getRecomputedCount() { int c = 0; int end = bits.length; for (int i = 0; i < end; i++) c += BYTE_COUNTS[bits[i] & 0xFF]; // sum bits per byte return c; } private static final byte[] BYTE_COUNTS = { // table of bits/byte 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; /** Writes this vector to the file name in Directory d, in a format that can be read by the constructor {@link #BitVector(Directory, String)}. 
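      A hedged usage sketch (the Directory instance, file name and size are hypothetical):

        BitVector deleted = new BitVector(maxDoc);
        deleted.set(42);                              // mark one bit
        deleted.write(dir, "_1.del");                 // stored as plain bits or d-gaps, whichever is smaller
        BitVector reloaded = new BitVector(dir, "_1.del");
        boolean isSet = reloaded.get(42);             // true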
*/ public final void write(Directory d, String name) throws IOException { IndexOutput output = d.createOutput(name); try { if (isSparse()) { writeDgaps(output); // sparse bit-set more efficiently saved as d-gaps. } else { writeBits(output); } } finally { output.close(); } } /** Write as a bit set */ private void writeBits(IndexOutput output) throws IOException { output.writeInt(size()); // write size output.writeInt(count()); // write count output.writeBytes(bits, bits.length); } /** Write as a d-gaps list */ private void writeDgaps(IndexOutput output) throws IOException { output.writeInt(-1); // mark using d-gaps output.writeInt(size()); // write size output.writeInt(count()); // write count int last=0; int n = count(); int m = bits.length; for (int i=0; i0; i++) { if (bits[i]!=0) { output.writeVInt(i-last); output.writeByte(bits[i]); last = i; n -= BYTE_COUNTS[bits[i] & 0xFF]; } } } /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */ private boolean isSparse() { // note: order of comparisons below set to favor smaller values (no binary range search.) // note: adding 4 because we start with ((int) -1) to indicate d-gaps format. // note: we write the d-gap for the byte number, and the byte (bits[i]) itself, therefore // multiplying count by (8+8) or (8+16) or (8+24) etc.: // - first 8 for writing bits[i] (1 byte vs. 1 bit), and // - second part for writing the byte-number d-gap as vint. // note: factor is for read/write of byte-arrays being faster than vints. int factor = 10; if (bits.length < (1<< 7)) return factor * (4 + (8+ 8)*count()) < size(); if (bits.length < (1<<14)) return factor * (4 + (8+16)*count()) < size(); if (bits.length < (1<<21)) return factor * (4 + (8+24)*count()) < size(); if (bits.length < (1<<28)) return factor * (4 + (8+32)*count()) < size(); return factor * (4 + (8+40)*count()) < size(); } /** Constructs a bit vector from the file name in Directory d, as written by the {@link #write} method. */ public BitVector(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { size = input.readInt(); // read size if (size == -1) { readDgaps(input); } else { readBits(input); } } finally { input.close(); } } /** Read as a bit set */ private void readBits(IndexInput input) throws IOException { count = input.readInt(); // read count bits = new byte[(size >> 3) + 1]; // allocate bits input.readBytes(bits, 0, bits.length); } /** read as a d-gaps list */ private void readDgaps(IndexInput input) throws IOException { size = input.readInt(); // (re)read size count = input.readInt(); // read count bits = new byte[(size >> 3) + 1]; // allocate bits int last=0; int n = count(); while (n>0) { last += input.readVInt(); bits[last] = input.readByte(); n -= BYTE_COUNTS[bits[last] & 0xFF]; } } /** * Retrieve a subset of this BitVector. * * @param start * starting index, inclusive * @param end * ending index, exclusive * @return subset */ public BitVector subset(int start, int end) { if (start < 0 || end > size() || end < start) throw new IndexOutOfBoundsException(); // Special case -- return empty vector is start == end if (end == start) return new BitVector(0); byte[] bits = new byte[((end - start - 1) >>> 3) + 1]; int s = start >>> 3; for (int i = 0; i < bits.length; i++) { int cur = 0xFF & this.bits[i + s]; int next = i + s + 1 >= this.bits.length ? 
0 : 0xFF & this.bits[i + s + 1]; bits[i] = (byte) ((cur >>> (start & 7)) | ((next << (8 - (start & 7))))); } int bitsToClear = (bits.length * 8 - (end - start)) % 8; bits[bits.length - 1] &= ~(0xFF << (8 - bitsToClear)); return new BitVector(bits, end - start); } } lucene-2.9.4/src/java/org/apache/lucene/util/FieldCacheSanityChecker.java0000644000175000017500000003510211474320231026710 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Copyright 2009 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.FieldCache; import org.apache.lucene.search.FieldCache.CacheEntry; /** * Provides methods for sanity checking that entries in the FieldCache * are not wasteful or inconsistent. *

    *

     * Lucene 2.9 introduced numerous enhancements into how the FieldCache * is used by the low levels of Lucene searching (for Sorting and * ValueSourceQueries) to improve both the speed of sorting and the * reopening of IndexReaders. But these changes have shifted the * usage of FieldCache from "top level" IndexReaders (frequently a * MultiReader or DirectoryReader) down to the leaf level SegmentReaders. * As a result, existing applications that directly access the FieldCache * may find RAM usage increase significantly when upgrading to 2.9 or * later. This class provides an API for these applications (or their * unit tests) to check at run time whether the FieldCache contains "insane" * usages. *
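     *
     * A minimal usage sketch (assuming the application relies on the shared
     * {@link FieldCache#DEFAULT} cache; error handling is up to the caller):
     *
     *   // run the checker over the global cache after searching/sorting
     *   FieldCacheSanityChecker.Insanity[] issues =
     *       FieldCacheSanityChecker.checkSanity(FieldCache.DEFAULT);
     *   for (int i = 0; i < issues.length; i++) {
     *     System.err.println(issues[i].toString());
     *   }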

    *

    * EXPERIMENTAL API: This API is considered extremely advanced and * experimental. It may be removed or altered w/o warning in future releases * of Lucene. *

    * @see FieldCache * @see FieldCacheSanityChecker.Insanity * @see FieldCacheSanityChecker.InsanityType */ public final class FieldCacheSanityChecker { private RamUsageEstimator ramCalc = null; public FieldCacheSanityChecker() { /* NOOP */ } /** * If set, will be used to estimate size for all CacheEntry objects * dealt with. */ public void setRamUsageEstimator(RamUsageEstimator r) { ramCalc = r; } /** * Quick and dirty convenience method * @see #check */ public static Insanity[] checkSanity(FieldCache cache) { return checkSanity(cache.getCacheEntries()); } /** * Quick and dirty convenience method that instantiates an instance with * "good defaults" and uses it to test the CacheEntry[] * @see #check */ public static Insanity[] checkSanity(CacheEntry[] cacheEntries) { FieldCacheSanityChecker sanityChecker = new FieldCacheSanityChecker(); // doesn't check for interned sanityChecker.setRamUsageEstimator(new RamUsageEstimator(false)); return sanityChecker.check(cacheEntries); } /** * Tests a CacheEntry[] for indication of "insane" cache usage. *

     * NOTE: FieldCache CreationPlaceholder objects are ignored. * (:TODO: is this a bad idea? are we masking a real problem?) *

    */ public Insanity[] check(CacheEntry[] cacheEntries) { if (null == cacheEntries || 0 == cacheEntries.length) return new Insanity[0]; if (null != ramCalc) { for (int i = 0; i < cacheEntries.length; i++) { cacheEntries[i].estimateSize(ramCalc); } } // the indirect mapping lets MapOfSet dedup identical valIds for us // // maps the (valId) identityhashCode of cache values to // sets of CacheEntry instances final MapOfSets valIdToItems = new MapOfSets(new HashMap(17)); // maps ReaderField keys to Sets of ValueIds final MapOfSets readerFieldToValIds = new MapOfSets(new HashMap(17)); // // any keys that we know result in more then one valId final Set valMismatchKeys = new HashSet(); // iterate over all the cacheEntries to get the mappings we'll need for (int i = 0; i < cacheEntries.length; i++) { final CacheEntry item = cacheEntries[i]; final Object val = item.getValue(); if (val instanceof FieldCache.CreationPlaceholder) continue; final ReaderField rf = new ReaderField(item.getReaderKey(), item.getFieldName()); final Integer valId = new Integer(System.identityHashCode(val)); // indirect mapping, so the MapOfSet will dedup identical valIds for us valIdToItems.put(valId, item); if (1 < readerFieldToValIds.put(rf, valId)) { valMismatchKeys.add(rf); } } final List insanity = new ArrayList(valMismatchKeys.size() * 3); insanity.addAll(checkValueMismatch(valIdToItems, readerFieldToValIds, valMismatchKeys)); insanity.addAll(checkSubreaders(valIdToItems, readerFieldToValIds)); return (Insanity[]) insanity.toArray(new Insanity[insanity.size()]); } /** * Internal helper method used by check that iterates over * valMismatchKeys and generates a Collection of Insanity * instances accordingly. The MapOfSets are used to populate * the Insanity objects. * @see InsanityType#VALUEMISMATCH */ private Collection checkValueMismatch(MapOfSets valIdToItems, MapOfSets readerFieldToValIds, Set valMismatchKeys) { final List insanity = new ArrayList(valMismatchKeys.size() * 3); if (! valMismatchKeys.isEmpty() ) { // we have multiple values for some ReaderFields final Map rfMap = readerFieldToValIds.getMap(); final Map valMap = valIdToItems.getMap(); final Iterator mismatchIter = valMismatchKeys.iterator(); while (mismatchIter.hasNext()) { final ReaderField rf = (ReaderField)mismatchIter.next(); final List badEntries = new ArrayList(valMismatchKeys.size() * 2); final Iterator valIter = ((Set)rfMap.get(rf)).iterator(); while (valIter.hasNext()) { Iterator entriesIter = ((Set)valMap.get(valIter.next())).iterator(); while (entriesIter.hasNext()) { badEntries.add(entriesIter.next()); } } CacheEntry[] badness = new CacheEntry[badEntries.size()]; badness = (CacheEntry[]) badEntries.toArray(badness); insanity.add(new Insanity(InsanityType.VALUEMISMATCH, "Multiple distinct value objects for " + rf.toString(), badness)); } } return insanity; } /** * Internal helper method used by check that iterates over * the keys of readerFieldToValIds and generates a Collection * of Insanity instances whenever two (or more) ReaderField instances are * found that have an ancestry relationships. 
* * @see InsanityType#SUBREADER */ private Collection checkSubreaders(MapOfSets valIdToItems, MapOfSets readerFieldToValIds) { final List insanity = new ArrayList(23); Map badChildren = new HashMap(17); MapOfSets badKids = new MapOfSets(badChildren); // wrapper Map viToItemSets = valIdToItems.getMap(); Map rfToValIdSets = readerFieldToValIds.getMap(); Set seen = new HashSet(17); Set readerFields = rfToValIdSets.keySet(); Iterator rfIter = readerFields.iterator(); while (rfIter.hasNext()) { ReaderField rf = (ReaderField) rfIter.next(); if (seen.contains(rf)) continue; List kids = getAllDecendentReaderKeys(rf.readerKey); for (int i = 0; i < kids.size(); i++) { ReaderField kid = new ReaderField(kids.get(i), rf.fieldName); if (badChildren.containsKey(kid)) { // we've already process this kid as RF and found other problems // track those problems as our own badKids.put(rf, kid); badKids.putAll(rf, (Collection)badChildren.get(kid)); badChildren.remove(kid); } else if (rfToValIdSets.containsKey(kid)) { // we have cache entries for the kid badKids.put(rf, kid); } seen.add(kid); } seen.add(rf); } // every mapping in badKids represents an Insanity Iterator parentsIter = badChildren.keySet().iterator(); while (parentsIter.hasNext()) { ReaderField parent = (ReaderField) parentsIter.next(); Set kids = (Set) badChildren.get(parent); List badEntries = new ArrayList(kids.size() * 2); // put parent entr(ies) in first { Iterator valIter =((Set)rfToValIdSets.get(parent)).iterator(); while (valIter.hasNext()) { badEntries.addAll((Set)viToItemSets.get(valIter.next())); } } // now the entries for the descendants Iterator kidsIter = kids.iterator(); while (kidsIter.hasNext()) { ReaderField kid = (ReaderField) kidsIter.next(); Iterator valIter =((Set)rfToValIdSets.get(kid)).iterator(); while (valIter.hasNext()) { badEntries.addAll((Set)viToItemSets.get(valIter.next())); } } CacheEntry[] badness = new CacheEntry[badEntries.size()]; badness = (CacheEntry[]) badEntries.toArray(badness); insanity.add(new Insanity(InsanityType.SUBREADER, "Found caches for decendents of " + parent.toString(), badness)); } return insanity; } /** * Checks if the seed is an IndexReader, and if so will walk * the hierarchy of subReaders building up a list of the objects * returned by obj.getFieldCacheKey() */ private List getAllDecendentReaderKeys(Object seed) { List all = new ArrayList(17); // will grow as we iter all.add(seed); for (int i = 0; i < all.size(); i++) { Object obj = all.get(i); if (obj instanceof IndexReader) { IndexReader[] subs = ((IndexReader)obj).getSequentialSubReaders(); for (int j = 0; (null != subs) && (j < subs.length); j++) { all.add(subs[j].getFieldCacheKey()); } } } // need to skip the first, because it was the seed return all.subList(1, all.size()); } /** * Simple pair object for using "readerKey + fieldName" a Map key */ private final static class ReaderField { public final Object readerKey; public final String fieldName; public ReaderField(Object readerKey, String fieldName) { this.readerKey = readerKey; this.fieldName = fieldName; } public int hashCode() { return System.identityHashCode(readerKey) * fieldName.hashCode(); } public boolean equals(Object that) { if (! 
(that instanceof ReaderField)) return false; ReaderField other = (ReaderField) that; return (this.readerKey == other.readerKey && this.fieldName.equals(other.fieldName)); } public String toString() { return readerKey.toString() + "+" + fieldName; } } /** * Simple container for a collection of related CacheEntry objects that * in conjunction with each other represent some "insane" usage of the * FieldCache. */ public final static class Insanity { private final InsanityType type; private final String msg; private final CacheEntry[] entries; public Insanity(InsanityType type, String msg, CacheEntry[] entries) { if (null == type) { throw new IllegalArgumentException ("Insanity requires non-null InsanityType"); } if (null == entries || 0 == entries.length) { throw new IllegalArgumentException ("Insanity requires non-null/non-empty CacheEntry[]"); } this.type = type; this.msg = msg; this.entries = entries; } /** * Type of insane behavior this object represents */ public InsanityType getType() { return type; } /** * Description of hte insane behavior */ public String getMsg() { return msg; } /** * CacheEntry objects which suggest a problem */ public CacheEntry[] getCacheEntries() { return entries; } /** * Multi-Line representation of this Insanity object, starting with * the Type and Msg, followed by each CacheEntry.toString() on it's * own line prefaced by a tab character */ public String toString() { StringBuffer buf = new StringBuffer(); buf.append(getType()).append(": "); String m = getMsg(); if (null != m) buf.append(m); buf.append('\n'); CacheEntry[] ce = getCacheEntries(); for (int i = 0; i < ce.length; i++) { buf.append('\t').append(ce[i].toString()).append('\n'); } return buf.toString(); } } /** * An Enumeration of the different types of "insane" behavior that * may be detected in a FieldCache. * * @see InsanityType#SUBREADER * @see InsanityType#VALUEMISMATCH * @see InsanityType#EXPECTED */ public final static class InsanityType { private final String label; private InsanityType(final String label) { this.label = label; } public String toString() { return label; } /** * Indicates an overlap in cache usage on a given field * in sub/super readers. */ public final static InsanityType SUBREADER = new InsanityType("SUBREADER"); /** *

    * Indicates entries have the same reader+fieldname but * different cached values. This can happen if different datatypes, * or parsers are used -- and while it's not necessarily a bug * it's typically an indication of a possible problem. *

    *

    * Only the reader, fieldname, and cached value are actually * tested -- if two cache entries have different parsers or datatypes but * the cached values are the same Object (== not just equal()) this method * does not consider that a red flag. This allows for subtle variations * in the way a Parser is specified (null vs DEFAULT_LONG_PARSER, etc...) *
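     *
     * For example, the following pair of calls on the same reader and field would
     * typically be flagged (sketch; "price" is an arbitrary field name and reader
     * an open IndexReader):
     *
     *   int[] asInts = FieldCache.DEFAULT.getInts(reader, "price");
     *   float[] asFloats = FieldCache.DEFAULT.getFloats(reader, "price");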

    */ public final static InsanityType VALUEMISMATCH = new InsanityType("VALUEMISMATCH"); /** * Indicates an expected bit of "insanity". This may be useful for * clients that wish to preserve/log information about insane usage * but indicate that it was expected. */ public final static InsanityType EXPECTED = new InsanityType("EXPECTED"); } } lucene-2.9.4/src/java/org/apache/lucene/util/Version.java0000644000175000017500000000606511474320230023716 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; /** * Use by certain classes to match version compatibility * across releases of Lucene. * *

    WARNING: When changing the version parameter * that you supply to components in Lucene, do not simply * change the version at search-time, but instead also adjust * your indexing code to match, and re-index. */ public final class Version extends Parameter implements Serializable { /** *

    WARNING: if you use this setting, and then * upgrade to a newer release of Lucene, sizable changes * may happen. If backwards compatibility is important * then you should instead explicitly specify an actual * version. *
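     *
     * For example, an application would pass an explicit constant such as
     * (illustrative sketch):
     *
     *   Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29);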

    * If you use this constant then you may need to * re-index all of your documents when upgrading * Lucene, as the way text is indexed may have changed. * Additionally, you may need to re-test your entire * application to ensure it behaves as expected, as * some defaults may have changed and may break functionality * in your application. * @deprecated Use an actual version instead. */ public static final Version LUCENE_CURRENT = new Version("LUCENE_CURRENT", 0); /** Match settings and bugs in Lucene's 2.0 release. */ public static final Version LUCENE_20 = new Version("LUCENE_20", 2000); /** Match settings and bugs in Lucene's 2.1 release. */ public static final Version LUCENE_21 = new Version("LUCENE_21", 2100); /** Match settings and bugs in Lucene's 2.2 release. */ public static final Version LUCENE_22 = new Version("LUCENE_22", 2200); /** Match settings and bugs in Lucene's 2.3 release. */ public static final Version LUCENE_23 = new Version("LUCENE_23", 2300); /** Match settings and bugs in Lucene's 2.4 release. */ public static final Version LUCENE_24 = new Version("LUCENE_24", 2400); /** Match settings and bugs in Lucene's 2.9 release. *

    * Use this to get the latest & greatest settings, bug * fixes, etc, for Lucene. */ public static final Version LUCENE_29 = new Version("LUCENE_29", 2900); private final int v; public Version(String name, int v) { super(name); this.v = v; } public boolean onOrAfter(Version other) { return v == 0 || v >= other.v; } }lucene-2.9.4/src/java/org/apache/lucene/util/RamUsageEstimator.java0000644000175000017500000001336611474320231025670 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.lang.reflect.*; import java.text.DecimalFormat; import java.util.*; /** * Estimates the size of a given Object using a given MemoryModel for primitive * size information. * * Resource Usage: * * Internally uses a Map to temporally hold a reference to every * object seen. * * If checkIntered, all Strings checked will be interned, but those * that were not already interned will be released for GC when the * estimate is complete. */ public final class RamUsageEstimator { private MemoryModel memoryModel; private final Map seen; private int refSize; private int arraySize; private int classSize; private boolean checkInterned; /** * Constructs this object with an AverageGuessMemoryModel and * checkInterned = true. */ public RamUsageEstimator() { this(new AverageGuessMemoryModel()); } /** * @param checkInterned check if Strings are interned and don't add to size * if they are. Defaults to true but if you know the objects you are checking * won't likely contain many interned Strings, it will be faster to turn off * intern checking. */ public RamUsageEstimator(boolean checkInterned) { this(new AverageGuessMemoryModel(), checkInterned); } /** * @param memoryModel MemoryModel to use for primitive object sizes. */ public RamUsageEstimator(MemoryModel memoryModel) { this(memoryModel, true); } /** * @param memoryModel MemoryModel to use for primitive object sizes. * @param checkInterned check if Strings are interned and don't add to size * if they are. Defaults to true but if you know the objects you are checking * won't likely contain many interned Strings, it will be faster to turn off * intern checking. 
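     *
     * A small usage sketch (someObject stands in for whatever object graph is
     * being measured):
     *
     *   RamUsageEstimator estimator = new RamUsageEstimator(false); // skip intern checks
     *   long bytes = estimator.estimateRamUsage(someObject);
     *   System.out.println(RamUsageEstimator.humanReadableUnits(bytes, new DecimalFormat("0.#")));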
*/ public RamUsageEstimator(MemoryModel memoryModel, boolean checkInterned) { this.memoryModel = memoryModel; this.checkInterned = checkInterned; // Use Map rather than Set so that we can use an IdentityHashMap - not // seeing an IdentityHashSet seen = new IdentityHashMap(64); this.refSize = memoryModel.getReferenceSize(); this.arraySize = memoryModel.getArraySize(); this.classSize = memoryModel.getClassSize(); } public long estimateRamUsage(Object obj) { long size = size(obj); seen.clear(); return size; } private long size(Object obj) { if (obj == null) { return 0; } // interned not part of this object if (checkInterned && obj instanceof String && obj == ((String) obj).intern()) { // interned string will be eligible // for GC on // estimateRamUsage(Object) return return 0; } // skip if we have seen before if (seen.containsKey(obj)) { return 0; } // add to seen seen.put(obj, null); Class clazz = obj.getClass(); if (clazz.isArray()) { return sizeOfArray(obj); } long size = 0; // walk type hierarchy while (clazz != null) { Field[] fields = clazz.getDeclaredFields(); for (int i = 0; i < fields.length; i++) { if (Modifier.isStatic(fields[i].getModifiers())) { continue; } if (fields[i].getType().isPrimitive()) { size += memoryModel.getPrimitiveSize(fields[i].getType()); } else { size += refSize; fields[i].setAccessible(true); try { Object value = fields[i].get(obj); if (value != null) { size += size(value); } } catch (IllegalAccessException ex) { // ignore for now? } } } clazz = clazz.getSuperclass(); } size += classSize; return size; } private long sizeOfArray(Object obj) { int len = Array.getLength(obj); if (len == 0) { return 0; } long size = arraySize; Class arrayElementClazz = obj.getClass().getComponentType(); if (arrayElementClazz.isPrimitive()) { size += len * memoryModel.getPrimitiveSize(arrayElementClazz); } else { for (int i = 0; i < len; i++) { size += refSize + size(Array.get(obj, i)); } } return size; } private static final long ONE_KB = 1024; private static final long ONE_MB = ONE_KB * ONE_KB; private static final long ONE_GB = ONE_KB * ONE_MB; /** * Return good default units based on byte size. */ public static String humanReadableUnits(long bytes, DecimalFormat df) { String newSizeAndUnits; if (bytes / ONE_GB > 0) { newSizeAndUnits = String.valueOf(df.format((float) bytes / ONE_GB)) + " GB"; } else if (bytes / ONE_MB > 0) { newSizeAndUnits = String.valueOf(df.format((float) bytes / ONE_MB)) + " MB"; } else if (bytes / ONE_KB > 0) { newSizeAndUnits = String.valueOf(df.format((float) bytes / ONE_KB)) + " KB"; } else { newSizeAndUnits = String.valueOf(bytes) + " bytes"; } return newSizeAndUnits; } } lucene-2.9.4/src/java/org/apache/lucene/util/ScorerDocQueue.java0000644000175000017500000001424111474320231025155 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. */ /* Derived from org.apache.lucene.util.PriorityQueue of March 2005 */ import java.io.IOException; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Scorer; /** A ScorerDocQueue maintains a partial ordering of its Scorers such that the least Scorer can always be found in constant time. Put()'s and pop()'s require log(size) time. The ordering is by Scorer.doc(). */ public class ScorerDocQueue { // later: SpansQueue for spans with doc and term positions private final HeapedScorerDoc[] heap; private final int maxSize; private int size; private class HeapedScorerDoc { Scorer scorer; int doc; HeapedScorerDoc(Scorer s) { this(s, s.docID()); } HeapedScorerDoc(Scorer scorer, int doc) { this.scorer = scorer; this.doc = doc; } void adjust() { doc = scorer.docID(); } } private HeapedScorerDoc topHSD; // same as heap[1], only for speed /** Create a ScorerDocQueue with a maximum size. */ public ScorerDocQueue(int maxSize) { // assert maxSize >= 0; size = 0; int heapSize = maxSize + 1; heap = new HeapedScorerDoc[heapSize]; this.maxSize = maxSize; topHSD = heap[1]; // initially null } /** * Adds a Scorer to a ScorerDocQueue in log(size) time. * If one tries to add more Scorers than maxSize * a RuntimeException (ArrayIndexOutOfBound) is thrown. */ public final void put(Scorer scorer) { size++; heap[size] = new HeapedScorerDoc(scorer); upHeap(); } /** * Adds a Scorer to the ScorerDocQueue in log(size) time if either * the ScorerDocQueue is not full, or not lessThan(scorer, top()). * @param scorer * @return true if scorer is added, false otherwise. */ public boolean insert(Scorer scorer){ if (size < maxSize) { put(scorer); return true; } else { int docNr = scorer.docID(); if ((size > 0) && (! (docNr < topHSD.doc))) { // heap[1] is top() heap[1] = new HeapedScorerDoc(scorer, docNr); downHeap(); return true; } else { return false; } } } /** Returns the least Scorer of the ScorerDocQueue in constant time. * Should not be used when the queue is empty. */ public final Scorer top() { // assert size > 0; return topHSD.scorer; } /** Returns document number of the least Scorer of the ScorerDocQueue * in constant time. * Should not be used when the queue is empty. */ public final int topDoc() { // assert size > 0; return topHSD.doc; } public final float topScore() throws IOException { // assert size > 0; return topHSD.scorer.score(); } public final boolean topNextAndAdjustElsePop() throws IOException { return checkAdjustElsePop(topHSD.scorer.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); } public final boolean topSkipToAndAdjustElsePop(int target) throws IOException { return checkAdjustElsePop(topHSD.scorer.advance(target) != DocIdSetIterator.NO_MORE_DOCS); } private boolean checkAdjustElsePop(boolean cond) { if (cond) { // see also adjustTop topHSD.doc = topHSD.scorer.docID(); } else { // see also popNoResult heap[1] = heap[size]; // move last to first heap[size] = null; size--; } downHeap(); return cond; } /** Removes and returns the least scorer of the ScorerDocQueue in log(size) * time. * Should not be used when the queue is empty. */ public final Scorer pop() { // assert size > 0; Scorer result = topHSD.scorer; popNoResult(); return result; } /** Removes the least scorer of the ScorerDocQueue in log(size) time. * Should not be used when the queue is empty. 
*/ private final void popNoResult() { heap[1] = heap[size]; // move last to first heap[size] = null; size--; downHeap(); // adjust heap } /** Should be called when the scorer at top changes doc() value. * Still log(n) worst case, but it's at least twice as fast to

       *  { pq.top().change(); pq.adjustTop(); }
       * instead of
       *  { o = pq.pop(); o.change(); pq.push(o); }
       * 
    */ public final void adjustTop() { // assert size > 0; topHSD.adjust(); downHeap(); } /** Returns the number of scorers currently stored in the ScorerDocQueue. */ public final int size() { return size; } /** Removes all entries from the ScorerDocQueue. */ public final void clear() { for (int i = 0; i <= size; i++) { heap[i] = null; } size = 0; } private final void upHeap() { int i = size; HeapedScorerDoc node = heap[i]; // save bottom node int j = i >>> 1; while ((j > 0) && (node.doc < heap[j].doc)) { heap[i] = heap[j]; // shift parents down i = j; j = j >>> 1; } heap[i] = node; // install saved node topHSD = heap[1]; } private final void downHeap() { int i = 1; HeapedScorerDoc node = heap[i]; // save top node int j = i << 1; // find smaller child int k = j + 1; if ((k <= size) && (heap[k].doc < heap[j].doc)) { j = k; } while ((j <= size) && (heap[j].doc < node.doc)) { heap[i] = heap[j]; // shift up child i = j; j = i << 1; k = j + 1; if (k <= size && (heap[k].doc < heap[j].doc)) { j = k; } } heap[i] = node; // install saved node topHSD = heap[1]; } } lucene-2.9.4/src/java/org/apache/lucene/util/AttributeImpl.java0000644000175000017500000000740211474320231025053 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import java.lang.reflect.Field; import java.lang.reflect.Modifier; /** * Base class for Attributes that can be added to a * {@link org.apache.lucene.util.AttributeSource}. *

    * Attributes are used to add data in a dynamic, yet type-safe way to a source * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}. */ public abstract class AttributeImpl implements Cloneable, Serializable, Attribute { /** * Clears the values in this AttributeImpl and resets it to its * default value. If this implementation implements more than one Attribute interface * it clears all. */ public abstract void clear(); /** * The default implementation of this method accesses all declared * fields of this object and prints the values in the following syntax: * *

       *   public String toString() {
       *     return "start=" + startOffset + ",end=" + endOffset;
       *   }
       * 
    * * This method may be overridden by subclasses. */ public String toString() { StringBuffer buffer = new StringBuffer(); Class clazz = this.getClass(); Field[] fields = clazz.getDeclaredFields(); try { for (int i = 0; i < fields.length; i++) { Field f = fields[i]; if (Modifier.isStatic(f.getModifiers())) continue; f.setAccessible(true); Object value = f.get(this); if (buffer.length()>0) { buffer.append(','); } if (value == null) { buffer.append(f.getName() + "=null"); } else { buffer.append(f.getName() + "=" + value); } } } catch (IllegalAccessException e) { // this should never happen, because we're just accessing fields // from 'this' throw new RuntimeException(e); } return buffer.toString(); } /** * Subclasses must implement this method and should compute * a hashCode similar to this: *
       *   public int hashCode() {
       *     int code = startOffset;
       *     code = code * 31 + endOffset;
       *     return code;
       *   }
       * 
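       *
       * The matching {@link #equals(Object)} implementation would then compare the
       * same fields (illustrative sketch; MyAttributeImpl is a hypothetical subclass):
       *
       *   public boolean equals(Object other) {
       *     if (other == this) return true;
       *     if (!(other instanceof MyAttributeImpl)) return false;
       *     MyAttributeImpl o = (MyAttributeImpl) other;
       *     return o.startOffset == startOffset && o.endOffset == endOffset;
       *   }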
    * * see also {@link #equals(Object)} */ public abstract int hashCode(); /** * All values used for computation of {@link #hashCode()} * should be checked here for equality. * * see also {@link Object#equals(Object)} */ public abstract boolean equals(Object other); /** * Copies the values from this Attribute into the passed-in * target attribute. The target implementation must support all the * Attributes this implementation supports. */ public abstract void copyTo(AttributeImpl target); /** * Shallow clone. Subclasses must override this if they * need to clone any members deeply, */ public Object clone() { Object clone = null; try { clone = super.clone(); } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // shouldn't happen } return clone; } } lucene-2.9.4/src/java/org/apache/lucene/util/SortedVIntList.java0000644000175000017500000001620111474320231025160 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.BitSet; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; /** * Stores and iterate on sorted integers in compressed form in RAM.
    * The code for compressing the differences between ascending integers was * borrowed from {@link org.apache.lucene.store.IndexInput} and * {@link org.apache.lucene.store.IndexOutput}. *
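     *
     * A short usage sketch (the doc ids are arbitrary):
     *
     *   SortedVIntList list = new SortedVIntList(new int[] { 3, 5, 8, 13 });
     *   DocIdSetIterator it = list.iterator();
     *   while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
     *     int doc = it.docID();   // visits 3, 5, 8, 13 in order
     *   }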

    * NOTE: this class assumes the stored integers are doc Ids (hence why it * extends {@link DocIdSet}). Therefore its {@link #iterator()} assumes {@link * DocIdSetIterator#NO_MORE_DOCS} can be used as sentinel. If you intent to use * this value, then make sure it's not used during search flow. */ public class SortedVIntList extends DocIdSet { /** When a BitSet has fewer than 1 in BITS2VINTLIST_SIZE bits set, * a SortedVIntList representing the index numbers of the set bits * will be smaller than that BitSet. */ final static int BITS2VINTLIST_SIZE = 8; private int size; private byte[] bytes; private int lastBytePos; /** * Create a SortedVIntList from all elements of an array of integers. * * @param sortedInts A sorted array of non negative integers. */ public SortedVIntList(int[] sortedInts) { this(sortedInts, sortedInts.length); } /** * Create a SortedVIntList from an array of integers. * @param sortedInts An array of sorted non negative integers. * @param inputSize The number of integers to be used from the array. */ public SortedVIntList(int[] sortedInts, int inputSize) { SortedVIntListBuilder builder = new SortedVIntListBuilder(); for (int i = 0; i < inputSize; i++) { builder.addInt(sortedInts[i]); } builder.done(); } /** * Create a SortedVIntList from a BitSet. * @param bits A bit set representing a set of integers. */ public SortedVIntList(BitSet bits) { SortedVIntListBuilder builder = new SortedVIntListBuilder(); int nextInt = bits.nextSetBit(0); while (nextInt != -1) { builder.addInt(nextInt); nextInt = bits.nextSetBit(nextInt + 1); } builder.done(); } /** * Create a SortedVIntList from an OpenBitSet. * @param bits A bit set representing a set of integers. */ public SortedVIntList(OpenBitSet bits) { SortedVIntListBuilder builder = new SortedVIntListBuilder(); int nextInt = bits.nextSetBit(0); while (nextInt != -1) { builder.addInt(nextInt); nextInt = bits.nextSetBit(nextInt + 1); } builder.done(); } /** * Create a SortedVIntList. * @param docIdSetIterator An iterator providing document numbers as a set of integers. * This DocIdSetIterator is iterated completely when this constructor * is called and it must provide the integers in non * decreasing order. */ public SortedVIntList(DocIdSetIterator docIdSetIterator) throws IOException { SortedVIntListBuilder builder = new SortedVIntListBuilder(); int doc; while ((doc = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { builder.addInt(doc); } builder.done(); } private class SortedVIntListBuilder { private int lastInt = 0; SortedVIntListBuilder() { initBytes(); lastInt = 0; } void addInt(int nextInt) { int diff = nextInt - lastInt; if (diff < 0) { throw new IllegalArgumentException( "Input not sorted or first element negative."); } if ((lastBytePos + MAX_BYTES_PER_INT) > bytes.length) { // biggest possible int does not fit resizeBytes((bytes.length * 2) + MAX_BYTES_PER_INT); } // See org.apache.lucene.store.IndexOutput.writeVInt() while ((diff & ~VB1) != 0) { // The high bit of the next byte needs to be set. bytes[lastBytePos++] = (byte) ((diff & VB1) | ~VB1); diff >>>= BIT_SHIFT; } bytes[lastBytePos++] = (byte) diff; // Last byte, high bit not set. 
size++; lastInt = nextInt; } void done() { resizeBytes(lastBytePos); } } private void initBytes() { size = 0; bytes = new byte[128]; // initial byte size lastBytePos = 0; } private void resizeBytes(int newSize) { if (newSize != bytes.length) { byte[] newBytes = new byte[newSize]; System.arraycopy(bytes, 0, newBytes, 0, lastBytePos); bytes = newBytes; } } private static final int VB1 = 0x7F; private static final int BIT_SHIFT = 7; private final int MAX_BYTES_PER_INT = (31 / BIT_SHIFT) + 1; /** * @return The total number of sorted integers. */ public int size() { return size; } /** * @return The size of the byte array storing the compressed sorted integers. */ public int getByteSize() { return bytes.length; } /** This DocIdSet implementation is cacheable. */ public boolean isCacheable() { return true; } /** * @return An iterator over the sorted integers. */ public DocIdSetIterator iterator() { return new DocIdSetIterator() { int bytePos = 0; int lastInt = 0; int doc = -1; private void advance() { // See org.apache.lucene.store.IndexInput.readVInt() byte b = bytes[bytePos++]; lastInt += b & VB1; for (int s = BIT_SHIFT; (b & ~VB1) != 0; s += BIT_SHIFT) { b = bytes[bytePos++]; lastInt += (b & VB1) << s; } } /** @deprecated use {@link #docID()} instead. */ public int doc() {return lastInt;} public int docID() { return doc; } /** @deprecated use {@link #nextDoc()} instead. */ public boolean next() { return nextDoc() != NO_MORE_DOCS; } public int nextDoc() { if (bytePos >= lastBytePos) { doc = NO_MORE_DOCS; } else { advance(); doc = lastInt; } return doc; } /** @deprecated use {@link #advance(int)} instead. */ public boolean skipTo(int docNr) { return advance(docNr) != NO_MORE_DOCS; } public int advance(int target) { while (bytePos < lastBytePos) { advance(); if (lastInt >= target) { return doc = lastInt; } } return doc = NO_MORE_DOCS; } }; } } lucene-2.9.4/src/java/org/apache/lucene/util/Parameter.java0000644000175000017500000000420411474320230024202 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.ObjectStreamException; import java.io.Serializable; import java.io.StreamCorruptedException; import java.util.HashMap; import java.util.Map; /** * A serializable Enum class. 
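 * A subclass typically follows the typesafe enum pattern, for example
 * (hypothetical sketch; Color is not a Lucene class):
 *
 *   public final class Color extends Parameter {
 *     public static final Color RED = new Color("RED");
 *     public static final Color BLUE = new Color("BLUE");
 *     private Color(String name) { super(name); }
 *   }
 *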
*/ public abstract class Parameter implements Serializable { static Map allParameters = new HashMap(); private String name; private Parameter() { // typesafe enum pattern, no public constructor } protected Parameter(String name) { // typesafe enum pattern, no public constructor this.name = name; String key = makeKey(name); if(allParameters.containsKey(key)) throw new IllegalArgumentException("Parameter name " + key + " already used!"); allParameters.put(key, this); } private String makeKey(String name){ return getClass() + " " + name; } public String toString() { return name; } /** * Resolves the deserialized instance to the local reference for accurate * equals() and == comparisons. * * @return a reference to Parameter as resolved in the local VM * @throws ObjectStreamException */ protected Object readResolve() throws ObjectStreamException { Object par = allParameters.get(makeKey(name)); if(par == null) throw new StreamCorruptedException("Unknown parameter value: " + name); return par; } } lucene-2.9.4/src/java/org/apache/lucene/util/StringHelper.java0000644000175000017500000000474511474505315024713 0ustar janpascaljanpascalpackage org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Methods for manipulating strings. * * $Id: StringHelper.java 1039905 2010-11-28 16:58:14Z uschindler $ */ public abstract class StringHelper { /** * Expert: * The StringInterner implementation used by Lucene. * This shouldn't be changed to an incompatible implementation after other Lucene APIs have been used. */ public static StringInterner interner = new SimpleStringInterner(1024,8); /** Return the same string object for all equal strings */ public static String intern(String s) { return interner.intern(s); } /** * Compares two byte[] arrays, element by element, and returns the * number of elements common to both arrays. * * @param bytes1 The first byte[] to compare * @param bytes2 The second byte[] to compare * @return The number of common elements. */ public static final int bytesDifference(byte[] bytes1, int len1, byte[] bytes2, int len2) { int len = len1 < len2 ? len1 : len2; for (int i = 0; i < len; i++) if (bytes1[i] != bytes2[i]) return i; return len; } /** * Compares two strings, character by character, and returns the * first position where the two strings differ from one another. * * @param s1 The first string to compare * @param s2 The second string to compare * @return The first position where the two strings differ. */ public static final int stringDifference(String s1, String s2) { int len1 = s1.length(); int len2 = s2.length(); int len = len1 < len2 ? 
len1 : len2; for (int i = 0; i < len; i++) { if (s1.charAt(i) != s2.charAt(i)) { return i; } } return len; } private StringHelper() { } } lucene-2.9.4/src/java/org/apache/lucene/package.html0000644000175000017500000000146511474320232022733 0ustar janpascaljanpascal Top-level package. lucene-2.9.4/src/java/org/apache/lucene/document/0000755000175000017500000000000011554106562022271 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/document/FieldSelectorResult.java0000644000175000017500000001042111474320232027046 0ustar janpascaljanpascalpackage org.apache.lucene.document; import java.io.Serializable; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Provides information about what should be done with this Field * **/ //Replace with an enumerated type in 1.5 public final class FieldSelectorResult implements Serializable { /** * Load this {@link Field} every time the {@link Document} is loaded, reading in the data as it is encountered. * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. *
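 *
 * For example, a {@link FieldSelector} might return this result only for the
 * fields it actually needs (sketch; "title", reader and docId are placeholders):
 *
 *   FieldSelector selector = new FieldSelector() {
 *     public FieldSelectorResult accept(String fieldName) {
 *       return "title".equals(fieldName)
 *           ? FieldSelectorResult.LOAD : FieldSelectorResult.NO_LOAD;
 *     }
 *   };
 *   Document doc = reader.document(docId, selector);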

    * {@link Document#add(Fieldable)} should be called by the Reader. */ public transient static final FieldSelectorResult LOAD = new FieldSelectorResult(0); /** * Lazily load this {@link Field}. This means the {@link Field} is valid, but it may not actually contain its data until * invoked. {@link Document#getField(String)} SHOULD NOT BE USED. {@link Document#getFieldable(String)} is safe to use and should * return a valid instance of a {@link Fieldable}. *

    * {@link Document#add(Fieldable)} should be called by the Reader. */ public transient static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1); /** * Do not load the {@link Field}. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should return null. * {@link Document#add(Fieldable)} is not called. *

    * {@link Document#add(Fieldable)} should not be called by the Reader. */ public transient static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2); /** * Load this field as in the {@link #LOAD} case, but immediately return from {@link Field} loading for the {@link Document}. Thus, the * Document may not have its complete set of Fields. {@link Document#getField(String)} and {@link Document#getFieldable(String)} should * both be valid for this {@link Field} *

    * {@link Document#add(Fieldable)} should be called by the Reader. */ public transient static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3); /** * Behaves much like {@link #LOAD} but does not uncompress any compressed data. This is used for internal purposes. * {@link Document#getField(String)} and {@link Document#getFieldable(String)} should not return null. *

    * {@link Document#add(Fieldable)} should be called by * the Reader. * @deprecated This is an internal option only, and is * no longer needed now that {@link CompressionTools} * is used for field compression. */ public transient static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4); /** Expert: Load the size of this {@link Field} rather than its value. * Size is measured as number of bytes required to store the field == bytes for a binary or any compressed value, and 2*chars for a String value. * The size is stored as a binary value, represented as an int in a byte[], with the higher order byte first in [0] */ public transient static final FieldSelectorResult SIZE = new FieldSelectorResult(5); /** Expert: Like {@link #SIZE} but immediately break from the field loading loop, i.e., stop loading further fields, after the size is loaded */ public transient static final FieldSelectorResult SIZE_AND_BREAK = new FieldSelectorResult(6); private int id; private FieldSelectorResult(int id) { this.id = id; } public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final FieldSelectorResult that = (FieldSelectorResult) o; if (id != that.id) return false; return true; } public int hashCode() { return id; } } lucene-2.9.4/src/java/org/apache/lucene/document/Fieldable.java0000644000175000017500000002004311474320232024773 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.FieldInvertState; // for javadocs import java.io.Reader; import java.io.Serializable; /** * Synonymous with {@link Field}. * *

    WARNING: This interface may change within minor versions, despite Lucene's backward compatibility requirements. * This means new methods may be added from version to version. This change only affects the Fieldable API; other backwards * compatibility promises remain intact. For example, Lucene can still * read and write indices created within the same major version. *

    * **/ public interface Fieldable extends Serializable { /** Sets the boost factor for hits on this field. This value will be * multiplied into the score of all hits on this field of this * document. * *
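 *
 * For example (sketch; the field name and boost value are arbitrary):
 *
 *   Field title = new Field("title", "Lucene in Action",
 *                           Field.Store.YES, Field.Index.ANALYZED);
 *   title.setBoost(2.0f);   // hits on this field weigh twice as much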

    The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document * containing this field. If a document has multiple fields with the same * name, all such values are multiplied together. This product is then * used to compute the norm factor for the field. By * default, in the {@link * org.apache.lucene.search.Similarity#computeNorm(String, * FieldInvertState)} method, the boost value is multiplied * by the {@link * org.apache.lucene.search.Similarity#lengthNorm(String, * int)} and then rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * * @see org.apache.lucene.document.Document#setBoost(float) * @see org.apache.lucene.search.Similarity#computeNorm(String, FieldInvertState) * @see org.apache.lucene.search.Similarity#encodeNorm(float) */ void setBoost(float boost); /** Returns the boost factor for hits for this field. * *

    The default value is 1.0. * *

    Note: this value is not stored directly with the document in the index. * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when * this field was indexed. * * @see #setBoost(float) */ float getBoost(); /** Returns the name of the field as an interned string. * For example "date", "title", "body", ... */ String name(); /** The value of the field as a String, or null. *

    * For indexing, if isStored()==true, the stringValue() will be used as the stored field value * unless isBinary()==true, in which case binaryValue() will be used. * * If isIndexed()==true and isTokenized()==false, this String value will be indexed as a single token. * If isIndexed()==true and isTokenized()==true, then tokenStreamValue() will be used to generate indexed tokens if not null, * else readerValue() will be used to generate indexed tokens if not null, else stringValue() will be used to generate tokens. */ public String stringValue(); /** The value of the field as a Reader, which can be used at index time to generate indexed tokens. * @see #stringValue() */ public Reader readerValue(); /** The value of the field in Binary, or null. * @see #stringValue() */ public byte[] binaryValue(); /** The TokenStream for this field to be used when indexing, or null. * @see #stringValue() */ public TokenStream tokenStreamValue(); /** True if the value of the field is to be stored in the index for return with search hits. */ boolean isStored(); /** True if the value of the field is to be indexed, so that it may be searched on. */ boolean isIndexed(); /** True if the value of the field should be tokenized as text prior to indexing. Un-tokenized fields are indexed as a single word and may not be Reader-valued. */ boolean isTokenized(); /** True if the value of the field is stored and compressed within the index */ boolean isCompressed(); /** True if the term or terms used to index this field are stored as a term * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. * These methods do not provide access to the original content of the field, * only to terms used to index it. If the original content must be * preserved, use the stored attribute instead. * * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) */ boolean isTermVectorStored(); /** * True if terms are stored as term vector together with their offsets * (start and end positon in source text). */ boolean isStoreOffsetWithTermVector(); /** * True if terms are stored as term vector together with their token positions. */ boolean isStorePositionWithTermVector(); /** True if the value of the field is stored as binary */ boolean isBinary(); /** True if norms are omitted for this indexed field */ boolean getOmitNorms(); /** Expert: * * If set, omit normalization factors associated with this indexed field. * This effectively disables indexing boosts and length normalization for this field. */ void setOmitNorms(boolean omitNorms); /** @deprecated Renamed to {@link AbstractField#setOmitTermFreqAndPositions} */ void setOmitTf(boolean omitTf); /** @deprecated Renamed to {@link AbstractField#getOmitTermFreqAndPositions} */ boolean getOmitTf(); /** * Indicates whether a Field is Lazy or not. The semantics of Lazy loading are such that if a Field is lazily loaded, retrieving * it's values via {@link #stringValue()} or {@link #binaryValue()} is only valid as long as the {@link org.apache.lucene.index.IndexReader} that * retrieved the {@link Document} is still open. 
* * @return true if this field can be loaded lazily */ boolean isLazy(); /** * Returns offset into byte[] segment that is used as value, if Field is not binary * returned value is undefined * @return index of the first character in byte[] segment that represents this Field value */ abstract int getBinaryOffset(); /** * Returns length of byte[] segment that is used as value, if Field is not binary * returned value is undefined * @return length of byte[] segment that represents this Field value */ abstract int getBinaryLength(); /** * Return the raw byte[] for the binary field. Note that * you must also call {@link #getBinaryLength} and {@link * #getBinaryOffset} to know which range of bytes in this * returned array belong to the field. * @return reference to the Field value as byte[]. */ abstract byte[] getBinaryValue(); /** * Return the raw byte[] for the binary field. Note that * you must also call {@link #getBinaryLength} and {@link * #getBinaryOffset} to know which range of bytes in this * returned array belong to the field.

    * About reuse: if you pass in the result byte[] and it is * used, likely the underlying implementation will hold * onto this byte[] and return it in future calls to * {@link #binaryValue()} or {@link #getBinaryValue()}. * So if you subsequently re-use the same byte[] elsewhere * it will alter this Fieldable's value. * @param result User defined buffer that will be used if * possible. If this is null or not large enough, a new * buffer is allocated * @return reference to the Field value as byte[]. */ abstract byte[] getBinaryValue(byte[] result); } lucene-2.9.4/src/java/org/apache/lucene/document/DateField.java0000644000175000017500000001061611474320231024751 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.util.NumericUtils; // for javadocs import java.util.Date; // for javadoc import java.util.Calendar; // for javadoc // do not remove in 3.0, needed for reading old indexes! /** * Provides support for converting dates to strings and vice-versa. * The strings are structured so that lexicographic sorting orders by date, * which makes them suitable for use as field values and search terms. * *

    Note that this class saves dates with millisecond granularity, * which is bad for {@link TermRangeQuery} and {@link PrefixQuery}, as those * queries are expanded to a BooleanQuery with a potentially large number * of terms when searching. Thus you might want to use * {@link DateTools} instead. * *
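 *
 * For example, with {@link DateTools} a date can be indexed at day resolution
 * (sketch; "modified" is an arbitrary field name and doc an existing Document):
 *
 *   String day = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);
 *   doc.add(new Field("modified", day, Field.Store.YES, Field.Index.NOT_ANALYZED));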

    * Note: dates before 1970 cannot be used, and therefore cannot be * indexed when using this class. See {@link DateTools} for an * alternative without such a limitation. * *

    * Another approach is {@link NumericUtils}, which provides * a sortable binary representation (prefix encoded) of numeric values, which * date/time are. * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as * long using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and * index this as a numeric value with {@link NumericField} * and use {@link NumericRangeQuery} to query it. * * @deprecated If you build a new index, use {@link DateTools} or * {@link NumericField} instead. * This class is included for use with existing * indices and will be removed in a future release. */ public class DateField { private DateField() {} // make date strings long enough to last a millenium private static int DATE_LEN = Long.toString(1000L*365*24*60*60*1000, Character.MAX_RADIX).length(); public static String MIN_DATE_STRING() { return timeToString(0); } public static String MAX_DATE_STRING() { char[] buffer = new char[DATE_LEN]; char c = Character.forDigit(Character.MAX_RADIX-1, Character.MAX_RADIX); for (int i = 0 ; i < DATE_LEN; i++) buffer[i] = c; return new String(buffer); } /** * Converts a Date to a string suitable for indexing. * @throws RuntimeException if the date specified in the * method argument is before 1970 */ public static String dateToString(Date date) { return timeToString(date.getTime()); } /** * Converts a millisecond time to a string suitable for indexing. * @throws RuntimeException if the time specified in the * method argument is negative, that is, before 1970 */ public static String timeToString(long time) { if (time < 0) throw new RuntimeException("time '" + time + "' is too early, must be >= 0"); String s = Long.toString(time, Character.MAX_RADIX); if (s.length() > DATE_LEN) throw new RuntimeException("time '" + time + "' is too late, length of string " + "representation must be <= " + DATE_LEN); // Pad with leading zeros if (s.length() < DATE_LEN) { StringBuffer sb = new StringBuffer(s); while (sb.length() < DATE_LEN) sb.insert(0, 0); s = sb.toString(); } return s; } /** Converts a string-encoded date into a millisecond time. */ public static long stringToTime(String s) { return Long.parseLong(s, Character.MAX_RADIX); } /** Converts a string-encoded date into a Date object. */ public static Date stringToDate(String s) { return new Date(stringToTime(s)); } } lucene-2.9.4/src/java/org/apache/lucene/document/NumericField.java0000644000175000017500000002717011474320232025502 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.search.NumericRangeQuery; // javadocs import org.apache.lucene.search.NumericRangeFilter; // javadocs import org.apache.lucene.search.SortField; // javadocs import org.apache.lucene.search.FieldCache; // javadocs /** *

    This class provides a {@link Field} that enables indexing * of numeric values for efficient range filtering and * sorting. Here's an example usage, adding an int value: *

     *  document.add(new NumericField(name).setIntValue(value));
     * 
 *
 * For optimal performance, re-use the
 * NumericField and {@link Document} instance for more than
 * one document:
 *
     *  NumericField field = new NumericField(name);
     *  Document document = new Document();
     *  document.add(field);
     *
     *  for(all documents) {
     *    ...
 *    field.setIntValue(value);
     *    writer.addDocument(document);
     *    ...
     *  }
     * 
    * *

    The java native types int, long, * float and double are * directly supported. However, any value that can be * converted into these native types can also be indexed. * For example, date/time values represented by a * {@link java.util.Date} can be translated into a long * value using the {@link java.util.Date#getTime} method. If you * don't need millisecond precision, you can quantize the * value, either by dividing the result of * {@link java.util.Date#getTime} or using the separate getters * (for year, month, etc.) to construct an int or * long value.
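 *
 * A hedged sketch of such quantization (the field name and the day-sized
 * divisor are assumptions):
 *
 *   long days = date.getTime() / (24L * 60 * 60 * 1000);
 *   document.add(new NumericField("dateDay").setLongValue(days));
 *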

    * *

    To perform range querying or filtering against a * NumericField, use {@link NumericRangeQuery} or {@link * NumericRangeFilter}. To sort according to a * NumericField, use the normal numeric sort types, eg * {@link SortField#INT} (note that {@link SortField#AUTO} * will not work with these fields). NumericField values * can also be loaded directly from {@link FieldCache}.
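 *
 * A hedged sketch (the field name and bounds are assumptions):
 *
 *   Query q = NumericRangeQuery.newIntRange("weight", Integer.valueOf(3),
 *                                           Integer.valueOf(10), true, true);
 *   Sort byWeight = new Sort(new SortField("weight", SortField.INT));
 *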

    * *

    By default, a NumericField's value is not stored but * is indexed for range filtering and sorting. You can use * the {@link #NumericField(String,Field.Store,boolean)} * constructor if you need to change these defaults.

    * *

You may add the same field name as a NumericField to * the same document more than once. Range querying and * filtering will be the logical OR of all values; so a range query * will hit all documents that have at least one value in * the range. However, sort behavior is not defined. If you need to sort, * you should separately index a single-valued NumericField.

    * *

    A NumericField will consume somewhat more disk space * in the index than an ordinary single-valued field. * However, for a typical index that includes substantial * textual content per document, this increase will likely * be in the noise.

    * *

    Within Lucene, each numeric value is indexed as a * trie structure, where each term is logically * assigned to larger and larger pre-defined brackets (which * are simply lower-precision representations of the value). * The step size between each successive bracket is called the * precisionStep, measured in bits. Smaller * precisionStep values result in larger number * of brackets, which consumes more disk space in the index * but may result in faster range search performance. The * default value, 4, was selected for a reasonable tradeoff * of disk space consumption versus performance. You can * use the expert constructor {@link * #NumericField(String,int,Field.Store,boolean)} if you'd * like to change the value. Note that you must also * specify a congruent value when creating {@link * NumericRangeQuery} or {@link NumericRangeFilter}. * For low cardinality fields larger precision steps are good. * If the cardinality is < 100, it is fair * to use {@link Integer#MAX_VALUE}, which produces one * term per value. * *
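 *
 * A hedged sketch of the expert constructor (the field name and the step of 8
 * are assumptions); the same precisionStep must be used when querying:
 *
 *   NumericField price = new NumericField("price", 8, Field.Store.NO, true);
 *   Query q = NumericRangeQuery.newIntRange("price", 8,
 *                 Integer.valueOf(10), Integer.valueOf(100), true, true);
 *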

    For more information on the internals of numeric trie * indexing, including the precisionStep * configuration, see {@link NumericRangeQuery}. The format of * indexed values is described in {@link NumericUtils}. * *

    If you only need to sort by numeric value, and never * run range querying/filtering, you can index using a * precisionStep of {@link Integer#MAX_VALUE}. * This will minimize disk space consumed.

    * *

    More advanced users can instead use {@link * NumericTokenStream} directly, when indexing numbers. This * class is a wrapper around this token stream type for * easier, more intuitive usage.

    * *

    NOTE: This class is only used during * indexing. When retrieving the stored field value from a * {@link Document} instance after search, you will get a * conventional {@link Fieldable} instance where the numeric * values are returned as {@link String}s (according to * toString(value) of the used data type). * *

    NOTE: This API is * experimental and might change in incompatible ways in the * next release. * * @since 2.9 */ public final class NumericField extends AbstractField { private final NumericTokenStream tokenStream; /** * Creates a field for numeric values using the default precisionStep * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * This constructor creates an indexed, but not stored field. * @param name the field name */ public NumericField(String name) { this(name, NumericUtils.PRECISION_STEP_DEFAULT, Field.Store.NO, true); } /** * Creates a field for numeric values using the default precisionStep * {@link NumericUtils#PRECISION_STEP_DEFAULT} (4). The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * @param name the field name * @param store if the field should be stored in plain text form * (according to toString(value) of the used data type) * @param index if the field should be indexed using {@link NumericTokenStream} */ public NumericField(String name, Field.Store store, boolean index) { this(name, NumericUtils.PRECISION_STEP_DEFAULT, store, index); } /** * Creates a field for numeric values with the specified * precisionStep. The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * This constructor creates an indexed, but not stored field. * @param name the field name * @param precisionStep the used precision step */ public NumericField(String name, int precisionStep) { this(name, precisionStep, Field.Store.NO, true); } /** * Creates a field for numeric values with the specified * precisionStep. The instance is not yet initialized with * a numeric value, before indexing a document containing this field, * set a value using the various set???Value() methods. * @param name the field name * @param precisionStep the used precision step * @param store if the field should be stored in plain text form * (according to toString(value) of the used data type) * @param index if the field should be indexed using {@link NumericTokenStream} */ public NumericField(String name, int precisionStep, Field.Store store, boolean index) { super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO); setOmitTermFreqAndPositions(true); tokenStream = new NumericTokenStream(precisionStep); } /** Returns a {@link NumericTokenStream} for indexing the numeric value. */ public TokenStream tokenStreamValue() { return isIndexed() ? tokenStream : null; } /** Returns always null for numeric fields */ public byte[] binaryValue() { return null; } /** Returns always null for numeric fields */ public byte[] getBinaryValue(byte[] result){ return null; } /** Returns always null for numeric fields */ public Reader readerValue() { return null; } /** Returns the numeric value as a string (how it is stored, when {@link Field.Store#YES} is chosen). */ public String stringValue() { return (fieldsData == null) ? null : fieldsData.toString(); } /** Returns the current numeric value as a subclass of {@link Number}, null if not yet initialized. */ public Number getNumericValue() { return (Number) fieldsData; } /** * Initializes the field with the supplied long value. 
* @param value the numeric value * @return this instance, because of this you can use it the following way: * document.add(new NumericField(name, precisionStep).setLongValue(value)) */ public NumericField setLongValue(final long value) { tokenStream.setLongValue(value); fieldsData = new Long(value); return this; } /** * Initializes the field with the supplied int value. * @param value the numeric value * @return this instance, because of this you can use it the following way: * document.add(new NumericField(name, precisionStep).setIntValue(value)) */ public NumericField setIntValue(final int value) { tokenStream.setIntValue(value); fieldsData = new Integer(value); return this; } /** * Initializes the field with the supplied double value. * @param value the numeric value * @return this instance, because of this you can use it the following way: * document.add(new NumericField(name, precisionStep).setDoubleValue(value)) */ public NumericField setDoubleValue(final double value) { tokenStream.setDoubleValue(value); fieldsData = new Double(value); return this; } /** * Initializes the field with the supplied float value. * @param value the numeric value * @return this instance, because of this you can use it the following way: * document.add(new NumericField(name, precisionStep).setFloatValue(value)) */ public NumericField setFloatValue(final float value) { tokenStream.setFloatValue(value); fieldsData = new Float(value); return this; } } lucene-2.9.4/src/java/org/apache/lucene/document/Document.java0000644000175000017500000002671011474320232024711 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.*; // for javadoc import org.apache.lucene.search.ScoreDoc; // for javadoc import org.apache.lucene.search.Searcher; // for javadoc import org.apache.lucene.index.IndexReader; // for javadoc /** Documents are the unit of indexing and search. * * A Document is a set of fields. Each field has a name and a textual value. * A field may be {@link Fieldable#isStored() stored} with the document, in which * case it is returned with search hits on the document. Thus each document * should typically contain one or more stored fields which uniquely identify * it. * *

    Note that fields which are not {@link Fieldable#isStored() stored} are * not available in documents retrieved from the index, e.g. with {@link * ScoreDoc#doc}, {@link Searcher#doc(int)} or {@link * IndexReader#document(int)}. */ public final class Document implements java.io.Serializable { List fields = new ArrayList(); private float boost = 1.0f; /** Constructs a new document with no fields. */ public Document() {} /** Sets a boost factor for hits on any field of this document. This value * will be multiplied into the score of all hits on this document. * *

    The default value is 1.0. * *

    Values are multiplied into the value of {@link Fieldable#getBoost()} of * each field in this document. Thus, this method in effect sets a default * boost for the fields of this document. * * @see Fieldable#setBoost(float) */ public void setBoost(float boost) { this.boost = boost; } /** Returns, at indexing time, the boost factor as set by {@link #setBoost(float)}. * *

    Note that once a document is indexed this value is no longer available * from the index. At search time, for retrieved documents, this method always * returns 1. This however does not mean that the boost value set at indexing * time was ignored - it was just combined with other indexing time factors and * stored elsewhere, for better indexing and search performance. (For more * information see the "norm(t,d)" part of the scoring formula in * {@link org.apache.lucene.search.Similarity Similarity}.) * * @see #setBoost(float) */ public float getBoost() { return boost; } /** *

    Adds a field to a document. Several fields may be added with * the same name. In this case, if the fields are indexed, their text is * treated as though appended for the purposes of search.
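 *
 * A hedged sketch (the field names, values and the writer are assumptions):
 *
 *   Document doc = new Document();
 *   doc.add(new Field("title", "a sample title",
 *                     Field.Store.YES, Field.Index.ANALYZED));
 *   doc.add(new Field("contents", contentsReader));   // tokenized, not stored
 *   writer.addDocument(doc);
 *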

    *

Note that add, like the removeField(s) methods, only makes sense * prior to adding a document to an index. These methods cannot * be used to change the content of an existing index! To achieve this, * a document has to be deleted from an index and a new, changed version of that * document has to be added.

    */ public final void add(Fieldable field) { fields.add(field); } /** *

    Removes field with the specified name from the document. * If multiple fields exist with this name, this method removes the first field that has been added. * If there is no field with the specified name, the document remains unchanged.

    *

Note that the removeField(s) methods, like the add method, only make sense * prior to adding a document to an index. These methods cannot * be used to change the content of an existing index! To achieve this, * a document has to be deleted from an index and a new, changed version of that * document has to be added.

    */ public final void removeField(String name) { Iterator it = fields.iterator(); while (it.hasNext()) { Fieldable field = (Fieldable)it.next(); if (field.name().equals(name)) { it.remove(); return; } } } /** *

    Removes all fields with the given name from the document. * If there is no field with the specified name, the document remains unchanged.

    *

Note that the removeField(s) methods, like the add method, only make sense * prior to adding a document to an index. These methods cannot * be used to change the content of an existing index! To achieve this, * a document has to be deleted from an index and a new, changed version of that * document has to be added.

    */ public final void removeFields(String name) { Iterator it = fields.iterator(); while (it.hasNext()) { Fieldable field = (Fieldable)it.next(); if (field.name().equals(name)) { it.remove(); } } } /** Returns a field with the given name if any exist in this document, or * null. If multiple fields exists with this name, this method returns the * first value added. * Do not use this method with lazy loaded fields. */ public final Field getField(String name) { for (int i = 0; i < fields.size(); i++) { Field field = (Field)fields.get(i); if (field.name().equals(name)) return field; } return null; } /** Returns a field with the given name if any exist in this document, or * null. If multiple fields exists with this name, this method returns the * first value added. */ public Fieldable getFieldable(String name) { for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name)) return field; } return null; } /** Returns the string value of the field with the given name if any exist in * this document, or null. If multiple fields exist with this name, this * method returns the first value added. If only binary fields with this name * exist, returns null. */ public final String get(String name) { for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name) && (!field.isBinary())) return field.stringValue(); } return null; } /** Returns an Enumeration of all the fields in a document. * @deprecated use {@link #getFields()} instead */ public final Enumeration fields() { return new Enumeration() { final Iterator iter = fields.iterator(); public boolean hasMoreElements() { return iter.hasNext(); } public Object nextElement() { return iter.next(); } }; } /** Returns a List of all the fields in a document. *

    Note that fields which are not {@link Fieldable#isStored() stored} are * not available in documents retrieved from the * index, e.g. {@link Searcher#doc(int)} or {@link * IndexReader#document(int)}. */ public final List getFields() { return fields; } private final static Field[] NO_FIELDS = new Field[0]; /** * Returns an array of {@link Field}s with the given name. * Do not use with lazy loaded fields. * This method returns an empty array when there are no * matching fields. It never returns null. * * @param name the name of the field * @return a Field[] array */ public final Field[] getFields(String name) { List result = new ArrayList(); for (int i = 0; i < fields.size(); i++) { Field field = (Field)fields.get(i); if (field.name().equals(name)) { result.add(field); } } if (result.size() == 0) return NO_FIELDS; return (Field[])result.toArray(new Field[result.size()]); } private final static Fieldable[] NO_FIELDABLES = new Fieldable[0]; /** * Returns an array of {@link Fieldable}s with the given name. * This method returns an empty array when there are no * matching fields. It never returns null. * * @param name the name of the field * @return a Fieldable[] array */ public Fieldable[] getFieldables(String name) { List result = new ArrayList(); for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name)) { result.add(field); } } if (result.size() == 0) return NO_FIELDABLES; return (Fieldable[])result.toArray(new Fieldable[result.size()]); } private final static String[] NO_STRINGS = new String[0]; /** * Returns an array of values of the field specified as the method parameter. * This method returns an empty array when there are no * matching fields. It never returns null. * @param name the name of the field * @return a String[] of field values */ public final String[] getValues(String name) { List result = new ArrayList(); for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name) && (!field.isBinary())) result.add(field.stringValue()); } if (result.size() == 0) return NO_STRINGS; return (String[])result.toArray(new String[result.size()]); } private final static byte[][] NO_BYTES = new byte[0][]; /** * Returns an array of byte arrays for of the fields that have the name specified * as the method parameter. This method returns an empty * array when there are no matching fields. It never * returns null. * * @param name the name of the field * @return a byte[][] of binary field values */ public final byte[][] getBinaryValues(String name) { List result = new ArrayList(); for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name) && (field.isBinary())) result.add(field.binaryValue()); } if (result.size() == 0) return NO_BYTES; return (byte[][])result.toArray(new byte[result.size()][]); } /** * Returns an array of bytes for the first (or only) field that has the name * specified as the method parameter. This method will return null * if no binary fields with the specified name are available. * There may be non-binary fields with the same name. * * @param name the name of the field. 
* @return a byte[] containing the binary field value or null */ public final byte[] getBinaryValue(String name) { for (int i=0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); if (field.name().equals(name) && (field.isBinary())) return field.binaryValue(); } return null; } /** Prints the fields of a document for human consumption. */ public final String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("Document<"); for (int i = 0; i < fields.size(); i++) { Fieldable field = (Fieldable)fields.get(i); buffer.append(field.toString()); if (i != fields.size()-1) buffer.append(" "); } buffer.append(">"); return buffer.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/document/CompressionTools.java0000644000175000017500000001074711474320231026457 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.zip.Deflater; import java.util.zip.Inflater; import java.util.zip.DataFormatException; import java.io.ByteArrayOutputStream; import org.apache.lucene.util.UnicodeUtil; /** Simple utility class providing static methods to * compress and decompress binary data for stored fields. * This class uses java.util.zip.Deflater and Inflater * classes to compress and decompress, which is the same * format previously used by the now deprecated * Field.Store.COMPRESS. */ public class CompressionTools { // Export only static methods private CompressionTools() {} /** Compresses the specified byte range using the * specified compressionLevel (constants are defined in * java.util.zip.Deflater). */ public static byte[] compress(byte[] value, int offset, int length, int compressionLevel) { /* Create an expandable byte array to hold the compressed data. * You cannot use an array that's the same size as the orginal because * there is no guarantee that the compressed data will be smaller than * the uncompressed data. 
*/ ByteArrayOutputStream bos = new ByteArrayOutputStream(length); Deflater compressor = new Deflater(); try { compressor.setLevel(compressionLevel); compressor.setInput(value, offset, length); compressor.finish(); // Compress the data final byte[] buf = new byte[1024]; while (!compressor.finished()) { int count = compressor.deflate(buf); bos.write(buf, 0, count); } } finally { compressor.end(); } return bos.toByteArray(); } /** Compresses the specified byte range, with default BEST_COMPRESSION level */ public static byte[] compress(byte[] value, int offset, int length) { return compress(value, offset, length, Deflater.BEST_COMPRESSION); } /** Compresses all bytes in the array, with default BEST_COMPRESSION level */ public static byte[] compress(byte[] value) { return compress(value, 0, value.length, Deflater.BEST_COMPRESSION); } /** Compresses the String value, with default BEST_COMPRESSION level */ public static byte[] compressString(String value) { return compressString(value, Deflater.BEST_COMPRESSION); } /** Compresses the String value using the specified * compressionLevel (constants are defined in * java.util.zip.Deflater). */ public static byte[] compressString(String value, int compressionLevel) { UnicodeUtil.UTF8Result result = new UnicodeUtil.UTF8Result(); UnicodeUtil.UTF16toUTF8(value, 0, value.length(), result); return compress(result.result, 0, result.length, compressionLevel); } /** Decompress the byte array previously returned by * compress */ public static byte[] decompress(byte[] value) throws DataFormatException { // Create an expandable byte array to hold the decompressed data ByteArrayOutputStream bos = new ByteArrayOutputStream(value.length); Inflater decompressor = new Inflater(); try { decompressor.setInput(value); // Decompress the data final byte[] buf = new byte[1024]; while (!decompressor.finished()) { int count = decompressor.inflate(buf); bos.write(buf, 0, count); } } finally { decompressor.end(); } return bos.toByteArray(); } /** Decompress the byte array previously returned by * compressString back into a String */ public static String decompressString(byte[] value) throws DataFormatException { UnicodeUtil.UTF16Result result = new UnicodeUtil.UTF16Result(); final byte[] bytes = decompress(value); UnicodeUtil.UTF8toUTF16(bytes, 0, bytes.length, result); return new String(result.result, 0, result.length); } } lucene-2.9.4/src/java/org/apache/lucene/document/package.html0000644000175000017500000001015411474320232024544 0ustar janpascaljanpascal

    The logical representation of a {@link org.apache.lucene.document.Document} for indexing and searching.

    The document package provides the user level logical representation of content to be indexed and searched. The package also provides utilities for working with {@link org.apache.lucene.document.Document}s and {@link org.apache.lucene.document.Fieldable}s.

    Document and Fieldable

A {@link org.apache.lucene.document.Document} is a collection of {@link org.apache.lucene.document.Fieldable}s. A {@link org.apache.lucene.document.Fieldable} is a logical representation of a user's content that needs to be indexed or stored. {@link org.apache.lucene.document.Fieldable}s have a number of properties that tell Lucene how to treat the content (like indexed, tokenized, stored, etc.). See the {@link org.apache.lucene.document.Field} implementation of {@link org.apache.lucene.document.Fieldable} for specifics on these properties.
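For example (an illustrative sketch; the field names and values are assumptions, not part of the API), two fields with different property combinations might be created like this:

    doc.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("contents", text, Field.Store.NO, Field.Index.ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS));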

    Note: it is common to refer to {@link org.apache.lucene.document.Document}s having {@link org.apache.lucene.document.Field}s, even though technically they have {@link org.apache.lucene.document.Fieldable}s.

    Working with Documents

First and foremost, a {@link org.apache.lucene.document.Document} is something created by the user application. It is your job to create Documents based on the content of the files you are working with in your application (Word, txt, PDF, Excel or any other format). How this is done is completely up to you. That being said, there are many tools available in other projects that can simplify the process of taking a file and converting it into a Lucene {@link org.apache.lucene.document.Document}. To see an example of this, take a look at the Lucene demo and the associated source code for extracting content from HTML.

{@link org.apache.lucene.document.DateTools} is a utility class to make dates and times searchable (remember, Lucene only searches text). {@link org.apache.lucene.document.NumericField} is a special helper class to simplify indexing of numeric values (and also dates) for fast range queries with {@link org.apache.lucene.search.NumericRangeQuery} (using a special sortable string representation of numeric values).
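As an illustrative sketch (the field names are assumptions), the same date could be made searchable either way:

    doc.add(new Field("released", DateTools.dateToString(date, DateTools.Resolution.DAY),
                      Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new NumericField("releasedMillis").setLongValue(date.getTime()));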

    The {@link org.apache.lucene.document.FieldSelector} class provides a mechanism to tell Lucene how to load Documents from storage. If no FieldSelector is used, all Fieldables on a Document will be loaded. As an example of the FieldSelector usage, consider the common use case of displaying search results on a web page and then having users click through to see the full document. In this scenario, it is often the case that there are many small fields and one or two large fields (containing the contents of the original file). Before the FieldSelector, the full Document had to be loaded, including the large fields, in order to display the results. Now, using the FieldSelector, one can {@link org.apache.lucene.document.FieldSelectorResult#LAZY_LOAD} the large fields, thus only loading the large fields when a user clicks on the actual link to view the original content.
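A hedged sketch of such a selector (the "contents" field name and the indexReader/docId variables are assumptions):

    FieldSelector selector = new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        return "contents".equals(fieldName)
            ? FieldSelectorResult.LAZY_LOAD : FieldSelectorResult.LOAD;
      }
    };
    Document hit = indexReader.document(docId, selector);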

    lucene-2.9.4/src/java/org/apache/lucene/document/AbstractField.java0000644000175000017500000003011711474320232025636 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Copyright 2006 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.search.PhraseQuery; // for javadocs import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.util.StringHelper; // for javadocs /** * * **/ public abstract class AbstractField implements Fieldable { protected String name = "body"; protected boolean storeTermVector = false; protected boolean storeOffsetWithTermVector = false; protected boolean storePositionWithTermVector = false; protected boolean omitNorms = false; protected boolean isStored = false; protected boolean isIndexed = true; protected boolean isTokenized = true; protected boolean isBinary = false; protected boolean isCompressed = false; protected boolean lazy = false; protected boolean omitTermFreqAndPositions = false; protected float boost = 1.0f; // the data object for all different kind of field values protected Object fieldsData = null; // pre-analyzed tokenStream for indexed fields protected TokenStream tokenStream; // length/offset for all primitive types protected int binaryLength; protected int binaryOffset; protected AbstractField() { } protected AbstractField(String name, Field.Store store, Field.Index index, Field.TermVector termVector) { if (name == null) throw new NullPointerException("name cannot be null"); this.name = StringHelper.intern(name); // field names are interned if (store == Field.Store.YES){ this.isStored = true; this.isCompressed = false; } else if (store == Field.Store.COMPRESS) { this.isStored = true; this.isCompressed = true; } else if (store == Field.Store.NO){ this.isStored = false; this.isCompressed = false; } else throw new IllegalArgumentException("unknown store parameter " + store); if (index == Field.Index.NO) { this.isIndexed = false; this.isTokenized = false; } else if (index == Field.Index.ANALYZED) { this.isIndexed = true; this.isTokenized = true; } else if (index == Field.Index.NOT_ANALYZED) { this.isIndexed = true; this.isTokenized = false; } else if (index == Field.Index.NOT_ANALYZED_NO_NORMS) { this.isIndexed = true; this.isTokenized = false; this.omitNorms = true; } else if (index == Field.Index.ANALYZED_NO_NORMS) { this.isIndexed = true; this.isTokenized = true; this.omitNorms = true; } else { throw new IllegalArgumentException("unknown index parameter " + index); } this.isBinary = false; setStoreTermVector(termVector); } /** Sets the boost factor hits on this field. This value will be * multiplied into the score of all hits on this this field of this * document. * *

    The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document * containing this field. If a document has multiple fields with the same * name, all such values are multiplied together. This product is then * used to compute the norm factor for the field. By * default, in the {@link * org.apache.lucene.search.Similarity#computeNorm(String, * FieldInvertState)} method, the boost value is multipled * by the {@link * org.apache.lucene.search.Similarity#lengthNorm(String, * int)} and then * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the * index. One should attempt to ensure that this product does not overflow * the range of that encoding. * * @see org.apache.lucene.document.Document#setBoost(float) * @see org.apache.lucene.search.Similarity#computeNorm(String, org.apache.lucene.index.FieldInvertState) * @see org.apache.lucene.search.Similarity#encodeNorm(float) */ public void setBoost(float boost) { this.boost = boost; } /** Returns the boost factor for hits for this field. * *

    The default value is 1.0. * *

    Note: this value is not stored directly with the document in the index. * Documents returned from {@link org.apache.lucene.index.IndexReader#document(int)} and * {@link org.apache.lucene.search.Hits#doc(int)} may thus not have the same value present as when * this field was indexed. * * @see #setBoost(float) */ public float getBoost() { return boost; } /** Returns the name of the field as an interned string. * For example "date", "title", "body", ... */ public String name() { return name; } protected void setStoreTermVector(Field.TermVector termVector) { if (termVector == Field.TermVector.NO) { this.storeTermVector = false; this.storePositionWithTermVector = false; this.storeOffsetWithTermVector = false; } else if (termVector == Field.TermVector.YES) { this.storeTermVector = true; this.storePositionWithTermVector = false; this.storeOffsetWithTermVector = false; } else if (termVector == Field.TermVector.WITH_POSITIONS) { this.storeTermVector = true; this.storePositionWithTermVector = true; this.storeOffsetWithTermVector = false; } else if (termVector == Field.TermVector.WITH_OFFSETS) { this.storeTermVector = true; this.storePositionWithTermVector = false; this.storeOffsetWithTermVector = true; } else if (termVector == Field.TermVector.WITH_POSITIONS_OFFSETS) { this.storeTermVector = true; this.storePositionWithTermVector = true; this.storeOffsetWithTermVector = true; } else { throw new IllegalArgumentException("unknown termVector parameter " + termVector); } } /** True iff the value of the field is to be stored in the index for return with search hits. It is an error for this to be true if a field is Reader-valued. */ public final boolean isStored() { return isStored; } /** True iff the value of the field is to be indexed, so that it may be searched on. */ public final boolean isIndexed() { return isIndexed; } /** True iff the value of the field should be tokenized as text prior to indexing. Un-tokenized fields are indexed as a single word and may not be Reader-valued. */ public final boolean isTokenized() { return isTokenized; } /** True if the value of the field is stored and compressed within the index */ public final boolean isCompressed() { return isCompressed; } /** True iff the term or terms used to index this field are stored as a term * vector, available from {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. * These methods do not provide access to the original content of the field, * only to terms used to index it. If the original content must be * preserved, use the stored attribute instead. * * @see org.apache.lucene.index.IndexReader#getTermFreqVector(int, String) */ public final boolean isTermVectorStored() { return storeTermVector; } /** * True iff terms are stored as term vector together with their offsets * (start and end position in source text). */ public boolean isStoreOffsetWithTermVector(){ return storeOffsetWithTermVector; } /** * True iff terms are stored as term vector together with their token positions. */ public boolean isStorePositionWithTermVector(){ return storePositionWithTermVector; } /** True iff the value of the filed is stored as binary */ public final boolean isBinary() { return isBinary; } /** * Return the raw byte[] for the binary field. Note that * you must also call {@link #getBinaryLength} and {@link * #getBinaryOffset} to know which range of bytes in this * returned array belong to the field. * @return reference to the Field value as byte[]. 
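 *
 * A hedged usage sketch (the variable names are assumptions):
 *
 *   byte[] raw = fieldable.getBinaryValue();
 *   byte[] copy = new byte[fieldable.getBinaryLength()];
 *   System.arraycopy(raw, fieldable.getBinaryOffset(), copy, 0, copy.length);
 *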
*/ public byte[] getBinaryValue() { return getBinaryValue(null); } public byte[] getBinaryValue(byte[] result){ if (isBinary || fieldsData instanceof byte[]) return (byte[]) fieldsData; else return null; } /** * Returns length of byte[] segment that is used as value, if Field is not binary * returned value is undefined * @return length of byte[] segment that represents this Field value */ public int getBinaryLength() { if (isBinary) { if (!isCompressed) return binaryLength; else return ((byte[]) fieldsData).length; } else if (fieldsData instanceof byte[]) return ((byte[]) fieldsData).length; else return 0; } /** * Returns offset into byte[] segment that is used as value, if Field is not binary * returned value is undefined * @return index of the first character in byte[] segment that represents this Field value */ public int getBinaryOffset() { return binaryOffset; } /** True if norms are omitted for this indexed field */ public boolean getOmitNorms() { return omitNorms; } /** @deprecated Renamed to {@link #getOmitTermFreqAndPositions} */ public boolean getOmitTf() { return omitTermFreqAndPositions; } /** @see #setOmitTermFreqAndPositions */ public boolean getOmitTermFreqAndPositions() { return omitTermFreqAndPositions; } /** Expert: * * If set, omit normalization factors associated with this indexed field. * This effectively disables indexing boosts and length normalization for this field. */ public void setOmitNorms(boolean omitNorms) { this.omitNorms=omitNorms; } /** @deprecated Renamed to {@link #setOmitTermFreqAndPositions} */ public void setOmitTf(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; } /** Expert: * * If set, omit term freq, positions and payloads from * postings for this field. * *

    NOTE: While this option reduces storage space * required in the index, it also means any query * requiring positional information, such as {@link * PhraseQuery} or {@link SpanQuery} subclasses will * silently fail to find results. */ public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; } public boolean isLazy() { return lazy; } /** Prints a Field for human consumption. */ public final String toString() { StringBuffer result = new StringBuffer(); if (isStored) { result.append("stored"); if (isCompressed) result.append("/compressed"); else result.append("/uncompressed"); } if (isIndexed) { if (result.length() > 0) result.append(","); result.append("indexed"); } if (isTokenized) { if (result.length() > 0) result.append(","); result.append("tokenized"); } if (storeTermVector) { if (result.length() > 0) result.append(","); result.append("termVector"); } if (storeOffsetWithTermVector) { if (result.length() > 0) result.append(","); result.append("termVectorOffsets"); } if (storePositionWithTermVector) { if (result.length() > 0) result.append(","); result.append("termVectorPosition"); } if (isBinary) { if (result.length() > 0) result.append(","); result.append("binary"); } if (omitNorms) { result.append(",omitNorms"); } if (omitTermFreqAndPositions) { result.append(",omitTermFreqAndPositions"); } if (lazy){ result.append(",lazy"); } result.append('<'); result.append(name); result.append(':'); if (fieldsData != null && lazy == false) { result.append(fieldsData); } result.append('>'); return result.toString(); } } lucene-2.9.4/src/java/org/apache/lucene/document/LoadFirstFieldSelector.java0000644000175000017500000000163611474320232027467 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Load the First field and break. *

    * See {@link FieldSelectorResult#LOAD_AND_BREAK} */ public class LoadFirstFieldSelector implements FieldSelector { public FieldSelectorResult accept(String fieldName) { return FieldSelectorResult.LOAD_AND_BREAK; } }lucene-2.9.4/src/java/org/apache/lucene/document/Field.java0000644000175000017500000005075311474320232024162 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.index.IndexWriter; // for javadoc import org.apache.lucene.util.Parameter; import org.apache.lucene.util.StringHelper; import java.io.Reader; import java.io.Serializable; /** A field is a section of a Document. Each field has two parts, a name and a value. Values may be free text, provided as a String or as a Reader, or they may be atomic keywords, which are not further processed. Such keywords may be used to represent dates, urls, etc. Fields are optionally stored in the index, so that they may be returned with hits on the document. */ public final class Field extends AbstractField implements Fieldable, Serializable { /** Specifies whether and how a field should be stored. */ public static final class Store extends Parameter implements Serializable { private Store(String name) { super(name); } /** Store the original field value in the index in a compressed form. This is * useful for long documents and for binary valued fields. * @deprecated Please use {@link CompressionTools} instead. * For string fields that were previously indexed and stored using compression, * the new way to achieve this is: First add the field indexed-only (no store) * and additionally using the same field name as a binary, stored field * with {@link CompressionTools#compressString}. */ public static final Store COMPRESS = new Store("COMPRESS"); /** Store the original field value in the index. This is useful for short texts * like a document's title which should be displayed with the results. The * value is stored in its original form, i.e. no analyzer is used before it is * stored. */ public static final Store YES = new Store("YES"); /** Do not store the field value in the index. */ public static final Store NO = new Store("NO"); } /** Specifies whether and how a field should be indexed. */ public static final class Index extends Parameter implements Serializable { private Index(String name) { super(name); } /** Do not index the field value. This field can thus not be searched, * but one can still access its contents provided it is * {@link Field.Store stored}. */ public static final Index NO = new Index("NO"); /** Index the tokens produced by running the field's * value through an Analyzer. This is useful for * common text. 
*/ public static final Index ANALYZED = new Index("ANALYZED"); /** @deprecated this has been renamed to {@link #ANALYZED} */ public static final Index TOKENIZED = ANALYZED; /** Index the field's value without using an Analyzer, so it can be searched. * As no analyzer is used the value will be stored as a single term. This is * useful for unique Ids like product numbers. */ public static final Index NOT_ANALYZED = new Index("NOT_ANALYZED"); /** @deprecated This has been renamed to {@link #NOT_ANALYZED} */ public static final Index UN_TOKENIZED = NOT_ANALYZED; /** Expert: Index the field's value without an Analyzer, * and also disable the storing of norms. Note that you * can also separately enable/disable norms by calling * {@link Field#setOmitNorms}. No norms means that * index-time field and document boosting and field * length normalization are disabled. The benefit is * less memory usage as norms take up one byte of RAM * per indexed field for every document in the index, * during searching. Note that once you index a given * field with norms enabled, disabling norms will * have no effect. In other words, for this to have the * above described effect on a field, all instances of * that field must be indexed with NOT_ANALYZED_NO_NORMS * from the beginning. */ public static final Index NOT_ANALYZED_NO_NORMS = new Index("NOT_ANALYZED_NO_NORMS"); /** @deprecated This has been renamed to * {@link #NOT_ANALYZED_NO_NORMS} */ public static final Index NO_NORMS = NOT_ANALYZED_NO_NORMS; /** Expert: Index the tokens produced by running the * field's value through an Analyzer, and also * separately disable the storing of norms. See * {@link #NOT_ANALYZED_NO_NORMS} for what norms are * and why you may want to disable them. */ public static final Index ANALYZED_NO_NORMS = new Index("ANALYZED_NO_NORMS"); } /** Specifies whether and how a field should have term vectors. */ public static final class TermVector extends Parameter implements Serializable { private TermVector(String name) { super(name); } /** Do not store term vectors. */ public static final TermVector NO = new TermVector("NO"); /** Store the term vectors of each document. A term vector is a list * of the document's terms and their number of occurrences in that document. */ public static final TermVector YES = new TermVector("YES"); /** * Store the term vector + token position information * * @see #YES */ public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS"); /** * Store the term vector + Token offset information * * @see #YES */ public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS"); /** * Store the term vector + Token position and offset information * * @see #YES * @see #WITH_POSITIONS * @see #WITH_OFFSETS */ public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS"); } /** The value of the field as a String, or null. If null, the Reader value or * binary value is used. Exactly one of stringValue(), * readerValue(), and getBinaryValue() must be set. */ public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; } /** The value of the field as a Reader, or null. If null, the String value or * binary value is used. Exactly one of stringValue(), * readerValue(), and getBinaryValue() must be set. */ public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; } /** The value of the field in Binary, or null. If null, the Reader value, * or String value is used. 
Exactly one of stringValue(), * readerValue(), and getBinaryValue() must be set. * @deprecated This method must allocate a new byte[] if * the {@link AbstractField#getBinaryOffset()} is non-zero * or {@link AbstractField#getBinaryLength()} is not the * full length of the byte[]. Please use {@link * AbstractField#getBinaryValue()} instead, which simply * returns the byte[]. */ public byte[] binaryValue() { if (!isBinary) return null; final byte[] data = (byte[]) fieldsData; if (binaryOffset == 0 && data.length == binaryLength) return data; //Optimization final byte[] ret = new byte[binaryLength]; System.arraycopy(data, binaryOffset, ret, 0, binaryLength); return ret; } /** The TokesStream for this field to be used when indexing, or null. If null, the Reader value * or String value is analyzed to produce the indexed tokens. */ public TokenStream tokenStreamValue() { return tokenStream; } /**

    Expert: change the value of this field. This can * be used during indexing to re-use a single Field * instance to improve indexing speed by avoiding GC cost * of new'ing and reclaiming Field instances. Typically * a single {@link Document} instance is re-used as * well. This helps most on small documents.

    * *

    Each Field instance should only be used once * within a single {@link Document} instance. See ImproveIndexingSpeed * for details.
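 *
 * A hedged sketch of the re-use pattern (the field name, texts array and
 * writer are assumptions):
 *
 *   Field contents = new Field("contents", "", Field.Store.NO, Field.Index.ANALYZED);
 *   Document doc = new Document();
 *   doc.add(contents);
 *   for (int i = 0; i < texts.length; i++) {
 *     contents.setValue(texts[i]);
 *     writer.addDocument(doc);
 *   }
 *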

    */ public void setValue(String value) { if (isBinary) { throw new IllegalArgumentException("cannot set a String value on a binary field"); } fieldsData = value; } /** Expert: change the value of this field. See setValue(String). */ public void setValue(Reader value) { if (isBinary) { throw new IllegalArgumentException("cannot set a Reader value on a binary field"); } if (isStored) { throw new IllegalArgumentException("cannot set a Reader value on a stored field"); } fieldsData = value; } /** Expert: change the value of this field. See setValue(String). */ public void setValue(byte[] value) { if (!isBinary) { throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field"); } fieldsData = value; binaryLength = value.length; binaryOffset = 0; } /** Expert: change the value of this field. See setValue(String). */ public void setValue(byte[] value, int offset, int length) { if (!isBinary) { throw new IllegalArgumentException("cannot set a byte[] value on a non-binary field"); } fieldsData = value; binaryLength = length; binaryOffset = offset; } /** Expert: change the value of this field. See setValue(String). * @deprecated use {@link #setTokenStream} */ public void setValue(TokenStream value) { if (isBinary) { throw new IllegalArgumentException("cannot set a TokenStream value on a binary field"); } if (isStored) { throw new IllegalArgumentException("cannot set a TokenStream value on a stored field"); } fieldsData = null; tokenStream = value; } /** Expert: sets the token stream to be used for indexing and causes isIndexed() and isTokenized() to return true. * May be combined with stored values from stringValue() or binaryValue() */ public void setTokenStream(TokenStream tokenStream) { this.isIndexed = true; this.isTokenized = true; this.tokenStream = tokenStream; } /** * Create a field by specifying its name, value and how it will * be saved in the index. Term vectors will not be stored in the index. * * @param name The name of the field * @param value The string to process * @param store Whether value should be stored in the index * @param index Whether the field should be indexed, and if so, if it should * be tokenized before indexing * @throws NullPointerException if name or value is null * @throws IllegalArgumentException if the field is neither stored nor indexed */ public Field(String name, String value, Store store, Index index) { this(name, value, store, index, TermVector.NO); } /** * Create a field by specifying its name, value and how it will * be saved in the index. * * @param name The name of the field * @param value The string to process * @param store Whether value should be stored in the index * @param index Whether the field should be indexed, and if so, if it should * be tokenized before indexing * @param termVector Whether term vector should be stored * @throws NullPointerException if name or value is null * @throws IllegalArgumentException in any of the following situations: *
      *
    • the field is neither stored nor indexed
    • *
    • the field is not indexed but termVector is TermVector.YES
    • *
    */ public Field(String name, String value, Store store, Index index, TermVector termVector) { this(name, true, value, store, index, termVector); } /** * Create a field by specifying its name, value and how it will * be saved in the index. * * @param name The name of the field * @param internName Whether to .intern() name or not * @param value The string to process * @param store Whether value should be stored in the index * @param index Whether the field should be indexed, and if so, if it should * be tokenized before indexing * @param termVector Whether term vector should be stored * @throws NullPointerException if name or value is null * @throws IllegalArgumentException in any of the following situations: *
      *
    • the field is neither stored nor indexed
    • *
    • the field is not indexed but termVector is TermVector.YES
    • *
    */ public Field(String name, boolean internName, String value, Store store, Index index, TermVector termVector) { if (name == null) throw new NullPointerException("name cannot be null"); if (value == null) throw new NullPointerException("value cannot be null"); if (name.length() == 0 && value.length() == 0) throw new IllegalArgumentException("name and value cannot both be empty"); if (index == Index.NO && store == Store.NO) throw new IllegalArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored"); if (index == Index.NO && termVector != TermVector.NO) throw new IllegalArgumentException("cannot store term vector information " + "for a field that is not indexed"); if (internName) // field names are optionally interned name = StringHelper.intern(name); this.name = name; this.fieldsData = value; if (store == Store.YES){ this.isStored = true; this.isCompressed = false; } else if (store == Store.COMPRESS) { this.isStored = true; this.isCompressed = true; } else if (store == Store.NO){ this.isStored = false; this.isCompressed = false; } else throw new IllegalArgumentException("unknown store parameter " + store); if (index == Index.NO) { this.isIndexed = false; this.isTokenized = false; this.omitTermFreqAndPositions = false; this.omitNorms = true; } else if (index == Index.ANALYZED) { this.isIndexed = true; this.isTokenized = true; } else if (index == Index.NOT_ANALYZED) { this.isIndexed = true; this.isTokenized = false; } else if (index == Index.NOT_ANALYZED_NO_NORMS) { this.isIndexed = true; this.isTokenized = false; this.omitNorms = true; } else if (index == Index.ANALYZED_NO_NORMS) { this.isIndexed = true; this.isTokenized = true; this.omitNorms = true; } else { throw new IllegalArgumentException("unknown index parameter " + index); } this.isBinary = false; setStoreTermVector(termVector); } /** * Create a tokenized and indexed field that is not stored. Term vectors will * not be stored. The Reader is read only when the Document is added to the index, * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)} * has been called. * * @param name The name of the field * @param reader The reader with the content * @throws NullPointerException if name or reader is null */ public Field(String name, Reader reader) { this(name, reader, TermVector.NO); } /** * Create a tokenized and indexed field that is not stored, optionally with * storing term vectors. The Reader is read only when the Document is added to the index, * i.e. you may not close the Reader until {@link IndexWriter#addDocument(Document)} * has been called. * * @param name The name of the field * @param reader The reader with the content * @param termVector Whether term vector should be stored * @throws NullPointerException if name or reader is null */ public Field(String name, Reader reader, TermVector termVector) { if (name == null) throw new NullPointerException("name cannot be null"); if (reader == null) throw new NullPointerException("reader cannot be null"); this.name = StringHelper.intern(name); // field names are interned this.fieldsData = reader; this.isStored = false; this.isCompressed = false; this.isIndexed = true; this.isTokenized = true; this.isBinary = false; setStoreTermVector(termVector); } /** * Create a tokenized and indexed field that is not stored. Term vectors will * not be stored. This is useful for pre-analyzed fields. * The TokenStream is read only when the Document is added to the index, * i.e. 
you may not close the TokenStream until {@link IndexWriter#addDocument(Document)} * has been called. * * @param name The name of the field * @param tokenStream The TokenStream with the content * @throws NullPointerException if name or tokenStream is null */ public Field(String name, TokenStream tokenStream) { this(name, tokenStream, TermVector.NO); } /** * Create a tokenized and indexed field that is not stored, optionally with * storing term vectors. This is useful for pre-analyzed fields. * The TokenStream is read only when the Document is added to the index, * i.e. you may not close the TokenStream until {@link IndexWriter#addDocument(Document)} * has been called. * * @param name The name of the field * @param tokenStream The TokenStream with the content * @param termVector Whether term vector should be stored * @throws NullPointerException if name or tokenStream is null */ public Field(String name, TokenStream tokenStream, TermVector termVector) { if (name == null) throw new NullPointerException("name cannot be null"); if (tokenStream == null) throw new NullPointerException("tokenStream cannot be null"); this.name = StringHelper.intern(name); // field names are interned this.fieldsData = null; this.tokenStream = tokenStream; this.isStored = false; this.isCompressed = false; this.isIndexed = true; this.isTokenized = true; this.isBinary = false; setStoreTermVector(termVector); } /** * Create a stored field with binary value. Optionally the value may be compressed. * * @param name The name of the field * @param value The binary value * @param store How value should be stored (compressed or not) * @throws IllegalArgumentException if store is Store.NO */ public Field(String name, byte[] value, Store store) { this(name, value, 0, value.length, store); } /** * Create a stored field with binary value. Optionally the value may be compressed. * * @param name The name of the field * @param value The binary value * @param offset Starting offset in value where this Field's bytes are * @param length Number of bytes to use for this Field, starting at offset * @param store How value should be stored (compressed or not) * @throws IllegalArgumentException if store is Store.NO */ public Field(String name, byte[] value, int offset, int length, Store store) { if (name == null) throw new IllegalArgumentException("name cannot be null"); if (value == null) throw new IllegalArgumentException("value cannot be null"); this.name = StringHelper.intern(name); // field names are interned fieldsData = value; if (store == Store.YES) { isStored = true; isCompressed = false; } else if (store == Store.COMPRESS) { isStored = true; isCompressed = true; } else if (store == Store.NO) throw new IllegalArgumentException("binary values can't be unstored"); else throw new IllegalArgumentException("unknown store parameter " + store); isIndexed = false; isTokenized = false; omitTermFreqAndPositions = false; omitNorms = true; isBinary = true; binaryLength = length; binaryOffset = offset; setStoreTermVector(TermVector.NO); } } lucene-2.9.4/src/java/org/apache/lucene/document/DateTools.java0000644000175000017500000002451211474320232025027 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.TimeZone; import java.util.Locale; import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.util.NumericUtils; // for javadocs /** * Provides support for converting dates to strings and vice-versa. * The strings are structured so that lexicographic sorting orders * them by date, which makes them suitable for use as field values * and search terms. * *

    This class also helps you to limit the resolution of your dates. Do not * save dates with a finer resolution than you really need, as then * RangeQuery and PrefixQuery will require more memory and become slower. * *

    Compared to {@link DateField} the strings generated by the methods * in this class take slightly more space, unless your selected resolution * is set to Resolution.DAY or lower. * *
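*
* A minimal round-trip sketch (not part of the original javadoc; the field name
* "modified" and the Document doc are illustrative assumptions):
*
*   String token = DateTools.dateToString(new Date(), DateTools.Resolution.DAY);
*   doc.add(new Field("modified", token, Field.Store.YES, Field.Index.NOT_ANALYZED));
*   // later, turn the stored token back into a Date
*   Date modified = DateTools.stringToDate(token);
*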

    * Another approach is {@link NumericUtils}, which provides * a sortable binary representation (prefix encoded) of numeric values, which * date/time are. * For indexing a {@link Date} or {@link Calendar}, just get the unix timestamp as * long using {@link Date#getTime} or {@link Calendar#getTimeInMillis} and * index this as a numeric value with {@link NumericField} * and use {@link NumericRangeQuery} to query it. */ public class DateTools { private final static TimeZone GMT = TimeZone.getTimeZone("GMT"); private static final SimpleDateFormat YEAR_FORMAT = new SimpleDateFormat("yyyy", Locale.US); private static final SimpleDateFormat MONTH_FORMAT = new SimpleDateFormat("yyyyMM", Locale.US); private static final SimpleDateFormat DAY_FORMAT = new SimpleDateFormat("yyyyMMdd", Locale.US); private static final SimpleDateFormat HOUR_FORMAT = new SimpleDateFormat("yyyyMMddHH", Locale.US); private static final SimpleDateFormat MINUTE_FORMAT = new SimpleDateFormat("yyyyMMddHHmm", Locale.US); private static final SimpleDateFormat SECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss", Locale.US); private static final SimpleDateFormat MILLISECOND_FORMAT = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.US); static { // times need to be normalized so the value doesn't depend on the // location the index is created/used: YEAR_FORMAT.setTimeZone(GMT); MONTH_FORMAT.setTimeZone(GMT); DAY_FORMAT.setTimeZone(GMT); HOUR_FORMAT.setTimeZone(GMT); MINUTE_FORMAT.setTimeZone(GMT); SECOND_FORMAT.setTimeZone(GMT); MILLISECOND_FORMAT.setTimeZone(GMT); } private static final Calendar calInstance = Calendar.getInstance(GMT); // cannot create, the class has static methods only private DateTools() {} /** * Converts a Date to a string suitable for indexing. * * @param date the date to be converted * @param resolution the desired resolution, see * {@link #round(Date, DateTools.Resolution)} * @return a string in format yyyyMMddHHmmssSSS or shorter, * depending on resolution; using GMT as timezone */ public static synchronized String dateToString(Date date, Resolution resolution) { return timeToString(date.getTime(), resolution); } /** * Converts a millisecond time to a string suitable for indexing. * * @param time the date expressed as milliseconds since January 1, 1970, 00:00:00 GMT * @param resolution the desired resolution, see * {@link #round(long, DateTools.Resolution)} * @return a string in format yyyyMMddHHmmssSSS or shorter, * depending on resolution; using GMT as timezone */ public static synchronized String timeToString(long time, Resolution resolution) { calInstance.setTimeInMillis(round(time, resolution)); Date date = calInstance.getTime(); if (resolution == Resolution.YEAR) { return YEAR_FORMAT.format(date); } else if (resolution == Resolution.MONTH) { return MONTH_FORMAT.format(date); } else if (resolution == Resolution.DAY) { return DAY_FORMAT.format(date); } else if (resolution == Resolution.HOUR) { return HOUR_FORMAT.format(date); } else if (resolution == Resolution.MINUTE) { return MINUTE_FORMAT.format(date); } else if (resolution == Resolution.SECOND) { return SECOND_FORMAT.format(date); } else if (resolution == Resolution.MILLISECOND) { return MILLISECOND_FORMAT.format(date); } throw new IllegalArgumentException("unknown resolution " + resolution); } /** * Converts a string produced by timeToString or * dateToString back to a time, represented as the * number of milliseconds since January 1, 1970, 00:00:00 GMT. 
* * @param dateString the date string to be converted * @return the number of milliseconds since January 1, 1970, 00:00:00 GMT * @throws ParseException if dateString is not in the * expected format */ public static synchronized long stringToTime(String dateString) throws ParseException { return stringToDate(dateString).getTime(); } /** * Converts a string produced by timeToString or * dateToString back to a time, represented as a * Date object. * * @param dateString the date string to be converted * @return the parsed time as a Date object * @throws ParseException if dateString is not in the * expected format */ public static synchronized Date stringToDate(String dateString) throws ParseException { if (dateString.length() == 4) { return YEAR_FORMAT.parse(dateString); } else if (dateString.length() == 6) { return MONTH_FORMAT.parse(dateString); } else if (dateString.length() == 8) { return DAY_FORMAT.parse(dateString); } else if (dateString.length() == 10) { return HOUR_FORMAT.parse(dateString); } else if (dateString.length() == 12) { return MINUTE_FORMAT.parse(dateString); } else if (dateString.length() == 14) { return SECOND_FORMAT.parse(dateString); } else if (dateString.length() == 17) { return MILLISECOND_FORMAT.parse(dateString); } throw new ParseException("Input is not valid date string: " + dateString, 0); } /** * Limit a date's resolution. For example, the date 2004-09-21 13:50:11 * will be changed to 2004-09-01 00:00:00 when using * Resolution.MONTH. * * @param resolution The desired resolution of the date to be returned * @return the date with all values more precise than resolution * set to 0 or 1 */ public static synchronized Date round(Date date, Resolution resolution) { return new Date(round(date.getTime(), resolution)); } /** * Limit a date's resolution. For example, the date 1095767411000 * (which represents 2004-09-21 13:50:11) will be changed to * 1093989600000 (2004-09-01 00:00:00) when using * Resolution.MONTH. 
* * @param resolution The desired resolution of the date to be returned * @return the date with all values more precise than resolution * set to 0 or 1, expressed as milliseconds since January 1, 1970, 00:00:00 GMT */ public static synchronized long round(long time, Resolution resolution) { calInstance.setTimeInMillis(time); if (resolution == Resolution.YEAR) { calInstance.set(Calendar.MONTH, 0); calInstance.set(Calendar.DAY_OF_MONTH, 1); calInstance.set(Calendar.HOUR_OF_DAY, 0); calInstance.set(Calendar.MINUTE, 0); calInstance.set(Calendar.SECOND, 0); calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.MONTH) { calInstance.set(Calendar.DAY_OF_MONTH, 1); calInstance.set(Calendar.HOUR_OF_DAY, 0); calInstance.set(Calendar.MINUTE, 0); calInstance.set(Calendar.SECOND, 0); calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.DAY) { calInstance.set(Calendar.HOUR_OF_DAY, 0); calInstance.set(Calendar.MINUTE, 0); calInstance.set(Calendar.SECOND, 0); calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.HOUR) { calInstance.set(Calendar.MINUTE, 0); calInstance.set(Calendar.SECOND, 0); calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.MINUTE) { calInstance.set(Calendar.SECOND, 0); calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.SECOND) { calInstance.set(Calendar.MILLISECOND, 0); } else if (resolution == Resolution.MILLISECOND) { // don't cut off anything } else { throw new IllegalArgumentException("unknown resolution " + resolution); } return calInstance.getTimeInMillis(); } /** Specifies the time granularity. */ public static class Resolution { public static final Resolution YEAR = new Resolution("year"); public static final Resolution MONTH = new Resolution("month"); public static final Resolution DAY = new Resolution("day"); public static final Resolution HOUR = new Resolution("hour"); public static final Resolution MINUTE = new Resolution("minute"); public static final Resolution SECOND = new Resolution("second"); public static final Resolution MILLISECOND = new Resolution("millisecond"); private String resolution; private Resolution() { } private Resolution(String resolution) { this.resolution = resolution; } public String toString() { return resolution; } } } lucene-2.9.4/src/java/org/apache/lucene/document/FieldSelector.java0000644000175000017500000000233511474320232025654 0ustar janpascaljanpascalpackage org.apache.lucene.document; import java.io.Serializable; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * Similar to a {@link java.io.FileFilter}, the FieldSelector allows one to make decisions about * what Fields get loaded on a {@link Document} by {@link org.apache.lucene.index.IndexReader#document(int,org.apache.lucene.document.FieldSelector)} * **/ public interface FieldSelector extends Serializable { /** * * @param fieldName the field to accept or reject * @return an instance of {@link FieldSelectorResult} * if the {@link Field} named fieldName should be loaded. */ FieldSelectorResult accept(String fieldName); } lucene-2.9.4/src/java/org/apache/lucene/document/NumberTools.java0000644000175000017500000001075311474320232025404 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.util.NumericUtils; // for javadocs // do not remove this class in 3.0, it may be needed to decode old indexes! /** * Provides support for converting longs to Strings, and back again. The strings * are structured so that lexicographic sorting order is preserved. * *

    * That is, if l1 is less than l2 for any two longs l1 and l2, then * NumberTools.longToString(l1) is lexicographically less than * NumberTools.longToString(l2). (Similarly for "greater than" and "equals".) * *
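*
* A minimal sketch of that property (not part of the original javadoc):
*
*   String a = NumberTools.longToString(5L);
*   String b = NumberTools.longToString(40L);
*   // a sorts before b lexicographically, matching the numeric order of 5 and 40
*   long back = NumberTools.stringToLong(a);   // back == 5L
*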

    The create argument to the {@link #IndexWriter(Directory, Analyzer, boolean) constructor} determines whether a new index is created, or whether an existing index is opened. Note that you can open an index with create=true even while readers are using the index. The old readers will continue to search the "point in time" snapshot they had opened, and won't see the newly created index until they re-open. There are also {@link #IndexWriter(Directory, Analyzer) constructors} with no create argument which will create a new index if there is not already an index at the provided path and otherwise open the existing index.
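*
* A minimal sketch (not part of the original javadoc; the path is illustrative):
*
*   Directory dir = FSDirectory.open(new File("/path/to/index"));
*   // true replaces any existing index; false appends to it. The constructors
*   // without the boolean create a new index only if none exists yet.
*   IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(),
*       true, IndexWriter.MaxFieldLength.LIMITED);
*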

    In either case, documents are added with {@link #addDocument(Document) addDocument} and removed with {@link #deleteDocuments(Term)} or {@link #deleteDocuments(Query)}. A document can be updated with {@link #updateDocument(Term, Document) updateDocument} (which just deletes and then adds the entire document). When finished adding, deleting and updating documents, {@link #close() close} should be called.
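*
* Continuing the sketch above (not part of the original javadoc; the field
* values are illustrative):
*
*   Document doc = new Document();
*   doc.add(new Field("id", "42", Field.Store.YES, Field.Index.NOT_ANALYZED));
*   doc.add(new Field("body", "hello lucene", Field.Store.NO, Field.Index.ANALYZED));
*   writer.addDocument(doc);
*   writer.updateDocument(new Term("id", "42"), doc);   // delete-then-add by term
*   writer.deleteDocuments(new Term("id", "42"));       // remove it again
*   writer.close();
*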

    These changes are buffered in memory and periodically flushed to the {@link Directory} (during the above method calls). A flush is triggered when there are enough buffered deletes (see {@link #setMaxBufferedDeleteTerms}) or enough added documents since the last flush, whichever is sooner. For the added documents, flushing is triggered either by RAM usage of the documents (see {@link #setRAMBufferSizeMB}) or the number of added documents. The default is to flush when RAM usage hits 16 MB. For best indexing speed you should flush by RAM usage with a large RAM buffer. Note that flushing just moves the internal buffered state in IndexWriter into the index, but these changes are not visible to IndexReader until either {@link #commit()} or {@link #close} is called. A flush may also trigger one or more segment merges which by default run with a background thread so as not to block the addDocument calls (see below for changing the {@link MergeScheduler}).
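*
* A configuration sketch (not part of the original javadoc; 48 MB is an
* arbitrary illustration and "writer" is an open IndexWriter):
*
*   writer.setRAMBufferSizeMB(48.0);                           // flush by RAM usage
*   writer.setMaxBufferedDocs(IndexWriter.DISABLE_AUTO_FLUSH); // not by document count
*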

    The optional autoCommit argument to the {@link #IndexWriter(Directory, boolean, Analyzer) constructors} controls visibility of the changes to {@link IndexReader} instances reading the same index. When this is false, changes are not visible until {@link #close()} or {@link #commit()} is called. Note that changes will still be flushed to the {@link Directory} as new files, but are not committed (no new segments_N file is written referencing the new files, nor are the files sync'd to stable storage) until {@link #close()} or {@link #commit()} is called. If something goes terribly wrong (for example the JVM crashes), then the index will reflect none of the changes made since the last commit, or the starting state if commit was not called. You can also call {@link #rollback()}, which closes the writer without committing any changes, and removes any index files that had been flushed but are now unreferenced. This mode is useful for preventing readers from refreshing at a bad time (for example after you've done all your deletes but before you've done your adds). It can also be used to implement simple single-writer transactional semantics ("all or none"). You can do a two-phase commit by calling {@link #prepareCommit()} followed by {@link #commit()}. This is necessary when Lucene is working with an external resource (for example, a database) and both must either commit or rollback the transaction.
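*
* A sketch of the rollback and two-phase commit patterns above (not part of the
* original javadoc; "writer" is an open IndexWriter and freshDoc stands in for a
* Document built elsewhere):
*
*   try {
*     writer.deleteDocuments(new Term("state", "stale"));
*     writer.addDocument(freshDoc);
*     writer.prepareCommit();   // phase one: flush and sync, still invisible to readers
*     // ... commit the external resource (for example a database) here ...
*     writer.commit();          // phase two: publish the new segments_N
*   } catch (Exception e) {
*     writer.rollback();        // discard everything since the last commit
*   }
*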

    When autoCommit is true then the writer will periodically commit on its own. [Deprecated: Note that in 3.0, IndexWriter will no longer accept autoCommit=true (it will be hardwired to false). You can always call {@link #commit()} yourself when needed]. There is no guarantee when exactly an auto commit will occur (it used to be after every flush, but it is now after every completed merge, as of 2.4). If you want to force a commit, call {@link #commit()}, or, close the writer. Once a commit has finished, newly opened {@link IndexReader} instances will see the changes to the index as of that commit. When running in this mode, be careful not to refresh your readers while optimize or segment merges are taking place as this can tie up substantial disk space.

    Regardless of autoCommit, an {@link IndexReader} or {@link org.apache.lucene.search.IndexSearcher} will only see the index as of the "point in time" that it was opened. Any changes committed to the index after the reader was opened are not visible until the reader is re-opened.
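*
* A sketch of refreshing a reader after a commit (not part of the original
* javadoc; "dir" is the index Directory):
*
*   IndexReader reader = IndexReader.open(dir, true);   // read-only, point-in-time view
*   // ... the writer adds documents and calls commit() ...
*   IndexReader newReader = reader.reopen();
*   if (newReader != reader) {
*     reader.close();           // reopen() returned a refreshed instance
*     reader = newReader;
*   }
*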

    If an index will not have more documents added for a while and optimal search performance is desired, then either the full {@link #optimize() optimize} method or partial {@link #optimize(int)} method should be called before the index is closed.
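*
* For example (a sketch, not part of the original javadoc; "writer" is an open
* IndexWriter):
*
*   writer.optimize();   // or writer.optimize(5) to merge down to at most 5 segments
*   writer.close();
*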

    Opening an IndexWriter creates a lock file for the directory in use. Trying to open another IndexWriter on the same directory will lead to a {@link LockObtainFailedException}. The {@link LockObtainFailedException} is also thrown if an IndexReader on the same directory is used to delete documents from the index.
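*
* A sketch of clearing a stale lock left behind by a crashed process (not part
* of the original javadoc; only do this when no other writer can still be alive):
*
*   if (IndexWriter.isLocked(dir)) {
*     IndexWriter.unlock(dir);   // forcibly removes write.lock
*   }
*   IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(),
*       IndexWriter.MaxFieldLength.LIMITED);
*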

    Expert: IndexWriter allows an optional {@link IndexDeletionPolicy} implementation to be specified. You can use this to control when prior commits are deleted from the index. The default policy is {@link KeepOnlyLastCommitDeletionPolicy} which removes all prior commits as soon as a new commit is done (this matches behavior before 2.2). Creating your own policy can allow you to explicitly keep previous "point in time" commits alive in the index for some time, to allow readers to refresh to the new commit without having the old commit deleted out from under them. This is necessary on filesystems like NFS that do not support "delete on last close" semantics, which Lucene's "point in time" search normally relies on.
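*
* A sketch of passing an explicit policy to the writer (not part of the original
* javadoc; the default {@link KeepOnlyLastCommitDeletionPolicy} merely marks the
* spot where a custom policy would go):
*
*   IndexDeletionPolicy policy = new KeepOnlyLastCommitDeletionPolicy();
*   IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(),
*       policy, IndexWriter.MaxFieldLength.LIMITED);
*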

Expert: IndexWriter allows you to separately change the {@link MergePolicy} and the {@link MergeScheduler}. The {@link MergePolicy} is invoked whenever there are changes to the segments in the index. Its role is to select which merges to do, if any, and return a {@link MergePolicy.MergeSpecification} describing the merges. It also selects merges to do for optimize(). (The default is {@link LogByteSizeMergePolicy}.) Then, the {@link MergeScheduler} is invoked with the requested merges and it decides when and how to run the merges. The default is {@link ConcurrentMergeScheduler}.
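*
* A configuration sketch (not part of the original javadoc; the merge factor
* value is an arbitrary illustration and "writer" is an open IndexWriter):
*
*   LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(writer);
*   mp.setMergeFactor(5);
*   writer.setMergePolicy(mp);
*   writer.setMergeScheduler(new SerialMergeScheduler());   // run merges inline
*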

    NOTE: if you hit an OutOfMemoryError then IndexWriter will quietly record this fact and block all future segment commits. This is a defensive measure in case any internal state (buffered documents and deletions) were corrupted. Any subsequent calls to {@link #commit()} will throw an IllegalStateException. The only course of action is to call {@link #close()}, which internally will call {@link #rollback()}, to undo any changes to the index since the last commit. If you opened the writer with autoCommit false you can also just call {@link #rollback()} directly.

    NOTE: {@link IndexWriter} instances are completely thread safe, meaning multiple threads can call any of its methods, concurrently. If your application requires external synchronization, you should not synchronize on the IndexWriter instance as this may cause deadlock; use your own (non-Lucene) objects instead.

    */ /* * Clarification: Check Points (and commits) * Being able to set autoCommit=false allows IndexWriter to flush and * write new index files to the directory without writing a new segments_N * file which references these new files. It also means that the state of * the in memory SegmentInfos object is different than the most recent * segments_N file written to the directory. * * Each time the SegmentInfos is changed, and matches the (possibly * modified) directory files, we have a new "check point". * If the modified/new SegmentInfos is written to disk - as a new * (generation of) segments_N file - this check point is also an * IndexCommit. * * With autoCommit=true, every checkPoint is also a CommitPoint. * With autoCommit=false, some checkPoints may not be commits. * * A new checkpoint always replaces the previous checkpoint and * becomes the new "front" of the index. This allows the IndexFileDeleter * to delete files that are referenced only by stale checkpoints. * (files that were created since the last commit, but are no longer * referenced by the "front" of the index). For this, IndexFileDeleter * keeps track of the last non commit checkpoint. */ public class IndexWriter { /** * Default value for the write lock timeout (1,000). * @see #setDefaultWriteLockTimeout */ public static long WRITE_LOCK_TIMEOUT = 1000; private long writeLockTimeout = WRITE_LOCK_TIMEOUT; /** * Name of the write lock in the index. */ public static final String WRITE_LOCK_NAME = "write.lock"; /** * @deprecated * @see LogMergePolicy#DEFAULT_MERGE_FACTOR */ public final static int DEFAULT_MERGE_FACTOR = LogMergePolicy.DEFAULT_MERGE_FACTOR; /** * Value to denote a flush trigger is disabled */ public final static int DISABLE_AUTO_FLUSH = -1; /** * Disabled by default (because IndexWriter flushes by RAM usage * by default). Change using {@link #setMaxBufferedDocs(int)}. */ public final static int DEFAULT_MAX_BUFFERED_DOCS = DISABLE_AUTO_FLUSH; /** * Default value is 16 MB (which means flush when buffered * docs consume 16 MB RAM). Change using {@link #setRAMBufferSizeMB}. */ public final static double DEFAULT_RAM_BUFFER_SIZE_MB = 16.0; /** * Disabled by default (because IndexWriter flushes by RAM usage * by default). Change using {@link #setMaxBufferedDeleteTerms(int)}. */ public final static int DEFAULT_MAX_BUFFERED_DELETE_TERMS = DISABLE_AUTO_FLUSH; /** * @deprecated * @see LogDocMergePolicy#DEFAULT_MAX_MERGE_DOCS */ public final static int DEFAULT_MAX_MERGE_DOCS = LogDocMergePolicy.DEFAULT_MAX_MERGE_DOCS; /** * Default value is 10,000. Change using {@link #setMaxFieldLength(int)}. */ public final static int DEFAULT_MAX_FIELD_LENGTH = 10000; /** * Default value is 128. Change using {@link #setTermIndexInterval(int)}. */ public final static int DEFAULT_TERM_INDEX_INTERVAL = 128; /** * Absolute hard maximum length for a term. If a term * arrives from the analyzer longer than this length, it * is skipped and a message is printed to infoStream, if * set (see {@link #setInfoStream}). */ public final static int MAX_TERM_LENGTH = DocumentsWriter.MAX_TERM_LENGTH; /** * Default for {@link #getMaxSyncPauseSeconds}. On * Windows this defaults to 10.0 seconds; elsewhere it's * 0. */ public final static double DEFAULT_MAX_SYNC_PAUSE_SECONDS; static { if (Constants.WINDOWS) DEFAULT_MAX_SYNC_PAUSE_SECONDS = 10.0; else DEFAULT_MAX_SYNC_PAUSE_SECONDS = 0.0; } // The normal read buffer size defaults to 1024, but // increasing this during merging seems to yield // performance gains. 
However we don't want to increase // it too much because there are quite a few // BufferedIndexInputs created during merging. See // LUCENE-888 for details. private final static int MERGE_READ_BUFFER_SIZE = 4096; // Used for printing messages private static Object MESSAGE_ID_LOCK = new Object(); private static int MESSAGE_ID = 0; private int messageID = -1; volatile private boolean hitOOM; private Directory directory; // where this index resides private Analyzer analyzer; // how to analyze text private Similarity similarity = Similarity.getDefault(); // how to normalize private volatile long changeCount; // increments every time a change is completed private long lastCommitChangeCount; // last changeCount that was committed private SegmentInfos rollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails private HashMap rollbackSegments; volatile SegmentInfos pendingCommit; // set when a commit is pending (after prepareCommit() & before commit()) volatile long pendingCommitChangeCount; private SegmentInfos localRollbackSegmentInfos; // segmentInfos we will fallback to if the commit fails private boolean localAutoCommit; // saved autoCommit during local transaction private int localFlushedDocCount; // saved docWriter.getFlushedDocCount during local transaction private boolean autoCommit = true; // false if we should commit only on close private SegmentInfos segmentInfos = new SegmentInfos(); // the segments private DocumentsWriter docWriter; private IndexFileDeleter deleter; private Set segmentsToOptimize = new HashSet(); // used by optimize to note those needing optimization private int optimizeMaxNumSegments; private Lock writeLock; private int termIndexInterval = DEFAULT_TERM_INDEX_INTERVAL; private boolean closeDir; private boolean closed; private boolean closing; // Holds all SegmentInfo instances currently involved in // merges private HashSet mergingSegments = new HashSet(); private MergePolicy mergePolicy = new LogByteSizeMergePolicy(this); private MergeScheduler mergeScheduler = new ConcurrentMergeScheduler(); private LinkedList pendingMerges = new LinkedList(); private Set runningMerges = new HashSet(); private List mergeExceptions = new ArrayList(); private long mergeGen; private boolean stopMerges; private int flushCount; private int flushDeletesCount; private double maxSyncPauseSeconds = DEFAULT_MAX_SYNC_PAUSE_SECONDS; // Used to only allow one addIndexes to proceed at once // TODO: use ReadWriteLock once we are on 5.0 private int readCount; // count of how many threads are holding read lock private Thread writeThread; // non-null if any thread holds write lock final ReaderPool readerPool = new ReaderPool(); private int upgradeCount; private int readerTermsIndexDivisor = IndexReader.DEFAULT_TERMS_INDEX_DIVISOR; // This is a "write once" variable (like the organic dye // on a DVD-R that may or may not be heated by a laser and // then cooled to permanently record the event): it's // false, until getReader() is called for the first time, // at which point it's switched to true and never changes // back to false. Once this is true, we hold open and // reuse SegmentReader instances internally for applying // deletes, doing merges, and reopening near real-time // readers. private volatile boolean poolReaders; /** * Expert: returns a readonly reader, covering all * committed as well as un-committed changes to the index. 
* This provides "near real-time" searching, in that * changes made during an IndexWriter session can be * quickly made available for searching without closing * the writer nor calling {@link #commit}. * *

Note that this is functionally equivalent to calling * {@link #commit} and then using {@link IndexReader#open} to * open a new reader. But the turnaround time of this * method should be faster since it avoids the potentially * costly {@link #commit}.

    * *

    You must close the {@link IndexReader} returned by * this method once you are done using it.
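*
* A near real-time sketch (not part of the original javadoc; "doc" is a Document
* built elsewhere):
*
*   writer.addDocument(doc);
*   IndexReader reader = writer.getReader();     // sees the uncommitted document
*   IndexSearcher searcher = new IndexSearcher(reader);
*   // ... run queries ...
*   searcher.close();
*   reader.close();
*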

    * *

    It's near real-time because there is no hard * guarantee on how quickly you can get a new reader after * making changes with IndexWriter. You'll have to * experiment in your situation to determine if it's * fast enough. As this is a new and experimental * feature, please report back on your findings so we can * learn, improve and iterate.

    * *

    The resulting reader supports {@link * IndexReader#reopen}, but that call will simply forward * back to this method (though this may change in the * future).

    * *

    The very first time this method is called, this * writer instance will make every effort to pool the * readers that it opens for doing merges, applying * deletes, etc. This means additional resources (RAM, * file descriptors, CPU time) will be consumed.

    * *

    For lower latency on reopening a reader, you should * call {@link #setMergedSegmentWarmer} to * pre-warm a newly merged segment before it's committed * to the index. This is important for minimizing * index-to-search delay after a large merge.
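*
* A sketch of registering a warmer (not part of the original javadoc; the
* warm-up query is purely illustrative):
*
*   writer.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
*     public void warm(IndexReader reader) throws IOException {
*       // touch the new segment, e.g. run a cheap query to prime caches
*       new IndexSearcher(reader).search(new TermQuery(new Term("body", "warm")), 1);
*     }
*   });
*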

    * *

If an addIndexes* call is running in another thread, * then this reader will only search those segments from * the foreign index that have been successfully copied * over, so far.

* *

    NOTE: Once the writer is closed, any * outstanding readers may continue to be used. However, * if you attempt to reopen any of those readers, you'll * hit an {@link AlreadyClosedException}.

    * *

    NOTE: This API is experimental and might * change in incompatible ways in the next release.

    * * @return IndexReader that covers entire index plus all * changes made so far by this IndexWriter instance * * @throws IOException */ public IndexReader getReader() throws IOException { return getReader(readerTermsIndexDivisor); } /** Expert: like {@link #getReader}, except you can * specify which termInfosIndexDivisor should be used for * any newly opened readers. * @param termInfosIndexDivisor Subsamples which indexed * terms are loaded into RAM. This has the same effect as {@link * IndexWriter#setTermIndexInterval} except that setting * must be done at indexing time while this setting can be * set per reader. When set to N, then one in every * N*termIndexInterval terms in the index is loaded into * memory. By setting this to a value > 1 you can reduce * memory usage, at the expense of higher latency when * loading a TermInfo. The default value is 1. Set this * to -1 to skip loading the terms index entirely. */ public IndexReader getReader(int termInfosIndexDivisor) throws IOException { ensureOpen(); if (infoStream != null) { message("flush at getReader"); } // Do this up front before flushing so that the readers // obtained during this flush are pooled, the first time // this method is called: poolReaders = true; // Prevent segmentInfos from changing while opening the // reader; in theory we could do similar retry logic, // just like we do when loading segments_N IndexReader r; synchronized(this) { flush(false, true, true); r = new ReadOnlyDirectoryReader(this, segmentInfos, termInfosIndexDivisor); } maybeMerge(); return r; } /** Holds shared SegmentReader instances. IndexWriter uses * SegmentReaders for 1) applying deletes, 2) doing * merges, 3) handing out a real-time reader. This pool * reuses instances of the SegmentReaders in all these * places if it is in "near real-time mode" (getReader() * has been called on this instance). */ class ReaderPool { private final Map readerMap = new HashMap(); /** Forcefully clear changes for the specified segments, * and remove from the pool. This is called on successful merge. */ synchronized void clear(SegmentInfos infos) throws IOException { if (infos == null) { Iterator iter = readerMap.entrySet().iterator(); while (iter.hasNext()) { Map.Entry ent = (Map.Entry) iter.next(); ((SegmentReader) ent.getValue()).hasChanges = false; } } else { final int numSegments = infos.size(); for(int i=0;i 0) doWait(); // We could have been closed while we were waiting: ensureOpen(); writeThread = Thread.currentThread(); } synchronized void releaseWrite() { assert Thread.currentThread() == writeThread; writeThread = null; notifyAll(); } synchronized void acquireRead() { final Thread current = Thread.currentThread(); while(writeThread != null && writeThread != current) doWait(); readCount++; } // Allows one readLock to upgrade to a writeLock even if // there are other readLocks as long as all other // readLocks are also blocked in this method: synchronized void upgradeReadToWrite() { assert readCount > 0; upgradeCount++; while(readCount > upgradeCount || writeThread != null) { doWait(); } writeThread = Thread.currentThread(); readCount--; upgradeCount--; } synchronized void releaseRead() { readCount--; assert readCount >= 0; notifyAll(); } synchronized final boolean isOpen(boolean includePendingClose) { return !(closed || (includePendingClose && closing)); } /** * Used internally to throw an {@link * AlreadyClosedException} if this IndexWriter has been * closed. 
* @throws AlreadyClosedException if this IndexWriter is closed */ protected synchronized final void ensureOpen(boolean includePendingClose) throws AlreadyClosedException { if (!isOpen(includePendingClose)) { throw new AlreadyClosedException("this IndexWriter is closed"); } } protected synchronized final void ensureOpen() throws AlreadyClosedException { ensureOpen(true); } /** * Prints a message to the infoStream (if non-null), * prefixed with the identifying information for this * writer and the thread that's calling it. */ public void message(String message) { if (infoStream != null) infoStream.println("IW " + messageID + " [" + new Date() + "; " + Thread.currentThread().getName() + "]: " + message); } private synchronized void setMessageID(PrintStream infoStream) { if (infoStream != null && messageID == -1) { synchronized(MESSAGE_ID_LOCK) { messageID = MESSAGE_ID++; } } this.infoStream = infoStream; } /** * Casts current mergePolicy to LogMergePolicy, and throws * an exception if the mergePolicy is not a LogMergePolicy. */ private LogMergePolicy getLogMergePolicy() { if (mergePolicy instanceof LogMergePolicy) return (LogMergePolicy) mergePolicy; else throw new IllegalArgumentException("this method can only be called when the merge policy is the default LogMergePolicy"); } /**

    Get the current setting of whether newly flushed * segments will use the compound file format. Note that * this just returns the value previously set with * setUseCompoundFile(boolean), or the default value * (true). You cannot use this to query the status of * previously flushed segments.

    * *

    Note that this method is a convenience method: it * just calls mergePolicy.getUseCompoundFile as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.

    * * @see #setUseCompoundFile(boolean) */ public boolean getUseCompoundFile() { return getLogMergePolicy().getUseCompoundFile(); } /**

    Setting to turn on usage of a compound file. When on, * multiple files for each segment are merged into a * single file when a new segment is flushed.

    * *

    Note that this method is a convenience method: it * just calls mergePolicy.setUseCompoundFile as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.
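*
* For example (a sketch, not part of the original javadoc):
*
*   writer.setUseCompoundFile(false);   // keep one file per segment component
*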

    */ public void setUseCompoundFile(boolean value) { getLogMergePolicy().setUseCompoundFile(value); getLogMergePolicy().setUseCompoundDocStore(value); } /** Expert: Set the Similarity implementation used by this IndexWriter. * * @see Similarity#setDefault(Similarity) */ public void setSimilarity(Similarity similarity) { ensureOpen(); this.similarity = similarity; docWriter.setSimilarity(similarity); } /** Expert: Return the Similarity implementation used by this IndexWriter. * *

    This defaults to the current value of {@link Similarity#getDefault()}. */ public Similarity getSimilarity() { ensureOpen(); return this.similarity; } /** Expert: Set the interval between indexed terms. Large values cause less * memory to be used by IndexReader, but slow random-access to terms. Small * values cause more memory to be used by an IndexReader, and speed * random-access to terms. * * This parameter determines the amount of computation required per query * term, regardless of the number of documents that contain that term. In * particular, it is the maximum number of other terms that must be * scanned before a term is located and its frequency and position information * may be processed. In a large index with user-entered query terms, query * processing time is likely to be dominated not by term lookup but rather * by the processing of frequency and positional data. In a small index * or when many uncommon query terms are generated (e.g., by wildcard * queries) term lookup may become a dominant cost. * * In particular, numUniqueTerms/interval terms are read into * memory by an IndexReader, and, on average, interval/2 terms * must be scanned for each random term access. * * @see #DEFAULT_TERM_INDEX_INTERVAL */ public void setTermIndexInterval(int interval) { ensureOpen(); this.termIndexInterval = interval; } /** Expert: Return the interval between indexed terms. * * @see #setTermIndexInterval(int) */ public int getTermIndexInterval() { // We pass false because this method is called by SegmentMerger while we are in the process of closing ensureOpen(false); return termIndexInterval; } /** * Constructs an IndexWriter for the index in path. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * path, replacing the index already there, * if any. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param path the path to the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param mfl Maximum field length in number of tokens/terms: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated Use {@link #IndexWriter(Directory, Analyzer, * boolean, MaxFieldLength)} */ public IndexWriter(String path, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in path. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * path, replacing the index already there, if any. * * @param path the path to the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,boolean,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(String path, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in path. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * path, replacing the index already there, if any. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param path the path to the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param mfl Maximum field length in number of terms/tokens: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated Use {@link #IndexWriter(Directory, * Analyzer, boolean, MaxFieldLength)} */ public IndexWriter(File path, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, create, true, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in path. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * path, replacing the index already there, if any. * * @param path the path to the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,boolean,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(File path, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, create, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in d. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * d, replacing the index already there, if any. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param mfl Maximum field length in number of terms/tokens: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error */ public IndexWriter(Directory d, Analyzer a, boolean create, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in d. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * d, replacing the index already there, if any. * * @param d the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 * release, and call {@link #commit()} when needed. * Use {@link #IndexWriter(Directory,Analyzer,boolean,MaxFieldLength)} instead. */ public IndexWriter(Directory d, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in * path, first creating it if it does not * already exist. Text will be analyzed with * a. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param path the path to the index directory * @param a the analyzer to use * @param mfl Maximum field length in number of terms/tokens: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated Use {@link #IndexWriter(Directory, Analyzer, MaxFieldLength)} */ public IndexWriter(String path, Analyzer a, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, true, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in * path, first creating it if it does not * already exist. Text will be analyzed with * a. * * @param path the path to the index directory * @param a the analyzer to use * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 * release, and call {@link #commit()} when needed. * Use {@link #IndexWriter(Directory,Analyzer,MaxFieldLength)} instead. */ public IndexWriter(String path, Analyzer a) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in * path, first creating it if it does not * already exist. Text will be analyzed with * a. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param path the path to the index directory * @param a the analyzer to use * @param mfl Maximum field length in number of terms/tokens: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated Use {@link #IndexWriter(Directory, * Analyzer, MaxFieldLength)} */ public IndexWriter(File path, Analyzer a, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, true, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in * path, first creating it if it does not * already exist. Text will be analyzed with * a. * * @param path the path to the index directory * @param a the analyzer to use * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link #IndexWriter(Directory,Analyzer,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(File path, Analyzer a) throws CorruptIndexException, LockObtainFailedException, IOException { init(FSDirectory.getDirectory(path), a, true, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in * d, first creating it if it does not * already exist. Text will be analyzed with * a. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param mfl Maximum field length in number of terms/tokens: LIMITED, UNLIMITED, or user-specified * via the MaxFieldLength constructor. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error */ public IndexWriter(Directory d, Analyzer a, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, null, false, mfl.getLimit(), null, null); } /** * Constructs an IndexWriter for the index in * d, first creating it if it does not * already exist. Text will be analyzed with * a. * * @param d the index directory * @param a the analyzer to use * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(Directory d, Analyzer a) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, null, true, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in * d, first creating it if it does not * already exist. Text will be analyzed with * a. * * @param d the index directory * @param autoCommit see above * @param a the analyzer to use * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(Directory d, boolean autoCommit, Analyzer a) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, null, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Constructs an IndexWriter for the index in d. * Text will be analyzed with a. If create * is true, then a new, empty index will be created in * d, replacing the index already there, if any. * * @param d the index directory * @param autoCommit see above * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,boolean,MaxFieldLength)} * instead, and call {@link #commit()} when needed. 
*/ public IndexWriter(Directory d, boolean autoCommit, Analyzer a, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, null, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Expert: constructs an IndexWriter with a custom {@link * IndexDeletionPolicy}, for the index in d, * first creating it if it does not already exist. Text * will be analyzed with a. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param deletionPolicy see above * @param mfl whether or not to limit field lengths * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error */ public IndexWriter(Directory d, Analyzer a, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, deletionPolicy, false, mfl.getLimit(), null, null); } /** * Expert: constructs an IndexWriter with a custom {@link * IndexDeletionPolicy}, for the index in d, * first creating it if it does not already exist. Text * will be analyzed with a. * * @param d the index directory * @param autoCommit see above * @param a the analyzer to use * @param deletionPolicy see above * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be * read/written to or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,IndexDeletionPolicy,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(Directory d, boolean autoCommit, Analyzer a, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, deletionPolicy, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Expert: constructs an IndexWriter with a custom {@link * IndexDeletionPolicy}, for the index in d. * Text will be analyzed with a. If * create is true, then a new, empty index * will be created in d, replacing the index * already there, if any. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param deletionPolicy see above * @param mfl {@link org.apache.lucene.index.IndexWriter.MaxFieldLength}, whether or not to limit field lengths. Value is in number of terms/tokens * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error */ public IndexWriter(Directory d, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, deletionPolicy, false, mfl.getLimit(), null, null); } /** * Expert: constructs an IndexWriter with a custom {@link * IndexDeletionPolicy} and {@link IndexingChain}, * for the index in d. * Text will be analyzed with a. If * create is true, then a new, empty index * will be created in d, replacing the index * already there, if any. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param deletionPolicy see above * @param mfl whether or not to limit field lengths, value is in number of terms/tokens. See {@link org.apache.lucene.index.IndexWriter.MaxFieldLength}. * @param indexingChain the {@link DocConsumer} chain to be used to * process documents * @param commit which commit to open * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error */ IndexWriter(Directory d, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl, IndexingChain indexingChain, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, deletionPolicy, false, mfl.getLimit(), indexingChain, commit); } /** * Expert: constructs an IndexWriter with a custom {@link * IndexDeletionPolicy}, for the index in d. * Text will be analyzed with a. If * create is true, then a new, empty index * will be created in d, replacing the index * already there, if any. * * @param d the index directory * @param autoCommit see above * @param a the analyzer to use * @param create true to create the index or overwrite * the existing one; false to append to the existing * index * @param deletionPolicy see above * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error * @deprecated This constructor will be removed in the 3.0 release. * Use {@link * #IndexWriter(Directory,Analyzer,boolean,IndexDeletionPolicy,MaxFieldLength)} * instead, and call {@link #commit()} when needed. */ public IndexWriter(Directory d, boolean autoCommit, Analyzer a, boolean create, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, create, false, deletionPolicy, autoCommit, DEFAULT_MAX_FIELD_LENGTH, null, null); } /** * Expert: constructs an IndexWriter on specific commit * point, with a custom {@link IndexDeletionPolicy}, for * the index in d. Text will be analyzed * with a. * *

    This is only meaningful if you've used a {@link * IndexDeletionPolicy} in the past that keeps more than * just the last commit. * *

    This operation is similar to {@link #rollback()}, * except that method can only rollback what's been done * with the current instance of IndexWriter since its last * commit, whereas this method can rollback to an * arbitrary commit point from the past, assuming the * {@link IndexDeletionPolicy} has preserved past * commits. * *

    NOTE: autoCommit (see above) is set to false with this * constructor. * * @param d the index directory * @param a the analyzer to use * @param deletionPolicy see above * @param mfl whether or not to limit field lengths, value is in number of terms/tokens. See {@link org.apache.lucene.index.IndexWriter.MaxFieldLength}. * @param commit which commit to open * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if the directory cannot be read/written to, or * if it does not exist and create is * false or if there is any other low-level * IO error */ public IndexWriter(Directory d, Analyzer a, IndexDeletionPolicy deletionPolicy, MaxFieldLength mfl, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { init(d, a, false, false, deletionPolicy, false, mfl.getLimit(), null, commit); } private void init(Directory d, Analyzer a, boolean closeDir, IndexDeletionPolicy deletionPolicy, boolean autoCommit, int maxFieldLength, IndexingChain indexingChain, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { if (IndexReader.indexExists(d)) { init(d, a, false, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit); } else { init(d, a, true, closeDir, deletionPolicy, autoCommit, maxFieldLength, indexingChain, commit); } } private void init(Directory d, Analyzer a, final boolean create, boolean closeDir, IndexDeletionPolicy deletionPolicy, boolean autoCommit, int maxFieldLength, IndexingChain indexingChain, IndexCommit commit) throws CorruptIndexException, LockObtainFailedException, IOException { this.closeDir = closeDir; directory = d; analyzer = a; setMessageID(defaultInfoStream); this.maxFieldLength = maxFieldLength; if (indexingChain == null) indexingChain = DocumentsWriter.DefaultIndexingChain; if (create) { // Clear the write lock in case it's leftover: directory.clearLock(WRITE_LOCK_NAME); } Lock writeLock = directory.makeLock(WRITE_LOCK_NAME); if (!writeLock.obtain(writeLockTimeout)) // obtain write lock throw new LockObtainFailedException("Index locked for write: " + writeLock); this.writeLock = writeLock; // save it boolean success = false; try { if (create) { // Try to read first. This is to allow create // against an index that's currently open for // searching. In this case we write the next // segments_N file with no segments: boolean doCommit; try { segmentInfos.read(directory); segmentInfos.clear(); doCommit = false; } catch (IOException e) { // Likely this means it's a fresh directory doCommit = true; } if (autoCommit || doCommit) { // Always commit if autoCommit=true, else only // commit if there is no segments file in this dir // already. segmentInfos.commit(directory); synced.addAll(segmentInfos.files(directory, true)); } else { // Record that we have a change (zero out all // segments) pending: changeCount++; } } else { segmentInfos.read(directory); if (commit != null) { // Swap out all segments, but, keep metadata in // SegmentInfos, like version & generation, to // preserve write-once. This is important if // readers are open against the future commit // points. 
if (commit.getDirectory() != directory) throw new IllegalArgumentException("IndexCommit's directory doesn't match my directory"); SegmentInfos oldInfos = new SegmentInfos(); oldInfos.read(directory, commit.getSegmentsFileName()); segmentInfos.replace(oldInfos); changeCount++; if (infoStream != null) message("init: loaded commit \"" + commit.getSegmentsFileName() + "\""); } // We assume that this segments_N was previously // properly sync'd: synced.addAll(segmentInfos.files(directory, true)); } this.autoCommit = autoCommit; setRollbackSegmentInfos(segmentInfos); docWriter = new DocumentsWriter(directory, this, indexingChain); docWriter.setInfoStream(infoStream); docWriter.setMaxFieldLength(maxFieldLength); // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, segmentInfos, infoStream, docWriter, synced); if (deleter.startingCommitDeleted) // Deletion policy deleted the "head" commit point. // We have to mark ourself as changed so that if we // are closed w/o any further changes we write a new // segments_N file. changeCount++; pushMaxBufferedDocs(); if (infoStream != null) { message("init: create=" + create); messageState(); } success = true; } finally { if (!success) { if (infoStream != null) { message("init: hit exception on init; releasing write lock"); } try { writeLock.release(); } catch (Throwable t) { // don't mask the original exception } writeLock = null; } } } private synchronized void setRollbackSegmentInfos(SegmentInfos infos) { rollbackSegmentInfos = (SegmentInfos) infos.clone(); assert !rollbackSegmentInfos.hasExternalSegments(directory); rollbackSegments = new HashMap(); final int size = rollbackSegmentInfos.size(); for(int i=0;iDetermines the largest segment (measured by * document count) that may be merged with other segments. * Small values (e.g., less than 10,000) are best for * interactive indexing, as this limits the length of * pauses while indexing to a few seconds. Larger values * are best for batched indexing and speedier * searches.

    * *

    The default value is {@link Integer#MAX_VALUE}.

    * *

    Note that this method is a convenience method: it * just calls mergePolicy.setMaxMergeDocs as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.

    * *

    The default merge policy ({@link * LogByteSizeMergePolicy}) also allows you to set this * limit by net size (in MB) of the segment, using {@link * LogByteSizeMergePolicy#setMaxMergeMB}.

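    * For example, a minimal sketch (illustrative only; the "dir" and
    * "analyzer" variables are assumed to exist, and 10,000 is just an
    * example of an interactive-indexing setting, not a recommendation):
    *
    *   IndexWriter writer = new IndexWriter(dir, analyzer,
    *       IndexWriter.MaxFieldLength.LIMITED);
    *   // only let segments of up to 10,000 docs take part in merges,
    *   // keeping indexing pauses short
    *   writer.setMaxMergeDocs(10000);
    *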
    */ public void setMaxMergeDocs(int maxMergeDocs) { getLogMergePolicy().setMaxMergeDocs(maxMergeDocs); } /** *

    Returns the largest segment (measured by document * count) that may be merged with other segments.

    * *

    Note that this method is a convenience method: it * just calls mergePolicy.getMaxMergeDocs as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.

    * * @see #setMaxMergeDocs */ public int getMaxMergeDocs() { return getLogMergePolicy().getMaxMergeDocs(); } /** * The maximum number of terms that will be indexed for a single field in a * document. This limits the amount of memory required for indexing, so that * collections with very large files will not crash the indexing process by * running out of memory. This setting refers to the number of running terms, * not to the number of different terms.

    * Note: this silently truncates large documents, excluding from the * index all terms that occur further in the document. If you know your source * documents are large, be sure to set this value high enough to accommodate * the expected size. If you set it to Integer.MAX_VALUE, then the only limit * is your memory, but you should anticipate an OutOfMemoryError.

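    * For example, a hedged sketch (the "writer" variable and the 50,000
    * limit are illustrative, not recommendations):
    *
    *   // index at most 50,000 terms per field, silently truncating beyond that
    *   writer.setMaxFieldLength(50000);
    *   // or lift the limit entirely (watch heap usage for huge documents)
    *   writer.setMaxFieldLength(Integer.MAX_VALUE);
    *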
    * By default, no more than {@link #DEFAULT_MAX_FIELD_LENGTH} terms * will be indexed for a field. */ public void setMaxFieldLength(int maxFieldLength) { ensureOpen(); this.maxFieldLength = maxFieldLength; docWriter.setMaxFieldLength(maxFieldLength); if (infoStream != null) message("setMaxFieldLength " + maxFieldLength); } /** * Returns the maximum number of terms that will be * indexed for a single field in a document. * @see #setMaxFieldLength */ public int getMaxFieldLength() { ensureOpen(); return maxFieldLength; } /** Sets the termsIndexDivisor passed to any readers that * IndexWriter opens, for example when applying deletes * or creating a near-real-time reader in {@link * IndexWriter#getReader}. Default value is {@link * IndexReader#DEFAULT_TERMS_INDEX_DIVISOR}. */ public void setReaderTermsIndexDivisor(int divisor) { ensureOpen(); if (divisor <= 0) { throw new IllegalArgumentException("divisor must be >= 1 (got " + divisor + ")"); } readerTermsIndexDivisor = divisor; if (infoStream != null) { message("setReaderTermsIndexDivisor " + readerTermsIndexDivisor); } } /** @see #setReaderTermsIndexDivisor */ public int getReaderTermsIndexDivisor() { ensureOpen(); return readerTermsIndexDivisor; } /** Determines the minimal number of documents required * before the buffered in-memory documents are flushed as * a new Segment. Large values generally gives faster * indexing. * *

    When this is set, the writer will flush every * maxBufferedDocs added documents. Pass in {@link * #DISABLE_AUTO_FLUSH} to prevent triggering a flush due * to number of buffered documents. Note that if flushing * by RAM usage is also enabled, then the flush will be * triggered by whichever comes first.

    * *

    Disabled by default (writer flushes by RAM usage).

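    * A small illustrative sketch (the values are assumptions, not tuned
    * recommendations):
    *
    *   // flush a new segment every 1,000 buffered documents
    *   writer.setMaxBufferedDocs(1000);
    *
    *   // or disable doc-count flushing and flush by RAM usage instead
    *   writer.setRAMBufferSizeMB(48.0);
    *   writer.setMaxBufferedDocs(IndexWriter.DISABLE_AUTO_FLUSH);
    *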
    * * @throws IllegalArgumentException if maxBufferedDocs is * enabled but smaller than 2, or it disables maxBufferedDocs * when ramBufferSize is already disabled * @see #setRAMBufferSizeMB */ public void setMaxBufferedDocs(int maxBufferedDocs) { ensureOpen(); if (maxBufferedDocs != DISABLE_AUTO_FLUSH && maxBufferedDocs < 2) throw new IllegalArgumentException( "maxBufferedDocs must at least be 2 when enabled"); if (maxBufferedDocs == DISABLE_AUTO_FLUSH && getRAMBufferSizeMB() == DISABLE_AUTO_FLUSH) throw new IllegalArgumentException( "at least one of ramBufferSize and maxBufferedDocs must be enabled"); docWriter.setMaxBufferedDocs(maxBufferedDocs); pushMaxBufferedDocs(); if (infoStream != null) message("setMaxBufferedDocs " + maxBufferedDocs); } /** * If we are flushing by doc count (not by RAM usage), and * using LogDocMergePolicy then push maxBufferedDocs down * as its minMergeDocs, to keep backwards compatibility. */ private void pushMaxBufferedDocs() { if (docWriter.getMaxBufferedDocs() != DISABLE_AUTO_FLUSH) { final MergePolicy mp = mergePolicy; if (mp instanceof LogDocMergePolicy) { LogDocMergePolicy lmp = (LogDocMergePolicy) mp; final int maxBufferedDocs = docWriter.getMaxBufferedDocs(); if (lmp.getMinMergeDocs() != maxBufferedDocs) { if (infoStream != null) message("now push maxBufferedDocs " + maxBufferedDocs + " to LogDocMergePolicy"); lmp.setMinMergeDocs(maxBufferedDocs); } } } } /** * Returns the number of buffered added documents that will * trigger a flush if enabled. * @see #setMaxBufferedDocs */ public int getMaxBufferedDocs() { ensureOpen(); return docWriter.getMaxBufferedDocs(); } /** Determines the amount of RAM that may be used for * buffering added documents and deletions before they are * flushed to the Directory. Generally for faster * indexing performance it's best to flush by RAM usage * instead of document count and use as large a RAM buffer * as you can. * *

    When this is set, the writer will flush whenever * buffered documents and deletions use this much RAM. * Pass in {@link #DISABLE_AUTO_FLUSH} to prevent * triggering a flush due to RAM usage. Note that if * flushing by document count is also enabled, then the * flush will be triggered by whichever comes first.

    * *

    NOTE: the accounting of RAM usage for pending * deletions is only approximate. Specifically, if you * delete by Query, Lucene currently has no way to measure * the RAM usage of individual Queries, so the accounting * will under-estimate and you should compensate by either * calling commit() periodically yourself, or by using * {@link #setMaxBufferedDeleteTerms} to flush by count * instead of RAM usage (each buffered delete Query counts * as one). * *

    NOTE: because IndexWriter uses * ints when managing its internal storage, * the absolute maximum value for this setting is somewhat * less than 2048 MB. The precise limit depends on * various factors, such as how large your documents are, * how many fields have norms, etc., so it's best to set * this value comfortably under 2048.

    * *

    The default value is {@link #DEFAULT_RAM_BUFFER_SIZE_MB}.

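    * For example (a sketch; 256.0 is an arbitrary illustrative value,
    * chosen comfortably under the ~2048 MB ceiling described above):
    *
    *   // flush whenever buffered documents and deletions use ~256 MB of RAM
    *   writer.setRAMBufferSizeMB(256.0);
    *   // optionally turn off flushing by document count
    *   writer.setMaxBufferedDocs(IndexWriter.DISABLE_AUTO_FLUSH);
    *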
    * * @throws IllegalArgumentException if ramBufferSize is * enabled but non-positive, or it disables ramBufferSize * when maxBufferedDocs is already disabled */ public void setRAMBufferSizeMB(double mb) { if (mb > 2048.0) { throw new IllegalArgumentException("ramBufferSize " + mb + " is too large; should be comfortably less than 2048"); } if (mb != DISABLE_AUTO_FLUSH && mb <= 0.0) throw new IllegalArgumentException( "ramBufferSize should be > 0.0 MB when enabled"); if (mb == DISABLE_AUTO_FLUSH && getMaxBufferedDocs() == DISABLE_AUTO_FLUSH) throw new IllegalArgumentException( "at least one of ramBufferSize and maxBufferedDocs must be enabled"); docWriter.setRAMBufferSizeMB(mb); if (infoStream != null) message("setRAMBufferSizeMB " + mb); } /** * Returns the value set by {@link #setRAMBufferSizeMB} if enabled. */ public double getRAMBufferSizeMB() { return docWriter.getRAMBufferSizeMB(); } /** *

    Determines the minimal number of delete terms required before the buffered * in-memory delete terms are applied and flushed. If there are documents * buffered in memory at the time, they are merged and a new segment is * created.

    *

    Disabled by default (writer flushes by RAM usage).

    * * @throws IllegalArgumentException if maxBufferedDeleteTerms * is enabled but smaller than 1 * @see #setRAMBufferSizeMB */ public void setMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) { ensureOpen(); if (maxBufferedDeleteTerms != DISABLE_AUTO_FLUSH && maxBufferedDeleteTerms < 1) throw new IllegalArgumentException( "maxBufferedDeleteTerms must at least be 1 when enabled"); docWriter.setMaxBufferedDeleteTerms(maxBufferedDeleteTerms); if (infoStream != null) message("setMaxBufferedDeleteTerms " + maxBufferedDeleteTerms); } /** * Returns the number of buffered deleted terms that will * trigger a flush if enabled. * @see #setMaxBufferedDeleteTerms */ public int getMaxBufferedDeleteTerms() { ensureOpen(); return docWriter.getMaxBufferedDeleteTerms(); } /** Determines how often segment indices are merged by addDocument(). With * smaller values, less RAM is used while indexing, and searches on * unoptimized indices are faster, but indexing speed is slower. With larger * values, more RAM is used during indexing, and while searches on unoptimized * indices are slower, indexing is faster. Thus larger values (> 10) are best * for batch index creation, and smaller values (< 10) for indices that are * interactively maintained. * *

    Note that this method is a convenience method: it * just calls mergePolicy.setMergeFactor as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.

    * *

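    * Illustrative sketch (both values are examples only):
    *
    *   // batch index creation: merge less often, index faster
    *   writer.setMergeFactor(30);
    *
    *   // interactively maintained index: keep searches on the
    *   // unoptimized index fast
    *   writer.setMergeFactor(5);
    *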
    This must never be less than 2. The default value is 10. */ public void setMergeFactor(int mergeFactor) { getLogMergePolicy().setMergeFactor(mergeFactor); } /** *

    Returns the number of segments that are merged at * once and also controls the total number of segments * allowed to accumulate in the index.

    * *

    Note that this method is a convenience method: it * just calls mergePolicy.getMergeFactor as long as * mergePolicy is an instance of {@link LogMergePolicy}. * Otherwise an IllegalArgumentException is thrown.

    * * @see #setMergeFactor */ public int getMergeFactor() { return getLogMergePolicy().getMergeFactor(); } /** * Expert: returns max delay inserted before syncing a * commit point. On Windows, at least, pausing before * syncing can increase net indexing throughput. The * delay is variable based on size of the segment's files, * and is only inserted when using * ConcurrentMergeScheduler for merges. * @deprecated This will be removed in 3.0, when * autoCommit=true is removed from IndexWriter. */ public double getMaxSyncPauseSeconds() { return maxSyncPauseSeconds; } /** * Expert: sets the max delay before syncing a commit * point. * @see #getMaxSyncPauseSeconds * @deprecated This will be removed in 3.0, when * autoCommit=true is removed from IndexWriter. */ public void setMaxSyncPauseSeconds(double seconds) { maxSyncPauseSeconds = seconds; } /** If non-null, this will be the default infoStream used * by a newly instantiated IndexWriter. * @see #setInfoStream */ public static void setDefaultInfoStream(PrintStream infoStream) { IndexWriter.defaultInfoStream = infoStream; } /** * Returns the current default infoStream for newly * instantiated IndexWriters. * @see #setDefaultInfoStream */ public static PrintStream getDefaultInfoStream() { return IndexWriter.defaultInfoStream; } /** If non-null, information about merges, deletes and a * message when maxFieldLength is reached will be printed * to this. */ public void setInfoStream(PrintStream infoStream) { ensureOpen(); setMessageID(infoStream); docWriter.setInfoStream(infoStream); deleter.setInfoStream(infoStream); if (infoStream != null) messageState(); } private void messageState() { message("setInfoStream: dir=" + directory + " autoCommit=" + autoCommit + " mergePolicy=" + mergePolicy + " mergeScheduler=" + mergeScheduler + " ramBufferSizeMB=" + docWriter.getRAMBufferSizeMB() + " maxBufferedDocs=" + docWriter.getMaxBufferedDocs() + " maxBuffereDeleteTerms=" + docWriter.getMaxBufferedDeleteTerms() + " maxFieldLength=" + maxFieldLength + " index=" + segString()); } /** * Returns the current infoStream in use by this writer. * @see #setInfoStream */ public PrintStream getInfoStream() { ensureOpen(); return infoStream; } /** Returns true if verbosing is enabled (i.e., infoStream != null). */ public boolean verbose() { return infoStream != null; } /** * Sets the maximum time to wait for a write lock (in milliseconds) for this instance of IndexWriter. @see * @see #setDefaultWriteLockTimeout to change the default value for all instances of IndexWriter. */ public void setWriteLockTimeout(long writeLockTimeout) { ensureOpen(); this.writeLockTimeout = writeLockTimeout; } /** * Returns allowed timeout when acquiring the write lock. * @see #setWriteLockTimeout */ public long getWriteLockTimeout() { ensureOpen(); return writeLockTimeout; } /** * Sets the default (for any instance of IndexWriter) maximum time to wait for a write lock (in * milliseconds). */ public static void setDefaultWriteLockTimeout(long writeLockTimeout) { IndexWriter.WRITE_LOCK_TIMEOUT = writeLockTimeout; } /** * Returns default write lock timeout for newly * instantiated IndexWriters. * @see #setDefaultWriteLockTimeout */ public static long getDefaultWriteLockTimeout() { return IndexWriter.WRITE_LOCK_TIMEOUT; } /** * Commits all changes to an index and closes all * associated files. Note that this may be a costly * operation, so, try to re-use a single writer instead of * closing and opening a new one. See {@link #commit()} for * caveats about write caching done by some IO devices. 
* *

    If an Exception is hit during close, eg due to disk * full or some other reason, then both the on-disk index * and the internal state of the IndexWriter instance will * be consistent. However, the close will not be complete * even though part of it (flushing buffered documents) * may have succeeded, so the write lock will still be * held.

    * *

    If you can correct the underlying cause (eg free up * some disk space) then you can call close() again. * Failing that, if you want to force the write lock to be * released (dangerous, because you may then lose buffered * docs in the IndexWriter instance) then you can do * something like this:

    * *
       * try {
       *   writer.close();
       * } finally {
       *   if (IndexWriter.isLocked(directory)) {
       *     IndexWriter.unlock(directory);
       *   }
       * }
       * 
    * * after which, you must be certain not to use the writer * instance anymore.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer, again. See above for details.

    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void close() throws CorruptIndexException, IOException { close(true); } /** * Closes the index with or without waiting for currently * running merges to finish. This is only meaningful when * using a MergeScheduler that runs merges in background * threads. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer, again. See above for details.

    * *

    NOTE: it is dangerous to always call * close(false), especially when IndexWriter is not open * for very long, because this can result in "merge * starvation" whereby long merges will never have a * chance to finish. This will cause too many segments in * your index over time.

    * * @param waitForMerges if true, this call will block * until all merges complete; else, it will ask all * running merges to abort, wait until those merges have * finished (which should be at most a few seconds), and * then return. */ public void close(boolean waitForMerges) throws CorruptIndexException, IOException { // Ensure that only one thread actually gets to do the closing: if (shouldClose()) { // If any methods have hit OutOfMemoryError, then abort // on close, in case the internal state of IndexWriter // or DocumentsWriter is corrupt if (hitOOM) rollbackInternal(); else closeInternal(waitForMerges); } } // Returns true if this thread should attempt to close, or // false if IndexWriter is now closed; else, waits until // another thread finishes closing synchronized private boolean shouldClose() { while(true) { if (!closed) { if (!closing) { closing = true; return true; } else { // Another thread is presently trying to close; // wait until it finishes one way (closes // successfully) or another (fails to close) doWait(); } } else return false; } } private void closeInternal(boolean waitForMerges) throws CorruptIndexException, IOException { docWriter.pauseAllThreads(); try { if (infoStream != null) message("now flush at close"); docWriter.close(); // Only allow a new merge to be triggered if we are // going to wait for merges: if (!hitOOM) { flush(waitForMerges, true, true); } if (waitForMerges) // Give merge scheduler last chance to run, in case // any pending merges are waiting: mergeScheduler.merge(this); mergePolicy.close(); finishMerges(waitForMerges); stopMerges = true; mergeScheduler.close(); if (infoStream != null) message("now call final commit()"); if (!hitOOM) { commit(0); } if (infoStream != null) message("at close: " + segString()); synchronized(this) { readerPool.close(); docWriter = null; deleter.close(); } if (closeDir) directory.close(); if (writeLock != null) { writeLock.release(); // release write lock writeLock = null; } synchronized(this) { closed = true; } } catch (OutOfMemoryError oom) { handleOOM(oom, "closeInternal"); } finally { synchronized(this) { closing = false; notifyAll(); if (!closed) { if (docWriter != null) docWriter.resumeAllThreads(); if (infoStream != null) message("hit exception while closing"); } } } } /** Tells the docWriter to close its currently open shared * doc stores (stored fields & vectors files). * Return value specifices whether new doc store files are compound or not. */ private synchronized boolean flushDocStores() throws IOException { if (infoStream != null) { message("flushDocStores segment=" + docWriter.getDocStoreSegment()); } boolean useCompoundDocStore = false; if (infoStream != null) { message("closeDocStores segment=" + docWriter.getDocStoreSegment()); } String docStoreSegment; boolean success = false; try { docStoreSegment = docWriter.closeDocStore(); success = true; } finally { if (!success && infoStream != null) { message("hit exception closing doc store segment"); } } if (infoStream != null) { message("flushDocStores files=" + docWriter.closedFiles()); } useCompoundDocStore = mergePolicy.useCompoundDocStore(segmentInfos); if (useCompoundDocStore && docStoreSegment != null && docWriter.closedFiles().size() != 0) { // Now build compound doc store file if (infoStream != null) { message("create compound file " + docStoreSegment + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION); } success = false; final int numSegments = segmentInfos.size(); final String compoundFileName = docStoreSegment + "." 
+ IndexFileNames.COMPOUND_FILE_STORE_EXTENSION; try { CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, compoundFileName); final Iterator it = docWriter.closedFiles().iterator(); while(it.hasNext()) cfsWriter.addFile((String) it.next()); // Perform the merge cfsWriter.close(); success = true; } finally { if (!success) { if (infoStream != null) message("hit exception building compound file doc store for segment " + docStoreSegment); deleter.deleteFile(compoundFileName); } } for(int i=0;iNOTE:

    * This class handles all long values (unlike * {@link org.apache.lucene.document.DateField}). * * @deprecated For new indexes use {@link NumericUtils} instead, which * provides a sortable binary representation (prefix encoded) of numeric * values. * To index and efficiently query numeric values use {@link NumericField} * and {@link NumericRangeQuery}. * This class is included for use with existing * indices and will be removed in a future release. */ public class NumberTools { private static final int RADIX = 36; private static final char NEGATIVE_PREFIX = '-'; // NB: NEGATIVE_PREFIX must be < POSITIVE_PREFIX private static final char POSITIVE_PREFIX = '0'; //NB: this must be less than /** * Equivalent to longToString(Long.MIN_VALUE) */ public static final String MIN_STRING_VALUE = NEGATIVE_PREFIX + "0000000000000"; /** * Equivalent to longToString(Long.MAX_VALUE) */ public static final String MAX_STRING_VALUE = POSITIVE_PREFIX + "1y2p0ij32e8e7"; /** * The length of (all) strings returned by {@link #longToString} */ public static final int STR_SIZE = MIN_STRING_VALUE.length(); /** * Converts a long to a String suitable for indexing. */ public static String longToString(long l) { if (l == Long.MIN_VALUE) { // special case, because long is not symmetric around zero return MIN_STRING_VALUE; } StringBuffer buf = new StringBuffer(STR_SIZE); if (l < 0) { buf.append(NEGATIVE_PREFIX); l = Long.MAX_VALUE + l + 1; } else { buf.append(POSITIVE_PREFIX); } String num = Long.toString(l, RADIX); int padLen = STR_SIZE - num.length() - buf.length(); while (padLen-- > 0) { buf.append('0'); } buf.append(num); return buf.toString(); } /** * Converts a String that was returned by {@link #longToString} back to a * long. * * @throws IllegalArgumentException * if the input is null * @throws NumberFormatException * if the input does not parse (it was not a String returned by * longToString()). */ public static long stringToLong(String str) { if (str == null) { throw new NullPointerException("string cannot be null"); } if (str.length() != STR_SIZE) { throw new NumberFormatException("string is the wrong size"); } if (str.equals(MIN_STRING_VALUE)) { return Long.MIN_VALUE; } char prefix = str.charAt(0); long l = Long.parseLong(str.substring(1), RADIX); if (prefix == POSITIVE_PREFIX) { // nop } else if (prefix == NEGATIVE_PREFIX) { l = l - Long.MAX_VALUE - 1; } else { throw new NumberFormatException( "string does not begin with the correct prefix"); } return l; } }lucene-2.9.4/src/java/org/apache/lucene/document/SetBasedFieldSelector.java0000644000175000017500000000437611474320231027275 0ustar janpascaljanpascalpackage org.apache.lucene.document; import java.util.Set; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * Declare what fields to load normally and what fields to load lazily * **/ public class SetBasedFieldSelector implements FieldSelector { private Set fieldsToLoad; private Set lazyFieldsToLoad; /** * Pass in the Set of {@link Field} names to load and the Set of {@link Field} names to load lazily. If both are null, the * Document will not have any {@link Field} on it. * @param fieldsToLoad A Set of {@link String} field names to load. May be empty, but not null * @param lazyFieldsToLoad A Set of {@link String} field names to load lazily. May be empty, but not null */ public SetBasedFieldSelector(Set fieldsToLoad, Set lazyFieldsToLoad) { this.fieldsToLoad = fieldsToLoad; this.lazyFieldsToLoad = lazyFieldsToLoad; } /** * Indicate whether to load the field with the given name or not. If the {@link Field#name()} is not in either of the * initializing Sets, then {@link org.apache.lucene.document.FieldSelectorResult#NO_LOAD} is returned. If a Field name * is in both fieldsToLoad and lazyFieldsToLoad, lazy has precedence. * * @param fieldName The {@link Field} name to check * @return The {@link FieldSelectorResult} */ public FieldSelectorResult accept(String fieldName) { FieldSelectorResult result = FieldSelectorResult.NO_LOAD; if (fieldsToLoad.contains(fieldName) == true){ result = FieldSelectorResult.LOAD; } if (lazyFieldsToLoad.contains(fieldName) == true){ result = FieldSelectorResult.LAZY_LOAD; } return result; } }lucene-2.9.4/src/java/org/apache/lucene/document/MapFieldSelector.java0000644000175000017500000000465611474320232026322 0ustar janpascaljanpascalpackage org.apache.lucene.document; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashMap; import java.util.List; import java.util.Map; /** * A {@link FieldSelector} based on a Map of field names to {@link FieldSelectorResult}s * */ public class MapFieldSelector implements FieldSelector { Map fieldSelections; /** Create a a MapFieldSelector * @param fieldSelections maps from field names (String) to {@link FieldSelectorResult}s */ public MapFieldSelector(Map fieldSelections) { this.fieldSelections = fieldSelections; } /** Create a a MapFieldSelector * @param fields fields to LOAD. List of Strings. All other fields are NO_LOAD. */ public MapFieldSelector(List fields) { fieldSelections = new HashMap(fields.size()*5/3); for (int i=0; iIndexWriter creates and maintains an index.

    buffered deletions * are not counted. If you really need these to be * counted you should call {@link #commit()} first. * @see #numDocs */ public synchronized int numDocs() throws IOException { int count; if (docWriter != null) count = docWriter.getNumDocsInRAM(); else count = 0; for (int i = 0; i < segmentInfos.size(); i++) { final SegmentInfo info = segmentInfos.info(i); count += info.docCount - info.getDelCount(); } return count; } public synchronized boolean hasDeletions() throws IOException { ensureOpen(); if (docWriter.hasDeletes()) return true; for (int i = 0; i < segmentInfos.size(); i++) if (segmentInfos.info(i).hasDeletions()) return true; return false; } /** * The maximum number of terms that will be indexed for a single field in a * document. This limits the amount of memory required for indexing, so that * collections with very large files will not crash the indexing process by * running out of memory.

    * Note that this effectively truncates large documents, excluding from the * index terms that occur further in the document. If you know your source * documents are large, be sure to set this value high enough to accommodate * the expected size. If you set it to Integer.MAX_VALUE, then the only limit * is your memory, but you should anticipate an OutOfMemoryError.

    * By default, no more than 10,000 terms will be indexed for a field. * * @see MaxFieldLength */ private int maxFieldLength; /** * Adds a document to this index. If the document contains more than * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are * discarded. * *

    Note that if an Exception is hit (for example disk full) * then the index will be consistent, but this document * may not have been added. Furthermore, it's possible * the index will have one segment in non-compound format * even when using compound files (when a merge has * partially succeeded).

    * *

    This method periodically flushes pending documents * to the Directory (see above), and * also periodically triggers segment merges in the index * according to the {@link MergePolicy} in use.

    * *

    Merges temporarily consume space in the * directory. The amount of space required is up to 1X the * size of all segments being merged, when no * readers/searchers are open against the index, and up to * 2X the size of all segments being merged when * readers/searchers are open against the index (see * {@link #optimize()} for details). The sequence of * primitive merge operations performed is governed by the * merge policy. * *

    Note that each term in the document can be no longer * than 16383 characters, otherwise an * IllegalArgumentException will be thrown.

    * *

    Note that it's possible to create an invalid Unicode * string in java if a UTF16 surrogate pair is malformed. * In this case, the invalid characters are silently * replaced with the Unicode replacement character * U+FFFD.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

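    * A minimal end-to-end sketch (the directory/analyzer variables and the
    * field names "id" and "contents" are illustrative; assumes the usual
    * org.apache.lucene imports):
    *
    *   IndexWriter writer = new IndexWriter(dir, analyzer, true,
    *       IndexWriter.MaxFieldLength.LIMITED);
    *   Document doc = new Document();
    *   doc.add(new Field("id", "42",
    *       Field.Store.YES, Field.Index.NOT_ANALYZED));
    *   doc.add(new Field("contents", "hello lucene",
    *       Field.Store.NO, Field.Index.ANALYZED));
    *   writer.addDocument(doc);
    *   writer.commit();   // make the new document visible to new readers
    *   writer.close();
    *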
    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void addDocument(Document doc) throws CorruptIndexException, IOException { addDocument(doc, analyzer); } /** * Adds a document to this index, using the provided analyzer instead of the * value of {@link #getAnalyzer()}. If the document contains more than * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are * discarded. * *

    See {@link #addDocument(Document)} for details on * index and IndexWriter state after an Exception, and * flushing/merging temporary free space requirements.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { ensureOpen(); boolean doFlush = false; boolean success = false; try { try { doFlush = docWriter.addDocument(doc, analyzer); success = true; } finally { if (!success) { if (infoStream != null) message("hit exception adding document"); synchronized (this) { // If docWriter has some aborted files that were // never incref'd, then we clean them up here if (docWriter != null) { final Collection files = docWriter.abortedFiles(); if (files != null) deleter.deleteNewFiles(files); } } } } if (doFlush) flush(true, false, false); } catch (OutOfMemoryError oom) { handleOOM(oom, "addDocument"); } } /** * Deletes the document(s) containing term. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

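    * For example (a sketch; "id" is an assumed primary-key style field
    * that was indexed un-analyzed):
    *
    *   // buffer a delete of every document whose "id" field equals "42"
    *   writer.deleteDocuments(new Term("id", "42"));
    *   writer.commit();   // the delete becomes visible on commit
    *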
    * * @param term the term to identify the documents to be deleted * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void deleteDocuments(Term term) throws CorruptIndexException, IOException { ensureOpen(); try { boolean doFlush = docWriter.bufferDeleteTerm(term); if (doFlush) flush(true, false, false); } catch (OutOfMemoryError oom) { handleOOM(oom, "deleteDocuments(Term)"); } } /** * Deletes the document(s) containing any of the * terms. All deletes are flushed at the same time. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param terms array of terms to identify the documents * to be deleted * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void deleteDocuments(Term[] terms) throws CorruptIndexException, IOException { ensureOpen(); try { boolean doFlush = docWriter.bufferDeleteTerms(terms); if (doFlush) flush(true, false, false); } catch (OutOfMemoryError oom) { handleOOM(oom, "deleteDocuments(Term[])"); } } /** * Deletes the document(s) matching the provided query. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param query the query to identify the documents to be deleted * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void deleteDocuments(Query query) throws CorruptIndexException, IOException { ensureOpen(); boolean doFlush = docWriter.bufferDeleteQuery(query); if (doFlush) flush(true, false, false); } /** * Deletes the document(s) matching any of the provided queries. * All deletes are flushed at the same time. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param queries array of queries to identify the documents * to be deleted * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void deleteDocuments(Query[] queries) throws CorruptIndexException, IOException { ensureOpen(); boolean doFlush = docWriter.bufferDeleteQueries(queries); if (doFlush) flush(true, false, false); } /** * Updates a document by first deleting the document(s) * containing term and then adding the new * document. The delete and then add are atomic as seen * by a reader on the same index (flush may happen only after * the add). * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

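    * A sketch of the common "replace by primary key" pattern (the "id" and
    * "contents" field names and their values are illustrative):
    *
    *   Document newDoc = new Document();
    *   newDoc.add(new Field("id", "42",
    *       Field.Store.YES, Field.Index.NOT_ANALYZED));
    *   newDoc.add(new Field("contents", "updated text",
    *       Field.Store.NO, Field.Index.ANALYZED));
    *   // atomically delete any existing "42" document(s) and add the new one
    *   writer.updateDocument(new Term("id", "42"), newDoc);
    *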
    * * @param term the term to identify the document(s) to be * deleted * @param doc the document to be added * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void updateDocument(Term term, Document doc) throws CorruptIndexException, IOException { ensureOpen(); updateDocument(term, doc, getAnalyzer()); } /** * Updates a document by first deleting the document(s) * containing term and then adding the new * document. The delete and then add are atomic as seen * by a reader on the same index (flush may happen only after * the add). * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param term the term to identify the document(s) to be * deleted * @param doc the document to be added * @param analyzer the analyzer to use when analyzing the document * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void updateDocument(Term term, Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { ensureOpen(); try { boolean doFlush = false; boolean success = false; try { doFlush = docWriter.updateDocument(term, doc, analyzer); success = true; } finally { if (!success) { if (infoStream != null) message("hit exception updating document"); synchronized (this) { // If docWriter has some aborted files that were // never incref'd, then we clean them up here final Collection files = docWriter.abortedFiles(); if (files != null) deleter.deleteNewFiles(files); } } } if (doFlush) flush(true, false, false); } catch (OutOfMemoryError oom) { handleOOM(oom, "updateDocument"); } } // for test purpose final synchronized int getSegmentCount(){ return segmentInfos.size(); } // for test purpose final synchronized int getNumBufferedDocuments(){ return docWriter.getNumDocsInRAM(); } // for test purpose final synchronized int getDocCount(int i) { if (i >= 0 && i < segmentInfos.size()) { return segmentInfos.info(i).docCount; } else { return -1; } } // for test purpose final synchronized int getFlushCount() { return flushCount; } // for test purpose final synchronized int getFlushDeletesCount() { return flushDeletesCount; } final String newSegmentName() { // Cannot synchronize on IndexWriter because that causes // deadlock synchronized(segmentInfos) { // Important to increment changeCount so that the // segmentInfos is written on close. Otherwise we // could close, re-open and re-return the same segment // name that was previously returned which can cause // problems at least with ConcurrentMergeScheduler. changeCount++; return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX); } } /** If non-null, information about merges will be printed to this. */ private PrintStream infoStream = null; private static PrintStream defaultInfoStream = null; /** * Requests an "optimize" operation on an index, priming the index * for the fastest available search. Traditionally this has meant * merging all segments into a single segment as is done in the * default merge policy, but individual merge policies may implement * optimize in different ways. * *

    It is recommended that this method be called upon completion of indexing. In * environments with frequent updates, optimize is best done during low volume times, if at all. * *

    *

    See http://www.gossamer-threads.com/lists/lucene/java-dev/47895 for more discussion.

    * *

    Note that optimize requires 2X the index size free * space in your Directory (3X if you're using compound * file format). For example, if your index * size is 10 MB then you need 20 MB free for optimize to * complete (30 MB if you're using compound file format).

    * *

    If some but not all readers re-open while an * optimize is underway, this will cause > 2X temporary * space to be consumed as those new readers will then * hold open the partially optimized segments at that * time. It is best not to re-open readers while optimize * is running.

    * *

    The actual temporary usage could be much less than * these figures (it depends on many factors).

    * *

    In general, once the optimize completes, the total size of the * index will be less than the size of the starting index. * It could be quite a bit smaller (if there were many * pending deletes) or just slightly smaller.

    * *

    If an Exception is hit during optimize(), for example * due to disk full, the index will not be corrupt and no * documents will have been lost. However, it may have * been partially optimized (some segments were merged but * not all), and it's possible that one of the segments in * the index will be in non-compound format even when * using compound file format. This will occur when the * Exception is hit during conversion of the segment into * compound format.

    * *

    This call will optimize those segments present in * the index when the call started. If other threads are * still adding documents and flushing segments, those * newly created segments will not be optimized unless you * call optimize again.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

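    * Typical usage is a sketch like the following, run after a large batch
    * of updates and outside peak query hours (writer creation omitted):
    *
    *   writer.optimize();   // blocks until the merge-down completes
    *   writer.close();
    *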
    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @see LogMergePolicy#findMergesForOptimize */ public void optimize() throws CorruptIndexException, IOException { optimize(true); } /** * Optimize the index down to <= maxNumSegments. If * maxNumSegments==1 then this is the same as {@link * #optimize()}. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param maxNumSegments maximum number of segments left * in the index after optimization finishes */ public void optimize(int maxNumSegments) throws CorruptIndexException, IOException { optimize(maxNumSegments, true); } /** Just like {@link #optimize()}, except you can specify * whether the call should block until the optimize * completes. This is only meaningful with a * {@link MergeScheduler} that is able to run merges in * background threads. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    */ public void optimize(boolean doWait) throws CorruptIndexException, IOException { optimize(1, doWait); } /** Just like {@link #optimize(int)}, except you can * specify whether the call should block until the * optimize completes. This is only meaningful with a * {@link MergeScheduler} that is able to run merges in * background threads. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    */ public void optimize(int maxNumSegments, boolean doWait) throws CorruptIndexException, IOException { ensureOpen(); if (maxNumSegments < 1) throw new IllegalArgumentException("maxNumSegments must be >= 1; got " + maxNumSegments); if (infoStream != null) message("optimize: index now " + segString()); flush(true, false, true); synchronized(this) { resetMergeExceptions(); segmentsToOptimize = new HashSet(); optimizeMaxNumSegments = maxNumSegments; final int numSegments = segmentInfos.size(); for(int i=0;i 0) { // Forward any exceptions in background merge // threads to the current thread: final int size = mergeExceptions.size(); for(int i=0;iNOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    */ public void expungeDeletes(boolean doWait) throws CorruptIndexException, IOException { ensureOpen(); if (infoStream != null) message("expungeDeletes: index now " + segString()); MergePolicy.MergeSpecification spec; synchronized(this) { spec = mergePolicy.findMergesToExpungeDeletes(segmentInfos); if (spec != null) { final int numMerges = spec.merges.size(); for(int i=0;iNOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    */ public void expungeDeletes() throws CorruptIndexException, IOException { expungeDeletes(true); } /** * Expert: asks the mergePolicy whether any merges are * necessary now and if so, runs the requested merges and * then iterate (test again if merges are needed) until no * more merges are returned by the mergePolicy. * * Explicit calls to maybeMerge() are usually not * necessary. The most common case is when merge policy * parameters have changed. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    */ public final void maybeMerge() throws CorruptIndexException, IOException { maybeMerge(false); } private final void maybeMerge(boolean optimize) throws CorruptIndexException, IOException { maybeMerge(1, optimize); } private final void maybeMerge(int maxNumSegmentsOptimize, boolean optimize) throws CorruptIndexException, IOException { updatePendingMerges(maxNumSegmentsOptimize, optimize); mergeScheduler.merge(this); } private synchronized void updatePendingMerges(int maxNumSegmentsOptimize, boolean optimize) throws CorruptIndexException, IOException { assert !optimize || maxNumSegmentsOptimize > 0; if (stopMerges) { return; } // Do not start new merges if we've hit OOME if (hitOOM) { return; } final MergePolicy.MergeSpecification spec; if (optimize) { spec = mergePolicy.findMergesForOptimize(segmentInfos, maxNumSegmentsOptimize, segmentsToOptimize); if (spec != null) { final int numMerges = spec.merges.size(); for(int i=0;iIndexWriter without committing * any changes that have occurred since the last commit * (or since it was opened, if commit hasn't been called). * This removes any temporary files that had been created, * after which the state of the index will be the same as * it was when commit() was last called or when this * writer was first opened. This can only be called when * this IndexWriter was opened with * autoCommit=false. This also clears a * previous call to {@link #prepareCommit}. * @throws IllegalStateException if this is called when * the writer was opened with autoCommit=true. * @throws IOException if there is a low-level IO error */ public void rollback() throws IOException { ensureOpen(); if (autoCommit) throw new IllegalStateException("rollback() can only be called when IndexWriter was opened with autoCommit=false"); // Ensure that only one thread actually gets to do the closing: if (shouldClose()) rollbackInternal(); } private void rollbackInternal() throws IOException { boolean success = false; if (infoStream != null ) { message("rollback"); } docWriter.pauseAllThreads(); try { finishMerges(false); // Must pre-close these two, in case they increment // changeCount so that we can then set it to false // before calling closeInternal mergePolicy.close(); mergeScheduler.close(); synchronized(this) { if (pendingCommit != null) { pendingCommit.rollbackCommit(directory); deleter.decRef(pendingCommit); pendingCommit = null; notifyAll(); } // Keep the same segmentInfos instance but replace all // of its SegmentInfo instances. This is so the next // attempt to commit using this instance of IndexWriter // will always write to a new generation ("write // once"). segmentInfos.clear(); segmentInfos.addAll(rollbackSegmentInfos); assert !hasExternalSegments(); docWriter.abort(); assert testPoint("rollback before checkpoint"); // Ask deleter to locate unreferenced files & remove // them: deleter.checkpoint(segmentInfos, false); deleter.refresh(); } // Don't bother saving any changes in our segmentInfos readerPool.clear(null); lastCommitChangeCount = changeCount; success = true; } catch (OutOfMemoryError oom) { handleOOM(oom, "rollbackInternal"); } finally { synchronized(this) { if (!success) { docWriter.resumeAllThreads(); closing = false; notifyAll(); if (infoStream != null) message("hit exception during rollback"); } } } closeInternal(false); } /** * Delete all documents in the index. * *

    This method will drop all buffered documents and will * remove all segments from the index. This change will not be * visible until a {@link #commit()} has been called. This method * can be rolled back using {@link #rollback()}.

    * *

    NOTE: this method is much faster than using deleteDocuments( new MatchAllDocsQuery() ).

    * *

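    * For example (illustrative only):
    *
    *   writer.deleteAll();   // drop all buffered docs and all segments
    *   writer.commit();      // nothing is visible to readers until commit
    *   // or call rollback() instead to restore the last committed state
    *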
    NOTE: this method will forcefully abort all merges * in progress. If other threads are running {@link * #optimize()} or any of the addIndexes methods, they * will receive {@link MergePolicy.MergeAbortedException}s. */ public synchronized void deleteAll() throws IOException { docWriter.pauseAllThreads(); try { // Abort any running merges finishMerges(false); // Remove any buffered docs docWriter.abort(); docWriter.setFlushedDocCount(0); // Remove all segments segmentInfos.clear(); // Ask deleter to locate unreferenced files & remove them: deleter.checkpoint(segmentInfos, false); deleter.refresh(); // Don't bother saving any changes in our segmentInfos readerPool.clear(null); // Mark that the index has changed ++changeCount; } catch (OutOfMemoryError oom) { handleOOM(oom, "deleteAll"); } finally { docWriter.resumeAllThreads(); if (infoStream != null) { message("hit exception during deleteAll"); } } } private synchronized void finishMerges(boolean waitForMerges) throws IOException { if (!waitForMerges) { stopMerges = true; // Abort all pending & running merges: Iterator it = pendingMerges.iterator(); while(it.hasNext()) { final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next(); if (infoStream != null) message("now abort pending merge " + merge.segString(directory)); merge.abort(); mergeFinish(merge); } pendingMerges.clear(); it = runningMerges.iterator(); while(it.hasNext()) { final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next(); if (infoStream != null) message("now abort running merge " + merge.segString(directory)); merge.abort(); } // Ensure any running addIndexes finishes. It's fine // if a new one attempts to start because its merges // will quickly see the stopMerges == true and abort. acquireRead(); releaseRead(); // These merges periodically check whether they have // been aborted, and stop if so. We wait here to make // sure they all stop. It should not take very long // because the merge threads periodically check if // they are aborted. while(runningMerges.size() > 0) { if (infoStream != null) message("now wait for " + runningMerges.size() + " running merge to abort"); doWait(); } stopMerges = false; notifyAll(); assert 0 == mergingSegments.size(); if (infoStream != null) message("all running merges have aborted"); } else { // waitForMerges() will ensure any running addIndexes finishes. // It's fine if a new one attempts to start because from our // caller above the call will see that we are in the // process of closing, and will throw an // AlreadyClosedException. waitForMerges(); } } /** * Wait for any currently outstanding merges to finish. * *

    It is guaranteed that any merges started prior to calling this method * will have completed once this method completes.

    */ public synchronized void waitForMerges() { // Ensure any running addIndexes finishes. acquireRead(); releaseRead(); while(pendingMerges.size() > 0 || runningMerges.size() > 0) { doWait(); } // sanity check assert 0 == mergingSegments.size(); } /* * Called whenever the SegmentInfos has been updated and * the index files referenced exist (correctly) in the * index directory. */ private synchronized void checkpoint() throws IOException { changeCount++; deleter.checkpoint(segmentInfos, false); } private void finishAddIndexes() { releaseWrite(); } private void blockAddIndexes(boolean includePendingClose) { acquireRead(); boolean success = false; try { // Make sure we are still open since we could have // waited quite a while for last addIndexes to finish ensureOpen(includePendingClose); success = true; } finally { if (!success) releaseRead(); } } private void resumeAddIndexes() { releaseRead(); } /** Merges all segments from an array of indexes into this index. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @deprecated Use {@link #addIndexesNoOptimize} instead, * then separately call {@link #optimize} afterwards if * you need to. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void addIndexes(Directory[] dirs) throws CorruptIndexException, IOException { ensureOpen(); noDupDirs(dirs); // Do not allow add docs or deletes while we are running: docWriter.pauseAllThreads(); try { if (infoStream != null) message("flush at addIndexes"); flush(true, false, true); boolean success = false; startTransaction(false); try { int docCount = 0; synchronized(this) { ensureOpen(); for (int i = 0; i < dirs.length; i++) { SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.read(dirs[i]); for (int j = 0; j < sis.size(); j++) { final SegmentInfo info = sis.info(j); docCount += info.docCount; assert !segmentInfos.contains(info); segmentInfos.add(info); // add each info } } } // Notify DocumentsWriter that the flushed count just increased docWriter.updateFlushedDocCount(docCount); optimize(); success = true; } finally { if (success) { commitTransaction(); } else { rollbackTransaction(); } } } catch (OutOfMemoryError oom) { handleOOM(oom, "addIndexes(Directory[])"); } finally { if (docWriter != null) { docWriter.resumeAllThreads(); } } } private synchronized void resetMergeExceptions() { mergeExceptions = new ArrayList(); mergeGen++; } private void noDupDirs(Directory[] dirs) { HashSet dups = new HashSet(); for(int i=0;iThis may be used to parallelize batch indexing. A large document * collection can be broken into sub-collections. Each sub-collection can be * indexed in parallel, on a different thread, process or machine. The * complete index can then be created by merging sub-collection indexes * with this method. * *

    NOTE: the index in each Directory must not be * changed (opened by a writer) while this method is * running. This method does not acquire a write lock in * each input Directory, so it is up to the caller to * enforce this. * *

    NOTE: while this is running, any attempts to * add or delete documents (with another thread) will be * paused until this method completes. * *

    This method is transactional in how Exceptions are * handled: it does not commit a new segments_N file until * all indexes are added. This means if an Exception * occurs (for example disk full), then either no indexes * will have been added or they all will have been.

    * *

    Note that this requires temporary free space in the * Directory up to 2X the sum of all input indexes * (including the starting index). If readers/searchers * are open against the starting index, then temporary * free space required will be higher by the size of the * starting index (see {@link #optimize()} for details). *

    * *

    Once this completes, the final size of the index * will be less than the sum of all input index sizes * (including the starting index). It could be quite a * bit smaller (if there were many pending deletes) or * just slightly smaller.

    * *

    * This index itself must not be among those to be added. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void addIndexesNoOptimize(Directory[] dirs) throws CorruptIndexException, IOException { ensureOpen(); noDupDirs(dirs); // Do not allow add docs or deletes while we are running: docWriter.pauseAllThreads(); try { if (infoStream != null) message("flush at addIndexesNoOptimize"); flush(true, false, true); boolean success = false; startTransaction(false); try { int docCount = 0; synchronized(this) { ensureOpen(); for (int i = 0; i < dirs.length; i++) { if (directory == dirs[i]) { // cannot add this index: segments may be deleted in merge before added throw new IllegalArgumentException("Cannot add this index to itself"); } SegmentInfos sis = new SegmentInfos(); // read infos from dir sis.read(dirs[i]); for (int j = 0; j < sis.size(); j++) { SegmentInfo info = sis.info(j); assert !segmentInfos.contains(info): "dup info dir=" + info.dir + " name=" + info.name; docCount += info.docCount; segmentInfos.add(info); // add each info } } } // Notify DocumentsWriter that the flushed count just increased docWriter.updateFlushedDocCount(docCount); maybeMerge(); ensureOpen(); // If after merging there remain segments in the index // that are in a different directory, just copy these // over into our index. This is necessary (before // finishing the transaction) to avoid leaving the // index in an unusable (inconsistent) state. resolveExternalSegments(); ensureOpen(); success = true; } finally { if (success) { commitTransaction(); } else { rollbackTransaction(); } } } catch (OutOfMemoryError oom) { handleOOM(oom, "addIndexesNoOptimize"); } finally { if (docWriter != null) { docWriter.resumeAllThreads(); } } } private boolean hasExternalSegments() { return segmentInfos.hasExternalSegments(directory); } /* If any of our segments are using a directory != ours * then we have to either copy them over one by one, merge * them (if merge policy has chosen to) or wait until * currently running merges (in the background) complete. * We don't return until the SegmentInfos has no more * external segments. Currently this is only used by * addIndexesNoOptimize(). */ private void resolveExternalSegments() throws CorruptIndexException, IOException { boolean any = false; boolean done = false; while(!done) { SegmentInfo info = null; MergePolicy.OneMerge merge = null; synchronized(this) { if (stopMerges) throw new MergePolicy.MergeAbortedException("rollback() was called or addIndexes* hit an unhandled exception"); final int numSegments = segmentInfos.size(); done = true; for(int i=0;iAfter this completes, the index is optimized.

    *

    The provided IndexReaders are not closed.

    * *

    NOTE: while this is running, any attempts to * add or delete documents (with another thread) will be * paused until this method completes. * *

    See {@link #addIndexesNoOptimize(Directory[])} for * details on transactional semantics, temporary free * space required in the Directory, and non-CFS segments * on an Exception.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.
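 * For example (a sketch; {@code otherDir1}, {@code otherDir2} and
 * {@code writer} are placeholders):
 * <pre>
 *   IndexReader r1 = IndexReader.open(otherDir1, true);   // read-only readers
 *   IndexReader r2 = IndexReader.open(otherDir2, true);
 *   try {
 *     writer.addIndexes(new IndexReader[] { r1, r2 });
 *   } finally {
 *     r1.close();   // the readers are not closed by this method
 *     r2.close();
 *   }
 * </pre>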

    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void addIndexes(IndexReader[] readers) throws CorruptIndexException, IOException { ensureOpen(); // Do not allow add docs or deletes while we are running: docWriter.pauseAllThreads(); // We must pre-acquire a read lock here (and upgrade to // write lock in startTransaction below) so that no // other addIndexes is allowed to start up after we have // flushed & optimized but before we then start our // transaction. This is because the merging below // requires that only one segment is present in the // index: acquireRead(); try { SegmentInfo info = null; String mergedName = null; SegmentMerger merger = null; boolean success = false; try { flush(true, false, true); optimize(); // start with zero or 1 seg success = true; } finally { // Take care to release the read lock if we hit an // exception before starting the transaction if (!success) releaseRead(); } // true means we already have a read lock; if this // call hits an exception it will release the write // lock: startTransaction(true); try { mergedName = newSegmentName(); merger = new SegmentMerger(this, mergedName, null); SegmentReader sReader = null; synchronized(this) { if (segmentInfos.size() == 1) { // add existing index, if any sReader = readerPool.get(segmentInfos.info(0), true, BufferedIndexInput.BUFFER_SIZE, -1); } } success = false; try { if (sReader != null) merger.add(sReader); for (int i = 0; i < readers.length; i++) // add new indexes merger.add(readers[i]); int docCount = merger.merge(); // merge 'em synchronized(this) { segmentInfos.clear(); // pop old infos & add new info = new SegmentInfo(mergedName, docCount, directory, false, true, -1, null, false, merger.hasProx()); setDiagnostics(info, "addIndexes(IndexReader[])"); segmentInfos.add(info); } // Notify DocumentsWriter that the flushed count just increased docWriter.updateFlushedDocCount(docCount); success = true; } finally { if (sReader != null) { readerPool.release(sReader); } } } finally { if (!success) { if (infoStream != null) message("hit exception in addIndexes during merge"); rollbackTransaction(); } else { commitTransaction(); } } if (mergePolicy instanceof LogMergePolicy && getUseCompoundFile()) { List files = null; synchronized(this) { // Must incRef our files so that if another thread // is running merge/optimize, it doesn't delete our // segment's files before we have a change to // finish making the compound file. if (segmentInfos.contains(info)) { files = info.files(); deleter.incRef(files); } } if (files != null) { success = false; startTransaction(false); try { merger.createCompoundFile(mergedName + ".cfs"); synchronized(this) { info.setUseCompoundFile(true); } success = true; } finally { synchronized(this) { deleter.decRef(files); } if (!success) { if (infoStream != null) message("hit exception building compound file in addIndexes during merge"); rollbackTransaction(); } else { commitTransaction(); } } } } } catch (OutOfMemoryError oom) { handleOOM(oom, "addIndexes(IndexReader[])"); } finally { if (docWriter != null) { docWriter.resumeAllThreads(); } } } /** * A hook for extending classes to execute operations after pending added and * deleted documents have been flushed to the Directory but before the change * is committed (new segments_N file written). */ protected void doAfterFlush() throws IOException {} /** * Flush all in-memory buffered updates (adds and deletes) * to the Directory. *

    Note: while this will force buffered docs to be * pushed into the index, it will not make these docs * visible to a reader. Use {@link #commit()} instead. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @deprecated please call {@link #commit()}) instead * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public final void flush() throws CorruptIndexException, IOException { if (hitOOM) { throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot flush"); } flush(true, false, true); } /** * A hook for extending classes to execute operations before pending added and * deleted documents are flushed to the Directory. */ protected void doBeforeFlush() throws IOException {} /** Expert: prepare for commit. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @see #prepareCommit(Map) */ public final void prepareCommit() throws CorruptIndexException, IOException { ensureOpen(); prepareCommit(null); } /**

    Expert: prepare for commit, specifying * commitUserData Map (String -> String). This does the * first phase of 2-phase commit. You can only call this * when autoCommit is false. This method does all steps * necessary to commit changes since this writer was * opened: flushes pending added and deleted docs, syncs * the index files, writes most of next segments_N file. * After calling this you must call either {@link * #commit()} to finish the commit, or {@link * #rollback()} to revert the commit and undo all changes * done since the writer was opened.
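 * A typical two-phase commit sketch (assuming {@code writer} was opened with
 * autoCommit=false and {@code userData} is an application-defined Map; both
 * names are placeholders):
 * <pre>
 *   writer.prepareCommit(userData);   // phase 1: flush, sync, write pending segments_N
 *   try {
 *     // ... prepare any other resources participating in the same transaction ...
 *     writer.commit();                // phase 2: publish the prepared commit
 *   } catch (Throwable t) {
 *     writer.rollback();              // abandon the prepared commit (this also closes the writer)
 *   }
 * </pre>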

    * * You can also just call {@link #commit(Map)} directly * without prepareCommit first in which case that method * will internally call prepareCommit. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @param commitUserData Opaque Map (String->String) * that's recorded into the segments file in the index, * and retrievable by {@link * IndexReader#getCommitUserData}. Note that when * IndexWriter commits itself, for example if open with * autoCommit=true, or, during {@link #close}, the * commitUserData is unchanged (just carried over from * the prior commit). If this is null then the previous * commitUserData is kept. Also, the commitUserData will * only "stick" if there are actually changes in the * index to commit. Therefore it's best to use this * feature only when autoCommit is false. */ public final void prepareCommit(Map commitUserData) throws CorruptIndexException, IOException { prepareCommit(commitUserData, false); } private final void prepareCommit(Map commitUserData, boolean internal) throws CorruptIndexException, IOException { if (hitOOM) { throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot commit"); } if (autoCommit && !internal) throw new IllegalStateException("this method can only be used when autoCommit is false"); if (!autoCommit && pendingCommit != null) throw new IllegalStateException("prepareCommit was already called with no corresponding call to commit"); if (infoStream != null) message("prepareCommit: flush"); flush(true, true, true); startCommit(0, commitUserData); } // Used only by commit, below; lock order is commitLock -> IW private final Object commitLock = new Object(); private void commit(long sizeInBytes) throws IOException { synchronized(commitLock) { startCommit(sizeInBytes, null); finishCommit(); } } /** *

    Commits all pending changes (added & deleted * documents, optimizations, segment merges, added * indexes, etc.) to the index, and syncs all referenced * index files, such that a reader will see the changes * and the index updates will survive an OS or machine * crash or power loss. Note that this does not wait for * any running background merges to finish. This may be a * costly operation, so you should test the cost in your * application and do it only when really necessary.
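 * For example (a sketch; {@code writer}, {@code dir} and {@code doc} are
 * placeholders):
 * <pre>
 *   writer.addDocument(doc);
 *   writer.commit();   // the added document now survives an OS or JVM crash
 *   IndexReader reader = IndexReader.open(dir, true);   // sees the committed document
 * </pre>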

    * *

    Note that this operation calls Directory.sync on * the index files. That call should not return until the * file contents & metadata are on stable storage. For * FSDirectory, this calls the OS's fsync. But, beware: * some hardware devices may in fact cache writes even * during fsync, and return before the bits are actually * on stable storage, to give the appearance of faster * performance. If you have such a device, and it does * not have a battery backup (for example) then on power * loss it may still lose data. Lucene cannot guarantee * consistency on such devices.

    * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.

    * * @see #prepareCommit * @see #commit(Map) */ public final void commit() throws CorruptIndexException, IOException { commit(null); } /** Commits all changes to the index, specifying a * commitUserData Map (String -> String). This just * calls {@link #prepareCommit(Map)} (if you didn't * already call it) and then {@link #finishCommit}. * *

    NOTE: if this method hits an OutOfMemoryError * you should immediately close the writer. See above for details.
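 * For example, an application could record how far it has indexed (a sketch;
 * the key, value and {@code dir} are placeholders):
 * <pre>
 *   Map userData = new HashMap();
 *   userData.put("lastSequenceId", "12345");
 *   writer.commit(userData);
 *   // later, possibly from another process:
 *   Map committed = IndexReader.getCommitUserData(dir);
 *   String lastSeq = (String) committed.get("lastSequenceId");
 * </pre>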

    */ public final void commit(Map commitUserData) throws CorruptIndexException, IOException { ensureOpen(); if (infoStream != null) { message("commit: start"); } synchronized(commitLock) { if (infoStream != null) { message("commit: enter lock"); } if (autoCommit || pendingCommit == null) { if (infoStream != null) message("commit: now prepare"); prepareCommit(commitUserData, true); } else if (infoStream != null) { message("commit: already prepared"); } finishCommit(); } } private synchronized final void finishCommit() throws CorruptIndexException, IOException { if (pendingCommit != null) { try { if (infoStream != null) message("commit: pendingCommit != null"); pendingCommit.finishCommit(directory); if (infoStream != null) message("commit: wrote segments file \"" + pendingCommit.getCurrentSegmentFileName() + "\""); lastCommitChangeCount = pendingCommitChangeCount; segmentInfos.updateGeneration(pendingCommit); segmentInfos.setUserData(pendingCommit.getUserData()); setRollbackSegmentInfos(pendingCommit); deleter.checkpoint(pendingCommit, true); } finally { deleter.decRef(pendingCommit); pendingCommit = null; notifyAll(); } } else if (infoStream != null) { message("commit: pendingCommit == null; skip"); } if (infoStream != null) { message("commit: done"); } } /** * Flush all in-memory buffered udpates (adds and deletes) * to the Directory. * @param triggerMerge if true, we may merge segments (if * deletes or docs were flushed) if necessary * @param flushDocStores if false we are allowed to keep * doc stores open to share with the next segment * @param flushDeletes whether pending deletes should also * be flushed */ protected final void flush(boolean triggerMerge, boolean flushDocStores, boolean flushDeletes) throws CorruptIndexException, IOException { // We can be called during close, when closing==true, so we must pass false to ensureOpen: ensureOpen(false); if (doFlush(flushDocStores, flushDeletes) && triggerMerge) maybeMerge(); } // TODO: this method should not have to be entirely // synchronized, ie, merges should be allowed to commit // even while a flush is happening private synchronized final boolean doFlush(boolean flushDocStores, boolean flushDeletes) throws CorruptIndexException, IOException { try { return doFlushInternal(flushDocStores, flushDeletes); } finally { if (docWriter.doBalanceRAM()) { docWriter.balanceRAM(); } docWriter.clearFlushPending(); } } // TODO: this method should not have to be entirely // synchronized, ie, merges should be allowed to commit // even while a flush is happening private synchronized final boolean doFlushInternal(boolean flushDocStores, boolean flushDeletes) throws CorruptIndexException, IOException { if (hitOOM) { throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot flush"); } ensureOpen(false); assert testPoint("startDoFlush"); doBeforeFlush(); flushCount++; // If we are flushing because too many deletes // accumulated, then we should apply the deletes to free // RAM: flushDeletes |= docWriter.doApplyDeletes(); // When autoCommit=true we must always flush deletes // when flushing a segment; otherwise deletes may become // visible before their corresponding added document // from an updateDocument call flushDeletes |= autoCommit; // Make sure no threads are actively adding a document. 
// Returns true if docWriter is currently aborting, in // which case we skip flushing this segment if (infoStream != null) { message("flush: now pause all indexing threads"); } if (docWriter.pauseAllThreads()) { docWriter.resumeAllThreads(); return false; } try { SegmentInfo newSegment = null; final int numDocs = docWriter.getNumDocsInRAM(); // Always flush docs if there are any boolean flushDocs = numDocs > 0; // With autoCommit=true we always must flush the doc // stores when we flush flushDocStores |= autoCommit; String docStoreSegment = docWriter.getDocStoreSegment(); assert docStoreSegment != null || numDocs == 0; if (docStoreSegment == null) flushDocStores = false; int docStoreOffset = docWriter.getDocStoreOffset(); // docStoreOffset should only be non-zero when // autoCommit == false assert !autoCommit || 0 == docStoreOffset; boolean docStoreIsCompoundFile = false; if (infoStream != null) { message(" flush: segment=" + docWriter.getSegment() + " docStoreSegment=" + docWriter.getDocStoreSegment() + " docStoreOffset=" + docStoreOffset + " flushDocs=" + flushDocs + " flushDeletes=" + flushDeletes + " flushDocStores=" + flushDocStores + " numDocs=" + numDocs + " numBufDelTerms=" + docWriter.getNumBufferedDeleteTerms()); message(" index before flush " + segString()); } // Check if the doc stores must be separately flushed // because other segments, besides the one we are about // to flush, reference it if (flushDocStores && (!flushDocs || !docWriter.getSegment().equals(docWriter.getDocStoreSegment()))) { // We must separately flush the doc store if (infoStream != null) message(" flush shared docStore segment " + docStoreSegment); docStoreIsCompoundFile = flushDocStores(); flushDocStores = false; } String segment = docWriter.getSegment(); // If we are flushing docs, segment must not be null: assert segment != null || !flushDocs; if (flushDocs) { boolean success = false; final int flushedDocCount; try { flushedDocCount = docWriter.flush(flushDocStores); if (infoStream != null) { message("flushedFiles=" + docWriter.getFlushedFiles()); } success = true; } finally { if (!success) { if (infoStream != null) message("hit exception flushing segment " + segment); deleter.refresh(segment); } } if (0 == docStoreOffset && flushDocStores) { // This means we are flushing private doc stores // with this segment, so it will not be shared // with other segments assert docStoreSegment != null; assert docStoreSegment.equals(segment); docStoreOffset = -1; docStoreIsCompoundFile = false; docStoreSegment = null; } // Create new SegmentInfo, but do not add to our // segmentInfos until deletes are flushed // successfully. newSegment = new SegmentInfo(segment, flushedDocCount, directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, docWriter.hasProx()); setDiagnostics(newSegment, "flush"); } docWriter.pushDeletes(); if (flushDocs) { segmentInfos.add(newSegment); checkpoint(); } if (flushDocs && mergePolicy.useCompoundFile(segmentInfos, newSegment)) { // Now build compound file boolean success = false; try { docWriter.createCompoundFile(segment); success = true; } finally { if (!success) { if (infoStream != null) message("hit exception creating compound file for newly flushed segment " + segment); deleter.deleteFile(segment + "." 
+ IndexFileNames.COMPOUND_FILE_EXTENSION); } } newSegment.setUseCompoundFile(true); checkpoint(); } if (flushDeletes) { applyDeletes(); } if (flushDocs) checkpoint(); doAfterFlush(); return flushDocs; } catch (OutOfMemoryError oom) { handleOOM(oom, "doFlush"); // never hit return false; } finally { docWriter.resumeAllThreads(); } } /** Expert: Return the total size of all index files currently cached in memory. * Useful for size management with flushRamDocs() */ public final long ramSizeInBytes() { ensureOpen(); return docWriter.getRAMUsed(); } /** Expert: Return the number of documents currently * buffered in RAM. */ public final synchronized int numRamDocs() { ensureOpen(); return docWriter.getNumDocsInRAM(); } private int ensureContiguousMerge(MergePolicy.OneMerge merge) { int first = segmentInfos.indexOf(merge.segments.info(0)); if (first == -1) throw new MergePolicy.MergeException("could not find segment " + merge.segments.info(0).name + " in current index " + segString(), directory); final int numSegments = segmentInfos.size(); final int numSegmentsToMerge = merge.segments.size(); for(int i=0;i= numSegments || !segmentInfos.info(first+i).equals(info)) { if (segmentInfos.indexOf(info) == -1) throw new MergePolicy.MergeException("MergePolicy selected a segment (" + info.name + ") that is not in the current index " + segString(), directory); else throw new MergePolicy.MergeException("MergePolicy selected non-contiguous segments to merge (" + merge.segString(directory) + " vs " + segString() + "), which IndexWriter (currently) cannot handle", directory); } } return first; } /** Carefully merges deletes for the segments we just * merged. This is tricky because, although merging will * clear all deletes (compacts the documents), new * deletes may have been flushed to the segments since * the merge was started. This method "carries over" * such new deletes onto the newly merged segment, and * saves the resulting deletes file (incrementing the * delete generation for merge.info). If no deletes were * flushed, no new deletes file is saved. */ synchronized private void commitMergedDeletes(MergePolicy.OneMerge merge, SegmentReader mergeReader) throws IOException { assert testPoint("startCommitMergeDeletes"); final SegmentInfos sourceSegments = merge.segments; if (infoStream != null) message("commitMergeDeletes " + merge.segString(directory)); // Carefully merge deletes that occurred after we // started merging: int docUpto = 0; int delCount = 0; for(int i=0; i < sourceSegments.size(); i++) { SegmentInfo info = sourceSegments.info(i); int docCount = info.docCount; SegmentReader previousReader = merge.readersClone[i]; SegmentReader currentReader = merge.readers[i]; if (previousReader.hasDeletions()) { // There were deletes on this segment when the merge // started. The merge has collapsed away those // deletes, but, if new deletes were flushed since // the merge started, we must now carefully keep any // newly flushed deletes but mapping them to the new // docIDs. 
if (currentReader.numDeletedDocs() > previousReader.numDeletedDocs()) { // This means this segment has had new deletes // committed since we started the merge, so we // must merge them: for(int j=0;j 0; } /* FIXME if we want to support non-contiguous segment merges */ synchronized private boolean commitMerge(MergePolicy.OneMerge merge, SegmentMerger merger, int mergedDocCount, SegmentReader mergedReader) throws IOException { assert testPoint("startCommitMerge"); if (hitOOM) { throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot complete merge"); } if (infoStream != null) message("commitMerge: " + merge.segString(directory) + " index=" + segString()); assert merge.registerDone; // If merge was explicitly aborted, or, if rollback() or // rollbackTransaction() had been called since our merge // started (which results in an unqualified // deleter.refresh() call that will remove any index // file that current segments does not reference), we // abort this merge if (merge.isAborted()) { if (infoStream != null) message("commitMerge: skipping merge " + merge.segString(directory) + ": it was aborted"); return false; } final int start = ensureContiguousMerge(merge); commitMergedDeletes(merge, mergedReader); docWriter.remapDeletes(segmentInfos, merger.getDocMaps(), merger.getDelCounts(), merge, mergedDocCount); // If the doc store we are using has been closed and // is in now compound format (but wasn't when we // started), then we will switch to the compound // format as well: setMergeDocStoreIsCompoundFile(merge); merge.info.setHasProx(merger.hasProx()); segmentInfos.subList(start, start + merge.segments.size()).clear(); assert !segmentInfos.contains(merge.info); segmentInfos.add(start, merge.info); closeMergeReaders(merge, false); // Must note the change to segmentInfos so any commits // in-flight don't lose it: checkpoint(); // If the merged segments had pending changes, clear // them so that they don't bother writing them to // disk, updating SegmentInfo, etc.: readerPool.clear(merge.segments); if (merge.optimize) { // cascade the optimize: segmentsToOptimize.add(merge.info); } return true; } final private void handleMergeException(Throwable t, MergePolicy.OneMerge merge) throws IOException { if (infoStream != null) { message("handleMergeException: merge=" + merge.segString(directory) + " exc=" + t); } // Set the exception on the merge, so if // optimize() is waiting on us it sees the root // cause exception: merge.setException(t); addMergeException(merge); if (t instanceof MergePolicy.MergeAbortedException) { // We can ignore this exception (it happens when // close(false) or rollback is called), unless the // merge involves segments from external directories, // in which case we must throw it so, for example, the // rollbackTransaction code in addIndexes* is // executed. if (merge.isExternal) throw (MergePolicy.MergeAbortedException) t; } else if (t instanceof IOException) throw (IOException) t; else if (t instanceof RuntimeException) throw (RuntimeException) t; else if (t instanceof Error) throw (Error) t; else // Should not get here throw new RuntimeException(t); } /** * Merges the indicated segments, replacing them in the stack with a * single segment. 
*/ final void merge(MergePolicy.OneMerge merge) throws CorruptIndexException, IOException { boolean success = false; try { try { try { mergeInit(merge); if (infoStream != null) message("now merge\n merge=" + merge.segString(directory) + "\n merge=" + merge + "\n index=" + segString()); mergeMiddle(merge); mergeSuccess(merge); success = true; } catch (Throwable t) { handleMergeException(t, merge); } } finally { synchronized(this) { mergeFinish(merge); if (!success) { if (infoStream != null) message("hit exception during merge"); if (merge.info != null && !segmentInfos.contains(merge.info)) deleter.refresh(merge.info.name); } // This merge (and, generally, any change to the // segments) may now enable new merges, so we call // merge policy & update pending merges. if (success && !merge.isAborted() && !closed && !closing) updatePendingMerges(merge.maxNumSegmentsOptimize, merge.optimize); } } } catch (OutOfMemoryError oom) { handleOOM(oom, "merge"); } } /** Hook that's called when the specified merge is complete. */ void mergeSuccess(MergePolicy.OneMerge merge) { } /** Checks whether this merge involves any segments * already participating in a merge. If not, this merge * is "registered", meaning we record that its segments * are now participating in a merge, and true is * returned. Else (the merge conflicts) false is * returned. */ final synchronized boolean registerMerge(MergePolicy.OneMerge merge) throws MergePolicy.MergeAbortedException { if (merge.registerDone) return true; if (stopMerges) { merge.abort(); throw new MergePolicy.MergeAbortedException("merge is aborted: " + merge.segString(directory)); } final int count = merge.segments.size(); boolean isExternal = false; for(int i=0;i 0; if (hitOOM) { throw new IllegalStateException("this writer hit an OutOfMemoryError; cannot merge"); } if (merge.info != null) // mergeInit already done return; if (merge.isAborted()) return; boolean changed = applyDeletes(); // If autoCommit == true then all deletes should have // been flushed when we flushed the last segment assert !changed || !autoCommit; final SegmentInfos sourceSegments = merge.segments; final int end = sourceSegments.size(); // Check whether this merge will allow us to skip // merging the doc stores (stored field & vectors). // This is a very substantial optimization (saves tons // of IO) that can only be applied with // autoCommit=false. Directory lastDir = directory; String lastDocStoreSegment = null; int next = -1; boolean mergeDocStores = false; boolean doFlushDocStore = false; final String currentDocStoreSegment = docWriter.getDocStoreSegment(); // Test each segment to be merged: check if we need to // flush/merge doc stores for (int i = 0; i < end; i++) { SegmentInfo si = sourceSegments.info(i); // If it has deletions we must merge the doc stores if (si.hasDeletions()) mergeDocStores = true; // If it has its own (private) doc stores we must // merge the doc stores if (-1 == si.getDocStoreOffset()) mergeDocStores = true; // If it has a different doc store segment than // previous segments, we must merge the doc stores String docStoreSegment = si.getDocStoreSegment(); if (docStoreSegment == null) mergeDocStores = true; else if (lastDocStoreSegment == null) lastDocStoreSegment = docStoreSegment; else if (!lastDocStoreSegment.equals(docStoreSegment)) mergeDocStores = true; // Segments' docScoreOffsets must be in-order, // contiguous. 
For the default merge policy now // this will always be the case but for an arbitrary // merge policy this may not be the case if (-1 == next) next = si.getDocStoreOffset() + si.docCount; else if (next != si.getDocStoreOffset()) mergeDocStores = true; else next = si.getDocStoreOffset() + si.docCount; // If the segment comes from a different directory // we must merge if (lastDir != si.dir) mergeDocStores = true; // If the segment is referencing the current "live" // doc store outputs then we must merge if (si.getDocStoreOffset() != -1 && currentDocStoreSegment != null && si.getDocStoreSegment().equals(currentDocStoreSegment)) { doFlushDocStore = true; } } // if a mergedSegmentWarmer is installed, we must merge // the doc stores because we will open a full // SegmentReader on the merged segment: if (!mergeDocStores && mergedSegmentWarmer != null && currentDocStoreSegment != null && lastDocStoreSegment != null && lastDocStoreSegment.equals(currentDocStoreSegment)) { mergeDocStores = true; } final int docStoreOffset; final String docStoreSegment; final boolean docStoreIsCompoundFile; if (mergeDocStores) { docStoreOffset = -1; docStoreSegment = null; docStoreIsCompoundFile = false; } else { SegmentInfo si = sourceSegments.info(0); docStoreOffset = si.getDocStoreOffset(); docStoreSegment = si.getDocStoreSegment(); docStoreIsCompoundFile = si.getDocStoreIsCompoundFile(); } if (mergeDocStores && doFlushDocStore) { // SegmentMerger intends to merge the doc stores // (stored fields, vectors), and at least one of the // segments to be merged refers to the currently // live doc stores. // TODO: if we know we are about to merge away these // newly flushed doc store files then we should not // make compound file out of them... if (infoStream != null) message("now flush at merge"); doFlush(true, false); } merge.mergeDocStores = mergeDocStores; // Bind a new segment name here so even with // ConcurrentMergePolicy we keep deterministic segment // names. merge.info = new SegmentInfo(newSegmentName(), 0, directory, false, true, docStoreOffset, docStoreSegment, docStoreIsCompoundFile, false); Map details = new HashMap(); details.put("optimize", merge.optimize+""); details.put("mergeFactor", end+""); details.put("mergeDocStores", mergeDocStores+""); setDiagnostics(merge.info, "merge", details); // Also enroll the merged segment into mergingSegments; // this prevents it from getting selected for a merge // after our merge is done but while we are building the // CFS: mergingSegments.add(merge.info); } private void setDiagnostics(SegmentInfo info, String source) { setDiagnostics(info, source, null); } private void setDiagnostics(SegmentInfo info, String source, Map details) { Map diagnostics = new HashMap(); diagnostics.put("source", source); diagnostics.put("lucene.version", Constants.LUCENE_VERSION); diagnostics.put("os", Constants.OS_NAME+""); diagnostics.put("os.arch", Constants.OS_ARCH+""); diagnostics.put("os.version", Constants.OS_VERSION+""); diagnostics.put("java.version", Constants.JAVA_VERSION+""); diagnostics.put("java.vendor", Constants.JAVA_VENDOR+""); if (details != null) { diagnostics.putAll(details); } info.setDiagnostics(diagnostics); } /** This is called after merging a segment and before * building its CFS. Return true if the files should be * sync'd. If you return false, then the source segment * files that were merged cannot be deleted until the CFS * file is built & sync'd. 
So, returning false consumes * more transient disk space, but saves performance of * not having to sync files which will shortly be deleted * anyway. * @deprecated -- this will be removed in 3.0 when * autoCommit is hardwired to false */ private synchronized boolean doCommitBeforeMergeCFS(MergePolicy.OneMerge merge) throws IOException { long freeableBytes = 0; final int size = merge.segments.size(); for(int i=0;i totalBytes) return true; else return false; } /** Does fininishing for a merge, which is fast but holds * the synchronized lock on IndexWriter instance. */ final synchronized void mergeFinish(MergePolicy.OneMerge merge) throws IOException { // Optimize, addIndexes or finishMerges may be waiting // on merges to finish. notifyAll(); // It's possible we are called twice, eg if there was an // exception inside mergeInit if (merge.registerDone) { final SegmentInfos sourceSegments = merge.segments; final int end = sourceSegments.size(); for(int i=0;i X minutes or // more than Y bytes have been written, etc. if (autoCommit) { final long size; synchronized(this) { size = merge.info.sizeInBytes(); } commit(size); } return mergedDocCount; } synchronized void addMergeException(MergePolicy.OneMerge merge) { assert merge.getException() != null; if (!mergeExceptions.contains(merge) && mergeGen == merge.mergeGen) mergeExceptions.add(merge); } // Apply buffered deletes to all segments. private final synchronized boolean applyDeletes() throws CorruptIndexException, IOException { assert testPoint("startApplyDeletes"); flushDeletesCount++; boolean success = false; boolean changed; try { changed = docWriter.applyDeletes(segmentInfos); success = true; } finally { if (!success && infoStream != null) { message("hit exception flushing deletes"); } } if (changed) checkpoint(); return changed; } // For test purposes. final synchronized int getBufferedDeleteTermsSize() { return docWriter.getBufferedDeleteTerms().size(); } // For test purposes. final synchronized int getNumBufferedDeleteTerms() { return docWriter.getNumBufferedDeleteTerms(); } // utility routines for tests SegmentInfo newestSegment() { return segmentInfos.size() > 0 ? 
segmentInfos.info(segmentInfos.size()-1) : null; } public synchronized String segString() { return segString(segmentInfos); } private synchronized String segString(SegmentInfos infos) { StringBuffer buffer = new StringBuffer(); final int count = infos.size(); for(int i = 0; i < count; i++) { if (i > 0) { buffer.append(' '); } final SegmentInfo info = infos.info(i); buffer.append(info.segString(directory)); if (info.dir != directory) buffer.append("**"); } return buffer.toString(); } // Files that have been sync'd already private final HashSet synced = new HashSet(); // Files that are now being sync'd private HashSet syncing = new HashSet(); private boolean startSync(String fileName, Collection pending) { synchronized(synced) { if (!synced.contains(fileName)) { if (!syncing.contains(fileName)) { syncing.add(fileName); return true; } else { pending.add(fileName); return false; } } else return false; } } private void finishSync(String fileName, boolean success) { synchronized(synced) { assert syncing.contains(fileName); syncing.remove(fileName); if (success) synced.add(fileName); synced.notifyAll(); } } /** Blocks until all files in syncing are sync'd */ private boolean waitForAllSynced(Collection syncing) throws IOException { synchronized(synced) { Iterator it = syncing.iterator(); while(it.hasNext()) { final String fileName = (String) it.next(); while(!synced.contains(fileName)) { if (!syncing.contains(fileName)) // There was an error because a file that was // previously syncing failed to appear in synced return false; else try { synced.wait(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } } return true; } } /** Pauses before syncing. On Windows, at least, it's * best (performance-wise) to pause in order to let OS * flush writes to disk on its own, before forcing a * sync. * @deprecated -- this will be removed in 3.0 when * autoCommit is hardwired to false */ private void syncPause(long sizeInBytes) { if (mergeScheduler instanceof ConcurrentMergeScheduler && maxSyncPauseSeconds > 0) { // Rough heuristic: for every 10 MB, we pause for 1 // second, up until the max long pauseTime = (long) (1000*sizeInBytes/10/1024/1024); final long maxPauseTime = (long) (maxSyncPauseSeconds*1000); if (pauseTime > maxPauseTime) pauseTime = maxPauseTime; final int sleepCount = (int) (pauseTime / 100); for(int i=0;i toSync.getGeneration()) toSync.updateGeneration(segmentInfos); boolean success = false; try { // Exception here means nothing is prepared // (this method unwinds everything it did on // an exception) try { toSync.prepareCommit(directory); } finally { // Have our master segmentInfos record the // generations we just prepared. We do this // on error or success so we don't // double-write a segments_N file. 
segmentInfos.updateGeneration(toSync); } assert pendingCommit == null; setPending = true; pendingCommit = toSync; pendingCommitChangeCount = myChangeCount; success = true; } finally { if (!success && infoStream != null) message("hit exception committing segments file"); } break; } else { // Must wait for other commit to complete doWait(); } } } if (infoStream != null) message("done all syncs"); assert testPoint("midStartCommitSuccess"); } finally { synchronized(this) { if (!setPending) deleter.decRef(toSync); } } } catch (OutOfMemoryError oom) { handleOOM(oom, "startCommit"); } assert testPoint("finishStartCommit"); } /** * Returns true iff the index in the named directory is * currently locked. * @param directory the directory to check for a lock * @throws IOException if there is a low-level IO error */ public static boolean isLocked(Directory directory) throws IOException { return directory.makeLock(WRITE_LOCK_NAME).isLocked(); } /** * Returns true iff the index in the named directory is * currently locked. * @param directory the directory to check for a lock * @throws IOException if there is a low-level IO error * @deprecated Use {@link #isLocked(Directory)} */ public static boolean isLocked(String directory) throws IOException { Directory dir = FSDirectory.getDirectory(directory); try { return isLocked(dir); } finally { dir.close(); } } /** * Forcibly unlocks the index in the named directory. *

    * Caution: this should only be used by failure recovery code, * when it is known that no other process nor thread is in fact * currently accessing this index. */ public static void unlock(Directory directory) throws IOException { directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release(); } /** * Specifies maximum field length (in number of tokens/terms) in {@link IndexWriter} constructors. * {@link #setMaxFieldLength(int)} overrides the value set by * the constructor. */ public static final class MaxFieldLength { private int limit; private String name; /** * Private type-safe-enum-pattern constructor. * * @param name instance name * @param limit maximum field length */ private MaxFieldLength(String name, int limit) { this.name = name; this.limit = limit; } /** * Public constructor to allow users to specify the maximum field size limit. * * @param limit The maximum field length */ public MaxFieldLength(int limit) { this("User-specified", limit); } public int getLimit() { return limit; } public String toString() { return name + ":" + limit; } /** Sets the maximum field length to {@link Integer#MAX_VALUE}. */ public static final MaxFieldLength UNLIMITED = new MaxFieldLength("UNLIMITED", Integer.MAX_VALUE); /** * Sets the maximum field length to * {@link #DEFAULT_MAX_FIELD_LENGTH} * */ public static final MaxFieldLength LIMITED = new MaxFieldLength("LIMITED", DEFAULT_MAX_FIELD_LENGTH); } /** If {@link #getReader} has been called (ie, this writer * is in near real-time mode), then after a merge * completes, this class can be invoked to warm the * reader on the newly merged segment, before the merge * commits. This is not required for near real-time * search, but will reduce search latency on opening a * new near real-time reader after a merge completes. * *
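 * For example (a sketch; the field name and query used for warming are
 * application-specific placeholders):
 * <pre>
 *   writer.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
 *     public void warm(IndexReader reader) throws IOException {
 *       // run a cheap query so the new segment's term index and norms are
 *       // loaded before the merge is committed
 *       new IndexSearcher(reader).search(new TermQuery(new Term("id", "0")), 10);
 *     }
 *   });
 *   IndexReader nrtReader = writer.getReader();   // near real-time reader
 * </pre>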

    NOTE: This API is experimental and might * change in incompatible ways in the next release.

    * *

    NOTE: warm is called before any deletes have * been carried over to the merged segment. */ public static abstract class IndexReaderWarmer { public abstract void warm(IndexReader reader) throws IOException; } private IndexReaderWarmer mergedSegmentWarmer; /** Set the merged segment warmer. See {@link * IndexReaderWarmer}. */ public void setMergedSegmentWarmer(IndexReaderWarmer warmer) { mergedSegmentWarmer = warmer; } /** Returns the current merged segment warmer. See {@link * IndexReaderWarmer}. */ public IndexReaderWarmer getMergedSegmentWarmer() { return mergedSegmentWarmer; } private void handleOOM(OutOfMemoryError oom, String location) { if (infoStream != null) { message("hit OutOfMemoryError inside " + location); } hitOOM = true; throw oom; } // deprecated private boolean allowMinus1Position; /** Deprecated: emulates IndexWriter's buggy behavior when * first token(s) have positionIncrement==0 (ie, prior to * fixing LUCENE-1542) */ public void setAllowMinus1Position() { allowMinus1Position = true; docWriter.setAllowMinus1Position(); } // deprecated boolean getAllowMinus1Position() { return allowMinus1Position; } // Used only by assert for testing. Current points: // startDoFlush // startCommitMerge // startStartCommit // midStartCommit // midStartCommit2 // midStartCommitSuccess // finishStartCommit // startCommitMergeDeletes // startMergeInit // startApplyDeletes // DocumentsWriter.ThreadState.init start boolean testPoint(String name) { return true; } synchronized boolean nrtIsCurrent(SegmentInfos infos) { if (!infos.equals(segmentInfos)) { // if any structural changes (new segments), we are // stale return false; } else if (infos.getGeneration() != segmentInfos.getGeneration()) { // if any commit took place since we were opened, we // are stale return false; } else { return !docWriter.anyChanges(); } } synchronized boolean isClosed() { return closed; } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentTermEnum.java0000644000175000017500000001537111474320230025502 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import org.apache.lucene.store.IndexInput; final class SegmentTermEnum extends TermEnum implements Cloneable { private IndexInput input; FieldInfos fieldInfos; long size; long position = -1; private TermBuffer termBuffer = new TermBuffer(); private TermBuffer prevBuffer = new TermBuffer(); private TermBuffer scanBuffer = new TermBuffer(); // used for scanning private TermInfo termInfo = new TermInfo(); private int format; private boolean isIndex = false; long indexPointer = 0; int indexInterval; int skipInterval; int maxSkipLevels; private int formatM1SkipInterval; SegmentTermEnum(IndexInput i, FieldInfos fis, boolean isi) throws CorruptIndexException, IOException { input = i; fieldInfos = fis; isIndex = isi; maxSkipLevels = 1; // use single-level skip lists for formats > -3 int firstInt = input.readInt(); if (firstInt >= 0) { // original-format file, without explicit format version number format = 0; size = firstInt; // back-compatible settings indexInterval = 128; skipInterval = Integer.MAX_VALUE; // switch off skipTo optimization } else { // we have a format version number format = firstInt; // check that it is a format we can understand if (format < TermInfosWriter.FORMAT_CURRENT) throw new CorruptIndexException("Unknown format version:" + format + " expected " + TermInfosWriter.FORMAT_CURRENT + " or higher"); size = input.readLong(); // read the size if(format == -1){ if (!isIndex) { indexInterval = input.readInt(); formatM1SkipInterval = input.readInt(); } // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in // skipTo implementation of these versions skipInterval = Integer.MAX_VALUE; } else { indexInterval = input.readInt(); skipInterval = input.readInt(); if (format <= TermInfosWriter.FORMAT) { // this new format introduces multi-level skipping maxSkipLevels = input.readInt(); } } assert indexInterval > 0: "indexInterval=" + indexInterval + " is negative; must be > 0"; assert skipInterval > 0: "skipInterval=" + skipInterval + " is negative; must be > 0"; } if (format > TermInfosWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { termBuffer.setPreUTF8Strings(); scanBuffer.setPreUTF8Strings(); prevBuffer.setPreUTF8Strings(); } } protected Object clone() { SegmentTermEnum clone = null; try { clone = (SegmentTermEnum) super.clone(); } catch (CloneNotSupportedException e) {} clone.input = (IndexInput) input.clone(); clone.termInfo = new TermInfo(termInfo); clone.termBuffer = (TermBuffer)termBuffer.clone(); clone.prevBuffer = (TermBuffer)prevBuffer.clone(); clone.scanBuffer = new TermBuffer(); return clone; } final void seek(long pointer, long p, Term t, TermInfo ti) throws IOException { input.seek(pointer); position = p; termBuffer.set(t); prevBuffer.reset(); termInfo.set(ti); } /** Increments the enumeration to the next element. 
True if one exists.*/ public final boolean next() throws IOException { if (position++ >= size - 1) { prevBuffer.set(termBuffer); termBuffer.reset(); return false; } prevBuffer.set(termBuffer); termBuffer.read(input, fieldInfos); termInfo.docFreq = input.readVInt(); // read doc freq termInfo.freqPointer += input.readVLong(); // read freq pointer termInfo.proxPointer += input.readVLong(); // read prox pointer if(format == -1){ // just read skipOffset in order to increment file pointer; // value is never used since skipTo is switched off if (!isIndex) { if (termInfo.docFreq > formatM1SkipInterval) { termInfo.skipOffset = input.readVInt(); } } } else{ if (termInfo.docFreq >= skipInterval) termInfo.skipOffset = input.readVInt(); } if (isIndex) indexPointer += input.readVLong(); // read index pointer return true; } /** Optimized scan, without allocating new terms. * Return number of invocations to next(). */ final int scanTo(Term term) throws IOException { scanBuffer.set(term); int count = 0; while (scanBuffer.compareTo(termBuffer) > 0 && next()) { count++; } return count; } /** Returns the current Term in the enumeration. Initially invalid, valid after next() called for the first time.*/ public final Term term() { return termBuffer.toTerm(); } /** Returns the previous Term enumerated. Initially null.*/ final Term prev() { return prevBuffer.toTerm(); } /** Returns the current TermInfo in the enumeration. Initially invalid, valid after next() called for the first time.*/ final TermInfo termInfo() { return new TermInfo(termInfo); } /** Sets the argument to the current TermInfo in the enumeration. Initially invalid, valid after next() called for the first time.*/ final void termInfo(TermInfo ti) { ti.set(termInfo); } /** Returns the docFreq from the current TermInfo in the enumeration. Initially invalid, valid after next() called for the first time.*/ public final int docFreq() { return termInfo.docFreq; } /* Returns the freqPointer from the current TermInfo in the enumeration. Initially invalid, valid after next() called for the first time.*/ final long freqPointer() { return termInfo.freqPointer; } /* Returns the proxPointer from the current TermInfo in the enumeration. Initially invalid, valid after next() called for the first time.*/ final long proxPointer() { return termInfo.proxPointer; } /** Closes the enumeration to further activity, freeing resources. */ public final void close() throws IOException { input.close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/MultiReader.java0000644000175000017500000003335211474320230024637 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.index.DirectoryReader.MultiTermDocs; import org.apache.lucene.index.DirectoryReader.MultiTermEnum; import org.apache.lucene.index.DirectoryReader.MultiTermPositions; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close /** An IndexReader which reads multiple indexes, appending their content. * * @version $Id: MultiReader.java 950451 2010-06-02 09:33:57Z mikemccand $ */ public class MultiReader extends IndexReader implements Cloneable { protected IndexReader[] subReaders; private int[] starts; // 1st docno for each segment private boolean[] decrefOnClose; // remember which subreaders to decRef on close private Map normsCache = new HashMap(); private int maxDoc = 0; private int numDocs = -1; private boolean hasDeletions = false; /** *

    Construct a MultiReader aggregating the named set of (sub)readers. * Directory locking for delete, undeleteAll, and setNorm operations is * left to the subreaders.

    *

    Note that all subreaders are closed if this MultiReader is closed.
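 * For example (a sketch; {@code dir1} and {@code dir2} are placeholders):
 * <pre>
 *   IndexReader r1 = IndexReader.open(dir1, true);
 *   IndexReader r2 = IndexReader.open(dir2, true);
 *   IndexReader multi = new MultiReader(new IndexReader[] { r1, r2 });
 *   // ... search against multi, which appends r2's documents after r1's ...
 *   multi.close();   // with this constructor, closing multi also closes r1 and r2
 * </pre>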

    * @param subReaders set of (sub)readers * @throws IOException */ public MultiReader(IndexReader[] subReaders) { initialize(subReaders, true); } /** *

    Construct a MultiReader aggregating the named set of (sub)readers. * Directory locking for delete, undeleteAll, and setNorm operations is * left to the subreaders.

    * @param closeSubReaders indicates whether the subreaders should be closed * when this MultiReader is closed * @param subReaders set of (sub)readers * @throws IOException */ public MultiReader(IndexReader[] subReaders, boolean closeSubReaders) { initialize(subReaders, closeSubReaders); } private void initialize(IndexReader[] subReaders, boolean closeSubReaders) { this.subReaders = (IndexReader[]) subReaders.clone(); starts = new int[subReaders.length + 1]; // build starts array decrefOnClose = new boolean[subReaders.length]; for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs if (!closeSubReaders) { subReaders[i].incRef(); decrefOnClose[i] = true; } else { decrefOnClose[i] = false; } if (subReaders[i].hasDeletions()) hasDeletions = true; } starts[subReaders.length] = maxDoc; } /** * Tries to reopen the subreaders. *
    * If one or more subreaders could be re-opened (i.e. subReader.reopen() * returned a new instance != subReader), then a new MultiReader instance * is returned; otherwise this instance is returned. *

    * A re-opened instance might share one or more subreaders with the old * instance. Index modification operations result in undefined behavior * when performed before the old instance is closed. * (see {@link IndexReader#reopen()}). *
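 * The usual reopen idiom is therefore (a sketch; {@code reader} is a
 * placeholder for the current MultiReader):
 * <pre>
 *   IndexReader newReader = reader.reopen();
 *   if (newReader != reader) {
 *     reader.close();      // release the old instance only if a new one was returned
 *     reader = newReader;
 *   }
 * </pre>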

    * If subreaders are shared, then the reference count of those * readers is increased to ensure that the subreaders remain open * until the last referring reader is closed. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized IndexReader reopen() throws CorruptIndexException, IOException { return doReopen(false); } /** * Clones the subreaders. * (see {@link IndexReader#clone()}). *
    *

    * If subreaders are shared, then the reference count of those * readers is increased to ensure that the subreaders remain open * until the last referring reader is closed. */ public synchronized Object clone() { try { return doReopen(true); } catch (Exception ex) { throw new RuntimeException(ex); } } /** * If clone is true then we clone each of the subreaders * @param doClone * @return New IndexReader, or same one (this) if * reopen/clone is not necessary * @throws CorruptIndexException * @throws IOException */ protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException { ensureOpen(); boolean reopened = false; IndexReader[] newSubReaders = new IndexReader[subReaders.length]; boolean success = false; try { for (int i = 0; i < subReaders.length; i++) { if (doClone) newSubReaders[i] = (IndexReader) subReaders[i].clone(); else newSubReaders[i] = subReaders[i].reopen(); // if at least one of the subreaders was updated we remember that // and return a new MultiReader if (newSubReaders[i] != subReaders[i]) { reopened = true; } } success = true; } finally { if (!success && reopened) { for (int i = 0; i < newSubReaders.length; i++) { if (newSubReaders[i] != subReaders[i]) { try { newSubReaders[i].close(); } catch (IOException ignore) { // keep going - we want to clean up as much as possible } } } } } if (reopened) { boolean[] newDecrefOnClose = new boolean[subReaders.length]; for (int i = 0; i < subReaders.length; i++) { if (newSubReaders[i] == subReaders[i]) { newSubReaders[i].incRef(); newDecrefOnClose[i] = true; } } MultiReader mr = new MultiReader(newSubReaders); mr.decrefOnClose = newDecrefOnClose; mr.setDisableFakeNorms(getDisableFakeNorms()); return mr; } else { return this; } } public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].getTermFreqVectors(n - starts[i]); // dispatch to segment } public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].getTermFreqVector(n - starts[i], field); } public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); int i = readerIndex(docNumber); // find segment num subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper); } public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); int i = readerIndex(docNumber); // find segment num subReaders[i].getTermFreqVector(docNumber - starts[i], mapper); } public boolean isOptimized() { return false; } public int numDocs() { // Don't call ensureOpen() here (it could affect performance) // NOTE: multiple threads may wind up init'ing // numDocs... 
but that's harmless if (numDocs == -1) { // check cache int n = 0; // cache miss--recompute for (int i = 0; i < subReaders.length; i++) n += subReaders[i].numDocs(); // sum from readers numDocs = n; } return numDocs; } public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } // inherit javadoc public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].document(n - starts[i], fieldSelector); // dispatch to segment reader } public boolean isDeleted(int n) { // Don't call ensureOpen() here (it could affect performance) int i = readerIndex(n); // find segment num return subReaders[i].isDeleted(n - starts[i]); // dispatch to segment reader } public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } protected void doDelete(int n) throws CorruptIndexException, IOException { numDocs = -1; // invalidate cache int i = readerIndex(n); // find segment num subReaders[i].deleteDocument(n - starts[i]); // dispatch to segment reader hasDeletions = true; } protected void doUndeleteAll() throws CorruptIndexException, IOException { for (int i = 0; i < subReaders.length; i++) subReaders[i].undeleteAll(); hasDeletions = false; numDocs = -1; // invalidate cache } private int readerIndex(int n) { // find reader for doc n: return DirectoryReader.readerIndex(n, this.starts, this.subReaders.length); } public boolean hasNorms(String field) throws IOException { ensureOpen(); for (int i = 0; i < subReaders.length; i++) { if (subReaders[i].hasNorms(field)) return true; } return false; } private byte[] ones; private byte[] fakeNorms() { if (ones==null) ones=SegmentReader.createFakeNorms(maxDoc()); return ones; } public synchronized byte[] norms(String field) throws IOException { ensureOpen(); byte[] bytes = (byte[])normsCache.get(field); if (bytes != null) return bytes; // cache hit if (!hasNorms(field)) return getDisableFakeNorms() ? 
null : fakeNorms(); bytes = new byte[maxDoc()]; for (int i = 0; i < subReaders.length; i++) subReaders[i].norms(field, bytes, starts[i]); normsCache.put(field, bytes); // update cache return bytes; } public synchronized void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); byte[] bytes = (byte[])normsCache.get(field); for (int i = 0; i < subReaders.length; i++) // read from segments subReaders[i].norms(field, result, offset + starts[i]); if (bytes==null && !hasNorms(field)) { Arrays.fill(result, offset, result.length, DefaultSimilarity.encodeNorm(1.0f)); } else if (bytes != null) { // cache hit System.arraycopy(bytes, 0, result, offset, maxDoc()); } else { for (int i = 0; i < subReaders.length; i++) { // read from segments subReaders[i].norms(field, result, offset + starts[i]); } } } protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { synchronized (normsCache) { normsCache.remove(field); // clear cache } int i = readerIndex(n); // find segment num subReaders[i].setNorm(n-starts[i], field, value); // dispatch } public TermEnum terms() throws IOException { ensureOpen(); return new MultiTermEnum(this, subReaders, starts, null); } public TermEnum terms(Term term) throws IOException { ensureOpen(); return new MultiTermEnum(this, subReaders, starts, term); } public int docFreq(Term t) throws IOException { ensureOpen(); int total = 0; // sum freqs in segments for (int i = 0; i < subReaders.length; i++) total += subReaders[i].docFreq(t); return total; } public TermDocs termDocs() throws IOException { ensureOpen(); return new MultiTermDocs(this, subReaders, starts); } public TermPositions termPositions() throws IOException { ensureOpen(); return new MultiTermPositions(this, subReaders, starts); } /** @deprecated */ protected void doCommit() throws IOException { doCommit(null); } protected void doCommit(Map commitUserData) throws IOException { for (int i = 0; i < subReaders.length; i++) subReaders[i].commit(commitUserData); } protected synchronized void doClose() throws IOException { for (int i = 0; i < subReaders.length; i++) { if (decrefOnClose[i]) { subReaders[i].decRef(); } else { subReaders[i].close(); } } // NOTE: only needed in case someone had asked for // FieldCache for top-level reader (which is generally // not a good idea): FieldCache.DEFAULT.purge(this); } public Collection getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); return DirectoryReader.getFieldNames(fieldNames, this.subReaders); } /** * Checks recursively if all subreaders are up to date. */ public boolean isCurrent() throws CorruptIndexException, IOException { for (int i = 0; i < subReaders.length; i++) { if (!subReaders[i].isCurrent()) { return false; } } // all subreaders are up to date return true; } /** Not implemented. * @throws UnsupportedOperationException */ public long getVersion() { throw new UnsupportedOperationException("MultiReader does not support this method."); } public IndexReader[] getSequentialSubReaders() { return subReaders; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermPositionVector.java0000644000175000017500000000355411474320230026242 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
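The reference-counting contract described above is what makes the usual refresh pattern safe: doReopen only returns a new MultiReader when at least one sub-reader changed, and unchanged sub-readers get their reference counts bumped. A minimal sketch of that caller-side pattern, assuming an already-open reader (the helper name is illustrative only):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;

class ReopenExample {
  /** Refreshes a reader, returning either the same instance or a newer one. */
  static IndexReader refresh(IndexReader current) throws IOException {
    IndexReader latest = current.reopen();   // may return 'current' itself if nothing changed
    if (latest != current) {
      // A new reader was returned; the old one can now be released.
      // Shared, unchanged sub-readers had their refcounts increased, so closing is safe.
      current.close();
    }
    return latest;
  }
}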
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Extends TermFreqVector to provide additional information about * positions in which each of the terms is found. A TermPositionVector not necessarily * contains both positions and offsets, but at least one of these arrays exists. */ public interface TermPositionVector extends TermFreqVector { /** Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the * term String array obtained from the indexOf method. * May return null if positions have not been stored. */ public int[] getTermPositions(int index); /** * Returns an array of TermVectorOffsetInfo in which the term is found. * May return null if offsets have not been stored. * * @see org.apache.lucene.analysis.Token * * @param index The position in the array to get the offsets from * @return An array of TermVectorOffsetInfo objects or the empty list */ public TermVectorOffsetInfo [] getOffsets(int index); }lucene-2.9.4/src/java/org/apache/lucene/index/DocInverterPerThread.java0000644000175000017500000000650311474320230026443 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a * InvertedTermsConsumer to process those terms. 
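A hedged sketch of how TermPositionVector is typically consumed. It assumes the field was indexed with term vectors that store positions and/or offsets; as noted above, either array may be null, and the document id, field, and term text below are only illustrative values:

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

class TermVectorExample {
  static void dumpPositions(IndexReader reader, int docId, String field, String termText)
      throws IOException {
    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
    if (tfv instanceof TermPositionVector) {
      TermPositionVector tpv = (TermPositionVector) tfv;
      int idx = tpv.indexOf(termText);                          // -1 if the term is absent
      if (idx >= 0) {
        int[] positions = tpv.getTermPositions(idx);            // may be null if not stored
        TermVectorOffsetInfo[] offsets = tpv.getOffsets(idx);    // may be null if not stored
        System.out.println("positions stored: " + (positions != null)
            + ", offsets stored: " + (offsets != null));
      }
    }
  }
}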
*/ final class DocInverterPerThread extends DocFieldConsumerPerThread { final DocInverter docInverter; final InvertedDocConsumerPerThread consumer; final InvertedDocEndConsumerPerThread endConsumer; //TODO: change to SingleTokenTokenStream after Token was removed final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream(); static class SingleTokenTokenStream extends TokenStream { TermAttribute termAttribute; OffsetAttribute offsetAttribute; SingleTokenTokenStream() { termAttribute = (TermAttribute) addAttribute(TermAttribute.class); offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class); } public void reinit(String stringValue, int startOffset, int endOffset) { termAttribute.setTermBuffer(stringValue); offsetAttribute.setOffset(startOffset, endOffset); } // this is a dummy, to not throw an UOE because this class does not implement any iteration method public boolean incrementToken() { throw new UnsupportedOperationException(); } } final DocumentsWriter.DocState docState; final FieldInvertState fieldState = new FieldInvertState(); // Used to read a string value for a field final ReusableStringReader stringReader = new ReusableStringReader(); public DocInverterPerThread(DocFieldProcessorPerThread docFieldProcessorPerThread, DocInverter docInverter) { this.docInverter = docInverter; docState = docFieldProcessorPerThread.docState; consumer = docInverter.consumer.addThread(this); endConsumer = docInverter.endConsumer.addThread(this); } public void startDocument() throws IOException { consumer.startDocument(); endConsumer.startDocument(); } public DocumentsWriter.DocWriter finishDocument() throws IOException { // TODO: allow endConsumer.finishDocument to also return // a DocWriter endConsumer.finishDocument(); return consumer.finishDocument(); } void abort() { try { consumer.abort(); } finally { endConsumer.abort(); } } public DocFieldConsumerPerField addField(FieldInfo fi) { return new DocInverterPerField(this, fi); } } lucene-2.9.4/src/java/org/apache/lucene/index/CompoundFileReader.java0000644000175000017500000002001711474320230026123 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Lock; import java.util.HashMap; import java.io.IOException; /** * Class for accessing a compound stream. * This class implements a directory, but is limited to only read operations. * Directory methods that would normally modify data throw an exception. 
* * * @version $Id: CompoundFileReader.java 673371 2008-07-02 11:57:27Z mikemccand $ */ class CompoundFileReader extends Directory { private int readBufferSize; private static final class FileEntry { long offset; long length; } // Base info private Directory directory; private String fileName; private IndexInput stream; private HashMap entries = new HashMap(); public CompoundFileReader(Directory dir, String name) throws IOException { this(dir, name, BufferedIndexInput.BUFFER_SIZE); } public CompoundFileReader(Directory dir, String name, int readBufferSize) throws IOException { directory = dir; fileName = name; this.readBufferSize = readBufferSize; boolean success = false; try { stream = dir.openInput(name, readBufferSize); // read the directory and init files int count = stream.readVInt(); FileEntry entry = null; for (int i=0; i length) throw new IOException("read past EOF"); base.seek(fileOffset + start); base.readBytes(b, offset, len, false); } /** Expert: implements seek. Sets current position in this file, where * the next {@link #readInternal(byte[],int,int)} will occur. * @see #readInternal(byte[],int,int) */ protected void seekInternal(long pos) {} /** Closes the stream to further operations. */ public void close() throws IOException { base.close(); } public long length() { return length; } } } lucene-2.9.4/src/java/org/apache/lucene/index/SerialMergeScheduler.java0000644000175000017500000000271411474320230026456 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** A {@link MergeScheduler} that simply does each merge * sequentially, using the current thread. */ public class SerialMergeScheduler extends MergeScheduler { /** Just do the merges in sequence. We do this * "synchronized" so that even if the application is using * multiple threads, only one merge may run at a time. */ synchronized public void merge(IndexWriter writer) throws CorruptIndexException, IOException { while(true) { MergePolicy.OneMerge merge = writer.getNextMerge(); if (merge == null) break; writer.merge(merge); } } public void close() {} } lucene-2.9.4/src/java/org/apache/lucene/index/DocConsumerPerThread.java0000644000175000017500000000240211474320230026432 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
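SerialMergeScheduler, shown in full above, replaces the default concurrent scheduler with strictly sequential, same-thread merging. A minimal sketch of installing it, assuming an already-open IndexWriter:

import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;

class SerialMergeExample {
  static void useSerialMerges(IndexWriter writer) throws IOException {
    // Merges will now run in the calling thread, one at a time,
    // instead of in background merge threads.
    writer.setMergeScheduler(new SerialMergeScheduler());
  }
}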
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; abstract class DocConsumerPerThread { /** Process the document. If there is * something for this document to be done in docID order, * you should encapsulate that as a * DocumentsWriter.DocWriter and return it. * DocumentsWriter then calls finish() on this object * when it's its turn. */ abstract DocumentsWriter.DocWriter processDocument() throws IOException; abstract void abort(); } lucene-2.9.4/src/java/org/apache/lucene/index/CompoundFileWriter.java0000644000175000017500000002172211474320230026201 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import java.util.LinkedList; import java.util.HashSet; import java.util.Iterator; import java.io.IOException; /** * Combines multiple files into a single compound file. * The file format:
     *   • VInt fileCount
     *   • {Directory}: fileCount entries with the following structure:
     *         • long dataOffset
     *         • String fileName
     *   • {File Data}: fileCount entries with the raw data of the corresponding file
    * * The fileCount integer indicates how many files are contained in this compound * file. The {directory} that follows has that many entries. Each directory entry * contains a long pointer to the start of this file's data section, and a String * with that file's name. * * * @version $Id: CompoundFileWriter.java 690539 2008-08-30 17:33:06Z mikemccand $ */ final class CompoundFileWriter { private static final class FileEntry { /** source file */ String file; /** temporary holder for the start of directory entry for this file */ long directoryOffset; /** temporary holder for the start of this file's data section */ long dataOffset; } private Directory directory; private String fileName; private HashSet ids; private LinkedList entries; private boolean merged = false; private SegmentMerger.CheckAbort checkAbort; /** Create the compound stream in the specified file. The file name is the * entire name (no extensions are added). * @throws NullPointerException if dir or name is null */ public CompoundFileWriter(Directory dir, String name) { this(dir, name, null); } CompoundFileWriter(Directory dir, String name, SegmentMerger.CheckAbort checkAbort) { if (dir == null) throw new NullPointerException("directory cannot be null"); if (name == null) throw new NullPointerException("name cannot be null"); this.checkAbort = checkAbort; directory = dir; fileName = name; ids = new HashSet(); entries = new LinkedList(); } /** Returns the directory of the compound file. */ public Directory getDirectory() { return directory; } /** Returns the name of the compound file. */ public String getName() { return fileName; } /** Add a source stream. file is the string by which the * sub-stream will be known in the compound stream. * * @throws IllegalStateException if this writer is closed * @throws NullPointerException if file is null * @throws IllegalArgumentException if a file with the same name * has been added already */ public void addFile(String file) { if (merged) throw new IllegalStateException( "Can't add extensions after merge has been called"); if (file == null) throw new NullPointerException( "file cannot be null"); if (! ids.add(file)) throw new IllegalArgumentException( "File " + file + " already added"); FileEntry entry = new FileEntry(); entry.file = file; entries.add(entry); } /** Merge files with the extensions added up to now. * All files with these extensions are combined sequentially into the * compound stream. After successful merge, the source files * are deleted. * @throws IllegalStateException if close() had been called before or * if no file has been added to this object */ public void close() throws IOException { if (merged) throw new IllegalStateException( "Merge already performed"); if (entries.isEmpty()) throw new IllegalStateException( "No entries to merge have been defined"); merged = true; // open the compound stream IndexOutput os = null; try { os = directory.createOutput(fileName); // Write the number of entries os.writeVInt(entries.size()); // Write the directory with all offsets at 0. // Remember the positions of directory entries so that we can // adjust the offsets later Iterator it = entries.iterator(); long totalSize = 0; while(it.hasNext()) { FileEntry fe = (FileEntry) it.next(); fe.directoryOffset = os.getFilePointer(); os.writeLong(0); // for now os.writeString(fe.file); totalSize += directory.fileLength(fe.file); } // Pre-allocate size of file as optimization -- // this can potentially help IO performance as // we write the file and also later during // searching. 
It also uncovers a disk-full // situation earlier and hopefully without // actually filling disk to 100%: final long finalLength = totalSize+os.getFilePointer(); os.setLength(finalLength); // Open the files and copy their data into the stream. // Remember the locations of each file's data section. byte buffer[] = new byte[16384]; it = entries.iterator(); while(it.hasNext()) { FileEntry fe = (FileEntry) it.next(); fe.dataOffset = os.getFilePointer(); copyFile(fe, os, buffer); } // Write the data offsets into the directory of the compound stream it = entries.iterator(); while(it.hasNext()) { FileEntry fe = (FileEntry) it.next(); os.seek(fe.directoryOffset); os.writeLong(fe.dataOffset); } assert finalLength == os.length(); // Close the output stream. Set the os to null before trying to // close so that if an exception occurs during the close, the // finally clause below will not attempt to close the stream // the second time. IndexOutput tmp = os; os = null; tmp.close(); } finally { if (os != null) try { os.close(); } catch (IOException e) { } } } /** Copy the contents of the file with specified extension into the * provided output stream. Use the provided buffer for moving data * to reduce memory allocation. */ private void copyFile(FileEntry source, IndexOutput os, byte buffer[]) throws IOException { IndexInput is = null; try { long startPtr = os.getFilePointer(); is = directory.openInput(source.file); long length = is.length(); long remainder = length; int chunk = buffer.length; while(remainder > 0) { int len = (int) Math.min(chunk, remainder); is.readBytes(buffer, 0, len, false); os.writeBytes(buffer, len); remainder -= len; if (checkAbort != null) // Roughly every 2 MB we will check if // it's time to abort checkAbort.work(80); } // Verify that remainder is 0 if (remainder != 0) throw new IOException( "Non-zero remainder length after copying: " + remainder + " (id: " + source.file + ", length: " + length + ", buffer size: " + chunk + ")"); // Verify that the output length diff is equal to original file long endPtr = os.getFilePointer(); long diff = endPtr - startPtr; if (diff != length) throw new IOException( "Difference in the output file offsets " + diff + " does not match the original file length " + length); } finally { if (is != null) is.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/index/ReadOnlySegmentReader.java0000644000175000017500000000232711474320230026603 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
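CompoundFileWriter and CompoundFileReader are internal classes, but whether new segments use this compound (.cfs) layout is controlled from the public API. A small sketch, assuming an already-open IndexWriter; the usual trade-off is fewer open files versus slightly slower access:

import org.apache.lucene.index.IndexWriter;

class CompoundFileExample {
  static void configureCompound(IndexWriter writer, boolean useCompound) {
    // true  -> newly flushed/merged segments are packed into a single .cfs file
    // false -> each segment keeps its individual files (more file handles needed)
    writer.setUseCompoundFile(useCompound);
  }
}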
*/ class ReadOnlySegmentReader extends SegmentReader { static void noWrite() { throw new UnsupportedOperationException("This IndexReader cannot make any changes to the index (it was opened with readOnly = true)"); } protected void acquireWriteLock() { noWrite(); } // Not synchronized public boolean isDeleted(int n) { return deletedDocs != null && deletedDocs.get(n); } } lucene-2.9.4/src/java/org/apache/lucene/index/Term.java0000644000175000017500000001067111474320230023330 0ustar janpascaljanpascalpackage org.apache.lucene.index; import org.apache.lucene.util.StringHelper; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** A Term represents a word from text. This is the unit of search. It is composed of two elements, the text of the word, as a string, and the name of the field that the text occurred in, an interned string. Note that terms may represent more than words from text fields, but also things like dates, email addresses, urls, etc. */ public final class Term implements Comparable, java.io.Serializable { String field; String text; /** Constructs a Term with the given field and text. *

    Note that a null field or null text value results in undefined * behavior for most Lucene APIs that accept a Term parameter. */ public Term(String fld, String txt) { field = StringHelper.intern(fld); text = txt; } /** Constructs a Term with the given field and empty text. * This serves two purposes: 1) reuse of a Term with the same field. * 2) pattern for a query. * * @param fld */ public Term(String fld) { this(fld, "", true); } Term(String fld, String txt, boolean intern) { field = intern ? StringHelper.intern(fld) : fld; // field names are interned text = txt; // unless already known to be } /** Returns the field of this term, an interned string. The field indicates the part of a document which this term came from. */ public final String field() { return field; } /** Returns the text of this term. In the case of words, this is simply the text of the word. In the case of dates and other types, this is an encoding of the object as a string. */ public final String text() { return text; } /** * Optimized construction of new Terms by reusing same field as this Term * - avoids field.intern() overhead * @param text The text of the new term (field is implicitly same as this Term instance) * @return A new Term */ public Term createTerm(String text) { return new Term(field,text,false); } //@Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; Term other = (Term) obj; if (field == null) { if (other.field != null) return false; } else if (!field.equals(other.field)) return false; if (text == null) { if (other.text != null) return false; } else if (!text.equals(other.text)) return false; return true; } //@Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((field == null) ? 0 : field.hashCode()); result = prime * result + ((text == null) ? 0 : text.hashCode()); return result; } public int compareTo(Object other) { return compareTo((Term)other); } /** Compares two terms, returning a negative integer if this term belongs before the argument, zero if this term is equal to the argument, and a positive integer if this term belongs after the argument. The ordering of terms is first by field, then by text.*/ public final int compareTo(Term other) { if (field == other.field) // fields are interned return text.compareTo(other.text); else return field.compareTo(other.field); } /** Resets the field and text of a Term. */ final void set(String fld, String txt) { field = fld; text = txt; } public final String toString() { return field + ":" + text; } private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException { in.defaultReadObject(); field = StringHelper.intern(field); } } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsDocsConsumer.java0000644000175000017500000000242011474320230027716 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
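The Term class above is the basic unit handed to queries, and createTerm() reuses the already-interned field name to avoid the intern overhead. A short sketch of typical construction; the field and text values are illustrative only:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

class TermExample {
  static Query[] buildQueries() {
    Term first = new Term("contents", "lucene");   // interns the field name
    Term second = first.createTerm("search");      // reuses the interned field, no re-intern
    // Terms order first by field, then by text:
    assert first.compareTo(second) < 0;            // "lucene" sorts before "search"
    return new Query[] { new TermQuery(first), new TermQuery(second) };
  }
}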
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * NOTE: this API is experimental and will likely change */ abstract class FormatPostingsDocsConsumer { /** Adds a new doc in this term. If this returns null * then we just skip consuming positions/payloads. */ abstract FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException; /** Called when we are done adding docs to this term */ abstract void finish() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/InvertedDocConsumer.java0000644000175000017500000000303311474320230026335 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Map; import java.io.IOException; abstract class InvertedDocConsumer { /** Add a new thread */ abstract InvertedDocConsumerPerThread addThread(DocInverterPerThread docInverterPerThread); /** Abort (called after hitting AbortException) */ abstract void abort(); /** Flush a new segment */ abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException; /** Close doc stores */ abstract void closeDocStore(SegmentWriteState state) throws IOException; /** Attempt to free RAM, returning true if any RAM was * freed */ abstract boolean freeRAM(); FieldInfos fieldInfos; void setFieldInfos(FieldInfos fieldInfos) { this.fieldInfos = fieldInfos; } } lucene-2.9.4/src/java/org/apache/lucene/index/MultiLevelSkipListWriter.java0000644000175000017500000001204711474320230027362 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; /** * This abstract class writes skip lists with multiple levels. * * Example for skipInterval = 3: * c (skip level 2) * c c c (skip level 1) * x x x x x x x x x x (skip level 0) * d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) * 3 6 9 12 15 18 21 24 27 30 (df) * * d - document * x - skip data * c - skip data with child pointer * * Skip level i contains every skipInterval-th entry from skip level i-1. * Therefore the number of entries on level i is: floor(df / ((skipInterval ^ (i + 1))). * * Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1. * This guarantees a logarithmic amount of skips to find the target document. * * While this class takes care of writing the different skip levels, * subclasses must define the actual format of the skip data. * */ abstract class MultiLevelSkipListWriter { // number of levels in this skip list private int numberOfSkipLevels; // the skip interval in the list with level = 0 private int skipInterval; // for every skip level a different buffer is used private RAMOutputStream[] skipBuffer; protected MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df) { this.skipInterval = skipInterval; // calculate the maximum number of skip levels for this document frequency numberOfSkipLevels = df == 0 ? 0 : (int) Math.floor(Math.log(df) / Math.log(skipInterval)); // make sure it does not exceed maxSkipLevels if (numberOfSkipLevels > maxSkipLevels) { numberOfSkipLevels = maxSkipLevels; } } protected void init() { skipBuffer = new RAMOutputStream[numberOfSkipLevels]; for (int i = 0; i < numberOfSkipLevels; i++) { skipBuffer[i] = new RAMOutputStream(); } } protected void resetSkip() { // creates new buffers or empties the existing ones if (skipBuffer == null) { init(); } else { for (int i = 0; i < skipBuffer.length; i++) { skipBuffer[i].reset(); } } } /** * Subclasses must implement the actual skip data encoding in this method. * * @param level the level skip data shall be writing for * @param skipBuffer the skip buffer to write to */ protected abstract void writeSkipData(int level, IndexOutput skipBuffer) throws IOException; /** * Writes the current skip data to the buffers. The current document frequency determines * the max level is skip data is to be written to. * * @param df the current document frequency * @throws IOException */ void bufferSkip(int df) throws IOException { int numLevels; // determine max level for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval) { numLevels++; } long childPointer = 0; for (int level = 0; level < numLevels; level++) { writeSkipData(level, skipBuffer[level]); long newChildPointer = skipBuffer[level].getFilePointer(); if (level != 0) { // store child pointers for all levels except the lowest skipBuffer[level].writeVLong(childPointer); } //remember the childPointer for the next level childPointer = newChildPointer; } } /** * Writes the buffered skip lists to the given output. 
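The level arithmetic in the class comment above can be checked with a few lines; this is only a worked example of those formulas (skipInterval = 3 and df = 30, as in the diagram), not part of the writer itself:

class SkipLevelMath {
  public static void main(String[] args) {
    int skipInterval = 3;
    int df = 30;
    // numberOfSkipLevels = floor(log(df) / log(skipInterval)) = floor(3.09...) = 3
    int levels = (int) Math.floor(Math.log(df) / Math.log(skipInterval));
    System.out.println("levels = " + levels);                        // 3
    // entries on level i = floor(df / skipInterval^(i+1)):
    for (int i = 0; i < levels; i++) {
      int entries = df / (int) Math.pow(skipInterval, i + 1);
      System.out.println("level " + i + ": " + entries + " entries"); // 10, 3, 1
    }
  }
}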
* * @param output the IndexOutput the skip lists shall be written to * @return the pointer the skip list starts */ long writeSkip(IndexOutput output) throws IOException { long skipPointer = output.getFilePointer(); if (skipBuffer == null || skipBuffer.length == 0) return skipPointer; for (int level = numberOfSkipLevels - 1; level > 0; level--) { long length = skipBuffer[level].getFilePointer(); if (length > 0) { output.writeVLong(length); skipBuffer[level].writeTo(output); } } skipBuffer[0].writeTo(output); return skipPointer; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermBuffer.java0000644000175000017500000001053111474320230024455 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.UnicodeUtil; final class TermBuffer implements Cloneable { private String field; private Term term; // cached private boolean preUTF8Strings; // true if strings are stored in modified UTF8 encoding (LUCENE-510) private boolean dirty; // true if text was set externally (ie not read via UTF8 bytes) private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); private UnicodeUtil.UTF8Result bytes = new UnicodeUtil.UTF8Result(); public final int compareTo(TermBuffer other) { if (field == other.field) // fields are interned return compareChars(text.result, text.length, other.text.result, other.text.length); else return field.compareTo(other.field); } private static final int compareChars(char[] chars1, int len1, char[] chars2, int len2) { final int end = len1 < len2 ? len1:len2; for (int k = 0; k < end; k++) { char c1 = chars1[k]; char c2 = chars2[k]; if (c1 != c2) { return c1 - c2; } } return len1 - len2; } /** Call this if the IndexInput passed to {@link #read} * stores terms in the "modified UTF8" (pre LUCENE-510) * format. 
*/ void setPreUTF8Strings() { preUTF8Strings = true; } public final void read(IndexInput input, FieldInfos fieldInfos) throws IOException { this.term = null; // invalidate cache int start = input.readVInt(); int length = input.readVInt(); int totalLength = start + length; if (preUTF8Strings) { text.setLength(totalLength); input.readChars(text.result, start, length); } else { if (dirty) { // Fully convert all bytes since bytes is dirty UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, 0, totalLength, text); dirty = false; } else { // Incrementally convert only the UTF8 bytes that are new: bytes.setLength(totalLength); input.readBytes(bytes.result, start, length); UnicodeUtil.UTF8toUTF16(bytes.result, start, length, text); } } this.field = fieldInfos.fieldName(input.readVInt()); } public final void set(Term term) { if (term == null) { reset(); return; } final String termText = term.text(); final int termLen = termText.length(); text.setLength(termLen); termText.getChars(0, termLen, text.result, 0); dirty = true; field = term.field(); this.term = term; } public final void set(TermBuffer other) { text.copyText(other.text); dirty = true; field = other.field; term = other.term; } public void reset() { field = null; text.setLength(0); term = null; dirty = true; } public Term toTerm() { if (field == null) // unset return null; if (term == null) term = new Term(field, new String(text.result, 0, text.length), false); return term; } protected Object clone() { TermBuffer clone = null; try { clone = (TermBuffer)super.clone(); } catch (CloneNotSupportedException e) {} clone.dirty = true; clone.bytes = new UnicodeUtil.UTF8Result(); clone.text = new UnicodeUtil.UTF16Result(); clone.text.copyText(text); return clone; } } lucene-2.9.4/src/java/org/apache/lucene/index/CheckIndex.java0000644000175000017500000010123611474320230024424 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.document.AbstractField; // for javadocs import org.apache.lucene.document.Document; import java.text.NumberFormat; import java.io.PrintStream; import java.io.IOException; import java.io.File; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.ArrayList; import java.util.Map; /** * Basic tool and API to check the health of an index and * write a new segments file that removes reference to * problematic segments. * *

    As this tool checks every byte in the index, on a large * index it can take quite a long time to run. * *
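Besides the command line, the same checks can be driven from code. A minimal sketch, assuming the index Directory is already open and no IndexWriter is currently using it:

import java.io.IOException;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.Directory;

class CheckIndexExample {
  static boolean verify(Directory dir) throws IOException {
    CheckIndex checker = new CheckIndex(dir);
    checker.setInfoStream(System.out);               // print per-segment details
    CheckIndex.Status status = checker.checkIndex(); // inspects every segment
    // status.clean is true when no problems were found.
    // checker.fixIndex(status) would drop broken segments; destructive, use with care.
    return status.clean;
  }
}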

    WARNING: this tool and API is new and * experimental and is subject to suddenly change in the * next release. Please make a complete backup of your * index before using this to fix your index! */ public class CheckIndex { /** Default PrintStream for all CheckIndex instances. * @deprecated Use {@link #setInfoStream} per instance, * instead. */ public static PrintStream out = null; private PrintStream infoStream; private Directory dir; /** * Returned from {@link #checkIndex()} detailing the health and status of the index. * *

    WARNING: this API is new and experimental and is * subject to suddenly change in the next release. **/ public static class Status { /** True if no problems were found with the index. */ public boolean clean; /** True if we were unable to locate and load the segments_N file. */ public boolean missingSegments; /** True if we were unable to open the segments_N file. */ public boolean cantOpenSegments; /** True if we were unable to read the version number from segments_N file. */ public boolean missingSegmentVersion; /** Name of latest segments_N file in the index. */ public String segmentsFileName; /** Number of segments in the index. */ public int numSegments; /** String description of the version of the index. */ public String segmentFormat; /** Empty unless you passed specific segments list to check as optional 3rd argument. * @see CheckIndex#checkIndex(List) */ public List/**/ segmentsChecked = new ArrayList(); /** True if the index was created with a newer version of Lucene than the CheckIndex tool. */ public boolean toolOutOfDate; /** List of {@link SegmentInfoStatus} instances, detailing status of each segment. */ public List/*WARNING: this API is new and experimental and is * subject to suddenly change in the next release. */ public static class SegmentInfoStatus { /** Name of the segment. */ public String name; /** Document count (does not take deletions into account). */ public int docCount; /** True if segment is compound file format. */ public boolean compound; /** Number of files referenced by this segment. */ public int numFiles; /** Net size (MB) of the files referenced by this * segment. */ public double sizeMB; /** Doc store offset, if this segment shares the doc * store files (stored fields and term vectors) with * other segments. This is -1 if it does not share. */ public int docStoreOffset = -1; /** String of the shared doc store segment, or null if * this segment does not share the doc store files. */ public String docStoreSegment; /** True if the shared doc store files are compound file * format. */ public boolean docStoreCompoundFile; /** True if this segment has pending deletions. */ public boolean hasDeletions; /** Name of the current deletions file name. */ public String deletionsFileName; /** Number of deleted documents. */ public int numDeleted; /** True if we were able to open a SegmentReader on this * segment. */ public boolean openReaderPassed; /** Number of fields in this segment. */ int numFields; /** True if at least one of the fields in this segment * does not omitTermFreqAndPositions. * @see AbstractField#setOmitTermFreqAndPositions */ public boolean hasProx; /** Map that includes certain * debugging details that IndexWriter records into * each segment it creates */ public Map diagnostics; /** Status for testing of field norms (null if field norms could not be tested). */ public FieldNormStatus fieldNormStatus; /** Status for testing of indexed terms (null if indexed terms could not be tested). */ public TermIndexStatus termIndexStatus; /** Status for testing of stored fields (null if stored fields could not be tested). */ public StoredFieldStatus storedFieldStatus; /** Status for testing of term vectors (null if term vectors could not be tested). */ public TermVectorStatus termVectorStatus; } /** * Status from testing field norms. 
*/ public static final class FieldNormStatus { /** Number of fields successfully tested */ public long totFields = 0L; /** Exception thrown during term index test (null on success) */ public Throwable error = null; } /** * Status from testing term index. */ public static final class TermIndexStatus { /** Total term count */ public long termCount = 0L; /** Total frequency across all terms. */ public long totFreq = 0L; /** Total number of positions. */ public long totPos = 0L; /** Exception thrown during term index test (null on success) */ public Throwable error = null; } /** * Status from testing stored fields. */ public static final class StoredFieldStatus { /** Number of documents tested. */ public int docCount = 0; /** Total number of stored fields tested. */ public long totFields = 0; /** Exception thrown during stored fields test (null on success) */ public Throwable error = null; } /** * Status from testing stored fields. */ public static final class TermVectorStatus { /** Number of documents tested. */ public int docCount = 0; /** Total number of term vectors tested. */ public long totVectors = 0; /** Exception thrown during term vector test (null on success) */ public Throwable error = null; } } /** Create a new CheckIndex on the directory. */ public CheckIndex(Directory dir) { this.dir = dir; infoStream = out; } /** Set infoStream where messages should go. If null, no * messages are printed */ public void setInfoStream(PrintStream out) { infoStream = out; } private void msg(String msg) { if (infoStream != null) infoStream.println(msg); } private static class MySegmentTermDocs extends SegmentTermDocs { int delCount; MySegmentTermDocs(SegmentReader p) { super(p); } public void seek(Term term) throws IOException { super.seek(term); delCount = 0; } protected void skippingDoc() throws IOException { delCount++; } } /** Returns true if index is clean, else false. * @deprecated Please instantiate a CheckIndex and then use {@link #checkIndex()} instead */ public static boolean check(Directory dir, boolean doFix) throws IOException { return check(dir, doFix, null); } /** Returns true if index is clean, else false. * @deprecated Please instantiate a CheckIndex and then use {@link #checkIndex(List)} instead */ public static boolean check(Directory dir, boolean doFix, List onlySegments) throws IOException { CheckIndex checker = new CheckIndex(dir); Status status = checker.checkIndex(onlySegments); if (doFix && !status.clean) checker.fixIndex(status); return status.clean; } /** Returns a {@link Status} instance detailing * the state of the index. * *

    As this method checks every byte in the index, on a large * index it can take quite a long time to run. * *

    WARNING: make sure * you only call this when the index is not opened by any * writer. */ public Status checkIndex() throws IOException { return checkIndex(null); } /** Returns a {@link Status} instance detailing * the state of the index. * * @param onlySegments list of specific segment names to check * *

    As this method checks every byte in the specified * segments, on a large index it can take quite a long * time to run. * *

    WARNING: make sure * you only call this when the index is not opened by any * writer. */ public Status checkIndex(List onlySegments) throws IOException { NumberFormat nf = NumberFormat.getInstance(); SegmentInfos sis = new SegmentInfos(); Status result = new Status(); result.dir = dir; try { sis.read(dir); } catch (Throwable t) { msg("ERROR: could not read any segments file in directory"); result.missingSegments = true; if (infoStream != null) t.printStackTrace(infoStream); return result; } final int numSegments = sis.size(); final String segmentsFileName = sis.getCurrentSegmentFileName(); IndexInput input = null; try { input = dir.openInput(segmentsFileName); } catch (Throwable t) { msg("ERROR: could not open segments file in directory"); if (infoStream != null) t.printStackTrace(infoStream); result.cantOpenSegments = true; return result; } int format = 0; try { format = input.readInt(); } catch (Throwable t) { msg("ERROR: could not read segment file version in directory"); if (infoStream != null) t.printStackTrace(infoStream); result.missingSegmentVersion = true; return result; } finally { if (input != null) input.close(); } String sFormat = ""; boolean skip = false; if (format == SegmentInfos.FORMAT) sFormat = "FORMAT [Lucene Pre-2.1]"; if (format == SegmentInfos.FORMAT_LOCKLESS) sFormat = "FORMAT_LOCKLESS [Lucene 2.1]"; else if (format == SegmentInfos.FORMAT_SINGLE_NORM_FILE) sFormat = "FORMAT_SINGLE_NORM_FILE [Lucene 2.2]"; else if (format == SegmentInfos.FORMAT_SHARED_DOC_STORE) sFormat = "FORMAT_SHARED_DOC_STORE [Lucene 2.3]"; else { if (format == SegmentInfos.FORMAT_CHECKSUM) sFormat = "FORMAT_CHECKSUM [Lucene 2.4]"; else if (format == SegmentInfos.FORMAT_DEL_COUNT) sFormat = "FORMAT_DEL_COUNT [Lucene 2.4]"; else if (format == SegmentInfos.FORMAT_HAS_PROX) sFormat = "FORMAT_HAS_PROX [Lucene 2.4]"; else if (format == SegmentInfos.FORMAT_USER_DATA) sFormat = "FORMAT_USER_DATA [Lucene 2.9]"; else if (format == SegmentInfos.FORMAT_DIAGNOSTICS) sFormat = "FORMAT_DIAGNOSTICS [Lucene 2.9]"; else if (format < SegmentInfos.CURRENT_FORMAT) { sFormat = "int=" + format + " [newer version of Lucene than this tool]"; skip = true; } else { sFormat = format + " [Lucene 1.3 or prior]"; } } result.segmentsFileName = segmentsFileName; result.numSegments = numSegments; result.segmentFormat = sFormat; result.userData = sis.getUserData(); String userDataString; if (sis.getUserData().size() > 0) { userDataString = " userData=" + sis.getUserData(); } else { userDataString = ""; } msg("Segments file=" + segmentsFileName + " numSegments=" + numSegments + " version=" + sFormat + userDataString); if (onlySegments != null) { result.partial = true; if (infoStream != null) infoStream.print("\nChecking only these segments:"); Iterator it = onlySegments.iterator(); while (it.hasNext()) { if (infoStream != null) infoStream.print(" " + it.next()); } result.segmentsChecked.addAll(onlySegments); msg(":"); } if (skip) { msg("\nERROR: this index appears to be created by a newer version of Lucene than this tool was compiled on; please re-compile this tool on the matching version of Lucene; exiting"); result.toolOutOfDate = true; return result; } result.newSegments = (SegmentInfos) sis.clone(); result.newSegments.clear(); for(int i=0;i= maxDoc) throw new RuntimeException("term " + term + ": doc " + doc + " >= maxDoc " + maxDoc); lastDoc = doc; if (freq <= 0) throw new RuntimeException("term " + term + ": doc " + doc + ": freq " + freq + " is out of bounds"); int lastPos = -1; status.totPos += freq; for(int 
j=0;jWARNING: this writes a * new segments file into the index, effectively removing * all documents in broken segments from the index. * BE CAREFUL. * *

    WARNING: Make sure you only call this when the * index is not opened by any writer. */ public void fixIndex(Status result) throws IOException { if (result.partial) throw new IllegalArgumentException("can only fix an index that was fully checked (this status checked a subset of segments)"); result.newSegments.commit(result.dir); } private static boolean assertsOn; private static boolean testAsserts() { assertsOn = true; return true; } private static boolean assertsOn() { assert testAsserts(); return assertsOn; } /** Command-line interface to check and fix an index.

    Run it like this:

        java -ea:org.apache.lucene... org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]
        
    • -fix: actually write a new segments_N file, removing any problematic segments
    • -segment X: only check the specified segment(s). This can be specified multiple times to check more than one segment, e.g. -segment _2 -segment _a. You can't use this with the -fix option.

    WARNING: -fix should only be used on an emergency basis as it will cause documents (perhaps many) to be permanently removed from the index. Always make a backup copy of your index before running this! Do not run this tool on an index that is actively being written to. You have been warned!

    Run without -fix, this tool will open the index, report version information and report any exceptions it hits and what action it would take if -fix were specified. With -fix, this tool will remove any segments that have issues and write a new segments_N file. This means all documents contained in the affected segments will be removed.

    This tool exits with exit code 1 if the index cannot be opened or has any corruption, else 0. */ public static void main(String[] args) throws IOException, InterruptedException { boolean doFix = false; List onlySegments = new ArrayList(); String indexPath = null; int i = 0; while(i < args.length) { if (args[i].equals("-fix")) { doFix = true; i++; } else if (args[i].equals("-segment")) { if (i == args.length-1) { System.out.println("ERROR: missing name for -segment option"); System.exit(1); } onlySegments.add(args[i+1]); i += 2; } else { if (indexPath != null) { System.out.println("ERROR: unexpected extra argument '" + args[i] + "'"); System.exit(1); } indexPath = args[i]; i++; } } if (indexPath == null) { System.out.println("\nERROR: index path not specified"); System.out.println("\nUsage: java org.apache.lucene.index.CheckIndex pathToIndex [-fix] [-segment X] [-segment Y]\n" + "\n" + " -fix: actually write a new segments_N file, removing any problematic segments\n" + " -segment X: only check the specified segments. This can be specified multiple\n" + " times, to check more than one segment, eg '-segment _2 -segment _a'.\n" + " You can't use this with the -fix option\n" + "\n" + "**WARNING**: -fix should only be used on an emergency basis as it will cause\n" + "documents (perhaps many) to be permanently removed from the index. Always make\n" + "a backup copy of your index before running this! Do not run this tool on an index\n" + "that is actively being written to. You have been warned!\n" + "\n" + "Run without -fix, this tool will open the index, report version information\n" + "and report any exceptions it hits and what action it would take if -fix were\n" + "specified. With -fix, this tool will remove any segments that have issues and\n" + "write a new segments_N file. This means all documents contained in the affected\n" + "segments will be removed.\n" + "\n" + "This tool exits with exit code 1 if the index cannot be opened or has any\n" + "corruption, else 0.\n"); System.exit(1); } if (!assertsOn()) System.out.println("\nNOTE: testing will be more thorough if you run java with '-ea:org.apache.lucene...', so assertions are enabled"); if (onlySegments.size() == 0) onlySegments = null; else if (doFix) { System.out.println("ERROR: cannot specify both -fix and -segment"); System.exit(1); } System.out.println("\nOpening index @ " + indexPath + "\n"); Directory dir = null; try { dir = FSDirectory.open(new File(indexPath)); } catch (Throwable t) { System.out.println("ERROR: could not open directory \"" + indexPath + "\"; exiting"); t.printStackTrace(System.out); System.exit(1); } CheckIndex checker = new CheckIndex(dir); checker.setInfoStream(System.out); Status result = checker.checkIndex(onlySegments); if (result.missingSegments) { System.exit(1); } if (!result.clean) { if (!doFix) { System.out.println("WARNING: would write new segments file, and " + result.totLoseDocCount + " documents would be lost, if -fix were specified\n"); } else { System.out.println("WARNING: " + result.totLoseDocCount + " documents will be lost\n"); System.out.println("NOTE: will write new segments file in 5 seconds; this will remove " + result.totLoseDocCount + " docs from the index. 
THIS IS YOUR LAST CHANCE TO CTRL+C!"); for(int s=0;s<5;s++) { Thread.sleep(1000); System.out.println(" " + (5-s) + "..."); } System.out.println("Writing..."); checker.fixIndex(result); System.out.println("OK"); System.out.println("Wrote new segments file \"" + result.newSegments.getCurrentSegmentFileName() + "\""); } } System.out.println(""); final int exitCode; if (result != null && result.clean == true) exitCode = 0; else exitCode = 1; System.exit(exitCode); } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentTermDocs.java0000644000175000017500000001514311474320230025463 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.util.BitVector; import org.apache.lucene.store.IndexInput; class SegmentTermDocs implements TermDocs { protected SegmentReader parent; protected IndexInput freqStream; protected int count; protected int df; protected BitVector deletedDocs; int doc = 0; int freq; private int skipInterval; private int maxSkipLevels; private DefaultSkipListReader skipListReader; private long freqBasePointer; private long proxBasePointer; private long skipPointer; private boolean haveSkipped; protected boolean currentFieldStoresPayloads; protected boolean currentFieldOmitTermFreqAndPositions; protected SegmentTermDocs(SegmentReader parent) { this.parent = parent; this.freqStream = (IndexInput) parent.core.freqStream.clone(); synchronized (parent) { this.deletedDocs = parent.deletedDocs; } this.skipInterval = parent.core.getTermsReader().getSkipInterval(); this.maxSkipLevels = parent.core.getTermsReader().getMaxSkipLevels(); } public void seek(Term term) throws IOException { TermInfo ti = parent.core.getTermsReader().get(term); seek(ti, term); } public void seek(TermEnum termEnum) throws IOException { TermInfo ti; Term term; // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs if (termEnum instanceof SegmentTermEnum && ((SegmentTermEnum) termEnum).fieldInfos == parent.core.fieldInfos) { // optimized case SegmentTermEnum segmentTermEnum = ((SegmentTermEnum) termEnum); term = segmentTermEnum.term(); ti = segmentTermEnum.termInfo(); } else { // punt case term = termEnum.term(); ti = parent.core.getTermsReader().get(term); } seek(ti, term); } void seek(TermInfo ti, Term term) throws IOException { count = 0; FieldInfo fi = parent.core.fieldInfos.fieldInfo(term.field); currentFieldOmitTermFreqAndPositions = (fi != null) ? fi.omitTermFreqAndPositions : false; currentFieldStoresPayloads = (fi != null) ? 
fi.storePayloads : false; if (ti == null) { df = 0; } else { df = ti.docFreq; doc = 0; freqBasePointer = ti.freqPointer; proxBasePointer = ti.proxPointer; skipPointer = freqBasePointer + ti.skipOffset; freqStream.seek(freqBasePointer); haveSkipped = false; } } public void close() throws IOException { freqStream.close(); if (skipListReader != null) skipListReader.close(); } public final int doc() { return doc; } public final int freq() { return freq; } protected void skippingDoc() throws IOException { } public boolean next() throws IOException { while (true) { if (count == df) return false; final int docCode = freqStream.readVInt(); if (currentFieldOmitTermFreqAndPositions) { doc += docCode; freq = 1; } else { doc += docCode >>> 1; // shift off low bit if ((docCode & 1) != 0) // if low bit is set freq = 1; // freq is one else freq = freqStream.readVInt(); // else read freq } count++; if (deletedDocs == null || !deletedDocs.get(doc)) break; skippingDoc(); } return true; } /** Optimized implementation. */ public int read(final int[] docs, final int[] freqs) throws IOException { final int length = docs.length; if (currentFieldOmitTermFreqAndPositions) { return readNoTf(docs, freqs, length); } else { int i = 0; while (i < length && count < df) { // manually inlined call to next() for speed final int docCode = freqStream.readVInt(); doc += docCode >>> 1; // shift off low bit if ((docCode & 1) != 0) // if low bit is set freq = 1; // freq is one else freq = freqStream.readVInt(); // else read freq count++; if (deletedDocs == null || !deletedDocs.get(doc)) { docs[i] = doc; freqs[i] = freq; ++i; } } return i; } } private final int readNoTf(final int[] docs, final int[] freqs, final int length) throws IOException { int i = 0; while (i < length && count < df) { // manually inlined call to next() for speed doc += freqStream.readVInt(); count++; if (deletedDocs == null || !deletedDocs.get(doc)) { docs[i] = doc; // Hardware freq to 1 when term freqs were not // stored in the index freqs[i] = 1; ++i; } } return i; } /** Overridden by SegmentTermPositions to skip in prox stream. */ protected void skipProx(long proxPointer, int payloadLength) throws IOException {} /** Optimized implementation. */ public boolean skipTo(int target) throws IOException { if (df >= skipInterval) { // optimized case if (skipListReader == null) skipListReader = new DefaultSkipListReader((IndexInput) freqStream.clone(), maxSkipLevels, skipInterval); // lazily clone if (!haveSkipped) { // lazily initialize skip stream skipListReader.init(skipPointer, freqBasePointer, proxBasePointer, df, currentFieldStoresPayloads); haveSkipped = true; } int newCount = skipListReader.skipTo(target); if (newCount > count) { freqStream.seek(skipListReader.getFreqPointer()); skipProx(skipListReader.getProxPointer(), skipListReader.getPayloadLength()); doc = skipListReader.getDoc(); count = newCount; } } // done skipping, now just scan do { if (!next()) return false; } while (target > doc); return true; } } lucene-2.9.4/src/java/org/apache/lucene/index/LogDocMergePolicy.java0000644000175000017500000000437311474320230025732 0ustar janpascaljanpascalpackage org.apache.lucene.index; import java.io.IOException; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
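// Illustrative sketch, not part of the Lucene 2.9.4 sources: SegmentTermDocs above
// is the per-segment implementation behind the public TermDocs API, which callers
// normally obtain through IndexReader. The index path and the field/term values
// below are placeholders.

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.FSDirectory;

public final class TermDocsExample {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")), true);
    TermDocs td = reader.termDocs(new Term("contents", "lucene"));
    try {
      while (td.next()) {   // deleted documents are skipped internally
        System.out.println("doc=" + td.doc() + " freq=" + td.freq());  // freq() is 1 when term freqs are omitted
      }
    } finally {
      td.close();
      reader.close();
    }
  }
}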
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** This is a {@link LogMergePolicy} that measures size of a * segment as the number of documents (not taking deletions * into account). */ public class LogDocMergePolicy extends LogMergePolicy { /** Default minimum segment size. @see setMinMergeDocs */ public static final int DEFAULT_MIN_MERGE_DOCS = 1000; public LogDocMergePolicy(IndexWriter writer) { super(writer); minMergeSize = DEFAULT_MIN_MERGE_DOCS; // maxMergeSize is never used by LogDocMergePolicy; set // it to Long.MAX_VALUE to disable it maxMergeSize = Long.MAX_VALUE; } protected long size(SegmentInfo info) throws IOException { return sizeDocs(info); } /** Sets the minimum size for the lowest level segments. * Any segments below this size are considered to be on * the same level (even if they vary drastically in size) * and will be merged whenever there are mergeFactor of * them. This effectively truncates the "long tail" of * small segments that would otherwise be created into a * single level. If you set this too large, it could * greatly increase the merging cost during indexing (if * you flush many small segments). */ public void setMinMergeDocs(int minMergeDocs) { minMergeSize = minMergeDocs; } /** Get the minimum size for a segment to remain * un-merged. * @see #setMinMergeDocs **/ public int getMinMergeDocs() { return (int) minMergeSize; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocumentsWriterThreadState.java0000644000175000017500000000406511474320230027710 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** Used by DocumentsWriter to maintain per-thread state. * We keep a separate Posting hash and other state for each * thread and then merge postings hashes from all threads * when writing the segment. 
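// Illustrative sketch, not part of the Lucene 2.9.4 sources: wiring the
// document-count based LogDocMergePolicy above into an IndexWriter. The index
// path and the numeric settings are placeholders.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public final class DocCountMergePolicyExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index"));
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_29),
                                         true, IndexWriter.MaxFieldLength.UNLIMITED);
    LogDocMergePolicy mp = new LogDocMergePolicy(writer);
    mp.setMinMergeDocs(LogDocMergePolicy.DEFAULT_MIN_MERGE_DOCS); // segments below this doc count share the lowest level
    mp.setMergeFactor(10);                                        // merge once 10 same-level segments accumulate
    writer.setMergePolicy(mp);
    // ... add documents ...
    writer.close();
  }
}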
*/ final class DocumentsWriterThreadState { boolean isIdle = true; // false if this is currently in use by a thread int numThreads = 1; // Number of threads that share this instance boolean doFlushAfter; // true if we should flush after processing current doc final DocConsumerPerThread consumer; final DocumentsWriter.DocState docState; final DocumentsWriter docWriter; public DocumentsWriterThreadState(DocumentsWriter docWriter) throws IOException { this.docWriter = docWriter; docState = new DocumentsWriter.DocState(); docState.maxFieldLength = docWriter.maxFieldLength; docState.infoStream = docWriter.infoStream; docState.similarity = docWriter.similarity; docState.docWriter = docWriter; docState.allowMinus1Position = docWriter.writer.getAllowMinus1Position(); consumer = docWriter.consumer.addThread(this); } void doAfterFlush() { numThreads = 0; doFlushAfter = false; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHashPerThread.java0000644000175000017500000000750711474320230026122 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; final class TermsHashPerThread extends InvertedDocConsumerPerThread { final TermsHash termsHash; final TermsHashConsumerPerThread consumer; final TermsHashPerThread nextPerThread; final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; final boolean primary; final DocumentsWriter.DocState docState; final RawPostingList freePostings[] = new RawPostingList[256]; int freePostingsCount; public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) { docState = docInverterPerThread.docState; this.termsHash = termsHash; this.consumer = termsHash.consumer.addThread(this); if (nextTermsHash != null) { // We are primary charPool = new CharBlockPool(termsHash.docWriter); primary = true; } else { charPool = primaryPerThread.charPool; primary = false; } intPool = new IntBlockPool(termsHash.docWriter, termsHash.trackAllocations); bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator, termsHash.trackAllocations); if (nextTermsHash != null) nextPerThread = nextTermsHash.addThread(docInverterPerThread, this); else nextPerThread = null; } InvertedDocConsumerPerField addField(DocInverterPerField docInverterPerField, final FieldInfo fieldInfo) { return new TermsHashPerField(docInverterPerField, this, nextPerThread, fieldInfo); } synchronized public void abort() { reset(true); consumer.abort(); if (nextPerThread != null) nextPerThread.abort(); } // perField calls this when it needs more postings: void morePostings() throws IOException { assert freePostingsCount == 0; termsHash.getPostings(freePostings); freePostingsCount = freePostings.length; assert noNullPostings(freePostings, freePostingsCount, "consumer=" + consumer); } private static boolean noNullPostings(RawPostingList[] postings, int count, String details) { for(int i=0;iDetermines the largest segment (measured by total * byte size of the segment's files, in MB) that may be * merged with other segments. Small values (e.g., less * than 50 MB) are best for interactive indexing, as this * limits the length of pauses while indexing to a few * seconds. Larger values are best for batched indexing * and speedier searches.

    * * Note that {@link #setMaxMergeDocs} is also * used to check whether a segment is too large for * merging (exceeding either limit is enough to exclude it).
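// Illustrative sketch, not part of the Lucene 2.9.4 sources: choosing byte-size
// based merge limits on LogByteSizeMergePolicy, the policy these methods belong
// to. The index path and the MB values are placeholders.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public final class ByteSizeMergePolicyExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index"));
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_29),
                                         true, IndexWriter.MaxFieldLength.UNLIMITED);
    LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy(writer);
    mp.setMinMergeMB(1.6);    // segments smaller than ~1.6 MB all sit on the lowest level
    mp.setMaxMergeMB(50.0);   // segments larger than 50 MB are left out of further merges
    writer.setMergePolicy(mp);
    // ... add documents ...
    writer.close();
  }
}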

    */ public void setMaxMergeMB(double mb) { maxMergeSize = (long) (mb*1024*1024); } /** Returns the largest segment (measured by total byte * size of the segment's files, in MB) that may be merged * with other segments. * @see #setMaxMergeMB */ public double getMaxMergeMB() { return ((double) maxMergeSize)/1024/1024; } /** Sets the minimum size for the lowest level segments. * Any segments below this size are considered to be on * the same level (even if they vary drastically in size) * and will be merged whenever there are mergeFactor of * them. This effectively truncates the "long tail" of * small segments that would otherwise be created into a * single level. If you set this too large, it could * greatly increase the merging cost during indexing (if * you flush many small segments). */ public void setMinMergeMB(double mb) { minMergeSize = (long) (mb*1024*1024); } /** Get the minimum size for a segment to remain * un-merged. * @see #setMinMergeMB **/ public double getMinMergeMB() { return ((double) minMergeSize)/1024/1024; } } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsDocsWriter.java0000644000175000017500000001015611474320230027404 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Consumes doc & freq, writing them using the current * index file format */ import java.io.IOException; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.store.IndexOutput; final class FormatPostingsDocsWriter extends FormatPostingsDocsConsumer { final IndexOutput out; final FormatPostingsTermsWriter parent; final FormatPostingsPositionsWriter posWriter; final DefaultSkipListWriter skipListWriter; final int skipInterval; final int totalNumDocs; boolean omitTermFreqAndPositions; boolean storePayloads; long freqStart; FieldInfo fieldInfo; FormatPostingsDocsWriter(SegmentWriteState state, FormatPostingsTermsWriter parent) throws IOException { super(); this.parent = parent; final String fileName = IndexFileNames.segmentFileName(parent.parent.segment, IndexFileNames.FREQ_EXTENSION); state.flushedFiles.add(fileName); out = parent.parent.dir.createOutput(fileName); totalNumDocs = parent.parent.totalNumDocs; // TODO: abstraction violation skipInterval = parent.parent.termsOut.skipInterval; skipListWriter = parent.parent.skipListWriter; skipListWriter.setFreqOutput(out); posWriter = new FormatPostingsPositionsWriter(state, this); } void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; storePayloads = fieldInfo.storePayloads; posWriter.setField(fieldInfo); } int lastDocID; int df; /** Adds a new doc in this term. If this returns null * then we just skip consuming positions/payloads. 
*/ FormatPostingsPositionsConsumer addDoc(int docID, int termDocFreq) throws IOException { final int delta = docID - lastDocID; if (docID < 0 || (df > 0 && delta <= 0)) throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " )"); if ((++df % skipInterval) == 0) { // TODO: abstraction violation skipListWriter.setSkipData(lastDocID, storePayloads, posWriter.lastPayloadLength); skipListWriter.bufferSkip(df); } assert docID < totalNumDocs: "docID=" + docID + " totalNumDocs=" + totalNumDocs; lastDocID = docID; if (omitTermFreqAndPositions) out.writeVInt(delta); else if (1 == termDocFreq) out.writeVInt((delta<<1) | 1); else { out.writeVInt(delta<<1); out.writeVInt(termDocFreq); } return posWriter; } private final TermInfo termInfo = new TermInfo(); // minimize consing final UnicodeUtil.UTF8Result utf8 = new UnicodeUtil.UTF8Result(); /** Called when we are done adding docs to this term */ void finish() throws IOException { long skipPointer = skipListWriter.writeSkip(out); // TODO: this is abstraction violation -- we should not // peek up into parents terms encoding format termInfo.set(df, parent.freqStart, parent.proxStart, (int) (skipPointer - parent.freqStart)); // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(parent.currentTerm, parent.currentTermStart, utf8); if (df > 0) { parent.termsOut.add(fieldInfo.number, utf8.result, utf8.length, termInfo); } lastDocID = 0; df = 0; } void close() throws IOException { out.close(); posWriter.close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHashConsumerPerThread.java0000644000175000017500000000223711474320230027631 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; abstract class TermsHashConsumerPerThread { abstract void startDocument() throws IOException; abstract DocumentsWriter.DocWriter finishDocument() throws IOException; abstract public TermsHashConsumerPerField addField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo); abstract public void abort(); } lucene-2.9.4/src/java/org/apache/lucene/index/NormsWriterPerField.java0000644000175000017500000000507611474320230026332 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
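// Small self-contained sketch, not from the Lucene sources: the docID/freq
// encoding written by FormatPostingsDocsWriter.addDoc above and decoded by
// SegmentTermDocs.next(). The doc delta is left-shifted one bit and the low bit
// flags the common freq==1 case, so no second VInt is needed then. The values
// below are examples only.

public final class DocDeltaCodecSketch {
  public static void main(String[] args) {
    int lastDocID = 40, docID = 42, termDocFreq = 1;
    int delta = docID - lastDocID;                            // 2
    int docCode = (termDocFreq == 1) ? (delta << 1) | 1       // 5: low bit set, no freq VInt follows
                                     : (delta << 1);          // low bit clear, a freq VInt would follow
    // Decoding, mirroring SegmentTermDocs.next():
    int decodedDelta = docCode >>> 1;                         // back to 2
    int decodedFreq = ((docCode & 1) != 0) ? 1 : termDocFreq; // low bit set means freq == 1
    System.out.println("docCode=" + docCode + " delta=" + decodedDelta + " freq=" + decodedFreq);
  }
}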
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.search.Similarity; /** Taps into DocInverter, as an InvertedDocEndConsumer, * which is called at the end of inverting each field. We * just look at the length for the field (docState.length) * and record the norm. */ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implements Comparable { final NormsWriterPerThread perThread; final FieldInfo fieldInfo; final DocumentsWriter.DocState docState; // Holds all docID/norm pairs we've seen int[] docIDs = new int[1]; byte[] norms = new byte[1]; int upto; final FieldInvertState fieldState; public void reset() { // Shrink back if we are overallocated now: docIDs = ArrayUtil.shrink(docIDs, upto); norms = ArrayUtil.shrink(norms, upto); upto = 0; } public NormsWriterPerField(final DocInverterPerField docInverterPerField, final NormsWriterPerThread perThread, final FieldInfo fieldInfo) { this.perThread = perThread; this.fieldInfo = fieldInfo; docState = perThread.docState; fieldState = docInverterPerField.fieldState; } void abort() { upto = 0; } public int compareTo(Object other) { return fieldInfo.name.compareTo(((NormsWriterPerField) other).fieldInfo.name); } void finish() { assert docIDs.length == norms.length; if (fieldInfo.isIndexed && !fieldInfo.omitNorms) { if (docIDs.length <= upto) { assert docIDs.length == upto; docIDs = ArrayUtil.grow(docIDs, 1+upto); norms = ArrayUtil.grow(norms, 1+upto); } final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState); norms[upto] = Similarity.encodeNorm(norm); docIDs[upto] = docState.docID; upto++; } } } lucene-2.9.4/src/java/org/apache/lucene/index/ConcurrentMergeScheduler.java0000644000175000017500000003145011474320230027360 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import java.io.IOException; import java.util.List; import java.util.ArrayList; /** A {@link MergeScheduler} that runs each merge using a * separate thread, up until a maximum number of threads * ({@link #setMaxThreadCount}) at which when a merge is * needed, the thread(s) that are updating the index will * pause until one or more merges completes. This is a * simple way to use concurrency in the indexing process * without having to create and manage application level * threads. 
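// Illustrative sketch, not part of the Lucene 2.9.4 sources: installing the
// scheduler described above on an IndexWriter and capping its background merge
// threads. The index path and thread count are placeholders.

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public final class ConcurrentMergeExample {
  public static void main(String[] args) throws Exception {
    Directory dir = FSDirectory.open(new File("/path/to/index"));
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_29),
                                         true, IndexWriter.MaxFieldLength.UNLIMITED);
    ConcurrentMergeScheduler cms = new ConcurrentMergeScheduler();
    cms.setMaxThreadCount(2);        // at most two merges run concurrently; extra callers stall
    writer.setMergeScheduler(cms);
    // ... add documents; merges now run on background threads ...
    writer.close();                  // waits for any running merges
  }
}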
*/ public class ConcurrentMergeScheduler extends MergeScheduler { private int mergeThreadPriority = -1; protected List mergeThreads = new ArrayList(); // Max number of threads allowed to be merging at once private int maxThreadCount = 1; protected Directory dir; private boolean closed; protected IndexWriter writer; protected int mergeThreadCount; public ConcurrentMergeScheduler() { if (allInstances != null) { // Only for testing addMyself(); } } /** Sets the max # simultaneous threads that may be * running. If a merge is necessary yet we already have * this many threads running, the incoming thread (that * is calling add/updateDocument) will block until * a merge thread has completed. */ public void setMaxThreadCount(int count) { if (count < 1) throw new IllegalArgumentException("count should be at least 1"); maxThreadCount = count; } /** Get the max # simultaneous threads that may be * running. @see #setMaxThreadCount. */ public int getMaxThreadCount() { return maxThreadCount; } /** Return the priority that merge threads run at. By * default the priority is 1 plus the priority of (ie, * slightly higher priority than) the first thread that * calls merge. */ public synchronized int getMergeThreadPriority() { initMergeThreadPriority(); return mergeThreadPriority; } /** Return the priority that merge threads run at. */ public synchronized void setMergeThreadPriority(int pri) { if (pri > Thread.MAX_PRIORITY || pri < Thread.MIN_PRIORITY) throw new IllegalArgumentException("priority must be in range " + Thread.MIN_PRIORITY + " .. " + Thread.MAX_PRIORITY + " inclusive"); mergeThreadPriority = pri; final int numThreads = mergeThreadCount(); for(int i=0;i Thread.MAX_PRIORITY) mergeThreadPriority = Thread.MAX_PRIORITY; } } public void close() { closed = true; } public synchronized void sync() { while(mergeThreadCount() > 0) { if (verbose()) message("now wait for threads; currently " + mergeThreads.size() + " still running"); final int count = mergeThreads.size(); if (verbose()) { for(int i=0;i= maxThreadCount) { if (verbose()) message(" too many merge threads running; stalling..."); try { wait(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } if (verbose()) message(" consider merge " + merge.segString(dir)); // OK to spawn a new merge thread to handle this // merge: merger = getMergeThread(writer, merge); mergeThreads.add(merger); if (verbose()) message(" launch new thread [" + merger.getName() + "]"); merger.start(); success = true; } } finally { if (!success) { writer.mergeFinish(merge); } } } } /** Does the actual merge, by calling {@link IndexWriter#merge} */ protected void doMerge(MergePolicy.OneMerge merge) throws IOException { writer.merge(merge); } /** Create and return a new MergeThread */ protected synchronized MergeThread getMergeThread(IndexWriter writer, MergePolicy.OneMerge merge) throws IOException { final MergeThread thread = new MergeThread(writer, merge); thread.setThreadPriority(mergeThreadPriority); thread.setDaemon(true); thread.setName("Lucene Merge Thread #" + mergeThreadCount++); return thread; } protected class MergeThread extends Thread { IndexWriter writer; MergePolicy.OneMerge startMerge; MergePolicy.OneMerge runningMerge; public MergeThread(IndexWriter writer, MergePolicy.OneMerge startMerge) throws IOException { this.writer = writer; this.startMerge = startMerge; } public synchronized void setRunningMerge(MergePolicy.OneMerge merge) { 
runningMerge = merge; } public synchronized MergePolicy.OneMerge getRunningMerge() { return runningMerge; } public void setThreadPriority(int pri) { try { setPriority(pri); } catch (NullPointerException npe) { // Strangely, Sun's JDK 1.5 on Linux sometimes // throws NPE out of here... } catch (SecurityException se) { // Ignore this because we will still run fine with // normal thread priority } } public void run() { // First time through the while loop we do the merge // that we were started with: MergePolicy.OneMerge merge = this.startMerge; try { if (verbose()) message(" merge thread: start"); while(true) { setRunningMerge(merge); doMerge(merge); // Subsequent times through the loop we do any new // merge that writer says is necessary: merge = writer.getNextMerge(); if (merge != null) { writer.mergeInit(merge); if (verbose()) message(" merge thread: do another merge " + merge.segString(dir)); } else break; } if (verbose()) message(" merge thread: done"); } catch (Throwable exc) { // Ignore the exception if it was due to abort: if (!(exc instanceof MergePolicy.MergeAbortedException)) { if (!suppressExceptions) { // suppressExceptions is normally only set during // testing. anyExceptions = true; handleMergeException(exc); } } } finally { synchronized(ConcurrentMergeScheduler.this) { ConcurrentMergeScheduler.this.notifyAll(); boolean removed = mergeThreads.remove(this); assert removed; } } } public String toString() { MergePolicy.OneMerge merge = getRunningMerge(); if (merge == null) merge = startMerge; return "merge thread: " + merge.segString(dir); } } /** Called when an exception is hit in a background merge * thread */ protected void handleMergeException(Throwable exc) { try { // When an exception is hit during merge, IndexWriter // removes any partial files and then allows another // merge to run. If whatever caused the error is not // transient then the exception will keep happening, // so, we sleep here to avoid saturating CPU in such // cases: Thread.sleep(250); } catch (InterruptedException ie) { Thread.currentThread().interrupt(); // In 3.0 this will throw InterruptedException throw new RuntimeException(ie); } throw new MergePolicy.MergeException(exc, dir); } static boolean anyExceptions = false; /** Used for testing */ public static boolean anyUnhandledExceptions() { if (allInstances == null) { throw new RuntimeException("setTestMode() was not called; often this is because your test case's setUp method fails to call super.setUp in LuceneTestCase"); } synchronized(allInstances) { final int count = allInstances.size(); // Make sure all outstanding threads are done so we see // any exceptions they may produce: for(int i=0;i * If the compoundFile flag is set, then the segments will be merged into a compound file. * * * @see #merge * @see #add */ final class SegmentMerger { /** norms header placeholder */ static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1}; private Directory directory; private String segment; private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL; private List readers = new ArrayList(); private FieldInfos fieldInfos; private int mergedDocs; private final CheckAbort checkAbort; // Whether we should merge doc stores (stored fields and // vectors files). When all segments we are merging // already share the same doc store files, we don't need // to merge the doc stores. 
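// Same-package sketch, not part of the Lucene 2.9.4 sources: SegmentMerger is
// package-private and is normally driven by IndexWriter (addIndexes, optimize,
// background merges), so this only compiles inside org.apache.lucene.index.
// Directory paths and the segment name "_merged" are placeholders.

package org.apache.lucene.index;

import java.io.File;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

final class SegmentMergerSketch {
  public static void main(String[] args) throws Exception {
    Directory dest = FSDirectory.open(new File("/path/to/destIndex"));
    SegmentMerger merger = new SegmentMerger(dest, "_merged");  // the test-only constructor shown below
    merger.add(IndexReader.open(FSDirectory.open(new File("/path/to/indexA")), true));
    merger.add(IndexReader.open(FSDirectory.open(new File("/path/to/indexB")), true));
    int mergedDocs = merger.merge();                            // writes the files of segment "_merged" into dest
    merger.closeReaders();
    System.out.println("merged " + mergedDocs + " docs");
  }
}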
private boolean mergeDocStores; /** Maximum number of contiguous documents to bulk-copy when merging stored fields */ private final static int MAX_RAW_MERGE_DOCS = 4192; /** This ctor used only by test code. * * @param dir The Directory to merge the other segments into * @param name The name of the new segment */ SegmentMerger(Directory dir, String name) { directory = dir; segment = name; checkAbort = new CheckAbort(null, null) { public void work(double units) throws MergeAbortedException { // do nothing } }; } SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) { directory = writer.getDirectory(); segment = name; if (merge != null) { checkAbort = new CheckAbort(merge, directory); } else { checkAbort = new CheckAbort(null, null) { public void work(double units) throws MergeAbortedException { // do nothing } }; } termIndexInterval = writer.getTermIndexInterval(); } boolean hasProx() { return fieldInfos.hasProx(); } /** * Add an IndexReader to the collection of readers that are to be merged * @param reader */ final void add(IndexReader reader) { readers.add(reader); } /** * * @param i The index of the reader to return * @return The ith reader to be merged */ final IndexReader segmentReader(int i) { return (IndexReader) readers.get(i); } /** * Merges the readers specified by the {@link #add} method into the directory passed to the constructor * @return The number of documents that were merged * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ final int merge() throws CorruptIndexException, IOException { return merge(true); } /** * Merges the readers specified by the {@link #add} method * into the directory passed to the constructor. * @param mergeDocStores if false, we will not merge the * stored fields nor vectors files * @return The number of documents that were merged * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException { this.mergeDocStores = mergeDocStores; // NOTE: it's important to add calls to // checkAbort.work(...) if you make any changes to this // method that will spend alot of time. The frequency // of this check impacts how long // IndexWriter.close(false) takes to actually stop the // threads. mergedDocs = mergeFields(); mergeTerms(); mergeNorms(); if (mergeDocStores && fieldInfos.hasVectors()) mergeVectors(); return mergedDocs; } /** * close all IndexReaders that have been added. * Should not be called before merge(). * @throws IOException */ final void closeReaders() throws IOException { for (Iterator iter = readers.iterator(); iter.hasNext();) { ((IndexReader) iter.next()).close(); } } final Collection getMergedFiles() throws IOException { Set fileSet = new HashSet(); // Basic files for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) { String ext = IndexFileNames.COMPOUND_EXTENSIONS[i]; if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx()) continue; if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) && !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION))) fileSet.add(segment + "." + ext); } // Fieldable norm files for (int i = 0; i < fieldInfos.size(); i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { fileSet.add(segment + "." 
+ IndexFileNames.NORMS_EXTENSION); break; } } // Vector files if (fieldInfos.hasVectors() && mergeDocStores) { for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) { fileSet.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]); } } return fileSet; } final Collection createCompoundFile(String fileName) throws IOException { Collection files = getMergedFiles(); CompoundFileWriter cfsWriter = new CompoundFileWriter(directory, fileName, checkAbort); // Now merge all added files Iterator it = files.iterator(); while (it.hasNext()) { cfsWriter.addFile((String) it.next()); } // Perform the merge cfsWriter.close(); return files; } private void addIndexed(IndexReader reader, FieldInfos fInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean storePayloads, boolean omitTFAndPositions) throws IOException { Iterator i = names.iterator(); while (i.hasNext()) { String field = (String) i.next(); fInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader .hasNorms(field), storePayloads, omitTFAndPositions); } } private SegmentReader[] matchingSegmentReaders; private int[] rawDocLengths; private int[] rawDocLengths2; private void setMatchingSegmentReaders() { // If the i'th reader is a SegmentReader and has // identical fieldName -> number mapping, then this // array will be non-null at position i: int numReaders = readers.size(); matchingSegmentReaders = new SegmentReader[numReaders]; // If this reader is a SegmentReader, and all of its // field name -> number mappings match the "merged" // FieldInfos, then we can do a bulk copy of the // stored fields: for (int i = 0; i < numReaders; i++) { IndexReader reader = (IndexReader) readers.get(i); if (reader instanceof SegmentReader) { SegmentReader segmentReader = (SegmentReader) reader; boolean same = true; FieldInfos segmentFieldInfos = segmentReader.fieldInfos(); int numFieldInfos = segmentFieldInfos.size(); for (int j = 0; same && j < numFieldInfos; j++) { same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j)); } if (same) { matchingSegmentReaders[i] = segmentReader; } } } // Used for bulk-reading raw bytes for stored fields rawDocLengths = new int[MAX_RAW_MERGE_DOCS]; rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS]; } /** * * @return The number of documents in all of the readers * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ private final int mergeFields() throws CorruptIndexException, IOException { if (!mergeDocStores) { // When we are not merging by doc stores, that means // all segments were written as part of a single // autoCommit=false IndexWriter session, so their field // name -> number mapping are the same. So, we start // with the fieldInfos of the last segment in this // case, to keep that numbering. 
final SegmentReader sr = (SegmentReader) readers.get(readers.size()-1); fieldInfos = (FieldInfos) sr.core.fieldInfos.clone(); } else { fieldInfos = new FieldInfos(); // merge field names } for (Iterator iter = readers.iterator(); iter.hasNext();) { IndexReader reader = (IndexReader) iter.next(); if (reader instanceof SegmentReader) { SegmentReader segmentReader = (SegmentReader) reader; FieldInfos readerFieldInfos = segmentReader.fieldInfos(); int numReaderFieldInfos = readerFieldInfos.size(); for (int j = 0; j < numReaderFieldInfos; j++) { FieldInfo fi = readerFieldInfos.fieldInfo(j); fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.hasNorms(fi.name), fi.storePayloads, fi.omitTermFreqAndPositions); } } else { addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false); fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false); } } fieldInfos.write(directory, segment + ".fnm"); int docCount = 0; setMatchingSegmentReaders(); if (mergeDocStores) { // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're // in merge mode, we use this FieldSelector FieldSelector fieldSelectorMerge = new FieldSelector() { public FieldSelectorResult accept(String fieldName) { return FieldSelectorResult.LOAD_FOR_MERGE; } }; // merge field values final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); try { int idx = 0; for (Iterator iter = readers.iterator(); iter.hasNext();) { final IndexReader reader = (IndexReader) iter.next(); final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++]; FieldsReader matchingFieldsReader = null; if (matchingSegmentReader != null) { final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader(); if (fieldsReader != null && fieldsReader.canReadRawDocs()) { matchingFieldsReader = fieldsReader; } } if (reader.hasDeletions()) { docCount += copyFieldsWithDeletions(fieldSelectorMerge, fieldsWriter, reader, matchingFieldsReader); } else { docCount += copyFieldsNoDeletions(fieldSelectorMerge, fieldsWriter, reader, matchingFieldsReader); } } } finally { fieldsWriter.close(); } final String fileName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION; final long fdxFileLength = directory.fileLength(fileName); if (4+((long) docCount)*8 != fdxFileLength) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from // entering the index. See LUCENE-1282 for // details. 
throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption"); } else // If we are skipping the doc stores, that means there // are no deletions in any of these segments, so we // just sum numDocs() of each segment to get total docCount for (Iterator iter = readers.iterator(); iter.hasNext();) { docCount += ((IndexReader) iter.next()).numDocs(); } return docCount; } private int copyFieldsWithDeletions(final FieldSelector fieldSelectorMerge, final FieldsWriter fieldsWriter, final IndexReader reader, final FieldsReader matchingFieldsReader) throws IOException, MergeAbortedException, CorruptIndexException { int docCount = 0; final int maxDoc = reader.maxDoc(); if (matchingFieldsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int j = 0; j < maxDoc;) { if (reader.isDeleted(j)) { // skip deleted docs ++j; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = j, numDocs = 0; do { j++; numDocs++; if (j >= maxDoc) break; if (reader.isDeleted(j)) { j++; break; } } while(numDocs < MAX_RAW_MERGE_DOCS); IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs); fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs); docCount += numDocs; checkAbort.work(300 * numDocs); } } else { for (int j = 0; j < maxDoc; j++) { if (reader.isDeleted(j)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to doc then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 Document doc = reader.document(j, fieldSelectorMerge); fieldsWriter.addDocument(doc); docCount++; checkAbort.work(300); } } return docCount; } private int copyFieldsNoDeletions(FieldSelector fieldSelectorMerge, final FieldsWriter fieldsWriter, final IndexReader reader, final FieldsReader matchingFieldsReader) throws IOException, MergeAbortedException, CorruptIndexException { final int maxDoc = reader.maxDoc(); int docCount = 0; if (matchingFieldsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" while (docCount < maxDoc) { int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount); IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len); fieldsWriter.addRawDocuments(stream, rawDocLengths, len); docCount += len; checkAbort.work(300 * len); } } else { for (; docCount < maxDoc; docCount++) { // NOTE: it's very important to first assign to doc then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 Document doc = reader.document(docCount, fieldSelectorMerge); fieldsWriter.addDocument(doc); checkAbort.work(300); } } return docCount; } /** * Merge the TermVectors from each of the segments into the new one. 
* @throws IOException */ private final void mergeVectors() throws IOException { TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos); try { int idx = 0; for (Iterator iter = readers.iterator(); iter.hasNext();) { final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++]; TermVectorsReader matchingVectorsReader = null; if (matchingSegmentReader != null) { TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReaderOrig(); // If the TV* files are an older format then they cannot read raw docs: if (vectorsReader != null && vectorsReader.canReadRawDocs()) { matchingVectorsReader = vectorsReader; } } final IndexReader reader = (IndexReader) iter.next(); if (reader.hasDeletions()) { copyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader); } else { copyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader); } } } finally { termVectorsWriter.close(); } final String fileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION; final long tvxSize = directory.fileLength(fileName); if (4+((long) mergedDocs)*16 != tvxSize) // This is most likely a bug in Sun JRE 1.6.0_04/_05; // we detect that the bug has struck, here, and // throw an exception to prevent the corruption from // entering the index. See LUCENE-1282 for // details. throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption"); } private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter, final TermVectorsReader matchingVectorsReader, final IndexReader reader) throws IOException, MergeAbortedException { final int maxDoc = reader.maxDoc(); if (matchingVectorsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" for (int docNum = 0; docNum < maxDoc;) { if (reader.isDeleted(docNum)) { // skip deleted docs ++docNum; continue; } // We can optimize this case (doing a bulk byte copy) since the field // numbers are identical int start = docNum, numDocs = 0; do { docNum++; numDocs++; if (docNum >= maxDoc) break; if (reader.isDeleted(docNum)) { docNum++; break; } } while(numDocs < MAX_RAW_MERGE_DOCS); matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs); termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs); checkAbort.work(300 * numDocs); } } else { for (int docNum = 0; docNum < maxDoc; docNum++) { if (reader.isDeleted(docNum)) { // skip deleted docs continue; } // NOTE: it's very important to first assign to vectors then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 TermFreqVector[] vectors = reader.getTermFreqVectors(docNum); termVectorsWriter.addAllDocVectors(vectors); checkAbort.work(300); } } } private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter, final TermVectorsReader matchingVectorsReader, final IndexReader reader) throws IOException, MergeAbortedException { final int maxDoc = reader.maxDoc(); if (matchingVectorsReader != null) { // We can bulk-copy because the fieldInfos are "congruent" int docCount = 0; while (docCount < maxDoc) { int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount); matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len); termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len); docCount += len; 
checkAbort.work(300 * len); } } else { for (int docNum = 0; docNum < maxDoc; docNum++) { // NOTE: it's very important to first assign to vectors then pass it to // termVectorsWriter.addAllDocVectors; see LUCENE-1282 TermFreqVector[] vectors = reader.getTermFreqVectors(docNum); termVectorsWriter.addAllDocVectors(vectors); checkAbort.work(300); } } } private SegmentMergeQueue queue = null; private final void mergeTerms() throws CorruptIndexException, IOException { SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval); final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); try { queue = new SegmentMergeQueue(readers.size()); mergeTermInfos(consumer); } finally { consumer.finish(); if (queue != null) queue.close(); } } boolean omitTermFreqAndPositions; private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException { int base = 0; final int readerCount = readers.size(); for (int i = 0; i < readerCount; i++) { IndexReader reader = (IndexReader) readers.get(i); TermEnum termEnum = reader.terms(); SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader); int[] docMap = smi.getDocMap(); if (docMap != null) { if (docMaps == null) { docMaps = new int[readerCount][]; delCounts = new int[readerCount]; } docMaps[i] = docMap; delCounts[i] = smi.reader.maxDoc() - smi.reader.numDocs(); } base += reader.numDocs(); assert reader.numDocs() == reader.maxDoc() - smi.delCount; if (smi.next()) queue.add(smi); // initialize queue else smi.close(); } SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()]; String currentField = null; FormatPostingsTermsConsumer termsConsumer = null; while (queue.size() > 0) { int matchSize = 0; // pop matching terms match[matchSize++] = (SegmentMergeInfo) queue.pop(); Term term = match[0].term; SegmentMergeInfo top = (SegmentMergeInfo) queue.top(); while (top != null && term.compareTo(top.term) == 0) { match[matchSize++] = (SegmentMergeInfo) queue.pop(); top = (SegmentMergeInfo) queue.top(); } if (currentField != term.field) { currentField = term.field; if (termsConsumer != null) termsConsumer.finish(); final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField); termsConsumer = consumer.addField(fieldInfo); omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; } int df = appendPostings(termsConsumer, match, matchSize); // add new TermInfo checkAbort.work(df/3.0); while (matchSize > 0) { SegmentMergeInfo smi = match[--matchSize]; if (smi.next()) queue.add(smi); // restore queue else smi.close(); // done with a segment } } } private byte[] payloadBuffer; private int[][] docMaps; int[][] getDocMaps() { return docMaps; } private int[] delCounts; int[] getDelCounts() { return delCounts; } /** Process postings from multiple segments all positioned on the * same term. Writes out merged entries into freqOutput and * the proxOutput streams. 
* * @param smis array of segments * @param n number of cells in the array actually occupied * @return number of documents across all segments where this term was found * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n) throws CorruptIndexException, IOException { final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text); int df = 0; for (int i = 0; i < n; i++) { SegmentMergeInfo smi = smis[i]; TermPositions postings = smi.getPositions(); assert postings != null; int base = smi.base; int[] docMap = smi.getDocMap(); postings.seek(smi.termEnum); while (postings.next()) { df++; int doc = postings.doc(); if (docMap != null) doc = docMap[doc]; // map around deletions doc += base; // convert to merged space final int freq = postings.freq(); final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq); if (!omitTermFreqAndPositions) { for (int j = 0; j < freq; j++) { final int position = postings.nextPosition(); final int payloadLength = postings.getPayloadLength(); if (payloadLength > 0) { if (payloadBuffer == null || payloadBuffer.length < payloadLength) payloadBuffer = new byte[payloadLength]; postings.getPayload(payloadBuffer, 0); } posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } posConsumer.finish(); } } } docConsumer.finish(); return df; } private void mergeNorms() throws IOException { byte[] normBuffer = null; IndexOutput output = null; try { int numFieldInfos = fieldInfos.size(); for (int i = 0; i < numFieldInfos; i++) { FieldInfo fi = fieldInfos.fieldInfo(i); if (fi.isIndexed && !fi.omitNorms) { if (output == null) { output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION); output.writeBytes(NORMS_HEADER,NORMS_HEADER.length); } for (Iterator iter = readers.iterator(); iter.hasNext();) { IndexReader reader = (IndexReader) iter.next(); int maxDoc = reader.maxDoc(); if (normBuffer == null || normBuffer.length < maxDoc) { // the buffer is too small for the current segment normBuffer = new byte[maxDoc]; } reader.norms(fi.name, normBuffer, 0); if (!reader.hasDeletions()) { //optimized case for segments without deleted docs output.writeBytes(normBuffer, maxDoc); } else { // this segment has deleted docs, so we have to // check for every doc if it is deleted or not for (int k = 0; k < maxDoc; k++) { if (!reader.isDeleted(k)) { output.writeByte(normBuffer[k]); } } } checkAbort.work(maxDoc); } } } } finally { if (output != null) { output.close(); } } } static class CheckAbort { private double workCount; private MergePolicy.OneMerge merge; private Directory dir; public CheckAbort(MergePolicy.OneMerge merge, Directory dir) { this.merge = merge; this.dir = dir; } /** * Records the fact that roughly units amount of work * have been done since this method was last called. * When adding time-consuming code into SegmentMerger, * you should test different values for units to ensure * that the time in between calls to merge.checkAborted * is up to ~ 1 second. 
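// Small illustration, not from the Lucene sources, of the bookkeeping in work()
// below: units accumulate until the 10000.0 threshold is reached, so with 300
// units charged per copied document the abort check fires roughly every 34 docs.

public final class CheckAbortBudgetSketch {
  public static void main(String[] args) {
    double threshold = 10000.0;  // workCount at which merge.checkAborted(dir) is invoked
    double unitsPerDoc = 300.0;  // amount SegmentMerger charges per copied document
    System.out.println("docs between abort checks ~ " + (int) Math.ceil(threshold / unitsPerDoc));
  }
}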
*/ public void work(double units) throws MergePolicy.MergeAbortedException { workCount += units; if (workCount >= 10000.0) { merge.checkAborted(dir); workCount = 0; } } } } lucene-2.9.4/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java0000644000175000017500000001250111474320230030004 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.document.Fieldable; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; // TODO: break into separate freq and prox writers as // codecs; make separate container (tii/tis/skip/*) that can // be configured as any number of files 1..N final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implements Comparable { final FreqProxTermsWriterPerThread perThread; final TermsHashPerField termsHashPerField; final FieldInfo fieldInfo; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; boolean omitTermFreqAndPositions; PayloadAttribute payloadAttribute; public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.perThread = perThread; this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; } int getStreamCount() { if (fieldInfo.omitTermFreqAndPositions) return 1; else return 2; } void finish() {} boolean hasPayloads; void skippingLongTerm() throws IOException {} public int compareTo(Object other0) { FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0; return fieldInfo.name.compareTo(other.fieldInfo.name); } void reset() { // Record, up front, whether our in-RAM format will be // with or without term freqs: omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; payloadAttribute = null; } boolean start(Fieldable[] fields, int count) { for(int i=0;i 0) { termsHashPerField.writeVInt(1, (proxCode<<1)|1); termsHashPerField.writeVInt(1, payload.length); termsHashPerField.writeBytes(1, payload.data, payload.offset, payload.length); hasPayloads = true; } else termsHashPerField.writeVInt(1, proxCode<<1); p.lastPosition = fieldState.position; } final void newTerm(RawPostingList p0) { // First time we're seeing this term since the last // flush assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start"); FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0; p.lastDocID = docState.docID; if (omitTermFreqAndPositions) { p.lastDocCode = docState.docID; } else { p.lastDocCode = docState.docID << 1; p.docFreq = 1; writeProx(p, fieldState.position); } } final void addTerm(RawPostingList p0) { 
assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start"); FreqProxTermsWriter.PostingList p = (FreqProxTermsWriter.PostingList) p0; assert omitTermFreqAndPositions || p.docFreq > 0; if (omitTermFreqAndPositions) { if (docState.docID != p.lastDocID) { assert docState.docID > p.lastDocID; termsHashPerField.writeVInt(0, p.lastDocCode); p.lastDocCode = docState.docID - p.lastDocID; p.lastDocID = docState.docID; } } else { if (docState.docID != p.lastDocID) { assert docState.docID > p.lastDocID; // Term not yet seen in the current doc but previously // seen in other doc(s) since the last flush // Now that we know doc freq for previous doc, // write it & lastDocCode if (1 == p.docFreq) termsHashPerField.writeVInt(0, p.lastDocCode|1); else { termsHashPerField.writeVInt(0, p.lastDocCode); termsHashPerField.writeVInt(0, p.docFreq); } p.docFreq = 1; p.lastDocCode = (docState.docID - p.lastDocID) << 1; p.lastDocID = docState.docID; writeProx(p, fieldState.position); } else { p.docFreq++; writeProx(p, fieldState.position-p.lastPosition); } } } public void abort() {} } lucene-2.9.4/src/java/org/apache/lucene/index/InvertedDocEndConsumer.java0000644000175000017500000000236711474320230026775 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Map; import java.io.IOException; abstract class InvertedDocEndConsumer { abstract InvertedDocEndConsumerPerThread addThread(DocInverterPerThread docInverterPerThread); abstract void flush(Map threadsAndFields, SegmentWriteState state) throws IOException; abstract void closeDocStore(SegmentWriteState state) throws IOException; abstract void abort(); abstract void setFieldInfos(FieldInfos fieldInfos); } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorEntry.java0000644000175000017500000000440411474320230025532 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2007 The Apache Software Foundation *

    * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 *
    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Convenience class for holding TermVector information. */ public class TermVectorEntry { private String field; private String term; private int frequency; private TermVectorOffsetInfo [] offsets; int [] positions; public TermVectorEntry() { } public TermVectorEntry(String field, String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { this.field = field; this.term = term; this.frequency = frequency; this.offsets = offsets; this.positions = positions; } public String getField() { return field; } public int getFrequency() { return frequency; } public TermVectorOffsetInfo[] getOffsets() { return offsets; } public int[] getPositions() { return positions; } public String getTerm() { return term; } //Keep package local void setFrequency(int frequency) { this.frequency = frequency; } void setOffsets(TermVectorOffsetInfo[] offsets) { this.offsets = offsets; } void setPositions(int[] positions) { this.positions = positions; } public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; TermVectorEntry that = (TermVectorEntry) o; if (term != null ? !term.equals(that.term) : that.term != null) return false; return true; } public int hashCode() { return (term != null ? term.hashCode() : 0); } public String toString() { return "TermVectorEntry{" + "field='" + field + '\'' + ", term='" + term + '\'' + ", frequency=" + frequency + '}'; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermInfo.java0000644000175000017500000000333311474320230024141 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** A TermInfo is the record of information stored for a term.*/ final class TermInfo { /** The number of documents which contain the term. 
*/ int docFreq = 0; long freqPointer = 0; long proxPointer = 0; int skipOffset; TermInfo() {} TermInfo(int df, long fp, long pp) { docFreq = df; freqPointer = fp; proxPointer = pp; } TermInfo(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; skipOffset = ti.skipOffset; } final void set(int docFreq, long freqPointer, long proxPointer, int skipOffset) { this.docFreq = docFreq; this.freqPointer = freqPointer; this.proxPointer = proxPointer; this.skipOffset = skipOffset; } final void set(TermInfo ti) { docFreq = ti.docFreq; freqPointer = ti.freqPointer; proxPointer = ti.proxPointer; skipOffset = ti.skipOffset; } } lucene-2.9.4/src/java/org/apache/lucene/index/InvertedDocConsumerPerField.java0000644000175000017500000000310211474320230027745 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.document.Fieldable; abstract class InvertedDocConsumerPerField { // Called once per field, and is given all Fieldable // occurrences for this field in the document. Return // true if you wish to see inverted tokens for these // fields: abstract boolean start(Fieldable[] fields, int count) throws IOException; // Called before a field instance is being processed abstract void start(Fieldable field); // Called once per inverted token abstract void add() throws IOException; // Called once per field per document, after all Fieldable // occurrences are inverted abstract void finish() throws IOException; // Called on hitting an aborting exception abstract void abort(); } lucene-2.9.4/src/java/org/apache/lucene/index/KeepOnlyLastCommitDeletionPolicy.java0000644000175000017500000000320311474320230031001 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.List; /** * This {@link IndexDeletionPolicy} implementation that * keeps only the most recent commit and immediately removes * all prior commits after a new commit is done. 
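// Illustrative sketch, not part of the Lucene 2.9.4 sources: passing the
// deletion policy explicitly when opening an IndexWriter. Because
// KeepOnlyLastCommitDeletionPolicy is the default, this is equivalent to the
// no-policy constructors; the index location and analyzer are assumptions.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import java.io.File;

public class DeletionPolicyExample {
  public static void main(String[] args) throws Exception {
    IndexWriter writer = new IndexWriter(
        FSDirectory.open(new File("/tmp/example-index")), // assumed index location
        new StandardAnalyzer(Version.LUCENE_29),
        true,                                             // create a new index
        new KeepOnlyLastCommitDeletionPolicy(),
        IndexWriter.MaxFieldLength.UNLIMITED);
    writer.close();
  }
}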
This is * the default deletion policy. */ public final class KeepOnlyLastCommitDeletionPolicy implements IndexDeletionPolicy { /** * Deletes all commits except the most recent one. */ public void onInit(List commits) { // Note that commits.size() should normally be 1: onCommit(commits); } /** * Deletes all commits except the most recent one. */ public void onCommit(List commits) { // Note that commits.size() should normally be 2 (if not // called by onInit above): int size = commits.size(); for(int i=0;i 0) // Recycle all but the first buffer docWriter.recycleIntBlocks(buffers, 1, 1+bufferUpto); // Reuse first buffer bufferUpto = 0; intUpto = 0; intOffset = 0; buffer = buffers[0]; } } public void nextBuffer() { if (1+bufferUpto == buffers.length) { int[][] newBuffers = new int[(int) (buffers.length*1.5)][]; System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); buffers = newBuffers; } buffer = buffers[1+bufferUpto] = docWriter.getIntBlock(trackAllocations); bufferUpto++; intUpto = 0; intOffset += DocumentsWriter.INT_BLOCK_SIZE; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocFieldConsumers.java0000644000175000017500000001070311474320230025765 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashMap; import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.HashSet; import java.io.IOException; import org.apache.lucene.util.ArrayUtil; /** This is just a "splitter" class: it lets you wrap two * DocFieldConsumer instances as a single consumer. 
*/ final class DocFieldConsumers extends DocFieldConsumer { final DocFieldConsumer one; final DocFieldConsumer two; public DocFieldConsumers(DocFieldConsumer one, DocFieldConsumer two) { this.one = one; this.two = two; } void setFieldInfos(FieldInfos fieldInfos) { super.setFieldInfos(fieldInfos); one.setFieldInfos(fieldInfos); two.setFieldInfos(fieldInfos); } public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException { Map oneThreadsAndFields = new HashMap(); Map twoThreadsAndFields = new HashMap(); Iterator it = threadsAndFields.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); DocFieldConsumersPerThread perThread = (DocFieldConsumersPerThread) entry.getKey(); Collection fields = (Collection) entry.getValue(); Iterator fieldsIt = fields.iterator(); Collection oneFields = new HashSet(); Collection twoFields = new HashSet(); while(fieldsIt.hasNext()) { DocFieldConsumersPerField perField = (DocFieldConsumersPerField) fieldsIt.next(); oneFields.add(perField.one); twoFields.add(perField.two); } oneThreadsAndFields.put(perThread.one, oneFields); twoThreadsAndFields.put(perThread.two, twoFields); } one.flush(oneThreadsAndFields, state); two.flush(twoThreadsAndFields, state); } public void closeDocStore(SegmentWriteState state) throws IOException { try { one.closeDocStore(state); } finally { two.closeDocStore(state); } } public void abort() { try { one.abort(); } finally { two.abort(); } } public boolean freeRAM() { boolean any = one.freeRAM(); any |= two.freeRAM(); return any; } public DocFieldConsumerPerThread addThread(DocFieldProcessorPerThread docFieldProcessorPerThread) throws IOException { return new DocFieldConsumersPerThread(docFieldProcessorPerThread, this, one.addThread(docFieldProcessorPerThread), two.addThread(docFieldProcessorPerThread)); } PerDoc[] docFreeList = new PerDoc[1]; int freeCount; int allocCount; synchronized PerDoc getPerDoc() { if (freeCount == 0) { allocCount++; if (allocCount > docFreeList.length) { // Grow our free list up front to make sure we have // enough space to recycle all outstanding PerDoc // instances assert allocCount == 1+docFreeList.length; docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)]; } return new PerDoc(); } else return docFreeList[--freeCount]; } synchronized void freePerDoc(PerDoc perDoc) { assert freeCount < docFreeList.length; docFreeList[freeCount++] = perDoc; } class PerDoc extends DocumentsWriter.DocWriter { DocumentsWriter.DocWriter one; DocumentsWriter.DocWriter two; public long sizeInBytes() { return one.sizeInBytes() + two.sizeInBytes(); } public void finish() throws IOException { try { try { one.finish(); } finally { two.finish(); } } finally { freePerDoc(this); } } public void abort() { try { try { one.abort(); } finally { two.abort(); } } finally { freePerDoc(this); } } } } lucene-2.9.4/src/java/org/apache/lucene/index/ByteSliceWriter.java0000644000175000017500000000466311474320230025505 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Class to write byte streams into slices of shared * byte[]. This is used by DocumentsWriter to hold the * posting list for many terms in RAM. */ final class ByteSliceWriter { private byte[] slice; private int upto; private final ByteBlockPool pool; int offset0; public ByteSliceWriter(ByteBlockPool pool) { this.pool = pool; } /** * Set up the writer to write at address. */ public void init(int address) { slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert slice != null; upto = address & DocumentsWriter.BYTE_BLOCK_MASK; offset0 = address; assert upto < slice.length; } /** Write byte into byte slice stream */ public void writeByte(byte b) { assert slice != null; if (slice[upto] != 0) { upto = pool.allocSlice(slice, upto); slice = pool.buffer; offset0 = pool.byteOffset; assert slice != null; } slice[upto++] = b; assert upto != slice.length; } public void writeBytes(final byte[] b, int offset, final int len) { final int offsetEnd = offset + len; while(offset < offsetEnd) { if (slice[upto] != 0) { // End marker upto = pool.allocSlice(slice, upto); slice = pool.buffer; offset0 = pool.byteOffset; } slice[upto++] = b[offset++]; assert upto != slice.length; } } public int getAddress() { return upto + (offset0 & DocumentsWriter.BYTE_BLOCK_NOT_MASK); } public void writeVInt(int i) { while ((i & ~0x7F) != 0) { writeByte((byte)((i & 0x7f) | 0x80)); i >>>= 7; } writeByte((byte) i); } } lucene-2.9.4/src/java/org/apache/lucene/index/MergeDocIDRemapper.java0000644000175000017500000001045011474320230026012 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Remaps docIDs after a merge has completed, where the * merged segments had at least one deletion. This is used * to renumber the buffered deletes in IndexWriter when a * merge of segments with deletions commits. 
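// Illustrative sketch, not part of the Lucene 2.9.4 sources: MergeDocIDRemapper
// locates a docID's segment by binary-searching an array of segment start
// offsets. This standalone method shows that idea in isolation; the array
// contents in main() are invented for the example.
public class SegmentStartsExample {

  /** Returns the index of the segment containing docID, given ascending segment starts. */
  static int segmentFor(int[] starts, int docID) {
    int lo = 0;
    int hi = starts.length - 1;
    while (lo <= hi) {
      int mid = (lo + hi) >>> 1;
      if (docID < starts[mid]) {
        hi = mid - 1;                       // docID lies in an earlier segment
      } else if (mid + 1 < starts.length && docID >= starts[mid + 1]) {
        lo = mid + 1;                       // docID lies in a later segment
      } else {
        return mid;                         // starts[mid] <= docID < starts[mid+1]
      }
    }
    return starts.length - 1;               // not reached for valid input
  }

  public static void main(String[] args) {
    int[] starts = {0, 100, 250};           // three segments starting at these docIDs
    System.out.println(segmentFor(starts, 120)); // prints 1: doc 120 is in the second segment
  }
}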
*/ final class MergeDocIDRemapper { int[] starts; // used for binary search of mapped docID int[] newStarts; // starts, minus the deletes int[][] docMaps; // maps docIDs in the merged set int minDocID; // minimum docID that needs renumbering int maxDocID; // 1+ the max docID that needs renumbering int docShift; // total # deleted docs that were compacted by this merge public MergeDocIDRemapper(SegmentInfos infos, int[][] docMaps, int[] delCounts, MergePolicy.OneMerge merge, int mergedDocCount) { this.docMaps = docMaps; SegmentInfo firstSegment = merge.segments.info(0); int i = 0; while(true) { SegmentInfo info = infos.info(i); if (info.equals(firstSegment)) break; minDocID += info.docCount; i++; } int numDocs = 0; for(int j=0;j 0; // Make sure it all adds up: assert docShift == maxDocID - (newStarts[docMaps.length-1] + merge.segments.info(docMaps.length-1).docCount - delCounts[docMaps.length-1]); } public int remap(int oldDocID) { if (oldDocID < minDocID) // Unaffected by merge return oldDocID; else if (oldDocID >= maxDocID) // This doc was "after" the merge, so simple shift return oldDocID - docShift; else { // Binary search to locate this document & find its new docID int lo = 0; // search starts array int hi = docMaps.length - 1; // for first element less while (hi >= lo) { int mid = (lo + hi) >>> 1; int midValue = starts[mid]; if (oldDocID < midValue) hi = mid - 1; else if (oldDocID > midValue) lo = mid + 1; else { // found a match while (mid+1 < docMaps.length && starts[mid+1] == midValue) { mid++; // scan to last match } if (docMaps[mid] != null) return newStarts[mid] + docMaps[mid][oldDocID-starts[mid]]; else return newStarts[mid] + oldDocID-starts[mid]; } } if (docMaps[hi] != null) return newStarts[hi] + docMaps[hi][oldDocID-starts[hi]]; else return newStarts[hi] + oldDocID-starts[hi]; } } } lucene-2.9.4/src/java/org/apache/lucene/index/MergeScheduler.java0000644000175000017500000000335211474320230025315 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /**
    Expert: {@link IndexWriter} uses an instance * implementing this interface to execute the merges * selected by a {@link MergePolicy}. The default * MergeScheduler is {@link ConcurrentMergeScheduler}.
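// Illustrative sketch, not part of the Lucene 2.9.4 sources: replacing the
// default ConcurrentMergeScheduler with the single-threaded SerialMergeScheduler
// on a writer that is assumed to have been opened elsewhere.
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import java.io.IOException;

public class MergeSchedulerExample {
  static void useSerialMerges(IndexWriter writer) throws IOException {
    // Merges selected by the MergePolicy will now run in the calling thread.
    writer.setMergeScheduler(new SerialMergeScheduler());
  }
}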
    * *
    NOTE: This API is new and still experimental * (subject to change suddenly in the next release)
    * *
    NOTE: This class typically requires access to * package-private APIs (eg, SegmentInfos) to do its job; * if you implement your own MergePolicy, you'll need to put * it in package org.apache.lucene.index in order to use * these APIs. */ public abstract class MergeScheduler { /** Run the merges provided by {@link IndexWriter#getNextMerge()}. */ abstract void merge(IndexWriter writer) throws CorruptIndexException, IOException; /** Close this MergeScheduler. */ abstract void close() throws CorruptIndexException, IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsFieldsWriter.java0000644000175000017500000000522611474320230027724 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.Directory; final class FormatPostingsFieldsWriter extends FormatPostingsFieldsConsumer { final Directory dir; final String segment; final TermInfosWriter termsOut; final FieldInfos fieldInfos; final FormatPostingsTermsWriter termsWriter; final DefaultSkipListWriter skipListWriter; final int totalNumDocs; public FormatPostingsFieldsWriter(SegmentWriteState state, FieldInfos fieldInfos) throws IOException { super(); dir = state.directory; segment = state.segmentName; totalNumDocs = state.numDocs; this.fieldInfos = fieldInfos; termsOut = new TermInfosWriter(dir, segment, fieldInfos, state.termIndexInterval); // TODO: this is a nasty abstraction violation (that we // peek down to find freqOut/proxOut) -- we need a // better abstraction here whereby these child consumers // can provide skip data or not skipListWriter = new DefaultSkipListWriter(termsOut.skipInterval, termsOut.maxSkipLevels, totalNumDocs, null, null); state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_EXTENSION)); state.flushedFiles.add(state.segmentFileName(IndexFileNames.TERMS_INDEX_EXTENSION)); termsWriter = new FormatPostingsTermsWriter(state, this); } /** Add a new field */ FormatPostingsTermsConsumer addField(FieldInfo field) { termsWriter.setField(field); return termsWriter; } /** Called when we are done adding everything. */ void finish() throws IOException { termsOut.close(); termsWriter.close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/CorruptIndexException.java0000644000175000017500000000210111474320230026713 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.index; import java.io.IOException; /** * This exception is thrown when Lucene detects * an inconsistency in the index. */ public class CorruptIndexException extends IOException { public CorruptIndexException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorsReader.java0000644000175000017500000004674711474320230026036 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import java.io.IOException; import java.util.Arrays; /** * @version $Id: TermVectorsReader.java 1039322 2010-11-26 11:50:23Z mikemccand $ */ class TermVectorsReader implements Cloneable { // NOTE: if you make a new format, it must be larger than // the current format static final int FORMAT_VERSION = 2; // Changes to speed up bulk merging of term vectors: static final int FORMAT_VERSION2 = 3; // Changed strings to UTF8 with length-in-bytes not length-in-chars static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4; // NOTE: always change this if you switch to a new format! static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES; //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file static final int FORMAT_SIZE = 4; static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1; static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2; private FieldInfos fieldInfos; private IndexInput tvx; private IndexInput tvd; private IndexInput tvf; private int size; private int numTotalDocs; // The docID offset where our docs begin in the index // file. This will be 0 if we have our own private file. private int docStoreOffset; private final int format; TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos) throws CorruptIndexException, IOException { this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE); } TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize) throws CorruptIndexException, IOException { this(d, segment, fieldInfos, readBufferSize, -1, 0); } TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size) throws CorruptIndexException, IOException { boolean success = false; try { if (d.fileExists(segment + "." 
+ IndexFileNames.VECTORS_INDEX_EXTENSION)) { tvx = d.openInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize); format = checkValidFormat(tvx); tvd = d.openInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize); final int tvdFormat = checkValidFormat(tvd); tvf = d.openInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize); final int tvfFormat = checkValidFormat(tvf); assert format == tvdFormat; assert format == tvfFormat; if (format >= FORMAT_VERSION2) { assert (tvx.length()-FORMAT_SIZE) % 16 == 0; numTotalDocs = (int) (tvx.length() >> 4); } else { assert (tvx.length()-FORMAT_SIZE) % 8 == 0; numTotalDocs = (int) (tvx.length() >> 3); } if (-1 == docStoreOffset) { this.docStoreOffset = 0; this.size = numTotalDocs; assert size == 0 || numTotalDocs == size; } else { this.docStoreOffset = docStoreOffset; this.size = size; // Verify the file is long enough to hold all of our // docs assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset; } } else { // If all documents flushed in a segment had hit // non-aborting exceptions, it's possible that // FieldInfos.hasVectors returns true yet the term // vector files don't exist. format = 0; } this.fieldInfos = fieldInfos; success = true; } finally { // With lock-less commits, it's entirely possible (and // fine) to hit a FileNotFound exception above. In // this case, we want to explicitly close any subset // of things that were opened so that we don't have to // wait for a GC to do so. if (!success) { close(); } } } // Used for bulk copy when merging IndexInput getTvdStream() { return tvd; } // Used for bulk copy when merging IndexInput getTvfStream() { return tvf; } final private void seekTvx(final int docNum) throws IOException { if (format < FORMAT_VERSION2) tvx.seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE); else tvx.seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE); } boolean canReadRawDocs() { return format >= FORMAT_UTF8_LENGTH_IN_BYTES; } /** Retrieve the length (in bytes) of the tvd and tvf * entries for the next numDocs starting with * startDocID. This is used for bulk copying when * merging segments, if the field numbers are * congruent. Once this returns, the tvf & tvd streams * are seeked to the startDocID. */ final void rawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) throws IOException { if (tvx == null) { Arrays.fill(tvdLengths, 0); Arrays.fill(tvfLengths, 0); return; } // SegmentMerger calls canReadRawDocs() first and should // not call us if that returns false. 
if (format < FORMAT_VERSION2) throw new IllegalStateException("cannot read raw docs with older term vector formats"); seekTvx(startDocID); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); long tvfPosition = tvx.readLong(); tvf.seek(tvfPosition); long lastTvdPosition = tvdPosition; long lastTvfPosition = tvfPosition; int count = 0; while (count < numDocs) { final int docID = docStoreOffset + startDocID + count + 1; assert docID <= numTotalDocs; if (docID < numTotalDocs) { tvdPosition = tvx.readLong(); tvfPosition = tvx.readLong(); } else { tvdPosition = tvd.length(); tvfPosition = tvf.length(); assert count == numDocs-1; } tvdLengths[count] = (int) (tvdPosition-lastTvdPosition); tvfLengths[count] = (int) (tvfPosition-lastTvfPosition); count++; lastTvdPosition = tvdPosition; lastTvfPosition = tvfPosition; } } private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException { int format = in.readInt(); if (format > FORMAT_CURRENT) { throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FORMAT_CURRENT + " or less"); } return format; } void close() throws IOException { // make all effort to close up. Keep the first exception // and throw it as a new one. IOException keep = null; if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; } if (keep != null) throw (IOException) keep.fillInStackTrace(); } /** * * @return The number of documents in the reader */ int size() { return size; } public void get(int docNum, String field, TermVectorMapper mapper) throws IOException { if (tvx != null) { int fieldNumber = fieldInfos.fieldNumber(field); //We need to account for the FORMAT_SIZE at when seeking in the tvx //We don't need to do this in other seeks because we already have the // file pointer //that was written in another file seekTvx(docNum); //System.out.println("TVX Pointer: " + tvx.getFilePointer()); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); int fieldCount = tvd.readVInt(); //System.out.println("Num Fields: " + fieldCount); // There are only a few fields per document. We opt for a full scan // rather then requiring that they be ordered. We need to read through // all of the fields anyway to get to the tvf pointers. int number = 0; int found = -1; for (int i = 0; i < fieldCount; i++) { if (format >= FORMAT_VERSION) number = tvd.readVInt(); else number += tvd.readVInt(); if (number == fieldNumber) found = i; } // This field, although valid in the segment, was not found in this // document if (found != -1) { // Compute position in the tvf file long position; if (format >= FORMAT_VERSION2) position = tvx.readLong(); else position = tvd.readVLong(); for (int i = 1; i <= found; i++) position += tvd.readVLong(); mapper.setDocumentNumber(docNum); readTermVector(field, position, mapper); } else { //System.out.println("Fieldable not found"); } } else { //System.out.println("No tvx file"); } } /** * Retrieve the term vector for the given document and field * @param docNum The document number to retrieve the vector for * @param field The field within the document to retrieve * @return The TermFreqVector for the document and field or null if there is no termVector for this field. 
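// Illustrative sketch, not part of the Lucene 2.9.4 sources: the data this
// reader decodes is normally consumed through the public IndexReader API. The
// index location, document number and field name are assumptions, and the field
// must have been indexed with term vectors for a non-null result.
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.store.FSDirectory;
import java.io.File;

public class TermFreqVectorExample {
  public static void main(String[] args) throws Exception {
    IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/example-index")));
    TermFreqVector tfv = reader.getTermFreqVector(0, "contents");
    if (tfv != null) {                      // null when no vector was stored for this field
      String[] terms = tfv.getTerms();
      int[] freqs = tfv.getTermFrequencies();
      for (int i = 0; i < terms.length; i++) {
        System.out.println(terms[i] + ": " + freqs[i]);
      }
    }
    reader.close();
  }
}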
* @throws IOException if there is an error reading the term vector files */ TermFreqVector get(int docNum, String field) throws IOException { // Check if no term vectors are available for this segment at all ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); get(docNum, field, mapper); return mapper.materializeVector(); } // Reads the String[] fields; you have to pre-seek tvd to // the right point final private String[] readFields(int fieldCount) throws IOException { int number = 0; String[] fields = new String[fieldCount]; for (int i = 0; i < fieldCount; i++) { if (format >= FORMAT_VERSION) number = tvd.readVInt(); else number += tvd.readVInt(); fields[i] = fieldInfos.fieldName(number); } return fields; } // Reads the long[] offsets into TVF; you have to pre-seek // tvx/tvd to the right point final private long[] readTvfPointers(int fieldCount) throws IOException { // Compute position in the tvf file long position; if (format >= FORMAT_VERSION2) position = tvx.readLong(); else position = tvd.readVLong(); long[] tvfPointers = new long[fieldCount]; tvfPointers[0] = position; for (int i = 1; i < fieldCount; i++) { position += tvd.readVLong(); tvfPointers[i] = position; } return tvfPointers; } /** * Return all term vectors stored for this document or null if the could not be read in. * * @param docNum The document number to retrieve the vector for * @return All term frequency vectors * @throws IOException if there is an error reading the term vector files */ TermFreqVector[] get(int docNum) throws IOException { TermFreqVector[] result = null; if (tvx != null) { //We need to offset by seekTvx(docNum); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); int fieldCount = tvd.readVInt(); // No fields are vectorized for this document if (fieldCount != 0) { final String[] fields = readFields(fieldCount); final long[] tvfPointers = readTvfPointers(fieldCount); result = readTermVectors(docNum, fields, tvfPointers); } } else { //System.out.println("No tvx file"); } return result; } public void get(int docNumber, TermVectorMapper mapper) throws IOException { // Check if no term vectors are available for this segment at all if (tvx != null) { //We need to offset by seekTvx(docNumber); long tvdPosition = tvx.readLong(); tvd.seek(tvdPosition); int fieldCount = tvd.readVInt(); // No fields are vectorized for this document if (fieldCount != 0) { final String[] fields = readFields(fieldCount); final long[] tvfPointers = readTvfPointers(fieldCount); mapper.setDocumentNumber(docNumber); readTermVectors(fields, tvfPointers, mapper); } } else { //System.out.println("No tvx file"); } } private SegmentTermVector[] readTermVectors(int docNum, String fields[], long tvfPointers[]) throws IOException { SegmentTermVector res[] = new SegmentTermVector[fields.length]; for (int i = 0; i < fields.length; i++) { ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); mapper.setDocumentNumber(docNum); readTermVector(fields[i], tvfPointers[i], mapper); res[i] = (SegmentTermVector) mapper.materializeVector(); } return res; } private void readTermVectors(String fields[], long tvfPointers[], TermVectorMapper mapper) throws IOException { for (int i = 0; i < fields.length; i++) { readTermVector(fields[i], tvfPointers[i], mapper); } } /** * * @param field The field to read in * @param tvfPointer The pointer within the tvf file where we should start reading * @param mapper The mapper used to map the TermVector * @throws IOException */ private void readTermVector(String field, long 
tvfPointer, TermVectorMapper mapper) throws IOException { // Now read the data from specified position //We don't need to offset by the FORMAT here since the pointer already includes the offset tvf.seek(tvfPointer); int numTerms = tvf.readVInt(); //System.out.println("Num Terms: " + numTerms); // If no terms - return a constant empty termvector. However, this should never occur! if (numTerms == 0) return; boolean storePositions; boolean storeOffsets; if (format >= FORMAT_VERSION){ byte bits = tvf.readByte(); storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; } else{ tvf.readVInt(); storePositions = false; storeOffsets = false; } mapper.setExpectations(field, numTerms, storeOffsets, storePositions); int start = 0; int deltaLength = 0; int totalLength = 0; byte[] byteBuffer; char[] charBuffer; final boolean preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; // init the buffers if (preUTF8) { charBuffer = new char[10]; byteBuffer = null; } else { charBuffer = null; byteBuffer = new byte[20]; } for (int i = 0; i < numTerms; i++) { start = tvf.readVInt(); deltaLength = tvf.readVInt(); totalLength = start + deltaLength; final String term; if (preUTF8) { // Term stored as java chars if (charBuffer.length < totalLength) { char[] newCharBuffer = new char[(int) (1.5*totalLength)]; System.arraycopy(charBuffer, 0, newCharBuffer, 0, start); charBuffer = newCharBuffer; } tvf.readChars(charBuffer, start, deltaLength); term = new String(charBuffer, 0, totalLength); } else { // Term stored as utf8 bytes if (byteBuffer.length < totalLength) { byte[] newByteBuffer = new byte[(int) (1.5*totalLength)]; System.arraycopy(byteBuffer, 0, newByteBuffer, 0, start); byteBuffer = newByteBuffer; } tvf.readBytes(byteBuffer, start, deltaLength); term = new String(byteBuffer, 0, totalLength, "UTF-8"); } int freq = tvf.readVInt(); int [] positions = null; if (storePositions) { //read in the positions //does the mapper even care about positions? if (mapper.isIgnoringPositions() == false) { positions = new int[freq]; int prevPosition = 0; for (int j = 0; j < freq; j++) { positions[j] = prevPosition + tvf.readVInt(); prevPosition = positions[j]; } } else { //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip // for (int j = 0; j < freq; j++) { tvf.readVInt(); } } } TermVectorOffsetInfo[] offsets = null; if (storeOffsets) { //does the mapper even care about offsets? 
if (mapper.isIgnoringOffsets() == false) { offsets = new TermVectorOffsetInfo[freq]; int prevOffset = 0; for (int j = 0; j < freq; j++) { int startOffset = prevOffset + tvf.readVInt(); int endOffset = startOffset + tvf.readVInt(); offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); prevOffset = endOffset; } } else { for (int j = 0; j < freq; j++){ tvf.readVInt(); tvf.readVInt(); } } } mapper.map(term, freq, offsets, positions); } } protected Object clone() throws CloneNotSupportedException { final TermVectorsReader clone = (TermVectorsReader) super.clone(); // These are null when a TermVectorsReader was created // on a segment that did not have term vectors saved if (tvx != null && tvd != null && tvf != null) { clone.tvx = (IndexInput) tvx.clone(); clone.tvd = (IndexInput) tvd.clone(); clone.tvf = (IndexInput) tvf.clone(); } return clone; } } /** * Models the existing parallel array structure */ class ParallelArrayTermVectorMapper extends TermVectorMapper { private String[] terms; private int[] termFreqs; private int positions[][]; private TermVectorOffsetInfo offsets[][]; private int currentPosition; private boolean storingOffsets; private boolean storingPositions; private String field; public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { this.field = field; terms = new String[numTerms]; termFreqs = new int[numTerms]; this.storingOffsets = storeOffsets; this.storingPositions = storePositions; if(storePositions) this.positions = new int[numTerms][]; if(storeOffsets) this.offsets = new TermVectorOffsetInfo[numTerms][]; } public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { terms[currentPosition] = term; termFreqs[currentPosition] = frequency; if (storingOffsets) { this.offsets[currentPosition] = offsets; } if (storingPositions) { this.positions[currentPosition] = positions; } currentPosition++; } /** * Construct the vector * @return The {@link TermFreqVector} based on the mappings. */ public TermFreqVector materializeVector() { SegmentTermVector tv = null; if (field != null && terms != null) { if (storingPositions || storingOffsets) { tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); } else { tv = new SegmentTermVector(field, terms, termFreqs); } } return tv; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocFieldConsumersPerField.java0000644000175000017500000000305611474320230027403 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import org.apache.lucene.document.Fieldable; final class DocFieldConsumersPerField extends DocFieldConsumerPerField { final DocFieldConsumerPerField one; final DocFieldConsumerPerField two; final DocFieldConsumersPerThread perThread; public DocFieldConsumersPerField(DocFieldConsumersPerThread perThread, DocFieldConsumerPerField one, DocFieldConsumerPerField two) { this.perThread = perThread; this.one = one; this.two = two; } public void processFields(Fieldable[] fields, int count) throws IOException { one.processFields(fields, count); two.processFields(fields, count); } public void abort() { try { one.abort(); } finally { two.abort(); } } } lucene-2.9.4/src/java/org/apache/lucene/index/FieldsWriter.java0000644000175000017500000002165611474320230025031 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ import java.io.IOException; import java.util.Iterator; import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.CompressionTools; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; final class FieldsWriter { static final byte FIELD_IS_TOKENIZED = 0x1; static final byte FIELD_IS_BINARY = 0x2; static final byte FIELD_IS_COMPRESSED = 0x4; // Original format static final int FORMAT = 0; // Changed strings to UTF8 static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = 1; // Lucene 3.0: Removal of compressed fields: This is only to provide compatibility with 3.0-created indexes // new segments always use the FORMAT_CURRENT. As the index format did not change in 3.0, only // new stored field files that no longer support compression are marked as such to optimize merging. // But 2.9 can still read them. static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; // NOTE: if you introduce a new format, make it 1 higher // than the current one, and always change this if you // switch to a new format! static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; private FieldInfos fieldInfos; private IndexOutput fieldsStream; private IndexOutput indexStream; private boolean doClose; FieldsWriter(Directory d, String segment, FieldInfos fn) throws IOException { fieldInfos = fn; boolean success = false; final String fieldsName = segment + "." + IndexFileNames.FIELDS_EXTENSION; try { fieldsStream = d.createOutput(fieldsName); fieldsStream.writeInt(FORMAT_CURRENT); success = true; } finally { if (!success) { try { close(); } catch (Throwable t) { // Suppress so we keep throwing the original exception } try { d.deleteFile(fieldsName); } catch (Throwable t) { // Suppress so we keep throwing the original exception } } } success = false; final String indexName = segment + "." 
+ IndexFileNames.FIELDS_INDEX_EXTENSION; try { indexStream = d.createOutput(indexName); indexStream.writeInt(FORMAT_CURRENT); success = true; } finally { if (!success) { try { close(); } catch (IOException ioe) { } try { d.deleteFile(fieldsName); } catch (Throwable t) { // Suppress so we keep throwing the original exception } try { d.deleteFile(indexName); } catch (Throwable t) { // Suppress so we keep throwing the original exception } } } doClose = true; } FieldsWriter(IndexOutput fdx, IndexOutput fdt, FieldInfos fn) { fieldInfos = fn; fieldsStream = fdt; indexStream = fdx; doClose = false; } void setFieldsStream(IndexOutput stream) { this.fieldsStream = stream; } // Writes the contents of buffer into the fields stream // and adds a new entry for this document into the index // stream. This assumes the buffer was already written // in the correct fields format. void flushDocument(int numStoredFields, RAMOutputStream buffer) throws IOException { indexStream.writeLong(fieldsStream.getFilePointer()); fieldsStream.writeVInt(numStoredFields); buffer.writeTo(fieldsStream); } void skipDocument() throws IOException { indexStream.writeLong(fieldsStream.getFilePointer()); fieldsStream.writeVInt(0); } void flush() throws IOException { indexStream.flush(); fieldsStream.flush(); } final void close() throws IOException { if (doClose) { try { if (fieldsStream != null) { try { fieldsStream.close(); } finally { fieldsStream = null; } } } catch (IOException ioe) { try { if (indexStream != null) { try { indexStream.close(); } finally { indexStream = null; } } } catch (IOException ioe2) { // Ignore so we throw only first IOException hit } throw ioe; } finally { if (indexStream != null) { try { indexStream.close(); } finally { indexStream = null; } } } } } final void writeField(FieldInfo fi, Fieldable field) throws IOException { // if the field as an instanceof FieldsReader.FieldForMerge, we're in merge mode // and field.binaryValue() already returns the compressed value for a field // with isCompressed()==true, so we disable compression in that case boolean disableCompression = (field instanceof FieldsReader.FieldForMerge); fieldsStream.writeVInt(fi.number); byte bits = 0; if (field.isTokenized()) bits |= FieldsWriter.FIELD_IS_TOKENIZED; if (field.isBinary()) bits |= FieldsWriter.FIELD_IS_BINARY; if (field.isCompressed()) bits |= FieldsWriter.FIELD_IS_COMPRESSED; fieldsStream.writeByte(bits); if (field.isCompressed()) { // compression is enabled for the current field final byte[] data; final int len; final int offset; if (disableCompression) { // optimized case for merging, the data // is already compressed data = field.getBinaryValue(); assert data != null; len = field.getBinaryLength(); offset = field.getBinaryOffset(); } else { // check if it is a binary field if (field.isBinary()) { data = CompressionTools.compress(field.getBinaryValue(), field.getBinaryOffset(), field.getBinaryLength()); } else { byte x[] = field.stringValue().getBytes("UTF-8"); data = CompressionTools.compress(x, 0, x.length); } len = data.length; offset = 0; } fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); } else { // compression is disabled for the current field if (field.isBinary()) { final byte[] data; final int len; final int offset; data = field.getBinaryValue(); len = field.getBinaryLength(); offset = field.getBinaryOffset(); fieldsStream.writeVInt(len); fieldsStream.writeBytes(data, offset, len); } else { fieldsStream.writeString(field.stringValue()); } } } /** Bulk write a contiguous series of 
documents. The * lengths array is the length (in bytes) of each raw * document. The stream IndexInput is the * fieldsStream from which we should bulk-copy all * bytes. */ final void addRawDocuments(IndexInput stream, int[] lengths, int numDocs) throws IOException { long position = fieldsStream.getFilePointer(); long start = position; for(int i=0;i= fieldHash.length/2) rehash(); } else fp.fieldInfo.update(field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTf()); if (thisFieldGen != fp.lastGen) { // First time we're seeing this field for this doc fp.fieldCount = 0; if (fieldCount == fields.length) { final int newSize = fields.length*2; DocFieldProcessorPerField newArray[] = new DocFieldProcessorPerField[newSize]; System.arraycopy(fields, 0, newArray, 0, fieldCount); fields = newArray; } fields[fieldCount++] = fp; fp.lastGen = thisFieldGen; } if (fp.fieldCount == fp.fields.length) { Fieldable[] newArray = new Fieldable[fp.fields.length*2]; System.arraycopy(fp.fields, 0, newArray, 0, fp.fieldCount); fp.fields = newArray; } fp.fields[fp.fieldCount++] = field; if (field.isStored()) { fieldsWriter.addField(field, fp.fieldInfo); } } // If we are writing vectors then we must visit // fields in sorted order so they are written in // sorted order. TODO: we actually only need to // sort the subset of fields that have vectors // enabled; we could save [small amount of] CPU // here. quickSort(fields, 0, fieldCount-1); for(int i=0;i= hi) return; else if (hi == 1+lo) { if (array[lo].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) { final DocFieldProcessorPerField tmp = array[lo]; array[lo] = array[hi]; array[hi] = tmp; } return; } int mid = (lo + hi) >>> 1; if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) { DocFieldProcessorPerField tmp = array[lo]; array[lo] = array[mid]; array[mid] = tmp; } if (array[mid].fieldInfo.name.compareTo(array[hi].fieldInfo.name) > 0) { DocFieldProcessorPerField tmp = array[mid]; array[mid] = array[hi]; array[hi] = tmp; if (array[lo].fieldInfo.name.compareTo(array[mid].fieldInfo.name) > 0) { DocFieldProcessorPerField tmp2 = array[lo]; array[lo] = array[mid]; array[mid] = tmp2; } } int left = lo + 1; int right = hi - 1; if (left >= right) return; DocFieldProcessorPerField partition = array[mid]; for (; ;) { while (array[right].fieldInfo.name.compareTo(partition.fieldInfo.name) > 0) --right; while (left < right && array[left].fieldInfo.name.compareTo(partition.fieldInfo.name) <= 0) ++left; if (left < right) { DocFieldProcessorPerField tmp = array[left]; array[left] = array[right]; array[right] = tmp; --right; } else { break; } } quickSort(array, lo, left); quickSort(array, left + 1, hi); } PerDoc[] docFreeList = new PerDoc[1]; int freeCount; int allocCount; synchronized PerDoc getPerDoc() { if (freeCount == 0) { allocCount++; if (allocCount > docFreeList.length) { // Grow our free list up front to make sure we have // enough space to recycle all outstanding PerDoc // instances assert allocCount == 1+docFreeList.length; docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)]; } return new PerDoc(); } else return docFreeList[--freeCount]; } synchronized void freePerDoc(PerDoc perDoc) { assert freeCount < docFreeList.length; docFreeList[freeCount++] = perDoc; } class PerDoc extends DocumentsWriter.DocWriter { DocumentsWriter.DocWriter one; DocumentsWriter.DocWriter two; public long sizeInBytes() { return one.sizeInBytes() + 
two.sizeInBytes(); } public void finish() throws IOException { try { try { one.finish(); } finally { two.finish(); } } finally { freePerDoc(this); } } public void abort() { try { try { one.abort(); } finally { two.abort(); } } finally { freePerDoc(this); } } } } lucene-2.9.4/src/java/org/apache/lucene/index/TermFreqVector.java0000644000175000017500000000524411474320230025331 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Provides access to stored term vector of * a document field. The vector consists of the name of the field, an array of the terms that occur in the field of the * {@link org.apache.lucene.document.Document} and a parallel array of frequencies. Thus, getTermFrequencies()[5] corresponds with the * frequency of getTerms()[5], assuming there are at least 5 terms in the Document. */ public interface TermFreqVector { /** * The {@link org.apache.lucene.document.Fieldable} name. * @return The name of the field this vector is associated with. * */ public String getField(); /** * @return The number of terms in the term vector. */ public int size(); /** * @return An Array of term texts in ascending order. */ public String[] getTerms(); /** Array of term frequencies. Locations of the array correspond one to one * to the terms in the array obtained from getTerms * method. Each location in the array contains the number of times this * term occurs in the document or the document field. */ public int[] getTermFrequencies(); /** Return an index in the term numbers array returned from * getTerms at which the term with the specified * term appears. If this term does not appear in the array, * return -1. */ public int indexOf(String term); /** Just like indexOf(int) but searches for a number of terms * at the same time. Returns an array that has the same size as the number * of terms searched for, each slot containing the result of searching for * that term number. * * @param terms array containing terms to look for * @param start index in the array where the list of terms starts * @param len the number of terms in the list */ public int[] indexesOf(String[] terms, int start, int len); } lucene-2.9.4/src/java/org/apache/lucene/index/IndexFileNameFilter.java0000644000175000017500000000600511474320230026233 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.File; import java.io.FilenameFilter; import java.util.HashSet; /** * Filename filter that accept filenames and extensions only created by Lucene. * * @version $rcs = ' $Id: Exp $ ' ; */ public class IndexFileNameFilter implements FilenameFilter { private static IndexFileNameFilter singleton = new IndexFileNameFilter(); private HashSet extensions; private HashSet extensionsInCFS; // Prevent instantiation. private IndexFileNameFilter() { extensions = new HashSet(); for (int i = 0; i < IndexFileNames.INDEX_EXTENSIONS.length; i++) { extensions.add(IndexFileNames.INDEX_EXTENSIONS[i]); } extensionsInCFS = new HashSet(); for (int i = 0; i < IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE.length; i++) { extensionsInCFS.add(IndexFileNames.INDEX_EXTENSIONS_IN_COMPOUND_FILE[i]); } } /* (non-Javadoc) * @see java.io.FilenameFilter#accept(java.io.File, java.lang.String) */ public boolean accept(File dir, String name) { int i = name.lastIndexOf('.'); if (i != -1) { String extension = name.substring(1+i); if (extensions.contains(extension)) { return true; } else if (extension.startsWith("f") && extension.matches("f\\d+")) { return true; } else if (extension.startsWith("s") && extension.matches("s\\d+")) { return true; } } else { if (name.equals(IndexFileNames.DELETABLE)) return true; else if (name.startsWith(IndexFileNames.SEGMENTS)) return true; } return false; } /** * Returns true if this is a file that would be contained * in a CFS file. This function should only be called on * files that pass the above "accept" (ie, are already * known to be a Lucene index file). */ public boolean isCFSFile(String name) { int i = name.lastIndexOf('.'); if (i != -1) { String extension = name.substring(1+i); if (extensionsInCFS.contains(extension)) { return true; } if (extension.startsWith("f") && extension.matches("f\\d+")) { return true; } } return false; } public static IndexFileNameFilter getFilter() { return singleton; } } lucene-2.9.4/src/java/org/apache/lucene/index/FieldReaderException.java0000644000175000017500000000547611474320230026455 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * * **/ public class FieldReaderException extends RuntimeException{ /** * Constructs a new runtime exception with null as its * detail message. The cause is not initialized, and may subsequently be * initialized by a call to {@link #initCause}. */ public FieldReaderException() { } /** * Constructs a new runtime exception with the specified cause and a * detail message of (cause==null ? 
null : cause.toString()) * (which typically contains the class and detail message of * cause). *
    * This constructor is useful for runtime exceptions * that are little more than wrappers for other throwables. * * @param cause the cause (which is saved for later retrieval by the * {@link #getCause()} method). (A null value is * permitted, and indicates that the cause is nonexistent or * unknown.) * @since 1.4 */ public FieldReaderException(Throwable cause) { super(cause); } /** * Constructs a new runtime exception with the specified detail message. * The cause is not initialized, and may subsequently be initialized by a * call to {@link #initCause}. * * @param message the detail message. The detail message is saved for * later retrieval by the {@link #getMessage()} method. */ public FieldReaderException(String message) { super(message); } /** * Constructs a new runtime exception with the specified detail message and * cause.
    Note that the detail message associated with * cause is not automatically incorporated in * this runtime exception's detail message. * * @param message the detail message (which is saved for later retrieval * by the {@link #getMessage()} method). * @param cause the cause (which is saved for later retrieval by the * {@link #getCause()} method). (A null value is * permitted, and indicates that the cause is nonexistent or * unknown.) * @since 1.4 */ public FieldReaderException(String message, Throwable cause) { super(message, cause); } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentTermPositions.java0000644000175000017500000001424611474320230026565 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.IndexInput; import java.io.IOException; final class SegmentTermPositions extends SegmentTermDocs implements TermPositions { private IndexInput proxStream; private int proxCount; private int position; // the current payload length private int payloadLength; // indicates whether the payload of the current position has // been read from the proxStream yet private boolean needToLoadPayload; // these variables are being used to remember information // for a lazy skip private long lazySkipPointer = -1; private int lazySkipProxCount = 0; SegmentTermPositions(SegmentReader p) { super(p); this.proxStream = null; // the proxStream will be cloned lazily when nextPosition() is called for the first time } final void seek(TermInfo ti, Term term) throws IOException { super.seek(ti, term); if (ti != null) lazySkipPointer = ti.proxPointer; lazySkipProxCount = 0; proxCount = 0; payloadLength = 0; needToLoadPayload = false; } public final void close() throws IOException { super.close(); if (proxStream != null) proxStream.close(); } public final int nextPosition() throws IOException { if (currentFieldOmitTermFreqAndPositions) // This field does not store term freq, positions, payloads return 0; // perform lazy skips if necessary lazySkip(); proxCount--; return position += readDeltaPosition(); } private final int readDeltaPosition() throws IOException { int delta = proxStream.readVInt(); if (currentFieldStoresPayloads) { // if the current field stores payloads then // the position delta is shifted one bit to the left. 
// if the LSB is set, then we have to read the current // payload length if ((delta & 1) != 0) { payloadLength = proxStream.readVInt(); } delta >>>= 1; needToLoadPayload = true; } return delta; } protected final void skippingDoc() throws IOException { // we remember to skip a document lazily lazySkipProxCount += freq; } public final boolean next() throws IOException { // we remember to skip the remaining positions of the current // document lazily lazySkipProxCount += proxCount; if (super.next()) { // run super proxCount = freq; // note frequency position = 0; // reset position return true; } return false; } public final int read(final int[] docs, final int[] freqs) { throw new UnsupportedOperationException("TermPositions does not support processing multiple documents in one call. Use TermDocs instead."); } /** Called by super.skipTo(). */ protected void skipProx(long proxPointer, int payloadLength) throws IOException { // we save the pointer, we might have to skip there lazily lazySkipPointer = proxPointer; lazySkipProxCount = 0; proxCount = 0; this.payloadLength = payloadLength; needToLoadPayload = false; } private void skipPositions(int n) throws IOException { assert !currentFieldOmitTermFreqAndPositions; for (int f = n; f > 0; f--) { // skip unread positions readDeltaPosition(); skipPayload(); } } private void skipPayload() throws IOException { if (needToLoadPayload && payloadLength > 0) { proxStream.seek(proxStream.getFilePointer() + payloadLength); } needToLoadPayload = false; } // It is not always necessary to move the prox pointer // to a new document after the freq pointer has been moved. // Consider for example a phrase query with two terms: // the freq pointer for term 1 has to move to document x // to answer the question if the term occurs in that document. But // only if term 2 also matches document x, the positions have to be // read to figure out if term 1 and term 2 appear next // to each other in document x and thus satisfy the query. // So we move the prox pointer lazily to the document // as soon as positions are requested. 
private void lazySkip() throws IOException { if (proxStream == null) { // clone lazily proxStream = (IndexInput) parent.core.proxStream.clone(); } // we might have to skip the current payload // if it was not read yet skipPayload(); if (lazySkipPointer != -1) { proxStream.seek(lazySkipPointer); lazySkipPointer = -1; } if (lazySkipProxCount != 0) { skipPositions(lazySkipProxCount); lazySkipProxCount = 0; } } public int getPayloadLength() { return payloadLength; } public byte[] getPayload(byte[] data, int offset) throws IOException { if (!needToLoadPayload) { throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once."); } // read payloads lazily byte[] retArray; int retOffset; if (data == null || data.length - offset < payloadLength) { // the array is too small to store the payload data, // so we allocate a new one retArray = new byte[payloadLength]; retOffset = 0; } else { retArray = data; retOffset = offset; } proxStream.readBytes(retArray, retOffset, payloadLength); needToLoadPayload = false; return retArray; } public boolean isPayloadAvailable() { return needToLoadPayload && payloadLength > 0; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorOffsetInfo.java0000644000175000017500000000557711474320230026507 0ustar janpascaljanpascalpackage org.apache.lucene.index; import java.io.Serializable; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * The TermVectorOffsetInfo class holds information pertaining to a Term in a {@link org.apache.lucene.index.TermPositionVector}'s * offset information. This offset information is the character offset as set during the Analysis phase (and thus may not be the actual offset in the * original content). */ public class TermVectorOffsetInfo implements Serializable { /** * Convenience declaration when creating a {@link org.apache.lucene.index.TermPositionVector} that stores only position information. */ public transient static final TermVectorOffsetInfo[] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0]; private int startOffset; private int endOffset; public TermVectorOffsetInfo() { } public TermVectorOffsetInfo(int startOffset, int endOffset) { this.endOffset = endOffset; this.startOffset = startOffset; } /** * The accessor for the ending offset for the term * @return The offset */ public int getEndOffset() { return endOffset; } public void setEndOffset(int endOffset) { this.endOffset = endOffset; } /** * The accessor for the starting offset of the term. 
* * @return The offset */ public int getStartOffset() { return startOffset; } public void setStartOffset(int startOffset) { this.startOffset = startOffset; } /** * Two TermVectorOffsetInfos are equals if both the start and end offsets are the same * @param o The comparison Object * @return true if both {@link #getStartOffset()} and {@link #getEndOffset()} are the same for both objects. */ public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof TermVectorOffsetInfo)) return false; final TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo) o; if (endOffset != termVectorOffsetInfo.endOffset) return false; if (startOffset != termVectorOffsetInfo.startOffset) return false; return true; } public int hashCode() { int result; result = startOffset; result = 29 * result + endOffset; return result; } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentInfo.java0000644000175000017500000005674611474320230024654 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.BitVector; import java.io.IOException; import java.util.List; import java.util.Map; import java.util.HashMap; import java.util.ArrayList; import java.util.Collections; /** * Information about a segment such as it's name, directory, and files related * to the segment. * * *

 * NOTE: This API is new and still experimental (subject to change suddenly in the next release)
    */ public final class SegmentInfo { static final int NO = -1; // e.g. no norms; no deletes; static final int YES = 1; // e.g. have norms; have deletes; static final int CHECK_DIR = 0; // e.g. must check dir to see if there are norms/deletions static final int WITHOUT_GEN = 0; // a file name that has no GEN in it. public String name; // unique name in dir public int docCount; // number of docs in seg public Directory dir; // where segment resides private boolean preLockless; // true if this is a segments file written before // lock-less commits (2.1) private long delGen; // current generation of del file; NO if there // are no deletes; CHECK_DIR if it's a pre-2.1 segment // (and we must check filesystem); YES or higher if // there are deletes at generation N private long[] normGen; // current generation of each field's norm file. // If this array is null, for lockLess this means no // separate norms. For preLockLess this means we must // check filesystem. If this array is not null, its // values mean: NO says this field has no separate // norms; CHECK_DIR says it is a preLockLess segment and // filesystem must be checked; >= YES says this field // has separate norms with the specified generation private byte isCompoundFile; // NO if it is not; YES if it is; CHECK_DIR if it's // pre-2.1 (ie, must check file system to see // if .cfs and .nrm exist) private boolean hasSingleNormFile; // true if this segment maintains norms in a single file; // false otherwise // this is currently false for segments populated by DocumentWriter // and true for newly created merged segments (both // compound and non compound). private List files; // cached list of files that this segment uses // in the Directory long sizeInBytes = -1; // total byte size of all of our files (computed on demand) private int docStoreOffset; // if this segment shares stored fields & vectors, this // offset is where in that file this segment's docs begin private String docStoreSegment; // name used to derive fields/vectors file we share with // other segments private boolean docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx) private int delCount; // How many deleted docs in this segment, or -1 if not yet known // (if it's an older index) private boolean hasProx; // True if this segment has any fields with omitTermFreqAndPositions==false private Map diagnostics; public String toString() { return "si: "+dir.toString()+" "+name+" docCount: "+docCount+" delCount: "+delCount+" delFileName: "+getDelFileName(); } public SegmentInfo(String name, int docCount, Directory dir) { this.name = name; this.docCount = docCount; this.dir = dir; delGen = NO; isCompoundFile = CHECK_DIR; preLockless = true; hasSingleNormFile = false; docStoreOffset = -1; docStoreSegment = name; docStoreIsCompoundFile = false; delCount = 0; hasProx = true; } public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile) { this(name, docCount, dir, isCompoundFile, hasSingleNormFile, -1, null, false, true); } public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean hasSingleNormFile, int docStoreOffset, String docStoreSegment, boolean docStoreIsCompoundFile, boolean hasProx) { this(name, docCount, dir); this.isCompoundFile = (byte) (isCompoundFile ? 
YES : NO); this.hasSingleNormFile = hasSingleNormFile; preLockless = false; this.docStoreOffset = docStoreOffset; this.docStoreSegment = docStoreSegment; this.docStoreIsCompoundFile = docStoreIsCompoundFile; this.hasProx = hasProx; delCount = 0; assert docStoreOffset == -1 || docStoreSegment != null: "dso=" + docStoreOffset + " dss=" + docStoreSegment + " docCount=" + docCount; } /** * Copy everything from src SegmentInfo into our instance. */ void reset(SegmentInfo src) { clearFiles(); name = src.name; docCount = src.docCount; dir = src.dir; preLockless = src.preLockless; delGen = src.delGen; docStoreOffset = src.docStoreOffset; docStoreIsCompoundFile = src.docStoreIsCompoundFile; if (src.normGen == null) { normGen = null; } else { normGen = new long[src.normGen.length]; System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length); } isCompoundFile = src.isCompoundFile; hasSingleNormFile = src.hasSingleNormFile; delCount = src.delCount; } // must be Map void setDiagnostics(Map diagnostics) { this.diagnostics = diagnostics; } // returns Map public Map getDiagnostics() { return diagnostics; } /** * Construct a new SegmentInfo instance by reading a * previously saved SegmentInfo from input. * * @param dir directory to load from * @param format format of the segments info file * @param input input handle to read segment info from */ SegmentInfo(Directory dir, int format, IndexInput input) throws IOException { this.dir = dir; name = input.readString(); docCount = input.readInt(); if (format <= SegmentInfos.FORMAT_LOCKLESS) { delGen = input.readLong(); if (format <= SegmentInfos.FORMAT_SHARED_DOC_STORE) { docStoreOffset = input.readInt(); if (docStoreOffset != -1) { docStoreSegment = input.readString(); docStoreIsCompoundFile = (1 == input.readByte()); } else { docStoreSegment = name; docStoreIsCompoundFile = false; } } else { docStoreOffset = -1; docStoreSegment = name; docStoreIsCompoundFile = false; } if (format <= SegmentInfos.FORMAT_SINGLE_NORM_FILE) { hasSingleNormFile = (1 == input.readByte()); } else { hasSingleNormFile = false; } int numNormGen = input.readInt(); if (numNormGen == NO) { normGen = null; } else { normGen = new long[numNormGen]; for(int j=0;j= YES: this means this segment was written by // the LOCKLESS code and for certain has // deletions // if (delGen == NO) { return false; } else if (delGen >= YES) { return true; } else { return dir.fileExists(getDelFileName()); } } void advanceDelGen() { // delGen 0 is reserved for pre-LOCKLESS format if (delGen == NO) { delGen = YES; } else { delGen++; } clearFiles(); } void clearDelGen() { delGen = NO; clearFiles(); } public Object clone () { SegmentInfo si = new SegmentInfo(name, docCount, dir); si.isCompoundFile = isCompoundFile; si.delGen = delGen; si.delCount = delCount; si.hasProx = hasProx; si.preLockless = preLockless; si.hasSingleNormFile = hasSingleNormFile; si.diagnostics = new HashMap(diagnostics); if (normGen != null) { si.normGen = (long[]) normGen.clone(); } si.docStoreOffset = docStoreOffset; si.docStoreSegment = docStoreSegment; si.docStoreIsCompoundFile = docStoreIsCompoundFile; return si; } public String getDelFileName() { if (delGen == NO) { // In this case we know there is no deletion filename // against this segment return null; } else { // If delGen is CHECK_DIR, it's the pre-lockless-commit file format return IndexFileNames.fileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen); } } /** * Returns true if this field for this segment has saved a separate norms file (__N.sX). 
* * @param fieldNumber the field index to check */ public boolean hasSeparateNorms(int fieldNumber) throws IOException { if ((normGen == null && preLockless) || (normGen != null && normGen[fieldNumber] == CHECK_DIR)) { // Must fallback to directory file exists check: String fileName = name + ".s" + fieldNumber; return dir.fileExists(fileName); } else if (normGen == null || normGen[fieldNumber] == NO) { return false; } else { return true; } } /** * Returns true if any fields in this segment have separate norms. */ public boolean hasSeparateNorms() throws IOException { if (normGen == null) { if (!preLockless) { // This means we were created w/ LOCKLESS code and no // norms are written yet: return false; } else { // This means this segment was saved with pre-LOCKLESS // code. So we must fallback to the original // directory list check: String[] result = dir.list(); if (result == null) throw new IOException("cannot read directory " + dir + ": list() returned null"); String pattern; pattern = name + ".s"; int patternLength = pattern.length(); for(int i = 0; i < result.length; i++){ if(result[i].startsWith(pattern) && Character.isDigit(result[i].charAt(patternLength))) return true; } return false; } } else { // This means this segment was saved with LOCKLESS // code so we first check whether any normGen's are >= 1 // (meaning they definitely have separate norms): for(int i=0;i= YES) { return true; } } // Next we look for any == 0. These cases were // pre-LOCKLESS and must be checked in directory: for(int i=0;i= YES || dir.fileExists(delFileName))) { files.add(delFileName); } // Careful logic for norms files if (normGen != null) { for(int i=0;i= YES) { // Definitely a separate norm file, with generation: files.add(IndexFileNames.fileNameFromGeneration(name, "." + IndexFileNames.SEPARATE_NORMS_EXTENSION + i, gen)); } else if (NO == gen) { // No separate norms but maybe plain norms // in the non compound file case: if (!hasSingleNormFile && !useCompoundFile) { String fileName = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION + i; if (dir.fileExists(fileName)) { files.add(fileName); } } } else if (CHECK_DIR == gen) { // Pre-2.1: we have to check file existence String fileName = null; if (useCompoundFile) { fileName = name + "." + IndexFileNames.SEPARATE_NORMS_EXTENSION + i; } else if (!hasSingleNormFile) { fileName = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION + i; } if (fileName != null && dir.fileExists(fileName)) { files.add(fileName); } } } } else if (preLockless || (!hasSingleNormFile && !useCompoundFile)) { // Pre-2.1: we have to scan the dir to find all // matching _X.sN/_X.fN files for our segment: String prefix; if (useCompoundFile) prefix = name + "." + IndexFileNames.SEPARATE_NORMS_EXTENSION; else prefix = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION; int prefixLength = prefix.length(); String[] allFiles = dir.listAll(); final IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); for(int i=0;i prefixLength && Character.isDigit(fileName.charAt(prefixLength)) && fileName.startsWith(prefix)) { files.add(fileName); } } } return files; } /* Called whenever any change is made that affects which * files this segment has. 
*/ private void clearFiles() { files = null; sizeInBytes = -1; } /** Used for debugging */ public String segString(Directory dir) { String cfs; try { if (getUseCompoundFile()) cfs = "c"; else cfs = "C"; } catch (IOException ioe) { cfs = "?"; } String docStore; if (docStoreOffset != -1) docStore = "->" + docStoreSegment; else docStore = ""; return name + ":" + cfs + (this.dir == dir ? "" : "x") + docCount + docStore; } /** We consider another SegmentInfo instance equal if it * has the same dir and same name. */ public boolean equals(Object obj) { SegmentInfo other; try { other = (SegmentInfo) obj; } catch (ClassCastException cce) { return false; } return other.dir == dir && other.name.equals(name); } public int hashCode() { return dir.hashCode() + name.hashCode(); } } lucene-2.9.4/src/java/org/apache/lucene/index/ReusableStringReader.java0000644000175000017500000000320111474320230026464 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; /** Used by DocumentsWriter to implemented a StringReader * that can be reset to a new string; we use this when * tokenizing the string value from a Field. */ final class ReusableStringReader extends Reader { int upto; int left; String s; void init(String s) { this.s = s; left = s.length(); this.upto = 0; } public int read(char[] c) { return read(c, 0, c.length); } public int read(char[] c, int off, int len) { if (left > len) { s.getChars(upto, upto+len, c, off); upto += len; left -= len; return len; } else if (0 == left) { s = null; return -1; } else { s.getChars(upto, upto+left, c, off); int r = left; left = 0; upto = s.length(); return r; } } public void close() {}; } lucene-2.9.4/src/java/org/apache/lucene/index/DocInverterPerField.java0000644000175000017500000001732311474320230026261 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.io.Reader; import org.apache.lucene.document.Fieldable; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; /** * Holds state for inverting all occurrences of a single * field in the document. This class doesn't do anything * itself; instead, it forwards the tokens produced by * analysis to its own consumer * (InvertedDocConsumerPerField). It also interacts with an * endConsumer (InvertedDocEndConsumerPerField). */ final class DocInverterPerField extends DocFieldConsumerPerField { final private DocInverterPerThread perThread; final private FieldInfo fieldInfo; final InvertedDocConsumerPerField consumer; final InvertedDocEndConsumerPerField endConsumer; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) { this.perThread = perThread; this.fieldInfo = fieldInfo; docState = perThread.docState; fieldState = perThread.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); this.endConsumer = perThread.endConsumer.addField(this, fieldInfo); } void abort() { consumer.abort(); endConsumer.abort(); } public void processFields(final Fieldable[] fields, final int count) throws IOException { fieldState.reset(docState.doc.getBoost()); final int maxFieldLength = docState.maxFieldLength; final boolean doInvert = consumer.start(fields, count); for(int i=0;i 0) fieldState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name); if (!field.isTokenized()) { // un-tokenized field String stringValue = field.stringValue(); final int valueLength = stringValue.length(); perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength); fieldState.attributeSource = perThread.singleTokenTokenStream; consumer.start(field); boolean success = false; try { consumer.add(); success = true; } finally { if (!success) docState.docWriter.setAborting(); } fieldState.offset += valueLength; fieldState.length++; fieldState.position++; anyToken = valueLength > 0; } else { // tokenized field final TokenStream stream; final TokenStream streamValue = field.tokenStreamValue(); if (streamValue != null) stream = streamValue; else { // the field does not have a TokenStream, // so we have to obtain one from the analyzer final Reader reader; // find or make Reader final Reader readerValue = field.readerValue(); if (readerValue != null) reader = readerValue; else { String stringValue = field.stringValue(); if (stringValue == null) throw new IllegalArgumentException("field must have either TokenStream, String or Reader value"); perThread.stringReader.init(stringValue); reader = perThread.stringReader; } // Tokenize field and add to postingTable stream = docState.analyzer.reusableTokenStream(fieldInfo.name, reader); } // reset the TokenStream to the first token stream.reset(); final int startLength = fieldState.length; // deprecated final boolean allowMinus1Position = docState.allowMinus1Position; try { int offsetEnd = fieldState.offset-1; boolean hasMoreTokens = stream.incrementToken(); fieldState.attributeSource = stream; OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class); PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class); consumer.start(field); for(;;) { // If we 
hit an exception in stream.next below // (which is fairly common, eg if analyzer // chokes on a given document), then it's // non-aborting and (above) this one document // will be marked as deleted, but still // consume a docID if (!hasMoreTokens) break; final int posIncr = posIncrAttribute.getPositionIncrement(); fieldState.position += posIncr; if (allowMinus1Position || fieldState.position > 0) { fieldState.position--; } if (posIncr == 0) fieldState.numOverlap++; boolean success = false; try { // If we hit an exception in here, we abort // all buffered documents since the last // flush, on the likelihood that the // internal state of the consumer is now // corrupt and should not be flushed to a // new segment: consumer.add(); success = true; } finally { if (!success) docState.docWriter.setAborting(); } fieldState.position++; offsetEnd = fieldState.offset + offsetAttribute.endOffset(); if (++fieldState.length >= maxFieldLength) { if (docState.infoStream != null) docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens"); break; } hasMoreTokens = stream.incrementToken(); } // trigger streams to perform end-of-stream operations stream.end(); fieldState.offset += offsetAttribute.endOffset(); anyToken = fieldState.length > startLength; } finally { stream.close(); } } if (anyToken) fieldState.offset += docState.analyzer.getOffsetGap(field); fieldState.boost *= field.getBoost(); } // LUCENE-2387: don't hang onto the field, so GC can // reclaim fields[i] = null; } consumer.finish(); endConsumer.finish(); } } lucene-2.9.4/src/java/org/apache/lucene/index/SnapshotDeletionPolicy.java0000644000175000017500000001201311474320230027054 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collection; import java.util.List; import java.util.ArrayList; import java.util.Map; import java.io.IOException; import org.apache.lucene.store.Directory; /** A {@link IndexDeletionPolicy} that wraps around any other * {@link IndexDeletionPolicy} and adds the ability to hold and * later release a single "snapshot" of an index. While * the snapshot is held, the {@link IndexWriter} will not * remove any files associated with it even if the index is * otherwise being actively, arbitrarily changed. Because * we wrap another arbitrary {@link IndexDeletionPolicy}, this * gives you the freedom to continue using whatever {@link * IndexDeletionPolicy} you would normally want to use with your * index. Note that you can re-use a single instance of * SnapshotDeletionPolicy across multiple writers as long * as they are against the same index Directory. Any * snapshot held when a writer is closed will "survive" * when the next writer is opened. * *

 * WARNING: This API is new and experimental and may suddenly change.
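 *
 * A minimal usage sketch (not from the original sources): the directory,
 * analyzer and backup step are illustrative assumptions; only the
 * snapshot()/release() calls belong to this class.
 *
 *   // dir and analyzer are assumed to exist in the surrounding code
 *   SnapshotDeletionPolicy dp =
 *       new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
 *   IndexWriter writer = new IndexWriter(dir, analyzer, dp,
 *       IndexWriter.MaxFieldLength.UNLIMITED);
 *   try {
 *     IndexCommitPoint commit = dp.snapshot(); // pin the files of the latest commit
 *     // copy the files named by commit.getFileNames() to backup storage here
 *   } finally {
 *     dp.release();                            // allow those files to be deleted again
 *   }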
    */ public class SnapshotDeletionPolicy implements IndexDeletionPolicy { private IndexCommit lastCommit; private IndexDeletionPolicy primary; private String snapshot; public SnapshotDeletionPolicy(IndexDeletionPolicy primary) { this.primary = primary; } public synchronized void onInit(List commits) throws IOException { primary.onInit(wrapCommits(commits)); lastCommit = (IndexCommit) commits.get(commits.size()-1); } public synchronized void onCommit(List commits) throws IOException { primary.onCommit(wrapCommits(commits)); lastCommit = (IndexCommit) commits.get(commits.size()-1); } /** Take a snapshot of the most recent commit to the * index. You must call release() to free this snapshot. * Note that while the snapshot is held, the files it * references will not be deleted, which will consume * additional disk space in your index. If you take a * snapshot at a particularly bad time (say just before * you call optimize()) then in the worst case this could * consume an extra 1X of your total index size, until * you release the snapshot. */ // TODO 3.0: change this to return IndexCommit instead public synchronized IndexCommitPoint snapshot() { if (lastCommit == null) { throw new IllegalStateException("no index commits to snapshot !"); } if (snapshot == null) snapshot = lastCommit.getSegmentsFileName(); else throw new IllegalStateException("snapshot is already set; please call release() first"); return lastCommit; } /** Release the currently held snapshot. */ public synchronized void release() { if (snapshot != null) snapshot = null; else throw new IllegalStateException("snapshot was not set; please call snapshot() first"); } private class MyCommitPoint extends IndexCommit { IndexCommit cp; MyCommitPoint(IndexCommit cp) { this.cp = cp; } public String toString() { return "SnapshotDeletionPolicy.SnapshotCommitPoint(" + cp + ")"; } public String getSegmentsFileName() { return cp.getSegmentsFileName(); } public Collection getFileNames() throws IOException { return cp.getFileNames(); } public Directory getDirectory() { return cp.getDirectory(); } public void delete() { synchronized(SnapshotDeletionPolicy.this) { // Suppress the delete request if this commit point is // our current snapshot. if (snapshot == null || !snapshot.equals(getSegmentsFileName())) cp.delete(); } } public boolean isDeleted() { return cp.isDeleted(); } public long getVersion() { return cp.getVersion(); } public long getGeneration() { return cp.getGeneration(); } public Map getUserData() throws IOException { return cp.getUserData(); } public boolean isOptimized() { return cp.isOptimized(); } } private List wrapCommits(List commits) { final int count = commits.size(); List myCommits = new ArrayList(count); for(int i=0;i.f + a number and * from .s + a number. Also note that * Lucene's segments_N files do not have any * filename extension. */ static final String INDEX_EXTENSIONS[] = new String[] { COMPOUND_FILE_EXTENSION, FIELD_INFOS_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, TERMS_INDEX_EXTENSION, TERMS_EXTENSION, FREQ_EXTENSION, PROX_EXTENSION, DELETES_EXTENSION, VECTORS_INDEX_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, VECTORS_FIELDS_EXTENSION, GEN_EXTENSION, NORMS_EXTENSION, COMPOUND_FILE_STORE_EXTENSION, }; /** File extensions that are added to a compound file * (same as above, minus "del", "gen", "cfs"). 
*/ static final String[] INDEX_EXTENSIONS_IN_COMPOUND_FILE = new String[] { FIELD_INFOS_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, TERMS_INDEX_EXTENSION, TERMS_EXTENSION, FREQ_EXTENSION, PROX_EXTENSION, VECTORS_INDEX_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, VECTORS_FIELDS_EXTENSION, NORMS_EXTENSION }; static final String[] STORE_INDEX_EXTENSIONS = new String[] { VECTORS_INDEX_EXTENSION, VECTORS_FIELDS_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION }; static final String[] NON_STORE_INDEX_EXTENSIONS = new String[] { FIELD_INFOS_EXTENSION, FREQ_EXTENSION, PROX_EXTENSION, TERMS_EXTENSION, TERMS_INDEX_EXTENSION, NORMS_EXTENSION }; /** File extensions of old-style index files */ static final String COMPOUND_EXTENSIONS[] = new String[] { FIELD_INFOS_EXTENSION, FREQ_EXTENSION, PROX_EXTENSION, FIELDS_INDEX_EXTENSION, FIELDS_EXTENSION, TERMS_INDEX_EXTENSION, TERMS_EXTENSION }; /** File extensions for term vector support */ static final String VECTOR_EXTENSIONS[] = new String[] { VECTORS_INDEX_EXTENSION, VECTORS_DOCUMENTS_EXTENSION, VECTORS_FIELDS_EXTENSION }; /** * Computes the full file name from base, extension and * generation. If the generation is -1, the file name is * null. If it's 0, the file name is . * If it's > 0, the file name is _. * * @param base -- main part of the file name * @param extension -- extension of the filename (including .) * @param gen -- generation */ static final String fileNameFromGeneration(String base, String extension, long gen) { if (gen == SegmentInfo.NO) { return null; } else if (gen == SegmentInfo.WITHOUT_GEN) { return base + extension; } else { return base + "_" + Long.toString(gen, Character.MAX_RADIX) + extension; } } /** * Returns true if the provided filename is one of the doc * store files (ends with an extension in * STORE_INDEX_EXTENSIONS). */ static final boolean isDocStoreFile(String fileName) { if (fileName.endsWith(COMPOUND_FILE_STORE_EXTENSION)) return true; for(int i=0;i * To store payloads in the index a {@link TokenStream} has to be used that * produces payload data. *

    * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)} * to retrieve the payloads from the index.
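 *
 * A hedged retrieval sketch; the reader, field name and term text are
 * illustrative assumptions, and the payload calls are the TermPositions
 * methods named above:
 *
 *   // reader is an assumed open IndexReader
 *   TermPositions tp = reader.termPositions(new Term("body", "lucene"));
 *   while (tp.next()) {
 *     for (int i = 0; i < tp.freq(); i++) {
 *       tp.nextPosition();
 *       if (tp.isPayloadAvailable()) {
 *         byte[] payload = tp.getPayload(new byte[tp.getPayloadLength()], 0);
 *         // payload now holds the bytes stored at this position
 *       }
 *     }
 *   }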
    * */ public class Payload implements Serializable, Cloneable { /** the byte array containing the payload data */ protected byte[] data; /** the offset within the byte array */ protected int offset; /** the length of the payload data */ protected int length; /** Creates an empty payload and does not allocate a byte array. */ public Payload() { // nothing to do } /** * Creates a new payload with the the given array as data. * A reference to the passed-in array is held, i. e. no * copy is made. * * @param data the data of this payload */ public Payload(byte[] data) { this(data, 0, data.length); } /** * Creates a new payload with the the given array as data. * A reference to the passed-in array is held, i. e. no * copy is made. * * @param data the data of this payload * @param offset the offset in the data byte array * @param length the length of the data */ public Payload(byte[] data, int offset, int length) { if (offset < 0 || offset + length > data.length) { throw new IllegalArgumentException(); } this.data = data; this.offset = offset; this.length = length; } /** * Sets this payloads data. * A reference to the passed-in array is held, i. e. no * copy is made. */ public void setData(byte[] data) { setData(data, 0, data.length); } /** * Sets this payloads data. * A reference to the passed-in array is held, i. e. no * copy is made. */ public void setData(byte[] data, int offset, int length) { this.data = data; this.offset = offset; this.length = length; } /** * Returns a reference to the underlying byte array * that holds this payloads data. */ public byte[] getData() { return this.data; } /** * Returns the offset in the underlying byte array */ public int getOffset() { return this.offset; } /** * Returns the length of the payload data. */ public int length() { return this.length; } /** * Returns the byte at the given index. */ public byte byteAt(int index) { if (0 <= index && index < this.length) { return this.data[this.offset + index]; } throw new ArrayIndexOutOfBoundsException(index); } /** * Allocates a new byte array, copies the payload data into it and returns it. */ public byte[] toByteArray() { byte[] retArray = new byte[this.length]; System.arraycopy(this.data, this.offset, retArray, 0, this.length); return retArray; } /** * Copies the payload data to a byte array. * * @param target the target byte array * @param targetOffset the offset in the target byte array */ public void copyTo(byte[] target, int targetOffset) { if (this.length > target.length + targetOffset) { throw new ArrayIndexOutOfBoundsException(); } System.arraycopy(this.data, this.offset, target, targetOffset, this.length); } /** * Clones this payload by creating a copy of the underlying * byte array. */ public Object clone() { try { // Start with a shallow copy of data Payload clone = (Payload) super.clone(); // Only copy the part of data that belongs to this Payload if (offset == 0 && length == data.length) { // It is the whole thing, so just clone it. 
clone.data = (byte[]) data.clone(); } else { // Just get the part clone.data = this.toByteArray(); clone.offset = 0; } return clone; } catch (CloneNotSupportedException e) { throw new RuntimeException(e); // shouldn't happen } } public boolean equals(Object obj) { if (obj == this) return true; if (obj instanceof Payload) { Payload other = (Payload) obj; if (length == other.length) { for(int i=0;i our current segmentInfos version in case we were // opened on a past IndexCommit: private long maxIndexVersion; static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, final int termInfosIndexDivisor) throws CorruptIndexException, IOException { return (IndexReader) new SegmentInfos.FindSegmentsFile(directory) { protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); infos.read(directory, segmentFileName); if (readOnly) return new ReadOnlyDirectoryReader(directory, infos, deletionPolicy, termInfosIndexDivisor); else return new DirectoryReader(directory, infos, deletionPolicy, false, termInfosIndexDivisor); } }.run(commit); } /** Construct reading the named set of readers. */ DirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = sis; this.deletionPolicy = deletionPolicy; this.termInfosIndexDivisor = termInfosIndexDivisor; if (!readOnly) { // We assume that this segments_N was previously // properly sync'd: synced.addAll(sis.files(directory, true)); } // To reduce the chance of hitting FileNotFound // (and having to retry), we open segments in // reverse because IndexWriter merges & deletes // the newest segments first. 
SegmentReader[] readers = new SegmentReader[sis.size()]; for (int i = sis.size()-1; i >= 0; i--) { boolean success = false; try { readers[i] = SegmentReader.get(readOnly, sis.info(i), termInfosIndexDivisor); success = true; } finally { if (!success) { // Close all readers we had opened: for(i++;i=0;upto--) { try { readers[upto].close(); } catch (Throwable ignore) { // keep going - we want to clean up as much as possible } } } } } this.writer = writer; if (upto < readers.length) { // This means some segments were in a foreign Directory SegmentReader[] newReaders = new SegmentReader[upto]; System.arraycopy(readers, 0, newReaders, 0, upto); readers = newReaders; } initialize(readers); } /** This constructor is only used for {@link #reopen()} */ DirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, Map oldNormsCache, boolean readOnly, boolean doClone, int termInfosIndexDivisor) throws IOException { this.directory = directory; this.readOnly = readOnly; this.segmentInfos = infos; this.termInfosIndexDivisor = termInfosIndexDivisor; if (!readOnly) { // We assume that this segments_N was previously // properly sync'd: synced.addAll(infos.files(directory, true)); } // we put the old SegmentReaders in a map, that allows us // to lookup a reader using its segment name Map segmentReaders = new HashMap(); if (oldReaders != null) { // create a Map SegmentName->SegmentReader for (int i = 0; i < oldReaders.length; i++) { segmentReaders.put(oldReaders[i].getSegmentName(), new Integer(i)); } } SegmentReader[] newReaders = new SegmentReader[infos.size()]; // remember which readers are shared between the old and the re-opened // DirectoryReader - we have to incRef those readers boolean[] readerShared = new boolean[infos.size()]; for (int i = infos.size() - 1; i>=0; i--) { // find SegmentReader for this segment Integer oldReaderIndex = (Integer) segmentReaders.get(infos.info(i).name); if (oldReaderIndex == null) { // this is a new segment, no old SegmentReader can be reused newReaders[i] = null; } else { // there is an old reader for this segment - we'll try to reopen it newReaders[i] = oldReaders[oldReaderIndex.intValue()]; } boolean success = false; try { SegmentReader newReader; if (newReaders[i] == null || infos.info(i).getUseCompoundFile() != newReaders[i].getSegmentInfo().getUseCompoundFile()) { // We should never see a totally new segment during cloning assert !doClone; // this is a new reader; in case we hit an exception we can close it safely newReader = SegmentReader.get(readOnly, infos.info(i), termInfosIndexDivisor); } else { newReader = newReaders[i].reopenSegment(infos.info(i), doClone, readOnly); } if (newReader == newReaders[i]) { // this reader will be shared between the old and the new one, // so we must incRef it readerShared[i] = true; newReader.incRef(); } else { readerShared[i] = false; newReaders[i] = newReader; } success = true; } finally { if (!success) { for (i++; i < infos.size(); i++) { if (newReaders[i] != null) { try { if (!readerShared[i]) { // this is a new subReader that is not used by the old one, // we can close it newReaders[i].close(); } else { // this subReader is also used by the old reader, so instead // closing we must decRef it newReaders[i].decRef(); } } catch (IOException ignore) { // keep going - we want to clean up as much as possible } } } } } } // initialize the readers to calculate maxDoc before we try to reuse the old normsCache initialize(newReaders); // try to copy unchanged norms from the old normsCache to the 
new one if (oldNormsCache != null) { Iterator it = oldNormsCache.entrySet().iterator(); while (it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); String field = (String) entry.getKey(); if (!hasNorms(field)) { continue; } byte[] oldBytes = (byte[]) entry.getValue(); byte[] bytes = new byte[maxDoc()]; for (int i = 0; i < subReaders.length; i++) { Integer oldReaderIndex = ((Integer) segmentReaders.get(subReaders[i].getSegmentName())); // this SegmentReader was not re-opened, we can copy all of its norms if (oldReaderIndex != null && (oldReaders[oldReaderIndex.intValue()] == subReaders[i] || oldReaders[oldReaderIndex.intValue()].norms.get(field) == subReaders[i].norms.get(field))) { // we don't have to synchronize here: either this constructor is called from a SegmentReader, // in which case no old norms cache is present, or it is called from MultiReader.reopen(), // which is synchronized System.arraycopy(oldBytes, oldStarts[oldReaderIndex.intValue()], bytes, starts[i], starts[i+1] - starts[i]); } else { subReaders[i].norms(field, bytes, starts[i]); } } normsCache.put(field, bytes); // update cache } } } private void initialize(SegmentReader[] subReaders) throws IOException { this.subReaders = subReaders; starts = new int[subReaders.length + 1]; // build starts array for (int i = 0; i < subReaders.length; i++) { starts[i] = maxDoc; maxDoc += subReaders[i].maxDoc(); // compute maxDocs if (subReaders[i].hasDeletions()) hasDeletions = true; } starts[subReaders.length] = maxDoc; if (!readOnly) { maxIndexVersion = SegmentInfos.readCurrentVersion(directory); } } public final synchronized Object clone() { try { return clone(readOnly); // Preserve current readOnly } catch (Exception ex) { throw new RuntimeException(ex); } } public final synchronized IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException { DirectoryReader newReader = doReopen((SegmentInfos) segmentInfos.clone(), true, openReadOnly); if (this != newReader) { newReader.deletionPolicy = deletionPolicy; } newReader.writer = writer; // If we're cloning a non-readOnly reader, move the // writeLock (if there is one) to the new reader: if (!openReadOnly && writeLock != null) { // In near real-time search, reader is always readonly assert writer == null; newReader.writeLock = writeLock; newReader.hasChanges = hasChanges; newReader.hasDeletions = hasDeletions; writeLock = null; hasChanges = false; } return newReader; } public final IndexReader reopen() throws CorruptIndexException, IOException { // Preserve current readOnly return doReopen(readOnly, null); } public final IndexReader reopen(boolean openReadOnly) throws CorruptIndexException, IOException { return doReopen(openReadOnly, null); } public final IndexReader reopen(final IndexCommit commit) throws CorruptIndexException, IOException { return doReopen(true, commit); } private final IndexReader doReopenFromWriter(boolean openReadOnly, IndexCommit commit) throws CorruptIndexException, IOException { assert readOnly; if (!openReadOnly) { throw new IllegalArgumentException("a reader obtained from IndexWriter.getReader() can only be reopened with openReadOnly=true (got false)"); } if (commit != null) { throw new IllegalArgumentException("a reader obtained from IndexWriter.getReader() cannot currently accept a commit"); } // TODO: right now we *always* make a new reader; in // the future we could have write make some effort to // detect that no changes have occurred return writer.getReader(); } private IndexReader doReopen(final boolean openReadOnly, IndexCommit 
commit) throws CorruptIndexException, IOException { ensureOpen(); assert commit == null || openReadOnly; // If we were obtained by writer.getReader(), re-ask the // writer to get a new reader. if (writer != null) { return doReopenFromWriter(openReadOnly, commit); } else { return doReopenNoWriter(openReadOnly, commit); } } private synchronized IndexReader doReopenNoWriter(final boolean openReadOnly, IndexCommit commit) throws CorruptIndexException, IOException { if (commit == null) { if (hasChanges) { // We have changes, which means we are not readOnly: assert readOnly == false; // and we hold the write lock: assert writeLock != null; // so no other writer holds the write lock, which // means no changes could have been done to the index: assert isCurrent(); if (openReadOnly) { return (IndexReader) clone(openReadOnly); } else { return this; } } else if (isCurrent()) { if (openReadOnly != readOnly) { // Just fallback to clone return (IndexReader) clone(openReadOnly); } else { return this; } } } else { if (directory != commit.getDirectory()) throw new IOException("the specified commit does not match the specified Directory"); if (segmentInfos != null && commit.getSegmentsFileName().equals(segmentInfos.getCurrentSegmentFileName())) { if (readOnly != openReadOnly) { // Just fallback to clone return (IndexReader) clone(openReadOnly); } else { return this; } } } return (IndexReader) new SegmentInfos.FindSegmentsFile(directory) { protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { SegmentInfos infos = new SegmentInfos(); infos.read(directory, segmentFileName); return doReopen(infos, false, openReadOnly); } }.run(commit); } private synchronized DirectoryReader doReopen(SegmentInfos infos, boolean doClone, boolean openReadOnly) throws CorruptIndexException, IOException { DirectoryReader reader; if (openReadOnly) { reader = new ReadOnlyDirectoryReader(directory, infos, subReaders, starts, normsCache, doClone, termInfosIndexDivisor); } else { reader = new DirectoryReader(directory, infos, subReaders, starts, normsCache, false, doClone, termInfosIndexDivisor); } reader.setDisableFakeNorms(getDisableFakeNorms()); return reader; } /** Version number when this IndexReader was opened. 
*/ public long getVersion() { ensureOpen(); return segmentInfos.getVersion(); } public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].getTermFreqVectors(n - starts[i]); // dispatch to segment } public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].getTermFreqVector(n - starts[i], field); } public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); int i = readerIndex(docNumber); // find segment num subReaders[i].getTermFreqVector(docNumber - starts[i], field, mapper); } public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); int i = readerIndex(docNumber); // find segment num subReaders[i].getTermFreqVector(docNumber - starts[i], mapper); } /** * Checks is the index is optimized (if it has a single segment and no deletions) * @return true if the index is optimized; false otherwise */ public boolean isOptimized() { ensureOpen(); return segmentInfos.size() == 1 && !hasDeletions(); } public int numDocs() { // Don't call ensureOpen() here (it could affect performance) // NOTE: multiple threads may wind up init'ing // numDocs... but that's harmless if (numDocs == -1) { // check cache int n = 0; // cache miss--recompute for (int i = 0; i < subReaders.length; i++) n += subReaders[i].numDocs(); // sum from readers numDocs = n; } return numDocs; } public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } // inherit javadoc public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); int i = readerIndex(n); // find segment num return subReaders[i].document(n - starts[i], fieldSelector); // dispatch to segment reader } public boolean isDeleted(int n) { // Don't call ensureOpen() here (it could affect performance) final int i = readerIndex(n); // find segment num return subReaders[i].isDeleted(n - starts[i]); // dispatch to segment reader } public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } protected void doDelete(int n) throws CorruptIndexException, IOException { numDocs = -1; // invalidate cache int i = readerIndex(n); // find segment num subReaders[i].deleteDocument(n - starts[i]); // dispatch to segment reader hasDeletions = true; } protected void doUndeleteAll() throws CorruptIndexException, IOException { for (int i = 0; i < subReaders.length; i++) subReaders[i].undeleteAll(); hasDeletions = false; numDocs = -1; // invalidate cache } private int readerIndex(int n) { // find reader for doc n: return readerIndex(n, this.starts, this.subReaders.length); } final static int readerIndex(int n, int[] starts, int numSubReaders) { // find reader for doc n: int lo = 0; // search starts array int hi = numSubReaders - 1; // for first element less while (hi >= lo) { int mid = (lo + hi) >>> 1; int midValue = starts[mid]; if (n < midValue) hi = mid - 1; else if (n > midValue) lo = mid + 1; else { // found a match while (mid+1 < numSubReaders && starts[mid+1] == midValue) { mid++; // scan to last match } return mid; } } return hi; } public boolean hasNorms(String field) throws IOException { ensureOpen(); for (int i = 0; i < subReaders.length; i++) { if (subReaders[i].hasNorms(field)) return true; } return false; } private byte[] ones; private 
byte[] fakeNorms() { if (ones==null) ones=SegmentReader.createFakeNorms(maxDoc()); return ones; } public synchronized byte[] norms(String field) throws IOException { ensureOpen(); byte[] bytes = (byte[])normsCache.get(field); if (bytes != null) return bytes; // cache hit if (!hasNorms(field)) return getDisableFakeNorms() ? null : fakeNorms(); bytes = new byte[maxDoc()]; for (int i = 0; i < subReaders.length; i++) subReaders[i].norms(field, bytes, starts[i]); normsCache.put(field, bytes); // update cache return bytes; } public synchronized void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); byte[] bytes = (byte[])normsCache.get(field); if (bytes==null && !hasNorms(field)) { Arrays.fill(result, offset, result.length, DefaultSimilarity.encodeNorm(1.0f)); } else if (bytes != null) { // cache hit System.arraycopy(bytes, 0, result, offset, maxDoc()); } else { for (int i = 0; i < subReaders.length; i++) { // read from segments subReaders[i].norms(field, result, offset + starts[i]); } } } protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { synchronized (normsCache) { normsCache.remove(field); // clear cache } int i = readerIndex(n); // find segment num subReaders[i].setNorm(n-starts[i], field, value); // dispatch } public TermEnum terms() throws IOException { ensureOpen(); return new MultiTermEnum(this, subReaders, starts, null); } public TermEnum terms(Term term) throws IOException { ensureOpen(); return new MultiTermEnum(this, subReaders, starts, term); } public int docFreq(Term t) throws IOException { ensureOpen(); int total = 0; // sum freqs in segments for (int i = 0; i < subReaders.length; i++) total += subReaders[i].docFreq(t); return total; } public TermDocs termDocs() throws IOException { ensureOpen(); return new MultiTermDocs(this, subReaders, starts); } public TermPositions termPositions() throws IOException { ensureOpen(); return new MultiTermPositions(this, subReaders, starts); } /** * Tries to acquire the WriteLock on this directory. this method is only valid if this IndexReader is directory * owner. * * @throws StaleReaderException if the index has changed since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws org.apache.lucene.store.LockObtainFailedException * if another writer has this index open (write.lock could not be * obtained) * @throws IOException if there is a low-level IO error */ protected void acquireWriteLock() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { if (readOnly) { // NOTE: we should not reach this code w/ the core // IndexReader classes; however, an external subclass // of IndexReader could reach this. ReadOnlySegmentReader.noWrite(); } if (segmentInfos != null) { ensureOpen(); if (stale) throw new StaleReaderException("IndexReader out of date and no longer valid for delete, undelete, or setNorm operations"); if (writeLock == null) { Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME); if (!writeLock.obtain(IndexWriter.WRITE_LOCK_TIMEOUT)) // obtain write lock throw new LockObtainFailedException("Index locked for write: " + writeLock); this.writeLock = writeLock; // we have to check whether index has changed since this reader was opened. 
// if so, this reader is no longer valid for // deletion if (SegmentInfos.readCurrentVersion(directory) > maxIndexVersion) { stale = true; this.writeLock.release(); this.writeLock = null; throw new StaleReaderException("IndexReader out of date and no longer valid for delete, undelete, or setNorm operations"); } } } } /** @deprecated */ protected void doCommit() throws IOException { doCommit(null); } /** * Commit changes resulting from delete, undeleteAll, or setNorm operations *

    * If an exception is hit, then either no changes or all changes will have been committed to the index (transactional * semantics). * * @throws IOException if there is a low-level IO error */ protected void doCommit(Map commitUserData) throws IOException { if (hasChanges) { segmentInfos.setUserData(commitUserData); // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: IndexFileDeleter deleter = new IndexFileDeleter(directory, deletionPolicy == null ? new KeepOnlyLastCommitDeletionPolicy() : deletionPolicy, segmentInfos, null, null, synced); segmentInfos.updateGeneration(deleter.getLastSegmentInfos()); // Checkpoint the state we are about to change, in // case we have to roll back: startCommit(); boolean success = false; try { for (int i = 0; i < subReaders.length; i++) subReaders[i].commit(); // Sync all files we just wrote Iterator it = segmentInfos.files(directory, false).iterator(); while (it.hasNext()) { final String fileName = (String) it.next(); if (!synced.contains(fileName)) { assert directory.fileExists(fileName); directory.sync(fileName); synced.add(fileName); } } segmentInfos.commit(directory); success = true; } finally { if (!success) { // Rollback changes that were made to // SegmentInfos but failed to get [fully] // committed. This way this reader instance // remains consistent (matched to what's // actually in the index): rollbackCommit(); // Recompute deletable files & remove them (so // partially written .del files, etc, are // removed): deleter.refresh(); } } // Have the deleter remove any now unreferenced // files due to this commit: deleter.checkpoint(segmentInfos, true); deleter.close(); maxIndexVersion = segmentInfos.getVersion(); if (writeLock != null) { writeLock.release(); // release write lock writeLock = null; } } hasChanges = false; } void startCommit() { rollbackHasChanges = hasChanges; for (int i = 0; i < subReaders.length; i++) { subReaders[i].startCommit(); } } void rollbackCommit() { hasChanges = rollbackHasChanges; for (int i = 0; i < subReaders.length; i++) { subReaders[i].rollbackCommit(); } } public Map getCommitUserData() { ensureOpen(); return segmentInfos.getUserData(); } public boolean isCurrent() throws CorruptIndexException, IOException { ensureOpen(); if (writer == null || writer.isClosed()) { // we loaded SegmentInfos from the directory return SegmentInfos.readCurrentVersion(directory) == segmentInfos.getVersion(); } else { return writer.nrtIsCurrent(segmentInfosStart); } } protected synchronized void doClose() throws IOException { IOException ioe = null; normsCache = null; for (int i = 0; i < subReaders.length; i++) { // try to close each reader, even if an exception is thrown try { subReaders[i].decRef(); } catch (IOException e) { if (ioe == null) ioe = e; } } // NOTE: only needed in case someone had asked for // FieldCache for top-level reader (which is generally // not a good idea): FieldCache.DEFAULT.purge(this); // throw the first exception if (ioe != null) throw ioe; } public Collection getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); return getFieldNames(fieldNames, this.subReaders); } static Collection getFieldNames (IndexReader.FieldOption fieldNames, IndexReader[] subReaders) { // maintain a unique set of field names Set fieldSet = new HashSet(); for (int i = 0; i < subReaders.length; i++) { IndexReader reader = subReaders[i]; Collection names = reader.getFieldNames(fieldNames); fieldSet.addAll(names); } return fieldSet; } public IndexReader[] getSequentialSubReaders() { return 
subReaders; } public void setDisableFakeNorms(boolean disableFakeNorms) { super.setDisableFakeNorms(disableFakeNorms); for (int i = 0; i < subReaders.length; i++) subReaders[i].setDisableFakeNorms(disableFakeNorms); } /** Returns the directory this index resides in. */ public Directory directory() { // Don't ensureOpen here -- in certain cases, when a // cloned/reopened reader needs to commit, it may call // this method on the closed original reader return directory; } public int getTermInfosIndexDivisor() { return termInfosIndexDivisor; } /** * Expert: return the IndexCommit that this reader has opened. *

     * <p>
     * WARNING: this API is new and experimental and may suddenly change.
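     * <p>A small usage sketch (the <code>reader</code> variable is assumed to be an
     * already-open reader, and exception handling is omitted):
     * <pre>
     *   IndexCommit commit = reader.getIndexCommit();
     *   String segmentsFile = commit.getSegmentsFileName();   // the segments_N file of this commit
     *   Collection files = commit.getFileNames();             // all index files referenced by it
     * </pre>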
    */ public IndexCommit getIndexCommit() throws IOException { return new ReaderCommit(segmentInfos, directory); } /** @see org.apache.lucene.index.IndexReader#listCommits */ public static Collection listCommits(Directory dir) throws IOException { final String[] files = dir.listAll(); Collection commits = new ArrayList(); SegmentInfos latest = new SegmentInfos(); latest.read(dir); final long currentGen = latest.getGeneration(); commits.add(new ReaderCommit(latest, dir)); for(int i=0;i 0) { next(); } } public boolean next() throws IOException { for (int i=0; iTerm enumerations are always ordered by Term.compareTo(). Each term in the enumeration is greater than all that precede it. */ public abstract class TermEnum { /** Increments the enumeration to the next element. True if one exists.*/ public abstract boolean next() throws IOException; /** Returns the current Term in the enumeration.*/ public abstract Term term(); /** Returns the docFreq of the current Term in the enumeration.*/ public abstract int docFreq(); /** Closes the enumeration to further activity, freeing resources. */ public abstract void close() throws IOException; /** Skips terms to the first beyond the current whose value is * greater or equal to target.

     * <p>Returns true iff there is such
     * an entry.
     * <p>Behaves as if written: <pre>
     *   public boolean skipTo(Term target) {
     *     do {
     *       if (!next())
     *         return false;
     *     } while (target > term());
     *     return true;
     *   }
     *  </pre>
    * Some implementations *could* be considerably more efficient than a linear scan. * Check the implementation to be sure. * @deprecated This method is not performant and will be removed in Lucene 3.0. * Use {@link IndexReader#terms(Term)} to create a new TermEnum positioned at a * given term. */ public boolean skipTo(Term target) throws IOException { do { if (!next()) return false; } while (target.compareTo(term()) > 0); return true; } } lucene-2.9.4/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java0000644000175000017500000000577211474320230027122 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; // TODO FI: some of this is "generic" to TermsHash* so we // should factor it out so other consumers don't have to // duplicate this code /** Used by DocumentsWriter to merge the postings from * multiple ThreadStates when creating a segment */ final class FreqProxFieldMergeState { final FreqProxTermsWriterPerField field; final int numPostings; final CharBlockPool charPool; final RawPostingList[] postings; private FreqProxTermsWriter.PostingList p; char[] text; int textOffset; private int postingUpto = -1; final ByteSliceReader freq = new ByteSliceReader(); final ByteSliceReader prox = new ByteSliceReader(); int docID; int termFreq; public FreqProxFieldMergeState(FreqProxTermsWriterPerField field) { this.field = field; this.charPool = field.perThread.termsHashPerThread.charPool; this.numPostings = field.termsHashPerField.numPostings; this.postings = field.termsHashPerField.sortPostings(); } boolean nextTerm() throws IOException { postingUpto++; if (postingUpto == numPostings) return false; p = (FreqProxTermsWriter.PostingList) postings[postingUpto]; docID = 0; text = charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; textOffset = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; field.termsHashPerField.initReader(freq, p, 0); if (!field.fieldInfo.omitTermFreqAndPositions) field.termsHashPerField.initReader(prox, p, 1); // Should always be true boolean result = nextDoc(); assert result; return true; } public boolean nextDoc() throws IOException { if (freq.eof()) { if (p.lastDocCode != -1) { // Return last doc docID = p.lastDocID; if (!field.omitTermFreqAndPositions) termFreq = p.docFreq; p.lastDocCode = -1; return true; } else // EOF return false; } final int code = freq.readVInt(); if (field.omitTermFreqAndPositions) docID += code; else { docID += code >>> 1; if ((code & 1) != 0) termFreq = 1; else termFreq = freq.readVInt(); } assert docID != p.lastDocID; return true; } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentMergeInfo.java0000644000175000017500000000434311474320230025616 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the 
Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; final class SegmentMergeInfo { Term term; int base; int ord; // the position of the segment in a MultiReader TermEnum termEnum; IndexReader reader; int delCount; private TermPositions postings; // use getPositions() private int[] docMap; // use getDocMap() SegmentMergeInfo(int b, TermEnum te, IndexReader r) throws IOException { base = b; reader = r; termEnum = te; term = te.term(); } // maps around deleted docs int[] getDocMap() { if (docMap == null) { delCount = 0; // build array which maps document numbers around deletions if (reader.hasDeletions()) { int maxDoc = reader.maxDoc(); docMap = new int[maxDoc]; int j = 0; for (int i = 0; i < maxDoc; i++) { if (reader.isDeleted(i)) { delCount++; docMap[i] = -1; } else docMap[i] = j++; } } } return docMap; } TermPositions getPositions() throws IOException { if (postings == null) { postings = reader.termPositions(); } return postings; } final boolean next() throws IOException { if (termEnum.next()) { term = termEnum.term(); return true; } else { term = null; return false; } } final void close() throws IOException { termEnum.close(); if (postings != null) { postings.close(); } } } lucene-2.9.4/src/java/org/apache/lucene/index/IndexCommitPoint.java0000644000175000017500000000330011474320230025642 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collection; import java.io.IOException; /** * @deprecated Please subclass IndexCommit class instead */ public interface IndexCommitPoint { /** * Get the segments file (segments_N) associated * with this commit point. */ public String getSegmentsFileName(); /** * Returns all index files referenced by this commit point. */ public Collection getFileNames() throws IOException; /** * Delete this commit point. *

    * Upon calling this, the writer is notified that this commit * point should be deleted. *

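     * <p>For illustration, the body of a deletion policy's
     * {@link IndexDeletionPolicy#onCommit onCommit()} that keeps only the newest commit
     * might look like this sketch (it mirrors what KeepOnlyLastCommitDeletionPolicy does;
     * the parameter name is an assumption):
     * <pre>
     *   public void onCommit(List commits) {
     *     // delete every commit point except the most recent one
     *     for (int i = commits.size() - 2; i >= 0; i--) {
     *       ((IndexCommitPoint) commits.get(i)).delete();
     *     }
     *   }
     * </pre>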
    * Decision that a commit-point should be deleted is taken by the {@link IndexDeletionPolicy} in effect * and therefore this should only be called by its {@link IndexDeletionPolicy#onInit onInit()} or * {@link IndexDeletionPolicy#onCommit onCommit()} methods. */ public void delete(); } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsTermsWriter.java0000644000175000017500000000425011474320230027604 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; final class FormatPostingsTermsWriter extends FormatPostingsTermsConsumer { final FormatPostingsFieldsWriter parent; final FormatPostingsDocsWriter docsWriter; final TermInfosWriter termsOut; FieldInfo fieldInfo; FormatPostingsTermsWriter(SegmentWriteState state, FormatPostingsFieldsWriter parent) throws IOException { super(); this.parent = parent; termsOut = parent.termsOut; docsWriter = new FormatPostingsDocsWriter(state, this); } void setField(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; docsWriter.setField(fieldInfo); } char[] currentTerm; int currentTermStart; long freqStart; long proxStart; /** Adds a new term in this field */ FormatPostingsDocsConsumer addTerm(char[] text, int start) { currentTerm = text; currentTermStart = start; // TODO: this is abstraction violation -- ideally this // terms writer is not so "invasive", looking for file // pointers in its child consumers. freqStart = docsWriter.out.getFilePointer(); if (docsWriter.posWriter.out != null) proxStart = docsWriter.posWriter.out.getFilePointer(); parent.skipListWriter.resetSkip(); return docsWriter; } /** Called when we are done adding terms to this field */ void finish() { } void close() throws IOException { docsWriter.close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/StaleReaderException.java0000644000175000017500000000256711474320230026500 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ package org.apache.lucene.index; import java.io.IOException; /** * This exception is thrown when an {@link IndexReader} * tries to make changes to the index (via {@link * IndexReader#deleteDocument}, {@link * IndexReader#undeleteAll} or {@link IndexReader#setNorm}) * but changes have already been committed to the index * since this reader was instantiated. When this happens * you must open a new reader on the current index to make * the changes. */ public class StaleReaderException extends IOException { public StaleReaderException(String message) { super(message); } } lucene-2.9.4/src/java/org/apache/lucene/index/AbstractAllTermDocs.java0000644000175000017500000000456411474320230026262 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.index; import java.io.IOException; /** Base class for enumerating all but deleted docs. * *

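     * <p>A concrete subclass only has to say which documents are deleted; a minimal,
     * purely illustrative sketch over an IndexReader could be (the class name is made up):
     * <pre>
     *   class ReaderAllTermDocs extends AbstractAllTermDocs {
     *     private final IndexReader reader;
     *     ReaderAllTermDocs(IndexReader reader) {
     *       super(reader.maxDoc());
     *       this.reader = reader;
     *     }
     *     public boolean isDeleted(int doc) {
     *       return reader.isDeleted(doc);
     *     }
     *   }
     * </pre>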
    NOTE: this class is meant only to be used internally * by Lucene; it's only public so it can be shared across * packages. This means the API is freely subject to * change, and, the class could be removed entirely, in any * Lucene release. Use directly at your own risk! */ public abstract class AbstractAllTermDocs implements TermDocs { protected int maxDoc; protected int doc = -1; protected AbstractAllTermDocs(int maxDoc) { this.maxDoc = maxDoc; } public void seek(Term term) throws IOException { if (term==null) { doc = -1; } else { throw new UnsupportedOperationException(); } } public void seek(TermEnum termEnum) throws IOException { throw new UnsupportedOperationException(); } public int doc() { return doc; } public int freq() { return 1; } public boolean next() throws IOException { return skipTo(doc+1); } public int read(int[] docs, int[] freqs) throws IOException { final int length = docs.length; int i = 0; while (i < length && doc < maxDoc) { if (!isDeleted(doc)) { docs[i] = doc; freqs[i] = 1; ++i; } doc++; } return i; } public boolean skipTo(int target) throws IOException { doc = target; while (doc < maxDoc) { if (!isDeleted(doc)) { return true; } doc++; } return false; } public void close() throws IOException { } public abstract boolean isDeleted(int doc); }lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java0000644000175000017500000002050511474320230030516 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.UnicodeUtil; final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField { final TermVectorsTermsWriterPerThread perThread; final TermsHashPerField termsHashPerField; final TermVectorsTermsWriter termsWriter; final FieldInfo fieldInfo; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; boolean doVectors; boolean doVectorPositions; boolean doVectorOffsets; int maxNumPostings; OffsetAttribute offsetAttribute = null; public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) { this.termsHashPerField = termsHashPerField; this.perThread = perThread; this.termsWriter = perThread.termsWriter; this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; } int getStreamCount() { return 2; } boolean start(Fieldable[] fields, int count) { doVectors = false; doVectorPositions = false; doVectorOffsets = false; for(int i=0;i= 0; if (!doVectors || numPostings == 0) return; if (numPostings > maxNumPostings) maxNumPostings = numPostings; final IndexOutput tvf = perThread.doc.perDocTvf; // This is called once, after inverting all occurrences // of a given field in the doc. At this point we flush // our hash into the DocWriter. assert fieldInfo.storeTermVector; assert perThread.vectorFieldsInOrder(fieldInfo); perThread.doc.addField(termsHashPerField.fieldInfo.number); final RawPostingList[] postings = termsHashPerField.sortPostings(); tvf.writeVInt(numPostings); byte bits = 0x0; if (doVectorPositions) bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR; if (doVectorOffsets) bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR; tvf.writeByte(bits); int encoderUpto = 0; int lastTermBytesCount = 0; final ByteSliceReader reader = perThread.vectorSliceReader; final char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers; for(int j=0;j> DocumentsWriter.CHAR_BLOCK_SHIFT]; final int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK; // We swap between two encoders to save copying // last Term's byte array final UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto]; // TODO: we could do this incrementally UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result); final int termBytesCount = utf8Result.length; // TODO: UTF16toUTF8 could tell us this prefix // Compute common prefix between last term and // this term int prefix = 0; if (j > 0) { final byte[] lastTermBytes = perThread.utf8Results[1-encoderUpto].result; final byte[] termBytes = perThread.utf8Results[encoderUpto].result; while(prefix < lastTermBytesCount && prefix < termBytesCount) { if (lastTermBytes[prefix] != termBytes[prefix]) break; prefix++; } } encoderUpto = 1-encoderUpto; lastTermBytesCount = termBytesCount; final int suffix = termBytesCount - prefix; tvf.writeVInt(prefix); tvf.writeVInt(suffix); tvf.writeBytes(utf8Result.result, prefix, suffix); tvf.writeVInt(freq); if (doVectorPositions) { termsHashPerField.initReader(reader, posting, 0); reader.writeTo(tvf); } if (doVectorOffsets) { termsHashPerField.initReader(reader, posting, 1); reader.writeTo(tvf); } } termsHashPerField.reset(); // NOTE: we clear, per-field, at the thread level, // because term vectors fully write themselves on each // field; this saves RAM (eg if large 
doc has two large // fields w/ term vectors on) because we recycle/reuse // all RAM after each field: perThread.termsHashPerThread.reset(false); } void shrinkHash() { termsHashPerField.shrinkHash(maxNumPostings); maxNumPostings = 0; } void start(Fieldable f) { if (doVectorOffsets) { offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class); } else { offsetAttribute = null; } } void newTerm(RawPostingList p0) { assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start"); TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0; p.freq = 1; if (doVectorOffsets) { int startOffset = fieldState.offset + offsetAttribute.startOffset();; int endOffset = fieldState.offset + offsetAttribute.endOffset(); termsHashPerField.writeVInt(1, startOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; } if (doVectorPositions) { termsHashPerField.writeVInt(0, fieldState.position); p.lastPosition = fieldState.position; } } void addTerm(RawPostingList p0) { assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start"); TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0; p.freq++; if (doVectorOffsets) { int startOffset = fieldState.offset + offsetAttribute.startOffset();; int endOffset = fieldState.offset + offsetAttribute.endOffset(); termsHashPerField.writeVInt(1, startOffset - p.lastOffset); termsHashPerField.writeVInt(1, endOffset - startOffset); p.lastOffset = endOffset; } if (doVectorPositions) { termsHashPerField.writeVInt(0, fieldState.position - p.lastPosition); p.lastPosition = fieldState.position; } } void skippingLongTerm() {} } lucene-2.9.4/src/java/org/apache/lucene/index/package.html0000644000175000017500000000174511474320230024041 0ustar janpascaljanpascal Code to maintain and access indices. lucene-2.9.4/src/java/org/apache/lucene/index/FieldSortedTermVectorMapper.java0000644000175000017500000000423011474320230027777 0ustar janpascaljanpascalpackage org.apache.lucene.index; import java.util.*; /** * Copyright 2007 The Apache Software Foundation *

    * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

    * http://www.apache.org/licenses/LICENSE-2.0 *

    * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * For each Field, store a sorted collection of {@link TermVectorEntry}s *

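     * <p>A typical usage sketch (the reader, the document number and the choice of
     * comparator are assumptions for illustration; exception handling is omitted):
     * <pre>
     *   FieldSortedTermVectorMapper mapper =
     *       new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
     *   reader.getTermFreqVector(docNumber, mapper);
     *   Map fieldToTerms = mapper.getFieldToTerms();   // field name to SortedSet of TermVectorEntry
     * </pre>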
    * This is not thread-safe. */ public class FieldSortedTermVectorMapper extends TermVectorMapper{ private Map fieldToTerms = new HashMap(); private SortedSet currentSet; private String currentField; private Comparator comparator; /** * * @param comparator A Comparator for sorting {@link TermVectorEntry}s */ public FieldSortedTermVectorMapper(Comparator comparator) { this(false, false, comparator); } public FieldSortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) { super(ignoringPositions, ignoringOffsets); this.comparator = comparator; } public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = new TermVectorEntry(currentField, term, frequency, offsets, positions); currentSet.add(entry); } public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { currentSet = new TreeSet(comparator); currentField = field; fieldToTerms.put(field, currentSet); } /** * Get the mapping between fields and terms, sorted by the comparator * * @return A map between field names and {@link java.util.SortedSet}s per field. SortedSet entries are {@link TermVectorEntry} */ public Map getFieldToTerms() { return fieldToTerms; } public Comparator getComparator() { return comparator; } } lucene-2.9.4/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java0000644000175000017500000001067611474320230030351 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * For each Field, store position by position information. It ignores frequency information *

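     * <p>Usage is analogous to the other mappers; a sketch (the reader, document number
     * and field name are assumptions, exception handling omitted), noting that the field
     * must have been indexed with positions in its term vectors:
     * <pre>
     *   PositionBasedTermVectorMapper mapper = new PositionBasedTermVectorMapper();
     *   reader.getTermFreqVector(docNumber, "body", mapper);
     *   Map fieldToTerms = mapper.getFieldToTerms();   // field name to (position to TVPositionInfo)
     * </pre>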
    * This is not thread-safe. */ public class PositionBasedTermVectorMapper extends TermVectorMapper{ private Map/*>*/ fieldToTerms; private String currentField; /** * A Map of Integer and TVPositionInfo */ private Map/**/ currentPositions; private boolean storeOffsets; /** * * */ public PositionBasedTermVectorMapper() { super(false, false); } public PositionBasedTermVectorMapper(boolean ignoringOffsets) { super(false, ignoringOffsets); } /** * Never ignores positions. This mapper doesn't make much sense unless there are positions * @return false */ public boolean isIgnoringPositions() { return false; } /** * Callback for the TermVectorReader. * @param term * @param frequency * @param offsets * @param positions */ public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { for (int i = 0; i < positions.length; i++) { Integer posVal = new Integer(positions[i]); TVPositionInfo pos = (TVPositionInfo) currentPositions.get(posVal); if (pos == null) { pos = new TVPositionInfo(positions[i], storeOffsets); currentPositions.put(posVal, pos); } pos.addTerm(term, offsets != null ? offsets[i] : null); } } /** * Callback mechanism used by the TermVectorReader * @param field The field being read * @param numTerms The number of terms in the vector * @param storeOffsets Whether offsets are available * @param storePositions Whether positions are available */ public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { if (storePositions == false) { throw new RuntimeException("You must store positions in order to use this Mapper"); } if (storeOffsets == true) { //ignoring offsets } fieldToTerms = new HashMap(numTerms); this.storeOffsets = storeOffsets; currentField = field; currentPositions = new HashMap(); fieldToTerms.put(currentField, currentPositions); } /** * Get the mapping between fields and terms, sorted by the comparator * * @return A map between field names and a Map. The sub-Map key is the position as the integer, the value is {@link org.apache.lucene.index.PositionBasedTermVectorMapper.TVPositionInfo}. */ public Map getFieldToTerms() { return fieldToTerms; } /** * Container for a term at a position */ public static class TVPositionInfo{ private int position; //a list of Strings private List terms; //A list of TermVectorOffsetInfo private List offsets; public TVPositionInfo(int position, boolean storeOffsets) { this.position = position; terms = new ArrayList(); if (storeOffsets) { offsets = new ArrayList(); } } void addTerm(String term, TermVectorOffsetInfo info) { terms.add(term); if (offsets != null) { offsets.add(info); } } /** * * @return The position of the term */ public int getPosition() { return position; } /** * Note, there may be multiple terms at the same position * @return A List of Strings */ public List getTerms() { return terms; } /** * Parallel list (to {@link #getTerms()}) of TermVectorOffsetInfo objects. There may be multiple entries since there may be multiple terms at a position * @return A List of TermVectorOffsetInfo objects, if offsets are store. */ public List getOffsets() { return offsets; } } } lucene-2.9.4/src/java/org/apache/lucene/index/DefaultSkipListWriter.java0000644000175000017500000001262411474320230026665 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
* The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.IndexOutput; /** * Implements the skip list writer for the default posting list format * that stores positions and payloads. * */ class DefaultSkipListWriter extends MultiLevelSkipListWriter { private int[] lastSkipDoc; private int[] lastSkipPayloadLength; private long[] lastSkipFreqPointer; private long[] lastSkipProxPointer; private IndexOutput freqOutput; private IndexOutput proxOutput; private int curDoc; private boolean curStorePayloads; private int curPayloadLength; private long curFreqPointer; private long curProxPointer; DefaultSkipListWriter(int skipInterval, int numberOfSkipLevels, int docCount, IndexOutput freqOutput, IndexOutput proxOutput) { super(skipInterval, numberOfSkipLevels, docCount); this.freqOutput = freqOutput; this.proxOutput = proxOutput; lastSkipDoc = new int[numberOfSkipLevels]; lastSkipPayloadLength = new int[numberOfSkipLevels]; lastSkipFreqPointer = new long[numberOfSkipLevels]; lastSkipProxPointer = new long[numberOfSkipLevels]; } void setFreqOutput(IndexOutput freqOutput) { this.freqOutput = freqOutput; } void setProxOutput(IndexOutput proxOutput) { this.proxOutput = proxOutput; } /** * Sets the values for the current skip data. */ void setSkipData(int doc, boolean storePayloads, int payloadLength) { this.curDoc = doc; this.curStorePayloads = storePayloads; this.curPayloadLength = payloadLength; this.curFreqPointer = freqOutput.getFilePointer(); if (proxOutput != null) this.curProxPointer = proxOutput.getFilePointer(); } protected void resetSkip() { super.resetSkip(); Arrays.fill(lastSkipDoc, 0); Arrays.fill(lastSkipPayloadLength, -1); // we don't have to write the first length in the skip list Arrays.fill(lastSkipFreqPointer, freqOutput.getFilePointer()); if (proxOutput != null) Arrays.fill(lastSkipProxPointer, proxOutput.getFilePointer()); } protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { // To efficiently store payloads in the posting lists we do not store the length of // every payload. Instead we omit the length for a payload if the previous payload had // the same length. // However, in order to support skipping the payload length at every skip point must be known. // So we use the same length encoding that we use for the posting lists for the skip data as well: // Case 1: current field does not store payloads // SkipDatum --> DocSkip, FreqSkip, ProxSkip // DocSkip,FreqSkip,ProxSkip --> VInt // DocSkip records the document number before every SkipInterval th document in TermFreqs. // Document numbers are represented as differences from the previous value in the sequence. // Case 2: current field stores payloads // SkipDatum --> DocSkip, PayloadLength?, FreqSkip,ProxSkip // DocSkip,FreqSkip,ProxSkip --> VInt // PayloadLength --> VInt // In this case DocSkip/2 is the difference between // the current and the previous value. 
If DocSkip // is odd, then a PayloadLength encoded as VInt follows, // if DocSkip is even, then it is assumed that the // current payload length equals the length at the previous // skip point if (curStorePayloads) { int delta = curDoc - lastSkipDoc[level]; if (curPayloadLength == lastSkipPayloadLength[level]) { // the current payload length equals the length at the previous skip point, // so we don't store the length again skipBuffer.writeVInt(delta * 2); } else { // the payload length is different from the previous one. We shift the DocSkip, // set the lowest bit and store the current payload length as VInt. skipBuffer.writeVInt(delta * 2 + 1); skipBuffer.writeVInt(curPayloadLength); lastSkipPayloadLength[level] = curPayloadLength; } } else { // current field does not store payloads skipBuffer.writeVInt(curDoc - lastSkipDoc[level]); } skipBuffer.writeVInt((int) (curFreqPointer - lastSkipFreqPointer[level])); skipBuffer.writeVInt((int) (curProxPointer - lastSkipProxPointer[level])); lastSkipDoc[level] = curDoc; //System.out.println("write doc at level " + level + ": " + curDoc); lastSkipFreqPointer[level] = curFreqPointer; lastSkipProxPointer[level] = curProxPointer; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermInfosReader.java0000644000175000017500000002167511474320230025460 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.Directory; import org.apache.lucene.util.cache.Cache; import org.apache.lucene.util.cache.SimpleLRUCache; import org.apache.lucene.util.CloseableThreadLocal; /** This stores a monotonically increasing set of pairs in a * Directory. Pairs are accessed either by Term or by ordinal position the * set. 
*/ final class TermInfosReader { private final Directory directory; private final String segment; private final FieldInfos fieldInfos; private final CloseableThreadLocal threadResources = new CloseableThreadLocal(); private final SegmentTermEnum origEnum; private final long size; private final Term[] indexTerms; private final TermInfo[] indexInfos; private final long[] indexPointers; private final int totalIndexInterval; private final static int DEFAULT_CACHE_SIZE = 1024; /** * Per-thread resources managed by ThreadLocal */ private static final class ThreadResources { SegmentTermEnum termEnum; // Used for caching the least recently looked-up Terms Cache termInfoCache; } TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor) throws CorruptIndexException, IOException { boolean success = false; if (indexDivisor < 1 && indexDivisor != -1) { throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor); } try { directory = dir; segment = seg; fieldInfos = fis; origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION, readBufferSize), fieldInfos, false); size = origEnum.size; if (indexDivisor != -1) { // Load terms index totalIndexInterval = origEnum.indexInterval * indexDivisor; final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION, readBufferSize), fieldInfos, true); try { int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index indexTerms = new Term[indexSize]; indexInfos = new TermInfo[indexSize]; indexPointers = new long[indexSize]; for (int i = 0; indexEnum.next(); i++) { indexTerms[i] = indexEnum.term(); indexInfos[i] = indexEnum.termInfo(); indexPointers[i] = indexEnum.indexPointer; for (int j = 1; j < indexDivisor; j++) if (!indexEnum.next()) break; } } finally { indexEnum.close(); } } else { // Do not load terms index: totalIndexInterval = -1; indexTerms = null; indexInfos = null; indexPointers = null; } success = true; } finally { // With lock-less commits, it's entirely possible (and // fine) to hit a FileNotFound exception above. In // this case, we want to explicitly close any subset // of things that were opened so that we don't have to // wait for a GC to do so. if (!success) { close(); } } } public int getSkipInterval() { return origEnum.skipInterval; } public int getMaxSkipLevels() { return origEnum.maxSkipLevels; } final void close() throws IOException { if (origEnum != null) origEnum.close(); threadResources.close(); } /** Returns the number of term/value pairs in the set. 
*/ final long size() { return size; } private ThreadResources getThreadResources() { ThreadResources resources = (ThreadResources)threadResources.get(); if (resources == null) { resources = new ThreadResources(); resources.termEnum = terms(); // Cache does not have to be thread-safe, it is only used by one thread at the same time resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE); threadResources.set(resources); } return resources; } /** Returns the offset of the greatest index entry which is less than or equal to term.*/ private final int getIndexOffset(Term term) { int lo = 0; // binary search indexTerms[] int hi = indexTerms.length - 1; while (hi >= lo) { int mid = (lo + hi) >>> 1; int delta = term.compareTo(indexTerms[mid]); if (delta < 0) hi = mid - 1; else if (delta > 0) lo = mid + 1; else return mid; } return hi; } private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException { enumerator.seek(indexPointers[indexOffset], ((long) indexOffset * totalIndexInterval) - 1, indexTerms[indexOffset], indexInfos[indexOffset]); } /** Returns the TermInfo for a Term in the set, or null. */ TermInfo get(Term term) throws IOException { return get(term, true); } /** Returns the TermInfo for a Term in the set, or null. */ private TermInfo get(Term term, boolean useCache) throws IOException { if (size == 0) return null; ensureIndexIsRead(); TermInfo ti; ThreadResources resources = getThreadResources(); Cache cache = null; if (useCache) { cache = resources.termInfoCache; // check the cache first if the term was recently looked up ti = (TermInfo) cache.get(term); if (ti != null) { return ti; } } // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum enumerator = resources.termEnum; if (enumerator.term() != null // term is at or past current && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) || term.compareTo(enumerator.term()) >= 0)) { int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; if (indexTerms.length == enumOffset // but before end of block || term.compareTo(indexTerms[enumOffset]) < 0) { // no need to seek int numScans = enumerator.scanTo(term); if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (cache != null && numScans > 1) { // we only want to put this TermInfo into the cache if // scanEnum skipped more than one dictionary entry. // This prevents RangeQueries or WildcardQueries to // wipe out the cache when they iterate over a large numbers // of terms in order cache.put(term, ti); } } else { ti = null; } return ti; } } // random-access: must seek seekEnum(enumerator, getIndexOffset(term)); enumerator.scanTo(term); if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) { ti = enumerator.termInfo(); if (cache != null) { cache.put(term, ti); } } else { ti = null; } return ti; } private void ensureIndexIsRead() { if (indexTerms == null) { throw new IllegalStateException("terms index was not loaded when this reader was created"); } } /** Returns the position of a Term in the set or -1. 
*/ final long getPosition(Term term) throws IOException { if (size == 0) return -1; ensureIndexIsRead(); int indexOffset = getIndexOffset(term); SegmentTermEnum enumerator = getThreadResources().termEnum; seekEnum(enumerator, indexOffset); while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} if (term.compareTo(enumerator.term()) == 0) return enumerator.position; else return -1; } /** Returns an enumeration of all the Terms and TermInfos in the set. */ public SegmentTermEnum terms() { return (SegmentTermEnum)origEnum.clone(); } /** Returns an enumeration of terms starting at or after the named term. */ public SegmentTermEnum terms(Term term) throws IOException { // don't use the cache in this call because we want to reposition the // enumeration get(term, false); return (SegmentTermEnum)getThreadResources().termEnum.clone(); } } lucene-2.9.4/src/java/org/apache/lucene/index/TermDocs.java0000644000175000017500000000602011474320230024132 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** TermDocs provides an interface for enumerating <document, frequency> pairs for a term.

    The document portion names each document containing the term. Documents are indicated by number. The frequency portion gives the number of times the term occurred in each document.

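    For example, a typical consumer looks like this sketch (the reader, the field name
    and the term text are assumptions; exception handling is omitted):
    <pre>
      TermDocs termDocs = reader.termDocs(new Term("field", "value"));
      try {
        while (termDocs.next()) {
          int doc = termDocs.doc();     // document containing the term
          int freq = termDocs.freq();   // occurrences of the term in that document
        }
      } finally {
        termDocs.close();
      }
    </pre>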
    The pairs are ordered by document number. @see IndexReader#termDocs() */ public interface TermDocs { /** Sets this to the data for a term. * The enumeration is reset to the start of the data for this term. */ void seek(Term term) throws IOException; /** Sets this to the data for the current term in a {@link TermEnum}. * This may be optimized in some implementations. */ void seek(TermEnum termEnum) throws IOException; /** Returns the current document number.

    This is invalid until {@link #next()} is called for the first time.*/ int doc(); /** Returns the frequency of the term within the current document.

    This is invalid until {@link #next()} is called for the first time.*/ int freq(); /** Moves to the next pair in the enumeration.

    Returns true iff there is such a next pair in the enumeration. */ boolean next() throws IOException; /** Attempts to read multiple entries from the enumeration, up to length of * docs. Document numbers are stored in docs, and term * frequencies are stored in freqs. The freqs array must be as * long as the docs array. * *

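     * <p>A bulk-read loop therefore looks roughly like the following sketch (the buffer
     * size is an arbitrary choice, exception handling omitted):
     * <pre>
     *   int[] docs = new int[64];
     *   int[] freqs = new int[64];
     *   int count;
     *   while ((count = termDocs.read(docs, freqs)) != 0) {
     *     for (int i = 0; i != count; i++) {
     *       // process docs[i] and freqs[i]
     *     }
     *   }
     * </pre>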
    Returns the number of entries read. Zero is only returned when the * stream has been exhausted. */ int read(int[] docs, int[] freqs) throws IOException; /** Skips entries to the first beyond the current whose document number is * greater than or equal to target.

     * <p>Returns true iff there is such
     * an entry.
     * <p>Behaves as if written: <pre>
     *   boolean skipTo(int target) {
     *     do {
     *       if (!next())
     *         return false;
     *     } while (target > doc());
     *     return true;
     *   }
     *  </pre>
    * Some implementations are considerably more efficient than that. */ boolean skipTo(int target) throws IOException; /** Frees associated resources. */ void close() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/DefaultSkipListReader.java0000644000175000017500000000735411474320230026617 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.IndexInput; /** * Implements the skip list reader for the default posting list format * that stores positions and payloads. * */ class DefaultSkipListReader extends MultiLevelSkipListReader { private boolean currentFieldStoresPayloads; private long freqPointer[]; private long proxPointer[]; private int payloadLength[]; private long lastFreqPointer; private long lastProxPointer; private int lastPayloadLength; DefaultSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { super(skipStream, maxSkipLevels, skipInterval); freqPointer = new long[maxSkipLevels]; proxPointer = new long[maxSkipLevels]; payloadLength = new int[maxSkipLevels]; } void init(long skipPointer, long freqBasePointer, long proxBasePointer, int df, boolean storesPayloads) { super.init(skipPointer, df); this.currentFieldStoresPayloads = storesPayloads; lastFreqPointer = freqBasePointer; lastProxPointer = proxBasePointer; Arrays.fill(freqPointer, freqBasePointer); Arrays.fill(proxPointer, proxBasePointer); Arrays.fill(payloadLength, 0); } /** Returns the freq pointer of the doc to which the last call of * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ long getFreqPointer() { return lastFreqPointer; } /** Returns the prox pointer of the doc to which the last call of * {@link MultiLevelSkipListReader#skipTo(int)} has skipped. */ long getProxPointer() { return lastProxPointer; } /** Returns the payload length of the payload stored just before * the doc to which the last call of {@link MultiLevelSkipListReader#skipTo(int)} * has skipped. */ int getPayloadLength() { return lastPayloadLength; } protected void seekChild(int level) throws IOException { super.seekChild(level); freqPointer[level] = lastFreqPointer; proxPointer[level] = lastProxPointer; payloadLength[level] = lastPayloadLength; } protected void setLastSkipData(int level) { super.setLastSkipData(level); lastFreqPointer = freqPointer[level]; lastProxPointer = proxPointer[level]; lastPayloadLength = payloadLength[level]; } protected int readSkipData(int level, IndexInput skipStream) throws IOException { int delta; if (currentFieldStoresPayloads) { // the current field stores payloads. 
// if the doc delta is odd then we have // to read the current payload length // because it differs from the length of the // previous payload delta = skipStream.readVInt(); if ((delta & 1) != 0) { payloadLength[level] = skipStream.readVInt(); } delta >>>= 1; } else { delta = skipStream.readVInt(); } freqPointer[level] += skipStream.readVInt(); proxPointer[level] += skipStream.readVInt(); return delta; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorEntryFreqSortedComparator.java0000644000175000017500000000250611474320230031562 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Comparator; /** * Compares {@link org.apache.lucene.index.TermVectorEntry}s first by frequency and then by * the term (case-sensitive) * **/ public class TermVectorEntryFreqSortedComparator implements Comparator { public int compare(Object object, Object object1) { int result = 0; TermVectorEntry entry = (TermVectorEntry) object; TermVectorEntry entry1 = (TermVectorEntry) object1; result = entry1.getFrequency() - entry.getFrequency(); if (result == 0) { result = entry.getTerm().compareTo(entry1.getTerm()); if (result == 0) { result = entry.getField().compareTo(entry1.getField()); } } return result; } } lucene-2.9.4/src/java/org/apache/lucene/index/ByteSliceReader.java0000644000175000017500000001056511474320230025431 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import java.io.IOException; /* IndexInput that knows how to read the byte slices written * by Posting and PostingVector. 
We read the bytes in * each slice until we hit the end of that slice at which * point we read the forwarding address of the next slice * and then jump to it.*/ final class ByteSliceReader extends IndexInput { ByteBlockPool pool; int bufferUpto; byte[] buffer; public int upto; int limit; int level; public int bufferOffset; public int endIndex; public void init(ByteBlockPool pool, int startIndex, int endIndex) { assert endIndex-startIndex >= 0; assert startIndex >= 0; assert endIndex >= 0; this.pool = pool; this.endIndex = endIndex; level = 0; bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE; bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE; buffer = pool.buffers[bufferUpto]; upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK; final int firstSize = ByteBlockPool.levelSizeArray[0]; if (startIndex+firstSize >= endIndex) { // There is only this one slice to read limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK; } else limit = upto+firstSize-4; } public boolean eof() { assert upto + bufferOffset <= endIndex; return upto + bufferOffset == endIndex; } public byte readByte() { assert !eof(); assert upto <= limit; if (upto == limit) nextSlice(); return buffer[upto++]; } public long writeTo(IndexOutput out) throws IOException { long size = 0; while(true) { if (limit + bufferOffset == endIndex) { assert endIndex - bufferOffset >= upto; out.writeBytes(buffer, upto, limit-upto); size += limit-upto; break; } else { out.writeBytes(buffer, upto, limit-upto); size += limit-upto; nextSlice(); } } return size; } public void nextSlice() { // Skip to our next slice final int nextIndex = ((buffer[limit]&0xff)<<24) + ((buffer[1+limit]&0xff)<<16) + ((buffer[2+limit]&0xff)<<8) + (buffer[3+limit]&0xff); level = ByteBlockPool.nextLevelArray[level]; final int newSize = ByteBlockPool.levelSizeArray[level]; bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE; bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE; buffer = pool.buffers[bufferUpto]; upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK; if (nextIndex + newSize >= endIndex) { // We are advancing to the final slice assert endIndex - nextIndex > 0; limit = endIndex - bufferOffset; } else { // This is not the final slice (subtract 4 for the // forwarding address at the end of this new slice) limit = upto+newSize-4; } } public void readBytes(byte[] b, int offset, int len) { while(len > 0) { final int numLeft = limit-upto; if (numLeft < len) { // Read entire slice System.arraycopy(buffer, upto, b, offset, numLeft); offset += numLeft; len -= numLeft; nextSlice(); } else { // This slice is the last one System.arraycopy(buffer, upto, b, offset, len); upto += len; break; } } } public long getFilePointer() {throw new RuntimeException("not implemented");} public long length() {throw new RuntimeException("not implemented");} public void seek(long pos) {throw new RuntimeException("not implemented");} public void close() {throw new RuntimeException("not implemented");} } lucene-2.9.4/src/java/org/apache/lucene/index/NormsWriterPerThread.java0000644000175000017500000000271711474320230026515 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ final class NormsWriterPerThread extends InvertedDocEndConsumerPerThread { final NormsWriter normsWriter; final DocumentsWriter.DocState docState; public NormsWriterPerThread(DocInverterPerThread docInverterPerThread, NormsWriter normsWriter) { this.normsWriter = normsWriter; docState = docInverterPerThread.docState; } InvertedDocEndConsumerPerField addField(DocInverterPerField docInverterPerField, final FieldInfo fieldInfo) { return new NormsWriterPerField(docInverterPerField, this, fieldInfo); } void abort() {} void startDocument() {} void finishDocument() {} boolean freeRAM() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/index/MergePolicy.java0000644000175000017500000002227511474320230024643 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import java.io.IOException; import java.util.List; import java.util.ArrayList; import java.util.Set; /** *

 * <p>Expert: a MergePolicy determines the sequence of
 * primitive merge operations to be used for overall merge
 * and optimize operations.</p>
 *
 * <p>Whenever the segments in an index have been altered by
 * {@link IndexWriter}, either the addition of a newly
 * flushed segment, addition of many segments from
 * addIndexes* calls, or a previous merge that may now need
 * to cascade, {@link IndexWriter} invokes {@link
 * #findMerges} to give the MergePolicy a chance to pick
 * merges that are now required.  This method returns a
 * {@link MergeSpecification} instance describing the set of
 * merges that should be done, or null if no merges are
 * necessary.  When IndexWriter.optimize is called, it calls
 * {@link #findMergesForOptimize} and the MergePolicy should
 * then return the necessary merges.</p>
 *
 * <p>Note that the policy can return more than one merge at
 * a time.  In this case, if the writer is using {@link
 * SerialMergeScheduler}, the merges will be run
 * sequentially but if it is using {@link
 * ConcurrentMergeScheduler} they will be run concurrently.</p>
 *
 * <p>The default MergePolicy is {@link
 * LogByteSizeMergePolicy}.</p>
 *
 * <p>NOTE: This API is new and still experimental
 * (subject to change suddenly in the next release)</p>
 *
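 * <p>A hypothetical usage sketch (not part of the original javadoc): an
 * application never calls findMerges itself; it installs a policy on the
 * writer and lets {@link IndexWriter} drive it.  The variables dir and
 * analyzer, and the particular setter values, are assumptions:</p>
 *
 * <pre>
 *   IndexWriter writer = new IndexWriter(dir, analyzer,
 *                                        IndexWriter.MaxFieldLength.LIMITED);
 *   LogByteSizeMergePolicy policy = new LogByteSizeMergePolicy(writer);
 *   policy.setMergeFactor(20);      // merge 20 segments per level
 *   policy.setMaxMergeMB(512.0);    // never merge a segment larger than 512 MB
 *   writer.setMergePolicy(policy);  // the writer now calls findMerges as needed
 * </pre>
 *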
    NOTE: This class typically requires access to * package-private APIs (e.g. SegmentInfos) to do its job; * if you implement your own MergePolicy, you'll need to put * it in package org.apache.lucene.index in order to use * these APIs. */ public abstract class MergePolicy { /** OneMerge provides the information necessary to perform * an individual primitive merge operation, resulting in * a single new segment. The merge spec includes the * subset of segments to be merged as well as whether the * new segment should use the compound file format. */ public static class OneMerge { SegmentInfo info; // used by IndexWriter boolean mergeDocStores; // used by IndexWriter boolean optimize; // used by IndexWriter boolean registerDone; // used by IndexWriter long mergeGen; // used by IndexWriter boolean isExternal; // used by IndexWriter int maxNumSegmentsOptimize; // used by IndexWriter SegmentReader[] readers; // used by IndexWriter SegmentReader[] readersClone; // used by IndexWriter final SegmentInfos segments; final boolean useCompoundFile; boolean aborted; Throwable error; volatile boolean mergeDone; // used by IndexWriter public OneMerge(SegmentInfos segments, boolean useCompoundFile) { if (0 == segments.size()) throw new RuntimeException("segments must include at least one segment"); this.segments = segments; this.useCompoundFile = useCompoundFile; } /** Record that an exception occurred while executing * this merge */ synchronized void setException(Throwable error) { this.error = error; } /** Retrieve previous exception set by {@link * #setException}. */ synchronized Throwable getException() { return error; } /** Mark this merge as aborted. If this is called * before the merge is committed then the merge will * not be committed. */ synchronized void abort() { aborted = true; } /** Returns true if this merge was aborted. */ synchronized boolean isAborted() { return aborted; } synchronized void checkAborted(Directory dir) throws MergeAbortedException { if (aborted) throw new MergeAbortedException("merge is aborted: " + segString(dir)); } String segString(Directory dir) { StringBuffer b = new StringBuffer(); final int numSegments = segments.size(); for(int i=0;i 0) b.append(' '); b.append(segments.info(i).segString(dir)); } if (info != null) b.append(" into ").append(info.name); if (optimize) b.append(" [optimize]"); if (mergeDocStores) { b.append(" [mergeDocStores]"); } return b.toString(); } } /** * A MergeSpecification instance provides the information * necessary to perform multiple merges. It simply * contains a list of {@link OneMerge} instances. */ public static class MergeSpecification { /** * The subset of segments to be included in the primitive merge. */ public List merges = new ArrayList(); public void add(OneMerge merge) { merges.add(merge); } public String segString(Directory dir) { StringBuffer b = new StringBuffer(); b.append("MergeSpec:\n"); final int count = merges.size(); for(int i=0;iThis class implements a {@link MergePolicy} that tries * to merge segments into levels of exponentially * increasing size, where each level has fewer segments than * the value of the merge factor. Whenever extra segments * (beyond the merge factor upper bound) are encountered, * all segments within the level are merged. You can get or * set the merge factor using {@link #getMergeFactor()} and * {@link #setMergeFactor(int)} respectively.

 *
 * <p>This class is abstract and requires a subclass to
 * define the {@link #size} method which specifies how a
 * segment's size is determined.  {@link LogDocMergePolicy}
 * is one subclass that measures size by document count in
 * the segment.  {@link LogByteSizeMergePolicy} is another
 * subclass that measures size as the total byte size of the
 * file(s) for the segment.</p>
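 *
 * <p>Illustrative sketch (an assumption, not taken from the original
 * source): a concrete subclass only needs to say how a segment's size is
 * measured, for example by its delete-adjusted document count:</p>
 *
 * <pre>
 *   class DocCountMergePolicy extends LogMergePolicy {
 *     DocCountMergePolicy(IndexWriter writer) { super(writer); }
 *     protected long size(SegmentInfo info) throws IOException {
 *       return sizeDocs(info);  // doc count, calibrated by deletes if enabled
 *     }
 *   }
 * </pre>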
    */ public abstract class LogMergePolicy extends MergePolicy { /** Defines the allowed range of log(size) for each * level. A level is computed by taking the max segment * log size, minus LEVEL_LOG_SPAN, and finding all * segments falling within that range. */ public static final double LEVEL_LOG_SPAN = 0.75; /** Default merge factor, which is how many segments are * merged at a time */ public static final int DEFAULT_MERGE_FACTOR = 10; /** Default maximum segment size. A segment of this size * or larger will never be merged. @see setMaxMergeDocs */ public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE; /** Default noCFSRatio. If a merge's size is >= 10% of * the index, then we disable compound file for it. * @see #setNoCFSRatio */ public static final double DEFAULT_NO_CFS_RATIO = 0.1; private int mergeFactor = DEFAULT_MERGE_FACTOR; long minMergeSize; long maxMergeSize; int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS; protected double noCFSRatio = DEFAULT_NO_CFS_RATIO; /* TODO 3.0: change this default to true */ protected boolean calibrateSizeByDeletes = false; private boolean useCompoundFile = true; private boolean useCompoundDocStore = true; public LogMergePolicy(IndexWriter writer) { super(writer); } protected boolean verbose() { return writer != null && writer.verbose(); } /** @see #setNoCFSRatio */ public double getNoCFSRatio() { return noCFSRatio; } /** If a merged segment will be more than this percentage * of the total size of the index, leave the segment as * non-compound file even if compound file is enabled. * Set to 1.0 to always use CFS regardless of merge * size. */ public void setNoCFSRatio(double noCFSRatio) { if (noCFSRatio < 0.0 || noCFSRatio > 1.0) { throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio); } this.noCFSRatio = noCFSRatio; } private void message(String message) { if (verbose()) writer.message("LMP: " + message); } /**

 * <p>Returns the number of segments that are merged at
 * once and also controls the total number of segments
 * allowed to accumulate in the index.</p>
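 *
 * <p>As a worked illustration (not part of the original javadoc): with a
 * merge factor of 10, every 10 newly flushed (level-0) segments are merged
 * into one level-1 segment, every 10 level-1 segments into one level-2
 * segment, and so on, so the index carries at most roughly 10 segments per
 * level at any time.</p>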
    */ public int getMergeFactor() { return mergeFactor; } /** Determines how often segment indices are merged by * addDocument(). With smaller values, less RAM is used * while indexing, and searches on unoptimized indices are * faster, but indexing speed is slower. With larger * values, more RAM is used during indexing, and while * searches on unoptimized indices are slower, indexing is * faster. Thus larger values (> 10) are best for batch * index creation, and smaller values (< 10) for indices * that are interactively maintained. */ public void setMergeFactor(int mergeFactor) { if (mergeFactor < 2) throw new IllegalArgumentException("mergeFactor cannot be less than 2"); this.mergeFactor = mergeFactor; } // Javadoc inherited public boolean useCompoundFile(SegmentInfos infos, SegmentInfo info) { return useCompoundFile; } /** Sets whether compound file format should be used for * newly flushed and newly merged segments. */ public void setUseCompoundFile(boolean useCompoundFile) { this.useCompoundFile = useCompoundFile; } /** Returns true if newly flushed and newly merge segments * are written in compound file format. @see * #setUseCompoundFile */ public boolean getUseCompoundFile() { return useCompoundFile; } // Javadoc inherited public boolean useCompoundDocStore(SegmentInfos infos) { return useCompoundDocStore; } /** Sets whether compound file format should be used for * newly flushed and newly merged doc store * segment files (term vectors and stored fields). */ public void setUseCompoundDocStore(boolean useCompoundDocStore) { this.useCompoundDocStore = useCompoundDocStore; } /** Returns true if newly flushed and newly merge doc * store segment files (term vectors and stored fields) * are written in compound file format. @see * #setUseCompoundDocStore */ public boolean getUseCompoundDocStore() { return useCompoundDocStore; } /** Sets whether the segment size should be calibrated by * the number of deletes when choosing segments for merge. */ public void setCalibrateSizeByDeletes(boolean calibrateSizeByDeletes) { this.calibrateSizeByDeletes = calibrateSizeByDeletes; } /** Returns true if the segment size should be calibrated * by the number of deletes when choosing segments for merge. */ public boolean getCalibrateSizeByDeletes() { return calibrateSizeByDeletes; } public void close() {} abstract protected long size(SegmentInfo info) throws IOException; protected long sizeDocs(SegmentInfo info) throws IOException { if (calibrateSizeByDeletes) { int delCount = writer.numDeletedDocs(info); return (info.docCount - (long)delCount); } else { return info.docCount; } } protected long sizeBytes(SegmentInfo info) throws IOException { long byteSize = info.sizeInBytes(); if (calibrateSizeByDeletes) { int delCount = writer.numDeletedDocs(info); float delRatio = (info.docCount <= 0 ? 0.0f : ((float)delCount / (float)info.docCount)); return (info.docCount <= 0 ? byteSize : (long)(byteSize * (1.0f - delRatio))); } else { return byteSize; } } private boolean isOptimized(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException { final int numSegments = infos.size(); int numToOptimize = 0; SegmentInfo optimizeInfo = null; for(int i=0;i 0; return !hasDeletions && !info.hasSeparateNorms() && info.dir == writer.getDirectory() && (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0); } /** Returns the merges necessary to optimize the index. 
* This merge policy defines "optimized" to mean only one * segment in the index, where that segment has no * deletions pending nor separate norms, and it is in * compound file format if the current useCompoundFile * setting is true. This method returns multiple merges * (mergeFactor at a time) so the {@link MergeScheduler} * in use may make use of concurrency. */ public MergeSpecification findMergesForOptimize(SegmentInfos infos, int maxNumSegments, Set segmentsToOptimize) throws IOException { MergeSpecification spec; assert maxNumSegments > 0; if (!isOptimized(infos, maxNumSegments, segmentsToOptimize)) { // Find the newest (rightmost) segment that needs to // be optimized (other segments may have been flushed // since optimize started): int last = infos.size(); while(last > 0) { final SegmentInfo info = infos.info(--last); if (segmentsToOptimize.contains(info)) { last++; break; } } if (last > 0) { spec = new MergeSpecification(); // First, enroll all "full" merges (size // mergeFactor) to potentially be run concurrently: while (last - maxNumSegments + 1 >= mergeFactor) { spec.add(makeOneMerge(infos, infos.range(last-mergeFactor, last))); last -= mergeFactor; } // Only if there are no full merges pending do we // add a final partial (< mergeFactor segments) merge: if (0 == spec.merges.size()) { if (maxNumSegments == 1) { // Since we must optimize down to 1 segment, the // choice is simple: if (last > 1 || !isOptimized(infos.info(0))) spec.add(makeOneMerge(infos, infos.range(0, last))); } else if (last > maxNumSegments) { // Take care to pick a partial merge that is // least cost, but does not make the index too // lopsided. If we always just picked the // partial tail then we could produce a highly // lopsided index over time: // We must merge this many segments to leave // maxNumSegments in the index (from when // optimize was first kicked off): final int finalMergeSize = last - maxNumSegments + 1; // Consider all possible starting points: long bestSize = 0; int bestStart = 0; for(int i=0;i 0) { if (verbose()) message(" segment " + info.name + " has deletions"); if (firstSegmentWithDeletions == -1) firstSegmentWithDeletions = i; else if (i - firstSegmentWithDeletions == mergeFactor) { // We've seen mergeFactor segments in a row with // deletions, so force a merge now: if (verbose()) message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive"); spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i))); firstSegmentWithDeletions = i; } } else if (firstSegmentWithDeletions != -1) { // End of a sequence of segments with deletions, so, // merge those past segments even if it's fewer than // mergeFactor segments if (verbose()) message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive"); spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i))); firstSegmentWithDeletions = -1; } } if (firstSegmentWithDeletions != -1) { if (verbose()) message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive"); spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, numSegments))); } return spec; } /** Checks if any merges are now necessary and returns a * {@link MergePolicy.MergeSpecification} if so. A merge * is necessary when there are more than {@link * #setMergeFactor} segments at a given level. When * multiple levels have too many segments, this method * will return multiple merges, allowing the {@link * MergeScheduler} to use concurrency. 
*/ public MergeSpecification findMerges(SegmentInfos infos) throws IOException { final int numSegments = infos.size(); if (verbose()) message("findMerges: " + numSegments + " segments"); // Compute levels, which is just log (base mergeFactor) // of the size of each segment float[] levels = new float[numSegments]; final float norm = (float) Math.log(mergeFactor); for(int i=0;i maxLevel) maxLevel = level; } // Now search backwards for the rightmost segment that // falls into this level: float levelBottom; if (maxLevel < levelFloor) // All remaining segments fall into the min level levelBottom = -1.0F; else { levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN); // Force a boundary at the level floor if (levelBottom < levelFloor && maxLevel >= levelFloor) levelBottom = levelFloor; } int upto = numSegments-1; while(upto >= start) { if (levels[upto] >= levelBottom) { break; } upto--; } if (verbose()) message(" level " + levelBottom + " to " + maxLevel + ": " + (1+upto-start) + " segments"); // Finally, record all merges that are viable at this level: int end = start + mergeFactor; while(end <= 1+upto) { boolean anyTooLarge = false; for(int i=start;i= maxMergeSize || sizeDocs(info) >= maxMergeDocs); } if (!anyTooLarge) { if (spec == null) spec = new MergeSpecification(); if (verbose()) message(" " + start + " to " + end + ": add this merge"); spec.add(makeOneMerge(infos, infos.range(start, end))); } else if (verbose()) message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping"); start = end; end = start + mergeFactor; } start = 1+upto; } return spec; } protected OneMerge makeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge) throws IOException { final boolean doCFS; if (!useCompoundFile) { doCFS = false; } else if (noCFSRatio == 1.0) { doCFS = true; } else { long totSize = 0; for(int i=0;iDetermines the largest segment (measured by * document count) that may be merged with other segments. * Small values (e.g., less than 10,000) are best for * interactive indexing, as this limits the length of * pauses while indexing to a few seconds. Larger values * are best for batched indexing and speedier * searches.

 *
 * <p>The default value is {@link Integer#MAX_VALUE}.</p>
 *
 * <p>The default merge policy ({@link
 * LogByteSizeMergePolicy}) also allows you to set this
 * limit by net size (in MB) of the segment, using
 * {@link LogByteSizeMergePolicy#setMaxMergeMB}.</p>
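 *
 * <p>Hypothetical sketch (not part of the original source) showing both
 * ways of capping segment size; the writer variable and the chosen limits
 * are assumptions:</p>
 *
 * <pre>
 *   LogByteSizeMergePolicy policy = new LogByteSizeMergePolicy(writer);
 *   policy.setMaxMergeDocs(100000);  // never merge a segment of over 100k docs
 *   policy.setMaxMergeMB(256.0);     // ...nor one over ~256 MB of index data
 * </pre>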
    */ public void setMaxMergeDocs(int maxMergeDocs) { this.maxMergeDocs = maxMergeDocs; } /** Returns the largest segment (measured by document * count) that may be merged with other segments. * @see #setMaxMergeDocs */ public int getMaxMergeDocs() { return maxMergeDocs; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocConsumer.java0000644000175000017500000000235611474320230024643 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; abstract class DocConsumer { abstract DocConsumerPerThread addThread(DocumentsWriterThreadState perThread) throws IOException; abstract void flush(final Collection threads, final SegmentWriteState state) throws IOException; abstract void closeDocStore(final SegmentWriteState state) throws IOException; abstract void abort(); abstract boolean freeRAM(); } lucene-2.9.4/src/java/org/apache/lucene/index/DocumentsWriter.java0000644000175000017500000015062511474320230025563 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.PrintStream; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Weight; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMFile; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.Constants; /** * This class accepts multiple added documents and directly * writes a single segment file. 
It does this more * efficiently than creating a single segment per document * (with DocumentWriter) and doing standard merges on those * segments. * * Each added document is passed to the {@link DocConsumer}, * which in turn processes the document and interacts with * other consumers in the indexing chain. Certain * consumers, like {@link StoredFieldsWriter} and {@link * TermVectorsTermsWriter}, digest a document and * immediately write bytes to the "doc store" files (ie, * they do not consume RAM per document, except while they * are processing the document). * * Other consumers, eg {@link FreqProxTermsWriter} and * {@link NormsWriter}, buffer bytes in RAM and flush only * when a new segment is produced. * Once we have used our allowed RAM buffer, or the number * of added docs is large enough (in the case we are * flushing by doc count instead of RAM usage), we create a * real segment and flush it to the Directory. * * Threads: * * Multiple threads are allowed into addDocument at once. * There is an initial synchronized call to getThreadState * which allocates a ThreadState for this thread. The same * thread will get the same ThreadState over time (thread * affinity) so that if there are consistent patterns (for * example each thread is indexing a different content * source) then we make better use of RAM. Then * processDocument is called on that ThreadState without * synchronization (most of the "heavy lifting" is in this * call). Finally the synchronized "finishDocument" is * called to flush changes to the directory. * * When flush is called by IndexWriter, or, we flush * internally when autoCommit=false, we forcefully idle all * threads and flush only once they are all idle. This * means you can call flush with a given thread even while * other threads are actively adding/deleting documents. * * * Exceptions: * * Because this class directly updates in-memory posting * lists, and flushes stored fields and term vectors * directly to files in the directory, there are certain * limited times when an exception can corrupt this state. * For example, a disk full while flushing stored fields * leaves this file in a corrupt state. Or, an OOM * exception while appending to the in-memory posting lists * can corrupt that posting list. We call such exceptions * "aborting exceptions". In these cases we must call * abort() to discard all docs added since the last flush. * * All other exceptions ("non-aborting exceptions") can * still partially update the index structures. These * updates are consistent, but, they represent only a part * of the document seen up until the exception was hit. * When this happens, we immediately mark the document as * deleted so that the document is always atomically ("all * or none") added to the index. 
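 *
 * The following sketch (an assumption about typical usage, not part of the
 * original comment) shows how an application picks between the two flush
 * triggers described above through the public IndexWriter API, rather than
 * touching this package-private class; dir and analyzer are assumed:
 *
 *   IndexWriter w = new IndexWriter(dir, analyzer,
 *                                   IndexWriter.MaxFieldLength.LIMITED);
 *   w.setRAMBufferSizeMB(32.0);                            // flush once ~32 MB of RAM is used
 *   w.setMaxBufferedDocs(IndexWriter.DISABLE_AUTO_FLUSH);  // do not also flush by doc count
 *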
*/ final class DocumentsWriter { IndexWriter writer; Directory directory; String segment; // Current segment we are working on private String docStoreSegment; // Current doc-store segment we are writing private int docStoreOffset; // Current starting doc-store offset of current segment private int nextDocID; // Next docID to be added private int numDocsInRAM; // # docs buffered in RAM int numDocsInStore; // # docs written to doc stores // Max # ThreadState instances; if there are more threads // than this they share ThreadStates private final static int MAX_THREAD_STATE = 5; private DocumentsWriterThreadState[] threadStates = new DocumentsWriterThreadState[0]; private final HashMap threadBindings = new HashMap(); private int pauseThreads; // Non-zero when we need all threads to // pause (eg to flush) boolean flushPending; // True when a thread has decided to flush boolean bufferIsFull; // True when it's time to write segment private boolean aborting; // True if an abort is pending private DocFieldProcessor docFieldProcessor; PrintStream infoStream; int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; Similarity similarity; List newFiles; static class DocState { DocumentsWriter docWriter; Analyzer analyzer; int maxFieldLength; PrintStream infoStream; Similarity similarity; int docID; Document doc; String maxTermPrefix; // deprecated boolean allowMinus1Position; // Only called by asserts public boolean testPoint(String name) { return docWriter.writer.testPoint(name); } public void clear() { // don't hold onto doc nor analyzer, in case it is // largish: doc = null; analyzer = null; } } /** Consumer returns this on each doc. This holds any * state that must be flushed synchronized "in docID * order". We gather these and flush them in order. */ abstract static class DocWriter { DocWriter next; int docID; abstract void finish() throws IOException; abstract void abort(); abstract long sizeInBytes(); void setNext(DocWriter next) { this.next = next; } } /** * Create and return a new DocWriterBuffer. */ PerDocBuffer newPerDocBuffer() { return new PerDocBuffer(); } /** * RAMFile buffer for DocWriters. */ class PerDocBuffer extends RAMFile { /** * Allocate bytes used from shared pool. */ protected byte[] newBuffer(int size) { assert size == PER_DOC_BLOCK_SIZE; return perDocAllocator.getByteBlock(false); } /** * Recycle the bytes used. */ synchronized void recycle() { if (buffers.size() > 0) { setLength(0); // Recycle the blocks perDocAllocator.recycleByteBlocks(buffers); buffers.clear(); sizeInBytes = 0; assert numBuffers() == 0; } } } /** * The IndexingChain must define the {@link #getChain(DocumentsWriter)} method * which returns the DocConsumer that the DocumentsWriter calls to process the * documents. 
*/ abstract static class IndexingChain { abstract DocConsumer getChain(DocumentsWriter documentsWriter); } static final IndexingChain DefaultIndexingChain = new IndexingChain() { DocConsumer getChain(DocumentsWriter documentsWriter) { /* This is the current indexing chain: DocConsumer / DocConsumerPerThread --> code: DocFieldProcessor / DocFieldProcessorPerThread --> DocFieldConsumer / DocFieldConsumerPerThread / DocFieldConsumerPerField --> code: DocFieldConsumers / DocFieldConsumersPerThread / DocFieldConsumersPerField --> code: DocInverter / DocInverterPerThread / DocInverterPerField --> InvertedDocConsumer / InvertedDocConsumerPerThread / InvertedDocConsumerPerField --> code: TermsHash / TermsHashPerThread / TermsHashPerField --> TermsHashConsumer / TermsHashConsumerPerThread / TermsHashConsumerPerField --> code: FreqProxTermsWriter / FreqProxTermsWriterPerThread / FreqProxTermsWriterPerField --> code: TermVectorsTermsWriter / TermVectorsTermsWriterPerThread / TermVectorsTermsWriterPerField --> InvertedDocEndConsumer / InvertedDocConsumerPerThread / InvertedDocConsumerPerField --> code: NormsWriter / NormsWriterPerThread / NormsWriterPerField --> code: StoredFieldsWriter / StoredFieldsWriterPerThread / StoredFieldsWriterPerField */ // Build up indexing chain: final TermsHashConsumer termVectorsWriter = new TermVectorsTermsWriter(documentsWriter); final TermsHashConsumer freqProxWriter = new FreqProxTermsWriter(); final InvertedDocConsumer termsHash = new TermsHash(documentsWriter, true, freqProxWriter, new TermsHash(documentsWriter, false, termVectorsWriter, null)); final NormsWriter normsWriter = new NormsWriter(); final DocInverter docInverter = new DocInverter(termsHash, normsWriter); return new DocFieldProcessor(documentsWriter, docInverter); } }; final DocConsumer consumer; // Deletes done after the last flush; these are discarded // on abort private BufferedDeletes deletesInRAM = new BufferedDeletes(false); // Deletes done before the last flush; these are still // kept on abort private BufferedDeletes deletesFlushed = new BufferedDeletes(true); // The max number of delete terms that can be buffered before // they must be flushed to disk. private int maxBufferedDeleteTerms = IndexWriter.DEFAULT_MAX_BUFFERED_DELETE_TERMS; // How much RAM we can use before flushing. This is 0 if // we are flushing by doc count instead. private long ramBufferSize = (long) (IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB*1024*1024); private long waitQueuePauseBytes = (long) (ramBufferSize*0.1); private long waitQueueResumeBytes = (long) (ramBufferSize*0.05); // If we've allocated 5% over our RAM budget, we then // free down to 95% private long freeTrigger = (long) (IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB*1024*1024*1.05); private long freeLevel = (long) (IndexWriter.DEFAULT_RAM_BUFFER_SIZE_MB*1024*1024*0.95); // Flush @ this number of docs. If ramBufferSize is // non-zero we will flush by RAM usage instead. 
private int maxBufferedDocs = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; private int flushedDocCount; // How many docs already flushed to index synchronized void updateFlushedDocCount(int n) { flushedDocCount += n; } synchronized int getFlushedDocCount() { return flushedDocCount; } synchronized void setFlushedDocCount(int n) { flushedDocCount = n; } private boolean closed; DocumentsWriter(Directory directory, IndexWriter writer, IndexingChain indexingChain) throws IOException { this.directory = directory; this.writer = writer; this.similarity = writer.getSimilarity(); flushedDocCount = writer.maxDoc(); consumer = indexingChain.getChain(this); if (consumer instanceof DocFieldProcessor) { docFieldProcessor = (DocFieldProcessor) consumer; } } /** Returns true if any of the fields in the current * buffered docs have omitTermFreqAndPositions==false */ boolean hasProx() { return (docFieldProcessor != null) ? docFieldProcessor.fieldInfos.hasProx() : true; } /** If non-null, various details of indexing are printed * here. */ synchronized void setInfoStream(PrintStream infoStream) { this.infoStream = infoStream; for(int i=0;i= MAX_THREAD_STATE)) { state = minThreadState; state.numThreads++; } else { // Just create a new "private" thread state DocumentsWriterThreadState[] newArray = new DocumentsWriterThreadState[1+threadStates.length]; if (threadStates.length > 0) System.arraycopy(threadStates, 0, newArray, 0, threadStates.length); state = newArray[threadStates.length] = new DocumentsWriterThreadState(this); threadStates = newArray; } threadBindings.put(Thread.currentThread(), state); } // Next, wait until my thread state is idle (in case // it's shared with other threads) and for threads to // not be paused nor a flush pending: waitReady(state); // Allocate segment name if this is the first doc since // last flush: initSegmentName(false); state.isIdle = false; boolean success = false; try { state.docState.docID = nextDocID; assert writer.testPoint("DocumentsWriter.ThreadState.init start"); if (delTerm != null) { addDeleteTerm(delTerm, state.docState.docID); state.doFlushAfter = timeToFlushDeletes(); } assert writer.testPoint("DocumentsWriter.ThreadState.init after delTerm"); nextDocID++; numDocsInRAM++; // We must at this point commit to flushing to ensure we // always get N docs when we flush by doc count, even if // > 1 thread is adding documents: if (!flushPending && maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH && numDocsInRAM >= maxBufferedDocs) { flushPending = true; state.doFlushAfter = true; } success = true; } finally { if (!success) { // Forcefully idle this ThreadState: state.isIdle = true; notifyAll(); if (state.doFlushAfter) { state.doFlushAfter = false; flushPending = false; } } } return state; } /** Returns true if the caller (IndexWriter) should now * flush. 
*/ boolean addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { return updateDocument(doc, analyzer, null); } boolean updateDocument(Term t, Document doc, Analyzer analyzer) throws CorruptIndexException, IOException { return updateDocument(doc, analyzer, t); } boolean updateDocument(Document doc, Analyzer analyzer, Term delTerm) throws CorruptIndexException, IOException { // This call is synchronized but fast final DocumentsWriterThreadState state = getThreadState(doc, delTerm); final DocState docState = state.docState; docState.doc = doc; docState.analyzer = analyzer; boolean success = false; try { // This call is not synchronized and does all the // work final DocWriter perDoc; try { perDoc = state.consumer.processDocument(); } finally { docState.clear(); } // This call is synchronized but fast finishDocument(state, perDoc); success = true; } finally { if (!success) { synchronized(this) { if (aborting) { state.isIdle = true; notifyAll(); abort(); } else { skipDocWriter.docID = docState.docID; boolean success2 = false; try { waitQueue.add(skipDocWriter); success2 = true; } finally { if (!success2) { state.isIdle = true; notifyAll(); abort(); return false; } } state.isIdle = true; notifyAll(); // If this thread state had decided to flush, we // must clear it so another thread can flush if (state.doFlushAfter) { state.doFlushAfter = false; flushPending = false; notifyAll(); } // Immediately mark this document as deleted // since likely it was partially added. This // keeps indexing as "all or none" (atomic) when // adding a document: addDeleteDocID(state.docState.docID); } } } } return state.doFlushAfter || timeToFlushDeletes(); } // for testing synchronized int getNumBufferedDeleteTerms() { return deletesInRAM.numTerms; } // for testing synchronized Map getBufferedDeleteTerms() { return deletesInRAM.terms; } /** Called whenever a merge has completed and the merged segments had deletions */ synchronized void remapDeletes(SegmentInfos infos, int[][] docMaps, int[] delCounts, MergePolicy.OneMerge merge, int mergeDocCount) { if (docMaps == null) // The merged segments had no deletes so docIDs did not change and we have nothing to do return; MergeDocIDRemapper mapper = new MergeDocIDRemapper(infos, docMaps, delCounts, merge, mergeDocCount); deletesInRAM.remap(mapper, infos, docMaps, delCounts, merge, mergeDocCount); deletesFlushed.remap(mapper, infos, docMaps, delCounts, merge, mergeDocCount); flushedDocCount -= mapper.docShift; } synchronized private void waitReady(DocumentsWriterThreadState state) { while (!closed && ((state != null && !state.isIdle) || pauseThreads != 0 || flushPending || aborting)) { try { wait(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } if (closed) throw new AlreadyClosedException("this IndexWriter is closed"); } synchronized boolean bufferDeleteTerms(Term[] terms) throws IOException { waitReady(null); for (int i = 0; i < terms.length; i++) addDeleteTerm(terms[i], numDocsInRAM); return timeToFlushDeletes(); } synchronized boolean bufferDeleteTerm(Term term) throws IOException { waitReady(null); addDeleteTerm(term, numDocsInRAM); return timeToFlushDeletes(); } synchronized boolean bufferDeleteQueries(Query[] queries) throws IOException { waitReady(null); for (int i = 0; i < queries.length; i++) addDeleteQuery(queries[i], numDocsInRAM); return timeToFlushDeletes(); } synchronized boolean 
bufferDeleteQuery(Query query) throws IOException { waitReady(null); addDeleteQuery(query, numDocsInRAM); return timeToFlushDeletes(); } synchronized boolean deletesFull() { return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) || (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH && ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms)); } synchronized boolean doApplyDeletes() { // Very similar to deletesFull(), except we don't count // numBytesAlloc, because we are checking whether // deletes (alone) are consuming too many resources now // and thus should be applied. We apply deletes if RAM // usage is > 1/2 of our allowed RAM buffer, to prevent // too-frequent flushing of a long tail of tiny segments // when merges (which always apply deletes) are // infrequent. return (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed) >= ramBufferSize/2) || (maxBufferedDeleteTerms != IndexWriter.DISABLE_AUTO_FLUSH && ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms)); } synchronized private boolean timeToFlushDeletes() { return (bufferIsFull || deletesFull()) && setFlushPending(); } void setMaxBufferedDeleteTerms(int maxBufferedDeleteTerms) { this.maxBufferedDeleteTerms = maxBufferedDeleteTerms; } int getMaxBufferedDeleteTerms() { return maxBufferedDeleteTerms; } synchronized boolean hasDeletes() { return deletesFlushed.any(); } synchronized boolean applyDeletes(SegmentInfos infos) throws IOException { if (!hasDeletes()) return false; if (infoStream != null) message("apply " + deletesFlushed.numTerms + " buffered deleted terms and " + deletesFlushed.docIDs.size() + " deleted docIDs and " + deletesFlushed.queries.size() + " deleted queries on " + + infos.size() + " segments."); final int infosEnd = infos.size(); int docStart = 0; boolean any = false; for (int i = 0; i < infosEnd; i++) { // Make sure we never attempt to apply deletes to // segment in external dir assert infos.info(i).dir == directory; SegmentReader reader = writer.readerPool.get(infos.info(i), false); try { any |= applyDeletes(reader, docStart); docStart += reader.maxDoc(); } finally { writer.readerPool.release(reader); } } deletesFlushed.clear(); return any; } // used only by assert private Term lastDeleteTerm; // used only by assert private boolean checkDeleteTerm(Term term) { if (term != null) { assert lastDeleteTerm == null || term.compareTo(lastDeleteTerm) > 0: "lastTerm=" + lastDeleteTerm + " vs term=" + term; } lastDeleteTerm = term; return true; } // Apply buffered delete terms, queries and docIDs to the // provided reader private final synchronized boolean applyDeletes(IndexReader reader, int docIDStart) throws CorruptIndexException, IOException { final int docEnd = docIDStart + reader.maxDoc(); boolean any = false; assert checkDeleteTerm(null); // Delete by term Iterator iter = deletesFlushed.terms.entrySet().iterator(); TermDocs docs = reader.termDocs(); try { while (iter.hasNext()) { Entry entry = (Entry) iter.next(); Term term = (Term) entry.getKey(); // LUCENE-2086: we should be iterating a TreeMap, // here, so terms better be in order: assert checkDeleteTerm(term); docs.seek(term); int limit = ((BufferedDeletes.Num) entry.getValue()).getNum(); while (docs.next()) { int docID = docs.doc(); if (docIDStart+docID >= limit) break; reader.deleteDocument(docID); any = true; } } } finally { docs.close(); } // Delete by docID iter = 
deletesFlushed.docIDs.iterator(); while(iter.hasNext()) { int docID = ((Integer) iter.next()).intValue(); if (docID >= docIDStart && docID < docEnd) { reader.deleteDocument(docID-docIDStart); any = true; } } // Delete by query IndexSearcher searcher = new IndexSearcher(reader); iter = deletesFlushed.queries.entrySet().iterator(); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Query query = (Query) entry.getKey(); int limit = ((Integer) entry.getValue()).intValue(); Weight weight = query.weight(searcher); Scorer scorer = weight.scorer(reader, true, false); if (scorer != null) { while(true) { int doc = scorer.nextDoc(); if (((long) docIDStart) + doc >= limit) break; reader.deleteDocument(doc); any = true; } } } searcher.close(); return any; } // Buffer a term in bufferedDeleteTerms, which records the // current number of documents buffered in ram so that the // delete term will be applied to those documents as well // as the disk segments. synchronized private void addDeleteTerm(Term term, int docCount) { BufferedDeletes.Num num = (BufferedDeletes.Num) deletesInRAM.terms.get(term); final int docIDUpto = flushedDocCount + docCount; if (num == null) deletesInRAM.terms.put(term, new BufferedDeletes.Num(docIDUpto)); else num.setNum(docIDUpto); deletesInRAM.numTerms++; deletesInRAM.addBytesUsed(BYTES_PER_DEL_TERM + term.text.length()*CHAR_NUM_BYTE); } // Buffer a specific docID for deletion. Currently only // used when we hit a exception when adding a document synchronized private void addDeleteDocID(int docID) { deletesInRAM.docIDs.add(new Integer(flushedDocCount+docID)); deletesInRAM.addBytesUsed(BYTES_PER_DEL_DOCID); } synchronized private void addDeleteQuery(Query query, int docID) { deletesInRAM.queries.put(query, new Integer(flushedDocCount + docID)); deletesInRAM.addBytesUsed(BYTES_PER_DEL_QUERY); } synchronized boolean doBalanceRAM() { return ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH && !bufferIsFull && (numBytesUsed+deletesInRAM.bytesUsed+deletesFlushed.bytesUsed >= ramBufferSize || numBytesAlloc >= freeTrigger); } /** Does the synchronized work to finish/flush the * inverted document. */ private void finishDocument(DocumentsWriterThreadState perThread, DocWriter docWriter) throws IOException { if (doBalanceRAM()) // Must call this w/o holding synchronized(this) else // we'll hit deadlock: balanceRAM(); synchronized(this) { assert docWriter == null || docWriter.docID == perThread.docState.docID; if (aborting) { // We are currently aborting, and another thread is // waiting for me to become idle. 
We just forcefully // idle this threadState; it will be fully reset by // abort() if (docWriter != null) try { docWriter.abort(); } catch (Throwable t) { } perThread.isIdle = true; notifyAll(); return; } final boolean doPause; if (docWriter != null) doPause = waitQueue.add(docWriter); else { skipDocWriter.docID = perThread.docState.docID; doPause = waitQueue.add(skipDocWriter); } if (doPause) waitForWaitQueue(); if (bufferIsFull && !flushPending) { flushPending = true; perThread.doFlushAfter = true; } perThread.isIdle = true; notifyAll(); } } synchronized void waitForWaitQueue() { do { try { wait(); } catch (InterruptedException ie) { // In 3.0 we will change this to throw // InterruptedException instead Thread.currentThread().interrupt(); throw new RuntimeException(ie); } } while (!waitQueue.doResume()); } private static class SkipDocWriter extends DocWriter { void finish() { } void abort() { } long sizeInBytes() { return 0; } } final SkipDocWriter skipDocWriter = new SkipDocWriter(); long getRAMUsed() { return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed; } long numBytesAlloc; long numBytesUsed; NumberFormat nf = NumberFormat.getInstance(); // Coarse estimates used to measure RAM usage of buffered deletes final static int OBJECT_HEADER_BYTES = 8; final static int POINTER_NUM_BYTE = Constants.JRE_IS_64BIT ? 8 : 4; final static int INT_NUM_BYTE = 4; final static int CHAR_NUM_BYTE = 2; /* Rough logic: HashMap has an array[Entry] w/ varying load factor (say 2 * POINTER). Entry is object w/ Term key, BufferedDeletes.Num val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT). Term is object w/ String field and String text (OBJ_HEADER + 2*POINTER). We don't count Term's field since it's interned. Term's text is String (OBJ_HEADER + 4*INT + POINTER + OBJ_HEADER + string.length*CHAR). BufferedDeletes.num is OBJ_HEADER + INT. */ final static int BYTES_PER_DEL_TERM = 8*POINTER_NUM_BYTE + 5*OBJECT_HEADER_BYTES + 6*INT_NUM_BYTE; /* Rough logic: del docIDs are List. Say list allocates ~2X size (2*POINTER). Integer is OBJ_HEADER + int */ final static int BYTES_PER_DEL_DOCID = 2*POINTER_NUM_BYTE + OBJECT_HEADER_BYTES + INT_NUM_BYTE; /* Rough logic: HashMap has an array[Entry] w/ varying load factor (say 2 * POINTER). Entry is object w/ Query key, Integer val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT). Query we often undercount (say 24 bytes). Integer is OBJ_HEADER + INT. */ final static int BYTES_PER_DEL_QUERY = 5*POINTER_NUM_BYTE + 2*OBJECT_HEADER_BYTES + 2*INT_NUM_BYTE + 24; /* Initial chunks size of the shared byte[] blocks used to store postings data */ final static int BYTE_BLOCK_SHIFT = 15; final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT; final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1; final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK; private class ByteBlockAllocator extends ByteBlockPool.Allocator { final int blockSize; ArrayList freeByteBlocks = new ArrayList(); ByteBlockAllocator(int blockSize) { this.blockSize = blockSize; } /* Allocate another byte[] from the shared pool */ byte[] getByteBlock(boolean trackAllocations) { synchronized(DocumentsWriter.this) { final int size = freeByteBlocks.size(); final byte[] b; if (0 == size) { // Always record a block allocated, even if // trackAllocations is false. This is necessary // because this block will be shared between // things that don't track allocations (term // vectors) and things that do (freq/prox // postings). 
numBytesAlloc += blockSize; b = new byte[blockSize]; } else b = (byte[]) freeByteBlocks.remove(size-1); if (trackAllocations) numBytesUsed += blockSize; assert numBytesUsed <= numBytesAlloc; return b; } } /* Return byte[]'s to the pool */ void recycleByteBlocks(byte[][] blocks, int start, int end) { synchronized(DocumentsWriter.this) { for(int i=start;i freeTrigger) { if (infoStream != null) message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) + " vs trigger=" + toMB(flushTrigger) + " allocMB=" + toMB(numBytesAlloc) + " deletesMB=" + toMB(deletesRAMUsed) + " vs trigger=" + toMB(freeTrigger) + " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) + " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE) + " charBlockFree=" + toMB(freeCharBlocks.size()*CHAR_BLOCK_SIZE*CHAR_NUM_BYTE)); final long startBytesAlloc = numBytesAlloc + deletesRAMUsed; int iter = 0; // We free equally from each pool in 32 KB // chunks until we are below our threshold // (freeLevel) boolean any = true; while(numBytesAlloc+deletesRAMUsed > freeLevel) { synchronized(this) { if (0 == perDocAllocator.freeByteBlocks.size() && 0 == byteBlockAllocator.freeByteBlocks.size() && 0 == freeCharBlocks.size() && 0 == freeIntBlocks.size() && !any) { // Nothing else to free -- must flush now. bufferIsFull = numBytesUsed+deletesRAMUsed > flushTrigger; if (infoStream != null) { if (bufferIsFull) message(" nothing to free; now set bufferIsFull"); else message(" nothing to free"); } assert numBytesUsed <= numBytesAlloc; break; } if ((0 == iter % 5) && byteBlockAllocator.freeByteBlocks.size() > 0) { byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1); numBytesAlloc -= BYTE_BLOCK_SIZE; } if ((1 == iter % 5) && freeCharBlocks.size() > 0) { freeCharBlocks.remove(freeCharBlocks.size()-1); numBytesAlloc -= CHAR_BLOCK_SIZE * CHAR_NUM_BYTE; } if ((2 == iter % 5) && freeIntBlocks.size() > 0) { freeIntBlocks.remove(freeIntBlocks.size()-1); numBytesAlloc -= INT_BLOCK_SIZE * INT_NUM_BYTE; } if ((3 == iter % 5) && perDocAllocator.freeByteBlocks.size() > 0) { // Remove upwards of 32 blocks (each block is 1K) for (int i = 0; i < 32; ++i) { perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1); numBytesAlloc -= PER_DOC_BLOCK_SIZE; if (perDocAllocator.freeByteBlocks.size() == 0) { break; } } } } if ((4 == iter % 5) && any) // Ask consumer to free any recycled state any = consumer.freeRAM(); iter++; } if (infoStream != null) message(" after free: freedMB=" + nf.format((startBytesAlloc-numBytesAlloc-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.)); } else { // If we have not crossed the 100% mark, but have // crossed the 95% mark of RAM we are actually // using, go ahead and flush. This prevents // over-allocating and then freeing, with every // flush. synchronized(this) { if (numBytesUsed+deletesRAMUsed > flushTrigger) { if (infoStream != null) message(" RAM: now flush @ usedMB=" + nf.format(numBytesUsed/1024./1024.) + " allocMB=" + nf.format(numBytesAlloc/1024./1024.) + " deletesMB=" + nf.format(deletesRAMUsed/1024./1024.) 
+ " triggerMB=" + nf.format(flushTrigger/1024./1024.)); bufferIsFull = true; } } } } final WaitQueue waitQueue = new WaitQueue(); private class WaitQueue { DocWriter[] waiting; int nextWriteDocID; int nextWriteLoc; int numWaiting; long waitingBytes; public WaitQueue() { waiting = new DocWriter[10]; } synchronized void reset() { // NOTE: nextWriteLoc doesn't need to be reset assert numWaiting == 0; assert waitingBytes == 0; nextWriteDocID = 0; } synchronized boolean doResume() { return waitingBytes <= waitQueueResumeBytes; } synchronized boolean doPause() { return waitingBytes > waitQueuePauseBytes; } synchronized void abort() { int count = 0; for(int i=0;i= nextWriteDocID; if (doc.docID == nextWriteDocID) { writeDocument(doc); while(true) { doc = waiting[nextWriteLoc]; if (doc != null) { numWaiting--; waiting[nextWriteLoc] = null; waitingBytes -= doc.sizeInBytes(); writeDocument(doc); } else break; } } else { // I finished before documents that were added // before me. This can easily happen when I am a // small doc and the docs before me were large, or, // just due to luck in the thread scheduling. Just // add myself to the queue and when that large doc // finishes, it will flush me: int gap = doc.docID - nextWriteDocID; if (gap >= waiting.length) { // Grow queue DocWriter[] newArray = new DocWriter[ArrayUtil.getNextSize(gap)]; assert nextWriteLoc >= 0; System.arraycopy(waiting, nextWriteLoc, newArray, 0, waiting.length-nextWriteLoc); System.arraycopy(waiting, 0, newArray, waiting.length-nextWriteLoc, nextWriteLoc); nextWriteLoc = 0; waiting = newArray; gap = doc.docID - nextWriteDocID; } int loc = nextWriteLoc + gap; if (loc >= waiting.length) loc -= waiting.length; // We should only wrap one time assert loc < waiting.length; // Nobody should be in my spot! assert waiting[loc] == null; waiting[loc] = doc; numWaiting++; waitingBytes += doc.sizeInBytes(); } return doPause(); } } } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHashConsumerPerField.java0000644000175000017500000000300511474320230027437 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** Implement this class to plug into the TermsHash * processor, which inverts & stores Tokens into a hash * table and provides an API for writing bytes into * multiple streams for each unique Token. 
*/ import java.io.IOException; import org.apache.lucene.document.Fieldable; abstract class TermsHashConsumerPerField { abstract boolean start(Fieldable[] fields, int count) throws IOException; abstract void finish() throws IOException; abstract void skippingLongTerm() throws IOException; abstract void start(Fieldable field); abstract void newTerm(RawPostingList p) throws IOException; abstract void addTerm(RawPostingList p) throws IOException; abstract int getStreamCount(); } lucene-2.9.4/src/java/org/apache/lucene/index/IndexDeletionPolicy.java0000644000175000017500000001015411474320230026330 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.List; import java.io.IOException; /** *

 * <p>Expert: policy for deletion of stale {@link IndexCommit index commits}.</p>
 *
 * <p>Implement this interface, and pass it to one
 * of the {@link IndexWriter} or {@link IndexReader}
 * constructors, to customize when older
 * {@link IndexCommit point-in-time commits}
 * are deleted from the index directory.  The default deletion policy
 * is {@link KeepOnlyLastCommitDeletionPolicy}, which always
 * removes old commits as soon as a new commit is done (this
 * matches the behavior before 2.2).</p>
 *
 * <p>One expected use case for this (and the reason why it
 * was first created) is to work around problems with an
 * index directory accessed via filesystems like NFS because
 * NFS does not provide the "delete on last close" semantics
 * that Lucene's "point in time" search normally relies on.
 * By implementing a custom deletion policy, such as "a
 * commit is only removed once it has been stale for more
 * than X minutes", you can give your readers time to
 * refresh to the new commit before {@link IndexWriter}
 * removes the old commits.  Note that doing so will
 * increase the storage requirements of the index.  See
 * LUCENE-710 for details.</p>
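 *
 * <p>A minimal illustrative sketch (an assumption, not part of the original
 * source) of a custom policy; this one keeps only the newest commit, which
 * is what the default policy does:</p>
 *
 * <pre>
 *   public class KeepNewestOnlyPolicy implements IndexDeletionPolicy {
 *     public void onInit(List commits) { onCommit(commits); }
 *     public void onCommit(List commits) {
 *       // commits are sorted by age; the last entry is the newest one
 *       for (int i = 0; i < commits.size() - 1; i++) {
 *         ((IndexCommit) commits.get(i)).delete();
 *       }
 *     }
 *   }
 * </pre>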
    */ public interface IndexDeletionPolicy { /** *

 * <p>This is called once when a writer is first
 * instantiated to give the policy a chance to remove old
 * commit points.</p>
 *
 * <p>The writer locates all index commits present in the
 * index directory and calls this method.  The policy may
 * choose to delete some of the commit points, doing so by
 * calling method {@link IndexCommit#delete delete()}
 * of {@link IndexCommit}.</p>
 *
    Note: the last CommitPoint is the most recent one, * i.e. the "front index state". Be careful not to delete it, * unless you know for sure what you are doing, and unless * you can afford to lose the index content while doing that. * * @param commits List of current * {@link IndexCommit point-in-time commits}, * sorted by age (the 0th one is the oldest commit). */ public void onInit(List commits) throws IOException; /** *

 * <p>This is called each time the writer completed a commit.
 * This gives the policy a chance to remove old commit points
 * with each commit.</p>
 *
 * <p>The policy may now choose to delete old commit points
 * by calling method {@link IndexCommit#delete delete()}
 * of {@link IndexCommit}.</p>
 *
 * <p>If writer has autoCommit = true then
 * this method will in general be called many times during
 * one instance of {@link IndexWriter}.  If
 * autoCommit = false then this method is
 * only called once when {@link IndexWriter#close} is
 * called, or not at all if the {@link IndexWriter#abort}
 * is called.</p>
 *
    Note: the last CommitPoint is the most recent one, * i.e. the "front index state". Be careful not to delete it, * unless you know for sure what you are doing, and unless * you can afford to lose the index content while doing that. * * @param commits List of {@link IndexCommit}, * sorted by age (the 0th one is the oldest commit). */ public void onCommit(List commits) throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentMergeQueue.java0000644000175000017500000000265611474320230026014 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.util.PriorityQueue; final class SegmentMergeQueue extends PriorityQueue { SegmentMergeQueue(int size) { initialize(size); } protected final boolean lessThan(Object a, Object b) { SegmentMergeInfo stiA = (SegmentMergeInfo)a; SegmentMergeInfo stiB = (SegmentMergeInfo)b; int comparison = stiA.term.compareTo(stiB.term); if (comparison == 0) return stiA.base < stiB.base; else return comparison < 0; } final void close() throws IOException { while (top() != null) ((SegmentMergeInfo)pop()).close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorMapper.java0000644000175000017500000000735411474320230025664 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * The TermVectorMapper can be used to map Term Vectors into your own * structure instead of the parallel array structure used by * {@link org.apache.lucene.index.IndexReader#getTermFreqVector(int,String)}. *

    * It is up to the implementation to make sure it is thread-safe. * * **/ public abstract class TermVectorMapper { private boolean ignoringPositions; private boolean ignoringOffsets; protected TermVectorMapper() { } /** * * @param ignoringPositions true if this mapper should tell Lucene to ignore positions even if they are stored * @param ignoringOffsets similar to ignoringPositions */ protected TermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets) { this.ignoringPositions = ignoringPositions; this.ignoringOffsets = ignoringOffsets; } /** * Tell the mapper what to expect in regards to field, number of terms, offset and position storage. * This method will be called once before retrieving the vector for a field. * * This method will be called before {@link #map(String,int,TermVectorOffsetInfo[],int[])}. * @param field The field the vector is for * @param numTerms The number of terms that need to be mapped * @param storeOffsets true if the mapper should expect offset information * @param storePositions true if the mapper should expect positions info */ public abstract void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions); /** * Map the Term Vector information into your own structure * @param term The term to add to the vector * @param frequency The frequency of the term in the document * @param offsets null if the offset is not specified, otherwise the offset into the field of the term * @param positions null if the position is not specified, otherwise the position in the field of the term */ public abstract void map(String term, int frequency, TermVectorOffsetInfo [] offsets, int [] positions); /** * Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they * can be skipped over. Derived classes should set this to true if they want to ignore positions. The default * is false, meaning positions will be loaded if they are stored. * @return false */ public boolean isIgnoringPositions() { return ignoringPositions; } /** * * @see #isIgnoringPositions() Same principal as {@link #isIgnoringPositions()}, but applied to offsets. false by default. * @return false */ public boolean isIgnoringOffsets() { return ignoringOffsets; } /** * Passes down the index of the document whose term vector is currently being mapped, * once for each top level call to a term vector reader. *

    * Default implementation IGNORES the document number. Override if your implementation needs the document number. *

    * NOTE: Document numbers are internal to Lucene and subject to change depending on indexing operations. * * @param documentNumber index of document currently being mapped */ public void setDocumentNumber(int documentNumber) { } } lucene-2.9.4/src/java/org/apache/lucene/index/IndexFileDeleter.java0000644000175000017500000005401311474320230025573 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import java.io.IOException; import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.Map; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.util.List; import java.util.ArrayList; import java.util.Collections; import java.util.Collection; /* * This class keeps track of each SegmentInfos instance that * is still "live", either because it corresponds to a * segments_N file in the Directory (a "commit", i.e. a * committed SegmentInfos) or because it's an in-memory * SegmentInfos that a writer is actively updating but has * not yet committed. This class uses simple reference * counting to map the live SegmentInfos instances to * individual files in the Directory. * * When autoCommit=true, IndexWriter currently commits only * on completion of a merge (though this may change with * time: it is not a guarantee). When autoCommit=false, * IndexWriter only commits when it is closed. Regardless * of autoCommit, the user may call IndexWriter.commit() to * force a blocking commit. * * The same directory file may be referenced by more than * one IndexCommit, i.e. more than one SegmentInfos. * Therefore we count how many commits reference each file. * When all the commits referencing a certain file have been * deleted, the refcount for that file becomes zero, and the * file is deleted. * * A separate deletion policy interface * (IndexDeletionPolicy) is consulted on creation (onInit) * and once per commit (onCommit), to decide when a commit * should be removed. * * It is the business of the IndexDeletionPolicy to choose * when to delete commit points. The actual mechanics of * file deletion, retrying, etc, derived from the deletion * of commit points is the business of the IndexFileDeleter. * * The current default deletion policy is {@link * KeepOnlyLastCommitDeletionPolicy}, which removes all * prior commits when a new commit has completed. This * matches the behavior before 2.2. * * Note that you must hold the write.lock before * instantiating this class. It opens segments_N file(s) * directly with no retry logic. 
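 * For reference, the policy consulted here is whatever the application passed to
 * IndexWriter; a sketch (assuming the Lucene 2.9 IndexWriter constructor that
 * accepts an IndexDeletionPolicy, with dir and analyzer defined elsewhere):
 *
 *   IndexWriter writer = new IndexWriter(dir, analyzer,
 *       new KeepOnlyLastCommitDeletionPolicy(),
 *       IndexWriter.MaxFieldLength.LIMITED);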
*/ final class IndexFileDeleter { /* Files that we tried to delete but failed (likely * because they are open and we are running on Windows), * so we will retry them again later: */ private List deletable; /* Reference count for all files in the index. * Counts how many existing commits reference a file. * Maps String to RefCount (class below) instances: */ private Map refCounts = new HashMap(); /* Holds all commits (segments_N) currently in the index. * This will have just 1 commit if you are using the * default delete policy (KeepOnlyLastCommitDeletionPolicy). * Other policies may leave commit points live for longer * in which case this list would be longer than 1: */ private List commits = new ArrayList(); /* Holds files we had incref'd from the previous * non-commit checkpoint: */ private List lastFiles = new ArrayList(); /* Commits that the IndexDeletionPolicy have decided to delete: */ private List commitsToDelete = new ArrayList(); private PrintStream infoStream; private Directory directory; private IndexDeletionPolicy policy; private DocumentsWriter docWriter; final boolean startingCommitDeleted; private SegmentInfos lastSegmentInfos; private final Set synced; /** Change to true to see details of reference counts when * infoStream != null */ public static boolean VERBOSE_REF_COUNTS = false; void setInfoStream(PrintStream infoStream) { this.infoStream = infoStream; if (infoStream != null) message("setInfoStream deletionPolicy=" + policy); } private void message(String message) { infoStream.println("IFD [" + new Date() + "; " + Thread.currentThread().getName() + "]: " + message); } /** * Initialize the deleter: find all previous commits in * the Directory, incref the files they reference, call * the policy to let it delete commits. This will remove * any files not referenced by any of the commits. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public IndexFileDeleter(Directory directory, IndexDeletionPolicy policy, SegmentInfos segmentInfos, PrintStream infoStream, DocumentsWriter docWriter, Set synced) throws CorruptIndexException, IOException { this.docWriter = docWriter; this.infoStream = infoStream; this.synced = synced; if (infoStream != null) message("init: current segments file is \"" + segmentInfos.getCurrentSegmentFileName() + "\"; deletionPolicy=" + policy); this.policy = policy; this.directory = directory; // First pass: walk the files and initialize our ref // counts: long currentGen = segmentInfos.getGeneration(); IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); String[] files = directory.listAll(); CommitPoint currentCommitPoint = null; for(int i=0;i lastSegmentInfos.getGeneration()) { lastSegmentInfos = sis; } } } } } if (currentCommitPoint == null) { // We did not in fact see the segments_N file // corresponding to the segmentInfos that was passed // in. Yet, it must exist, because our caller holds // the write lock. This can happen when the directory // listing was stale (eg when index accessed via NFS // client with stale directory listing cache). 
So we // try now to explicitly open this commit point: SegmentInfos sis = new SegmentInfos(); try { sis.read(directory, segmentInfos.getCurrentSegmentFileName()); } catch (IOException e) { throw new CorruptIndexException("failed to locate current segments_N file"); } if (infoStream != null) message("forced open of current segments file " + segmentInfos.getCurrentSegmentFileName()); currentCommitPoint = new CommitPoint(commitsToDelete, directory, sis); commits.add(currentCommitPoint); incRef(sis, true); } // We keep commits list in sorted order (oldest to newest): Collections.sort(commits); // Now delete anything with ref count at 0. These are // presumably abandoned files eg due to crash of // IndexWriter. Iterator it = refCounts.keySet().iterator(); while(it.hasNext()) { String fileName = (String) it.next(); RefCount rc = (RefCount) refCounts.get(fileName); if (0 == rc.count) { if (infoStream != null) { message("init: removing unreferenced file \"" + fileName + "\""); } deleteFile(fileName); } } // Finally, give policy a chance to remove things on // startup: policy.onInit(commits); // Always protect the incoming segmentInfos since // sometime it may not be the most recent commit checkpoint(segmentInfos, false); startingCommitDeleted = currentCommitPoint.isDeleted(); deleteCommits(); } public SegmentInfos getLastSegmentInfos() { return lastSegmentInfos; } /** * Remove the CommitPoints in the commitsToDelete List by * DecRef'ing all files from each SegmentInfos. */ private void deleteCommits() throws IOException { int size = commitsToDelete.size(); if (size > 0) { // First decref all files that had been referred to by // the now-deleted commits: for(int i=0;i writeTo) { commits.remove(size-1); size--; } } } /** * Writer calls this when it has hit an error and had to * roll back, to tell us that there may now be * unreferenced files in the filesystem. So we re-list * the filesystem and delete such files. If segmentName * is non-null, we will only delete files corresponding to * that segment. */ public void refresh(String segmentName) throws IOException { String[] files = directory.listAll(); IndexFileNameFilter filter = IndexFileNameFilter.getFilter(); String segmentPrefix1; String segmentPrefix2; if (segmentName != null) { segmentPrefix1 = segmentName + "."; segmentPrefix2 = segmentName + "_"; } else { segmentPrefix1 = null; segmentPrefix2 = null; } for(int i=0;i 0) { for(int i=0;i 0) { for(int i=0;i 0; } } private RefCount getRefCount(String fileName) { RefCount rc; if (!refCounts.containsKey(fileName)) { rc = new RefCount(fileName); refCounts.put(fileName, rc); } else { rc = (RefCount) refCounts.get(fileName); } return rc; } void deleteFiles(List files) throws IOException { final int size = files.size(); for(int i=0;i 0: "RefCount is 0 pre-increment for file \"" + fileName + "\""; } return ++count; } public int DecRef() { assert count > 0: "RefCount is 0 pre-decrement for file \"" + fileName + "\""; return --count; } } /** * Holds details for each commit point. This class is * also passed to the deletion policy. Note: this class * has a natural ordering that is inconsistent with * equals. 
*/ final private static class CommitPoint extends IndexCommit implements Comparable { long gen; Collection files; String segmentsFileName; boolean deleted; Directory directory; Collection commitsToDelete; long version; long generation; final boolean isOptimized; final Map userData; public CommitPoint(Collection commitsToDelete, Directory directory, SegmentInfos segmentInfos) throws IOException { this.directory = directory; this.commitsToDelete = commitsToDelete; userData = segmentInfos.getUserData(); segmentsFileName = segmentInfos.getCurrentSegmentFileName(); version = segmentInfos.getVersion(); generation = segmentInfos.getGeneration(); files = Collections.unmodifiableCollection(segmentInfos.files(directory, true)); gen = segmentInfos.getGeneration(); isOptimized = segmentInfos.size() == 1 && !segmentInfos.info(0).hasDeletions(); assert !segmentInfos.hasExternalSegments(directory); } public String toString() { return "IndexFileDeleter.CommitPoint(" + segmentsFileName + ")"; } public boolean isOptimized() { return isOptimized; } public String getSegmentsFileName() { return segmentsFileName; } public Collection getFileNames() throws IOException { return files; } public Directory getDirectory() { return directory; } public long getVersion() { return version; } public long getGeneration() { return generation; } public Map getUserData() { return userData; } /** * Called only be the deletion policy, to remove this * commit point from the index. */ public void delete() { if (!deleted) { deleted = true; commitsToDelete.add(this); } } public boolean isDeleted() { return deleted; } public int compareTo(Object obj) { CommitPoint commit = (CommitPoint) obj; if (gen < commit.gen) { return -1; } else if (gen > commit.gen) { return 1; } else { return 0; } } } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentTermVector.java0000644000175000017500000000503711474320230026036 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.*; class SegmentTermVector implements TermFreqVector { private String field; private String terms[]; private int termFreqs[]; SegmentTermVector(String field, String terms[], int termFreqs[]) { this.field = field; this.terms = terms; this.termFreqs = termFreqs; } /** * * @return The number of the field this vector is associated with */ public String getField() { return field; } public String toString() { StringBuffer sb = new StringBuffer(); sb.append('{'); sb.append(field).append(": "); if(terms != null){ for (int i=0; i0) sb.append(", "); sb.append(terms[i]).append('/').append(termFreqs[i]); } } sb.append('}'); return sb.toString(); } public int size() { return terms == null ? 
0 : terms.length; } public String [] getTerms() { return terms; } public int[] getTermFrequencies() { return termFreqs; } public int indexOf(String termText) { if(terms == null) return -1; int res = Arrays.binarySearch(terms, termText); return res >= 0 ? res : -1; } public int[] indexesOf(String [] termNumbers, int start, int len) { // TODO: there must be a more efficient way of doing this. // At least, we could advance the lower bound of the terms array // as we find valid indexes. Also, it might be possible to leverage // this even more by starting in the middle of the termNumbers array // and thus dividing the terms array maybe in half with each found index. int res[] = new int[len]; for (int i=0; i < len; i++) { res[i] = indexOf(termNumbers[start+ i]); } return res; } } lucene-2.9.4/src/java/org/apache/lucene/index/NormsWriter.java0000644000175000017500000001320011474320230024703 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.HashMap; import java.util.Map; import java.util.List; import java.util.ArrayList; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.search.Similarity; // TODO FI: norms could actually be stored as doc store /** Writes norms. Each thread X field accumulates the norms * for the doc/fields it saw, then the flush method below * merges all of these together into a single _X.nrm file. */ final class NormsWriter extends InvertedDocEndConsumer { private static final byte defaultNorm = Similarity.encodeNorm(1.0f); private FieldInfos fieldInfos; public InvertedDocEndConsumerPerThread addThread(DocInverterPerThread docInverterPerThread) { return new NormsWriterPerThread(docInverterPerThread, this); } public void abort() {} // We only write the _X.nrm file at flush void files(Collection files) {} void setFieldInfos(FieldInfos fieldInfos) { this.fieldInfos = fieldInfos; } /** Produce _X.nrm if any document had a field with norms * not disabled */ public void flush(Map threadsAndFields, SegmentWriteState state) throws IOException { final Map byField = new HashMap(); // Typically, each thread will have encountered the same // field. 
So first we collate by field, ie, all // per-thread field instances that correspond to the // same FieldInfo final Iterator it = threadsAndFields.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); Collection fields = (Collection) entry.getValue(); Iterator fieldsIt = fields.iterator(); while(fieldsIt.hasNext()) { NormsWriterPerField perField = (NormsWriterPerField) fieldsIt.next(); if (perField.upto > 0) { // It has some norms List l = (List) byField.get(perField.fieldInfo); if (l == null) { l = new ArrayList(); byField.put(perField.fieldInfo, l); } l.add(perField); } else // Remove this field since we haven't seen it // since the previous flush fieldsIt.remove(); } } final String normsFileName = state.segmentName + "." + IndexFileNames.NORMS_EXTENSION; state.flushedFiles.add(normsFileName); IndexOutput normsOut = state.directory.createOutput(normsFileName); try { normsOut.writeBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.length); final int numField = fieldInfos.size(); int normCount = 0; for(int fieldNumber=0;fieldNumber 0) { assert uptos[0] < fields[0].docIDs.length : " uptos[0]=" + uptos[0] + " len=" + (fields[0].docIDs.length); int minLoc = 0; int minDocID = fields[0].docIDs[uptos[0]]; for(int j=1;j[Note that as of 2.1, all but one of the * methods in this class are available via {@link * IndexWriter}. The one method that is not available is * {@link #deleteDocument(int)}.]

    * * A class to modify an index, i.e. to delete and add documents. This * class hides {@link IndexReader} and {@link IndexWriter} so that you * do not need to care about implementation details such as that adding * documents is done via IndexWriter and deletion is done via IndexReader. * *

    Note that you cannot create more than one IndexModifier object * on the same directory at the same time. * *

    Example usage: *

        Analyzer analyzer = new StandardAnalyzer();
        // create an index in /tmp/index, overwriting an existing one:
        IndexModifier indexModifier = new IndexModifier("/tmp/index", analyzer, true);
        Document doc = new Document();
        doc.add(new Field("id""1", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("body""a simple test", Field.Store.YES, Field.Index.ANALYZED));
        indexModifier.addDocument(doc);
        int deleted = indexModifier.deleteDocuments(new Term("id", "1"));
        System.out.println("Deleted " + deleted + " document");
        indexModifier.flush();
        System.out.println(indexModifier.docCount() + " docs in index");
        indexModifier.close();
    * *

    Not all methods of IndexReader and IndexWriter are offered by this * class. If you need access to additional methods, either use those classes * directly or implement your own class that extends IndexModifier. * *
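    For instance, the same delete-then-add cycle can be performed with IndexWriter directly (a sketch; analyzer and doc are the same illustrative objects as in the example above, and dir is assumed to be a Directory opened on the same index):

        IndexWriter writer = new IndexWriter(dir, analyzer, false, IndexWriter.MaxFieldLength.LIMITED);
        writer.deleteDocuments(new Term("id", "1"));
        writer.addDocument(doc);
        writer.close();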

    Although an instance of this class can be used from more than one * thread, you will not get the best performance. You might want to use * IndexReader and IndexWriter directly for that (but you will need to * care about synchronization yourself then). * *

    While you can freely mix calls to add() and delete() using this class, * you should batch you calls for best performance. For example, if you * want to update 20 documents, you should first delete all those documents, * then add all the new documents. * * @deprecated Please use {@link IndexWriter} instead. */ public class IndexModifier { protected IndexWriter indexWriter = null; protected IndexReader indexReader = null; protected Directory directory = null; protected Analyzer analyzer = null; protected boolean open = false, closeDir = false; // Lucene defaults: protected PrintStream infoStream = null; protected boolean useCompoundFile = true; protected int maxBufferedDocs = IndexWriter.DEFAULT_MAX_BUFFERED_DOCS; protected int maxFieldLength = IndexWriter.DEFAULT_MAX_FIELD_LENGTH; protected int mergeFactor = IndexWriter.DEFAULT_MERGE_FACTOR; /** * Open an index with write access. * * @param directory the index directory * @param analyzer the analyzer to use for adding new documents * @param create true to create the index or overwrite the existing one; * false to append to the existing index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public IndexModifier(Directory directory, Analyzer analyzer, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { init(directory, analyzer, create); } /** * Open an index with write access. * * @param dirName the index directory * @param analyzer the analyzer to use for adding new documents * @param create true to create the index or overwrite the existing one; * false to append to the existing index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public IndexModifier(String dirName, Analyzer analyzer, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { Directory dir = FSDirectory.getDirectory(dirName); this.closeDir = true; init(dir, analyzer, create); } /** * Open an index with write access. * * @param file the index directory * @param analyzer the analyzer to use for adding new documents * @param create true to create the index or overwrite the existing one; * false to append to the existing index * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public IndexModifier(File file, Analyzer analyzer, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { Directory dir = FSDirectory.getDirectory(file); this.closeDir = true; init(dir, analyzer, create); } /** * Initialize an IndexWriter. 
* @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ protected void init(Directory directory, Analyzer analyzer, boolean create) throws CorruptIndexException, LockObtainFailedException, IOException { this.directory = directory; synchronized(this.directory) { this.analyzer = analyzer; indexWriter = new IndexWriter(directory, analyzer, create, IndexWriter.MaxFieldLength.LIMITED); open = true; } } /** * Throw an IllegalStateException if the index is closed. * @throws IllegalStateException */ protected void assureOpen() { if (!open) { throw new IllegalStateException("Index is closed"); } } /** * Close the IndexReader and open an IndexWriter. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ protected void createIndexWriter() throws CorruptIndexException, LockObtainFailedException, IOException { if (indexWriter == null) { if (indexReader != null) { indexReader.close(); indexReader = null; } indexWriter = new IndexWriter(directory, analyzer, false, new IndexWriter.MaxFieldLength(maxFieldLength)); // IndexModifier cannot use ConcurrentMergeScheduler // because it synchronizes on the directory which can // cause deadlock indexWriter.setMergeScheduler(new SerialMergeScheduler()); indexWriter.setInfoStream(infoStream); indexWriter.setUseCompoundFile(useCompoundFile); if (maxBufferedDocs != IndexWriter.DISABLE_AUTO_FLUSH) indexWriter.setMaxBufferedDocs(maxBufferedDocs); indexWriter.setMergeFactor(mergeFactor); } } /** * Close the IndexWriter and open an IndexReader. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ protected void createIndexReader() throws CorruptIndexException, IOException { if (indexReader == null) { if (indexWriter != null) { indexWriter.close(); indexWriter = null; } indexReader = IndexReader.open(directory); } } /** * Make sure all changes are written to disk. * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public void flush() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.close(); indexWriter = null; createIndexWriter(); } else { indexReader.close(); indexReader = null; createIndexReader(); } } } /** * Adds a document to this index, using the provided analyzer instead of the * one specific in the constructor. If the document contains more than * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are * discarded. 
* @see IndexWriter#addDocument(Document, Analyzer) * @throws IllegalStateException if the index is closed * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public void addDocument(Document doc, Analyzer docAnalyzer) throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); if (docAnalyzer != null) indexWriter.addDocument(doc, docAnalyzer); else indexWriter.addDocument(doc); } } /** * Adds a document to this index. If the document contains more than * {@link #setMaxFieldLength(int)} terms for a given field, the remainder are * discarded. * @see IndexWriter#addDocument(Document) * @throws IllegalStateException if the index is closed * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public void addDocument(Document doc) throws CorruptIndexException, LockObtainFailedException, IOException { addDocument(doc, null); } /** * Deletes all documents containing term. * This is useful if one uses a document field to hold a unique ID string for * the document. Then to delete such a document, one merely constructs a * term with the appropriate field and the unique ID string as its text and * passes it to this method. Returns the number of documents deleted. * @return the number of documents deleted * @see IndexReader#deleteDocuments(Term) * @throws IllegalStateException if the index is closed * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexReader(); return indexReader.deleteDocuments(term); } } /** * Deletes the document numbered docNum. * @see IndexReader#deleteDocument(int) * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IllegalStateException if the index is closed */ public void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexReader(); indexReader.deleteDocument(docNum); } } /** * Returns the number of documents currently in this * index. If the writer is currently open, this returns * {@link IndexWriter#docCount()}, else {@link * IndexReader#numDocs()}. But, note that {@link * IndexWriter#docCount()} does not take deletions into * account, unlike {@link IndexReader#numDocs}. * @throws IllegalStateException if the index is closed */ public int docCount() { synchronized(directory) { assureOpen(); if (indexWriter != null) { return indexWriter.docCount(); } else { return indexReader.numDocs(); } } } /** * Merges all segments together into a single segment, optimizing an index * for search. 
* @see IndexWriter#optimize() * @throws IllegalStateException if the index is closed * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public void optimize() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); indexWriter.optimize(); } } /** * If non-null, information about merges and a message when * {@link #getMaxFieldLength()} is reached will be printed to this. *

    Example: index.setInfoStream(System.err); * @see IndexWriter#setInfoStream(PrintStream) * @throws IllegalStateException if the index is closed */ public void setInfoStream(PrintStream infoStream) { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.setInfoStream(infoStream); } this.infoStream = infoStream; } } /** * @see IndexModifier#setInfoStream(PrintStream) * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public PrintStream getInfoStream() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); return indexWriter.getInfoStream(); } } /** * Setting to turn on usage of a compound file. When on, multiple files * for each segment are merged into a single file once the segment creation * is finished. This is done regardless of what directory is in use. * @see IndexWriter#setUseCompoundFile(boolean) * @throws IllegalStateException if the index is closed */ public void setUseCompoundFile(boolean useCompoundFile) { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.setUseCompoundFile(useCompoundFile); } this.useCompoundFile = useCompoundFile; } } /** * @see IndexModifier#setUseCompoundFile(boolean) * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public boolean getUseCompoundFile() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); return indexWriter.getUseCompoundFile(); } } /** * The maximum number of terms that will be indexed for a single field in a * document. This limits the amount of memory required for indexing, so that * collections with very large files will not crash the indexing process by * running out of memory.

    * Note that this effectively truncates large documents, excluding from the * index terms that occur further in the document. If you know your source * documents are large, be sure to set this value high enough to accommodate * the expected size. If you set it to Integer.MAX_VALUE, then the only limit * is your memory, but you should anticipate an OutOfMemoryError.
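     * For example, to index arbitrarily large documents (memory permitting) one might
     * call, reusing the indexModifier instance from the example above:
     *
     *     indexModifier.setMaxFieldLength(Integer.MAX_VALUE);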

    * By default, no more than 10,000 terms will be indexed for a field. * @see IndexWriter#setMaxFieldLength(int) * @throws IllegalStateException if the index is closed */ public void setMaxFieldLength(int maxFieldLength) { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.setMaxFieldLength(maxFieldLength); } this.maxFieldLength = maxFieldLength; } } /** * @see IndexModifier#setMaxFieldLength(int) * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public int getMaxFieldLength() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); return indexWriter.getMaxFieldLength(); } } /** * Determines the minimal number of documents required before the buffered * in-memory documents are merging and a new Segment is created. * Since Documents are merged in a {@link org.apache.lucene.store.RAMDirectory}, * large value gives faster indexing. At the same time, mergeFactor limits * the number of files open in a FSDirectory. * *

    The default value is 10. * * @see IndexWriter#setMaxBufferedDocs(int) * @throws IllegalStateException if the index is closed * @throws IllegalArgumentException if maxBufferedDocs is smaller than 2 */ public void setMaxBufferedDocs(int maxBufferedDocs) { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.setMaxBufferedDocs(maxBufferedDocs); } this.maxBufferedDocs = maxBufferedDocs; } } /** * @see IndexModifier#setMaxBufferedDocs(int) * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public int getMaxBufferedDocs() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); return indexWriter.getMaxBufferedDocs(); } } /** * Determines how often segment indices are merged by addDocument(). With * smaller values, less RAM is used while indexing, and searches on * unoptimized indices are faster, but indexing speed is slower. With larger * values, more RAM is used during indexing, and while searches on unoptimized * indices are slower, indexing is faster. Thus larger values (> 10) are best * for batch index creation, and smaller values (< 10) for indices that are * interactively maintained. *
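    For example, for a one-off batch build one might raise it, again reusing the indexModifier instance from the example above:

        indexModifier.setMergeFactor(50);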

    This must never be less than 2. The default value is 10. * * @see IndexWriter#setMergeFactor(int) * @throws IllegalStateException if the index is closed */ public void setMergeFactor(int mergeFactor) { synchronized(directory) { assureOpen(); if (indexWriter != null) { indexWriter.setMergeFactor(mergeFactor); } this.mergeFactor = mergeFactor; } } /** * @see IndexModifier#setMergeFactor(int) * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public int getMergeFactor() throws CorruptIndexException, LockObtainFailedException, IOException { synchronized(directory) { assureOpen(); createIndexWriter(); return indexWriter.getMergeFactor(); } } /** * Close this index, writing all pending changes to disk. * * @throws IllegalStateException if the index has been closed before already * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public void close() throws CorruptIndexException, IOException { synchronized(directory) { if (!open) throw new IllegalStateException("Index is closed already"); if (indexWriter != null) { indexWriter.close(); indexWriter = null; } else if (indexReader != null) { indexReader.close(); indexReader = null; } open = false; if (closeDir) { directory.close(); } closeDir = false; } } public String toString() { return "Index@" + directory; } /* // used as an example in the javadoc: public static void main(String[] args) throws IOException { Analyzer analyzer = new StandardAnalyzer(); // create an index in /tmp/index, overwriting an existing one: IndexModifier indexModifier = new IndexModifier("/tmp/index", analyzer, true); Document doc = new Document(); doc.add(new Fieldable("id", "1", Fieldable.Store.YES, Fieldable.Index.NOT_ANALYZED)); doc.add(new Fieldable("body", "a simple test", Fieldable.Store.YES, Fieldable.Index.ANALYZED)); indexModifier.addDocument(doc); int deleted = indexModifier.delete(new Term("id", "1")); System.out.println("Deleted " + deleted + " document"); indexModifier.flush(); System.out.println(indexModifier.docCount() + " docs in index"); indexModifier.close(); }*/ } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHashPerField.java0000644000175000017500000004114311474320230025730 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.Fieldable; import org.apache.lucene.util.UnicodeUtil; final class TermsHashPerField extends InvertedDocConsumerPerField { final TermsHashConsumerPerField consumer; final TermsHashPerField nextPerField; final TermsHashPerThread perThread; final DocumentsWriter.DocState docState; final FieldInvertState fieldState; TermAttribute termAtt; // Copied from our perThread final CharBlockPool charPool; final IntBlockPool intPool; final ByteBlockPool bytePool; final int streamCount; final int numPostingInt; final FieldInfo fieldInfo; boolean postingsCompacted; int numPostings; private int postingsHashSize = 4; private int postingsHashHalfSize = postingsHashSize/2; private int postingsHashMask = postingsHashSize-1; private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize]; private RawPostingList p; public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) { this.perThread = perThread; intPool = perThread.intPool; charPool = perThread.charPool; bytePool = perThread.bytePool; docState = perThread.docState; fieldState = docInverterPerField.fieldState; this.consumer = perThread.consumer.addField(this, fieldInfo); streamCount = consumer.getStreamCount(); numPostingInt = 2*streamCount; this.fieldInfo = fieldInfo; if (nextPerThread != null) nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo); else nextPerField = null; } void shrinkHash(int targetSize) { assert postingsCompacted || numPostings == 0; final int newSize = 4; if (newSize != postingsHash.length) { postingsHash = new RawPostingList[newSize]; postingsHashSize = newSize; postingsHashHalfSize = newSize/2; postingsHashMask = newSize-1; } Arrays.fill(postingsHash, null); } public void reset() { if (!postingsCompacted) compactPostings(); assert numPostings <= postingsHash.length; if (numPostings > 0) { perThread.termsHash.recyclePostings(postingsHash, numPostings); Arrays.fill(postingsHash, 0, numPostings, null); numPostings = 0; } postingsCompacted = false; if (nextPerField != null) nextPerField.reset(); } synchronized public void abort() { reset(); if (nextPerField != null) nextPerField.abort(); } public void initReader(ByteSliceReader reader, RawPostingList p, int stream) { assert stream < streamCount; final int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT]; final int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK; reader.init(bytePool, p.byteStart+stream*ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto+stream]); } private synchronized void compactPostings() { int upto = 0; for(int i=0;i= hi) return; else if (hi == 1+lo) { if (comparePostings(postings[lo], postings[hi]) > 0) { final RawPostingList tmp = postings[lo]; postings[lo] = postings[hi]; postings[hi] = tmp; } return; } int mid = (lo + hi) >>> 1; if (comparePostings(postings[lo], postings[mid]) > 0) { RawPostingList tmp = postings[lo]; postings[lo] = postings[mid]; postings[mid] = tmp; } if (comparePostings(postings[mid], postings[hi]) > 0) { RawPostingList tmp = postings[mid]; postings[mid] = postings[hi]; postings[hi] = tmp; if (comparePostings(postings[lo], postings[mid]) > 0) { RawPostingList tmp2 = postings[lo]; postings[lo] = postings[mid]; postings[mid] = tmp2; } } int left = lo + 1; int right = hi - 1; if (left >= right) return; RawPostingList 
partition = postings[mid]; for (; ;) { while (comparePostings(postings[right], partition) > 0) --right; while (left < right && comparePostings(postings[left], partition) <= 0) ++left; if (left < right) { RawPostingList tmp = postings[left]; postings[left] = postings[right]; postings[right] = tmp; --right; } else { break; } } quickSort(postings, lo, left); quickSort(postings, left + 1, hi); } /** Compares term text for two Posting instance and * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */ int comparePostings(RawPostingList p1, RawPostingList p2) { if (p1 == p2) return 0; final char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK; final char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK; assert text1 != text2 || pos1 != pos2; while(true) { final char c1 = text1[pos1++]; final char c2 = text2[pos2++]; if (c1 != c2) { if (0xffff == c2) return 1; else if (0xffff == c1) return -1; else return c1-c2; } else // This method should never compare equal postings // unless p1==p2 assert c1 != 0xffff; } } /** Test whether the text for current RawPostingList p equals * current tokenText. */ private boolean postingEquals(final char[] tokenText, final int tokenTextLen) { final char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT]; assert text != null; int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK; int tokenPos = 0; for(;tokenPos>8)+code)|1; do { code += inc; hashPos = code & postingsHashMask; p = postingsHash[hashPos]; } while (p != null && p.textStart != textStart); } if (p == null) { // First time we are seeing this token since we last // flushed the hash. // Refill? if (0 == perThread.freePostingsCount) perThread.morePostings(); // Pull next free RawPostingList from free list p = perThread.freePostings[--perThread.freePostingsCount]; assert p != null; p.textStart = textStart; assert postingsHash[hashPos] == null; postingsHash[hashPos] = p; numPostings++; if (numPostings == postingsHashHalfSize) rehashPostings(2*postingsHashSize); // Init stream slices if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) intPool.nextBuffer(); if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) bytePool.nextBuffer(); intUptos = intPool.buffer; intUptoStart = intPool.intUpto; intPool.intUpto += streamCount; p.intStart = intUptoStart + intPool.intOffset; for(int i=0;i> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; consumer.addTerm(p); } } // Primary entry point (for first TermsHash) void add() throws IOException { assert !postingsCompacted; // We are first in the chain so we must "intern" the // term text into textStart address // Get the text of this term. final char[] tokenText = termAtt.termBuffer();; final int tokenTextLen = termAtt.termLength(); // Compute hashcode & replace any invalid UTF16 sequences int downto = tokenTextLen; int code = 0; while (downto > 0) { char ch = tokenText[--downto]; if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END) { if (0 == downto) { // Unpaired ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; } else { final char ch2 = tokenText[downto-1]; if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END) { // OK: high followed by low. This is a valid // surrogate pair. 
code = ((code*31) + ch)*31+ch2; downto--; continue; } else { // Unpaired ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; } } } else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff)) { // Unpaired or 0xffff ch = tokenText[downto] = UnicodeUtil.UNI_REPLACEMENT_CHAR; } code = (code*31) + ch; } int hashPos = code & postingsHashMask; // Locate RawPostingList in hash p = postingsHash[hashPos]; if (p != null && !postingEquals(tokenText, tokenTextLen)) { // Conflict: keep searching different locations in // the hash table. final int inc = ((code>>8)+code)|1; do { code += inc; hashPos = code & postingsHashMask; p = postingsHash[hashPos]; } while (p != null && !postingEquals(tokenText, tokenTextLen)); } if (p == null) { // First time we are seeing this token since we last // flushed the hash. final int textLen1 = 1+tokenTextLen; if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE) { if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE) { // Just skip this term, to remain as robust as // possible during indexing. A TokenFilter // can be inserted into the analyzer chain if // other behavior is wanted (pruning the term // to a prefix, throwing an exception, etc). if (docState.maxTermPrefix == null) docState.maxTermPrefix = new String(tokenText, 0, 30); consumer.skippingLongTerm(); return; } charPool.nextBuffer(); } // Refill? if (0 == perThread.freePostingsCount) perThread.morePostings(); // Pull next free RawPostingList from free list p = perThread.freePostings[--perThread.freePostingsCount]; assert p != null; final char[] text = charPool.buffer; final int textUpto = charPool.charUpto; p.textStart = textUpto + charPool.charOffset; charPool.charUpto += textLen1; System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen); text[textUpto+tokenTextLen] = 0xffff; assert postingsHash[hashPos] == null; postingsHash[hashPos] = p; numPostings++; if (numPostings == postingsHashHalfSize) rehashPostings(2*postingsHashSize); // Init stream slices if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) intPool.nextBuffer(); if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) bytePool.nextBuffer(); intUptos = intPool.buffer; intUptoStart = intPool.intUpto; intPool.intUpto += streamCount; p.intStart = intUptoStart + intPool.intOffset; for(int i=0;i> DocumentsWriter.INT_BLOCK_SHIFT]; intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK; consumer.addTerm(p); } if (doNextCall) nextPerField.add(p.textStart); } int[] intUptos; int intUptoStart; void writeByte(int stream, byte b) { int upto = intUptos[intUptoStart+stream]; byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT]; assert bytes != null; int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK; if (bytes[offset] != 0) { // End of slice; allocate a new one offset = bytePool.allocSlice(bytes, offset); bytes = bytePool.buffer; intUptos[intUptoStart+stream] = offset + bytePool.byteOffset; } bytes[offset] = b; (intUptos[intUptoStart+stream])++; } public void writeBytes(int stream, byte[] b, int offset, int len) { // TODO: optimize final int end = offset + len; for(int i=offset;i>>= 7; } writeByte(stream, (byte) i); } void finish() throws IOException { consumer.finish(); if (nextPerField != null) nextPerField.finish(); } /** Called when postings hash is too small (> 50% * occupied) or too large (< 20% occupied). 
*/ void rehashPostings(final int newSize) { final int newMask = newSize-1; RawPostingList[] newHash = new RawPostingList[newSize]; for(int i=0;i> DocumentsWriter.CHAR_BLOCK_SHIFT]; int pos = start; while(text[pos] != 0xffff) pos++; code = 0; while (pos > start) code = (code*31) + text[--pos]; } else code = p0.textStart; int hashPos = code & newMask; assert hashPos >= 0; if (newHash[hashPos] != null) { final int inc = ((code>>8)+code)|1; do { code += inc; hashPos = code & newMask; } while (newHash[hashPos] != null); } newHash[hashPos] = p0; } } postingsHashMask = newMask; postingsHash = newHash; postingsHashSize = newSize; postingsHashHalfSize = newSize >> 1; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocFieldProcessor.java0000644000175000017500000000571011474320230025770 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.Map; import java.util.HashMap; import java.util.Iterator; /** * This is a DocConsumer that gathers all fields under the * same name, and calls per-field consumers to process field * by field. This class doesn't doesn't do any "real" work * of its own: it just forwards the fields to a * DocFieldConsumer. */ final class DocFieldProcessor extends DocConsumer { final DocumentsWriter docWriter; final FieldInfos fieldInfos = new FieldInfos(); final DocFieldConsumer consumer; final StoredFieldsWriter fieldsWriter; public DocFieldProcessor(DocumentsWriter docWriter, DocFieldConsumer consumer) { this.docWriter = docWriter; this.consumer = consumer; consumer.setFieldInfos(fieldInfos); fieldsWriter = new StoredFieldsWriter(docWriter, fieldInfos); } public void closeDocStore(SegmentWriteState state) throws IOException { consumer.closeDocStore(state); fieldsWriter.closeDocStore(state); } public void flush(Collection threads, SegmentWriteState state) throws IOException { Map childThreadsAndFields = new HashMap(); Iterator it = threads.iterator(); while(it.hasNext()) { DocFieldProcessorPerThread perThread = (DocFieldProcessorPerThread) it.next(); childThreadsAndFields.put(perThread.consumer, perThread.fields()); perThread.trimFields(state); } fieldsWriter.flush(state); consumer.flush(childThreadsAndFields, state); // Important to save after asking consumer to flush so // consumer can alter the FieldInfo* if necessary. EG, // FreqProxTermsWriter does this with // FieldInfo.storePayload. 
final String fileName = state.segmentFileName(IndexFileNames.FIELD_INFOS_EXTENSION); fieldInfos.write(state.directory, fileName); state.flushedFiles.add(fileName); } public void abort() { fieldsWriter.abort(); consumer.abort(); } public boolean freeRAM() { return consumer.freeRAM(); } public DocConsumerPerThread addThread(DocumentsWriterThreadState threadState) throws IOException { return new DocFieldProcessorPerThread(threadState, this); } } lucene-2.9.4/src/java/org/apache/lucene/index/DirectoryOwningReader.java0000644000175000017500000000641311474320230026671 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * This class keeps track of closing the underlying directory. It is used to wrap * DirectoryReaders, that are created using a String/File parameter * in IndexReader.open() with FSDirectory.getDirectory(). * @deprecated This helper class is removed with all String/File * IndexReader.open() methods in Lucene 3.0 */ final class DirectoryOwningReader extends FilterIndexReader implements Cloneable { DirectoryOwningReader(final IndexReader in) { super(in); this.ref = new SegmentReader.Ref(); assert this.ref.refCount() == 1; } private DirectoryOwningReader(final IndexReader in, final SegmentReader.Ref ref) { super(in); this.ref = ref; ref.incRef(); } public IndexReader reopen() throws CorruptIndexException, IOException { ensureOpen(); final IndexReader r = in.reopen(); if (r != in) return new DirectoryOwningReader(r, ref); return this; } public IndexReader reopen(boolean openReadOnly) throws CorruptIndexException, IOException { ensureOpen(); final IndexReader r = in.reopen(openReadOnly); if (r != in) return new DirectoryOwningReader(r, ref); return this; } public IndexReader reopen(final IndexCommit commit) throws CorruptIndexException, IOException { ensureOpen(); final IndexReader r = in.reopen(commit); if (r != in) return new DirectoryOwningReader(r, ref); return this; } public Object clone() { ensureOpen(); return new DirectoryOwningReader((IndexReader) in.clone(), ref); } public IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException { ensureOpen(); return new DirectoryOwningReader(in.clone(openReadOnly), ref); } protected void doClose() throws IOException { IOException ioe = null; // close the reader, record exception try { super.doClose(); } catch (IOException e) { ioe = e; } // close the directory, record exception if (ref.decRef() == 0) { try { in.directory().close(); } catch (IOException e) { if (ioe == null) ioe = e; } } // throw the first exception if (ioe != null) throw ioe; } /** * This member contains the ref counter, that is passed to each instance after cloning/reopening, * and is global to all DirectoryOwningReader derived from the original 
one. * This reuses the class {@link SegmentReader.Ref} */ private final SegmentReader.Ref ref; } lucene-2.9.4/src/java/org/apache/lucene/index/IndexReader.java0000644000175000017500000017644011474320230024622 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.search.Similarity; import org.apache.lucene.store.*; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.Map; /** IndexReader is an abstract class, providing an interface for accessing an index. Search of an index is done entirely through this abstract interface, so that any subclass which implements it is searchable.

    Concrete subclasses of IndexReader are usually constructed with a call to one of the static open() methods, e.g. {@link #open(String, boolean)}.

    For efficiency, in this API documents are often referred to via document numbers, non-negative integers which each name a unique document in the index. These document numbers are ephemeral--they may change as documents are added to and deleted from an index. Clients should thus not rely on a given document having the same number between sessions.

    An IndexReader can be opened on a directory for which an IndexWriter is already open, but it cannot then be used to delete documents from the index.

    NOTE: for backwards API compatibility, several methods are not listed as abstract, but have no useful implementations in this base class and instead always throw UnsupportedOperationException. Subclasses are strongly encouraged to override these methods, but in many cases may not need to.

    NOTE: as of 2.4, it's possible to open a read-only IndexReader using one of the static open methods that accepts the boolean readOnly parameter. Such a reader has better concurrency as it's not necessary to synchronize on the isDeleted method. Currently the default for readOnly is false, meaning if not specified you will get a read/write IndexReader. But in 3.0 this default will change to true, meaning you must explicitly specify false if you want to make changes with the resulting IndexReader.
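    As a minimal sketch (the index path and the use of numDocs() here are illustrative only, not part of this API), opening and closing a read-only reader looks like:

      Directory dir = FSDirectory.open(new File("/path/to/index"));   // hypothetical index location
      IndexReader reader = IndexReader.open(dir, true);               // readOnly=true for better concurrency
      try {
        int numDocs = reader.numDocs();                               // use the reader...
      } finally {
        reader.close();                                               // ...and always close it
        dir.close();
      }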

    NOTE: {@link IndexReader} instances are completely thread safe, meaning multiple threads can call any of its methods, concurrently. If your application requires external synchronization, you should not synchronize on the IndexReader instance; use your own (non-Lucene) objects instead. @version $Id: IndexReader.java 950612 2010-06-02 16:03:19Z mikemccand $ */ public abstract class IndexReader implements Cloneable { /** * Constants describing field properties, for example used for * {@link IndexReader#getFieldNames(FieldOption)}. */ public static final class FieldOption { private String option; private FieldOption() { } private FieldOption(String option) { this.option = option; } public String toString() { return this.option; } /** All fields */ public static final FieldOption ALL = new FieldOption ("ALL"); /** All indexed fields */ public static final FieldOption INDEXED = new FieldOption ("INDEXED"); /** All fields that store payloads */ public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS"); /** All fields that omit tf */ public static final FieldOption OMIT_TERM_FREQ_AND_POSITIONS = new FieldOption ("OMIT_TERM_FREQ_AND_POSITIONS"); /** @deprecated Renamed to {@link #OMIT_TERM_FREQ_AND_POSITIONS} */ public static final FieldOption OMIT_TF = OMIT_TERM_FREQ_AND_POSITIONS; /** All fields which are not indexed */ public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED"); /** All fields which are indexed with termvectors enabled */ public static final FieldOption INDEXED_WITH_TERMVECTOR = new FieldOption ("INDEXED_WITH_TERMVECTOR"); /** All fields which are indexed but don't have termvectors enabled */ public static final FieldOption INDEXED_NO_TERMVECTOR = new FieldOption ("INDEXED_NO_TERMVECTOR"); /** All fields with termvectors enabled. Please note that only standard termvector fields are returned */ public static final FieldOption TERMVECTOR = new FieldOption ("TERMVECTOR"); /** All fields with termvectors with position values enabled */ public static final FieldOption TERMVECTOR_WITH_POSITION = new FieldOption ("TERMVECTOR_WITH_POSITION"); /** All fields with termvectors with offset values enabled */ public static final FieldOption TERMVECTOR_WITH_OFFSET = new FieldOption ("TERMVECTOR_WITH_OFFSET"); /** All fields with termvectors with offset values and position values enabled */ public static final FieldOption TERMVECTOR_WITH_POSITION_OFFSET = new FieldOption ("TERMVECTOR_WITH_POSITION_OFFSET"); } private boolean closed; protected boolean hasChanges; private int refCount; static int DEFAULT_TERMS_INDEX_DIVISOR = 1; private boolean disableFakeNorms = false; /** Expert: returns the current refCount for this reader */ public synchronized int getRefCount() { return refCount; } /** * Expert: increments the refCount of this IndexReader * instance. RefCounts are used to determine when a * reader can be closed safely, i.e. as soon as there are * no more references. Be sure to always call a * corresponding {@link #decRef}, in a finally clause; * otherwise the reader may never be closed. Note that * {@link #close} simply calls decRef(), which means that * the IndexReader will not really be closed until {@link * #decRef} has been called for all outstanding * references. * * @see #decRef */ public synchronized void incRef() { assert refCount > 0; ensureOpen(); refCount++; } /** * Expert: decreases the refCount of this IndexReader * instance. 
If the refCount drops to 0, then pending * changes (if any) are committed to the index and this * reader is closed. * * @throws IOException in case an IOException occurs in commit() or doClose() * * @see #incRef */ public synchronized void decRef() throws IOException { assert refCount > 0; ensureOpen(); if (refCount == 1) { commit(); doClose(); } refCount--; } /** * @deprecated will be deleted when IndexReader(Directory) is deleted * @see #directory() */ private Directory directory; /** * Legacy Constructor for backwards compatibility. * *

    * This Constructor should not be used, it exists for backwards * compatibility only to support legacy subclasses that did not "own" * a specific directory, but needed to specify something to be returned * by the directory() method. Future subclasses should delegate to the * no arg constructor and implement the directory() method as appropriate. * * @param directory Directory to be returned by the directory() method * @see #directory() * @deprecated - use IndexReader() */ protected IndexReader(Directory directory) { this(); this.directory = directory; } protected IndexReader() { refCount = 1; } /** * @throws AlreadyClosedException if this IndexReader is closed */ protected final void ensureOpen() throws AlreadyClosedException { if (refCount <= 0) { throw new AlreadyClosedException("this IndexReader is closed"); } } /** Returns a read/write IndexReader reading the index in an FSDirectory in the named * path. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #open(Directory, boolean)} instead. * This method will be removed in the 3.0 release. * * @param path the path to the index directory */ public static IndexReader open(String path) throws CorruptIndexException, IOException { return open(path, false); } /** Returns an IndexReader reading the index in an * FSDirectory in the named path. You should pass * readOnly=true, since it gives much better concurrent * performance, unless you intend to do write operations * (delete documents or change norms) with the reader. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @param path the path to the index directory * @param readOnly true if this should be a readOnly * reader * @deprecated Use {@link #open(Directory, boolean)} instead. * This method will be removed in the 3.0 release. * */ public static IndexReader open(String path, boolean readOnly) throws CorruptIndexException, IOException { final Directory dir = FSDirectory.getDirectory(path); IndexReader r = null; try { r = open(dir, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } finally { if (r == null) dir.close(); } return new DirectoryOwningReader(r); } /** Returns a read/write IndexReader reading the index in an FSDirectory in the named * path. * @param path the path to the index directory * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #open(Directory, boolean)} instead. * This method will be removed in the 3.0 release. * */ public static IndexReader open(File path) throws CorruptIndexException, IOException { return open(path, false); } /** Returns an IndexReader reading the index in an * FSDirectory in the named path. You should pass * readOnly=true, since it gives much better concurrent * performance, unless you intend to do write operations * (delete documents or change norms) with the reader. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @param path the path to the index directory * @param readOnly true if this should be a readOnly * reader * @deprecated Use {@link #open(Directory, boolean)} instead. * This method will be removed in the 3.0 release. 
* */ public static IndexReader open(File path, boolean readOnly) throws CorruptIndexException, IOException { final Directory dir = FSDirectory.getDirectory(path); IndexReader r = null; try { r = open(dir, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } finally { if (r == null) dir.close(); } return new DirectoryOwningReader(r); } /** Returns a read/write IndexReader reading the index in * the given Directory. * @param directory the index directory * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #open(Directory, boolean)} instead * This method will be removed in the 3.0 release. * */ public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException { return open(directory, null, null, false, DEFAULT_TERMS_INDEX_DIVISOR); } /** Returns an IndexReader reading the index in the given * Directory. You should pass readOnly=true, since it * gives much better concurrent performance, unless you * intend to do write operations (delete documents or * change norms) with the reader. * @param directory the index directory * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException { return open(directory, null, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns a read/write IndexReader reading the index in the given * {@link IndexCommit}. * @param commit the commit point to open * @throws CorruptIndexException if the index is corrupt * @deprecated Use {@link #open(IndexCommit, boolean)} instead. * This method will be removed in the 3.0 release. * * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit) throws CorruptIndexException, IOException { return open(commit.getDirectory(), null, commit, false, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns an IndexReader reading the index in the given * {@link IndexCommit}. You should pass readOnly=true, since it * gives much better concurrent performance, unless you * intend to do write operations (delete documents or * change norms) with the reader. * @param commit the commit point to open * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, boolean readOnly) throws CorruptIndexException, IOException { return open(commit.getDirectory(), null, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns a read/write IndexReader reading the index in the given * Directory, with a custom {@link IndexDeletionPolicy}. * @param directory the index directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @deprecated Use {@link #open(Directory, IndexDeletionPolicy, boolean)} instead. * This method will be removed in the 3.0 release. 
* * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException { return open(directory, deletionPolicy, null, false, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns an IndexReader reading the index in * the given Directory, with a custom {@link * IndexDeletionPolicy}. You should pass readOnly=true, * since it gives much better concurrent performance, * unless you intend to do write operations (delete * documents or change norms) with the reader. * @param directory the index directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { return open(directory, deletionPolicy, null, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns an IndexReader reading the index in * the given Directory, with a custom {@link * IndexDeletionPolicy}. You should pass readOnly=true, * since it gives much better concurrent performance, * unless you intend to do write operations (delete * documents or change norms) with the reader. * @param directory the index directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @param termInfosIndexDivisor Subsamples which indexed * terms are loaded into RAM. This has the same effect as {@link * IndexWriter#setTermIndexInterval} except that setting * must be done at indexing time while this setting can be * set per reader. When set to N, then one in every * N*termIndexInterval terms in the index is loaded into * memory. By setting this to a value > 1 you can reduce * memory usage, at the expense of higher latency when * loading a TermInfo. The default value is 1. Set this * to -1 to skip loading the terms index entirely. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { return open(directory, deletionPolicy, null, readOnly, termInfosIndexDivisor); } /** Expert: returns a read/write IndexReader reading the index in the given * Directory, using a specific commit and with a custom * {@link IndexDeletionPolicy}. * @param commit the specific {@link IndexCommit} to open; * see {@link IndexReader#listCommits} to list all commits * in a directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @deprecated Use {@link #open(IndexCommit, IndexDeletionPolicy, boolean)} instead. * This method will be removed in the 3.0 release. 
* * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException { return open(commit.getDirectory(), deletionPolicy, commit, false, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns an IndexReader reading the index in * the given Directory, using a specific commit and with * a custom {@link IndexDeletionPolicy}. You should pass * readOnly=true, since it gives much better concurrent * performance, unless you intend to do write operations * (delete documents or change norms) with the reader. * @param commit the specific {@link IndexCommit} to open; * see {@link IndexReader#listCommits} to list all commits * in a directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException { return open(commit.getDirectory(), deletionPolicy, commit, readOnly, DEFAULT_TERMS_INDEX_DIVISOR); } /** Expert: returns an IndexReader reading the index in * the given Directory, using a specific commit and with * a custom {@link IndexDeletionPolicy}. You should pass * readOnly=true, since it gives much better concurrent * performance, unless you intend to do write operations * (delete documents or change norms) with the reader. * @param commit the specific {@link IndexCommit} to open; * see {@link IndexReader#listCommits} to list all commits * in a directory * @param deletionPolicy a custom deletion policy (only used * if you use this reader to perform deletes or to set * norms); see {@link IndexWriter} for details. * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader * @param termInfosIndexDivisor Subsamples which indexed * terms are loaded into RAM. This has the same effect as {@link * IndexWriter#setTermIndexInterval} except that setting * must be done at indexing time while this setting can be * set per reader. When set to N, then one in every * N*termIndexInterval terms in the index is loaded into * memory. By setting this to a value > 1 you can reduce * memory usage, at the expense of higher latency when * loading a TermInfo. The default value is 1. Set this * to -1 to skip loading the terms index entirely. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { return open(commit.getDirectory(), deletionPolicy, commit, readOnly, termInfosIndexDivisor); } private static IndexReader open(final Directory directory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly, int termInfosIndexDivisor) throws CorruptIndexException, IOException { return DirectoryReader.open(directory, deletionPolicy, commit, readOnly, termInfosIndexDivisor); } /** * Refreshes an IndexReader if the index has changed since this instance * was (re)opened. *

    * Opening an IndexReader is an expensive operation. This method can be used * to refresh an existing IndexReader to reduce these costs. This method * tries to only load segments that have changed or were created after the * IndexReader was (re)opened. *

    * If the index has not changed since this instance was (re)opened, then this * call is a NOOP and returns this instance. Otherwise, a new instance is * returned. The old instance is not closed and remains usable.
    *

    If the reader is reopened, even though the old and new readers share * resources internally, it's safe to make changes * (deletions, norms) with the new reader. All shared * mutable state obeys "copy on write" semantics to ensure * the changes are not seen by other readers. *

    * You can determine whether a reader was actually reopened by comparing the * old instance with the instance returned by this method: *

       * IndexReader reader = ... 
       * ...
       * IndexReader newReader = reader.reopen();
       * if (newReader != reader) {
       * ...     // reader was reopened
       *   reader.close(); 
       * }
       * reader = newReader;
       * ...
       * 
    * * Be sure to synchronize that code so that other threads, * if present, can never use reader after it has been * closed and before it's switched to newReader. * *

    NOTE: If this reader is a near real-time * reader (obtained from {@link IndexWriter#getReader()}, * reopen() will simply call writer.getReader() again for * you, though this may change in the future. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized IndexReader reopen() throws CorruptIndexException, IOException { throw new UnsupportedOperationException("This reader does not support reopen()."); } /** Just like {@link #reopen()}, except you can change the * readOnly of the original reader. If the index is * unchanged but readOnly is different then a new reader * will be returned. */ public synchronized IndexReader reopen(boolean openReadOnly) throws CorruptIndexException, IOException { throw new UnsupportedOperationException("This reader does not support reopen()."); } /** Expert: reopen this reader on a specific commit point. * This always returns a readOnly reader. If the * specified commit point matches what this reader is * already on, and this reader is already readOnly, then * this same instance is returned; if it is not already * readOnly, a readOnly clone is returned. */ public synchronized IndexReader reopen(final IndexCommit commit) throws CorruptIndexException, IOException { throw new UnsupportedOperationException("This reader does not support reopen(IndexCommit)."); } /** * Efficiently clones the IndexReader (sharing most * internal state). *

    * On cloning a reader with pending changes (deletions, * norms), the original reader transfers its write lock to * the cloned reader. This means only the cloned reader * may make further changes to the index, and commit the * changes to the index on close, but the old reader still * reflects all changes made up until it was cloned. *

    * Like {@link #reopen()}, it's safe to make changes to * either the original or the cloned reader: all shared * mutable state obeys "copy on write" semantics to ensure * the changes are not seen by other readers. *
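   * A minimal sketch (reader here is assumed to be an already-open IndexReader):
   *
   *   IndexReader cloned = (IndexReader) reader.clone();   // shares most internal state with reader
   *   ...
   *   cloned.close();                                      // the clone is closed independently of the original
   *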

    * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized Object clone() { throw new UnsupportedOperationException("This reader does not implement clone()"); } /** * Clones the IndexReader and optionally changes readOnly. A readOnly * reader cannot open a writeable reader. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException { throw new UnsupportedOperationException("This reader does not implement clone()"); } /** * Returns the directory associated with this index. The Default * implementation returns the directory specified by subclasses when * delegating to the IndexReader(Directory) constructor, or throws an * UnsupportedOperationException if one was not specified. * @throws UnsupportedOperationException if no directory */ public Directory directory() { ensureOpen(); if (null != directory) { return directory; } else { throw new UnsupportedOperationException("This reader does not support this method."); } } /** * Returns the time the index in the named directory was last modified. * Do not use this to check whether the reader is still up-to-date, use * {@link #isCurrent()} instead. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #lastModified(Directory)} instead. * This method will be removed in the 3.0 release. */ public static long lastModified(String directory) throws CorruptIndexException, IOException { return lastModified(new File(directory)); } /** * Returns the time the index in the named directory was last modified. * Do not use this to check whether the reader is still up-to-date, use * {@link #isCurrent()} instead. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #lastModified(Directory)} instead. * This method will be removed in the 3.0 release. * */ public static long lastModified(File fileDirectory) throws CorruptIndexException, IOException { Directory dir = FSDirectory.getDirectory(fileDirectory); // use new static method here try { return lastModified(dir); } finally { dir.close(); } } /** * Returns the time the index in the named directory was last modified. * Do not use this to check whether the reader is still up-to-date, use * {@link #isCurrent()} instead. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static long lastModified(final Directory directory2) throws CorruptIndexException, IOException { return ((Long) new SegmentInfos.FindSegmentsFile(directory2) { public Object doBody(String segmentFileName) throws IOException { return new Long(directory2.fileModified(segmentFileName)); } }.run()).longValue(); } /** * Reads version number from segments files. The version number is * initialized with a timestamp and then increased by one for each change of * the index. * * @param directory where the index resides. * @return version number. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #getCurrentVersion(Directory)} instead. * This method will be removed in the 3.0 release. 
*/ public static long getCurrentVersion(String directory) throws CorruptIndexException, IOException { return getCurrentVersion(new File(directory)); } /** * Reads version number from segments files. The version number is * initialized with a timestamp and then increased by one for each change of * the index. * * @param directory where the index resides. * @return version number. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated Use {@link #getCurrentVersion(Directory)} instead. * This method will be removed in the 3.0 release. */ public static long getCurrentVersion(File directory) throws CorruptIndexException, IOException { Directory dir = FSDirectory.getDirectory(directory); try { return getCurrentVersion(dir); } finally { dir.close(); } } /** * Reads version number from segments files. The version number is * initialized with a timestamp and then increased by one for each change of * the index. * * @param directory where the index resides. * @return version number. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException { return SegmentInfos.readCurrentVersion(directory); } /** * Reads commitUserData, previously passed to {@link * IndexWriter#commit(Map)}, from current index * segments file. This will return null if {@link * IndexWriter#commit(Map)} has never been called for * this index. * * @param directory where the index resides. * @return commit userData. * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * * @see #getCommitUserData() */ public static Map getCommitUserData(Directory directory) throws CorruptIndexException, IOException { return SegmentInfos.readCurrentUserData(directory); } /** * Version number when this IndexReader was opened. Not * implemented in the IndexReader base class. * *

    If this reader is based on a Directory (ie, was * created by calling {@link #open}, or {@link #reopen} on * a reader based on a Directory), then this method * returns the version recorded in the commit that the * reader opened. This version is advanced every time * {@link IndexWriter#commit} is called.

    * *

    If instead this reader is a near real-time reader * (ie, obtained by a call to {@link * IndexWriter#getReader}, or by calling {@link #reopen} * on a near real-time reader), then this method returns * the version of the last commit done by the writer. * Note that even as further changes are made with the * writer, the version will not change until a commit is * completed. Thus, you should not rely on this method to * determine when a near real-time reader should be * opened. Use {@link #isCurrent} instead.

    * * @throws UnsupportedOperationException unless overridden in subclass */ public long getVersion() { throw new UnsupportedOperationException("This reader does not support this method."); } /** * Retrieve the String userData optionally passed to * IndexWriter#commit. This will return null if {@link * IndexWriter#commit(Map)} has never been called for * this index. * * @see #getCommitUserData(Directory) */ public Map getCommitUserData() { throw new UnsupportedOperationException("This reader does not support this method."); } /**

    For IndexReader implementations that use * TermInfosReader to read terms, this sets the * indexDivisor to subsample the number of indexed terms * loaded into memory. This has the same effect as {@link * IndexWriter#setTermIndexInterval} except that setting * must be done at indexing time while this setting can be * set per reader. When set to N, then one in every * N*termIndexInterval terms in the index is loaded into * memory. By setting this to a value > 1 you can reduce * memory usage, at the expense of higher latency when * loading a TermInfo. The default value is 1.

    * * NOTE: you must call this before the term * index is loaded. If the index is already loaded, * an IllegalStateException is thrown. * @throws IllegalStateException if the term index has already been loaded into memory * @deprecated Please use {@link IndexReader#open(Directory, IndexDeletionPolicy, boolean, int)} to specify the required TermInfos index divisor instead. */ public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException { throw new UnsupportedOperationException("Please pass termInfosIndexDivisor up-front when opening IndexReader"); } /**

    For IndexReader implementations that use * TermInfosReader to read terms, this returns the * current indexDivisor as specified when the reader was * opened. */ public int getTermInfosIndexDivisor() { throw new UnsupportedOperationException("This reader does not support this method."); } /** * Check whether any new changes have occurred to the * index since this reader was opened. * *

    If this reader is based on a Directory (ie, was * created by calling {@link #open}, or {@link #reopen} on * a reader based on a Directory), then this method checks * if any further commits (see {@link IndexWriter#commit}) * have occurred in that directory.

    * *

    If instead this reader is a near real-time reader * (ie, obtained by a call to {@link * IndexWriter#getReader}, or by calling {@link #reopen} * on a near real-time reader), then this method checks if * either a new commit has occurred, or any new * uncommitted changes have taken place via the writer. * Note that even if the writer has only performed * merging, this method will still return false.

    * *

    In any event, if this returns false, you should call * {@link #reopen} to get a new reader that sees the * changes.
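    A refresh sketch built on this method (reader is assumed to be an open, Directory-based reader):

      if (!reader.isCurrent()) {
        IndexReader newReader = reader.reopen();
        if (newReader != reader) {
          reader.close();      // the old instance is not closed by reopen()
          reader = newReader;
        }
      }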

    * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @throws UnsupportedOperationException unless overridden in subclass */ public boolean isCurrent() throws CorruptIndexException, IOException { throw new UnsupportedOperationException("This reader does not support this method."); } /** * Checks is the index is optimized (if it has a single segment and * no deletions). Not implemented in the IndexReader base class. * @return true if the index is optimized; false otherwise * @throws UnsupportedOperationException unless overridden in subclass */ public boolean isOptimized() { throw new UnsupportedOperationException("This reader does not support this method."); } /** * Return an array of term frequency vectors for the specified document. * The array contains a vector for each vectorized field in the document. * Each vector contains terms and frequencies for all terms in a given vectorized field. * If no such fields existed, the method returns null. The term vectors that are * returned may either be of type {@link TermFreqVector} * or of type {@link TermPositionVector} if * positions or offsets have been stored. * * @param docNumber document for which term frequency vectors are returned * @return array of term frequency vectors. May be null if no term vectors have been * stored for the specified document. * @throws IOException if index cannot be accessed * @see org.apache.lucene.document.Field.TermVector */ abstract public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException; /** * Return a term frequency vector for the specified document and field. The * returned vector contains terms and frequencies for the terms in * the specified field of this document, if the field had the storeTermVector * flag set. If termvectors had been stored with positions or offsets, a * {@link TermPositionVector} is returned. * * @param docNumber document for which the term frequency vector is returned * @param field field for which the term frequency vector is returned. * @return term frequency vector May be null if field does not exist in the specified * document or term vector was not stored. * @throws IOException if index cannot be accessed * @see org.apache.lucene.document.Field.TermVector */ abstract public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException; /** * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of * the {@link TermFreqVector}. * @param docNumber The number of the document to load the vector for * @param field The name of the field to load * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. * */ abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException; /** * Map all the term vectors for all fields in a Document * @param docNumber The number of the document to load the vector for * @param mapper The {@link TermVectorMapper} to process the vector. Must not be null * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified. */ abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException; /** * Returns true if an index exists at the specified directory. * If the directory does not exist or if there is no index in it. * false is returned. 
* @param directory the directory to check for an index * @return true if an index exists; false otherwise * @deprecated Use {@link #indexExists(Directory)} instead * This method will be removed in the 3.0 release. * */ public static boolean indexExists(String directory) { return indexExists(new File(directory)); } /** * Returns true if an index exists at the specified directory. * If the directory does not exist or if there is no index in it. * @param directory the directory to check for an index * @return true if an index exists; false otherwise * @deprecated Use {@link #indexExists(Directory)} instead. * This method will be removed in the 3.0 release. * */ public static boolean indexExists(File directory) { return SegmentInfos.getCurrentSegmentGeneration(directory.list()) != -1; } /** * Returns true if an index exists at the specified directory. * If the directory does not exist or if there is no index in it. * @param directory the directory to check for an index * @return true if an index exists; false otherwise * @throws IOException if there is a problem with accessing the index */ public static boolean indexExists(Directory directory) throws IOException { return SegmentInfos.getCurrentSegmentGeneration(directory) != -1; } /** Returns the number of documents in this index. */ public abstract int numDocs(); /** Returns one greater than the largest possible document number. * This may be used to, e.g., determine how big to allocate an array which * will have an element for every document number in an index. */ public abstract int maxDoc(); /** Returns the number of deleted documents. */ public int numDeletedDocs() { return maxDoc() - numDocs(); } /** * Returns the stored fields of the nth * Document in this index. *

    * NOTE: for performance reasons, this method does not check if the * requested document is deleted, and therefore asking for a deleted document * may yield unspecified results. Usually this is not required, however you * can call {@link #isDeleted(int)} with the requested document ID to verify * the document is not deleted. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public Document document(int n) throws CorruptIndexException, IOException { ensureOpen(); return document(n, null); } /** * Get the {@link org.apache.lucene.document.Document} at the n * th position. The {@link FieldSelector} may be used to determine * what {@link org.apache.lucene.document.Field}s to load and how they should * be loaded. NOTE: If this Reader (more specifically, the underlying * FieldsReader) is closed before the lazy * {@link org.apache.lucene.document.Field} is loaded an exception may be * thrown. If you want the value of a lazy * {@link org.apache.lucene.document.Field} to be available after closing you * must explicitly load it or fetch the Document again with a new loader. *

    * NOTE: for performance reasons, this method does not check if the * requested document is deleted, and therefore asking for a deleted document * may yield unspecified results. Usually this is not required, however you * can call {@link #isDeleted(int)} with the requested document ID to verify * the document is not deleted. * * @param n Get the document at the nth position * @param fieldSelector The {@link FieldSelector} to use to determine what * Fields should be loaded on the Document. May be null, in which case * all Fields will be loaded. * @return The stored fields of the * {@link org.apache.lucene.document.Document} at the nth position * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @see org.apache.lucene.document.Fieldable * @see org.apache.lucene.document.FieldSelector * @see org.apache.lucene.document.SetBasedFieldSelector * @see org.apache.lucene.document.LoadFirstFieldSelector */ // TODO (1.5): When we convert to JDK 1.5 make this Set public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException; /** Returns true if document n has been deleted */ public abstract boolean isDeleted(int n); /** Returns true if any documents have been deleted */ public abstract boolean hasDeletions(); /** Returns true if there are norms stored for this field. */ public boolean hasNorms(String field) throws IOException { // backward compatible implementation. // SegmentReader has an efficient implementation. ensureOpen(); return norms(field) != null; } /** Returns the byte-encoded normalization factor for the named field of * every document. This is used by the search code to score documents. * * @see org.apache.lucene.document.Field#setBoost(float) */ public abstract byte[] norms(String field) throws IOException; /** Reads the byte-encoded normalization factor for the named field of every * document. This is used by the search code to score documents. * * @see org.apache.lucene.document.Field#setBoost(float) */ public abstract void norms(String field, byte[] bytes, int offset) throws IOException; /** Expert: Resets the normalization factor for the named field of the named * document. The norm represents the product of the field's {@link * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link Similarity#lengthNorm(String, * int) length normalization}. Thus, to preserve the length normalization * values when resetting this, one should base the new value upon the old. * * NOTE: If this field does not store norms, then * this method call will silently do nothing. * * @see #norms(String) * @see Similarity#decodeNorm(byte) * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public synchronized void setNorm(int doc, String field, byte value) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { ensureOpen(); acquireWriteLock(); hasChanges = true; doSetNorm(doc, field, value); } /** Implements setNorm in subclass.*/ protected abstract void doSetNorm(int doc, String field, byte value) throws CorruptIndexException, IOException; /** Expert: Resets the normalization factor for the named field of the named * document. 
* * @see #norms(String) * @see Similarity#decodeNorm(byte) * * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public void setNorm(int doc, String field, float value) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { ensureOpen(); setNorm(doc, field, Similarity.encodeNorm(value)); } /** Returns an enumeration of all the terms in the index. The * enumeration is ordered by Term.compareTo(). Each term is greater * than all that precede it in the enumeration. Note that after * calling terms(), {@link TermEnum#next()} must be called * on the resulting enumeration before calling other methods such as * {@link TermEnum#term()}. * @throws IOException if there is a low-level IO error */ public abstract TermEnum terms() throws IOException; /** Returns an enumeration of all terms starting at a given term. If * the given term does not exist, the enumeration is positioned at the * first term greater than the supplied term. The enumeration is * ordered by Term.compareTo(). Each term is greater than all that * precede it in the enumeration. * @throws IOException if there is a low-level IO error */ public abstract TermEnum terms(Term t) throws IOException; /** Returns the number of documents containing the term t. * @throws IOException if there is a low-level IO error */ public abstract int docFreq(Term t) throws IOException; /** Returns an enumeration of all the documents which contain * term. For each document, the document number, the frequency of * the term in that document is also provided, for use in * search scoring. If term is null, then all non-deleted * docs are returned with freq=1. * Thus, this method implements the mapping: *

      * Term    =>    <docNum, freq>*
    *
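    * A usage sketch (the field name and term text are illustrative):
    *
    *   TermDocs termDocs = reader.termDocs(new Term("contents", "apache"));
    *   try {
    *     while (termDocs.next()) {
    *       int doc = termDocs.doc();     // matching document number
    *       int freq = termDocs.freq();   // frequency of the term in that document
    *     }
    *   } finally {
    *     termDocs.close();
    *   }
    *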

    The enumeration is ordered by document number. Each document number * is greater than all that precede it in the enumeration. * @throws IOException if there is a low-level IO error */ public TermDocs termDocs(Term term) throws IOException { ensureOpen(); TermDocs termDocs = termDocs(); termDocs.seek(term); return termDocs; } /** Returns an unpositioned {@link TermDocs} enumerator. * @throws IOException if there is a low-level IO error */ public abstract TermDocs termDocs() throws IOException; /** Returns an enumeration of all the documents which contain * term. For each document, in addition to the document number * and frequency of the term in that document, a list of all of the ordinal * positions of the term in the document is available. Thus, this method * implements the mapping: * *

      * Term    =>    <docNum, freq, <pos_1, pos_2, ..., pos_freq-1>>*
    *

    This positional information facilitates phrase and proximity searching. *
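    * A positional sketch (the field name and term text are illustrative):
    *
    *   TermPositions termPositions = reader.termPositions(new Term("contents", "apache"));
    *   while (termPositions.next()) {
    *     for (int i = 0; i < termPositions.freq(); i++) {
    *       int position = termPositions.nextPosition();   // next ordinal position of the term in this document
    *     }
    *   }
    *   termPositions.close();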

    The enumeration is ordered by document number. Each document number is * greater than all that precede it in the enumeration. * @throws IOException if there is a low-level IO error */ public TermPositions termPositions(Term term) throws IOException { ensureOpen(); TermPositions termPositions = termPositions(); termPositions.seek(term); return termPositions; } /** Returns an unpositioned {@link TermPositions} enumerator. * @throws IOException if there is a low-level IO error */ public abstract TermPositions termPositions() throws IOException; /** Deletes the document numbered docNum. Once a document is * deleted it will not appear in TermDocs or TermPostitions enumerations. * Attempts to read its field with the {@link #document} * method will result in an error. The presence of this document may still be * reflected in the {@link #docFreq} statistic, though * this will be corrected eventually as the index is further modified. * * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public synchronized void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { ensureOpen(); acquireWriteLock(); hasChanges = true; doDelete(docNum); } /** Implements deletion of the document numbered docNum. * Applications should call {@link #deleteDocument(int)} or {@link #deleteDocuments(Term)}. */ protected abstract void doDelete(int docNum) throws CorruptIndexException, IOException; /** Deletes all documents that have a given term indexed. * This is useful if one uses a document field to hold a unique ID string for * the document. Then to delete such a document, one merely constructs a * term with the appropriate field and the unique ID string as its text and * passes it to this method. * See {@link #deleteDocument(int)} for information about when this deletion will * become effective. * * @return the number of documents deleted * @throws StaleReaderException if the index has changed * since this reader was opened * @throws CorruptIndexException if the index is corrupt * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws IOException if there is a low-level IO error */ public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { ensureOpen(); TermDocs docs = termDocs(term); if (docs == null) return 0; int n = 0; try { while (docs.next()) { deleteDocument(docs.doc()); n++; } } finally { docs.close(); } return n; } /** Undeletes all documents currently marked as deleted in this index. * * @throws StaleReaderException if the index has changed * since this reader was opened * @throws LockObtainFailedException if another writer * has this index open (write.lock could not * be obtained) * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized void undeleteAll() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException { ensureOpen(); acquireWriteLock(); hasChanges = true; doUndeleteAll(); } /** Implements actual undeleteAll() in subclass. 
*/ protected abstract void doUndeleteAll() throws CorruptIndexException, IOException; /** Does nothing by default. Subclasses that require a write lock for * index modifications must implement this method. */ protected synchronized void acquireWriteLock() throws IOException { /* NOOP */ } /** * * @throws IOException */ public final synchronized void flush() throws IOException { ensureOpen(); commit(); } /** * @param commitUserData Opaque Map (String -> String) * that's recorded into the segments file in the index, * and retrievable by {@link * IndexReader#getCommitUserData}. * @throws IOException */ public final synchronized void flush(Map commitUserData) throws IOException { ensureOpen(); commit(commitUserData); } /** * Commit changes resulting from delete, undeleteAll, or * setNorm operations * * If an exception is hit, then either no changes or all * changes will have been committed to the index * (transactional semantics). * @throws IOException if there is a low-level IO error */ protected final synchronized void commit() throws IOException { commit(null); } /** * Commit changes resulting from delete, undeleteAll, or * setNorm operations * * If an exception is hit, then either no changes or all * changes will have been committed to the index * (transactional semantics). * @throws IOException if there is a low-level IO error */ protected final synchronized void commit(Map commitUserData) throws IOException { if (hasChanges) { doCommit(commitUserData); } hasChanges = false; } /** Implements commit. * @deprecated Please implement {@link #doCommit(Map) * instead}. */ protected abstract void doCommit() throws IOException; /** Implements commit. NOTE: subclasses should override * this. In 3.0 this will become an abstract method. */ void doCommit(Map commitUserData) throws IOException { // Default impl discards commitUserData; all Lucene // subclasses override this (do not discard it). doCommit(); } /** * Closes files associated with this index. * Also saves any new deletions to disk. * No other methods should be called after this has been called. * @throws IOException if there is a low-level IO error */ public final synchronized void close() throws IOException { if (!closed) { decRef(); closed = true; } } /** Implements close. */ protected abstract void doClose() throws IOException; /** * Get a list of unique field names that exist in this index and have the specified * field option information. * @param fldOption specifies which field option should be available for the returned fields * @return Collection of Strings indicating the names of the fields. * @see IndexReader.FieldOption */ public abstract Collection getFieldNames(FieldOption fldOption); /** * Returns true iff the index in the named directory is * currently locked. * @param directory the directory to check for a lock * @throws IOException if there is a low-level IO error * @deprecated Please use {@link IndexWriter#isLocked(Directory)} instead. * This method will be removed in the 3.0 release. * */ public static boolean isLocked(Directory directory) throws IOException { return directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked(); } /** * Returns true iff the index in the named directory is * currently locked. * @param directory the directory to check for a lock * @throws IOException if there is a low-level IO error * @deprecated Use {@link #isLocked(Directory)} instead. * This method will be removed in the 3.0 release. 
* */ public static boolean isLocked(String directory) throws IOException { Directory dir = FSDirectory.getDirectory(directory); try { return isLocked(dir); } finally { dir.close(); } } /** * Forcibly unlocks the index in the named directory. *

    * Caution: this should only be used by failure recovery code, * when it is known that no other process nor thread is in fact * currently accessing this index. * @deprecated Please use {@link IndexWriter#unlock(Directory)} instead. * This method will be removed in the 3.0 release. * */ public static void unlock(Directory directory) throws IOException { directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release(); } /** * Expert: return the IndexCommit that this reader has * opened. This method is only implemented by those * readers that correspond to a Directory with its own * segments_N file. * *

    WARNING: this API is new and experimental and * may suddenly change.

    */ public IndexCommit getIndexCommit() throws IOException { throw new UnsupportedOperationException("This reader does not support this method."); } /** * Prints the filename and size of each file within a given compound file. * Add the -extract flag to extract files to the current working directory. * In order to make the extracted version of the index work, you have to copy * the segments file from the compound index into the directory where the extracted files are stored. * @param args Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile> */ public static void main(String [] args) { String filename = null; boolean extract = false; for (int i = 0; i < args.length; ++i) { if (args[i].equals("-extract")) { extract = true; } else if (filename == null) { filename = args[i]; } } if (filename == null) { System.out.println("Usage: org.apache.lucene.index.IndexReader [-extract] "); return; } Directory dir = null; CompoundFileReader cfr = null; try { File file = new File(filename); String dirname = file.getAbsoluteFile().getParent(); filename = file.getName(); dir = FSDirectory.open(new File(dirname)); cfr = new CompoundFileReader(dir, filename); String [] files = cfr.list(); Arrays.sort(files); // sort the array of filename so that the output is more readable for (int i = 0; i < files.length; ++i) { long len = cfr.fileLength(files[i]); if (extract) { System.out.println("extract " + files[i] + " with " + len + " bytes to local directory..."); IndexInput ii = cfr.openInput(files[i]); FileOutputStream f = new FileOutputStream(files[i]); // read and write with a small buffer, which is more effective than reading byte by byte byte[] buffer = new byte[1024]; int chunk = buffer.length; while(len > 0) { final int bufLen = (int) Math.min(chunk, len); ii.readBytes(buffer, 0, bufLen); f.write(buffer, 0, bufLen); len -= bufLen; } f.close(); ii.close(); } else System.out.println(files[i] + ": " + len + " bytes"); } } catch (IOException ioe) { ioe.printStackTrace(); } finally { try { if (dir != null) dir.close(); if (cfr != null) cfr.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } } /** Returns all commit points that exist in the Directory. * Normally, because the default is {@link * KeepOnlyLastCommitDeletionPolicy}, there would be only * one commit point. But if you're using a custom {@link * IndexDeletionPolicy} then there could be many commits. * Once you have a given commit, you can open a reader on * it by calling {@link IndexReader#open(IndexCommit)} * There must be at least one commit in * the Directory, else this method throws {@link * java.io.IOException}. Note that if a commit is in * progress while this method is running, that commit * may or may not be returned array. */ public static Collection listCommits(Directory dir) throws IOException { return DirectoryReader.listCommits(dir); } /** Expert: returns the sequential sub readers that this * reader is logically composed of. For example, * IndexSearcher uses this API to drive searching by one * sub reader at a time. If this reader is not composed * of sequential child readers, it should return null. * If this method returns an empty array, that means this * reader is a null reader (for example a MultiReader * that has no sub readers). *

    * NOTE: You should not try using sub-readers returned by * this method to make any changes (setNorm, deleteDocument, * etc.). While this might succeed for one composite reader * (like MultiReader), it will most likely lead to index * corruption for other readers (like DirectoryReader obtained * through {@link #open}. Use the parent reader directly. */ public IndexReader[] getSequentialSubReaders() { return null; } /** Expert * @deprecated */ public Object getFieldCacheKey() { return this; } /** Expert. Warning: this returns null if the reader has * no deletions */ public Object getDeletesCacheKey() { return this; } /** Returns the number of unique terms (across all fields) * in this reader. * * This method returns long, even though internally * Lucene cannot handle more than 2^31 unique terms, for * a possible future when this limitation is removed. * * @throws UnsupportedOperationException if this count * cannot be easily determined (eg Multi*Readers). * Instead, you should call {@link * #getSequentialSubReaders} and ask each sub reader for * its unique term count. */ public long getUniqueTermCount() throws IOException { throw new UnsupportedOperationException("this reader does not implement getUniqueTermCount()"); } /** Expert: Return the state of the flag that disables fakes norms in favor of representing the absence of field norms with null. * @return true if fake norms are disabled * @deprecated This currently defaults to false (to remain * back-compatible), but in 3.0 it will be hardwired to * true, meaning the norms() methods will return null for * fields that had disabled norms. */ public boolean getDisableFakeNorms() { return disableFakeNorms; } /** Expert: Set the state of the flag that disables fakes norms in favor of representing the absence of field norms with null. * @param disableFakeNorms true to disable fake norms, false to preserve the legacy behavior * @deprecated This currently defaults to false (to remain * back-compatible), but in 3.0 it will be hardwired to * true, meaning the norms() methods will return null for * fields that had disabled norms. */ public void setDisableFakeNorms(boolean disableFakeNorms) { this.disableFakeNorms = disableFakeNorms; } } lucene-2.9.4/src/java/org/apache/lucene/index/ParallelReader.java0000644000175000017500000004647011474320230025306 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
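// Illustrative sketch of the pattern described above for getUniqueTermCount(): composite
// readers (Multi*Readers) cannot compute the count directly, so ask each sequential
// sub-reader instead. Assumes `reader` is an open IndexReader; note that summing per-segment
// counts may count the same term more than once.
//
//   IndexReader[] subs = reader.getSequentialSubReaders();
//   long uniqueTerms = 0;
//   if (subs == null) {
//     uniqueTerms = reader.getUniqueTermCount();
//   } else {
//     for (int i = 0; i < subs.length; i++) {
//       uniqueTerms += subs[i].getUniqueTermCount();
//     }
//   }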
*/ import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; import org.apache.lucene.document.Fieldable; import org.apache.lucene.search.FieldCache; // not great (circular); used only to purge FieldCache entry on close import java.io.IOException; import java.util.*; /** An IndexReader which reads multiple, parallel indexes. Each index added * must have the same number of documents, but typically each contains * different fields. Each document contains the union of the fields of all * documents with the same document number. When searching, matches for a * query term are from the first index added that has the field. * *

    This is useful, e.g., with collections that have large fields which * change rarely and small fields that change more frequently. The smaller * fields may be re-indexed in a new index and both indexes may be searched * together. * *

    Warning: It is up to you to make sure all indexes * are created and modified the same way. For example, if you add * documents to one index, you need to add the same documents in the * same order to the other indexes. Failure to do so will result in * undefined behavior. */ public class ParallelReader extends IndexReader { private List readers = new ArrayList(); private List decrefOnClose = new ArrayList(); // remember which subreaders to decRef on close boolean incRefReaders = false; private SortedMap fieldToReader = new TreeMap(); private Map readerToFields = new HashMap(); private List storedFieldReaders = new ArrayList(); private int maxDoc; private int numDocs; private boolean hasDeletions; /** Construct a ParallelReader. *

    Note that all subreaders are closed if this ParallelReader is closed.
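// Illustrative usage sketch (assumes two parallel indexes in Directories `dir1` and `dir2`
// that were built with the same documents in the same order, per the warning above):
//
//   ParallelReader pr = new ParallelReader();               // closes subreaders on close()
//   pr.add(IndexReader.open(dir1, true));                   // large, rarely changing fields
//   pr.add(IndexReader.open(dir2, true), true);             // small fields; stored fields ignored
//   IndexSearcher searcher = new IndexSearcher(pr);
//   // ... run queries ...
//   searcher.close();
//   pr.close();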

    */ public ParallelReader() throws IOException { this(true); } /** Construct a ParallelReader. * @param closeSubReaders indicates whether the subreaders should be closed * when this ParallelReader is closed */ public ParallelReader(boolean closeSubReaders) throws IOException { super(); this.incRefReaders = !closeSubReaders; } /** Add an IndexReader. * @throws IOException if there is a low-level IO error */ public void add(IndexReader reader) throws IOException { ensureOpen(); add(reader, false); } /** Add an IndexReader whose stored fields will not be returned. This can * accelerate search when stored fields are only needed from a subset of * the IndexReaders. * * @throws IllegalArgumentException if not all indexes contain the same number * of documents * @throws IllegalArgumentException if not all indexes have the same value * of {@link IndexReader#maxDoc()} * @throws IOException if there is a low-level IO error */ public void add(IndexReader reader, boolean ignoreStoredFields) throws IOException { ensureOpen(); if (readers.size() == 0) { this.maxDoc = reader.maxDoc(); this.numDocs = reader.numDocs(); this.hasDeletions = reader.hasDeletions(); } if (reader.maxDoc() != maxDoc) // check compatibility throw new IllegalArgumentException ("All readers must have same maxDoc: "+maxDoc+"!="+reader.maxDoc()); if (reader.numDocs() != numDocs) throw new IllegalArgumentException ("All readers must have same numDocs: "+numDocs+"!="+reader.numDocs()); Collection fields = reader.getFieldNames(IndexReader.FieldOption.ALL); readerToFields.put(reader, fields); Iterator i = fields.iterator(); while (i.hasNext()) { // update fieldToReader map String field = (String)i.next(); if (fieldToReader.get(field) == null) fieldToReader.put(field, reader); } if (!ignoreStoredFields) storedFieldReaders.add(reader); // add to storedFieldReaders readers.add(reader); if (incRefReaders) { reader.incRef(); } decrefOnClose.add(Boolean.valueOf(incRefReaders)); } public synchronized Object clone() { try { return doReopen(true); } catch (Exception ex) { throw new RuntimeException(ex); } } /** * Tries to reopen the subreaders. *
    * If one or more subreaders could be re-opened (i.e. subReader.reopen() * returned a new instance != subReader), then a new ParallelReader instance * is returned; otherwise this instance is returned. *

    * A re-opened instance might share one or more subreaders with the old * instance. Index modification operations result in undefined behavior * when performed before the old instance is closed. * (see {@link IndexReader#reopen()}). *
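// Illustrative reopen sketch (continuing the hypothetical `pr` from the usage sketch above):
//
//   IndexReader reopened = pr.reopen();
//   if (reopened != pr) {
//     pr.close();                          // the old instance is no longer needed
//     pr = (ParallelReader) reopened;      // doReopen returns a ParallelReader when anything changed
//   }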

    * If subreaders are shared, then the reference count of those * readers is increased to ensure that the subreaders remain open * until the last referring reader is closed. * * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public synchronized IndexReader reopen() throws CorruptIndexException, IOException { return doReopen(false); } protected IndexReader doReopen(boolean doClone) throws CorruptIndexException, IOException { ensureOpen(); boolean reopened = false; List newReaders = new ArrayList(); boolean success = false; try { for (int i = 0; i < readers.size(); i++) { IndexReader oldReader = (IndexReader) readers.get(i); IndexReader newReader = null; if (doClone) { newReader = (IndexReader) oldReader.clone(); } else { newReader = oldReader.reopen(); } newReaders.add(newReader); // if at least one of the subreaders was updated we remember that // and return a new ParallelReader if (newReader != oldReader) { reopened = true; } } success = true; } finally { if (!success && reopened) { for (int i = 0; i < newReaders.size(); i++) { IndexReader r = (IndexReader) newReaders.get(i); if (r != readers.get(i)) { try { r.close(); } catch (IOException ignore) { // keep going - we want to clean up as much as possible } } } } } if (reopened) { List newDecrefOnClose = new ArrayList(); ParallelReader pr = new ParallelReader(); for (int i = 0; i < readers.size(); i++) { IndexReader oldReader = (IndexReader) readers.get(i); IndexReader newReader = (IndexReader) newReaders.get(i); if (newReader == oldReader) { newDecrefOnClose.add(Boolean.TRUE); newReader.incRef(); } else { // this is a new subreader instance, so on close() we don't // decRef but close it newDecrefOnClose.add(Boolean.FALSE); } pr.add(newReader, !storedFieldReaders.contains(oldReader)); } pr.decrefOnClose = newDecrefOnClose; pr.incRefReaders = incRefReaders; return pr; } else { // No subreader was refreshed return this; } } public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return numDocs; } public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return maxDoc; } public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return hasDeletions; } // check first reader public boolean isDeleted(int n) { // Don't call ensureOpen() here (it could affect performance) if (readers.size() > 0) return ((IndexReader)readers.get(0)).isDeleted(n); return false; } // delete in all readers protected void doDelete(int n) throws CorruptIndexException, IOException { for (int i = 0; i < readers.size(); i++) { ((IndexReader)readers.get(i)).deleteDocument(n); } hasDeletions = true; } // undeleteAll in all readers protected void doUndeleteAll() throws CorruptIndexException, IOException { for (int i = 0; i < readers.size(); i++) { ((IndexReader)readers.get(i)).undeleteAll(); } hasDeletions = false; } // append fields from storedFieldReaders public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); Document result = new Document(); for (int i = 0; i < storedFieldReaders.size(); i++) { IndexReader reader = (IndexReader)storedFieldReaders.get(i); boolean include = (fieldSelector==null); if (!include) { Iterator it = ((Collection) readerToFields.get(reader)).iterator(); while (it.hasNext()) if (fieldSelector.accept((String)it.next())!=FieldSelectorResult.NO_LOAD) { include = true; break; } } if (include) { Iterator fieldIterator = reader.document(n, 
fieldSelector).getFields().iterator(); while (fieldIterator.hasNext()) { result.add((Fieldable)fieldIterator.next()); } } } return result; } // get all vectors public TermFreqVector[] getTermFreqVectors(int n) throws IOException { ensureOpen(); ArrayList results = new ArrayList(); Iterator i = fieldToReader.entrySet().iterator(); while (i.hasNext()) { Map.Entry e = (Map.Entry)i.next(); String field = (String)e.getKey(); IndexReader reader = (IndexReader)e.getValue(); TermFreqVector vector = reader.getTermFreqVector(n, field); if (vector != null) results.add(vector); } return (TermFreqVector[]) results.toArray(new TermFreqVector[results.size()]); } public TermFreqVector getTermFreqVector(int n, String field) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); return reader==null ? null : reader.getTermFreqVector(n, field); } public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); if (reader != null) { reader.getTermFreqVector(docNumber, field, mapper); } } public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); Iterator i = fieldToReader.entrySet().iterator(); while (i.hasNext()) { Map.Entry e = (Map.Entry)i.next(); String field = (String)e.getKey(); IndexReader reader = (IndexReader)e.getValue(); reader.getTermFreqVector(docNumber, field, mapper); } } public boolean hasNorms(String field) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); return reader==null ? false : reader.hasNorms(field); } public byte[] norms(String field) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); return reader==null ? null : reader.norms(field); } public void norms(String field, byte[] result, int offset) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); if (reader!=null) reader.norms(field, result, offset); } protected void doSetNorm(int n, String field, byte value) throws CorruptIndexException, IOException { IndexReader reader = ((IndexReader)fieldToReader.get(field)); if (reader!=null) reader.doSetNorm(n, field, value); } public TermEnum terms() throws IOException { ensureOpen(); return new ParallelTermEnum(); } public TermEnum terms(Term term) throws IOException { ensureOpen(); return new ParallelTermEnum(term); } public int docFreq(Term term) throws IOException { ensureOpen(); IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); return reader==null ? 0 : reader.docFreq(term); } public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return new ParallelTermDocs(term); } public TermDocs termDocs() throws IOException { ensureOpen(); return new ParallelTermDocs(); } public TermPositions termPositions(Term term) throws IOException { ensureOpen(); return new ParallelTermPositions(term); } public TermPositions termPositions() throws IOException { ensureOpen(); return new ParallelTermPositions(); } /** * Checks recursively if all subreaders are up to date. 
*/ public boolean isCurrent() throws CorruptIndexException, IOException { for (int i = 0; i < readers.size(); i++) { if (!((IndexReader)readers.get(i)).isCurrent()) { return false; } } // all subreaders are up to date return true; } /** * Checks recursively if all subindexes are optimized */ public boolean isOptimized() { for (int i = 0; i < readers.size(); i++) { if (!((IndexReader)readers.get(i)).isOptimized()) { return false; } } // all subindexes are optimized return true; } /** Not implemented. * @throws UnsupportedOperationException */ public long getVersion() { throw new UnsupportedOperationException("ParallelReader does not support this method."); } // for testing IndexReader[] getSubReaders() { return (IndexReader[]) readers.toArray(new IndexReader[readers.size()]); } /** @deprecated */ protected void doCommit() throws IOException { doCommit(null); } protected void doCommit(Map commitUserData) throws IOException { for (int i = 0; i < readers.size(); i++) ((IndexReader)readers.get(i)).commit(commitUserData); } protected synchronized void doClose() throws IOException { for (int i = 0; i < readers.size(); i++) { if (((Boolean) decrefOnClose.get(i)).booleanValue()) { ((IndexReader)readers.get(i)).decRef(); } else { ((IndexReader)readers.get(i)).close(); } } FieldCache.DEFAULT.purge(this); } public Collection getFieldNames (IndexReader.FieldOption fieldNames) { ensureOpen(); Set fieldSet = new HashSet(); for (int i = 0; i < readers.size(); i++) { IndexReader reader = ((IndexReader)readers.get(i)); Collection names = reader.getFieldNames(fieldNames); fieldSet.addAll(names); } return fieldSet; } private class ParallelTermEnum extends TermEnum { private String field; private Iterator fieldIterator; private TermEnum termEnum; public ParallelTermEnum() throws IOException { try { field = (String)fieldToReader.firstKey(); } catch(NoSuchElementException e) { // No fields, so keep field == null, termEnum == null return; } if (field != null) termEnum = ((IndexReader)fieldToReader.get(field)).terms(); } public ParallelTermEnum(Term term) throws IOException { field = term.field(); IndexReader reader = ((IndexReader)fieldToReader.get(field)); if (reader!=null) termEnum = reader.terms(term); } public boolean next() throws IOException { if (termEnum==null) return false; // another term in this field? if (termEnum.next() && termEnum.term().field()==field) return true; // yes, keep going termEnum.close(); // close old termEnum // find the next field with terms, if any if (fieldIterator==null) { fieldIterator = fieldToReader.tailMap(field).keySet().iterator(); fieldIterator.next(); // Skip field to get next one } while (fieldIterator.hasNext()) { field = (String) fieldIterator.next(); termEnum = ((IndexReader)fieldToReader.get(field)).terms(new Term(field)); Term term = termEnum.term(); if (term!=null && term.field()==field) return true; else termEnum.close(); } return false; // no more fields } public Term term() { if (termEnum==null) return null; return termEnum.term(); } public int docFreq() { if (termEnum==null) return 0; return termEnum.docFreq(); } public void close() throws IOException { if (termEnum!=null) termEnum.close(); } } // wrap a TermDocs in order to support seek(Term) private class ParallelTermDocs implements TermDocs { protected TermDocs termDocs; public ParallelTermDocs() {} public ParallelTermDocs(Term term) throws IOException { if (term == null) termDocs = readers.isEmpty() ? 
null : ((IndexReader)readers.get(0)).termDocs(null); else seek(term); } public int doc() { return termDocs.doc(); } public int freq() { return termDocs.freq(); } public void seek(Term term) throws IOException { IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); termDocs = reader!=null ? reader.termDocs(term) : null; } public void seek(TermEnum termEnum) throws IOException { seek(termEnum.term()); } public boolean next() throws IOException { if (termDocs==null) return false; return termDocs.next(); } public int read(final int[] docs, final int[] freqs) throws IOException { if (termDocs==null) return 0; return termDocs.read(docs, freqs); } public boolean skipTo(int target) throws IOException { if (termDocs==null) return false; return termDocs.skipTo(target); } public void close() throws IOException { if (termDocs!=null) termDocs.close(); } } private class ParallelTermPositions extends ParallelTermDocs implements TermPositions { public ParallelTermPositions() {} public ParallelTermPositions(Term term) throws IOException { seek(term); } public void seek(Term term) throws IOException { IndexReader reader = ((IndexReader)fieldToReader.get(term.field())); termDocs = reader!=null ? reader.termPositions(term) : null; } public int nextPosition() throws IOException { // It is an error to call this if there is no next position, e.g. if termDocs==null return ((TermPositions)termDocs).nextPosition(); } public int getPayloadLength() { return ((TermPositions)termDocs).getPayloadLength(); } public byte[] getPayload(byte[] data, int offset) throws IOException { return ((TermPositions)termDocs).getPayload(data, offset); } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return ((TermPositions) termDocs).isPayloadAvailable(); } } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentTermPositionVector.java0000644000175000017500000000443311474320230027562 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ class SegmentTermPositionVector extends SegmentTermVector implements TermPositionVector { protected int[][] positions; protected TermVectorOffsetInfo[][] offsets; public static final int[] EMPTY_TERM_POS = new int[0]; public SegmentTermPositionVector(String field, String terms[], int termFreqs[], int[][] positions, TermVectorOffsetInfo[][] offsets) { super(field, terms, termFreqs); this.offsets = offsets; this.positions = positions; } /** * Returns an array of TermVectorOffsetInfo in which the term is found. 
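// Illustrative sketch (assumes `reader` is an open IndexReader, `doc` a document id, and
// "contents" a field indexed with term vectors plus positions/offsets): the vector returned
// by getTermFreqVector can be cast to TermPositionVector to reach the accessors documented here.
//
//   TermFreqVector tfv = reader.getTermFreqVector(doc, "contents");
//   if (tfv instanceof TermPositionVector) {
//     TermPositionVector tpv = (TermPositionVector) tfv;
//     int idx = tpv.indexOf("lucene");                    // index into getTerms(), -1 if absent
//     if (idx != -1) {
//       int[] positions = tpv.getTermPositions(idx);
//       TermVectorOffsetInfo[] offsets = tpv.getOffsets(idx);
//     }
//   }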
* * @param index The position in the array to get the offsets from * @return An array of TermVectorOffsetInfo objects or the empty list * @see org.apache.lucene.analysis.Token */ public TermVectorOffsetInfo[] getOffsets(int index) { TermVectorOffsetInfo[] result = TermVectorOffsetInfo.EMPTY_OFFSET_INFO; if(offsets == null) return null; if (index >=0 && index < offsets.length) { result = offsets[index]; } return result; } /** * Returns an array of positions in which the term is found. * Terms are identified by the index at which its number appears in the * term String array obtained from the indexOf method. */ public int[] getTermPositions(int index) { int[] result = EMPTY_TERM_POS; if(positions == null) return null; if (index >=0 && index < positions.length) { result = positions[index]; } return result; } }lucene-2.9.4/src/java/org/apache/lucene/index/IndexCommit.java0000644000175000017500000001007311474320230024635 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collection; import java.util.Map; import java.io.IOException; import org.apache.lucene.store.Directory; /** *

    Expert: represents a single commit into an index as seen by the * {@link IndexDeletionPolicy} or {@link IndexReader}.

    * *

    Changes to the content of an index are made visible * only after the writer who made that change commits by * writing a new segments file * (segments_N). This point in time, when the * action of writing a new segments file to the directory * is completed, is an index commit.

    * *

    Each index commit point has a unique segments file * associated with it. The segments file associated with a * later index commit point would have a larger N.

    * *

    WARNING: This API is new and experimental and * may suddenly change.
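// Illustrative sketch: inspecting a commit point, e.g. one obtained from
// IndexReader.listCommits(Directory). Assumes `commit` is an IndexCommit instance.
//
//   String segmentsFile = commit.getSegmentsFileName();
//   long version = commit.getVersion();        // same value IndexReader.getVersion() would report
//   long timestamp = commit.getTimestamp();    // last-modified time of segments_N
//   Map userData = commit.getUserData();       // String -> String map passed to IndexWriter.commit(Map)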

    */ public abstract class IndexCommit implements IndexCommitPoint { /** * Get the segments file (segments_N) associated * with this commit point. */ public abstract String getSegmentsFileName(); /** * Returns all index files referenced by this commit point. */ public abstract Collection getFileNames() throws IOException; /** * Returns the {@link Directory} for the index. */ public abstract Directory getDirectory(); /** * Delete this commit point. This only applies when using * the commit point in the context of IndexWriter's * IndexDeletionPolicy. *

    * Upon calling this, the writer is notified that this commit * point should be deleted. *
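// Illustrative sketch of an IndexDeletionPolicy that calls delete() from onCommit(), as the
// paragraph below prescribes; it keeps only the newest commit, roughly what the shipped
// KeepOnlyLastCommitDeletionPolicy does (this outline is an assumption, not that implementation):
//
//   public class KeepNewestOnlyPolicy implements IndexDeletionPolicy {
//     public void onInit(List commits) { onCommit(commits); }
//     public void onCommit(List commits) {
//       for (int i = 0; i < commits.size() - 1; i++) {
//         ((IndexCommit) commits.get(i)).delete();   // commits are passed oldest first
//       }
//     }
//   }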

    * Decision that a commit-point should be deleted is taken by the {@link IndexDeletionPolicy} in effect * and therefore this should only be called by its {@link IndexDeletionPolicy#onInit onInit()} or * {@link IndexDeletionPolicy#onCommit onCommit()} methods. */ public abstract void delete(); public abstract boolean isDeleted(); /** Returns true if this commit is an optimized index. */ public abstract boolean isOptimized(); /** Two IndexCommits are equal if both their Directory and versions are equal. */ public boolean equals(Object other) { if (other instanceof IndexCommit) { IndexCommit otherCommit = (IndexCommit) other; return otherCommit.getDirectory().equals(getDirectory()) && otherCommit.getVersion() == getVersion(); } else return false; } public int hashCode() { return (int) (getDirectory().hashCode() + getVersion()); } /** Returns the version for this IndexCommit. This is the * same value that {@link IndexReader#getVersion} would * return if it were opened on this commit. */ public abstract long getVersion(); /** Returns the generation (the _N in segments_N) for this * IndexCommit */ public abstract long getGeneration(); /** Convenience method that returns the last modified time * of the segments_N file corresponding to this index * commit, equivalent to * getDirectory().fileModified(getSegmentsFileName()). */ public long getTimestamp() throws IOException { return getDirectory().fileModified(getSegmentsFileName()); } /** Returns userData, previously passed to {@link * IndexWriter#commit(Map)} for this commit. Map is * String -> String. */ public abstract Map getUserData() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/SortedTermVectorMapper.java0000644000175000017500000001102011474320230027026 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Copyright 2007 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.*; /** * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s. Collects all term information * into a single, SortedSet. *
    * NOTE: This Mapper ignores all Field information for the Document. This means that if you are using offsets/positions you will not know which Fields they correlate with. *
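// Illustrative usage sketch (assumes `reader` is an open IndexReader and `docNumber` a valid
// document; TermVectorEntryFreqSortedComparator is named here as an assumption about the
// frequency-sorting comparator shipped with Lucene):
//
//   SortedTermVectorMapper mapper =
//       new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
//   reader.getTermFreqVector(docNumber, mapper);
//   SortedSet entries = mapper.getTermVectorEntrySet();   // TermVectorEntry objects, sorted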
    * This is not thread-safe */ public class SortedTermVectorMapper extends TermVectorMapper{ private SortedSet currentSet; private Map termToTVE = new HashMap(); private boolean storeOffsets; private boolean storePositions; /** * Stand-in name for the field in {@link TermVectorEntry}. */ public static final String ALL = "_ALL_"; /** * * @param comparator A Comparator for sorting {@link TermVectorEntry}s */ public SortedTermVectorMapper(Comparator comparator) { this(false, false, comparator); } public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator comparator) { super(ignoringPositions, ignoringOffsets); currentSet = new TreeSet(comparator); } /** * * @param term The term to map * @param frequency The frequency of the term * @param offsets Offset information, may be null * @param positions Position information, may be null */ //We need to combine any previous mentions of the term public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) { TermVectorEntry entry = (TermVectorEntry) termToTVE.get(term); if (entry == null) { entry = new TermVectorEntry(ALL, term, frequency, storeOffsets == true ? offsets : null, storePositions == true ? positions : null); termToTVE.put(term, entry); currentSet.add(entry); } else { entry.setFrequency(entry.getFrequency() + frequency); if (storeOffsets) { TermVectorOffsetInfo [] existingOffsets = entry.getOffsets(); //A few diff. cases here: offsets is null, existing offsets is null, both are null, same for positions if (existingOffsets != null && offsets != null && offsets.length > 0) { //copy over the existing offsets TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[existingOffsets.length + offsets.length]; System.arraycopy(existingOffsets, 0, newOffsets, 0, existingOffsets.length); System.arraycopy(offsets, 0, newOffsets, existingOffsets.length, offsets.length); entry.setOffsets(newOffsets); } else if (existingOffsets == null && offsets != null && offsets.length > 0) { entry.setOffsets(offsets); } //else leave it alone } if (storePositions) { int [] existingPositions = entry.getPositions(); if (existingPositions != null && positions != null && positions.length > 0) { int [] newPositions = new int[existingPositions.length + positions.length]; System.arraycopy(existingPositions, 0, newPositions, 0, existingPositions.length); System.arraycopy(positions, 0, newPositions, existingPositions.length, positions.length); entry.setPositions(newPositions); } else if (existingPositions == null && positions != null && positions.length > 0) { entry.setPositions(positions); } } } } public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) { this.storeOffsets = storeOffsets; this.storePositions = storePositions; } /** * The TermVectorEntrySet. A SortedSet of {@link TermVectorEntry} objects. Sort is by the comparator passed into the constructor. *
    * This set will be empty until after the mapping process takes place. * * @return The SortedSet of {@link TermVectorEntry}. */ public SortedSet getTermVectorEntrySet() { return currentSet; } } lucene-2.9.4/src/java/org/apache/lucene/index/ReadOnlyDirectoryReader.java0000644000175000017500000000340711474320230027145 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import java.io.IOException; import java.util.Map; class ReadOnlyDirectoryReader extends DirectoryReader { ReadOnlyDirectoryReader(Directory directory, SegmentInfos sis, IndexDeletionPolicy deletionPolicy, int termInfosIndexDivisor) throws IOException { super(directory, sis, deletionPolicy, true, termInfosIndexDivisor); } ReadOnlyDirectoryReader(Directory directory, SegmentInfos infos, SegmentReader[] oldReaders, int[] oldStarts, Map oldNormsCache, boolean doClone, int termInfosIndexDivisor) throws IOException { super(directory, infos, oldReaders, oldStarts, oldNormsCache, true, doClone, termInfosIndexDivisor); } ReadOnlyDirectoryReader(IndexWriter writer, SegmentInfos infos, int termInfosIndexDivisor) throws IOException { super(writer, infos, termInfosIndexDivisor); } protected void acquireWriteLock() { ReadOnlySegmentReader.noWrite(); } } lucene-2.9.4/src/java/org/apache/lucene/index/FreqProxTermsWriter.java0000644000175000017500000002321011474320230026370 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.UnicodeUtil; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Map; import java.util.ArrayList; import java.util.List; import java.util.Iterator; final class FreqProxTermsWriter extends TermsHashConsumer { public TermsHashConsumerPerThread addThread(TermsHashPerThread perThread) { return new FreqProxTermsWriterPerThread(perThread); } void createPostings(RawPostingList[] postings, int start, int count) { final int end = start + count; for(int i=start;i 0) allFields.add(perField); } } // Sort by field name Collections.sort(allFields); final int numAllFields = allFields.size(); // TODO: allow Lucene user to customize this consumer: final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos); /* Current writer chain: FormatPostingsFieldsConsumer -> IMPL: FormatPostingsFieldsWriter -> FormatPostingsTermsConsumer -> IMPL: FormatPostingsTermsWriter -> FormatPostingsDocConsumer -> IMPL: FormatPostingsDocWriter -> FormatPostingsPositionsConsumer -> IMPL: FormatPostingsPositionsWriter */ int start = 0; while(start < numAllFields) { final FieldInfo fieldInfo = ((FreqProxTermsWriterPerField) allFields.get(start)).fieldInfo; final String fieldName = fieldInfo.name; int end = start+1; while(end < numAllFields && ((FreqProxTermsWriterPerField) allFields.get(end)).fieldInfo.name.equals(fieldName)) end++; FreqProxTermsWriterPerField[] fields = new FreqProxTermsWriterPerField[end-start]; for(int i=start;i 0) { // Get the next term to merge termStates[0] = mergeStates[0]; int numToMerge = 1; for(int i=1;i 0) { FreqProxFieldMergeState minState = termStates[0]; for(int i=1;i> 1; final int payloadLength; if ((code & 1) != 0) { // This position has a payload payloadLength = prox.readVInt(); if (payloadBuffer == null || payloadBuffer.length < payloadLength) payloadBuffer = new byte[payloadLength]; prox.readBytes(payloadBuffer, 0, payloadLength); } else payloadLength = 0; posConsumer.addPosition(position, payloadBuffer, 0, payloadLength); } //End for posConsumer.finish(); } if (!minState.nextDoc()) { // Remove from termStates int upto = 0; for(int i=0;iNOTE: This API is new and still experimental * (subject to change suddenly in the next release)

    */ public class SegmentReader extends IndexReader implements Cloneable { protected boolean readOnly; private SegmentInfo si; private int readBufferSize; CloseableThreadLocal fieldsReaderLocal = new FieldsReaderLocal(); CloseableThreadLocal termVectorsLocal = new CloseableThreadLocal(); BitVector deletedDocs = null; Ref deletedDocsRef = null; private boolean deletedDocsDirty = false; private boolean normsDirty = false; private int pendingDeleteCount; private boolean rollbackHasChanges = false; private boolean rollbackDeletedDocsDirty = false; private boolean rollbackNormsDirty = false; private SegmentInfo rollbackSegmentInfo; private int rollbackPendingDeleteCount; // optionally used for the .nrm file shared by multiple norms private IndexInput singleNormStream; private Ref singleNormRef; CoreReaders core; // Holds core readers that are shared (unchanged) when // SegmentReader is cloned or reopened static final class CoreReaders { // Counts how many other reader share the core objects // (freqStream, proxStream, tis, etc.) of this reader; // when coreRef drops to 0, these core objects may be // closed. A given instance of SegmentReader may be // closed, even those it shares core objects with other // SegmentReaders: private final Ref ref = new Ref(); final String segment; final FieldInfos fieldInfos; final IndexInput freqStream; final IndexInput proxStream; final TermInfosReader tisNoIndex; final Directory dir; final Directory cfsDir; final int readBufferSize; final int termsIndexDivisor; private final SegmentReader origInstance; TermInfosReader tis; FieldsReader fieldsReaderOrig; TermVectorsReader termVectorsReaderOrig; CompoundFileReader cfsReader; CompoundFileReader storeCFSReader; CoreReaders(SegmentReader origInstance, Directory dir, SegmentInfo si, int readBufferSize, int termsIndexDivisor) throws IOException { segment = si.name; this.readBufferSize = readBufferSize; this.dir = dir; boolean success = false; try { Directory dir0 = dir; if (si.getUseCompoundFile()) { cfsReader = new CompoundFileReader(dir, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize); dir0 = cfsReader; } cfsDir = dir0; fieldInfos = new FieldInfos(cfsDir, segment + "." + IndexFileNames.FIELD_INFOS_EXTENSION); this.termsIndexDivisor = termsIndexDivisor; TermInfosReader reader = new TermInfosReader(cfsDir, segment, fieldInfos, readBufferSize, termsIndexDivisor); if (termsIndexDivisor == -1) { tisNoIndex = reader; } else { tis = reader; tisNoIndex = null; } // make sure that all index files have been read or are kept open // so that if an index update removes them we'll still have them freqStream = cfsDir.openInput(segment + "." + IndexFileNames.FREQ_EXTENSION, readBufferSize); if (fieldInfos.hasProx()) { proxStream = cfsDir.openInput(segment + "." + IndexFileNames.PROX_EXTENSION, readBufferSize); } else { proxStream = null; } success = true; } finally { if (!success) { decRef(); } } // Must assign this at the end -- if we hit an // exception above core, we don't want to attempt to // purge the FieldCache (will hit NPE because core is // not assigned yet). 
this.origInstance = origInstance; } synchronized TermVectorsReader getTermVectorsReaderOrig() { return termVectorsReaderOrig; } synchronized FieldsReader getFieldsReaderOrig() { return fieldsReaderOrig; } synchronized void incRef() { ref.incRef(); } synchronized Directory getCFSReader() { return cfsReader; } synchronized TermInfosReader getTermsReader() { if (tis != null) { return tis; } else { return tisNoIndex; } } synchronized boolean termsIndexIsLoaded() { return tis != null; } // NOTE: only called from IndexWriter when a near // real-time reader is opened, or applyDeletes is run, // sharing a segment that's still being merged. This // method is not fully thread safe, and relies on the // synchronization in IndexWriter synchronized void loadTermsIndex(SegmentInfo si, int termsIndexDivisor) throws IOException { if (tis == null) { Directory dir0; if (si.getUseCompoundFile()) { // In some cases, we were originally opened when CFS // was not used, but then we are asked to open the // terms reader with index, the segment has switched // to CFS if (cfsReader == null) { cfsReader = new CompoundFileReader(dir, segment + "." + IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize); } dir0 = cfsReader; } else { dir0 = dir; } tis = new TermInfosReader(dir0, segment, fieldInfos, readBufferSize, termsIndexDivisor); } } synchronized void decRef() throws IOException { if (ref.decRef() == 0) { // close everything, nothing is shared anymore with other readers if (tis != null) { tis.close(); // null so if an app hangs on to us we still free most ram tis = null; } if (tisNoIndex != null) { tisNoIndex.close(); } if (freqStream != null) { freqStream.close(); } if (proxStream != null) { proxStream.close(); } if (termVectorsReaderOrig != null) { termVectorsReaderOrig.close(); } if (fieldsReaderOrig != null) { fieldsReaderOrig.close(); } if (cfsReader != null) { cfsReader.close(); } if (storeCFSReader != null) { storeCFSReader.close(); } // Force FieldCache to evict our entries at this point if (origInstance != null) { FieldCache.DEFAULT.purge(origInstance); } } } synchronized void openDocStores(SegmentInfo si) throws IOException { assert si.name.equals(segment); if (fieldsReaderOrig == null) { final Directory storeDir; if (si.getDocStoreOffset() != -1) { if (si.getDocStoreIsCompoundFile()) { assert storeCFSReader == null; storeCFSReader = new CompoundFileReader(dir, si.getDocStoreSegment() + "." + IndexFileNames.COMPOUND_FILE_STORE_EXTENSION, readBufferSize); storeDir = storeCFSReader; assert storeDir != null; } else { storeDir = dir; assert storeDir != null; } } else if (si.getUseCompoundFile()) { // In some cases, we were originally opened when CFS // was not used, but then we are asked to open doc // stores after the segment has switched to CFS if (cfsReader == null) { cfsReader = new CompoundFileReader(dir, segment + "." 
+ IndexFileNames.COMPOUND_FILE_EXTENSION, readBufferSize); } storeDir = cfsReader; assert storeDir != null; } else { storeDir = dir; assert storeDir != null; } final String storesSegment; if (si.getDocStoreOffset() != -1) { storesSegment = si.getDocStoreSegment(); } else { storesSegment = segment; } fieldsReaderOrig = new FieldsReader(storeDir, storesSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount); // Verify two sources of "maxDoc" agree: if (si.getDocStoreOffset() == -1 && fieldsReaderOrig.size() != si.docCount) { throw new CorruptIndexException("doc counts differ for segment " + segment + ": fieldsReader shows " + fieldsReaderOrig.size() + " but segmentInfo shows " + si.docCount); } if (fieldInfos.hasVectors()) { // open term vector files only as needed termVectorsReaderOrig = new TermVectorsReader(storeDir, storesSegment, fieldInfos, readBufferSize, si.getDocStoreOffset(), si.docCount); } } } } /** * Sets the initial value */ private class FieldsReaderLocal extends CloseableThreadLocal { protected Object initialValue() { return core.getFieldsReaderOrig().clone(); } } static class Ref { private int refCount = 1; public String toString() { return "refcount: "+refCount; } public synchronized int refCount() { return refCount; } public synchronized int incRef() { assert refCount > 0; refCount++; return refCount; } public synchronized int decRef() { assert refCount > 0; refCount--; return refCount; } } /** * Byte[] referencing is used because a new norm object needs * to be created for each clone, and the byte array is all * that is needed for sharing between cloned readers. The * current norm referencing is for sharing between readers * whereas the byte[] referencing is for copy on write which * is independent of reader references (i.e. incRef, decRef). 
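// Illustrative sketch of the copy-on-write behaviour described above (assumes a non-read-only
// reader over this segment and an indexed field "contents"; setNorm and Similarity.encodeNorm
// are the public entry points and are used here as an assumption):
//
//   IndexReader clone = (IndexReader) reader.clone();
//   clone.setNorm(0, "contents", Similarity.encodeNorm(2.0f));
//   // the clone now holds a private copy of the norm bytes for "contents"; `reader` keeps
//   // seeing the original values because the shared byte[] is copied before the write
//   clone.close();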
*/ final class Norm implements Cloneable { private int refCount = 1; // If this instance is a clone, the originalNorm // references the Norm that has a real open IndexInput: private Norm origNorm; private IndexInput in; private long normSeek; // null until bytes is set private Ref bytesRef; private byte[] bytes; private boolean dirty; private int number; private boolean rollbackDirty; public Norm(IndexInput in, int number, long normSeek) { this.in = in; this.number = number; this.normSeek = normSeek; } public synchronized void incRef() { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); refCount++; } private void closeInput() throws IOException { if (in != null) { if (in != singleNormStream) { // It's private to us -- just close it in.close(); } else { // We are sharing this with others -- decRef and // maybe close the shared norm stream if (singleNormRef.decRef() == 0) { singleNormStream.close(); singleNormStream = null; } } in = null; } } public synchronized void decRef() throws IOException { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); if (--refCount == 0) { if (origNorm != null) { origNorm.decRef(); origNorm = null; } else { closeInput(); } if (bytes != null) { assert bytesRef != null; bytesRef.decRef(); bytes = null; bytesRef = null; } else { assert bytesRef == null; } } } // Load bytes but do not cache them if they were not // already cached public synchronized void bytes(byte[] bytesOut, int offset, int len) throws IOException { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); if (bytes != null) { // Already cached -- copy from cache: assert len <= maxDoc(); System.arraycopy(bytes, 0, bytesOut, offset, len); } else { // Not cached if (origNorm != null) { // Ask origNorm to load origNorm.bytes(bytesOut, offset, len); } else { // We are orig -- read ourselves from disk: synchronized(in) { in.seek(normSeek); in.readBytes(bytesOut, offset, len, false); } } } } // Load & cache full bytes array. Returns bytes. public synchronized byte[] bytes() throws IOException { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); if (bytes == null) { // value not yet read assert bytesRef == null; if (origNorm != null) { // Ask origNorm to load so that for a series of // reopened readers we share a single read-only // byte[] bytes = origNorm.bytes(); bytesRef = origNorm.bytesRef; bytesRef.incRef(); // Once we've loaded the bytes we no longer need // origNorm: origNorm.decRef(); origNorm = null; } else { // We are the origNorm, so load the bytes for real // ourself: final int count = maxDoc(); bytes = new byte[count]; // Since we are orig, in must not be null assert in != null; // Read from disk. synchronized(in) { in.seek(normSeek); in.readBytes(bytes, 0, count, false); } bytesRef = new Ref(); closeInput(); } } return bytes; } // Only for testing Ref bytesRef() { return bytesRef; } // Called if we intend to change a norm value. We make a // private copy of bytes if it's shared with others: public synchronized byte[] copyOnWrite() throws IOException { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); bytes(); assert bytes != null; assert bytesRef != null; if (bytesRef.refCount() > 1) { // I cannot be the origNorm for another norm // instance if I'm being changed. 
Ie, only the // "head Norm" can be changed: assert refCount == 1; final Ref oldRef = bytesRef; bytes = cloneNormBytes(bytes); bytesRef = new Ref(); oldRef.decRef(); } dirty = true; return bytes; } // Returns a copy of this Norm instance that shares // IndexInput & bytes with the original one public synchronized Object clone() { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0); Norm clone; try { clone = (Norm) super.clone(); } catch (CloneNotSupportedException cnse) { // Cannot happen throw new RuntimeException("unexpected CloneNotSupportedException", cnse); } clone.refCount = 1; if (bytes != null) { assert bytesRef != null; assert origNorm == null; // Clone holds a reference to my bytes: clone.bytesRef.incRef(); } else { assert bytesRef == null; if (origNorm == null) { // I become the origNorm for the clone: clone.origNorm = this; } clone.origNorm.incRef(); } // Only the origNorm will actually readBytes from in: clone.in = null; return clone; } // Flush all pending changes to the next generation // separate norms file. public void reWrite(SegmentInfo si) throws IOException { assert refCount > 0 && (origNorm == null || origNorm.refCount > 0): "refCount=" + refCount + " origNorm=" + origNorm; // NOTE: norms are re-written in regular directory, not cfs si.advanceNormGen(this.number); final String normFileName = si.getNormFileName(this.number); IndexOutput out = directory().createOutput(normFileName); boolean success = false; try { try { out.writeBytes(bytes, maxDoc()); } finally { out.close(); } success = true; } finally { if (!success) { try { directory().deleteFile(normFileName); } catch (Throwable t) { // suppress this so we keep throwing the // original exception } } } this.dirty = false; } } Map norms = new HashMap(); /** The class which implements SegmentReader. 
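// Expert/deprecated hook (see the @deprecated LUCENE-1677 note below): the concrete
// SegmentReader implementation can be overridden through a system property, e.g. with a
// hypothetical subclass:
//
//   java -Dorg.apache.lucene.SegmentReader.class=my.pkg.MySegmentReader MyApp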
*/ // @deprecated (LUCENE-1677) private static Class IMPL; static { try { String name = System.getProperty("org.apache.lucene.SegmentReader.class", SegmentReader.class.getName()); IMPL = Class.forName(name); } catch (ClassNotFoundException e) { throw new RuntimeException("cannot load SegmentReader class: " + e, e); } catch (SecurityException se) { try { IMPL = Class.forName(SegmentReader.class.getName()); } catch (ClassNotFoundException e) { throw new RuntimeException("cannot load default SegmentReader class: " + e, e); } } } // @deprecated (LUCENE-1677) private static Class READONLY_IMPL; static { try { String name = System.getProperty("org.apache.lucene.ReadOnlySegmentReader.class", ReadOnlySegmentReader.class.getName()); READONLY_IMPL = Class.forName(name); } catch (ClassNotFoundException e) { throw new RuntimeException("cannot load ReadOnlySegmentReader class: " + e, e); } catch (SecurityException se) { try { READONLY_IMPL = Class.forName(ReadOnlySegmentReader.class.getName()); } catch (ClassNotFoundException e) { throw new RuntimeException("cannot load default ReadOnlySegmentReader class: " + e, e); } } } /** * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated */ public static SegmentReader get(SegmentInfo si) throws CorruptIndexException, IOException { return get(false, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, IndexReader.DEFAULT_TERMS_INDEX_DIVISOR); } /** * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static SegmentReader get(boolean readOnly, SegmentInfo si, int termInfosIndexDivisor) throws CorruptIndexException, IOException { return get(readOnly, si.dir, si, BufferedIndexInput.BUFFER_SIZE, true, termInfosIndexDivisor); } /** * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error * @deprecated */ static SegmentReader get(SegmentInfo si, int readBufferSize, boolean doOpenStores, int termInfosIndexDivisor) throws CorruptIndexException, IOException { return get(false, si.dir, si, readBufferSize, doOpenStores, termInfosIndexDivisor); } /** * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public static SegmentReader get(boolean readOnly, Directory dir, SegmentInfo si, int readBufferSize, boolean doOpenStores, int termInfosIndexDivisor) throws CorruptIndexException, IOException { SegmentReader instance; try { if (readOnly) instance = (SegmentReader)READONLY_IMPL.newInstance(); else instance = (SegmentReader)IMPL.newInstance(); } catch (Exception e) { throw new RuntimeException("cannot load SegmentReader class: " + e, e); } instance.readOnly = readOnly; instance.si = si; instance.readBufferSize = readBufferSize; boolean success = false; try { instance.core = new CoreReaders(instance, dir, si, readBufferSize, termInfosIndexDivisor); if (doOpenStores) { instance.core.openDocStores(si); } instance.loadDeletedDocs(); instance.openNorms(instance.core.cfsDir, readBufferSize); success = true; } finally { // With lock-less commits, it's entirely possible (and // fine) to hit a FileNotFound exception above. In // this case, we want to explicitly close any subset // of things that were opened so that we don't have to // wait for a GC to do so. 
if (!success) { instance.doClose(); } } return instance; } void openDocStores() throws IOException { core.openDocStores(si); } private boolean checkDeletedCounts() throws IOException { final int recomputedCount = deletedDocs.getRecomputedCount(); assert deletedDocs.count() == recomputedCount : "deleted count=" + deletedDocs.count() + " vs recomputed count=" + recomputedCount; assert si.getDelCount() == recomputedCount : "delete count mismatch: info=" + si.getDelCount() + " vs BitVector=" + recomputedCount; // Verify # deletes does not exceed maxDoc for this // segment: assert si.getDelCount() <= maxDoc() : "delete count mismatch: " + recomputedCount + ") exceeds max doc (" + maxDoc() + ") for segment " + si.name; return true; } private void loadDeletedDocs() throws IOException { // NOTE: the bitvector is stored using the regular directory, not cfs if (hasDeletions(si)) { deletedDocs = new BitVector(directory(), si.getDelFileName()); deletedDocsRef = new Ref(); assert checkDeletedCounts(); } else assert si.getDelCount() == 0; } /** * Clones the norm bytes. May be overridden by subclasses. New and experimental. * @param bytes Byte array to clone * @return New BitVector */ protected byte[] cloneNormBytes(byte[] bytes) { byte[] cloneBytes = new byte[bytes.length]; System.arraycopy(bytes, 0, cloneBytes, 0, bytes.length); return cloneBytes; } /** * Clones the deleteDocs BitVector. May be overridden by subclasses. New and experimental. * @param bv BitVector to clone * @return New BitVector */ protected BitVector cloneDeletedDocs(BitVector bv) { return (BitVector)bv.clone(); } public final synchronized Object clone() { try { return clone(readOnly); // Preserve current readOnly } catch (Exception ex) { throw new RuntimeException(ex); } } public final synchronized IndexReader clone(boolean openReadOnly) throws CorruptIndexException, IOException { return reopenSegment(si, true, openReadOnly); } synchronized SegmentReader reopenSegment(SegmentInfo si, boolean doClone, boolean openReadOnly) throws CorruptIndexException, IOException { boolean deletionsUpToDate = (this.si.hasDeletions() == si.hasDeletions()) && (!si.hasDeletions() || this.si.getDelFileName().equals(si.getDelFileName())); boolean normsUpToDate = true; boolean[] fieldNormsChanged = new boolean[core.fieldInfos.size()]; final int fieldCount = core.fieldInfos.size(); for (int i = 0; i < fieldCount; i++) { if (!this.si.getNormFileName(i).equals(si.getNormFileName(i))) { normsUpToDate = false; fieldNormsChanged[i] = true; } } // if we're cloning we need to run through the reopenSegment logic // also if both old and new readers aren't readonly, we clone to avoid sharing modifications if (normsUpToDate && deletionsUpToDate && !doClone && openReadOnly && readOnly) { return this; } // When cloning, the incoming SegmentInfos should not // have any changes in it: assert !doClone || (normsUpToDate && deletionsUpToDate); // clone reader SegmentReader clone; try { if (openReadOnly) clone = (SegmentReader) READONLY_IMPL.newInstance(); else clone = (SegmentReader) IMPL.newInstance(); } catch (Exception e) { throw new RuntimeException("cannot load SegmentReader class: " + e, e); } boolean success = false; try { core.incRef(); clone.core = core; clone.readOnly = openReadOnly; clone.si = si; clone.readBufferSize = readBufferSize; if (!openReadOnly && hasChanges) { // My pending changes transfer to the new reader clone.pendingDeleteCount = pendingDeleteCount; clone.deletedDocsDirty = deletedDocsDirty; clone.normsDirty = normsDirty; clone.hasChanges = 
hasChanges; hasChanges = false; } if (doClone) { if (deletedDocs != null) { deletedDocsRef.incRef(); clone.deletedDocs = deletedDocs; clone.deletedDocsRef = deletedDocsRef; } } else { if (!deletionsUpToDate) { // load deleted docs assert clone.deletedDocs == null; clone.loadDeletedDocs(); } else if (deletedDocs != null) { deletedDocsRef.incRef(); clone.deletedDocs = deletedDocs; clone.deletedDocsRef = deletedDocsRef; } } clone.setDisableFakeNorms(getDisableFakeNorms()); clone.norms = new HashMap(); // Clone norms for (int i = 0; i < fieldNormsChanged.length; i++) { // Clone unchanged norms to the cloned reader if (doClone || !fieldNormsChanged[i]) { final String curField = core.fieldInfos.fieldInfo(i).name; Norm norm = (Norm) this.norms.get(curField); if (norm != null) clone.norms.put(curField, norm.clone()); } } // If we are not cloning, then this will open anew // any norms that have changed: clone.openNorms(si.getUseCompoundFile() ? core.getCFSReader() : directory(), readBufferSize); success = true; } finally { if (!success) { // An exception occurred during reopen, we have to decRef the norms // that we incRef'ed already and close singleNormsStream and FieldsReader clone.decRef(); } } return clone; } /** @deprecated */ protected void doCommit() throws IOException { doCommit(null); } protected void doCommit(Map commitUserData) throws IOException { if (hasChanges) { startCommit(); boolean success = false; try { commitChanges(commitUserData); success = true; } finally { if (!success) { rollbackCommit(); } } } } private void commitChanges(Map commitUserData) throws IOException { if (deletedDocsDirty) { // re-write deleted si.advanceDelGen(); // We can write directly to the actual name (vs to a // .tmp & renaming it) because the file is not live // until segments file is written: final String delFileName = si.getDelFileName(); boolean success = false; try { deletedDocs.write(directory(), delFileName); success = true; } finally { if (!success) { try { directory().deleteFile(delFileName); } catch (Throwable t) { // suppress this so we keep throwing the // original exception } } } si.setDelCount(si.getDelCount()+pendingDeleteCount); pendingDeleteCount = 0; assert deletedDocs.count() == si.getDelCount(): "delete count mismatch during commit: info=" + si.getDelCount() + " vs BitVector=" + deletedDocs.count(); } else { assert pendingDeleteCount == 0; } if (normsDirty) { // re-write norms si.setNumFields(core.fieldInfos.size()); Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); if (norm.dirty) { norm.reWrite(si); } } } deletedDocsDirty = false; normsDirty = false; hasChanges = false; } FieldsReader getFieldsReader() { return (FieldsReader) fieldsReaderLocal.get(); } protected void doClose() throws IOException { termVectorsLocal.close(); fieldsReaderLocal.close(); if (deletedDocs != null) { deletedDocsRef.decRef(); // null so if an app hangs on to us we still free most ram deletedDocs = null; } Iterator it = norms.values().iterator(); while (it.hasNext()) { ((Norm) it.next()).decRef(); } if (core != null) { core.decRef(); } } static boolean hasDeletions(SegmentInfo si) throws IOException { // Don't call ensureOpen() here (it could affect performance) return si.hasDeletions(); } public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return deletedDocs != null; } static boolean usesCompoundFile(SegmentInfo si) throws IOException { return si.getUseCompoundFile(); } static boolean hasSeparateNorms(SegmentInfo si) throws 
IOException { return si.hasSeparateNorms(); } protected void doDelete(int docNum) { if (deletedDocs == null) { deletedDocs = new BitVector(maxDoc()); deletedDocsRef = new Ref(); } // there is more than 1 SegmentReader with a reference to this // deletedDocs BitVector so decRef the current deletedDocsRef, // clone the BitVector, create a new deletedDocsRef if (deletedDocsRef.refCount() > 1) { Ref oldRef = deletedDocsRef; deletedDocs = cloneDeletedDocs(deletedDocs); deletedDocsRef = new Ref(); oldRef.decRef(); } deletedDocsDirty = true; if (!deletedDocs.getAndSet(docNum)) pendingDeleteCount++; } protected void doUndeleteAll() { deletedDocsDirty = false; if (deletedDocs != null) { assert deletedDocsRef != null; deletedDocsRef.decRef(); deletedDocs = null; deletedDocsRef = null; pendingDeleteCount = 0; si.clearDelGen(); si.setDelCount(0); } else { assert deletedDocsRef == null; assert pendingDeleteCount == 0; } } List files() throws IOException { return new ArrayList(si.files()); } public TermEnum terms() { ensureOpen(); return core.getTermsReader().terms(); } public TermEnum terms(Term t) throws IOException { ensureOpen(); return core.getTermsReader().terms(t); } FieldInfos fieldInfos() { return core.fieldInfos; } public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); return getFieldsReader().doc(n, fieldSelector); } public synchronized boolean isDeleted(int n) { return (deletedDocs != null && deletedDocs.get(n)); } public TermDocs termDocs(Term term) throws IOException { if (term == null) { return new AllTermDocs(this); } else { return super.termDocs(term); } } public TermDocs termDocs() throws IOException { ensureOpen(); return new SegmentTermDocs(this); } public TermPositions termPositions() throws IOException { ensureOpen(); return new SegmentTermPositions(this); } public int docFreq(Term t) throws IOException { ensureOpen(); TermInfo ti = core.getTermsReader().get(t); if (ti != null) return ti.docFreq; else return 0; } public int numDocs() { // Don't call ensureOpen() here (it could affect performance) int n = maxDoc(); if (deletedDocs != null) n -= deletedDocs.count(); return n; } public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return si.docCount; } /** * @see IndexReader#getFieldNames(org.apache.lucene.index.IndexReader.FieldOption) */ public Collection getFieldNames(IndexReader.FieldOption fieldOption) { ensureOpen(); Set fieldSet = new HashSet(); for (int i = 0; i < core.fieldInfos.size(); i++) { FieldInfo fi = core.fieldInfos.fieldInfo(i); if (fieldOption == IndexReader.FieldOption.ALL) { fieldSet.add(fi.name); } else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) { fieldSet.add(fi.name); } else if (fi.omitTermFreqAndPositions && fieldOption == IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS) { fieldSet.add(fi.name); } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) { fieldSet.add(fi.name); } else if (fi.isIndexed && fieldOption == IndexReader.FieldOption.INDEXED) { fieldSet.add(fi.name); } else if (fi.isIndexed && fi.storeTermVector == false && fieldOption == IndexReader.FieldOption.INDEXED_NO_TERMVECTOR) { fieldSet.add(fi.name); } else if (fi.storeTermVector == true && fi.storePositionWithTermVector == false && fi.storeOffsetWithTermVector == false && fieldOption == IndexReader.FieldOption.TERMVECTOR) { fieldSet.add(fi.name); } else if (fi.isIndexed && fi.storeTermVector && fieldOption == 
IndexReader.FieldOption.INDEXED_WITH_TERMVECTOR) { fieldSet.add(fi.name); } else if (fi.storePositionWithTermVector && fi.storeOffsetWithTermVector == false && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION) { fieldSet.add(fi.name); } else if (fi.storeOffsetWithTermVector && fi.storePositionWithTermVector == false && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET) { fieldSet.add(fi.name); } else if ((fi.storeOffsetWithTermVector && fi.storePositionWithTermVector) && fieldOption == IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET) { fieldSet.add(fi.name); } } return fieldSet; } public synchronized boolean hasNorms(String field) { ensureOpen(); return norms.containsKey(field); } static byte[] createFakeNorms(int size) { byte[] ones = new byte[size]; Arrays.fill(ones, DefaultSimilarity.encodeNorm(1.0f)); return ones; } private byte[] ones; private byte[] fakeNorms() { assert !getDisableFakeNorms(); if (ones==null) ones=createFakeNorms(maxDoc()); return ones; } // can return null if norms aren't stored protected synchronized byte[] getNorms(String field) throws IOException { Norm norm = (Norm) norms.get(field); if (norm == null) return null; // not indexed, or norms not stored return norm.bytes(); } // returns fake norms if norms aren't available public synchronized byte[] norms(String field) throws IOException { ensureOpen(); byte[] bytes = getNorms(field); if (bytes==null && !getDisableFakeNorms()) bytes=fakeNorms(); return bytes; } protected void doSetNorm(int doc, String field, byte value) throws IOException { Norm norm = (Norm) norms.get(field); if (norm == null) // not an indexed field return; normsDirty = true; norm.copyOnWrite()[doc] = value; // set the value } /** Read norms into a pre-allocated array. */ public synchronized void norms(String field, byte[] bytes, int offset) throws IOException { ensureOpen(); Norm norm = (Norm) norms.get(field); if (norm == null) { Arrays.fill(bytes, offset, bytes.length, DefaultSimilarity.encodeNorm(1.0f)); return; } norm.bytes(bytes, offset, maxDoc()); } private void openNorms(Directory cfsDir, int readBufferSize) throws IOException { long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now) int maxDoc = maxDoc(); for (int i = 0; i < core.fieldInfos.size(); i++) { FieldInfo fi = core.fieldInfos.fieldInfo(i); if (norms.containsKey(fi.name)) { // in case this SegmentReader is being re-opened, we might be able to // reuse some norm instances and skip loading them here continue; } if (fi.isIndexed && !fi.omitNorms) { Directory d = directory(); String fileName = si.getNormFileName(fi.number); if (!si.hasSeparateNorms(fi.number)) { d = cfsDir; } // singleNormFile means multiple norms share this file boolean singleNormFile = fileName.endsWith("." + IndexFileNames.NORMS_EXTENSION); IndexInput normInput = null; long normSeek; if (singleNormFile) { normSeek = nextNormSeek; if (singleNormStream == null) { singleNormStream = d.openInput(fileName, readBufferSize); singleNormRef = new Ref(); } else { singleNormRef.incRef(); } // All norms in the .nrm file can share a single IndexInput since // they are only used in a synchronized context. // If this were to change in the future, a clone could be done here. 
normInput = singleNormStream; } else { normSeek = 0; normInput = d.openInput(fileName); } norms.put(fi.name, new Norm(normInput, fi.number, normSeek)); nextNormSeek += maxDoc; // increment also if some norms are separate } } } boolean termsIndexLoaded() { return core.termsIndexIsLoaded(); } // NOTE: only called from IndexWriter when a near // real-time reader is opened, or applyDeletes is run, // sharing a segment that's still being merged. This // method is not thread safe, and relies on the // synchronization in IndexWriter void loadTermsIndex(int termsIndexDivisor) throws IOException { core.loadTermsIndex(si, termsIndexDivisor); } // for testing only boolean normsClosed() { if (singleNormStream != null) { return false; } Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); if (norm.refCount > 0) { return false; } } return true; } // for testing only boolean normsClosed(String field) { Norm norm = (Norm) norms.get(field); return norm.refCount == 0; } /** * Create a clone from the initial TermVectorsReader and store it in the ThreadLocal. * @return TermVectorsReader */ TermVectorsReader getTermVectorsReader() { TermVectorsReader tvReader = (TermVectorsReader) termVectorsLocal.get(); if (tvReader == null) { TermVectorsReader orig = core.getTermVectorsReaderOrig(); if (orig == null) { return null; } else { try { tvReader = (TermVectorsReader) orig.clone(); } catch (CloneNotSupportedException cnse) { return null; } } termVectorsLocal.set(tvReader); } return tvReader; } TermVectorsReader getTermVectorsReaderOrig() { return core.getTermVectorsReaderOrig(); } /** Return a term frequency vector for the specified document and field. The * vector returned contains term numbers and frequencies for all terms in * the specified field of this document, if the field had storeTermVector * flag set. If the flag was not set, the method returns null. * @throws IOException */ public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException { // Check if this field is invalid or has no stored term vector ensureOpen(); FieldInfo fi = core.fieldInfos.fieldInfo(field); if (fi == null || !fi.storeTermVector) return null; TermVectorsReader termVectorsReader = getTermVectorsReader(); if (termVectorsReader == null) return null; return termVectorsReader.get(docNumber, field); } public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); FieldInfo fi = core.fieldInfos.fieldInfo(field); if (fi == null || !fi.storeTermVector) return; TermVectorsReader termVectorsReader = getTermVectorsReader(); if (termVectorsReader == null) { return; } termVectorsReader.get(docNumber, field, mapper); } public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); TermVectorsReader termVectorsReader = getTermVectorsReader(); if (termVectorsReader == null) return; termVectorsReader.get(docNumber, mapper); } /** Return an array of term frequency vectors for the specified document. * The array contains a vector for each vectorized field in the document. * Each vector vector contains term numbers and frequencies for all terms * in a given vectorized field. * If no such fields existed, the method returns null. 
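 *
 * A minimal usage sketch (editorial illustration, not part of the original
 * source; "reader" and "docID" stand for an open IndexReader and a valid
 * document number):
 *
 *   TermFreqVector[] vectors = reader.getTermFreqVectors(docID);
 *   if (vectors != null) {
 *     for (int i = 0; i < vectors.length; i++)
 *       System.out.println(vectors[i].getField() + ": " + vectors[i].size() + " terms");
 *   }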
* @throws IOException */ public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException { ensureOpen(); TermVectorsReader termVectorsReader = getTermVectorsReader(); if (termVectorsReader == null) return null; return termVectorsReader.get(docNumber); } /** * Return the name of the segment this reader is reading. */ public String getSegmentName() { return core.segment; } /** * Return the SegmentInfo of the segment this reader is reading. */ SegmentInfo getSegmentInfo() { return si; } void setSegmentInfo(SegmentInfo info) { si = info; } void startCommit() { rollbackSegmentInfo = (SegmentInfo) si.clone(); rollbackHasChanges = hasChanges; rollbackDeletedDocsDirty = deletedDocsDirty; rollbackNormsDirty = normsDirty; rollbackPendingDeleteCount = pendingDeleteCount; Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); norm.rollbackDirty = norm.dirty; } } void rollbackCommit() { si.reset(rollbackSegmentInfo); hasChanges = rollbackHasChanges; deletedDocsDirty = rollbackDeletedDocsDirty; normsDirty = rollbackNormsDirty; pendingDeleteCount = rollbackPendingDeleteCount; Iterator it = norms.values().iterator(); while (it.hasNext()) { Norm norm = (Norm) it.next(); norm.dirty = norm.rollbackDirty; } } /** Returns the directory this index resides in. */ public Directory directory() { // Don't ensureOpen here -- in certain cases, when a // cloned/reopened reader needs to commit, it may call // this method on the closed original reader return core.dir; } // This is necessary so that cloned SegmentReaders (which // share the underlying postings data) will map to the // same entry in the FieldCache. See LUCENE-1579. public final Object getFieldCacheKey() { return core.freqStream; } public Object getDeletesCacheKey() { return deletedDocs; } public long getUniqueTermCount() { return core.getTermsReader().size(); } /** * Lotsa tests did hacks like:
    * SegmentReader reader = (SegmentReader) IndexReader.open(dir);
    * They broke. This method serves as a hack to keep hacks working */ static SegmentReader getOnlySegmentReader(Directory dir) throws IOException { return getOnlySegmentReader(IndexReader.open(dir)); } static SegmentReader getOnlySegmentReader(IndexReader reader) { if (reader instanceof SegmentReader) return (SegmentReader) reader; if (reader instanceof DirectoryReader) { IndexReader[] subReaders = reader.getSequentialSubReaders(); if (subReaders.length != 1) throw new IllegalArgumentException(reader + " has " + subReaders.length + " segments instead of exactly one"); return (SegmentReader) subReaders[0]; } throw new IllegalArgumentException(reader + " is not a SegmentReader or a single-segment DirectoryReader"); } public int getTermInfosIndexDivisor() { return core.termsIndexDivisor; } } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsPositionsConsumer.java0000644000175000017500000000242411474320230031021 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.IndexInput; abstract class FormatPostingsPositionsConsumer { /** Add a new position & payload. If payloadLength > 0 * you must read those bytes from the IndexInput. */ abstract void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException; /** Called when we are done adding positions & payloads */ abstract void finish() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/DocFieldProcessorPerField.java0000644000175000017500000000261511474320230027404 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Fieldable; /** * Holds all per thread, per field state. 
*/ final class DocFieldProcessorPerField { final DocFieldConsumerPerField consumer; final FieldInfo fieldInfo; DocFieldProcessorPerField next; int lastGen = -1; int fieldCount; Fieldable[] fields = new Fieldable[1]; public DocFieldProcessorPerField(final DocFieldProcessorPerThread perThread, final FieldInfo fieldInfo) { this.consumer = perThread.consumer.addField(fieldInfo); this.fieldInfo = fieldInfo; } public void abort() { consumer.abort(); } } lucene-2.9.4/src/java/org/apache/lucene/index/RawPostingList.java0000644000175000017500000000276211474320230025354 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** This is the base class for an in-memory posting list, * keyed by a Token. {@link TermsHash} maintains a hash * table holding one instance of this per unique Token. * Consumers of TermsHash ({@link TermsHashConsumer}) must * subclass this class with its own concrete class. * FreqProxTermsWriter.PostingList is a private inner class used * for the freq/prox postings, and * TermVectorsTermsWriter.PostingList is a private inner class * used to hold TermVectors postings. */ abstract class RawPostingList { final static int BYTES_SIZE = DocumentsWriter.OBJECT_HEADER_BYTES + 3*DocumentsWriter.INT_NUM_BYTE; int textStart; int intStart; int byteStart; } lucene-2.9.4/src/java/org/apache/lucene/index/DocFieldConsumerPerField.java0000644000175000017500000000213311474320230027213 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import org.apache.lucene.document.Fieldable; abstract class DocFieldConsumerPerField { /** Processes all occurrences of a single field */ abstract void processFields(Fieldable[] fields, int count) throws IOException; abstract void abort(); } lucene-2.9.4/src/java/org/apache/lucene/index/ByteBlockPool.java0000644000175000017500000001214111474320230025123 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* Class that Posting and PostingVector use to write byte * streams into shared fixed-size byte[] arrays. The idea * is to allocate slices of increasing lengths For * example, the first slice is 5 bytes, the next slice is * 14, etc. We start by writing our bytes into the first * 5 bytes. When we hit the end of the slice, we allocate * the next slice and then write the address of the new * slice into the last 4 bytes of the previous slice (the * "forwarding address"). * * Each slice is filled with 0's initially, and we mark * the end with a non-zero byte. This way the methods * that are writing into the slice don't need to record * its length and instead allocate a new slice once they * hit a non-zero byte. 
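 *
 * A rough usage sketch of the slice scheme (editorial illustration, not part
 * of the original source; "pool" is assumed to be an already-constructed
 * ByteBlockPool):
 *
 *   int upto = pool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); // first 5-byte slice
 *   byte[] block = pool.buffer;                               // current head buffer
 *   // ... write into block until the non-zero level marker is hit ...
 *   upto = pool.allocSlice(block, upto);                      // continue in the next, larger slice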
*/ import java.util.Arrays; import java.util.List; final class ByteBlockPool { abstract static class Allocator { abstract void recycleByteBlocks(byte[][] blocks, int start, int end); abstract void recycleByteBlocks(List blocks); abstract byte[] getByteBlock(boolean trackAllocations); } public byte[][] buffers = new byte[10][]; int bufferUpto = -1; // Which buffer we are upto public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE; // Where we are in head buffer public byte[] buffer; // Current head buffer public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset private final boolean trackAllocations; private final Allocator allocator; public ByteBlockPool(Allocator allocator, boolean trackAllocations) { this.allocator = allocator; this.trackAllocations = trackAllocations; } public void reset() { if (bufferUpto != -1) { // We allocated at least one buffer for(int i=0;i 0) // Recycle all but the first buffer allocator.recycleByteBlocks(buffers, 1, 1+bufferUpto); // Re-use the first buffer bufferUpto = 0; byteUpto = 0; byteOffset = 0; buffer = buffers[0]; } } public void nextBuffer() { if (1+bufferUpto == buffers.length) { byte[][] newBuffers = new byte[(int) (buffers.length*1.5)][]; System.arraycopy(buffers, 0, newBuffers, 0, buffers.length); buffers = newBuffers; } buffer = buffers[1+bufferUpto] = allocator.getByteBlock(trackAllocations); bufferUpto++; byteUpto = 0; byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE; } public int newSlice(final int size) { if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size) nextBuffer(); final int upto = byteUpto; byteUpto += size; buffer[byteUpto-1] = 16; return upto; } // Size of each slice. These arrays should be at most 16 // elements (index is encoded with 4 bits). First array // is just a compact way to encode X+1 with a max. Second // array is the length of each slice, ie first slice is 5 // bytes, next slice is 14 bytes, etc. final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9}; final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200}; final static int FIRST_LEVEL_SIZE = levelSizeArray[0]; public int allocSlice(final byte[] slice, final int upto) { final int level = slice[upto] & 15; final int newLevel = nextLevelArray[level]; final int newSize = levelSizeArray[newLevel]; // Maybe allocate another block if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize) nextBuffer(); final int newUpto = byteUpto; final int offset = newUpto + byteOffset; byteUpto += newSize; // Copy forward the past 3 bytes (which we are about // to overwrite with the forwarding address): buffer[newUpto] = slice[upto-3]; buffer[newUpto+1] = slice[upto-2]; buffer[newUpto+2] = slice[upto-1]; // Write forwarding address at end of last slice: slice[upto-3] = (byte) (offset >>> 24); slice[upto-2] = (byte) (offset >>> 16); slice[upto-1] = (byte) (offset >>> 8); slice[upto] = (byte) offset; // Write new level: buffer[byteUpto-1] = (byte) (16|newLevel); return newUpto+3; } } lucene-2.9.4/src/java/org/apache/lucene/index/FieldInfos.java0000644000175000017500000003403411474320230024442 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.document.Document; import org.apache.lucene.document.Fieldable; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.StringHelper; import java.io.IOException; import java.util.*; /** Access to the Fieldable Info file that describes document fields and whether or * not they are indexed. Each segment has a separate Fieldable Info file. Objects * of this class are thread-safe for multiple readers, but only one thread can * be adding documents at a time, with no other reader or writer threads * accessing this object. */ final class FieldInfos { // Used internally (ie not written to *.fnm files) for pre-2.9 files public static final int FORMAT_PRE = -1; // First used in 2.9; prior to 2.9 there was no format header public static final int FORMAT_START = -2; static final int CURRENT_FORMAT = FORMAT_START; static final byte IS_INDEXED = 0x1; static final byte STORE_TERMVECTOR = 0x2; static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x4; static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x8; static final byte OMIT_NORMS = 0x10; static final byte STORE_PAYLOADS = 0x20; static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40; private final ArrayList byNumber = new ArrayList(); private final HashMap byName = new HashMap(); private int format; FieldInfos() { } /** * Construct a FieldInfos object using the directory and the name of the file * IndexInput * @param d The directory to open the IndexInput from * @param name The name of the file to open the IndexInput from in the Directory * @throws IOException */ FieldInfos(Directory d, String name) throws IOException { IndexInput input = d.openInput(name); try { try { read(input, name); } catch (IOException ioe) { if (format == FORMAT_PRE) { // LUCENE-1623: FORMAT_PRE (before there was a // format) may be 2.3.2 (pre-utf8) or 2.4.x (utf8) // encoding; retry with input set to pre-utf8 input.seek(0); input.setModifiedUTF8StringsMode(); byNumber.clear(); byName.clear(); try { read(input, name); } catch (Throwable t) { // Ignore any new exception & throw original IOE throw ioe; } } else { // The IOException cannot be caused by // LUCENE-1623, so re-throw it throw ioe; } } } finally { input.close(); } } /** * Returns a deep clone of this FieldInfos instance. */ synchronized public Object clone() { FieldInfos fis = new FieldInfos(); final int numField = byNumber.size(); for(int i=0;i= 0) ? 
(FieldInfo) byNumber.get(fieldNumber) : null; } public int size() { return byNumber.size(); } public boolean hasVectors() { boolean hasVectors = false; for (int i = 0; i < size(); i++) { if (fieldInfo(i).storeTermVector) { hasVectors = true; break; } } return hasVectors; } public void write(Directory d, String name) throws IOException { IndexOutput output = d.createOutput(name); try { write(output); } finally { output.close(); } } public void write(IndexOutput output) throws IOException { output.writeVInt(CURRENT_FORMAT); output.writeVInt(size()); for (int i = 0; i < size(); i++) { FieldInfo fi = fieldInfo(i); byte bits = 0x0; if (fi.isIndexed) bits |= IS_INDEXED; if (fi.storeTermVector) bits |= STORE_TERMVECTOR; if (fi.storePositionWithTermVector) bits |= STORE_POSITIONS_WITH_TERMVECTOR; if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; if (fi.omitNorms) bits |= OMIT_NORMS; if (fi.storePayloads) bits |= STORE_PAYLOADS; if (fi.omitTermFreqAndPositions) bits |= OMIT_TERM_FREQ_AND_POSITIONS; output.writeString(fi.name); output.writeByte(bits); } } private void read(IndexInput input, String fileName) throws IOException { int firstInt = input.readVInt(); if (firstInt < 0) { // This is a real format format = firstInt; } else { format = FORMAT_PRE; } if (format != FORMAT_PRE & format != FORMAT_START) { throw new CorruptIndexException("unrecognized format " + format + " in file \"" + fileName + "\""); } int size; if (format == FORMAT_PRE) { size = firstInt; } else { size = input.readVInt(); //read in the size } for (int i = 0; i < size; i++) { String name = StringHelper.intern(input.readString()); byte bits = input.readByte(); boolean isIndexed = (bits & IS_INDEXED) != 0; boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; boolean storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0; addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); } if (input.getFilePointer() != input.length()) { throw new CorruptIndexException("did not read all bytes from file \"" + fileName + "\": read " + input.getFilePointer() + " vs size " + input.length()); } } } lucene-2.9.4/src/java/org/apache/lucene/index/TermVectorsTermsWriter.java0000644000175000017500000002356111474320230027110 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.Map; final class TermVectorsTermsWriter extends TermsHashConsumer { final DocumentsWriter docWriter; TermVectorsWriter termVectorsWriter; PerDoc[] docFreeList = new PerDoc[1]; int freeCount; IndexOutput tvx; IndexOutput tvd; IndexOutput tvf; int lastDocID; public TermVectorsTermsWriter(DocumentsWriter docWriter) { this.docWriter = docWriter; } public TermsHashConsumerPerThread addThread(TermsHashPerThread termsHashPerThread) { return new TermVectorsTermsWriterPerThread(termsHashPerThread, this); } void createPostings(RawPostingList[] postings, int start, int count) { final int end = start + count; for(int i=start;i 0) // In case there are some final documents that we // didn't see (because they hit a non-aborting exception): fill(state.numDocsInStore - docWriter.getDocStoreOffset()); tvx.flush(); tvd.flush(); tvf.flush(); } Iterator it = threadsAndFields.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); Iterator it2 = ((Collection) entry.getValue()).iterator(); while(it2.hasNext()) { TermVectorsTermsWriterPerField perField = (TermVectorsTermsWriterPerField) it2.next(); perField.termsHashPerField.reset(); perField.shrinkHash(); } TermVectorsTermsWriterPerThread perThread = (TermVectorsTermsWriterPerThread) entry.getKey(); perThread.termsHashPerThread.reset(true); } } synchronized void closeDocStore(final SegmentWriteState state) throws IOException { if (tvx != null) { // At least one doc in this run had term vectors // enabled fill(state.numDocsInStore - docWriter.getDocStoreOffset()); tvx.close(); tvf.close(); tvd.close(); tvx = null; assert state.docStoreSegmentName != null; final String fileName = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION; if (4+((long) state.numDocsInStore)*16 != state.directory.fileLength(fileName)) throw new RuntimeException("after flush: tvx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fileName) + " length in bytes of " + fileName + " file exists?=" + state.directory.fileExists(fileName)); state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); lastDocID = 0; } } int allocCount; synchronized PerDoc getPerDoc() { if (freeCount == 0) { allocCount++; if (allocCount > docFreeList.length) { // Grow our free list up front to make sure we have // enough space to recycle all outstanding PerDoc // instances assert allocCount == 1+docFreeList.length; docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)]; } return new PerDoc(); } else return docFreeList[--freeCount]; } /** Fills in no-term-vectors for all docs we haven't seen * since the last doc that had term vectors. 
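 *
 * Editorial note (not part of the original source): for example, if lastDocID
 * is 3 and the incoming doc maps to 5, two placeholder entries are appended,
 * each consisting of a pair of file pointers in tvx and a zero field count in
 * tvd:
 *
 *   tvx: writeLong(tvd pointer), writeLong(tvf pointer)   // one pair per skipped doc
 *   tvd: writeVInt(0)                                      // zero vectored fields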
*/ void fill(int docID) throws IOException { final int docStoreOffset = docWriter.getDocStoreOffset(); final int end = docID+docStoreOffset; if (lastDocID < end) { final long tvfPosition = tvf.getFilePointer(); while(lastDocID < end) { tvx.writeLong(tvd.getFilePointer()); tvd.writeVInt(0); tvx.writeLong(tvfPosition); lastDocID++; } } } synchronized void initTermVectorsWriter() throws IOException { if (tvx == null) { final String docStoreSegment = docWriter.getDocStoreSegment(); if (docStoreSegment == null) return; assert docStoreSegment != null; // If we hit an exception while init'ing the term // vector output files, we must abort this segment // because those files will be in an unknown // state: tvx = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); tvd = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); tvf = docWriter.directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); tvx.writeInt(TermVectorsReader.FORMAT_CURRENT); tvd.writeInt(TermVectorsReader.FORMAT_CURRENT); tvf.writeInt(TermVectorsReader.FORMAT_CURRENT); docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION); docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION); docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION); lastDocID = 0; } } synchronized void finishDocument(PerDoc perDoc) throws IOException { assert docWriter.writer.testPoint("TermVectorsTermsWriter.finishDocument start"); initTermVectorsWriter(); fill(perDoc.docID); // Append term vectors to the real outputs: tvx.writeLong(tvd.getFilePointer()); tvx.writeLong(tvf.getFilePointer()); tvd.writeVInt(perDoc.numVectorFields); if (perDoc.numVectorFields > 0) { for(int i=0;i 0 && tpVector.getTermPositions(0) != null; storeOffsets = tpVector.size() > 0 && tpVector.getOffsets(0) != null; bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : 0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : 0)); } else { tpVector = null; bits = 0; storePositions = false; storeOffsets = false; } tvf.writeVInt(bits); final String[] terms = vectors[i].getTerms(); final int[] freqs = vectors[i].getTermFrequencies(); int utf8Upto = 0; utf8Results[1].length = 0; for (int j=0; j 1) { long lastFieldPointer = fieldPointers[0]; for (int i=1; iFilterIndexReader contains another IndexReader, which it * uses as its basic source of data, possibly transforming the data along the * way or providing additional functionality. The class * FilterIndexReader itself simply implements all abstract methods * of IndexReader with versions that pass all requests to the * contained index reader. Subclasses of FilterIndexReader may * further override some of these methods and may also provide additional * methods and fields. */ public class FilterIndexReader extends IndexReader { /** Base class for filtering {@link TermDocs} implementations. 
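 *
 * A minimal subclassing sketch (editorial illustration, not part of the
 * original source; the class name and the idea of hiding one document are
 * invented for the example):
 *
 *   class SkipOneDocTermDocs extends FilterIndexReader.FilterTermDocs {
 *     private final int skipDoc;
 *     SkipOneDocTermDocs(TermDocs in, int skipDoc) { super(in); this.skipDoc = skipDoc; }
 *     public boolean next() throws IOException {
 *       while (in.next()) { if (in.doc() != skipDoc) return true; }
 *       return false;
 *     }
 *   }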
*/ public static class FilterTermDocs implements TermDocs { protected TermDocs in; public FilterTermDocs(TermDocs in) { this.in = in; } public void seek(Term term) throws IOException { in.seek(term); } public void seek(TermEnum termEnum) throws IOException { in.seek(termEnum); } public int doc() { return in.doc(); } public int freq() { return in.freq(); } public boolean next() throws IOException { return in.next(); } public int read(int[] docs, int[] freqs) throws IOException { return in.read(docs, freqs); } public boolean skipTo(int i) throws IOException { return in.skipTo(i); } public void close() throws IOException { in.close(); } } /** Base class for filtering {@link TermPositions} implementations. */ public static class FilterTermPositions extends FilterTermDocs implements TermPositions { public FilterTermPositions(TermPositions in) { super(in); } public int nextPosition() throws IOException { return ((TermPositions) this.in).nextPosition(); } public int getPayloadLength() { return ((TermPositions) this.in).getPayloadLength(); } public byte[] getPayload(byte[] data, int offset) throws IOException { return ((TermPositions) this.in).getPayload(data, offset); } // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return ((TermPositions)this.in).isPayloadAvailable(); } } /** Base class for filtering {@link TermEnum} implementations. */ public static class FilterTermEnum extends TermEnum { protected TermEnum in; public FilterTermEnum(TermEnum in) { this.in = in; } public boolean next() throws IOException { return in.next(); } public Term term() { return in.term(); } public int docFreq() { return in.docFreq(); } public void close() throws IOException { in.close(); } } protected IndexReader in; /** *

* Construct a FilterIndexReader based on the specified base reader. * Directory locking for delete, undeleteAll, and setNorm operations is * left to the base reader. * Note that base reader is closed if this FilterIndexReader is closed.

    * @param in specified base reader. */ public FilterIndexReader(IndexReader in) { super(); this.in = in; } public Directory directory() { return in.directory(); } public TermFreqVector[] getTermFreqVectors(int docNumber) throws IOException { ensureOpen(); return in.getTermFreqVectors(docNumber); } public TermFreqVector getTermFreqVector(int docNumber, String field) throws IOException { ensureOpen(); return in.getTermFreqVector(docNumber, field); } public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException { ensureOpen(); in.getTermFreqVector(docNumber, field, mapper); } public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException { ensureOpen(); in.getTermFreqVector(docNumber, mapper); } public int numDocs() { // Don't call ensureOpen() here (it could affect performance) return in.numDocs(); } public int maxDoc() { // Don't call ensureOpen() here (it could affect performance) return in.maxDoc(); } public Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { ensureOpen(); return in.document(n, fieldSelector); } public boolean isDeleted(int n) { // Don't call ensureOpen() here (it could affect performance) return in.isDeleted(n); } public boolean hasDeletions() { // Don't call ensureOpen() here (it could affect performance) return in.hasDeletions(); } protected void doUndeleteAll() throws CorruptIndexException, IOException {in.undeleteAll();} public boolean hasNorms(String field) throws IOException { ensureOpen(); return in.hasNorms(field); } public byte[] norms(String f) throws IOException { ensureOpen(); return in.norms(f); } public void norms(String f, byte[] bytes, int offset) throws IOException { ensureOpen(); in.norms(f, bytes, offset); } protected void doSetNorm(int d, String f, byte b) throws CorruptIndexException, IOException { in.setNorm(d, f, b); } public TermEnum terms() throws IOException { ensureOpen(); return in.terms(); } public TermEnum terms(Term t) throws IOException { ensureOpen(); return in.terms(t); } public int docFreq(Term t) throws IOException { ensureOpen(); return in.docFreq(t); } public TermDocs termDocs() throws IOException { ensureOpen(); return in.termDocs(); } public TermDocs termDocs(Term term) throws IOException { ensureOpen(); return in.termDocs(term); } public TermPositions termPositions() throws IOException { ensureOpen(); return in.termPositions(); } protected void doDelete(int n) throws CorruptIndexException, IOException { in.deleteDocument(n); } /** @deprecated */ protected void doCommit() throws IOException { doCommit(null); } protected void doCommit(Map commitUserData) throws IOException { in.commit(commitUserData); } protected void doClose() throws IOException { in.close(); // NOTE: only needed in case someone had asked for // FieldCache for top-level reader (which is generally // not a good idea): FieldCache.DEFAULT.purge(this); } public Collection getFieldNames(IndexReader.FieldOption fieldNames) { ensureOpen(); return in.getFieldNames(fieldNames); } public long getVersion() { ensureOpen(); return in.getVersion(); } public boolean isCurrent() throws CorruptIndexException, IOException { ensureOpen(); return in.isCurrent(); } public boolean isOptimized() { ensureOpen(); return in.isOptimized(); } public IndexReader[] getSequentialSubReaders() { return in.getSequentialSubReaders(); } /** If the subclass of FilteredIndexReader modifies the * contents of the FieldCache, you must override this * method to provide a different key */ 
public Object getFieldCacheKey() { return in.getFieldCacheKey(); } /** If the subclass of FilteredIndexReader modifies the * deleted docs, you must override this method to provide * a different key */ public Object getDeletesCacheKey() { return in.getDeletesCacheKey(); } } lucene-2.9.4/src/java/org/apache/lucene/index/SegmentWriteState.java0000644000175000017500000000334211474320230026034 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashSet; import java.util.Collection; import org.apache.lucene.store.Directory; class SegmentWriteState { DocumentsWriter docWriter; Directory directory; String segmentName; String docStoreSegmentName; int numDocs; int termIndexInterval; int numDocsInStore; Collection flushedFiles; public SegmentWriteState(DocumentsWriter docWriter, Directory directory, String segmentName, String docStoreSegmentName, int numDocs, int numDocsInStore, int termIndexInterval) { this.docWriter = docWriter; this.directory = directory; this.segmentName = segmentName; this.docStoreSegmentName = docStoreSegmentName; this.numDocs = numDocs; this.numDocsInStore = numDocsInStore; this.termIndexInterval = termIndexInterval; flushedFiles = new HashSet(); } public String segmentFileName(String ext) { return segmentName + "." + ext; } } lucene-2.9.4/src/java/org/apache/lucene/index/BufferedDeletes.java0000644000175000017500000001174111474320230025450 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import java.util.ArrayList; import java.util.List; import java.util.Iterator; import java.util.Map.Entry; /** Holds buffered deletes, by docID, term or query. We * hold two instances of this class: one for the deletes * prior to the last flush, the other for deletes after * the last flush. This is so if we need to abort * (discard all buffered docs) we can also discard the * buffered deletes yet keep the deletes done during * previously flushed segments. 
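 *
 * Editorial sketch (not part of the original source; the names deletesInRAM
 * and deletesFlushed follow the two-instance pattern described above and are
 * assumptions, as is docIDUpto):
 *
 *   deletesInRAM.queries.put(query, new Integer(docIDUpto)); // buffered since the last flush
 *   deletesFlushed.update(deletesInRAM);                     // on flush: fold in and clear the RAM set
 *   deletesInRAM.clear();                                    // on abort: discard only un-flushed deletes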
*/ class BufferedDeletes { int numTerms; Map terms; Map queries = new HashMap(); List docIDs = new ArrayList(); long bytesUsed; private final boolean doTermSort; public BufferedDeletes(boolean doTermSort) { this.doTermSort = doTermSort; if (doTermSort) { terms = new TreeMap(); } else { terms = new HashMap(); } } // Number of documents a delete term applies to. final static class Num { private int num; Num(int num) { this.num = num; } int getNum() { return num; } void setNum(int num) { // Only record the new number if it's greater than the // current one. This is important because if multiple // threads are replacing the same doc at nearly the // same time, it's possible that one thread that got a // higher docID is scheduled before the other // threads. if (num > this.num) this.num = num; } } int size() { // We use numTerms not terms.size() intentionally, so // that deletes by the same term multiple times "count", // ie if you ask to flush every 1000 deletes then even // dup'd terms are counted towards that 1000 return numTerms + queries.size() + docIDs.size(); } void update(BufferedDeletes in) { numTerms += in.numTerms; bytesUsed += in.bytesUsed; terms.putAll(in.terms); queries.putAll(in.queries); docIDs.addAll(in.docIDs); in.clear(); } void clear() { terms.clear(); queries.clear(); docIDs.clear(); numTerms = 0; bytesUsed = 0; } void addBytesUsed(long b) { bytesUsed += b; } boolean any() { return terms.size() > 0 || docIDs.size() > 0 || queries.size() > 0; } // Remaps all buffered deletes based on a completed // merge synchronized void remap(MergeDocIDRemapper mapper, SegmentInfos infos, int[][] docMaps, int[] delCounts, MergePolicy.OneMerge merge, int mergeDocCount) { final Map newDeleteTerms; // Remap delete-by-term if (terms.size() > 0) { if (doTermSort) { newDeleteTerms = new TreeMap(); } else { newDeleteTerms = new HashMap(); } Iterator iter = terms.entrySet().iterator(); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Num num = (Num) entry.getValue(); newDeleteTerms.put(entry.getKey(), new Num(mapper.remap(num.getNum()))); } } else newDeleteTerms = null; // Remap delete-by-docID final List newDeleteDocIDs; if (docIDs.size() > 0) { newDeleteDocIDs = new ArrayList(docIDs.size()); Iterator iter = docIDs.iterator(); while(iter.hasNext()) { Integer num = (Integer) iter.next(); newDeleteDocIDs.add(new Integer(mapper.remap(num.intValue()))); } } else newDeleteDocIDs = null; // Remap delete-by-query final HashMap newDeleteQueries; if (queries.size() > 0) { newDeleteQueries = new HashMap(queries.size()); Iterator iter = queries.entrySet().iterator(); while(iter.hasNext()) { Entry entry = (Entry) iter.next(); Integer num = (Integer) entry.getValue(); newDeleteQueries.put(entry.getKey(), new Integer(mapper.remap(num.intValue()))); } } else newDeleteQueries = null; if (newDeleteTerms != null) terms = newDeleteTerms; if (newDeleteDocIDs != null) docIDs = newDeleteDocIDs; if (newDeleteQueries != null) queries = newDeleteQueries; } }lucene-2.9.4/src/java/org/apache/lucene/index/SegmentInfos.java0000644000175000017500000007364711474320230025036 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.ChecksumIndexOutput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.NoSuchDirectoryException; import java.io.FileNotFoundException; import java.io.IOException; import java.io.PrintStream; import java.util.Vector; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.HashMap; import java.util.Map; /** * A collection of segmentInfo objects with methods for operating on * those segments in relation to the file system. * *

* NOTE: This API is new and still experimental * (subject to change suddenly in the next release)

    */ public final class SegmentInfos extends Vector { /** The file format version, a negative number. */ /* Works since counter, the old 1st entry, is always >= 0 */ public static final int FORMAT = -1; /** This format adds details used for lockless commits. It differs * slightly from the previous format in that file names * are never re-used (write once). Instead, each file is * written to the next generation. For example, * segments_1, segments_2, etc. This allows us to not use * a commit lock. See file * formats for details. */ public static final int FORMAT_LOCKLESS = -2; /** This format adds a "hasSingleNormFile" flag into each segment info. * See LUCENE-756 * for details. */ public static final int FORMAT_SINGLE_NORM_FILE = -3; /** This format allows multiple segments to share a single * vectors and stored fields file. */ public static final int FORMAT_SHARED_DOC_STORE = -4; /** This format adds a checksum at the end of the file to * ensure all bytes were successfully written. */ public static final int FORMAT_CHECKSUM = -5; /** This format adds the deletion count for each segment. * This way IndexWriter can efficiently report numDocs(). */ public static final int FORMAT_DEL_COUNT = -6; /** This format adds the boolean hasProx to record if any * fields in the segment store prox information (ie, have * omitTermFreqAndPositions==false) */ public static final int FORMAT_HAS_PROX = -7; /** This format adds optional commit userData (String) storage. */ public static final int FORMAT_USER_DATA = -8; /** This format adds optional per-segment String * diagnostics storage, and switches userData to Map */ public static final int FORMAT_DIAGNOSTICS = -9; /* This must always point to the most recent file format. */ static final int CURRENT_FORMAT = FORMAT_DIAGNOSTICS; public int counter = 0; // used to name new segments /** * counts how often the index has been changed by adding or deleting docs. * starting with the current time in milliseconds forces to create unique version numbers. */ private long version = System.currentTimeMillis(); private long generation = 0; // generation of the "segments_N" for the next commit private long lastGeneration = 0; // generation of the "segments_N" file we last successfully read // or wrote; this is normally the same as generation except if // there was an IOException that had interrupted a commit private Map userData = Collections.EMPTY_MAP; // Opaque Map that user can specify during IndexWriter.commit /** * If non-null, information about loading segments_N files * will be printed here. @see #setInfoStream. */ private static PrintStream infoStream; public final SegmentInfo info(int i) { return (SegmentInfo) get(i); } /** * Get the generation (N) of the current segments_N file * from a list of files. * * @param files -- array of file names to check */ public static long getCurrentSegmentGeneration(String[] files) { if (files == null) { return -1; } long max = -1; for (int i = 0; i < files.length; i++) { String file = files[i]; if (file.startsWith(IndexFileNames.SEGMENTS) && !file.equals(IndexFileNames.SEGMENTS_GEN)) { long gen = generationFromSegmentsFileName(file); if (gen > max) { max = gen; } } } return max; } /** * Get the generation (N) of the current segments_N file * in the directory. 
* * @param directory -- directory to search for the latest segments_N file */ public static long getCurrentSegmentGeneration(Directory directory) throws IOException { try { return getCurrentSegmentGeneration(directory.listAll()); } catch (NoSuchDirectoryException nsde) { return -1; } } /** * Get the filename of the current segments_N file * from a list of files. * * @param files -- array of file names to check */ public static String getCurrentSegmentFileName(String[] files) throws IOException { return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", getCurrentSegmentGeneration(files)); } /** * Get the filename of the current segments_N file * in the directory. * * @param directory -- directory to search for the latest segments_N file */ public static String getCurrentSegmentFileName(Directory directory) throws IOException { return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", getCurrentSegmentGeneration(directory)); } /** * Get the segments_N filename in use by this segment infos. */ public String getCurrentSegmentFileName() { return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", lastGeneration); } /** * Parse the generation off the segments file name and * return it. */ public static long generationFromSegmentsFileName(String fileName) { if (fileName.equals(IndexFileNames.SEGMENTS)) { return 0; } else if (fileName.startsWith(IndexFileNames.SEGMENTS)) { return Long.parseLong(fileName.substring(1+IndexFileNames.SEGMENTS.length()), Character.MAX_RADIX); } else { throw new IllegalArgumentException("fileName \"" + fileName + "\" is not a segments file"); } } /** * Get the next segments_N filename that will be written. */ public String getNextSegmentFileName() { long nextGeneration; if (generation == -1) { nextGeneration = 1; } else { nextGeneration = generation+1; } return IndexFileNames.fileNameFromGeneration(IndexFileNames.SEGMENTS, "", nextGeneration); } /** * Read a particular segmentFileName. Note that this may * throw an IOException if a commit is in process. 
* * @param directory -- directory containing the segments file * @param segmentFileName -- segment file to load * @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public final void read(Directory directory, String segmentFileName) throws CorruptIndexException, IOException { boolean success = false; // Clear any previous segments: clear(); ChecksumIndexInput input = new ChecksumIndexInput(directory.openInput(segmentFileName)); generation = generationFromSegmentsFileName(segmentFileName); lastGeneration = generation; try { int format = input.readInt(); if(format < 0){ // file contains explicit format info // check that it is a format we can understand if (format < CURRENT_FORMAT) throw new CorruptIndexException("Unknown format version: " + format); version = input.readLong(); // read version counter = input.readInt(); // read counter } else{ // file is in old format without explicit format info counter = format; } for (int i = input.readInt(); i > 0; i--) { // read segmentInfos add(new SegmentInfo(directory, format, input)); } if(format >= 0){ // in old format the version number may be at the end of the file if (input.getFilePointer() >= input.length()) version = System.currentTimeMillis(); // old file format without version number else version = input.readLong(); // read version } if (format <= FORMAT_USER_DATA) { if (format <= FORMAT_DIAGNOSTICS) { userData = input.readStringStringMap(); } else if (0 != input.readByte()) { userData = Collections.singletonMap("userData", input.readString()); } else { userData = Collections.EMPTY_MAP; } } else { userData = Collections.EMPTY_MAP; } if (format <= FORMAT_CHECKSUM) { final long checksumNow = input.getChecksum(); final long checksumThen = input.readLong(); if (checksumNow != checksumThen) throw new CorruptIndexException("checksum mismatch in segments file"); } success = true; } finally { input.close(); if (!success) { // Clear any segment infos we had loaded so we // have a clean slate on retry: clear(); } } } /** * This version of read uses the retry logic (for lock-less * commits) to find the right segments file to load. 
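 *
 * A minimal calling sketch (editorial illustration, not part of the original
 * source; the index path is hypothetical):
 *
 *   SegmentInfos infos = new SegmentInfos();
 *   infos.read(FSDirectory.open(new File("/path/to/index")));
 *   System.out.println(infos.size() + " segments, version " + infos.getVersion());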
* @throws CorruptIndexException if the index is corrupt * @throws IOException if there is a low-level IO error */ public final void read(Directory directory) throws CorruptIndexException, IOException { generation = lastGeneration = -1; new FindSegmentsFile(directory) { protected Object doBody(String segmentFileName) throws CorruptIndexException, IOException { read(directory, segmentFileName); return null; } }.run(); } // Only non-null after prepareCommit has been called and // before finishCommit is called ChecksumIndexOutput pendingSegnOutput; private final void write(Directory directory) throws IOException { String segmentFileName = getNextSegmentFileName(); // Always advance the generation on write: if (generation == -1) { generation = 1; } else { generation++; } ChecksumIndexOutput segnOutput = new ChecksumIndexOutput(directory.createOutput(segmentFileName)); boolean success = false; try { segnOutput.writeInt(CURRENT_FORMAT); // write FORMAT segnOutput.writeLong(++version); // every write changes // the index segnOutput.writeInt(counter); // write counter segnOutput.writeInt(size()); // write infos for (int i = 0; i < size(); i++) { info(i).write(segnOutput); } segnOutput.writeStringStringMap(userData); segnOutput.prepareCommit(); success = true; pendingSegnOutput = segnOutput; } finally { if (!success) { // We hit an exception above; try to close the file // but suppress any exception: try { segnOutput.close(); } catch (Throwable t) { // Suppress so we keep throwing the original exception } try { // Try not to leave a truncated segments_N file in // the index: directory.deleteFile(segmentFileName); } catch (Throwable t) { // Suppress so we keep throwing the original exception } } } } /** * Returns a copy of this instance, also copying each * SegmentInfo. */ public Object clone() { SegmentInfos sis = (SegmentInfos) super.clone(); for(int i=0;i 0) { buffer.append(' '); } final SegmentInfo info = info(i); buffer.append(info.segString(directory)); if (info.dir != directory) buffer.append("**"); } return buffer.toString(); } public Map getUserData() { return userData; } void setUserData(Map data) { if (data == null) { userData = Collections.EMPTY_MAP; } else { userData = data; } } /** Replaces all segments in this instance, but keeps * generation, version, counter so that future commits * remain write once. */ void replace(SegmentInfos other) { clear(); addAll(other); lastGeneration = other.lastGeneration; } // Used only for testing public boolean hasExternalSegments(Directory dir) { final int numSegments = size(); for(int i=0;iWARNING: This API is new and experimental, and may suddenly * change.
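 * <p>Illustrative sketch (an assumption for the example, not behaviour defined by
 * this class): a Similarity subclass can consume this state when computing the
 * field norm, for instance discounting tokens that share a position:
 * <pre>
 *   public float computeNorm(String field, FieldInvertState state) {
 *     int numTerms = state.getLength() - state.getNumOverlap();   // ignore overlapping tokens
 *     return state.getBoost() * lengthNorm(field, numTerms);
 *   }
 * </pre>
 * </p>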

    */ public final class FieldInvertState { int position; int length; int numOverlap; int offset; float boost; AttributeSource attributeSource; public FieldInvertState() { } public FieldInvertState(int position, int length, int numOverlap, int offset, float boost) { this.position = position; this.length = length; this.numOverlap = numOverlap; this.offset = offset; this.boost = boost; } /** * Re-initialize the state, using this boost value. * @param docBoost boost value to use. */ void reset(float docBoost) { position = 0; length = 0; numOverlap = 0; offset = 0; boost = docBoost; attributeSource = null; } /** * Get the last processed term position. * @return the position */ public int getPosition() { return position; } /** * Get total number of terms in this field. * @return the length */ public int getLength() { return length; } /** * Get the number of terms with positionIncrement == 0. * @return the numOverlap */ public int getNumOverlap() { return numOverlap; } /** * Get end offset of the last processed term. * @return the offset */ public int getOffset() { return offset; } /** * Get boost value. This is the cumulative product of * document boost and field boost for all field instances * sharing the same field name. * @return the boost */ public float getBoost() { return boost; } public AttributeSource getAttributeSource() { return attributeSource; } } lucene-2.9.4/src/java/org/apache/lucene/index/TermPositions.java0000644000175000017500000000577711474320230025253 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** * TermPositions provides an interface for enumerating the <document, * frequency, <position>* > tuples for a term.

    The document and * frequency are the same as for a TermDocs. The positions portion lists the ordinal * positions of each occurrence of a term in a document. * * @see IndexReader#termPositions() */ public interface TermPositions extends TermDocs { /** Returns next position in the current document. It is an error to call this more than {@link #freq()} times without calling {@link #next()}
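 * <p>Illustrative usage sketch (field and term text are assumptions for the
 * example; <code>reader</code> is an open IndexReader):
 * <pre>
 *   TermPositions tp = reader.termPositions(new Term("contents", "lucene"));
 *   while (tp.next()) {                     // one iteration per matching document
 *     for (int i = 0; i < tp.freq(); i++) {
 *       int position = tp.nextPosition();   // ordinal position of the i-th occurrence
 *     }
 *   }
 *   tp.close();
 * </pre></p>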

    This is invalid until {@link #next()} is called for the first time. */ int nextPosition() throws IOException; /** * Returns the length of the payload at the current term position. * This is invalid until {@link #nextPosition()} is called for * the first time.
    * @return length of the current payload in number of bytes */ int getPayloadLength(); /** * Returns the payload data at the current term position. * This is invalid until {@link #nextPosition()} is called for * the first time. * This method must not be called more than once after each call * of {@link #nextPosition()}. However, payloads are loaded lazily, * so if the payload data for the current position is not needed, * this method may not be called at all for performance reasons.
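 * <p>Illustrative sketch (assumed to run inside a position loop, after
 * {@link #nextPosition()} has been called):
 * <pre>
 *   if (tp.isPayloadAvailable()) {
 *     byte[] payload = tp.getPayload(new byte[tp.getPayloadLength()], 0);
 *   }
 * </pre></p>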
    * * @param data the array into which the data of this payload is to be * stored, if it is big enough; otherwise, a new byte[] array * is allocated for this purpose. * @param offset the offset in the array into which the data of this payload * is to be stored. * @return a byte[] array containing the data of this payload * @throws IOException */ byte[] getPayload(byte[] data, int offset) throws IOException; /** * Checks if a payload can be loaded at this position. *

    * Payloads can only be loaded once per call to * {@link #nextPosition()}. * * @return true if there is a payload available at this position that can be loaded */ public boolean isPayloadAvailable(); } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHash.java0000644000175000017500000002070611474320230024317 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Collection; import java.util.Map; import java.util.HashMap; import java.util.Iterator; import java.util.HashSet; import java.util.Arrays; import java.io.IOException; import org.apache.lucene.util.ArrayUtil; /** This class implements {@link InvertedDocConsumer}, which * is passed each token produced by the analyzer on each * field. It stores these tokens in a hash table, and * allocates separate byte streams per token. Consumers of * this class, eg {@link FreqProxTermsWriter} and {@link * TermVectorsTermsWriter}, write their own byte streams * under each term. */ final class TermsHash extends InvertedDocConsumer { final TermsHashConsumer consumer; final TermsHash nextTermsHash; final int bytesPerPosting; final int postingsFreeChunk; final DocumentsWriter docWriter; private RawPostingList[] postingsFreeList = new RawPostingList[1]; private int postingsFreeCount; private int postingsAllocCount; boolean trackAllocations; public TermsHash(final DocumentsWriter docWriter, boolean trackAllocations, final TermsHashConsumer consumer, final TermsHash nextTermsHash) { this.docWriter = docWriter; this.consumer = consumer; this.nextTermsHash = nextTermsHash; this.trackAllocations = trackAllocations; // Why + 4*POINTER_NUM_BYTE below? 
// +1: Posting is referenced by postingsFreeList array // +3: Posting is referenced by hash, which // targets 25-50% fill factor; approximate this // as 3X # pointers bytesPerPosting = consumer.bytesPerPosting() + 4*DocumentsWriter.POINTER_NUM_BYTE; postingsFreeChunk = (int) (DocumentsWriter.BYTE_BLOCK_SIZE / bytesPerPosting); } InvertedDocConsumerPerThread addThread(DocInverterPerThread docInverterPerThread) { return new TermsHashPerThread(docInverterPerThread, this, nextTermsHash, null); } TermsHashPerThread addThread(DocInverterPerThread docInverterPerThread, TermsHashPerThread primaryPerThread) { return new TermsHashPerThread(docInverterPerThread, this, nextTermsHash, primaryPerThread); } void setFieldInfos(FieldInfos fieldInfos) { this.fieldInfos = fieldInfos; consumer.setFieldInfos(fieldInfos); } // NOTE: do not make this sync'd; it's not necessary (DW // ensures all other threads are idle), and it leads to // deadlock public void abort() { consumer.abort(); if (nextTermsHash != null) nextTermsHash.abort(); } void shrinkFreePostings(Map threadsAndFields, SegmentWriteState state) { assert postingsFreeCount == postingsAllocCount: Thread.currentThread().getName() + ": postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount + " consumer=" + consumer; final int newSize = 1; if (newSize != postingsFreeList.length) { if (postingsFreeCount > newSize) { if (trackAllocations) { docWriter.bytesAllocated(-(postingsFreeCount-newSize) * bytesPerPosting); } postingsFreeCount = newSize; postingsAllocCount = newSize; } RawPostingList[] newArray = new RawPostingList[newSize]; System.arraycopy(postingsFreeList, 0, newArray, 0, postingsFreeCount); postingsFreeList = newArray; } } synchronized void closeDocStore(SegmentWriteState state) throws IOException { consumer.closeDocStore(state); if (nextTermsHash != null) nextTermsHash.closeDocStore(state); } synchronized void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException { Map childThreadsAndFields = new HashMap(); Map nextThreadsAndFields; if (nextTermsHash != null) nextThreadsAndFields = new HashMap(); else nextThreadsAndFields = null; Iterator it = threadsAndFields.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); TermsHashPerThread perThread = (TermsHashPerThread) entry.getKey(); Collection fields = (Collection) entry.getValue(); Iterator fieldsIt = fields.iterator(); Collection childFields = new HashSet(); Collection nextChildFields; if (nextTermsHash != null) nextChildFields = new HashSet(); else nextChildFields = null; while(fieldsIt.hasNext()) { TermsHashPerField perField = (TermsHashPerField) fieldsIt.next(); childFields.add(perField.consumer); if (nextTermsHash != null) nextChildFields.add(perField.nextPerField); } childThreadsAndFields.put(perThread.consumer, childFields); if (nextTermsHash != null) nextThreadsAndFields.put(perThread.nextPerThread, nextChildFields); } consumer.flush(childThreadsAndFields, state); shrinkFreePostings(threadsAndFields, state); if (nextTermsHash != null) nextTermsHash.flush(nextThreadsAndFields, state); } public boolean freeRAM() { if (!trackAllocations) return false; boolean any; long bytesFreed = 0; synchronized(this) { final int numToFree; if (postingsFreeCount >= postingsFreeChunk) numToFree = postingsFreeChunk; else numToFree = postingsFreeCount; any = numToFree > 0; if (any) { Arrays.fill(postingsFreeList, postingsFreeCount-numToFree, postingsFreeCount, null); postingsFreeCount -= numToFree; postingsAllocCount -= 
numToFree; bytesFreed = -numToFree * bytesPerPosting; any = true; } } if (any) { docWriter.bytesAllocated(bytesFreed); } if (nextTermsHash != null) any |= nextTermsHash.freeRAM(); return any; } synchronized public void recyclePostings(final RawPostingList[] postings, final int numPostings) { assert postings.length >= numPostings; // Move all Postings from this ThreadState back to our // free list. We pre-allocated this array while we were // creating Postings to make sure it's large enough assert postingsFreeCount + numPostings <= postingsFreeList.length; System.arraycopy(postings, 0, postingsFreeList, postingsFreeCount, numPostings); postingsFreeCount += numPostings; } synchronized public void getPostings(final RawPostingList[] postings) { assert docWriter.writer.testPoint("TermsHash.getPostings start"); assert postingsFreeCount <= postingsFreeList.length; assert postingsFreeCount <= postingsAllocCount: "postingsFreeCount=" + postingsFreeCount + " postingsAllocCount=" + postingsAllocCount; final int numToCopy; if (postingsFreeCount < postings.length) numToCopy = postingsFreeCount; else numToCopy = postings.length; final int start = postingsFreeCount-numToCopy; assert start >= 0; assert start + numToCopy <= postingsFreeList.length; assert numToCopy <= postings.length; System.arraycopy(postingsFreeList, start, postings, 0, numToCopy); // Directly allocate the remainder if any if (numToCopy != postings.length) { final int extra = postings.length - numToCopy; final int newPostingsAllocCount = postingsAllocCount + extra; consumer.createPostings(postings, numToCopy, extra); assert docWriter.writer.testPoint("TermsHash.getPostings after create"); postingsAllocCount += extra; if (trackAllocations) docWriter.bytesAllocated(extra * bytesPerPosting); if (newPostingsAllocCount > postingsFreeList.length) // Pre-allocate the postingsFreeList so it's large // enough to hold all postings we've given out postingsFreeList = new RawPostingList[ArrayUtil.getNextSize(newPostingsAllocCount)]; } postingsFreeCount -= numToCopy; if (trackAllocations) docWriter.bytesUsed(postings.length * bytesPerPosting); } } lucene-2.9.4/src/java/org/apache/lucene/index/MultiLevelSkipListReader.java0000644000175000017500000002155111474320230027310 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Arrays; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.IndexInput; /** * This abstract class reads skip lists with multiple levels. * * See {@link MultiLevelSkipListWriter} for the information about the encoding * of the multi level skip lists. * * Subclasses must implement the abstract method {@link #readSkipData(int, IndexInput)} * which defines the actual format of the skip data. 
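 * <p>Illustrative note (not part of the original documentation): the number of
 * levels this reader loads is floor(log(docCount) / log(skipInterval)), capped at
 * maxSkipLevels. For example, with skipInterval=16 and docCount=100,000 that is
 * floor(4.15) = 4 levels, the lowest level holding one entry per 16 documents,
 * the next per 256, and so on.</p>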
*/ abstract class MultiLevelSkipListReader { // the maximum number of skip levels possible for this index private int maxNumberOfSkipLevels; // number of levels in this skip list private int numberOfSkipLevels; // Expert: defines the number of top skip levels to buffer in memory. // Reducing this number results in less memory usage, but possibly // slower performance due to more random I/Os. // Please notice that the space each level occupies is limited by // the skipInterval. The top level can not contain more than // skipLevel entries, the second top level can not contain more // than skipLevel^2 entries and so forth. private int numberOfLevelsToBuffer = 1; private int docCount; private boolean haveSkipped; private IndexInput[] skipStream; // skipStream for each level private long skipPointer[]; // the start pointer of each skip level private int skipInterval[]; // skipInterval of each level private int[] numSkipped; // number of docs skipped per level private int[] skipDoc; // doc id of current skip entry per level private int lastDoc; // doc id of last read skip entry with docId <= target private long[] childPointer; // child pointer of current skip entry per level private long lastChildPointer; // childPointer of last read skip entry with docId <= target private boolean inputIsBuffered; public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval) { this.skipStream = new IndexInput[maxSkipLevels]; this.skipPointer = new long[maxSkipLevels]; this.childPointer = new long[maxSkipLevels]; this.numSkipped = new int[maxSkipLevels]; this.maxNumberOfSkipLevels = maxSkipLevels; this.skipInterval = new int[maxSkipLevels]; this.skipStream [0]= skipStream; this.inputIsBuffered = (skipStream instanceof BufferedIndexInput); this.skipInterval[0] = skipInterval; for (int i = 1; i < maxSkipLevels; i++) { // cache skip intervals this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval; } skipDoc = new int[maxSkipLevels]; } /** Returns the id of the doc to which the last call of {@link #skipTo(int)} * has skipped. */ int getDoc() { return lastDoc; } /** Skips entries to the first beyond the current whose document number is * greater than or equal to target. Returns the current doc count. 
*/ int skipTo(int target) throws IOException { if (!haveSkipped) { // first time, load skip levels loadSkipLevels(); haveSkipped = true; } // walk up the levels until highest level is found that has a skip // for this target int level = 0; while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1]) { level++; } while (level >= 0) { if (target > skipDoc[level]) { if (!loadNextSkip(level)) { continue; } } else { // no more skips on this level, go down one level if (level > 0 && lastChildPointer > skipStream[level - 1].getFilePointer()) { seekChild(level - 1); } level--; } } return numSkipped[0] - skipInterval[0] - 1; } private boolean loadNextSkip(int level) throws IOException { // we have to skip, the target document is greater than the current // skip list entry setLastSkipData(level); numSkipped[level] += skipInterval[level]; if (numSkipped[level] > docCount) { // this skip list is exhausted skipDoc[level] = Integer.MAX_VALUE; if (numberOfSkipLevels > level) numberOfSkipLevels = level; return false; } // read next skip entry skipDoc[level] += readSkipData(level, skipStream[level]); if (level != 0) { // read the child pointer if we are not on the leaf level childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; } return true; } /** Seeks the skip entry on the given level */ protected void seekChild(int level) throws IOException { skipStream[level].seek(lastChildPointer); numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1]; skipDoc[level] = lastDoc; if (level > 0) { childPointer[level] = skipStream[level].readVLong() + skipPointer[level - 1]; } } void close() throws IOException { for (int i = 1; i < skipStream.length; i++) { if (skipStream[i] != null) { skipStream[i].close(); } } } /** initializes the reader */ void init(long skipPointer, int df) { this.skipPointer[0] = skipPointer; this.docCount = df; Arrays.fill(skipDoc, 0); Arrays.fill(numSkipped, 0); Arrays.fill(childPointer, 0); haveSkipped = false; for (int i = 1; i < numberOfSkipLevels; i++) { skipStream[i] = null; } } /** Loads the skip levels */ private void loadSkipLevels() throws IOException { numberOfSkipLevels = docCount == 0 ? 0 : (int) Math.floor(Math.log(docCount) / Math.log(skipInterval[0])); if (numberOfSkipLevels > maxNumberOfSkipLevels) { numberOfSkipLevels = maxNumberOfSkipLevels; } skipStream[0].seek(skipPointer[0]); int toBuffer = numberOfLevelsToBuffer; for (int i = numberOfSkipLevels - 1; i > 0; i--) { // the length of the current level long length = skipStream[0].readVLong(); // the start pointer of the current level skipPointer[i] = skipStream[0].getFilePointer(); if (toBuffer > 0) { // buffer this level skipStream[i] = new SkipBuffer(skipStream[0], (int) length); toBuffer--; } else { // clone this stream, it is already at the start of the current level skipStream[i] = (IndexInput) skipStream[0].clone(); if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE) { ((BufferedIndexInput) skipStream[i]).setBufferSize((int) length); } // move base stream beyond the current level skipStream[0].seek(skipStream[0].getFilePointer() + length); } } // use base stream for the lowest level skipPointer[0] = skipStream[0].getFilePointer(); } /** * Subclasses must implement the actual skip data encoding in this method. 
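 * <p>Illustrative sketch of a minimal subclass implementation (an assumption for
 * the example, not the reader used elsewhere in this package):
 * <pre>
 *   protected int readSkipData(int level, IndexInput skipStream) throws IOException {
 *     return skipStream.readVInt();   // entries that store only a document delta
 *   }
 * </pre></p>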
* * @param level the level skip data shall be read from * @param skipStream the skip stream to read from */ protected abstract int readSkipData(int level, IndexInput skipStream) throws IOException; /** Copies the values of the last read skip entry on this level */ protected void setLastSkipData(int level) { lastDoc = skipDoc[level]; lastChildPointer = childPointer[level]; } /** used to buffer the top skip levels */ private final static class SkipBuffer extends IndexInput { private byte[] data; private long pointer; private int pos; SkipBuffer(IndexInput input, int length) throws IOException { data = new byte[length]; pointer = input.getFilePointer(); input.readBytes(data, 0, length); } public void close() throws IOException { data = null; } public long getFilePointer() { return pointer + pos; } public long length() { return data.length; } public byte readByte() throws IOException { return data[pos++]; } public void readBytes(byte[] b, int offset, int len) throws IOException { System.arraycopy(data, pos, b, offset, len); pos += len; } public void seek(long pos) throws IOException { this.pos = (int) (pos - pointer); } } } lucene-2.9.4/src/java/org/apache/lucene/index/FieldInfo.java0000644000175000017500000000740311474320230024257 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ final class FieldInfo { String name; boolean isIndexed; int number; // true if term vector for this field should be stored boolean storeTermVector; boolean storeOffsetWithTermVector; boolean storePositionWithTermVector; boolean omitNorms; // omit norms associated with indexed fields boolean omitTermFreqAndPositions; boolean storePayloads; // whether this field stores payloads together with term positions FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { name = na; isIndexed = tk; number = nu; if (isIndexed) { this.storeTermVector = storeTermVector; this.storeOffsetWithTermVector = storeOffsetWithTermVector; this.storePositionWithTermVector = storePositionWithTermVector; this.storePayloads = storePayloads; this.omitNorms = omitNorms; this.omitTermFreqAndPositions = omitTermFreqAndPositions; } else { // for non-indexed fields, leave defaults this.storeTermVector = false; this.storeOffsetWithTermVector = false; this.storePositionWithTermVector = false; this.storePayloads = false; this.omitNorms = true; this.omitTermFreqAndPositions = false; } } public Object clone() { return new FieldInfo(name, isIndexed, number, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); } void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { if (this.isIndexed != isIndexed) { this.isIndexed = true; // once indexed, always index } if (isIndexed) { // if updated field data is not for indexing, leave the updates out if (this.storeTermVector != storeTermVector) { this.storeTermVector = true; // once vector, always vector } if (this.storePositionWithTermVector != storePositionWithTermVector) { this.storePositionWithTermVector = true; // once vector, always vector } if (this.storeOffsetWithTermVector != storeOffsetWithTermVector) { this.storeOffsetWithTermVector = true; // once vector, always vector } if (this.storePayloads != storePayloads) { this.storePayloads = true; } if (this.omitNorms != omitNorms) { this.omitNorms = false; // once norms are stored, always store } if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) { this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life } } } } lucene-2.9.4/src/java/org/apache/lucene/index/FieldsReader.java0000644000175000017500000005635211474320230024760 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.*; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.util.CloseableThreadLocal; import org.apache.lucene.util.StringHelper; import java.io.IOException; import java.io.Reader; import java.util.zip.DataFormatException; /** * Class responsible for access to stored document fields. *
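 * <p>Illustrative note (not part of the original documentation): the .fdx index
 * file is a flat array of 8-byte pointers into .fdt, so the stored fields of
 * document n are located by seeking to formatSize + (n + docStoreOffset) * 8 in
 * .fdx and reading a single long, which is exactly what seekIndex below does.</p>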

    * It uses <segment>.fdt and <segment>.fdx; files. * * @version $Id: FieldsReader.java 1028723 2010-10-29 13:12:08Z uschindler $ */ final class FieldsReader implements Cloneable { private final FieldInfos fieldInfos; // The main fieldStream, used only for cloning. private final IndexInput cloneableFieldsStream; // This is a clone of cloneableFieldsStream used for reading documents. // It should not be cloned outside of a synchronized context. private final IndexInput fieldsStream; private final IndexInput cloneableIndexStream; private final IndexInput indexStream; private int numTotalDocs; private int size; private boolean closed; private final int format; private final int formatSize; // The docID offset where our docs begin in the index // file. This will be 0 if we have our own private file. private int docStoreOffset; private CloseableThreadLocal fieldsStreamTL = new CloseableThreadLocal(); private boolean isOriginal = false; /** Returns a cloned FieldsReader that shares open * IndexInputs with the original one. It is the caller's * job not to close the original FieldsReader until all * clones are called (eg, currently SegmentReader manages * this logic). */ public Object clone() { ensureOpen(); return new FieldsReader(fieldInfos, numTotalDocs, size, format, formatSize, docStoreOffset, cloneableFieldsStream, cloneableIndexStream); } // Used only by clone private FieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int formatSize, int docStoreOffset, IndexInput cloneableFieldsStream, IndexInput cloneableIndexStream) { this.fieldInfos = fieldInfos; this.numTotalDocs = numTotalDocs; this.size = size; this.format = format; this.formatSize = formatSize; this.docStoreOffset = docStoreOffset; this.cloneableFieldsStream = cloneableFieldsStream; this.cloneableIndexStream = cloneableIndexStream; fieldsStream = (IndexInput) cloneableFieldsStream.clone(); indexStream = (IndexInput) cloneableIndexStream.clone(); } FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException { this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE, -1, 0); } FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException { this(d, segment, fn, readBufferSize, -1, 0); } FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException { boolean success = false; isOriginal = true; try { fieldInfos = fn; cloneableFieldsStream = d.openInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize); cloneableIndexStream = d.openInput(segment + "." 
+ IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize); // First version of fdx did not include a format // header, but, the first int will always be 0 in that // case int firstInt = cloneableIndexStream.readInt(); if (firstInt == 0) format = 0; else format = firstInt; if (format > FieldsWriter.FORMAT_CURRENT /* extra support for Lucene 3.0 indexes: */ && format != FieldsWriter.FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS ) throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FieldsWriter.FORMAT_CURRENT + " or lower"); if (format > FieldsWriter.FORMAT) formatSize = 4; else formatSize = 0; if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) cloneableFieldsStream.setModifiedUTF8StringsMode(); fieldsStream = (IndexInput) cloneableFieldsStream.clone(); final long indexSize = cloneableIndexStream.length()-formatSize; if (docStoreOffset != -1) { // We read only a slice out of this shared fields file this.docStoreOffset = docStoreOffset; this.size = size; // Verify the file is long enough to hold all of our // docs assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset; } else { this.docStoreOffset = 0; this.size = (int) (indexSize >> 3); } indexStream = (IndexInput) cloneableIndexStream.clone(); numTotalDocs = (int) (indexSize >> 3); success = true; } finally { // With lock-less commits, it's entirely possible (and // fine) to hit a FileNotFound exception above. In // this case, we want to explicitly close any subset // of things that were opened so that we don't have to // wait for a GC to do so. if (!success) { close(); } } } /** * @throws AlreadyClosedException if this FieldsReader is closed */ protected final void ensureOpen() throws AlreadyClosedException { if (closed) { throw new AlreadyClosedException("this FieldsReader is closed"); } } /** * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a * lazy implementation of a Field. This means that the Fields values will not be accessible. * * @throws IOException */ final void close() throws IOException { if (!closed) { if (fieldsStream != null) { fieldsStream.close(); } if (isOriginal) { if (cloneableFieldsStream != null) { cloneableFieldsStream.close(); } if (cloneableIndexStream != null) { cloneableIndexStream.close(); } } if (indexStream != null) { indexStream.close(); } fieldsStreamTL.close(); closed = true; } } final int size() { return size; } private final void seekIndex(int docID) throws IOException { indexStream.seek(formatSize + (docID + docStoreOffset) * 8L); } boolean canReadRawDocs() { return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; } final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException { seekIndex(n); long position = indexStream.readLong(); fieldsStream.seek(position); Document doc = new Document(); int numFields = fieldsStream.readVInt(); for (int i = 0; i < numFields; i++) { int fieldNumber = fieldsStream.readVInt(); FieldInfo fi = fieldInfos.fieldInfo(fieldNumber); FieldSelectorResult acceptField = fieldSelector == null ? 
FieldSelectorResult.LOAD : fieldSelector.accept(fi.name); byte bits = fieldsStream.readByte(); assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY; boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0; boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0; boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0; //TODO: Find an alternative approach here if this list continues to grow beyond the //list of 5 or 6 currently here. See Lucene 762 for discussion if (acceptField.equals(FieldSelectorResult.LOAD)) { addField(doc, fi, binary, compressed, tokenize); } else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) { addFieldForMerge(doc, fi, binary, compressed, tokenize); } else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){ addField(doc, fi, binary, compressed, tokenize); break;//Get out of this loop } else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) { addFieldLazy(doc, fi, binary, compressed, tokenize); } else if (acceptField.equals(FieldSelectorResult.SIZE)){ skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed)); } else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){ addFieldSize(doc, fi, binary, compressed); break; } else { skipField(binary, compressed); } } return doc; } /** Returns the length in bytes of each raw document in a * contiguous range of length numDocs starting with * startDocID. Returns the IndexInput (the fieldStream), * already seeked to the starting point for startDocID.*/ final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException { seekIndex(startDocID); long startOffset = indexStream.readLong(); long lastOffset = startOffset; int count = 0; while (count < numDocs) { final long offset; final int docID = docStoreOffset + startDocID + count + 1; assert docID <= numTotalDocs; if (docID < numTotalDocs) offset = indexStream.readLong(); else offset = fieldsStream.length(); lengths[count++] = (int) (offset-lastOffset); lastOffset = offset; } fieldsStream.seek(startOffset); return fieldsStream; } /** * Skip the field. We still have to read some of the information about the field, but can skip past the actual content. * This will have the most payoff on large fields. */ private void skipField(boolean binary, boolean compressed) throws IOException { skipField(binary, compressed, fieldsStream.readVInt()); } private void skipField(boolean binary, boolean compressed, int toRead) throws IOException { if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) { fieldsStream.seek(fieldsStream.getFilePointer() + toRead); } else { // We need to skip chars. 
This will slow us down, but still better fieldsStream.skipChars(toRead); } } private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException { if (binary) { int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); if (compressed) { //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS)); doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer, binary)); } else { //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES)); doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary)); } //Need to move the pointer ahead by toRead positions fieldsStream.seek(pointer + toRead); } else { Field.Store store = Field.Store.YES; Field.Index index = getIndexType(fi, tokenize); Field.TermVector termVector = getTermVectorType(fi); AbstractField f; if (compressed) { store = Field.Store.COMPRESS; int toRead = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); f = new LazyField(fi.name, store, toRead, pointer, binary); //skip over the part that we aren't loading fieldsStream.seek(pointer + toRead); f.setOmitNorms(fi.omitNorms); f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); } else { int length = fieldsStream.readVInt(); long pointer = fieldsStream.getFilePointer(); //Skip ahead of where we are by the length of what is stored if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) fieldsStream.seek(pointer+length); else fieldsStream.skipChars(length); f = new LazyField(fi.name, store, index, termVector, length, pointer, binary); f.setOmitNorms(fi.omitNorms); f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); } doc.add(f); } } // in merge mode we don't uncompress the data of a compressed field private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException { Object data; if (binary || compressed) { int toRead = fieldsStream.readVInt(); final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); data = b; } else { data = fieldsStream.readString(); } doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize)); } private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException { //we have a binary stored field, and it may be compressed if (binary) { int toRead = fieldsStream.readVInt(); final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); if (compressed) doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS)); else doc.add(new Field(fi.name, b, Field.Store.YES)); } else { Field.Store store = Field.Store.YES; Field.Index index = getIndexType(fi, tokenize); Field.TermVector termVector = getTermVectorType(fi); AbstractField f; if (compressed) { store = Field.Store.COMPRESS; int toRead = fieldsStream.readVInt(); final byte[] b = new byte[toRead]; fieldsStream.readBytes(b, 0, b.length); f = new Field(fi.name, // field name false, new String(uncompress(b), "UTF-8"), // uncompress the value and add as string store, index, termVector); f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); f.setOmitNorms(fi.omitNorms); } else { f = new Field(fi.name, // name false, fieldsStream.readString(), // read value store, index, termVector); f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); f.setOmitNorms(fi.omitNorms); } doc.add(f); } } // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high 
order byte first; char = 2 bytes) // Read just the size -- caller must skip the field content to continue reading fields // Return the size in bytes or chars, depending on field type private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException { int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size; byte[] sizebytes = new byte[4]; sizebytes[0] = (byte) (bytesize>>>24); sizebytes[1] = (byte) (bytesize>>>16); sizebytes[2] = (byte) (bytesize>>> 8); sizebytes[3] = (byte) bytesize ; doc.add(new Field(fi.name, sizebytes, Field.Store.YES)); return size; } private Field.TermVector getTermVectorType(FieldInfo fi) { Field.TermVector termVector = null; if (fi.storeTermVector) { if (fi.storeOffsetWithTermVector) { if (fi.storePositionWithTermVector) { termVector = Field.TermVector.WITH_POSITIONS_OFFSETS; } else { termVector = Field.TermVector.WITH_OFFSETS; } } else if (fi.storePositionWithTermVector) { termVector = Field.TermVector.WITH_POSITIONS; } else { termVector = Field.TermVector.YES; } } else { termVector = Field.TermVector.NO; } return termVector; } private Field.Index getIndexType(FieldInfo fi, boolean tokenize) { Field.Index index; if (fi.isIndexed && tokenize) index = Field.Index.ANALYZED; else if (fi.isIndexed && !tokenize) index = Field.Index.NOT_ANALYZED; else index = Field.Index.NO; return index; } /** * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is * loaded. */ private class LazyField extends AbstractField implements Fieldable { private int toRead; private long pointer; public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) { super(name, store, Field.Index.NO, Field.TermVector.NO); this.toRead = toRead; this.pointer = pointer; this.isBinary = isBinary; if (isBinary) binaryLength = toRead; lazy = true; } public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) { super(name, store, index, termVector); this.toRead = toRead; this.pointer = pointer; this.isBinary = isBinary; if (isBinary) binaryLength = toRead; lazy = true; } private IndexInput getFieldStream() { IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get(); if (localFieldsStream == null) { localFieldsStream = (IndexInput) cloneableFieldsStream.clone(); fieldsStreamTL.set(localFieldsStream); } return localFieldsStream; } /** The value of the field in Binary, or null. If null, the Reader value, * String value, or TokenStream value is used. Exactly one of stringValue(), * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ public byte[] binaryValue() { return getBinaryValue(null); } /** The value of the field as a Reader, or null. If null, the String value, * binary value, or TokenStream value is used. Exactly one of stringValue(), * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ public Reader readerValue() { ensureOpen(); return null; } /** The value of the field as a TokenStream, or null. If null, the Reader value, * String value, or binary value is used. Exactly one of stringValue(), * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ public TokenStream tokenStreamValue() { ensureOpen(); return null; } /** The value of the field as a String, or null. If null, the Reader value, * binary value, or TokenStream value is used. 
Exactly one of stringValue(), * readerValue(), binaryValue(), and tokenStreamValue() must be set. */ public String stringValue() { ensureOpen(); if (isBinary) return null; else { if (fieldsData == null) { IndexInput localFieldsStream = getFieldStream(); try { localFieldsStream.seek(pointer); if (isCompressed) { final byte[] b = new byte[toRead]; localFieldsStream.readBytes(b, 0, b.length); fieldsData = new String(uncompress(b), "UTF-8"); } else { if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) { byte[] bytes = new byte[toRead]; localFieldsStream.readBytes(bytes, 0, toRead); fieldsData = new String(bytes, "UTF-8"); } else { //read in chars b/c we already know the length we need to read char[] chars = new char[toRead]; localFieldsStream.readChars(chars, 0, toRead); fieldsData = new String(chars); } } } catch (IOException e) { throw new FieldReaderException(e); } } return (String) fieldsData; } } public long getPointer() { ensureOpen(); return pointer; } public void setPointer(long pointer) { ensureOpen(); this.pointer = pointer; } public int getToRead() { ensureOpen(); return toRead; } public void setToRead(int toRead) { ensureOpen(); this.toRead = toRead; } public byte[] getBinaryValue(byte[] result) { ensureOpen(); if (isBinary) { if (fieldsData == null) { // Allocate new buffer if result is null or too small final byte[] b; if (result == null || result.length < toRead) b = new byte[toRead]; else b = result; IndexInput localFieldsStream = getFieldStream(); // Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people // since they are already handling this exception when getting the document try { localFieldsStream.seek(pointer); localFieldsStream.readBytes(b, 0, toRead); if (isCompressed == true) { fieldsData = uncompress(b); } else { fieldsData = b; } } catch (IOException e) { throw new FieldReaderException(e); } binaryOffset = 0; binaryLength = toRead; } return (byte[]) fieldsData; } else return null; } } private byte[] uncompress(byte[] b) throws CorruptIndexException { try { return CompressionTools.decompress(b); } catch (DataFormatException e) { // this will happen if the field is not compressed CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString()); newException.initCause(e); throw newException; } } // Instances of this class hold field properties and data // for merge final static class FieldForMerge extends AbstractField { public String stringValue() { return (String) this.fieldsData; } public Reader readerValue() { // not needed for merge return null; } public byte[] binaryValue() { return (byte[]) this.fieldsData; } public TokenStream tokenStreamValue() { // not needed for merge return null; } public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) { this.isStored = true; this.fieldsData = value; this.isCompressed = compressed; this.isBinary = binary; if (binary) binaryLength = ((byte[]) value).length; this.isTokenized = tokenize; this.name = StringHelper.intern(fi.name); this.isIndexed = fi.isIndexed; this.omitNorms = fi.omitNorms; this.omitTermFreqAndPositions = fi.omitTermFreqAndPositions; this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector; this.storePositionWithTermVector = fi.storePositionWithTermVector; this.storeTermVector = fi.storeTermVector; } } } lucene-2.9.4/src/java/org/apache/lucene/index/StoredFieldsWriterPerThread.java0000644000175000017500000000513111474320230027777 0ustar 
janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.document.Fieldable; final class StoredFieldsWriterPerThread { final FieldsWriter localFieldsWriter; final StoredFieldsWriter storedFieldsWriter; final DocumentsWriter.DocState docState; StoredFieldsWriter.PerDoc doc; public StoredFieldsWriterPerThread(DocumentsWriter.DocState docState, StoredFieldsWriter storedFieldsWriter) throws IOException { this.storedFieldsWriter = storedFieldsWriter; this.docState = docState; localFieldsWriter = new FieldsWriter((IndexOutput) null, (IndexOutput) null, storedFieldsWriter.fieldInfos); } public void startDocument() { if (doc != null) { // Only happens if previous document hit non-aborting // exception while writing stored fields into // localFieldsWriter: doc.reset(); doc.docID = docState.docID; } } public void addField(Fieldable field, FieldInfo fieldInfo) throws IOException { if (doc == null) { doc = storedFieldsWriter.getPerDoc(); doc.docID = docState.docID; localFieldsWriter.setFieldsStream(doc.fdt); assert doc.numStoredFields == 0: "doc.numStoredFields=" + doc.numStoredFields; assert 0 == doc.fdt.length(); assert 0 == doc.fdt.getFilePointer(); } localFieldsWriter.writeField(fieldInfo, field); assert docState.testPoint("StoredFieldsWriterPerThread.processFields.writeField"); doc.numStoredFields++; } public DocumentsWriter.DocWriter finishDocument() { // If there were any stored fields in this doc, doc will // be non-null; else it's null. try { return doc; } finally { doc = null; } } public void abort() { if (doc != null) { doc.abort(); doc = null; } } } lucene-2.9.4/src/java/org/apache/lucene/index/MultipleTermPositions.java0000644000175000017500000001264511474320230026757 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.util.PriorityQueue; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; /** * Allows you to iterate over the {@link TermPositions} for multiple {@link Term}s as * a single {@link TermPositions}. * */ public class MultipleTermPositions implements TermPositions { private static final class TermPositionsQueue extends PriorityQueue { TermPositionsQueue(List termPositions) throws IOException { initialize(termPositions.size()); Iterator i = termPositions.iterator(); while (i.hasNext()) { TermPositions tp = (TermPositions) i.next(); if (tp.next()) put(tp); } } final TermPositions peek() { return (TermPositions) top(); } public final boolean lessThan(Object a, Object b) { return ((TermPositions) a).doc() < ((TermPositions) b).doc(); } } private static final class IntQueue { private int _arraySize = 16; private int _index = 0; private int _lastIndex = 0; private int[] _array = new int[_arraySize]; final void add(int i) { if (_lastIndex == _arraySize) growArray(); _array[_lastIndex++] = i; } final int next() { return _array[_index++]; } final void sort() { Arrays.sort(_array, _index, _lastIndex); } final void clear() { _index = 0; _lastIndex = 0; } final int size() { return (_lastIndex - _index); } private void growArray() { int[] newArray = new int[_arraySize * 2]; System.arraycopy(_array, 0, newArray, 0, _arraySize); _array = newArray; _arraySize *= 2; } } private int _doc; private int _freq; private TermPositionsQueue _termPositionsQueue; private IntQueue _posList; /** * Creates a new MultipleTermPositions instance. * * @exception IOException */ public MultipleTermPositions(IndexReader indexReader, Term[] terms) throws IOException { List termPositions = new LinkedList(); for (int i = 0; i < terms.length; i++) termPositions.add(indexReader.termPositions(terms[i])); _termPositionsQueue = new TermPositionsQueue(termPositions); _posList = new IntQueue(); } public final boolean next() throws IOException { if (_termPositionsQueue.size() == 0) return false; _posList.clear(); _doc = _termPositionsQueue.peek().doc(); TermPositions tp; do { tp = _termPositionsQueue.peek(); for (int i = 0; i < tp.freq(); i++) _posList.add(tp.nextPosition()); if (tp.next()) _termPositionsQueue.adjustTop(); else { _termPositionsQueue.pop(); tp.close(); } } while (_termPositionsQueue.size() > 0 && _termPositionsQueue.peek().doc() == _doc); _posList.sort(); _freq = _posList.size(); return true; } public final int nextPosition() { return _posList.next(); } public final boolean skipTo(int target) throws IOException { while (_termPositionsQueue.peek() != null && target > _termPositionsQueue.peek().doc()) { TermPositions tp = (TermPositions) _termPositionsQueue.pop(); if (tp.skipTo(target)) _termPositionsQueue.put(tp); else tp.close(); } return next(); } public final int doc() { return _doc; } public final int freq() { return _freq; } public final void close() throws IOException { while (_termPositionsQueue.size() > 0) ((TermPositions) _termPositionsQueue.pop()).close(); } /** * Not implemented. * @throws UnsupportedOperationException */ public void seek(Term arg0) throws IOException { throw new UnsupportedOperationException(); } /** * Not implemented. * @throws UnsupportedOperationException */ public void seek(TermEnum termEnum) throws IOException { throw new UnsupportedOperationException(); } /** * Not implemented. 
* @throws UnsupportedOperationException */ public int read(int[] arg0, int[] arg1) throws IOException { throw new UnsupportedOperationException(); } /** * Not implemented. * @throws UnsupportedOperationException */ public int getPayloadLength() { throw new UnsupportedOperationException(); } /** * Not implemented. * @throws UnsupportedOperationException */ public byte[] getPayload(byte[] data, int offset) throws IOException { throw new UnsupportedOperationException(); } /** * * @return false */ // TODO: Remove warning after API has been finalized public boolean isPayloadAvailable() { return false; } } lucene-2.9.4/src/java/org/apache/lucene/index/DocInverter.java0000644000175000017500000000620111474320230024637 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import org.apache.lucene.util.AttributeSource; /** This is a DocFieldConsumer that inverts each field, * separately, from a Document, and accepts a * InvertedTermsConsumer to process those terms. 
*/ final class DocInverter extends DocFieldConsumer { final InvertedDocConsumer consumer; final InvertedDocEndConsumer endConsumer; public DocInverter(InvertedDocConsumer consumer, InvertedDocEndConsumer endConsumer) { this.consumer = consumer; this.endConsumer = endConsumer; } void setFieldInfos(FieldInfos fieldInfos) { super.setFieldInfos(fieldInfos); consumer.setFieldInfos(fieldInfos); endConsumer.setFieldInfos(fieldInfos); } void flush(Map threadsAndFields, SegmentWriteState state) throws IOException { Map childThreadsAndFields = new HashMap(); Map endChildThreadsAndFields = new HashMap(); Iterator it = threadsAndFields.entrySet().iterator(); while(it.hasNext()) { Map.Entry entry = (Map.Entry) it.next(); DocInverterPerThread perThread = (DocInverterPerThread) entry.getKey(); Collection fields = (Collection) entry.getValue(); Iterator fieldsIt = fields.iterator(); Collection childFields = new HashSet(); Collection endChildFields = new HashSet(); while(fieldsIt.hasNext()) { DocInverterPerField perField = (DocInverterPerField) fieldsIt.next(); childFields.add(perField.consumer); endChildFields.add(perField.endConsumer); } childThreadsAndFields.put(perThread.consumer, childFields); endChildThreadsAndFields.put(perThread.endConsumer, endChildFields); } consumer.flush(childThreadsAndFields, state); endConsumer.flush(endChildThreadsAndFields, state); } public void closeDocStore(SegmentWriteState state) throws IOException { consumer.closeDocStore(state); endConsumer.closeDocStore(state); } void abort() { consumer.abort(); endConsumer.abort(); } public boolean freeRAM() { return consumer.freeRAM(); } public DocFieldConsumerPerThread addThread(DocFieldProcessorPerThread docFieldProcessorPerThread) { return new DocInverterPerThread(docFieldProcessorPerThread, this); } } lucene-2.9.4/src/java/org/apache/lucene/index/TermsHashConsumer.java0000644000175000017500000000262211474320230026030 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.io.IOException; import java.util.Map; abstract class TermsHashConsumer { abstract int bytesPerPosting(); abstract void createPostings(RawPostingList[] postings, int start, int count); abstract TermsHashConsumerPerThread addThread(TermsHashPerThread perThread); abstract void flush(Map threadsAndFields, final SegmentWriteState state) throws IOException; abstract void abort(); abstract void closeDocStore(SegmentWriteState state) throws IOException; FieldInfos fieldInfos; void setFieldInfos(FieldInfos fieldInfos) { this.fieldInfos = fieldInfos; } } lucene-2.9.4/src/java/org/apache/lucene/index/StoredFieldsWriter.java0000644000175000017500000001434011474320230026202 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.RAMOutputStream; import org.apache.lucene.util.ArrayUtil; /** This is a DocFieldConsumer that writes stored fields. */ final class StoredFieldsWriter { FieldsWriter fieldsWriter; final DocumentsWriter docWriter; final FieldInfos fieldInfos; int lastDocID; PerDoc[] docFreeList = new PerDoc[1]; int freeCount; public StoredFieldsWriter(DocumentsWriter docWriter, FieldInfos fieldInfos) { this.docWriter = docWriter; this.fieldInfos = fieldInfos; } public StoredFieldsWriterPerThread addThread(DocumentsWriter.DocState docState) throws IOException { return new StoredFieldsWriterPerThread(docState, this); } synchronized public void flush(SegmentWriteState state) throws IOException { if (state.numDocsInStore > 0) { // It's possible that all documents seen in this segment // hit non-aborting exceptions, in which case we will // not have yet init'd the FieldsWriter: initFieldsWriter(); // Fill fdx file to include any final docs that we // skipped because they hit non-aborting exceptions fill(state.numDocsInStore - docWriter.getDocStoreOffset()); } if (fieldsWriter != null) fieldsWriter.flush(); } private void initFieldsWriter() throws IOException { if (fieldsWriter == null) { final String docStoreSegment = docWriter.getDocStoreSegment(); if (docStoreSegment != null) { assert docStoreSegment != null; fieldsWriter = new FieldsWriter(docWriter.directory, docStoreSegment, fieldInfos); docWriter.addOpenFile(docStoreSegment + "." + IndexFileNames.FIELDS_EXTENSION); docWriter.addOpenFile(docStoreSegment + "." 
+ IndexFileNames.FIELDS_INDEX_EXTENSION); lastDocID = 0; } } } synchronized public void closeDocStore(SegmentWriteState state) throws IOException { final int inc = state.numDocsInStore - lastDocID; if (inc > 0) { initFieldsWriter(); fill(state.numDocsInStore - docWriter.getDocStoreOffset()); } if (fieldsWriter != null) { fieldsWriter.close(); fieldsWriter = null; lastDocID = 0; assert state.docStoreSegmentName != null; state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION); state.flushedFiles.add(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_EXTENSION); state.docWriter.removeOpenFile(state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION); final String fileName = state.docStoreSegmentName + "." + IndexFileNames.FIELDS_INDEX_EXTENSION; if (4+((long) state.numDocsInStore)*8 != state.directory.fileLength(fileName)) throw new RuntimeException("after flush: fdx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.fileLength(fileName) + " length in bytes of " + fileName + " file exists?=" + state.directory.fileExists(fileName)); } } int allocCount; synchronized PerDoc getPerDoc() { if (freeCount == 0) { allocCount++; if (allocCount > docFreeList.length) { // Grow our free list up front to make sure we have // enough space to recycle all outstanding PerDoc // instances assert allocCount == 1+docFreeList.length; docFreeList = new PerDoc[ArrayUtil.getNextSize(allocCount)]; } return new PerDoc(); } else return docFreeList[--freeCount]; } synchronized void abort() { if (fieldsWriter != null) { try { fieldsWriter.close(); } catch (Throwable t) { } fieldsWriter = null; lastDocID = 0; } } /** Fills in any hole in the docIDs */ void fill(int docID) throws IOException { final int docStoreOffset = docWriter.getDocStoreOffset(); // We must "catch up" for all docs before us // that had no stored fields: final int end = docID+docStoreOffset; while(lastDocID < end) { fieldsWriter.skipDocument(); lastDocID++; } } synchronized void finishDocument(PerDoc perDoc) throws IOException { assert docWriter.writer.testPoint("StoredFieldsWriter.finishDocument start"); initFieldsWriter(); fill(perDoc.docID); // Append stored fields to the real FieldsWriter: fieldsWriter.flushDocument(perDoc.numStoredFields, perDoc.fdt); lastDocID++; perDoc.reset(); free(perDoc); assert docWriter.writer.testPoint("StoredFieldsWriter.finishDocument end"); } public boolean freeRAM() { return false; } synchronized void free(PerDoc perDoc) { assert freeCount < docFreeList.length; assert 0 == perDoc.numStoredFields; assert 0 == perDoc.fdt.length(); assert 0 == perDoc.fdt.getFilePointer(); docFreeList[freeCount++] = perDoc; } class PerDoc extends DocumentsWriter.DocWriter { final DocumentsWriter.PerDocBuffer buffer = docWriter.newPerDocBuffer(); RAMOutputStream fdt = new RAMOutputStream(buffer); int numStoredFields; void reset() { fdt.reset(); buffer.recycle(); numStoredFields = 0; } void abort() { reset(); free(this); } public long sizeInBytes() { return buffer.getSizeInBytes(); } public void finish() throws IOException { finishDocument(this); } } } lucene-2.9.4/src/java/org/apache/lucene/index/TermInfosWriter.java0000644000175000017500000002243011474320230025520 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. 
See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.Directory; import org.apache.lucene.util.UnicodeUtil; /** This stores a monotonically increasing set of pairs in a Directory. A TermInfos can be written once, in order. */ final class TermInfosWriter { /** The file format version, a negative number. */ public static final int FORMAT = -3; // Changed strings to true utf8 with length-in-bytes not // length-in-chars public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; // NOTE: always change this if you switch to a new format! public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; private FieldInfos fieldInfos; private IndexOutput output; private TermInfo lastTi = new TermInfo(); private long size; // TODO: the default values for these two parameters should be settable from // IndexWriter. However, once that's done, folks will start setting them to // ridiculous values and complaining that things don't work well, as with // mergeFactor. So, let's wait until a number of folks find that alternate // values work better. Note that both of these values are stored in the // segment, so that it's safe to change these w/o rebuilding all indexes. /** Expert: The fraction of terms in the "dictionary" which should be stored * in RAM. Smaller values use more memory, but make searching slightly * faster, while larger values use less memory and make searching slightly * slower. Searching is typically not dominated by dictionary lookup, so * tweaking this is rarely useful.*/ int indexInterval = 128; /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in * smaller indexes, greater acceleration, but fewer accelerable cases, while * smaller values result in bigger indexes, less acceleration and more * accelerable cases. More detailed experiments would be useful here. */ int skipInterval = 16; /** Expert: The maximum number of skip levels. Smaller values result in * slightly smaller indexes, but slower skipping in big posting lists. 
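To make the indexInterval trade-off above concrete, a rough back-of-the-envelope sketch follows; the term count and class name are hypothetical, purely for illustration:

      public class TermIndexEstimate {
        public static void main(String[] args) {
          long uniqueTerms = 10000000L;   // hypothetical dictionary size
          int indexInterval = 128;        // TermInfosWriter's default
          // Roughly one out of every indexInterval terms is written to the .tii
          // index and later held in RAM by the reader: ~78125 terms here.
          System.out.println(uniqueTerms / indexInterval);
        }
      }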
*/ int maxSkipLevels = 10; private long lastIndexPointer; private boolean isIndex; private byte[] lastTermBytes = new byte[10]; private int lastTermBytesLength = 0; private int lastFieldNumber = -1; private TermInfosWriter other; private UnicodeUtil.UTF8Result utf8Result = new UnicodeUtil.UTF8Result(); TermInfosWriter(Directory directory, String segment, FieldInfos fis, int interval) throws IOException { initialize(directory, segment, fis, interval, false); other = new TermInfosWriter(directory, segment, fis, interval, true); other.other = this; } private TermInfosWriter(Directory directory, String segment, FieldInfos fis, int interval, boolean isIndex) throws IOException { initialize(directory, segment, fis, interval, isIndex); } private void initialize(Directory directory, String segment, FieldInfos fis, int interval, boolean isi) throws IOException { indexInterval = interval; fieldInfos = fis; isIndex = isi; output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); output.writeInt(FORMAT_CURRENT); // write format output.writeLong(0); // leave space for size output.writeInt(indexInterval); // write indexInterval output.writeInt(skipInterval); // write skipInterval output.writeInt(maxSkipLevels); // write maxSkipLevels assert initUTF16Results(); } void add(Term term, TermInfo ti) throws IOException { UnicodeUtil.UTF16toUTF8(term.text, 0, term.text.length(), utf8Result); add(fieldInfos.fieldNumber(term.field), utf8Result.result, utf8Result.length, ti); } // Currently used only by assert statements UnicodeUtil.UTF16Result utf16Result1; UnicodeUtil.UTF16Result utf16Result2; // Currently used only by assert statements private boolean initUTF16Results() { utf16Result1 = new UnicodeUtil.UTF16Result(); utf16Result2 = new UnicodeUtil.UTF16Result(); return true; } // Currently used only by assert statement private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { if (lastFieldNumber != fieldNumber) { final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); // If there is a field named "" (empty string) then we // will get 0 on this comparison, yet, it's "OK". But // it's not OK if two different field numbers map to // the same name. if (cmp != 0 || lastFieldNumber != -1) return cmp; } UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); final int len; if (utf16Result1.length < utf16Result2.length) len = utf16Result1.length; else len = utf16Result2.length; for(int i=0;i, TermInfo> pair to the set. Term must be lexicographically greater than all previous Terms added. 
TermInfo pointers must be positive and greater than all previous.*/ void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) throws IOException { assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; if (!isIndex && size % indexInterval == 0) other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term writeTerm(fieldNumber, termBytes, termBytesLength); // write term output.writeVInt(ti.docFreq); // write doc freq output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers output.writeVLong(ti.proxPointer - lastTi.proxPointer); if (ti.docFreq >= skipInterval) { output.writeVInt(ti.skipOffset); } if (isIndex) { output.writeVLong(other.output.getFilePointer() - lastIndexPointer); lastIndexPointer = other.output.getFilePointer(); // write pointer } lastFieldNumber = fieldNumber; lastTi.set(ti); size++; } private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) throws IOException { // TODO: UTF16toUTF8 could tell us this prefix // Compute prefix in common with last term: int start = 0; final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; while(start < limit) { if (termBytes[start] != lastTermBytes[start]) break; start++; } final int length = termBytesLength - start; output.writeVInt(start); // write shared prefix length output.writeVInt(length); // write delta length output.writeBytes(termBytes, start, length); // write delta bytes output.writeVInt(fieldNumber); // write field num if (lastTermBytes.length < termBytesLength) { byte[] newArray = new byte[(int) (termBytesLength*1.5)]; System.arraycopy(lastTermBytes, 0, newArray, 0, start); lastTermBytes = newArray; } System.arraycopy(termBytes, start, lastTermBytes, start, length); lastTermBytesLength = termBytesLength; } /** Called to complete TermInfos creation. */ void close() throws IOException { output.seek(4); // write size after format output.writeLong(size); output.close(); if (!isIndex) other.close(); } } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsFieldsConsumer.java0000644000175000017500000000257111474320230030243 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; /** Abstract API that consumes terms, doc, freq, prox and * payloads postings. Concrete implementations of this * actually do "something" with the postings (write it into * the index in a specific format). * * NOTE: this API is experimental and will likely change */ abstract class FormatPostingsFieldsConsumer { /** Add a new field */ abstract FormatPostingsTermsConsumer addField(FieldInfo field) throws IOException; /** Called when we are done adding everything. */ abstract void finish() throws IOException; } lucene-2.9.4/src/java/org/apache/lucene/index/FreqProxTermsWriterPerThread.java0000644000175000017500000000267611474320230030204 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ final class FreqProxTermsWriterPerThread extends TermsHashConsumerPerThread { final TermsHashPerThread termsHashPerThread; final DocumentsWriter.DocState docState; public FreqProxTermsWriterPerThread(TermsHashPerThread perThread) { docState = perThread.docState; termsHashPerThread = perThread; } public TermsHashConsumerPerField addField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo) { return new FreqProxTermsWriterPerField(termsHashPerField, this, fieldInfo); } void startDocument() { } DocumentsWriter.DocWriter finishDocument() { return null; } public void abort() {} } lucene-2.9.4/src/java/org/apache/lucene/index/FormatPostingsPositionsWriter.java0000644000175000017500000000570711474320230030511 0ustar janpascaljanpascalpackage org.apache.lucene.index; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexInput; import java.io.IOException; final class FormatPostingsPositionsWriter extends FormatPostingsPositionsConsumer { final FormatPostingsDocsWriter parent; final IndexOutput out; boolean omitTermFreqAndPositions; boolean storePayloads; int lastPayloadLength = -1; FormatPostingsPositionsWriter(SegmentWriteState state, FormatPostingsDocsWriter parent) throws IOException { this.parent = parent; omitTermFreqAndPositions = parent.omitTermFreqAndPositions; if (parent.parent.parent.fieldInfos.hasProx()) { // At least one field does not omit TF, so create the // prox file final String fileName = IndexFileNames.segmentFileName(parent.parent.parent.segment, IndexFileNames.PROX_EXTENSION); state.flushedFiles.add(fileName); out = parent.parent.parent.dir.createOutput(fileName); parent.skipListWriter.setProxOutput(out); } else // Every field omits TF so we will write no prox file out = null; } int lastPosition; /** Add a new position & payload */ void addPosition(int position, byte[] payload, int payloadOffset, int payloadLength) throws IOException { assert !omitTermFreqAndPositions: "omitTermFreqAndPositions is true"; assert out != null; final int delta = position - lastPosition; lastPosition = position; if (storePayloads) { if (payloadLength != lastPayloadLength) { lastPayloadLength = payloadLength; out.writeVInt((delta<<1)|1); out.writeVInt(payloadLength); } else out.writeVInt(delta << 1); if (payloadLength > 0) out.writeBytes(payload, payloadLength); } else out.writeVInt(delta); } void setField(FieldInfo fieldInfo) { omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; storePayloads = omitTermFreqAndPositions ? false : fieldInfo.storePayloads; } /** Called when we are done adding positions & payloads */ void finish() { lastPosition = 0; lastPayloadLength = -1; } void close() throws IOException { if (out != null) out.close(); } } lucene-2.9.4/src/java/org/apache/lucene/queryParser/0000755000175000017500000000000011554106562022775 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/queryParser/FastCharStream.java0000644000175000017500000000717311474320221026506 0ustar janpascaljanpascal// FastCharStream.java package org.apache.lucene.queryParser; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ import java.io.*; /** An efficient implementation of JavaCC's CharStream interface.

    Note that * this does not do line-number counting, but instead keeps track of the * character position of the token in the input, as required by Lucene's {@link * org.apache.lucene.analysis.Token} API. * */ public final class FastCharStream implements CharStream { char[] buffer = null; int bufferLength = 0; // end of valid chars int bufferPosition = 0; // next char to read int tokenStart = 0; // offset in buffer int bufferStart = 0; // position in file of buffer Reader input; // source of chars /** Constructs from a Reader. */ public FastCharStream(Reader r) { input = r; } public final char readChar() throws IOException { if (bufferPosition >= bufferLength) refill(); return buffer[bufferPosition++]; } private final void refill() throws IOException { int newPosition = bufferLength - tokenStart; if (tokenStart == 0) { // token won't fit in buffer if (buffer == null) { // first time: alloc buffer buffer = new char[2048]; } else if (bufferLength == buffer.length) { // grow buffer char[] newBuffer = new char[buffer.length*2]; System.arraycopy(buffer, 0, newBuffer, 0, bufferLength); buffer = newBuffer; } } else { // shift token to front System.arraycopy(buffer, tokenStart, buffer, 0, newPosition); } bufferLength = newPosition; // update state bufferPosition = newPosition; bufferStart += tokenStart; tokenStart = 0; int charsRead = // fill space in buffer input.read(buffer, newPosition, buffer.length-newPosition); if (charsRead == -1) throw new IOException("read past eof"); else bufferLength += charsRead; } public final char BeginToken() throws IOException { tokenStart = bufferPosition; return readChar(); } public final void backup(int amount) { bufferPosition -= amount; } public final String GetImage() { return new String(buffer, tokenStart, bufferPosition - tokenStart); } public final char[] GetSuffix(int len) { char[] value = new char[len]; System.arraycopy(buffer, bufferPosition - len, value, 0, len); return value; } public final void Done() { try { input.close(); } catch (IOException e) { System.err.println("Caught: " + e + "; ignoring."); } } public final int getColumn() { return bufferStart + bufferPosition; } public final int getLine() { return 1; } public final int getEndColumn() { return bufferStart + bufferPosition; } public final int getEndLine() { return 1; } public final int getBeginColumn() { return bufferStart + tokenStart; } public final int getBeginLine() { return 1; } } lucene-2.9.4/src/java/org/apache/lucene/queryParser/MultiFieldQueryParser.java0000644000175000017500000004141211474320221030072 0ustar janpascaljanpascalpackage org.apache.lucene.queryParser; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.util.Version; /** * A QueryParser which constructs queries to search multiple fields. * * @version $Revision: 829134 $ */ public class MultiFieldQueryParser extends QueryParser { protected String[] fields; protected Map boosts; /** * Creates a MultiFieldQueryParser. * Allows passing of a map with term to Boost, and the boost to apply to each term. * *

    It will, when parse(String query) is called, construct a query like this
    (assuming the query consists of two terms and you specify the two fields
    title and body):

      (title:term1 body:term1) (title:term2 body:term2)

    When setDefaultOperator(AND_OPERATOR) is set, the result will be:

      +(title:term1 body:term1) +(title:term2 body:term2)

    When you pass a boost (title=>5 body=>10) you can get:

      +(title:term1^5.0 body:term1^10.0) +(title:term2^5.0 body:term2^10.0)

    In other words, all the query's terms must appear, but it doesn't matter in
    what fields they appear.
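The construction described above can be checked by printing the parsed query. A minimal sketch, assuming Lucene 2.9 on the classpath; the class name, field names, boost values and the StandardAnalyzer choice are illustrative, not taken from this file:

      import java.util.HashMap;
      import java.util.Map;
      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.MultiFieldQueryParser;
      import org.apache.lucene.queryParser.QueryParser;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class MultiFieldBoostDemo {
        public static void main(String[] args) throws Exception {
          Map boosts = new HashMap();             // field name -> Float boost
          boosts.put("title", new Float(5.0f));
          boosts.put("body", new Float(10.0f));

          MultiFieldQueryParser parser = new MultiFieldQueryParser(
              Version.LUCENE_29,
              new String[] { "title", "body" },
              new StandardAnalyzer(Version.LUCENE_29),
              boosts);
          parser.setDefaultOperator(QueryParser.AND_OPERATOR);

          // Expected shape:
          // +(title:term1^5.0 body:term1^10.0) +(title:term2^5.0 body:term2^10.0)
          Query q = parser.parse("term1 term2");
          System.out.println(q);
        }
      }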

    * * @deprecated Please use {@link #MultiFieldQueryParser(Version, String[], Analyzer, Map)} instead */ public MultiFieldQueryParser(String[] fields, Analyzer analyzer, Map boosts) { this(Version.LUCENE_24, fields, analyzer); this.boosts = boosts; } /** * Creates a MultiFieldQueryParser. * Allows passing of a map with term to Boost, and the boost to apply to each term. * *

    It will, when parse(String query) is called, construct a query like this
    (assuming the query consists of two terms and you specify the two fields
    title and body):

      (title:term1 body:term1) (title:term2 body:term2)

    When setDefaultOperator(AND_OPERATOR) is set, the result will be:

      +(title:term1 body:term1) +(title:term2 body:term2)

    When you pass a boost (title=>5 body=>10) you can get:

      +(title:term1^5.0 body:term1^10.0) +(title:term2^5.0 body:term2^10.0)

    In other words, all the query's terms must appear, but it doesn't matter in
    what fields they appear.

    */ public MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer, Map boosts) { this(matchVersion, fields, analyzer); this.boosts = boosts; } /** * Creates a MultiFieldQueryParser. * *

    It will, when parse(String query) is called, construct a query like this
    (assuming the query consists of two terms and you specify the two fields
    title and body):

      (title:term1 body:term1) (title:term2 body:term2)

    When setDefaultOperator(AND_OPERATOR) is set, the result will be:

      +(title:term1 body:term1) +(title:term2 body:term2)

    In other words, all the query's terms must appear, but it doesn't matter in
    what fields they appear.

    * * @deprecated Please use {@link #MultiFieldQueryParser(Version, String[], Analyzer)} instead */ public MultiFieldQueryParser(String[] fields, Analyzer analyzer) { this(Version.LUCENE_24, fields, analyzer); } /** * Creates a MultiFieldQueryParser. * *

    It will, when parse(String query) is called, construct a query like this
    (assuming the query consists of two terms and you specify the two fields
    title and body):

      (title:term1 body:term1) (title:term2 body:term2)

    When setDefaultOperator(AND_OPERATOR) is set, the result will be:

      +(title:term1 body:term1) +(title:term2 body:term2)

    In other words, all the query's terms must appear, but it doesn't matter in
    what fields they appear.
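As a runnable illustration of the OR/AND behaviour described above (the class name, field names and analyzer are assumptions for the sketch, not from this file):

      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.MultiFieldQueryParser;
      import org.apache.lucene.queryParser.QueryParser;
      import org.apache.lucene.util.Version;

      public class MultiFieldOperatorDemo {
        public static void main(String[] args) throws Exception {
          String[] fields = { "title", "body" };
          MultiFieldQueryParser parser = new MultiFieldQueryParser(
              Version.LUCENE_29, fields, new StandardAnalyzer(Version.LUCENE_29));

          // Default OR_OPERATOR: each term may match in either field.
          System.out.println(parser.parse("term1 term2"));

          // AND_OPERATOR: every term must match in at least one of the fields.
          parser.setDefaultOperator(QueryParser.AND_OPERATOR);
          System.out.println(parser.parse("term1 term2"));
        }
      }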

    */ public MultiFieldQueryParser(Version matchVersion, String[] fields, Analyzer analyzer) { super(matchVersion, null, analyzer); this.fields = fields; } protected Query getFieldQuery(String field, String queryText, int slop) throws ParseException { if (field == null) { List clauses = new ArrayList(); for (int i = 0; i < fields.length; i++) { Query q = super.getFieldQuery(fields[i], queryText); if (q != null) { //If the user passes a map of boosts if (boosts != null) { //Get the boost from the map and apply them Float boost = (Float)boosts.get(fields[i]); if (boost != null) { q.setBoost(boost.floatValue()); } } applySlop(q,slop); clauses.add(new BooleanClause(q, BooleanClause.Occur.SHOULD)); } } if (clauses.size() == 0) // happens for stopwords return null; return getBooleanQuery(clauses, true); } Query q = super.getFieldQuery(field, queryText); applySlop(q,slop); return q; } private void applySlop(Query q, int slop) { if (q instanceof PhraseQuery) { ((PhraseQuery) q).setSlop(slop); } else if (q instanceof MultiPhraseQuery) { ((MultiPhraseQuery) q).setSlop(slop); } } protected Query getFieldQuery(String field, String queryText) throws ParseException { return getFieldQuery(field, queryText, 0); } protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { if (field == null) { List clauses = new ArrayList(); for (int i = 0; i < fields.length; i++) { clauses.add(new BooleanClause(getFuzzyQuery(fields[i], termStr, minSimilarity), BooleanClause.Occur.SHOULD)); } return getBooleanQuery(clauses, true); } return super.getFuzzyQuery(field, termStr, minSimilarity); } protected Query getPrefixQuery(String field, String termStr) throws ParseException { if (field == null) { List clauses = new ArrayList(); for (int i = 0; i < fields.length; i++) { clauses.add(new BooleanClause(getPrefixQuery(fields[i], termStr), BooleanClause.Occur.SHOULD)); } return getBooleanQuery(clauses, true); } return super.getPrefixQuery(field, termStr); } protected Query getWildcardQuery(String field, String termStr) throws ParseException { if (field == null) { List clauses = new ArrayList(); for (int i = 0; i < fields.length; i++) { clauses.add(new BooleanClause(getWildcardQuery(fields[i], termStr), BooleanClause.Occur.SHOULD)); } return getBooleanQuery(clauses, true); } return super.getWildcardQuery(field, termStr); } protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException { if (field == null) { List clauses = new ArrayList(); for (int i = 0; i < fields.length; i++) { clauses.add(new BooleanClause(getRangeQuery(fields[i], part1, part2, inclusive), BooleanClause.Occur.SHOULD)); } return getBooleanQuery(clauses, true); } return super.getRangeQuery(field, part1, part2, inclusive); } /** * Parses a query which searches on the fields specified. *

    If x fields are specified, this effectively constructs:

      (field1:query1) (field2:query2) (field3:query3)...(fieldx:queryx)
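A hedged, self-contained sketch of this static overload (field names, query strings and the analyzer are illustrative):

      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.MultiFieldQueryParser;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class PerFieldParseDemo {
        public static void main(String[] args) throws Exception {
          String[] queries = { "lucene", "indexing" };   // one query string per field
          String[] fields  = { "title", "body" };
          Query q = MultiFieldQueryParser.parse(
              Version.LUCENE_29, queries, fields, new StandardAnalyzer(Version.LUCENE_29));
          // Each per-field query is added as an optional clause,
          // e.g. title:lucene body:indexing
          System.out.println(q);
        }
      }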
    * @param queries Queries strings to parse * @param fields Fields to search on * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the queries array differs * from the length of the fields array * @deprecated Use {@link #parse(Version,String[],String[],Analyzer)} instead */ public static Query parse(String[] queries, String[] fields, Analyzer analyzer) throws ParseException { return parse(Version.LUCENE_24, queries, fields, analyzer); } /** * Parses a query which searches on the fields specified. *

    If x fields are specified, this effectively constructs:

      (field1:query1) (field2:query2) (field3:query3)...(fieldx:queryx)
    * @param matchVersion Lucene version to match; this is passed through to QueryParser. * @param queries Queries strings to parse * @param fields Fields to search on * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the queries array differs * from the length of the fields array */ public static Query parse(Version matchVersion, String[] queries, String[] fields, Analyzer analyzer) throws ParseException { if (queries.length != fields.length) throw new IllegalArgumentException("queries.length != fields.length"); BooleanQuery bQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { QueryParser qp = new QueryParser(matchVersion, fields[i], analyzer); Query q = qp.parse(queries[i]); if (q!=null && // q never null, just being defensive (!(q instanceof BooleanQuery) || ((BooleanQuery)q).getClauses().length>0)) { bQuery.add(q, BooleanClause.Occur.SHOULD); } } return bQuery; } /** * Parses a query, searching on the fields specified. * Use this if you need to specify certain fields as required, * and others as prohibited. *

       Usage:

         String[] fields = {"filename", "contents", "description"};
         BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD,
                                        BooleanClause.Occur.MUST,
                                        BooleanClause.Occur.MUST_NOT};
         MultiFieldQueryParser.parse("query", fields, flags, analyzer);

    The code above would construct a query:

      (filename:query) +(contents:query) -(description:query)
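The usage snippet above is not self-contained; a runnable version under illustrative assumptions (the analyzer choice and class name are not from this file) could look like:

      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.MultiFieldQueryParser;
      import org.apache.lucene.search.BooleanClause;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class FlagsParseDemo {
        public static void main(String[] args) throws Exception {
          String[] fields = { "filename", "contents", "description" };
          BooleanClause.Occur[] flags = { BooleanClause.Occur.SHOULD,
                                          BooleanClause.Occur.MUST,
                                          BooleanClause.Occur.MUST_NOT };
          Query q = MultiFieldQueryParser.parse(
              Version.LUCENE_29, "query", fields, flags,
              new StandardAnalyzer(Version.LUCENE_29));
          // Prints roughly: filename:query +contents:query -description:query
          System.out.println(q);
        }
      }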
    * * @param query Query string to parse * @param fields Fields to search on * @param flags Flags describing the fields * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the fields array differs * from the length of the flags array * @deprecated Use {@link #parse(Version, String, String[], BooleanClause.Occur[], Analyzer)} instead */ public static Query parse(String query, String[] fields, BooleanClause.Occur[] flags, Analyzer analyzer) throws ParseException { return parse(Version.LUCENE_24, query, fields, flags, analyzer); } /** * Parses a query, searching on the fields specified. * Use this if you need to specify certain fields as required, * and others as prohibited. *

       Usage:

         String[] fields = {"filename", "contents", "description"};
         BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD,
                                        BooleanClause.Occur.MUST,
                                        BooleanClause.Occur.MUST_NOT};
         MultiFieldQueryParser.parse("query", fields, flags, analyzer);

    The code above would construct a query:

      (filename:query) +(contents:query) -(description:query)
    * * @param matchVersion Lucene version to match; this is passed through to QueryParser. * @param query Query string to parse * @param fields Fields to search on * @param flags Flags describing the fields * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the fields array differs * from the length of the flags array */ public static Query parse(Version matchVersion, String query, String[] fields, BooleanClause.Occur[] flags, Analyzer analyzer) throws ParseException { if (fields.length != flags.length) throw new IllegalArgumentException("fields.length != flags.length"); BooleanQuery bQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { QueryParser qp = new QueryParser(matchVersion, fields[i], analyzer); Query q = qp.parse(query); if (q!=null && // q never null, just being defensive (!(q instanceof BooleanQuery) || ((BooleanQuery)q).getClauses().length>0)) { bQuery.add(q, flags[i]); } } return bQuery; } /** * Parses a query, searching on the fields specified. * Use this if you need to specify certain fields as required, * and others as prohibited. *

       Usage:

         String[] query = {"query1", "query2", "query3"};
         String[] fields = {"filename", "contents", "description"};
         BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD,
                                        BooleanClause.Occur.MUST,
                                        BooleanClause.Occur.MUST_NOT};
         MultiFieldQueryParser.parse(query, fields, flags, analyzer);

    The code above would construct a query:

      (filename:query1) +(contents:query2) -(description:query3)
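A runnable counterpart for this per-field variant, with the same caveats as the earlier sketches (the analyzer and class name are illustrative assumptions):

      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.MultiFieldQueryParser;
      import org.apache.lucene.search.BooleanClause;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class PerFieldFlagsParseDemo {
        public static void main(String[] args) throws Exception {
          String[] queries = { "query1", "query2", "query3" };
          String[] fields  = { "filename", "contents", "description" };
          BooleanClause.Occur[] flags = { BooleanClause.Occur.SHOULD,
                                          BooleanClause.Occur.MUST,
                                          BooleanClause.Occur.MUST_NOT };
          Query q = MultiFieldQueryParser.parse(
              Version.LUCENE_29, queries, fields, flags,
              new StandardAnalyzer(Version.LUCENE_29));
          // Prints roughly: filename:query1 +contents:query2 -description:query3
          System.out.println(q);
        }
      }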
    * * @param queries Queries string to parse * @param fields Fields to search on * @param flags Flags describing the fields * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the queries, fields, * and flags array differ * @deprecated Used {@link #parse(Version, String[], String[], BooleanClause.Occur[], Analyzer)} instead */ public static Query parse(String[] queries, String[] fields, BooleanClause.Occur[] flags, Analyzer analyzer) throws ParseException { return parse(Version.LUCENE_24, queries, fields, flags, analyzer); } /** * Parses a query, searching on the fields specified. * Use this if you need to specify certain fields as required, * and others as prohibited. *

       Usage:

         String[] query = {"query1", "query2", "query3"};
         String[] fields = {"filename", "contents", "description"};
         BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD,
                                        BooleanClause.Occur.MUST,
                                        BooleanClause.Occur.MUST_NOT};
         MultiFieldQueryParser.parse(query, fields, flags, analyzer);

    The code above would construct a query:

      (filename:query1) +(contents:query2) -(description:query3)
    * * @param matchVersion Lucene version to match; this is passed through to QueryParser. * @param queries Queries string to parse * @param fields Fields to search on * @param flags Flags describing the fields * @param analyzer Analyzer to use * @throws ParseException if query parsing fails * @throws IllegalArgumentException if the length of the queries, fields, * and flags array differ */ public static Query parse(Version matchVersion, String[] queries, String[] fields, BooleanClause.Occur[] flags, Analyzer analyzer) throws ParseException { if (!(queries.length == fields.length && queries.length == flags.length)) throw new IllegalArgumentException("queries, fields, and flags array have have different length"); BooleanQuery bQuery = new BooleanQuery(); for (int i = 0; i < fields.length; i++) { QueryParser qp = new QueryParser(matchVersion, fields[i], analyzer); Query q = qp.parse(queries[i]); if (q!=null && // q never null, just being defensive (!(q instanceof BooleanQuery) || ((BooleanQuery)q).getClauses().length>0)) { bQuery.add(q, flags[i]); } } return bQuery; } } lucene-2.9.4/src/java/org/apache/lucene/queryParser/QueryParser.jj0000644000175000017500000013362511474320221025605 0ustar janpascaljanpascal/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ options { STATIC=false; JAVA_UNICODE_ESCAPE=true; USER_CHAR_STREAM=true; } PARSER_BEGIN(QueryParser) package org.apache.lucene.queryParser; import java.io.IOException; import java.io.StringReader; import java.text.Collator; import java.text.DateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Vector; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CachingTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.document.DateField; import org.apache.lucene.document.DateTools; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Parameter; import org.apache.lucene.util.Version; /** * This class is generated by JavaCC. 
The most important method is {@link #parse(String)}.

    The syntax for query strings is as follows: a Query is a series of clauses.
    A clause may be prefixed by:

      • a plus (+) or a minus (-) sign, indicating that the clause is required
        or prohibited respectively; or
      • a term followed by a colon, indicating the field to be searched.
        This enables one to construct queries which search multiple fields.

    A clause may be either:

      • a term, indicating all the documents that contain this term; or
      • a nested query, enclosed in parentheses. Note that this may be used
        with a +/- prefix to require any of a set of terms.

    Thus, in BNF, the query grammar is:

      Query  ::= ( Clause )*
      Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
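The grammar above can be exercised directly; a minimal sketch (the default field, analyzer and query string are illustrative assumptions, not from this file):

      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.queryParser.QueryParser;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class QuerySyntaxDemo {
        public static void main(String[] args) throws Exception {
          QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
              new StandardAnalyzer(Version.LUCENE_29));
          // A required term, a prohibited term, a field-scoped term and a nested clause.
          Query q = parser.parse("+lucene -jakarta title:index (apache OR solr)");
          System.out.println(q);
        }
      }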

    Examples of appropriately formatted queries can be found in the query syntax
    documentation.

    In {@link TermRangeQuery}s, QueryParser tries to detect date values, e.g.
    date:[6/1/2005 TO 6/4/2005] produces a range query that searches for "date"
    fields between 2005-06-01 and 2005-06-04. Note that the format of the
    accepted input depends on {@link #setLocale(Locale) the locale}. By default
    a date is converted into a search term using the deprecated
    {@link DateField} for compatibility reasons. To use the new
    {@link DateTools} to convert dates, a
    {@link org.apache.lucene.document.DateTools.Resolution} has to be set.

    The date resolution that shall be used for RangeQueries can be set using
    {@link #setDateResolution(DateTools.Resolution)} or
    {@link #setDateResolution(String, DateTools.Resolution)}. The former sets
    the default date resolution for all fields, whereas the latter can be used
    to set field-specific date resolutions. Field-specific date resolutions
    take, if set, precedence over the default date resolution.

    If you use neither {@link DateField} nor {@link DateTools} in your index,
    you can create your own query parser that inherits QueryParser and
    overrides {@link #getRangeQuery(String, String, String, boolean)} to use a
    different method for date conversion.

    Note that QueryParser is not thread-safe.

    NOTE: there is a new QueryParser in contrib, which matches the same syntax
    as this class, but is more modular, enabling substantial customization to
    how a query is created.

    NOTE: You must specify the required {@link Version} compatibility when
    creating QueryParser.
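Putting the Version requirement and the date-resolution handling together, a small hedged sketch (field names and the query string are illustrative; the date format follows the configured locale):

      import java.util.Locale;
      import org.apache.lucene.analysis.standard.StandardAnalyzer;
      import org.apache.lucene.document.DateTools;
      import org.apache.lucene.queryParser.QueryParser;
      import org.apache.lucene.search.Query;
      import org.apache.lucene.util.Version;

      public class DateRangeDemo {
        public static void main(String[] args) throws Exception {
          QueryParser parser = new QueryParser(Version.LUCENE_29, "contents",
              new StandardAnalyzer(Version.LUCENE_29));
          parser.setLocale(Locale.US);                  // date strings parsed with the US short format
          parser.setDateResolution("date", DateTools.Resolution.DAY);
          // Endpoints are rewritten with DateTools at day resolution, e.g. 20050601 and 20050604.
          Query q = parser.parse("date:[6/1/2005 TO 6/4/2005]");
          System.out.println(q);
        }
      }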

    */ public class QueryParser { private static final int CONJ_NONE = 0; private static final int CONJ_AND = 1; private static final int CONJ_OR = 2; private static final int MOD_NONE = 0; private static final int MOD_NOT = 10; private static final int MOD_REQ = 11; // make it possible to call setDefaultOperator() without accessing // the nested class: /** Alternative form of QueryParser.Operator.AND */ public static final Operator AND_OPERATOR = Operator.AND; /** Alternative form of QueryParser.Operator.OR */ public static final Operator OR_OPERATOR = Operator.OR; /** The actual operator that parser uses to combine query terms */ private Operator operator = OR_OPERATOR; boolean lowercaseExpandedTerms = true; MultiTermQuery.RewriteMethod multiTermRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; boolean allowLeadingWildcard = false; boolean enablePositionIncrements = true; Analyzer analyzer; String field; int phraseSlop = 0; float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; Locale locale = Locale.getDefault(); // the default date resolution DateTools.Resolution dateResolution = null; // maps field names to date resolutions Map fieldToDateResolution = null; // The collator to use when determining range inclusion, // for use when constructing RangeQuerys. Collator rangeCollator = null; /** The default operator for parsing queries. * Use {@link QueryParser#setDefaultOperator} to change it. */ static public final class Operator extends Parameter { private Operator(String name) { super(name); } static public final Operator OR = new Operator("OR"); static public final Operator AND = new Operator("AND"); } /** Constructs a query parser. * @param f the default field for query terms. * @param a used to find terms in the query text. * @deprecated Use {@link #QueryParser(Version, String, Analyzer)} instead */ public QueryParser(String f, Analyzer a) { this(Version.LUCENE_24, f, a); } /** Constructs a query parser. * @param matchVersion Lucene version to match. See {@link above) * @param f the default field for query terms. * @param a used to find terms in the query text. */ public QueryParser(Version matchVersion, String f, Analyzer a) { this(new FastCharStream(new StringReader(""))); analyzer = a; field = f; if (matchVersion.onOrAfter(Version.LUCENE_29)) { enablePositionIncrements = true; } else { enablePositionIncrements = false; } } /** Parses a query string, returning a {@link org.apache.lucene.search.Query}. * @param query the query string to be parsed. * @throws ParseException if the parsing fails */ public Query parse(String query) throws ParseException { ReInit(new FastCharStream(new StringReader(query))); try { // TopLevelQuery is a Query followed by the end-of-input (EOF) Query res = TopLevelQuery(field); return res!=null ? res : newBooleanQuery(false); } catch (ParseException tme) { // rethrow to include the original query: ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (TokenMgrError tme) { ParseException e = new ParseException("Cannot parse '" +query+ "': " + tme.getMessage()); e.initCause(tme); throw e; } catch (BooleanQuery.TooManyClauses tmc) { ParseException e = new ParseException("Cannot parse '" +query+ "': too many boolean clauses"); e.initCause(tmc); throw e; } } /** * @return Returns the analyzer. */ public Analyzer getAnalyzer() { return analyzer; } /** * @return Returns the field. 
*/ public String getField() { return field; } /** * Get the minimal similarity for fuzzy queries. */ public float getFuzzyMinSim() { return fuzzyMinSim; } /** * Set the minimum similarity for fuzzy queries. * Default is 0.5f. */ public void setFuzzyMinSim(float fuzzyMinSim) { this.fuzzyMinSim = fuzzyMinSim; } /** * Get the prefix length for fuzzy queries. * @return Returns the fuzzyPrefixLength. */ public int getFuzzyPrefixLength() { return fuzzyPrefixLength; } /** * Set the prefix length for fuzzy queries. Default is 0. * @param fuzzyPrefixLength The fuzzyPrefixLength to set. */ public void setFuzzyPrefixLength(int fuzzyPrefixLength) { this.fuzzyPrefixLength = fuzzyPrefixLength; } /** * Sets the default slop for phrases. If zero, then exact phrase matches * are required. Default value is zero. */ public void setPhraseSlop(int phraseSlop) { this.phraseSlop = phraseSlop; } /** * Gets the default slop for phrases. */ public int getPhraseSlop() { return phraseSlop; } /** * Set to true to allow leading wildcard characters. *

    When set, * or ? are allowed as the first character of a PrefixQuery and
    WildcardQuery. Note that this can produce very slow queries on big indexes.

    * Default: false. */ public void setAllowLeadingWildcard(boolean allowLeadingWildcard) { this.allowLeadingWildcard = allowLeadingWildcard; } /** * @see #setAllowLeadingWildcard(boolean) */ public boolean getAllowLeadingWildcard() { return allowLeadingWildcard; } /** * Set to true to enable position increments in result query. *

    When set, result phrase and multi-phrase queries will be aware of position
    increments. Useful when e.g. a StopFilter increases the position increment
    of the token that follows an omitted token.

    * Default: false. */ public void setEnablePositionIncrements(boolean enable) { this.enablePositionIncrements = enable; } /** * @see #setEnablePositionIncrements(boolean) */ public boolean getEnablePositionIncrements() { return enablePositionIncrements; } /** * Sets the boolean operator of the QueryParser. * In default mode (OR_OPERATOR) terms without any modifiers * are considered optional: for example capital of Hungary is equal to * capital OR of OR Hungary.
    * In AND_OPERATOR mode terms are considered to be in conjunction: the * above mentioned query is parsed as capital AND of AND Hungary */ public void setDefaultOperator(Operator op) { this.operator = op; } /** * Gets implicit operator setting, which will be either AND_OPERATOR * or OR_OPERATOR. */ public Operator getDefaultOperator() { return operator; } /** * Whether terms of wildcard, prefix, fuzzy and range queries are to be automatically * lower-cased or not. Default is true. */ public void setLowercaseExpandedTerms(boolean lowercaseExpandedTerms) { this.lowercaseExpandedTerms = lowercaseExpandedTerms; } /** * @see #setLowercaseExpandedTerms(boolean) */ public boolean getLowercaseExpandedTerms() { return lowercaseExpandedTerms; } /** * @deprecated Please use {@link #setMultiTermRewriteMethod} instead. */ public void setUseOldRangeQuery(boolean useOldRangeQuery) { if (useOldRangeQuery) { setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); } else { setMultiTermRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); } } /** * @deprecated Please use {@link #getMultiTermRewriteMethod} instead. */ public boolean getUseOldRangeQuery() { if (getMultiTermRewriteMethod() == MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) { return true; } else { return false; } } /** * By default QueryParser uses {@link MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} * when creating a PrefixQuery, WildcardQuery or RangeQuery. This implementation is generally preferable because it * a) Runs faster b) Does not have the scarcity of terms unduly influence score * c) avoids any "TooManyBooleanClauses" exception. * However, if your application really needs to use the * old-fashioned BooleanQuery expansion rewriting and the above * points are not relevant then use this to change * the rewrite method. */ public void setMultiTermRewriteMethod(MultiTermQuery.RewriteMethod method) { multiTermRewriteMethod = method; } /** * @see #setMultiTermRewriteMethod */ public MultiTermQuery.RewriteMethod getMultiTermRewriteMethod() { return multiTermRewriteMethod; } /** * Set locale used by date range parsing. */ public void setLocale(Locale locale) { this.locale = locale; } /** * Returns current locale, allowing access by subclasses. */ public Locale getLocale() { return locale; } /** * Sets the default date resolution used by RangeQueries for fields for which no * specific date resolutions has been set. Field specific resolutions can be set * with {@link #setDateResolution(String, DateTools.Resolution)}. * * @param dateResolution the default date resolution to set */ public void setDateResolution(DateTools.Resolution dateResolution) { this.dateResolution = dateResolution; } /** * Sets the date resolution used by RangeQueries for a specific field. * * @param fieldName field for which the date resolution is to be set * @param dateResolution date resolution to set */ public void setDateResolution(String fieldName, DateTools.Resolution dateResolution) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // lazily initialize HashMap fieldToDateResolution = new HashMap(); } fieldToDateResolution.put(fieldName, dateResolution); } /** * Returns the date resolution that is used by RangeQueries for the given field. * Returns null, if no default or field specific date resolution has been set * for the given field. 
* */ public DateTools.Resolution getDateResolution(String fieldName) { if (fieldName == null) { throw new IllegalArgumentException("Field cannot be null."); } if (fieldToDateResolution == null) { // no field specific date resolutions set; return default date resolution instead return this.dateResolution; } DateTools.Resolution resolution = (DateTools.Resolution) fieldToDateResolution.get(fieldName); if (resolution == null) { // no date resolutions set for the given field; return default date resolution instead resolution = this.dateResolution; } return resolution; } /** * Sets the collator used to determine index term inclusion in ranges * for RangeQuerys. *

    * WARNING: Setting the rangeCollator to a non-null * collator using this method will cause every single index Term in the * Field referenced by lowerTerm and/or upperTerm to be examined. * Depending on the number of index Terms in this Field, the operation could * be very slow. * * @param rc the collator to use when constructing RangeQuerys */ public void setRangeCollator(Collator rc) { rangeCollator = rc; } /** * @return the collator used to determine index term inclusion in ranges * for RangeQuerys. */ public Collator getRangeCollator() { return rangeCollator; } /** * @deprecated use {@link #addClause(List, int, int, Query)} instead. */ protected void addClause(Vector clauses, int conj, int mods, Query q) { addClause((List) clauses, conj, mods, q); } protected void addClause(List clauses, int conj, int mods, Query q) { boolean required, prohibited; // If this term is introduced by AND, make the preceding term required, // unless it's already prohibited if (clauses.size() > 0 && conj == CONJ_AND) { BooleanClause c = (BooleanClause) clauses.get(clauses.size()-1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.MUST); } if (clauses.size() > 0 && operator == AND_OPERATOR && conj == CONJ_OR) { // If this term is introduced by OR, make the preceding term optional, // unless it's prohibited (that means we leave -a OR b but +a OR b-->a OR b) // notice if the input is a OR b, first term is parsed as required; without // this modification a OR b would parsed as +a OR b BooleanClause c = (BooleanClause) clauses.get(clauses.size()-1); if (!c.isProhibited()) c.setOccur(BooleanClause.Occur.SHOULD); } // We might have been passed a null query; the term might have been // filtered away by the analyzer. if (q == null) return; if (operator == OR_OPERATOR) { // We set REQUIRED if we're introduced by AND or +; PROHIBITED if // introduced by NOT or -; make sure not to set both. 
prohibited = (mods == MOD_NOT); required = (mods == MOD_REQ); if (conj == CONJ_AND && !prohibited) { required = true; } } else { // We set PROHIBITED if we're introduced by NOT or -; We set REQUIRED // if not PROHIBITED and not introduced by OR prohibited = (mods == MOD_NOT); required = (!prohibited && conj != CONJ_OR); } if (required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST)); else if (!required && !prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.SHOULD)); else if (!required && prohibited) clauses.add(newBooleanClause(q, BooleanClause.Occur.MUST_NOT)); else throw new RuntimeException("Clause cannot be both required and prohibited"); } /** * @exception ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText) throws ParseException { // Use the analyzer to get all the tokens, and then build a TermQuery, // PhraseQuery, or nothing based on the term count TokenStream source; try { source = analyzer.reusableTokenStream(field, new StringReader(queryText)); source.reset(); } catch (IOException e) { source = analyzer.tokenStream(field, new StringReader(queryText)); } CachingTokenFilter buffer = new CachingTokenFilter(source); TermAttribute termAtt = null; PositionIncrementAttribute posIncrAtt = null; int numTokens = 0; boolean success = false; try { buffer.reset(); success = true; } catch (IOException e) { // success==false if we hit an exception } if (success) { if (buffer.hasAttribute(TermAttribute.class)) { termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class); } if (buffer.hasAttribute(PositionIncrementAttribute.class)) { posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class); } } int positionCount = 0; boolean severalTokensAtSamePosition = false; boolean hasMoreTokens = false; if (termAtt != null) { try { hasMoreTokens = buffer.incrementToken(); while (hasMoreTokens) { numTokens++; int positionIncrement = (posIncrAtt != null) ? 
posIncrAtt.getPositionIncrement() : 1; if (positionIncrement != 0) { positionCount += positionIncrement; } else { severalTokensAtSamePosition = true; } hasMoreTokens = buffer.incrementToken(); } } catch (IOException e) { // ignore } } try { // rewind the buffer stream buffer.reset(); // close original stream - all tokens buffered source.close(); } catch (IOException e) { // ignore } if (numTokens == 0) return null; else if (numTokens == 1) { String term = null; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } return newTermQuery(new Term(field, term)); } else { if (severalTokensAtSamePosition) { if (positionCount == 1) { // no phrase query: BooleanQuery q = newBooleanQuery(true); for (int i = 0; i < numTokens; i++) { String term = null; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); } catch (IOException e) { // safe to ignore, because we know the number of tokens } Query currentQuery = newTermQuery( new Term(field, term)); q.add(currentQuery, BooleanClause.Occur.SHOULD); } return q; } else { // phrase query: MultiPhraseQuery mpq = newMultiPhraseQuery(); mpq.setSlop(phraseSlop); List multiTerms = new ArrayList(); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (positionIncrement > 0 && multiTerms.size() > 0) { if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); } else { mpq.add((Term[])multiTerms.toArray(new Term[0])); } multiTerms.clear(); } position += positionIncrement; multiTerms.add(new Term(field, term)); } if (enablePositionIncrements) { mpq.add((Term[])multiTerms.toArray(new Term[0]),position); } else { mpq.add((Term[])multiTerms.toArray(new Term[0])); } return mpq; } } else { PhraseQuery pq = newPhraseQuery(); pq.setSlop(phraseSlop); int position = -1; for (int i = 0; i < numTokens; i++) { String term = null; int positionIncrement = 1; try { boolean hasNext = buffer.incrementToken(); assert hasNext == true; term = termAtt.term(); if (posIncrAtt != null) { positionIncrement = posIncrAtt.getPositionIncrement(); } } catch (IOException e) { // safe to ignore, because we know the number of tokens } if (enablePositionIncrements) { position += positionIncrement; pq.add(new Term(field, term),position); } else { pq.add(new Term(field, term)); } } return pq; } } } /** * Base implementation delegates to {@link #getFieldQuery(String,String)}. * This method may be overridden, for example, to return * a SpanNearQuery instead of a PhraseQuery. 
* * @exception ParseException throw in overridden method to disallow */ protected Query getFieldQuery(String field, String queryText, int slop) throws ParseException { Query query = getFieldQuery(field, queryText); if (query instanceof PhraseQuery) { ((PhraseQuery) query).setSlop(slop); } if (query instanceof MultiPhraseQuery) { ((MultiPhraseQuery) query).setSlop(slop); } return query; } /** * @exception ParseException throw in overridden method to disallow */ protected Query getRangeQuery(String field, String part1, String part2, boolean inclusive) throws ParseException { if (lowercaseExpandedTerms) { part1 = part1.toLowerCase(); part2 = part2.toLowerCase(); } try { DateFormat df = DateFormat.getDateInstance(DateFormat.SHORT, locale); df.setLenient(true); Date d1 = df.parse(part1); Date d2 = df.parse(part2); if (inclusive) { // The user can only specify the date, not the time, so make sure // the time is set to the latest possible time of that date to really // include all documents: Calendar cal = Calendar.getInstance(locale); cal.setTime(d2); cal.set(Calendar.HOUR_OF_DAY, 23); cal.set(Calendar.MINUTE, 59); cal.set(Calendar.SECOND, 59); cal.set(Calendar.MILLISECOND, 999); d2 = cal.getTime(); } DateTools.Resolution resolution = getDateResolution(field); if (resolution == null) { // no default or field specific date resolution has been set, // use deprecated DateField to maintain compatibility with // pre-1.9 Lucene versions. part1 = DateField.dateToString(d1); part2 = DateField.dateToString(d2); } else { part1 = DateTools.dateToString(d1, resolution); part2 = DateTools.dateToString(d2, resolution); } } catch (Exception e) { } return newRangeQuery(field, part1, part2, inclusive); } /** * Builds a new BooleanQuery instance * @param disableCoord disable coord * @return new BooleanQuery instance */ protected BooleanQuery newBooleanQuery(boolean disableCoord) { return new BooleanQuery(disableCoord); } /** * Builds a new BooleanClause instance * @param q sub query * @param occur how this clause should occur when matching documents * @return new BooleanClause instance */ protected BooleanClause newBooleanClause(Query q, BooleanClause.Occur occur) { return new BooleanClause(q, occur); } /** * Builds a new TermQuery instance * @param term term * @return new TermQuery instance */ protected Query newTermQuery(Term term){ return new TermQuery(term); } /** * Builds a new PhraseQuery instance * @return new PhraseQuery instance */ protected PhraseQuery newPhraseQuery(){ return new PhraseQuery(); } /** * Builds a new MultiPhraseQuery instance * @return new MultiPhraseQuery instance */ protected MultiPhraseQuery newMultiPhraseQuery(){ return new MultiPhraseQuery(); } /** * Builds a new PrefixQuery instance * @param prefix Prefix term * @return new PrefixQuery instance */ protected Query newPrefixQuery(Term prefix){ PrefixQuery query = new PrefixQuery(prefix); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Builds a new FuzzyQuery instance * @param term Term * @param minimumSimilarity minimum similarity * @param prefixLength prefix length * @return new FuzzyQuery Instance */ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score rewrite return new FuzzyQuery(term,minimumSimilarity,prefixLength); } /** * Builds a new TermRangeQuery instance * @param field Field * @param part1 min * @param part2 max * @param inclusive true if range is inclusive * @return new TermRangeQuery instance */ protected Query 
newRangeQuery(String field, String part1, String part2, boolean inclusive) { final TermRangeQuery query = new TermRangeQuery(field, part1, part2, inclusive, inclusive, rangeCollator); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Builds a new MatchAllDocsQuery instance * @return new MatchAllDocsQuery instance */ protected Query newMatchAllDocsQuery() { return new MatchAllDocsQuery(); } /** * Builds a new WildcardQuery instance * @param t wildcard term * @return new WildcardQuery instance */ protected Query newWildcardQuery(Term t) { WildcardQuery query = new WildcardQuery(t); query.setRewriteMethod(multiTermRewriteMethod); return query; } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow * @deprecated use {@link #getBooleanQuery(List)} instead */ protected Query getBooleanQuery(Vector clauses) throws ParseException { return getBooleanQuery((List) clauses, false); } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow */ protected Query getBooleanQuery(List clauses) throws ParseException { return getBooleanQuery(clauses, false); } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * @param disableCoord true if coord scoring should be disabled. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow * @deprecated use {@link #getBooleanQuery(List, boolean)} instead */ protected Query getBooleanQuery(Vector clauses, boolean disableCoord) throws ParseException { return getBooleanQuery((List) clauses, disableCoord); } /** * Factory method for generating query, given a set of clauses. * By default creates a boolean query composed of clauses passed in. * * Can be overridden by extending classes, to modify query being * returned. * * @param clauses List that contains {@link BooleanClause} instances * to join. * @param disableCoord true if coord scoring should be disabled. * * @return Resulting {@link Query} object. * @exception ParseException throw in overridden method to disallow */ protected Query getBooleanQuery(List clauses, boolean disableCoord) throws ParseException { if (clauses.size()==0) { return null; // all clause words were filtered away by the analyzer. } BooleanQuery query = newBooleanQuery(disableCoord); for (int i = 0; i < clauses.size(); i++) { query.add((BooleanClause)clauses.get(i)); } return query; } /** * Factory method for generating a query. Called when parser * parses an input term token that contains one or more wildcard * characters (? and *), but is not a prefix term token (one * that has just a single * character at the end) *

    * Depending on settings, prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *

    * Can be overridden by extending classes, to provide custom handling for * wildcard queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token that contains one or more wild card * characters (? or *), but is not simple prefix term * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getWildcardQuery(String field, String termStr) throws ParseException { if ("*".equals(field)) { if ("*".equals(termStr)) return newMatchAllDocsQuery(); } if (!allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newWildcardQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses an input term * token that uses prefix notation; that is, contains a single '*' wildcard * character as its last character. Since this is a special case * of generic wildcard term, and such a query can be optimized easily, * this usually results in a different query object. *

    * Depending on settings, a prefix term may be lower-cased * automatically. It will not go through the default Analyzer, * however, since normal Analyzers are unlikely to work properly * with wildcard templates. *

    * Can be overridden by extending classes, to provide custom handling for * wild card queries, which may be necessary due to missing analyzer calls. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * (without trailing '*' character!) * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getPrefixQuery(String field, String termStr) throws ParseException { if (!allowLeadingWildcard && termStr.startsWith("*")) throw new ParseException("'*' not allowed as first character in PrefixQuery"); if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newPrefixQuery(t); } /** * Factory method for generating a query (similar to * {@link #getWildcardQuery}). Called when parser parses * an input term token that has the fuzzy suffix (~) appended. * * @param field Name of the field query will use. * @param termStr Term token to use for building term for the query * * @return Resulting {@link Query} built for the term * @exception ParseException throw in overridden method to disallow */ protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException { if (lowercaseExpandedTerms) { termStr = termStr.toLowerCase(); } Term t = new Term(field, termStr); return newFuzzyQuery(t, minSimilarity, fuzzyPrefixLength); } /** * Returns a String where the escape char has been * removed, or kept only once if there was a double escape. * * Supports escaped unicode characters, e. g. translates * \\u0041 to A. * */ private String discardEscapeChar(String input) throws ParseException { // Create char array to hold unescaped char sequence char[] output = new char[input.length()]; // The length of the output can be less than the input // due to discarded escape chars. This variable holds // the actual length of the output int length = 0; // We remember whether the last processed character was // an escape character boolean lastCharWasEscapeChar = false; // The multiplier the current unicode digit must be multiplied with. // E. g. the first digit must be multiplied with 16^3, the second with 16^2... 
int codePointMultiplier = 0; // Used to calculate the codepoint of the escaped unicode character int codePoint = 0; for (int i = 0; i < input.length(); i++) { char curChar = input.charAt(i); if (codePointMultiplier > 0) { codePoint += hexToInt(curChar) * codePointMultiplier; codePointMultiplier >>>= 4; if (codePointMultiplier == 0) { output[length++] = (char)codePoint; codePoint = 0; } } else if (lastCharWasEscapeChar) { if (curChar == 'u') { // found an escaped unicode character codePointMultiplier = 16 * 16 * 16; } else { // this character was escaped output[length] = curChar; length++; } lastCharWasEscapeChar = false; } else { if (curChar == '\\') { lastCharWasEscapeChar = true; } else { output[length] = curChar; length++; } } } if (codePointMultiplier > 0) { throw new ParseException("Truncated unicode escape sequence."); } if (lastCharWasEscapeChar) { throw new ParseException("Term can not end with escape character."); } return new String(output, 0, length); } /** Returns the numeric value of the hexadecimal character */ private static final int hexToInt(char c) throws ParseException { if ('0' <= c && c <= '9') { return c - '0'; } else if ('a' <= c && c <= 'f'){ return c - 'a' + 10; } else if ('A' <= c && c <= 'F') { return c - 'A' + 10; } else { throw new ParseException("None-hex character in unicode escape sequence: " + c); } } /** * Returns a String where those characters that QueryParser * expects to be escaped are escaped by a preceding \. */ public static String escape(String s) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); // These characters are part of the query syntax and must be escaped if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&') { sb.append('\\'); } sb.append(c); } return sb.toString(); } /** * Command line tool to test QueryParser, using {@link org.apache.lucene.analysis.SimpleAnalyzer}. * Usage:
    * java org.apache.lucene.queryParser.QueryParser <input> */ public static void main(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: java org.apache.lucene.queryParser.QueryParser "); System.exit(0); } QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "field", new org.apache.lucene.analysis.SimpleAnalyzer()); Query q = qp.parse(args[0]); System.out.println(q.toString("field")); } } PARSER_END(QueryParser) /* ***************** */ /* Token Definitions */ /* ***************** */ <*> TOKEN : { <#_NUM_CHAR: ["0"-"9"] > // every character that follows a backslash is considered as an escaped character | <#_ESCAPED_CHAR: "\\" ~[] > | <#_TERM_START_CHAR: ( ~[ " ", "\t", "\n", "\r", "\u3000", "+", "-", "!", "(", ")", ":", "^", "[", "]", "\"", "{", "}", "~", "*", "?", "\\" ] | <_ESCAPED_CHAR> ) > | <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> | "-" | "+" ) > | <#_WHITESPACE: ( " " | "\t" | "\n" | "\r" | "\u3000") > | <#_QUOTED_CHAR: ( ~[ "\"", "\\" ] | <_ESCAPED_CHAR> ) > } SKIP : { < <_WHITESPACE>> } TOKEN : { | | | | | | | | | : Boost | )* "\""> | (<_TERM_CHAR>)* > | )+ ( "." (<_NUM_CHAR>)+ )? )? > | (<_TERM_CHAR>)* "*" ) > | | [ "*", "?" ]) (<_TERM_CHAR> | ( [ "*", "?" ] ))* > | : RangeIn | : RangeEx } TOKEN : { )+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT } TOKEN : { | : DEFAULT | | } TOKEN : { | : DEFAULT | | } // * Query ::= ( Clause )* // * Clause ::= ["+", "-"] [ ":"] ( | "(" Query ")" ) int Conjunction() : { int ret = CONJ_NONE; } { [ { ret = CONJ_AND; } | { ret = CONJ_OR; } ] { return ret; } } int Modifiers() : { int ret = MOD_NONE; } { [ { ret = MOD_REQ; } | { ret = MOD_NOT; } | { ret = MOD_NOT; } ] { return ret; } } // This makes sure that there is no garbage after the query string Query TopLevelQuery(String field) : { Query q; } { q=Query(field) { return q; } } Query Query(String field) : { List clauses = new ArrayList(); Query q, firstQuery=null; int conj, mods; } { mods=Modifiers() q=Clause(field) { addClause(clauses, CONJ_NONE, mods, q); if (mods == MOD_NONE) firstQuery=q; } ( conj=Conjunction() mods=Modifiers() q=Clause(field) { addClause(clauses, conj, mods, q); } )* { if (clauses.size() == 1 && firstQuery != null) return firstQuery; else { return getBooleanQuery(clauses); } } } Query Clause(String field) : { Query q; Token fieldToken=null, boost=null; } { [ LOOKAHEAD(2) ( fieldToken= {field=discardEscapeChar(fieldToken.image);} | {field="*";} ) ] ( q=Term(field) | q=Query(field) ( boost=)? 
) { if (boost != null) { float f = (float)1.0; try { f = Float.valueOf(boost.image).floatValue(); q.setBoost(f); } catch (Exception ignored) { } } return q; } } Query Term(String field) : { Token term, boost=null, fuzzySlop=null, goop1, goop2; boolean prefix = false; boolean wildcard = false; boolean fuzzy = false; Query q; } { ( ( term= | term= { wildcard=true; } | term= { prefix=true; } | term= { wildcard=true; } | term= ) [ fuzzySlop= { fuzzy=true; } ] [ boost= [ fuzzySlop= { fuzzy=true; } ] ] { String termImage=discardEscapeChar(term.image); if (wildcard) { q = getWildcardQuery(field, termImage); } else if (prefix) { q = getPrefixQuery(field, discardEscapeChar(term.image.substring (0, term.image.length()-1))); } else if (fuzzy) { float fms = fuzzyMinSim; try { fms = Float.valueOf(fuzzySlop.image.substring(1)).floatValue(); } catch (Exception ignored) { } if(fms < 0.0f || fms > 1.0f){ throw new ParseException("Minimum similarity for a FuzzyQuery has to be between 0.0f and 1.0f !"); } q = getFuzzyQuery(field, termImage,fms); } else { q = getFieldQuery(field, termImage); } } | ( ( goop1=|goop1= ) [ ] ( goop2=|goop2= ) ) [ boost= ] { if (goop1.kind == RANGEIN_QUOTED) { goop1.image = goop1.image.substring(1, goop1.image.length()-1); } if (goop2.kind == RANGEIN_QUOTED) { goop2.image = goop2.image.substring(1, goop2.image.length()-1); } q = getRangeQuery(field, discardEscapeChar(goop1.image), discardEscapeChar(goop2.image), true); } | ( ( goop1=|goop1= ) [ ] ( goop2=|goop2= ) ) [ boost= ] { if (goop1.kind == RANGEEX_QUOTED) { goop1.image = goop1.image.substring(1, goop1.image.length()-1); } if (goop2.kind == RANGEEX_QUOTED) { goop2.image = goop2.image.substring(1, goop2.image.length()-1); } q = getRangeQuery(field, discardEscapeChar(goop1.image), discardEscapeChar(goop2.image), false); } | term= [ fuzzySlop= ] [ boost= ] { int s = phraseSlop; if (fuzzySlop != null) { try { s = Float.valueOf(fuzzySlop.image.substring(1)).intValue(); } catch (Exception ignored) { } } q = getFieldQuery(field, discardEscapeChar(term.image.substring(1, term.image.length()-1)), s); } ) { if (boost != null) { float f = (float) 1.0; try { f = Float.valueOf(boost.image).floatValue(); } catch (Exception ignored) { /* Should this be handled somehow? (defaults to "no boost", if * boost number is invalid) */ } // avoid boosting null queries, such as those caused by stop words if (q != null) { q.setBoost(f); } } return q; } } lucene-2.9.4/src/java/org/apache/lucene/queryParser/package.html0000644000175000017500000000276211474320221025254 0ustar janpascaljanpascal A simple query parser implemented with JavaCC.

    Note that JavaCC defines lots of public classes, methods and fields that do not need to be public.  These clutter the documentation.  Sorry.

    Note that because JavaCC defines a class named Token, org.apache.lucene.analysis.Token must always be fully qualified in source code in this package.
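    The protected factory methods shown in the QueryParser source above (getFieldQuery, getWildcardQuery, getPrefixQuery, newTermQuery and the related new*Query methods) are the intended extension points. As a rough, hypothetical sketch (the class name and error message are made up and are not part of Lucene), a subclass that refuses wildcard queries could look like this:

        import org.apache.lucene.analysis.Analyzer;
        import org.apache.lucene.queryParser.ParseException;
        import org.apache.lucene.queryParser.QueryParser;
        import org.apache.lucene.search.Query;
        import org.apache.lucene.util.Version;

        public class NoWildcardQueryParser extends QueryParser {

          public NoWildcardQueryParser(String field, Analyzer analyzer) {
            super(Version.LUCENE_CURRENT, field, analyzer);
          }

          // Reject wildcard terms instead of building a WildcardQuery;
          // every other query type is handled by the superclass as usual.
          protected Query getWildcardQuery(String field, String termStr) throws ParseException {
            throw new ParseException("wildcard queries are disabled: " + termStr);
          }
        }

    Only the factory method is overridden; the JavaCC grammar itself stays untouched.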

    NOTE: contrib/queryparser has an alternative queryparser that matches the syntax of this one, but is more modular, enabling substantial customization to how a query is created. lucene-2.9.4/src/java/org/apache/lucene/messages/0000755000175000017500000000000011554106562022262 5ustar janpascaljanpascallucene-2.9.4/src/java/org/apache/lucene/messages/MessageImpl.java0000644000175000017500000000367511474320230025335 0ustar janpascaljanpascalpackage org.apache.lucene.messages; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Locale; /** * Default implementation of Message interface. * For Native Language Support (NLS), system of software internationalization. */ public class MessageImpl implements Message { private static final long serialVersionUID = -3077643314630884523L; private String key; private Object[] arguments = new Object[0]; public MessageImpl(String key) { this.key = key; } public MessageImpl(String key, Object[] args) { this(key); this.arguments = args; } public Object[] getArguments() { return this.arguments; } public String getKey() { return this.key; } public String getLocalizedMessage() { return getLocalizedMessage(Locale.getDefault()); } public String getLocalizedMessage(Locale locale) { return NLS.getLocalizedMessage(getKey(), locale, getArguments()); } public String toString() { Object[] args = getArguments(); String argsString = ""; if (args != null) { for (int i = 0; i < args.length; i++) { argsString += args[i] + (i < args.length ? "" : ", "); } } return getKey() + " " + argsString; } } lucene-2.9.4/src/java/org/apache/lucene/messages/NLS.java0000644000175000017500000001532011474320230023551 0ustar janpascaljanpascalpackage org.apache.lucene.messages; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ import java.lang.reflect.Field; import java.lang.reflect.Modifier; import java.security.AccessController; import java.security.PrivilegedAction; import java.text.MessageFormat; import java.util.HashMap; import java.util.Iterator; import java.util.Locale; import java.util.Map; import java.util.MissingResourceException; import java.util.ResourceBundle; /** * MessageBundles classes extend this class, to implement a bundle. * * For Native Language Support (NLS), system of software internationalization. * * This interface is similar to the NLS class in eclipse.osgi.util.NLS class - * initializeMessages() method resets the values of all static strings, should * only be called by classes that extend from NLS (see TestMessages.java for * reference) - performs validation of all message in a bundle, at class load * time - performs per message validation at runtime - see NLSTest.java for * usage reference * * MessageBundle classes may subclass this type. */ public class NLS { private static Map/*>*/ bundles = new HashMap/*>*/( 0); protected NLS() { // Do not instantiate } public static String getLocalizedMessage(String key) { return getLocalizedMessage(key, Locale.getDefault()); } public static String getLocalizedMessage(String key, Locale locale) { Object message = getResourceBundleObject(key, locale); if (message == null) { return "Message with key:" + key + " and locale: " + locale + " not found."; } return message.toString(); } public static String getLocalizedMessage(String key, Locale locale, Object[] args) { String str = getLocalizedMessage(key, locale); if (args.length > 0) { str = MessageFormat.format(str, args); } return str; } public static String getLocalizedMessage(String key, Object[] args) { return getLocalizedMessage(key, Locale.getDefault(), args); } /** * Initialize a given class with the message bundle Keys Should be called from * a class that extends NLS in a static block at class load time. * * @param bundleName * Property file with that contains the message bundle * @param clazz * where constants will reside */ //@SuppressWarnings("unchecked") protected static void initializeMessages(String bundleName, Class clazz) { try { load(clazz); if (!bundles.containsKey(bundleName)) bundles.put(bundleName, clazz); } catch (Throwable e) { // ignore all errors and exceptions // because this function is supposed to be called at class load time. 
} } private static Object getResourceBundleObject(String messageKey, Locale locale) { // slow resource checking // need to loop thru all registered resource bundles for (Iterator/**/ it = bundles.keySet().iterator(); it.hasNext();) { Class/**/ clazz = (Class) bundles.get((String)it.next()); ResourceBundle resourceBundle = ResourceBundle.getBundle(clazz.getName(), locale); if (resourceBundle != null) { try { Object obj = resourceBundle.getObject(messageKey); if (obj != null) return obj; } catch (MissingResourceException e) { // just continue it might be on the next resource bundle } } } // if resource is not found return null; } /** * @param clazz */ private static void load(Class/**/ clazz) { final Field[] fieldArray = clazz.getDeclaredFields(); boolean isFieldAccessible = (clazz.getModifiers() & Modifier.PUBLIC) != 0; // build a map of field names to Field objects final int len = fieldArray.length; Map/**/ fields = new HashMap/**/(len * 2); for (int i = 0; i < len; i++) { fields.put(fieldArray[i].getName(), fieldArray[i]); loadfieldValue(fieldArray[i], isFieldAccessible, clazz); } } /** * @param field * @param isFieldAccessible */ private static void loadfieldValue(Field field, boolean isFieldAccessible, Class/**/ clazz) { int MOD_EXPECTED = Modifier.PUBLIC | Modifier.STATIC; int MOD_MASK = MOD_EXPECTED | Modifier.FINAL; if ((field.getModifiers() & MOD_MASK) != MOD_EXPECTED) return; // Set a value for this empty field. if (!isFieldAccessible) makeAccessible(field); try { field.set(null, field.getName()); validateMessage(field.getName(), clazz); } catch (IllegalArgumentException e) { // should not happen } catch (IllegalAccessException e) { // should not happen } } /** * @param key * - Message Key */ private static void validateMessage(String key, Class/**/ clazz) { // Test if the message is present in the resource bundle try { ResourceBundle resourceBundle = ResourceBundle.getBundle(clazz.getName(), Locale.getDefault()); if (resourceBundle != null) { Object obj = resourceBundle.getObject(key); if (obj == null) System.err.println("WARN: Message with key:" + key + " and locale: " + Locale.getDefault() + " not found."); } } catch (MissingResourceException e) { System.err.println("WARN: Message with key:" + key + " and locale: " + Locale.getDefault() + " not found."); } catch (Throwable e) { // ignore all other errors and exceptions // since this code is just a test to see if the message is present on the // system } } /* * Make a class field accessible */ //@SuppressWarnings("unchecked") private static void makeAccessible(final Field field) { if (System.getSecurityManager() == null) { field.setAccessible(true); } else { AccessController.doPrivileged(new PrivilegedAction() { public Object run() { field.setAccessible(true); return null; } }); } } } lucene-2.9.4/src/java/org/apache/lucene/messages/package.html0000644000175000017500000000644411474320230024542 0ustar janpascaljanpascal For Native Language Support (NLS), system of software internationalization.

    NLS message API

    This utility API adds support for NLS messages in the Apache Lucene code. It is currently used by the Lucene "New Flexible Query Parser".

    Features:

    1. Message reference in the code, using static Strings
    2. Message resource validation at class load time, for easier debugging
    3. Allows message IDs to be refactored using Eclipse or other code refactoring tools
    4. Allows reference counting on messages, just like code
    5. Lazy loading of Message Strings
    6. Normal loading of Message Strings



    Lazy loading of Message Strings

    	public class MessagesTestBundle extends NLS {
    	
    	  private static final String BUNDLE_NAME = MessagesTestBundle.class.getName();
    	
    	  private MessagesTestBundle() {
    	    // should never be instantiated
    	  }
    	
    	  static {
    	    // register all string ids with NLS class and initialize static string
    	    // values
    	    NLS.initializeMessages(BUNDLE_NAME, MessagesTestBundle.class);
    	  }
    	
    	  // static string must match the strings in the property files.
    	  public static String Q0001E_INVALID_SYNTAX;
    	  public static String Q0004E_INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION;
    	
    	  // this message is missing from the properties file
    	  public static String Q0005E_MESSAGE_NOT_IN_BUNDLE;
    	}
    
        // Create a message reference
        Message invalidSyntax = new MessageImpl(MessagesTestBundle.Q0001E_INVALID_SYNTAX, "XXX");
        
        // Do other stuff in the code...
        // when is time to display the message to the user or log the message on a file
        // the message is loaded from the correct bundle
        
        String message1 = invalidSyntax.getLocalizedMessage();
        String message2 = invalidSyntax.getLocalizedMessage(Locale.JAPANESE);
    



    Normal loading of Message Strings

    	String message1 = NLS.getLocalizedMessage(MessagesTestBundle.Q0004E_INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION);
    	String message2 = NLS.getLocalizedMessage(MessagesTestBundle.Q0004E_INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION, Locale.JAPANESE);
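    Both forms can also substitute arguments into the localized text; NLS.getLocalizedMessage(key, locale, args) runs the bundle string through java.text.MessageFormat. A small sketch reusing the key from the lazy-loading example above (the "XXX" argument is only an illustration):

    	// the Object[] is passed to MessageFormat.format()
    	String message3 = NLS.getLocalizedMessage(MessagesTestBundle.Q0001E_INVALID_SYNTAX,
    	                                          Locale.getDefault(), new Object[] { "XXX" });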
    

    The org.apache.lucene.messages.TestNLS JUnit test contains several other examples. The TestNLS Java code is available from the Apache Lucene code repository.
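    Exceptions can participate in the same lazy-loading scheme by implementing the NLSException interface from this package. A hedged sketch (QueryNlsException is hypothetical and not part of the Lucene sources):

    	import java.util.Locale;
    	import org.apache.lucene.messages.Message;
    	import org.apache.lucene.messages.MessageImpl;
    	import org.apache.lucene.messages.NLSException;

    	// Hypothetical exception type carrying a lazily localized message.
    	public class QueryNlsException extends Exception implements NLSException {

    	  private final Message message;

    	  public QueryNlsException(Message message) {
    	    super(message.getKey());
    	    this.message = message;
    	  }

    	  public Message getMessageObject() {
    	    return message;
    	  }
    	}

    	// Usage: the resource bundle lookup happens only when the text is requested.
    	QueryNlsException e = new QueryNlsException(
    	    new MessageImpl(MessagesTestBundle.Q0001E_INVALID_SYNTAX, new Object[] { "XXX" }));
    	String text = e.getMessageObject().getLocalizedMessage(Locale.JAPANESE);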

    lucene-2.9.4/src/java/org/apache/lucene/messages/NLSException.java0000644000175000017500000000236111474320230025431 0ustar janpascaljanpascalpackage org.apache.lucene.messages; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Interface that exceptions should implement to support lazy loading of messages. * * For Native Language Support (NLS), system of software internationalization. * * This Interface should be implemented by all exceptions that require * translation * */ public interface NLSException { /** * @return a instance of a class that implements the Message interface */ public Message getMessageObject(); } lucene-2.9.4/src/java/org/apache/lucene/messages/Message.java0000644000175000017500000000231611474320230024502 0ustar janpascaljanpascalpackage org.apache.lucene.messages; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Serializable; import java.util.Locale; /** * Message Interface for a lazy loading. * For Native Language Support (NLS), system of software internationalization. */ public interface Message extends Serializable { public String getKey(); public Object[] getArguments(); public String getLocalizedMessage(); public String getLocalizedMessage(Locale locale); } lucene-2.9.4/src/java/overview.html0000644000175000017500000004171111474320232017721 0ustar janpascaljanpascal Apache Lucene API

    Apache Lucene is a high-performance, full-featured text search engine library. Here's a simple example of how to use Lucene for indexing and searching (using JUnit to check if the results are what we expect):

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);

        // Store the index in memory:
        Directory directory = new RAMDirectory();
        // To store an index on disk, use this instead:
        //Directory directory = FSDirectory.open(new File("/tmp/testindex"));
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true,
                                              new IndexWriter.MaxFieldLength(25000));
        Document doc = new Document();
        String text = "This is the text to be indexed.";
        doc.add(new Field("fieldname", text, Field.Store.YES,
            Field.Index.ANALYZED));
        iwriter.addDocument(doc);
        iwriter.close();
        
        // Now search the index:
        IndexSearcher isearcher = new IndexSearcher(directory, true); // read-only=true
        // Parse a simple query that searches for "text":
        QueryParser parser = new QueryParser("fieldname", analyzer);
        Query query = parser.parse("text");
        ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
        assertEquals(1, hits.length);
        // Iterate through the results:
        for (int i = 0; i < hits.length; i++) {
          Document hitDoc = isearcher.doc(hits[i].doc);
          assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
        }
        isearcher.close();
        directory.close();

    The Lucene API is divided into several packages; see the package-level documentation for an overview of each.

    To use Lucene, an application should:
    1. Create Documents by adding Fields;
    2. Create an IndexWriter and add documents to it with addDocument();
    3. Call QueryParser.parse() to build a query from a string; and
    4. Create an IndexSearcher and pass the query to its search() method.
    Some simple examples of code which does this are the demo classes IndexFiles and SearchFiles. To demonstrate these, try something like:
    > java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexFiles rec.food.recipes/soups
    adding rec.food.recipes/soups/abalone-chowder
      [ ... ]

    > java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.SearchFiles
    Query: chowder
    Searching for: chowder
    34 total matching documents
    1. rec.food.recipes/soups/spam-chowder
      [ ... thirty-four documents contain the word "chowder" ... ]

    Query: "clam chowder" AND Manhattan
    Searching for: +"clam chowder" +manhattan
    2 total matching documents
    1. rec.food.recipes/soups/clam-chowder
      [ ... two documents contain the phrase "clam chowder" and the word "manhattan" ... ]
        [ Note: "+" and "-" are canonical, but "AND", "OR" and "NOT" may be used. ]
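    The query parser normalizes such queries to the canonical form shown above before searching. A minimal sketch of the parsing step (the "contents" field name is an assumption, not taken from the demo source):

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        QueryParser parser = new QueryParser("contents", analyzer);
        Query query = parser.parse("\"clam chowder\" AND Manhattan");
        System.out.println("Searching for: " + query.toString("contents"));
        // prints: Searching for: +"clam chowder" +manhattan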

    The IndexHTML demo is more sophisticated.  It incrementally maintains an index of HTML files, adding new files as they appear, deleting old files as they disappear and re-indexing files as they change.
    > java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexHTML -create java/jdk1.1.6/docs/relnotes
    adding java/jdk1.1.6/docs/relnotes/SMICopyright.html
      [ ... create an index containing all the relnotes ]

    > rm java/jdk1.1.6/docs/relnotes/smicopyright.html

    > java -cp lucene.jar:lucene-demo.jar org.apache.lucene.demo.IndexHTML java/jdk1.1.6/docs/relnotes
    deleting java/jdk1.1.6/docs/relnotes/SMICopyright.html

    lucene-2.9.4/src/site/0000755000175000017500000000000011554106562015213 5ustar janpascaljanpascallucene-2.9.4/src/site/changes/0000755000175000017500000000000011554106562016623 5ustar janpascaljanpascallucene-2.9.4/src/site/changes/ChangesSimpleStyle.css0000644000175000017500000000012511474320233023070 0ustar janpascaljanpascalli { margin-top: 1em; margin-bottom: 1em; } span.attrib { color: darkgreen; } lucene-2.9.4/src/site/changes/changes2html.pl0000644000175000017500000007725711474320233021553 0ustar janpascaljanpascal#!/usr/bin/perl # # Transforms Lucene Java's CHANGES.txt into Changes.html # # Input is on STDIN, output is to STDOUT # # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use strict; use warnings; my $jira_url_prefix = 'http://issues.apache.org/jira/browse/'; my $bugzilla_url_prefix = 'http://issues.apache.org/bugzilla/show_bug.cgi?id='; my %release_dates = &setup_release_dates; my $month_regex = &setup_month_regex; my %month_nums = &setup_month_nums; my %bugzilla_jira_map = &setup_bugzilla_jira_map; my $title = undef; my $release = undef; my $reldate = undef; my $relinfo = undef; my $sections = undef; my $items = undef; my $first_relid = undef; my $second_relid = undef; my @releases = (); my @lines = <>; # Get all input at once # # Parse input and build hierarchical release structure in @releases # for (my $line_num = 0 ; $line_num <= $#lines ; ++$line_num) { $_ = $lines[$line_num]; next unless (/\S/); # Skip blank lines next if (/^\s*\$Id(?::.*)?\$/); # Skip $Id$ lines unless ($title) { if (/\S/) { s/^\s+//; # Trim leading whitespace s/\s+$//; # Trim trailing whitespace } s/^[^Ll]*//; # Trim leading BOM characters if exists $title = $_; next; } if (/\s*===+\s*(.*?)\s*===+\s*/) { # New-style release headings $release = $1; $release =~ s/^release\s*//i; # Trim "Release " prefix ($release, $relinfo) = ($release =~ /^(\d+(?:\.\d+)*|Trunk)\s*(.*)/i); $relinfo =~ s/\s*:\s*$//; # Trim trailing colon $relinfo =~ s/^\s*,\s*//; # Trim leading comma ($reldate, $relinfo) = get_release_date($release, $relinfo); $sections = []; push @releases, [ $release, $reldate, $relinfo, $sections ]; ($first_relid = lc($release)) =~ s/\s+/_/g if ($#releases == 0); ($second_relid = lc($release)) =~ s/\s+/_/g if ($#releases == 1); $items = undef; next; } if (/^\s*([01](?:\.[0-9]{1,2}){1,2}[a-z]?(?:\s*(?:RC\d+|final))?)\s* ((?:200[0-7]-.*|.*,.*200[0-7].*)?)$/x) { # Old-style release heading $release = $1; $relinfo = $2; $relinfo =~ s/\s*:\s*$//; # Trim trailing colon $relinfo =~ s/^\s*,\s*//; # Trim leading comma ($reldate, $relinfo) = get_release_date($release, $relinfo); $sections = []; push @releases, [ $release, $reldate, $relinfo, $sections ]; $items = undef; next; } # Section heading: no leading whitespace, initial word capitalized, # five words or less, and no trailing 
punctuation if (/^([A-Z]\S*(?:\s+\S+){0,4})(?[0]; # 0th position of items array is list type } else { $type = get_list_type($_); push @$items, $type; } if ($type eq 'numbered') { # The modern items list style # List item boundary is another numbered item or an unindented line my $line; my $item = $_; $item =~ s/^(\s{0,2}\d+\.\d?\s*)//; # Trim the leading item number my $leading_ws_width = length($1); $item =~ s/\s+$//; # Trim trailing whitespace $item .= "\n"; while ($line_num < $#lines and ($line = $lines[++$line_num]) !~ /^(?:\s{0,2}\d+\.\s*\S|\S)/) { $line =~ s/^\s{$leading_ws_width}//; # Trim leading whitespace $line =~ s/\s+$//; # Trim trailing whitespace $item .= "$line\n"; } $item =~ s/\n+\Z/\n/; # Trim trailing blank lines push @$items, $item; --$line_num unless ($line_num == $#lines); } elsif ($type eq 'paragraph') { # List item boundary is a blank line my $line; my $item = $_; $item =~ s/^(\s+)//; my $leading_ws_width = defined($1) ? length($1) : 0; $item =~ s/\s+$//; # Trim trailing whitespace $item .= "\n"; while ($line_num < $#lines and ($line = $lines[++$line_num]) =~ /\S/) { $line =~ s/^\s{$leading_ws_width}//; # Trim leading whitespace $line =~ s/\s+$//; # Trim trailing whitespace $item .= "$line\n"; } push @$items, $item; --$line_num unless ($line_num == $#lines); } else { # $type is one of the bulleted types # List item boundary is another bullet or a blank line my $line; my $item = $_; $item =~ s/^(\s*\Q$type\E\s*)//; # Trim the leading bullet my $leading_ws_width = length($1); $item =~ s/\s+$//; # Trim trailing whitespace $item .= "\n"; while ($line_num < $#lines and ($line = $lines[++$line_num]) !~ /^(?:\S|\s*\Q$type\E)/) { $line =~ s/^\s{$leading_ws_width}//; # Trim leading whitespace $line =~ s/\s+$//; # Trim trailing whitespace $item .= "$line\n"; } push @$items, $item; --$line_num unless ($line_num == $#lines); } } # # Print HTML-ified version to STDOUT # print<<"__HTML_HEADER__"; $title

    $title

    __HTML_HEADER__ my $heading; my $relcnt = 0; my $header = 'h2'; for my $rel (@releases) { if (++$relcnt == 3) { $header = 'h3'; print "

    "; print "Older Releases"; print "

    \n"; print "
      \n" } ($release, $reldate, $relinfo, $sections) = @$rel; # The first section heading is undefined for the older sectionless releases my $has_release_sections = has_release_sections($sections); (my $relid = lc($release)) =~ s/\s+/_/g; print "<$header>"; print "Release " unless ($release =~ /^trunk$/i); print "$release $relinfo"; print " [$reldate]" unless ($reldate eq 'unknown'); print "\n"; print "
        \n" if ($has_release_sections); for my $section (@$sections) { ($heading, $items) = @$section; (my $sectid = lc($heading)) =~ s/\s+/_/g; my $numItemsStr = $#{$items} > 0 ? "($#{$items})" : "(none)"; print "
      • ", ($heading || ''), "   $numItemsStr\n" if ($has_release_sections and $heading); my $list_type = $items->[0] || ''; my $list = ($has_release_sections || $list_type eq 'numbered' ? 'ol' : 'ul'); my $listid = $sectid ? "$relid.$sectid" : $relid; print " <$list id=\"$listid.list\">\n" unless ($has_release_sections and not $heading); for my $itemnum (1..$#{$items}) { my $item = $items->[$itemnum]; $item =~ s:&:&:g; # Escape HTML metachars, $item =~ s:<(?!/?code>):<:gi; # but leave tags intact $item =~ s:(?:>:gi; # and add
         tags so that
              $item =~ s::
        :gi;       #   whitespace is preserved in the
              $item =~ s:\s*:
        :gi; # output. # Put attributions on their own lines. # Check for trailing parenthesized attribution with no following period. # Exclude things like "(see #3 above)" and "(use the bug number instead of xxxx)" unless ($item =~ s:\s*(\((?!see #|use the bug number)[^)"]+?\))\s*$:
        $1:) { # If attribution is not found, then look for attribution with a # trailing period, but try not to include trailing parenthesized things # that are not attributions. # # Rule of thumb: if a trailing parenthesized expression with a following # period does not contain "LUCENE-XXX", and it either has three or # fewer words or it includes the word "via" or the phrase "updates from", # then it is considered to be an attribution. $item =~ s{(\s*(\((?!see \#|use the bug number)[^)"]+?\))) ((?:\.|(?i:\.?\s*Issue\s+\d{3,}|LUCENE-\d+)\.?)\s*)$} { my $subst = $1; # default: no change my $parenthetical = $2; my $trailing_period_and_or_issue = $3; if ($parenthetical !~ /LUCENE-\d+/) { my ($no_parens) = $parenthetical =~ /^\((.*)\)$/s; my @words = grep {/\S/} split /\s+/, $no_parens; if ($no_parens =~ /\b(?:via|updates\s+from)\b/i || scalar(@words) <= 3) { $subst = "
        $parenthetical"; } } $subst . $trailing_period_and_or_issue; }ex; } $item =~ s:\n{2,}:\n

        \n:g; # Keep paragraph breaks # Link LUCENE-XXX, SOLR-XXX and INFRA-XXX to JIRA $item =~ s{(?:${jira_url_prefix})?((?:LUCENE|SOLR|INFRA)-\d+)} {$1}g; $item =~ s{(issue\s*\#?\s*(\d{3,}))} # Link Issue XXX to JIRA {$1}gi; # Link Lucene XXX, SOLR XXX and INFRA XXX to JIRA $item =~ s{((LUCENE|SOLR|INFRA)\s+(\d{3,}))} {$1}gi; # Find single Bugzilla issues $item =~ s~((?i:bug|patch|issue)\s*\#?\s*(\d+)) ~ my $issue = $1; my $jira_issue_num = $bugzilla_jira_map{$2}; # Link to JIRA copies $issue = qq!! . qq!$issue [LUCENE-$jira_issue_num]! if (defined($jira_issue_num)); $issue; ~gex; # Find multiple Bugzilla issues $item =~ s~(?<=(?i:bugs))(\s*)(\d+)(\s*(?i:\&|and)\s*)(\d+) ~ my $leading_whitespace = $1; my $issue_num_1 = $2; my $interlude = $3; my $issue_num_2 = $4; # Link to JIRA copies my $jira_issue_1 = $bugzilla_jira_map{$issue_num_1}; my $issue1 = qq!! . qq!$issue_num_1 [LUCENE-$jira_issue_1]! if (defined($jira_issue_1)); my $jira_issue_2 = $bugzilla_jira_map{$issue_num_2}; my $issue2 = qq!! . qq!$issue_num_2 [LUCENE-$jira_issue_2]! if (defined($jira_issue_2)); $leading_whitespace . $issue1 . $interlude . $issue2; ~gex; print "

      • $item
      • \n"; } print " \n" unless ($has_release_sections and not $heading); print "
      • \n" if ($has_release_sections); } print "
      \n" if ($has_release_sections); } print "
    \n" if ($relcnt > 3); print "\n\n"; # # Subroutine: has_release_sections # # Takes one parameter: # # - The $sections array reference # # Returns one scalar: # # - A boolean indicating whether there are release sections # sub has_release_sections { my $sections = shift; my $has_release_sections = 0; my $first_titled_section_num = -1; for my $section_num (0 .. $#{$sections}) { if ($sections->[$section_num][0]) { $has_release_sections = 1; last; } } return $has_release_sections; } # # Subroutine: get_list_type # # Takes one parameter: # # - The first line of a sub-section/point # # Returns one scalar: # # - The list type: 'numbered'; or one of the bulleted types '-', or '.' or # 'paragraph'. # sub get_list_type { my $first_list_item_line = shift; my $type = 'paragraph'; # Default to paragraph type if ($first_list_item_line =~ /^\s{0,2}\d+\.\s+\S+/) { $type = 'numbered'; } elsif ($first_list_item_line =~ /^\s*([-.*])\s+\S+/) { $type = $1; } return $type; } # # Subroutine: get_release_date # # Takes two parameters: # # - Release name # - Release info, potentially including a release date # # Returns two scalars: # # - The release date, in format YYYY-MM-DD # - The remainder of the release info (if any), with release date stripped # sub get_release_date { my $release = shift; my $relinfo = shift; my ($year, $month, $dom, $reldate); if ($relinfo) { if ($relinfo =~ s:\s*(2\d\d\d)([-./]) (1[012]|0?[1-9])\2 ([12][0-9]|30|31|0?[1-9])\s*: :x) { # YYYY-MM-DD or YYYY-M-D or YYYY-MM-D or YYYY-M-DD $year = $1; $month = $3; $dom = $4; $dom = "0$dom" if (length($dom) == 1); $reldate = "$year-$month-$dom"; } elsif ($relinfo =~ s:\s*(1[012]|0?[1-9])([-./]) ([12][0-9]|30|31|0?[1-9])\2 (2\d\d\d)\s*: :x) { # MM-DD-YYYY or M-D-YYYY or MM-D-YYYY or M-DD-YYYY $month = $1; $dom = $3; $dom = "0$dom" if (length($dom) == 1); $year = $4; $reldate = "$year-$month-$dom"; } elsif ($relinfo =~ s:($month_regex)\s* ([12][0-9]|30|31|0?[1-9])((st|rd|th)\.?)?,?\s* (2\d\d\d)\s*: :x) { # MMMMM DD, YYYY or MMMMM DDth, YYYY $month = $month_nums{$1}; $dom = $2; $dom = "0$dom" if (length($dom) == 1); $year = $5; $reldate = "$year-$month-$dom"; } elsif ($relinfo =~ s:([12][0-9]|30|31|0?[1-9])(\s+|[-/.]) ($month_regex)\2 (2\d\d\d)\s*: :x) { # DD MMMMM YYYY $dom = $1; $dom = "0$dom" if (length($dom) == 1); $month = $month_nums{$3}; $year = $4; $reldate = "$year-$month-$dom"; } } unless ($reldate) { # No date found in $relinfo # Handle '1.2 RC6', which should be '1.2 final' $release = '1.2 final' if ($release eq '1.2 RC6'); $reldate = ( exists($release_dates{$release}) ? $release_dates{$release} : 'unknown'); } $relinfo =~ s/,?\s*$//; # Trim trailing comma and whitespace return ($reldate, $relinfo); } # # setup_release_dates # # Returns a list of alternating release names and dates, for use in populating # the %release_dates hash. 
# sub setup_release_dates { return ( '0.01' => '2000-03-30', '0.04' => '2000-04-19', '1.0' => '2000-10-04', '1.01b' => '2001-06-02', '1.2 RC1' => '2001-10-02', '1.2 RC2' => '2001-10-19', '1.2 RC3' => '2002-01-27', '1.2 RC4' => '2002-02-14', '1.2 RC5' => '2002-05-14', '1.2 final' => '2002-06-13', '1.3 RC1' => '2003-03-24', '1.3 RC2' => '2003-10-22', '1.3 RC3' => '2003-11-25', '1.3 final' => '2003-12-26', '1.4 RC1' => '2004-03-29', '1.4 RC2' => '2004-03-30', '1.4 RC3' => '2004-05-11', '1.4 final' => '2004-07-01', '1.4.1' => '2004-08-02', '1.4.2' => '2004-10-01', '1.4.3' => '2004-12-07', '1.9 RC1' => '2006-02-21', '1.9 final' => '2006-02-27', '1.9.1' => '2006-03-02', '2.0.0' => '2006-05-26', '2.1.0' => '2007-02-14', '2.2.0' => '2007-06-19', '2.3.0' => '2008-01-21', '2.3.1' => '2008-02-22', '2.3.2' => '2008-05-05', '2.4.0' => '2008-10-06', '2.4.1' => '2009-03-09', '2.9.0' => '2009-09-23', '2.9.1' => '2009-11-06'); } # # setup_month_regex # # Returns a string containing a regular expression with alternations for # the standard month representations in English. # sub setup_month_regex { return '(?i:Jan(?:|\.|uary)|Feb(?:|\.|ruary)|Mar(?:|\.|ch)' . '|Apr(?:|\.|il)|May|Jun(?:|\.|e)|Jul(?:|\.|y)|Aug(?:|\.|ust)' . '|Sep(?:|\.|t(?:|\.|ember))|Oct(?:|\.|ober)|Nov(?:|\.|ember)' . '|Dec(?:|\.|ember))'; } # # setup_month_nums # # Returns a list of alternating English month representations and the two-digit # month number corresponding to them, for use in populating the %month_nums # hash. # sub setup_month_nums { return ( 'Jan' => '01', 'Jan.' => '01', 'January' => '01', 'Feb' => '02', 'Feb.' => '02', 'February' => '02', 'Mar' => '03', 'Mar.' => '03', 'March' => '03', 'Apr' => '04', 'Apr.' => '04', 'April' => '04', 'May' => '05', 'Jun' => '06', 'Jun.' => '06', 'June' => '06', 'Jul' => '07', 'Jul.' => '07', 'July' => '07', 'Aug' => '08', 'Aug.' => '08', 'August' => '08', 'Sep' => '09', 'Sep.' => '09', 'Sept' => '09', 'Sept.' => '09', 'September' => '09', 'Oct' => '10', 'Oct.' => '10', 'October' => '10', 'Nov' => '11', 'Nov.' => '11', 'November' => '11', 'Dec' => '12', 'Dec.' 
=> '12', 'December' => '12' ); } # # setup_bugzilla_jira_map # # Returns a list of alternating Bugzilla bug IDs and LUCENE-* JIRA issue # numbers, for use in populating the %bugzilla_jira_map hash # sub setup_bugzilla_jira_map { return ( 4049 => 1, 4102 => 2, 4105 => 3, 4254 => 4, 4555 => 5, 4568 => 6, 4754 => 7, 5313 => 8, 5456 => 9, 6078 => 10, 6091 => 11, 6140 => 12, 6292 => 13, 6315 => 14, 6469 => 15, 6914 => 16, 6968 => 17, 7017 => 18, 7019 => 19, 7088 => 20, 7089 => 21, 7275 => 22, 7412 => 23, 7461 => 24, 7574 => 25, 7710 => 26, 7750 => 27, 7782 => 28, 7783 => 29, 7912 => 30, 7974 => 31, 8307 => 32, 8525 => 33, 9015 => 34, 9110 => 35, 9347 => 36, 9454 => 37, 9782 => 38, 9853 => 39, 9906 => 40, 9970 => 41, 10340 => 42, 10341 => 43, 10342 => 44, 10343 => 45, 10849 => 46, 11109 => 47, 11359 => 48, 11636 => 49, 11918 => 50, 12137 => 51, 12273 => 52, 12444 => 53, 12569 => 54, 12588 => 55, 12619 => 56, 12667 => 57, 12723 => 58, 12749 => 59, 12761 => 60, 12950 => 61, 13102 => 62, 13166 => 63, 14028 => 64, 14355 => 65, 14373 => 66, 14412 => 67, 14485 => 68, 14585 => 69, 14665 => 70, 14900 => 71, 15739 => 72, 16025 => 73, 16043 => 74, 16167 => 75, 16245 => 76, 16364 => 77, 16437 => 78, 16438 => 79, 16470 => 80, 16677 => 81, 16719 => 82, 16730 => 83, 16816 => 84, 16952 => 85, 17242 => 86, 17954 => 88, 18014 => 89, 18088 => 90, 18177 => 91, 18410 => 87, 18833 => 92, 18847 => 93, 18914 => 94, 18927 => 95, 18928 => 96, 18929 => 97, 18931 => 98, 18932 => 99, 18933 => 100, 18934 => 101, 19058 => 102, 19149 => 103, 19189 => 104, 19253 => 105, 19468 => 106, 19686 => 107, 19736 => 108, 19751 => 109, 19834 => 110, 19844 => 111, 20024 => 112, 20081 => 113, 20123 => 114, 20196 => 115, 20283 => 116, 20290 => 117, 20461 => 118, 20901 => 119, 21128 => 120, 21149 => 121, 21150 => 122, 21189 => 123, 21446 => 124, 21921 => 126, 22344 => 128, 22469 => 130, 22987 => 131, 23307 => 133, 23308 => 134, 23422 => 135, 23466 => 136, 23505 => 137, 23534 => 138, 23545 => 139, 23650 => 140, 23655 => 141, 23685 => 142, 23702 => 143, 23727 => 144, 23730 => 145, 23750 => 146, 23754 => 147, 23770 => 148, 23771 => 149, 23773 => 150, 23774 => 151, 23782 => 152, 23784 => 153, 23786 => 154, 23838 => 155, 23964 => 156, 24084 => 129, 24237 => 157, 24265 => 158, 24301 => 159, 24370 => 160, 24665 => 161, 24786 => 162, 24902 => 163, 24903 => 164, 24913 => 165, 25666 => 125, 25793 => 166, 25820 => 167, 25945 => 168, 26120 => 169, 26196 => 170, 26268 => 171, 26360 => 172, 26396 => 173, 26397 => 174, 26624 => 175, 26634 => 176, 26666 => 177, 26702 => 178, 26716 => 179, 26763 => 180, 26884 => 181, 26939 => 182, 27168 => 183, 27174 => 184, 27182 => 185, 27268 => 186, 27326 => 187, 27354 => 188, 27408 => 189, 27423 => 190, 27433 => 191, 27491 => 192, 27587 => 193, 27626 => 194, 27638 => 195, 27743 => 196, 27772 => 197, 27799 => 198, 27819 => 199, 27865 => 200, 27868 => 201, 27903 => 202, 27987 => 203, 28030 => 204, 28050 => 205, 28065 => 206, 28074 => 207, 28108 => 208, 28181 => 209, 28182 => 210, 28183 => 211, 28187 => 212, 28285 => 213, 28336 => 214, 28339 => 215, 28405 => 216, 28462 => 217, 28601 => 218, 28640 => 219, 28748 => 220, 28827 => 221, 28855 => 222, 28856 => 223, # Clone: 28856 => 507, 28858 => 224, 28960 => 132, 28964 => 127, 29033 => 225, 29256 => 226, 29299 => 227, 29302 => 228, 29370 => 229, 29432 => 230, 29548 => 231, 29749 => 232, 29756 => 233, 29774 => 234, 29931 => 235, 29984 => 236, 30013 => 237, 30016 => 238, 30026 => 239, 30027 => 240, 30049 => 241, 30058 => 242, 30232 => 243, 30237 => 244, 30240 => 245, 30242 => 
246, 30265 => 247, 30327 => 248, 30330 => 249, 30360 => 250, 30376 => 251, 30382 => 252, 30421 => 253, 30429 => 254, 30452 => 255, 30480 => 256, 30522 => 257, 30617 => 258, 30621 => 259, 30628 => 260, 30629 => 261, 30668 => 262, 30678 => 263, 30685 => 264, 30736 => 265, 30785 => 266, 30818 => 267, 30835 => 268, 30844 => 269, 30977 => 270, 30985 => 271, 31061 => 272, 31120 => 273, 31149 => 274, 31174 => 275, 31240 => 276, 31241 => 277, 31294 => 278, 31350 => 279, 31368 => 280, 31420 => 281, 31469 => 282, 31508 => 283, 31554 => 284, 31617 => 285, 31619 => 286, 31690 => 287, 31706 => 288, 31708 => 289, 31746 => 290, 31747 => 291, 31748 => 292, 31784 => 293, 31785 => 294, 31841 => 295, 31882 => 296, 31926 => 297, 31976 => 298, 32053 => 299, 32055 => 300, 32088 => 301, 32090 => 302, 32109 => 303, 32115 => 304, 32143 => 305, 32167 => 306, 32171 => 307, 32192 => 308, 32227 => 309, 32228 => 310, 32234 => 311, 32291 => 312, 32307 => 313, 32334 => 314, 32353 => 315, 32365 => 316, 32403 => 317, 32432 => 318, 32467 => 319, 32468 => 320, 32580 => 321, 32626 => 322, 32674 => 323, 32687 => 324, 32712 => 325, 32847 => 326, 32887 => 327, 32921 => 328, 32942 => 329, 32965 => 330, 32981 => 331, 32999 => 332, 33019 => 333, 33076 => 334, 33134 => 335, 33158 => 336, 33161 => 337, 33197 => 338, 33239 => 339, 33389 => 340, 33395 => 341, 33397 => 342, 33442 => 343, 33449 => 344, 33459 => 345, 33472 => 346, 33642 => 347, 33648 => 348, 33649 => 349, 33654 => 350, 33678 => 351, 33725 => 352, 33799 => 353, 33820 => 354, 33835 => 355, 33848 => 356, 33851 => 357, 33877 => 358, 33884 => 359, 33974 => 360, 34028 => 361, 34066 => 362, 34149 => 363, 34154 => 364, 34193 => 365, 34279 => 366, 34320 => 367, 34331 => 368, 34359 => 369, 34407 => 370, 34408 => 371, 34447 => 372, 34453 => 373, 34477 => 374, # Clone: 34477 => 459, 34486 => 375, 34528 => 376, 34544 => 377, 34545 => 378, 34563 => 379, 34570 => 380, 34585 => 381, 34629 => 382, 34673 => 383, 34684 => 384, 34695 => 385, 34816 => 386, 34882 => 387, 34930 => 388, 34946 => 389, 34995 => 390, 35029 => 391, 35037 => 392, 35157 => 393, 35241 => 394, 35284 => 395, # Clone: 35284 => 466, 35388 => 396, 35446 => 397, 35454 => 398, 35455 => 399, 35456 => 400, 35468 => 401, 35491 => 402, 35518 => 403, 35626 => 404, 35664 => 405, 35665 => 406, 35668 => 407, 35729 => 408, 35730 => 409, 35731 => 410, 35796 => 411, 35822 => 412, 35823 => 413, 35838 => 414, 35879 => 415, # Clone: 35879 => 616, 35886 => 416, 35971 => 417, 36021 => 418, 36078 => 419, 36101 => 420, 36135 => 421, 36147 => 422, 36197 => 423, 36219 => 424, 36241 => 425, 36242 => 426, 36292 => 427, 36296 => 428, 36333 => 429, 36622 => 430, 36623 => 431, 36628 => 432); } 1; lucene-2.9.4/src/site/changes/ChangesFixedWidthStyle.css0000644000175000017500000000057011474320233023702 0ustar janpascaljanpascalbody { font-family: Courier New, monospace; font-size: 10pt; } h1 { font-family: Courier New, monospace; font-size: 10pt; } h2 { font-family: Courier New, monospace; font-size: 10pt; } h3 { font-family: Courier New, monospace; font-size: 10pt; } a:link { color: blue; } a:visited { color: purple; } li { margin-top: 1em; margin-bottom: 1em; } lucene-2.9.4/src/site/changes/ChangesFancyStyle.css0000644000175000017500000000077211474320233022707 0ustar janpascaljanpascalbody { font-family: Georgia, "Times New Roman", Times, serif; color: black; background-color: white; } h1 { font-family: Helvetica, Geneva, Arial, SunSans-Regular, sans-serif; color: black; background-color: light-gray; } h2 { font-family: Helvetica, Geneva, Arial, 
SunSans-Regular, sans-serif; color: black; background-color: #D8D8D8; } a:link { color: blue; } a:visited { color: purple; } li { margin-top: 1em; margin-bottom: 1em; } span.attrib { color: darkgreen; } lucene-2.9.4/src/site/forrest.properties0000644000175000017500000001135311474320233021012 0ustar janpascaljanpascal# Copyright 2002-2005 The Apache Software Foundation or its licensors, # as applicable. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ############## # Properties used by forrest.build.xml for building the website # These are the defaults, un-comment them only if you need to change them. ############## # Prints out a summary of Forrest settings for this project #forrest.echo=true # Project name (used to name .war file) #project.name=my-project # Specifies name of Forrest skin to use # See list at http://forrest.apache.org/docs/skins.html project.skin=lucene # Descriptors for plugins and skins # comma separated list, file:// is supported #forrest.skins.descriptors=http://forrest.apache.org/skins/skins.xml,file:///c:/myskins/skins.xml #forrest.plugins.descriptors=http://forrest.apache.org/plugins/plugins.xml,http://forrest.apache.org/plugins/whiteboard-plugins.xml ############## # behavioural properties #project.menu-scheme=tab_attributes #project.menu-scheme=directories ############## # layout properties # Properties that can be set to override the default locations # # Parent properties must be set. This usually means uncommenting # project.content-dir if any other property using it is uncommented #project.status=status.xml #project.content-dir=src/documentation #project.raw-content-dir=${project.content-dir}/content #project.conf-dir=${project.content-dir}/conf #project.sitemap-dir=${project.content-dir} #project.xdocs-dir=${project.content-dir}/content/xdocs #project.resources-dir=${project.content-dir}/resources #project.stylesheets-dir=${project.resources-dir}/stylesheets #project.images-dir=${project.resources-dir}/images #project.schema-dir=${project.resources-dir}/schema #project.skins-dir=${project.content-dir}/skins #project.skinconf=${project.content-dir}/skinconf.xml #project.lib-dir=${project.content-dir}/lib #project.classes-dir=${project.content-dir}/classes #project.translations-dir=${project.content-dir}/translations project.configfile=${project.home}/src/documentation/conf/cli.xconf ############## # validation properties # This set of properties determine if validation is performed # Values are inherited unless overridden. # e.g. if forrest.validate=false then all others are false unless set to true. 
#forrest.validate=true #forrest.validate.xdocs=${forrest.validate} #forrest.validate.skinconf=${forrest.validate} #forrest.validate.sitemap=${forrest.validate} #forrest.validate.stylesheets=${forrest.validate} #forrest.validate.skins=${forrest.validate} #forrest.validate.skins.stylesheets=${forrest.validate.skins} # *.failonerror=(true|false) - stop when an XML file is invalid #forrest.validate.failonerror=true # *.excludes=(pattern) - comma-separated list of path patterns to not validate # e.g. #forrest.validate.xdocs.excludes=samples/subdir/**, samples/faq.xml #forrest.validate.xdocs.excludes= ############## # General Forrest properties # The URL to start crawling from #project.start-uri=linkmap.html # Set logging level for messages printed to the console # (DEBUG, INFO, WARN, ERROR, FATAL_ERROR) #project.debuglevel=ERROR # Max memory to allocate to Java #forrest.maxmemory=64m # Any other arguments to pass to the JVM. For example, to run on an X-less # server, set to -Djava.awt.headless=true #forrest.jvmargs= # The bugtracking URL - the issue number will be appended #project.bugtracking-url=http://issues.apache.org/bugzilla/show_bug.cgi?id= #project.bugtracking-url=http://issues.apache.org/jira/browse/ # The issues list as rss #project.issues-rss-url= #I18n Property. Based on the locale request for the browser. #If you want to use it for static site then modify the JVM system.language # and run once per language #project.i18n=true # The names of plugins that are required to build the project # comma separated list (no spaces) # You can request a specific version by appending "-VERSION" to the end of # the plugin name. If you exclude a version number the latest released version # will be used, however, be aware that this may be a development version. In # a production environment it is recomended that you specify a known working # version. # Run "forrest available-plugins" for a list of plug-ins currently available project.required.plugins=org.apache.forrest.plugin.output.pdf # Proxy configuration # proxy.host= # proxy.port= lucene-2.9.4/src/site/src/0000755000175000017500000000000011474320232015773 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/0000755000175000017500000000000011554106562020653 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/classes/0000755000175000017500000000000011554106562022310 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/classes/CatalogManager.properties0000644000175000017500000000463311474320233027273 0ustar janpascaljanpascal# Copyright 2002-2005 The Apache Software Foundation or its licensors, # as applicable. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #======================================================================= # CatalogManager.properties for Catalog Entity Resolver. # # This is the default properties file for your project. # This facilitates local configuration of application-specific catalogs. # If you have defined any local catalogs, then they will be loaded # before Forrest's core catalogs. 
# # See the Apache Forrest documentation: # http://forrest.apache.org/docs/your-project.html # http://forrest.apache.org/docs/validation.html # verbosity: # The level of messages for status/debug (messages go to standard output). # The setting here is for your own local catalogs. # The verbosity of Forrest's core catalogs is controlled via # main/webapp/WEB-INF/cocoon.xconf # # The following messages are provided ... # 0 = none # 1 = ? (... not sure yet) # 2 = 1+, Loading catalog, Resolved public, Resolved system # 3 = 2+, Catalog does not exist, resolvePublic, resolveSystem # 10 = 3+, List all catalog entries when loading a catalog # (Cocoon also logs the "Resolved public" messages.) verbosity=1 # catalogs ... list of additional catalogs to load # (Note that Apache Forrest will automatically load its own default catalog # from main/webapp/resources/schema/catalog.xcat) # Use either full pathnames or relative pathnames. # pathname separator is always semi-colon (;) regardless of operating system # directory separator is always slash (/) regardless of operating system catalogs=../resources/schema/catalog.xcat # relative-catalogs # If false, relative catalog URIs are made absolute with respect to the # base URI of the CatalogManager.properties file. This setting only # applies to catalog URIs obtained from the catalogs property in the # CatalogManager.properties file # Example: relative-catalogs=[yes|no] relative-catalogs=no lucene-2.9.4/src/site/src/documentation/skins/0000755000175000017500000000000011474320232021773 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/0000755000175000017500000000000011554106562023272 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/scripts/0000755000175000017500000000000011554106562024761 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/scripts/getBlank.js0000644000175000017500000000310511474320232027036 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * getBlank script - when included in a html file and called from a form text field, will set the value of this field to "" * if the text value is still the standard value. * getPrompt script - when included in a html file and called from a form text field, will set the value of this field to the prompt * if the text value is empty. * * Typical usage: * * */ lucene-2.9.4/src/site/src/documentation/skins/common/scripts/prototype.js0000644000175000017500000010020611474320232027354 0ustar janpascaljanpascal/* Prototype JavaScript framework, version 1.4.0_pre4 * (c) 2005 Sam Stephenson * * THIS FILE IS AUTOMATICALLY GENERATED. When sending patches, please diff * against the source tree, available from the Prototype darcs repository. 
* * Prototype is freely distributable under the terms of an MIT-style license. * * For details, see the Prototype web site: http://prototype.conio.net/ * /*--------------------------------------------------------------------------*/ var Prototype = { Version: '1.4.0_pre4', emptyFunction: function() {}, K: function(x) {return x} } var Class = { create: function() { return function() { this.initialize.apply(this, arguments); } } } var Abstract = new Object(); Object.extend = function(destination, source) { for (property in source) { destination[property] = source[property]; } return destination; } Function.prototype.bind = function(object) { var __method = this; return function() { return __method.apply(object, arguments); } } Function.prototype.bindAsEventListener = function(object) { var __method = this; return function(event) { return __method.call(object, event || window.event); } } Number.prototype.toColorPart = function() { var digits = this.toString(16); if (this < 16) return '0' + digits; return digits; } var Try = { these: function() { var returnValue; for (var i = 0; i < arguments.length; i++) { var lambda = arguments[i]; try { returnValue = lambda(); break; } catch (e) {} } return returnValue; } } /*--------------------------------------------------------------------------*/ var PeriodicalExecuter = Class.create(); PeriodicalExecuter.prototype = { initialize: function(callback, frequency) { this.callback = callback; this.frequency = frequency; this.currentlyExecuting = false; this.registerCallback(); }, registerCallback: function() { setInterval(this.onTimerEvent.bind(this), this.frequency * 1000); }, onTimerEvent: function() { if (!this.currentlyExecuting) { try { this.currentlyExecuting = true; this.callback(); } finally { this.currentlyExecuting = false; } } } } /*--------------------------------------------------------------------------*/ function $() { var elements = new Array(); for (var i = 0; i < arguments.length; i++) { var element = arguments[i]; if (typeof element == 'string') element = document.getElementById(element); if (arguments.length == 1) return element; elements.push(element); } return elements; } if (!Array.prototype.push) { Array.prototype.push = function() { var startLength = this.length; for (var i = 0; i < arguments.length; i++) this[startLength + i] = arguments[i]; return this.length; } } if (!Function.prototype.apply) { // Based on code from http://www.youngpup.net/ Function.prototype.apply = function(object, parameters) { var parameterStrings = new Array(); if (!object) object = window; if (!parameters) parameters = new Array(); for (var i = 0; i < parameters.length; i++) parameterStrings[i] = 'parameters[' + i + ']'; object.__apply__ = this; var result = eval('object.__apply__(' + parameterStrings.join(', ') + ')'); object.__apply__ = null; return result; } } Object.extend(String.prototype, { stripTags: function() { return this.replace(/<\/?[^>]+>/gi, ''); }, escapeHTML: function() { var div = document.createElement('div'); var text = document.createTextNode(this); div.appendChild(text); return div.innerHTML; }, unescapeHTML: function() { var div = document.createElement('div'); div.innerHTML = this.stripTags(); return div.childNodes[0].nodeValue; }, parseQuery: function() { var str = this; if (str.substring(0,1) == '?') { str = this.substring(1); } var result = {}; var pairs = str.split('&'); for (var i = 0; i < pairs.length; i++) { var pair = pairs[i].split('='); result[pair[0]] = pair[1]; } return result; } }); var _break = new Object(); var 
_continue = new Object(); var Enumerable = { each: function(iterator) { var index = 0; try { this._each(function(value) { try { iterator(value, index++); } catch (e) { if (e != _continue) throw e; } }); } catch (e) { if (e != _break) throw e; } }, all: function(iterator) { var result = true; this.each(function(value, index) { if (!(result &= (iterator || Prototype.K)(value, index))) throw _break; }); return result; }, any: function(iterator) { var result = true; this.each(function(value, index) { if (result &= (iterator || Prototype.K)(value, index)) throw _break; }); return result; }, collect: function(iterator) { var results = []; this.each(function(value, index) { results.push(iterator(value, index)); }); return results; }, detect: function (iterator) { var result; this.each(function(value, index) { if (iterator(value, index)) { result = value; throw _break; } }); return result; }, findAll: function(iterator) { var results = []; this.each(function(value, index) { if (iterator(value, index)) results.push(value); }); return results; }, grep: function(pattern, iterator) { var results = []; this.each(function(value, index) { var stringValue = value.toString(); if (stringValue.match(pattern)) results.push((iterator || Prototype.K)(value, index)); }) return results; }, include: function(object) { var found = false; this.each(function(value) { if (value == object) { found = true; throw _break; } }); return found; }, inject: function(memo, iterator) { this.each(function(value, index) { memo = iterator(memo, value, index); }); return memo; }, invoke: function(method) { var args = $A(arguments).slice(1); return this.collect(function(value) { return value[method].apply(value, args); }); }, max: function(iterator) { var result; this.each(function(value, index) { value = (iterator || Prototype.K)(value, index); if (value >= (result || value)) result = value; }); return result; }, min: function(iterator) { var result; this.each(function(value, index) { value = (iterator || Prototype.K)(value, index); if (value <= (result || value)) result = value; }); return result; }, partition: function(iterator) { var trues = [], falses = []; this.each(function(value, index) { ((iterator || Prototype.K)(value, index) ? trues : falses).push(value); }); return [trues, falses]; }, pluck: function(property) { var results = []; this.each(function(value, index) { results.push(value[property]); }); return results; }, reject: function(iterator) { var results = []; this.each(function(value, index) { if (!iterator(value, index)) results.push(value); }); return results; }, sortBy: function(iterator) { return this.collect(function(value, index) { return {value: value, criteria: iterator(value, index)}; }).sort(function(left, right) { var a = left.criteria, b = right.criteria; return a < b ? -1 : a > b ? 
1 : 0; }).pluck('value'); }, toArray: function() { return this.collect(Prototype.K); }, zip: function() { var iterator = Prototype.K, args = $A(arguments); if (typeof args.last() == 'function') iterator = args.pop(); var collections = [this].concat(args).map($A); return this.map(function(value, index) { iterator(value = collections.pluck(index)); return value; }); } } Object.extend(Enumerable, { map: Enumerable.collect, find: Enumerable.detect, select: Enumerable.findAll, member: Enumerable.include, entries: Enumerable.toArray }); $A = Array.from = function(iterable) { var results = []; for (var i = 0; i < iterable.length; i++) results.push(iterable[i]); return results; } Object.extend(Array.prototype, { _each: function(iterator) { for (var i = 0; i < this.length; i++) iterator(this[i]); }, first: function() { return this[0]; }, last: function() { return this[this.length - 1]; } }); Object.extend(Array.prototype, Enumerable); var Ajax = { getTransport: function() { return Try.these( function() {return new ActiveXObject('Msxml2.XMLHTTP')}, function() {return new ActiveXObject('Microsoft.XMLHTTP')}, function() {return new XMLHttpRequest()} ) || false; } } Ajax.Base = function() {}; Ajax.Base.prototype = { setOptions: function(options) { this.options = { method: 'post', asynchronous: true, parameters: '' } Object.extend(this.options, options || {}); }, responseIsSuccess: function() { return this.transport.status == undefined || this.transport.status == 0 || (this.transport.status >= 200 && this.transport.status < 300); }, responseIsFailure: function() { return !this.responseIsSuccess(); } } Ajax.Request = Class.create(); Ajax.Request.Events = ['Uninitialized', 'Loading', 'Loaded', 'Interactive', 'Complete']; Ajax.Request.prototype = Object.extend(new Ajax.Base(), { initialize: function(url, options) { this.transport = Ajax.getTransport(); this.setOptions(options); this.request(url); }, request: function(url) { var parameters = this.options.parameters || ''; if (parameters.length > 0) parameters += '&_='; try { if (this.options.method == 'get') url += '?' + parameters; this.transport.open(this.options.method, url, this.options.asynchronous); if (this.options.asynchronous) { this.transport.onreadystatechange = this.onStateChange.bind(this); setTimeout((function() {this.respondToReadyState(1)}).bind(this), 10); } this.setRequestHeaders(); var body = this.options.postBody ? this.options.postBody : parameters; this.transport.send(this.options.method == 'post' ? body : null); } catch (e) { } }, setRequestHeaders: function() { var requestHeaders = ['X-Requested-With', 'XMLHttpRequest', 'X-Prototype-Version', Prototype.Version]; if (this.options.method == 'post') { requestHeaders.push('Content-type', 'application/x-www-form-urlencoded'); /* Force "Connection: close" for Mozilla browsers to work around * a bug where XMLHttpReqeuest sends an incorrect Content-length * header. See Mozilla Bugzilla #246651. 
*/ if (this.transport.overrideMimeType) requestHeaders.push('Connection', 'close'); } if (this.options.requestHeaders) requestHeaders.push.apply(requestHeaders, this.options.requestHeaders); for (var i = 0; i < requestHeaders.length; i += 2) this.transport.setRequestHeader(requestHeaders[i], requestHeaders[i+1]); }, onStateChange: function() { var readyState = this.transport.readyState; if (readyState != 1) this.respondToReadyState(this.transport.readyState); }, respondToReadyState: function(readyState) { var event = Ajax.Request.Events[readyState]; if (event == 'Complete') (this.options['on' + this.transport.status] || this.options['on' + (this.responseIsSuccess() ? 'Success' : 'Failure')] || Prototype.emptyFunction)(this.transport); (this.options['on' + event] || Prototype.emptyFunction)(this.transport); /* Avoid memory leak in MSIE: clean up the oncomplete event handler */ if (event == 'Complete') this.transport.onreadystatechange = Prototype.emptyFunction; } }); Ajax.Updater = Class.create(); Ajax.Updater.ScriptFragment = '(?:)((\n|.)*?)(?:<\/script>)'; Object.extend(Object.extend(Ajax.Updater.prototype, Ajax.Request.prototype), { initialize: function(container, url, options) { this.containers = { success: container.success ? $(container.success) : $(container), failure: container.failure ? $(container.failure) : (container.success ? null : $(container)) } this.transport = Ajax.getTransport(); this.setOptions(options); var onComplete = this.options.onComplete || Prototype.emptyFunction; this.options.onComplete = (function() { this.updateContent(); onComplete(this.transport); }).bind(this); this.request(url); }, updateContent: function() { var receiver = this.responseIsSuccess() ? this.containers.success : this.containers.failure; var match = new RegExp(Ajax.Updater.ScriptFragment, 'img'); var response = this.transport.responseText.replace(match, ''); var scripts = this.transport.responseText.match(match); if (receiver) { if (this.options.insertion) { new this.options.insertion(receiver, response); } else { receiver.innerHTML = response; } } if (this.responseIsSuccess()) { if (this.onComplete) setTimeout((function() {this.onComplete( this.transport)}).bind(this), 10); } if (this.options.evalScripts && scripts) { match = new RegExp(Ajax.Updater.ScriptFragment, 'im'); setTimeout((function() { for (var i = 0; i < scripts.length; i++) eval(scripts[i].match(match)[1]); }).bind(this), 10); } } }); Ajax.PeriodicalUpdater = Class.create(); Ajax.PeriodicalUpdater.prototype = Object.extend(new Ajax.Base(), { initialize: function(container, url, options) { this.setOptions(options); this.onComplete = this.options.onComplete; this.frequency = (this.options.frequency || 2); this.decay = 1; this.updater = {}; this.container = container; this.url = url; this.start(); }, start: function() { this.options.onComplete = this.updateComplete.bind(this); this.onTimerEvent(); }, stop: function() { this.updater.onComplete = undefined; clearTimeout(this.timer); (this.onComplete || Ajax.emptyFunction).apply(this, arguments); }, updateComplete: function(request) { if (this.options.decay) { this.decay = (request.responseText == this.lastText ? 
this.decay * this.options.decay : 1); this.lastText = request.responseText; } this.timer = setTimeout(this.onTimerEvent.bind(this), this.decay * this.frequency * 1000); }, onTimerEvent: function() { this.updater = new Ajax.Updater(this.container, this.url, this.options); } }); document.getElementsByClassName = function(className) { var children = document.getElementsByTagName('*') || document.all; var elements = new Array(); for (var i = 0; i < children.length; i++) { var child = children[i]; var classNames = child.className.split(' '); for (var j = 0; j < classNames.length; j++) { if (classNames[j] == className) { elements.push(child); break; } } } return elements; } /*--------------------------------------------------------------------------*/ if (!window.Element) { var Element = new Object(); } Object.extend(Element, { toggle: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = (element.style.display == 'none' ? '' : 'none'); } }, hide: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = 'none'; } }, show: function() { for (var i = 0; i < arguments.length; i++) { var element = $(arguments[i]); element.style.display = ''; } }, remove: function(element) { element = $(element); element.parentNode.removeChild(element); }, getHeight: function(element) { element = $(element); return element.offsetHeight; }, hasClassName: function(element, className) { element = $(element); if (!element) return; var a = element.className.split(' '); for (var i = 0; i < a.length; i++) { if (a[i] == className) return true; } return false; }, addClassName: function(element, className) { element = $(element); Element.removeClassName(element, className); element.className += ' ' + className; }, removeClassName: function(element, className) { element = $(element); if (!element) return; var newClassName = ''; var a = element.className.split(' '); for (var i = 0; i < a.length; i++) { if (a[i] != className) { if (i > 0) newClassName += ' '; newClassName += a[i]; } } element.className = newClassName; }, // removes whitespace-only text node children cleanWhitespace: function(element) { var element = $(element); for (var i = 0; i < element.childNodes.length; i++) { var node = element.childNodes[i]; if (node.nodeType == 3 && !/\S/.test(node.nodeValue)) Element.remove(node); } } }); var Toggle = new Object(); Toggle.display = Element.toggle; /*--------------------------------------------------------------------------*/ Abstract.Insertion = function(adjacency) { this.adjacency = adjacency; } Abstract.Insertion.prototype = { initialize: function(element, content) { this.element = $(element); this.content = content; if (this.adjacency && this.element.insertAdjacentHTML) { this.element.insertAdjacentHTML(this.adjacency, this.content); } else { this.range = this.element.ownerDocument.createRange(); if (this.initializeRange) this.initializeRange(); this.fragment = this.range.createContextualFragment(this.content); this.insertContent(); } } } var Insertion = new Object(); Insertion.Before = Class.create(); Insertion.Before.prototype = Object.extend(new Abstract.Insertion('beforeBegin'), { initializeRange: function() { this.range.setStartBefore(this.element); }, insertContent: function() { this.element.parentNode.insertBefore(this.fragment, this.element); } }); Insertion.Top = Class.create(); Insertion.Top.prototype = Object.extend(new Abstract.Insertion('afterBegin'), { initializeRange: function() { 
this.range.selectNodeContents(this.element); this.range.collapse(true); }, insertContent: function() { this.element.insertBefore(this.fragment, this.element.firstChild); } }); Insertion.Bottom = Class.create(); Insertion.Bottom.prototype = Object.extend(new Abstract.Insertion('beforeEnd'), { initializeRange: function() { this.range.selectNodeContents(this.element); this.range.collapse(this.element); }, insertContent: function() { this.element.appendChild(this.fragment); } }); Insertion.After = Class.create(); Insertion.After.prototype = Object.extend(new Abstract.Insertion('afterEnd'), { initializeRange: function() { this.range.setStartAfter(this.element); }, insertContent: function() { this.element.parentNode.insertBefore(this.fragment, this.element.nextSibling); } }); var Field = { clear: function() { for (var i = 0; i < arguments.length; i++) $(arguments[i]).value = ''; }, focus: function(element) { $(element).focus(); }, present: function() { for (var i = 0; i < arguments.length; i++) if ($(arguments[i]).value == '') return false; return true; }, select: function(element) { $(element).select(); }, activate: function(element) { $(element).focus(); $(element).select(); } } /*--------------------------------------------------------------------------*/ var Form = { serialize: function(form) { var elements = Form.getElements($(form)); var queryComponents = new Array(); for (var i = 0; i < elements.length; i++) { var queryComponent = Form.Element.serialize(elements[i]); if (queryComponent) queryComponents.push(queryComponent); } return queryComponents.join('&'); }, getElements: function(form) { var form = $(form); var elements = new Array(); for (tagName in Form.Element.Serializers) { var tagElements = form.getElementsByTagName(tagName); for (var j = 0; j < tagElements.length; j++) elements.push(tagElements[j]); } return elements; }, getInputs: function(form, typeName, name) { var form = $(form); var inputs = form.getElementsByTagName('input'); if (!typeName && !name) return inputs; var matchingInputs = new Array(); for (var i = 0; i < inputs.length; i++) { var input = inputs[i]; if ((typeName && input.type != typeName) || (name && input.name != name)) continue; matchingInputs.push(input); } return matchingInputs; }, disable: function(form) { var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; element.blur(); element.disabled = 'true'; } }, enable: function(form) { var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; element.disabled = ''; } }, focusFirstElement: function(form) { var form = $(form); var elements = Form.getElements(form); for (var i = 0; i < elements.length; i++) { var element = elements[i]; if (element.type != 'hidden' && !element.disabled) { Field.activate(element); break; } } }, reset: function(form) { $(form).reset(); } } Form.Element = { serialize: function(element) { var element = $(element); var method = element.tagName.toLowerCase(); var parameter = Form.Element.Serializers[method](element); if (parameter) return encodeURIComponent(parameter[0]) + '=' + encodeURIComponent(parameter[1]); }, getValue: function(element) { var element = $(element); var method = element.tagName.toLowerCase(); var parameter = Form.Element.Serializers[method](element); if (parameter) return parameter[1]; } } Form.Element.Serializers = { input: function(element) { switch (element.type.toLowerCase()) { case 'submit': case 'hidden': case 'password': case 'text': return 
Form.Element.Serializers.textarea(element); case 'checkbox': case 'radio': return Form.Element.Serializers.inputSelector(element); } return false; }, inputSelector: function(element) { if (element.checked) return [element.name, element.value]; }, textarea: function(element) { return [element.name, element.value]; }, select: function(element) { var value = ''; if (element.type == 'select-one') { var index = element.selectedIndex; if (index >= 0) value = element.options[index].value || element.options[index].text; } else { value = new Array(); for (var i = 0; i < element.length; i++) { var opt = element.options[i]; if (opt.selected) value.push(opt.value || opt.text); } } return [element.name, value]; } } /*--------------------------------------------------------------------------*/ var $F = Form.Element.getValue; /*--------------------------------------------------------------------------*/ Abstract.TimedObserver = function() {} Abstract.TimedObserver.prototype = { initialize: function(element, frequency, callback) { this.frequency = frequency; this.element = $(element); this.callback = callback; this.lastValue = this.getValue(); this.registerCallback(); }, registerCallback: function() { setInterval(this.onTimerEvent.bind(this), this.frequency * 1000); }, onTimerEvent: function() { var value = this.getValue(); if (this.lastValue != value) { this.callback(this.element, value); this.lastValue = value; } } } Form.Element.Observer = Class.create(); Form.Element.Observer.prototype = Object.extend(new Abstract.TimedObserver(), { getValue: function() { return Form.Element.getValue(this.element); } }); Form.Observer = Class.create(); Form.Observer.prototype = Object.extend(new Abstract.TimedObserver(), { getValue: function() { return Form.serialize(this.element); } }); /*--------------------------------------------------------------------------*/ Abstract.EventObserver = function() {} Abstract.EventObserver.prototype = { initialize: function(element, callback) { this.element = $(element); this.callback = callback; this.lastValue = this.getValue(); if (this.element.tagName.toLowerCase() == 'form') this.registerFormCallbacks(); else this.registerCallback(this.element); }, onElementEvent: function() { var value = this.getValue(); if (this.lastValue != value) { this.callback(this.element, value); this.lastValue = value; } }, registerFormCallbacks: function() { var elements = Form.getElements(this.element); for (var i = 0; i < elements.length; i++) this.registerCallback(elements[i]); }, registerCallback: function(element) { if (element.type) { switch (element.type.toLowerCase()) { case 'checkbox': case 'radio': element.target = this; element.prev_onclick = element.onclick || Prototype.emptyFunction; element.onclick = function() { this.prev_onclick(); this.target.onElementEvent(); } break; case 'password': case 'text': case 'textarea': case 'select-one': case 'select-multiple': element.target = this; element.prev_onchange = element.onchange || Prototype.emptyFunction; element.onchange = function() { this.prev_onchange(); this.target.onElementEvent(); } break; } } } } Form.Element.EventObserver = Class.create(); Form.Element.EventObserver.prototype = Object.extend(new Abstract.EventObserver(), { getValue: function() { return Form.Element.getValue(this.element); } }); Form.EventObserver = Class.create(); Form.EventObserver.prototype = Object.extend(new Abstract.EventObserver(), { getValue: function() { return Form.serialize(this.element); } }); if (!window.Event) { var Event = new Object(); } 
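// Illustrative usage sketch -- not part of the original prototype.js source.
// It shows how the helpers defined above (the $() selector, Form.serialize
// and Ajax.Updater) are typically combined. The ids 'searchForm' and
// 'results' and the URL 'search.jsp' are hypothetical names chosen for this
// example only; nothing below is invoked automatically.
function exampleSubmitSearchForm() {
  // Collect all input/textarea/select values of the form into a query string.
  var params = Form.serialize($('searchForm'));
  // POST the parameters and replace the contents of the 'results' element
  // with the response body; onFailure runs when the response status is not
  // a success status.
  new Ajax.Updater('results', 'search.jsp', {
    method: 'post',
    parameters: params,
    onFailure: function(transport) {
      $('results').innerHTML = 'Request failed: HTTP ' + transport.status;
    }
  });
}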
Object.extend(Event, { KEY_BACKSPACE: 8, KEY_TAB: 9, KEY_RETURN: 13, KEY_ESC: 27, KEY_LEFT: 37, KEY_UP: 38, KEY_RIGHT: 39, KEY_DOWN: 40, KEY_DELETE: 46, element: function(event) { return event.target || event.srcElement; }, isLeftClick: function(event) { return (((event.which) && (event.which == 1)) || ((event.button) && (event.button == 1))); }, pointerX: function(event) { return event.pageX || (event.clientX + (document.documentElement.scrollLeft || document.body.scrollLeft)); }, pointerY: function(event) { return event.pageY || (event.clientY + (document.documentElement.scrollTop || document.body.scrollTop)); }, stop: function(event) { if (event.preventDefault) { event.preventDefault(); event.stopPropagation(); } else { event.returnValue = false; } }, // find the first node with the given tagName, starting from the // node the event was triggered on; traverses the DOM upwards findElement: function(event, tagName) { var element = Event.element(event); while (element.parentNode && (!element.tagName || (element.tagName.toUpperCase() != tagName.toUpperCase()))) element = element.parentNode; return element; }, observers: false, _observeAndCache: function(element, name, observer, useCapture) { if (!this.observers) this.observers = []; if (element.addEventListener) { this.observers.push([element, name, observer, useCapture]); element.addEventListener(name, observer, useCapture); } else if (element.attachEvent) { this.observers.push([element, name, observer, useCapture]); element.attachEvent('on' + name, observer); } }, unloadCache: function() { if (!Event.observers) return; for (var i = 0; i < Event.observers.length; i++) { Event.stopObserving.apply(this, Event.observers[i]); Event.observers[i][0] = null; } Event.observers = false; }, observe: function(element, name, observer, useCapture) { var element = $(element); useCapture = useCapture || false; if (name == 'keypress' && ((/Konqueror|Safari|KHTML/.test(navigator.userAgent)) || element.attachEvent)) name = 'keydown'; this._observeAndCache(element, name, observer, useCapture); }, stopObserving: function(element, name, observer, useCapture) { var element = $(element); useCapture = useCapture || false; if (name == 'keypress' && ((/Konqueror|Safari|KHTML/.test(navigator.userAgent)) || element.detachEvent)) name = 'keydown'; if (element.removeEventListener) { element.removeEventListener(name, observer, useCapture); } else if (element.detachEvent) { element.detachEvent('on' + name, observer); } } }); /* prevent memory leaks in IE */ Event.observe(window, 'unload', Event.unloadCache, false); var Position = { // set to true if needed, warning: firefox performance problems // NOT neeeded for page scrolling, only if draggable contained in // scrollable elements includeScrollOffsets: false, // must be called before calling withinIncludingScrolloffset, every time the // page is scrolled prepare: function() { this.deltaX = window.pageXOffset || document.documentElement.scrollLeft || document.body.scrollLeft || 0; this.deltaY = window.pageYOffset || document.documentElement.scrollTop || document.body.scrollTop || 0; }, realOffset: function(element) { var valueT = 0, valueL = 0; do { valueT += element.scrollTop || 0; valueL += element.scrollLeft || 0; element = element.parentNode; } while (element); return [valueL, valueT]; }, cumulativeOffset: function(element) { var valueT = 0, valueL = 0; do { valueT += element.offsetTop || 0; valueL += element.offsetLeft || 0; element = element.offsetParent; } while (element); return [valueL, valueT]; }, // caches x/y 
coordinate pair to use with overlap within: function(element, x, y) { if (this.includeScrollOffsets) return this.withinIncludingScrolloffsets(element, x, y); this.xcomp = x; this.ycomp = y; this.offset = this.cumulativeOffset(element); return (y >= this.offset[1] && y < this.offset[1] + element.offsetHeight && x >= this.offset[0] && x < this.offset[0] + element.offsetWidth); }, withinIncludingScrolloffsets: function(element, x, y) { var offsetcache = this.realOffset(element); this.xcomp = x + offsetcache[0] - this.deltaX; this.ycomp = y + offsetcache[1] - this.deltaY; this.offset = this.cumulativeOffset(element); return (this.ycomp >= this.offset[1] && this.ycomp < this.offset[1] + element.offsetHeight && this.xcomp >= this.offset[0] && this.xcomp < this.offset[0] + element.offsetWidth); }, // within must be called directly before overlap: function(mode, element) { if (!mode) return 0; if (mode == 'vertical') return ((this.offset[1] + element.offsetHeight) - this.ycomp) / element.offsetHeight; if (mode == 'horizontal') return ((this.offset[0] + element.offsetWidth) - this.xcomp) / element.offsetWidth; }, clone: function(source, target) { source = $(source); target = $(target); target.style.position = 'absolute'; var offsets = this.cumulativeOffset(source); target.style.top = offsets[1] + 'px'; target.style.left = offsets[0] + 'px'; target.style.width = source.offsetWidth + 'px'; target.style.height = source.offsetHeight + 'px'; } } lucene-2.9.4/src/site/src/documentation/skins/common/scripts/breadcrumbs-optimized.js0000644000175000017500000000564511474320232031615 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ var PREPREND_CRUMBS=new Array(); var link1="@skinconfig.trail.link1.name@"; var link2="@skinconfig.trail.link2.name@"; var link3="@skinconfig.trail.link3.name@"; if(!(link1=="")&&!link1.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link1, @skinconfig.trail.link1.href@ ) ); } if(!(link2=="")&&!link2.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link2, @skinconfig.trail.link2.href@ ) ); } if(!(link3=="")&&!link3.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link3, @skinconfig.trail.link3.href@ ) ); } var DISPLAY_SEPARATOR=" > "; var DISPLAY_PREPREND=" > "; var DISPLAY_POSTPREND=":"; var CSS_CLASS_CRUMB="breadcrumb"; var CSS_CLASS_TRAIL="breadcrumbTrail"; var CSS_CLASS_SEPARATOR="crumbSeparator"; var FILE_EXTENSIONS=new Array( ".html", ".htm", ".jsp", ".php", ".php3", ".php4" ); var PATH_SEPARATOR="/"; function sc(s) { var l=s.toLowerCase(); return l.substr(0,1).toUpperCase()+l.substr(1); } function getdirs() { var t=document.location.pathname.split(PATH_SEPARATOR); var lc=t[t.length-1]; for(var i=0;i < FILE_EXTENSIONS.length;i++) { if(lc.indexOf(FILE_EXTENSIONS[i])) return t.slice(1,t.length-1); } return t.slice(1,t.length); } function getcrumbs( d ) { var pre = "/"; var post = "/"; var c = new Array(); if( d != null ) { for(var i=0;i < d.length;i++) { pre+=d[i]+postfix; c.push(new Array(d[i],pre)); } } if(PREPREND_CRUMBS.length > 0 ) return PREPREND_CRUMBS.concat( c ); return c; } function gettrail( c ) { var h=DISPLAY_PREPREND; for(var i=0;i < c.length;i++) { h+=''+sc(c[i][0])+''; if(i!=(c.length-1)) h+=DISPLAY_SEPARATOR; } return h+DISPLAY_POSTPREND; } function gettrailXHTML( c ) { var h=''+DISPLAY_PREPREND; for(var i=0;i < c.length;i++) { h+=''+sc(c[i][0])+''; if(i!=(c.length-1)) h+=''+DISPLAY_SEPARATOR+''; } return h+DISPLAY_POSTPREND+''; } if(document.location.href.toLowerCase().indexOf("http://")==-1) document.write(gettrail(getcrumbs())); else document.write(gettrail(getcrumbs(getdirs()))); lucene-2.9.4/src/site/src/documentation/skins/common/scripts/breadcrumbs.js0000644000175000017500000001467511474320232027616 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * This script, when included in a html file, builds a neat breadcrumb trail * based on its url. That is, if it doesn't contains bugs (I'm relatively * sure it does). * * Typical usage: * */ /** * IE 5 on Mac doesn't know Array.push. * * Implement it - courtesy to fritz. */ var abc = new Array(); if (!abc.push) { Array.prototype.push = function(what){this[this.length]=what} } /* ======================================================================== CONSTANTS ======================================================================== */ /** * Two-dimensional array containing extra crumbs to place at the front of * the trail. 
Specify first the name of the crumb, then the URI that belongs * to it. You'll need to modify this for every domain or subdomain where * you use this script (you can leave it as an empty array if you wish) */ var PREPREND_CRUMBS = new Array(); var link1 = "@skinconfig.trail.link1.name@"; var link2 = "@skinconfig.trail.link2.name@"; var link3 = "@skinconfig.trail.link3.name@"; var href1 = "@skinconfig.trail.link1.href@"; var href2 = "@skinconfig.trail.link2.href@"; var href3 = "@skinconfig.trail.link3.href@"; if(!(link1=="")&&!link1.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link1, href1 ) ); } if(!(link2=="")&&!link2.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link2, href2 ) ); } if(!(link3=="")&&!link3.indexOf( "@" ) == 0){ PREPREND_CRUMBS.push( new Array( link3, href3 ) ); } /** * String to include between crumbs: */ var DISPLAY_SEPARATOR = " > "; /** * String to include at the beginning of the trail */ var DISPLAY_PREPREND = " > "; /** * String to include at the end of the trail */ var DISPLAY_POSTPREND = ""; /** * CSS Class to use for a single crumb: */ var CSS_CLASS_CRUMB = "breadcrumb"; /** * CSS Class to use for the complete trail: */ var CSS_CLASS_TRAIL = "breadcrumbTrail"; /** * CSS Class to use for crumb separator: */ var CSS_CLASS_SEPARATOR = "crumbSeparator"; /** * Array of strings containing common file extensions. We use this to * determine what part of the url to ignore (if it contains one of the * string specified here, we ignore it). */ var FILE_EXTENSIONS = new Array( ".html", ".htm", ".jsp", ".php", ".php3", ".php4" ); /** * String that separates parts of the breadcrumb trail from each other. * When this is no longer a slash, I'm sure I'll be old and grey. */ var PATH_SEPARATOR = "/"; /* ======================================================================== UTILITY FUNCTIONS ======================================================================== */ /** * Capitalize first letter of the provided string and return the modified * string. */ function sentenceCase( string ) { return string; //var lower = string.toLowerCase(); //return lower.substr(0,1).toUpperCase() + lower.substr(1); } /** * Returns an array containing the names of all the directories in the * current document URL */ function getDirectoriesInURL() { var trail = document.location.pathname.split( PATH_SEPARATOR ); // check whether last section is a file or a directory var lastcrumb = trail[trail.length-1]; for( var i = 0; i < FILE_EXTENSIONS.length; i++ ) { if( lastcrumb.indexOf( FILE_EXTENSIONS[i] ) ) { // it is, remove it and send results return trail.slice( 1, trail.length-1 ); } } // it's not; send the trail unmodified return trail.slice( 1, trail.length ); } /* ======================================================================== BREADCRUMB FUNCTIONALITY ======================================================================== */ /** * Return a two-dimensional array describing the breadcrumbs based on the * array of directories passed in. */ function getBreadcrumbs( dirs ) { var prefix = "/"; var postfix = "/"; // the array we will return var crumbs = new Array(); if( dirs != null ) { for( var i = 0; i < dirs.length; i++ ) { prefix += dirs[i] + postfix; crumbs.push( new Array( dirs[i], prefix ) ); } } // preprend the PREPREND_CRUMBS if(PREPREND_CRUMBS.length > 0 ) { return PREPREND_CRUMBS.concat( crumbs ); } return crumbs; } /** * Return a string containing a simple text breadcrumb trail based on the * two-dimensional array passed in. 
*/ function getCrumbTrail( crumbs ) { var xhtml = DISPLAY_PREPREND; for( var i = 0; i < crumbs.length; i++ ) { xhtml += ''; xhtml += unescape( crumbs[i][0] ) + ''; if( i != (crumbs.length-1) ) { xhtml += DISPLAY_SEPARATOR; } } xhtml += DISPLAY_POSTPREND; return xhtml; } /** * Return a string containing an XHTML breadcrumb trail based on the * two-dimensional array passed in. */ function getCrumbTrailXHTML( crumbs ) { var xhtml = ''; xhtml += DISPLAY_PREPREND; for( var i = 0; i < crumbs.length; i++ ) { xhtml += ''; xhtml += unescape( crumbs[i][0] ) + ''; if( i != (crumbs.length-1) ) { xhtml += '' + DISPLAY_SEPARATOR + ''; } } xhtml += DISPLAY_POSTPREND; xhtml += ''; return xhtml; } /* ======================================================================== PRINT BREADCRUMB TRAIL ======================================================================== */ // check if we're local; if so, only print the PREPREND_CRUMBS if( document.location.href.toLowerCase().indexOf( "http://" ) == -1 ) { document.write( getCrumbTrail( getBreadcrumbs() ) ); } else { document.write( getCrumbTrail( getBreadcrumbs( getDirectoriesInURL() ) ) ); } lucene-2.9.4/src/site/src/documentation/skins/common/scripts/menu.js0000644000175000017500000000322311474320232026254 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * This script, when included in a html file, can be used to make collapsible menus * * Typical usage: * */ if (document.getElementById){ document.write('') } function SwitchMenu(obj) { if(document.getElementById) { var el = document.getElementById(obj); var title = document.getElementById(obj+'Title'); if(obj.indexOf("_selected_")==0&&el.style.display == ""){ el.style.display = "block"; title.className = "pagegroupselected"; } if(el.style.display != "block"){ el.style.display = "block"; title.className = "pagegroupopen"; } else{ el.style.display = "none"; title.className = "pagegroup"; } }// end - if(document.getElementById) }//end - function SwitchMenu(obj) lucene-2.9.4/src/site/src/documentation/skins/common/scripts/fontsize.js0000644000175000017500000000605011474320232027152 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ function init() { //embedded in the doc //ndeSetTextSize(); } function checkBrowser(){ if (!document.getElementsByTagName){ return true; } else{ return false; } } function ndeSetTextSize(chgsize,rs) { var startSize; var newSize; if (!checkBrowser) { return; } startSize = parseInt(ndeGetDocTextSize()); if (!startSize) { startSize = 16; } switch (chgsize) { case 'incr': newSize = startSize + 2; break; case 'decr': newSize = startSize - 2; break; case 'reset': if (rs) {newSize = rs;} else {newSize = 16;} break; default: try{ newSize = parseInt(ndeReadCookie("nde-textsize")); } catch(e){ alert(e); } if (!newSize || newSize == 'NaN') { newSize = startSize; } break; } if (newSize < 10) { newSize = 10; } newSize += 'px'; document.getElementsByTagName('html')[0].style.fontSize = newSize; document.getElementsByTagName('body')[0].style.fontSize = newSize; ndeCreateCookie("nde-textsize", newSize, 365); } function ndeGetDocTextSize() { if (!checkBrowser) { return 0; } var size = 0; var body = document.getElementsByTagName('body')[0]; if (body.style && body.style.fontSize) { size = body.style.fontSize; } else if (typeof(getComputedStyle) != 'undefined') { size = getComputedStyle(body,'').getPropertyValue('font-size'); } else if (body.currentStyle) { size = body.currentStyle.fontSize; } //fix IE bug if( isNaN(size)){ if(size.substring(size.length-1)=="%"){ return } } return size; } function ndeCreateCookie(name,value,days) { var cookie = name + "=" + value + ";"; if (days) { var date = new Date(); date.setTime(date.getTime()+(days*24*60*60*1000)); cookie += " expires=" + date.toGMTString() + ";"; } cookie += " path=/"; document.cookie = cookie; } function ndeReadCookie(name) { var nameEQ = name + "="; var ca = document.cookie.split(';'); for(var i = 0; i < ca.length; i++) { var c = ca[i]; while (c.charAt(0) == ' ') { c = c.substring(1, c.length); } ctest = c.substring(0,name.length); if(ctest == name){ return c.substring(nameEQ.length,c.length); } } return null; } lucene-2.9.4/src/site/src/documentation/skins/common/scripts/getMenu.js0000644000175000017500000000317411474320232026721 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * This script, when included in a html file, can be used to make collapsible menus * * Typical usage: * */ if (document.getElementById){ document.write('') } function SwitchMenu(obj, thePath) { var open = 'url("'+thePath + 'images/chapter_open.gif")'; var close = 'url("'+thePath + 'images/chapter.gif")'; if(document.getElementById) { var el = document.getElementById(obj); var title = document.getElementById(obj+'Title'); if(el.style.display != "block"){ title.style.backgroundImage = open; el.style.display = "block"; }else{ title.style.backgroundImage = close; el.style.display = "none"; } }// end - if(document.getElementById) }//end - function SwitchMenu(obj) lucene-2.9.4/src/site/src/documentation/skins/common/translations/0000755000175000017500000000000011554106562026013 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/translations/CommonMessages_en_US.xml0000644000175000017500000000206011474320232032535 0ustar janpascaljanpascal Font size: Last Published: Search Search site with lucene-2.9.4/src/site/src/documentation/skins/common/translations/CommonMessages_es.xml0000644000175000017500000000206511474320232032140 0ustar janpascaljanpascal Tamaño del texto: Fecha de publicación: Buscar Buscar en lucene-2.9.4/src/site/src/documentation/skins/common/translations/CommonMessages_de.xml0000644000175000017500000000210211474320232032111 0ustar janpascaljanpascal Schriftgrösse: Zuletzt veröffentlicht: Suche: Suche auf der Seite mit lucene-2.9.4/src/site/src/documentation/skins/common/translations/CommonMessages_fr.xml0000644000175000017500000000210311474320232032131 0ustar janpascaljanpascal Taille : Dernière publication : Rechercher Rechercher sur le site avec lucene-2.9.4/src/site/src/documentation/skins/common/css/0000755000175000017500000000000011554106562024062 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/css/forrest.css.xslt0000644000175000017500000000572111474320232027247 0ustar janpascaljanpascal /* ==================== aural ============================ */ @media aural { h1, h2, h3, h4, h5, h6 { voice-family: paul, male; stress: 20; richness: 90 } h1 { pitch: x-low; pitch-range: 90 } h2 { pitch: x-low; pitch-range: 80 } h3 { pitch: low; pitch-range: 70 } h4 { pitch: medium; pitch-range: 60 } h5 { pitch: medium; pitch-range: 50 } h6 { pitch: medium; pitch-range: 40 } li, dt, dd { pitch: medium; richness: 60 } dt { stress: 80 } pre, code, tt { pitch: medium; pitch-range: 0; stress: 0; richness: 80 } em { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } strong { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } dfn { pitch: high; pitch-range: 60; stress: 60 } s, strike { richness: 0 } i { pitch: medium; pitch-range: 60; stress: 60; richness: 50 } b { pitch: medium; pitch-range: 60; stress: 90; richness: 90 } u { richness: 0 } :link { voice-family: harry, male } :visited { voice-family: betty, female } :active { voice-family: betty, female; pitch-range: 80; pitch: x-high } } a.external { padding: 0 20px 0px 0px; display:inline; background-repeat: no-repeat; background-position: center right; background-image: url(images/external-link.gif); } /* extra-css */ lucene-2.9.4/src/site/src/documentation/skins/common/images/0000755000175000017500000000000011554106562024537 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/images/txtdoc.svg.xslt0000644000175000017500000000447511474320232027561 0ustar janpascaljanpascal TXT 
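Several of the skin scripts earlier in this dump (getBlank.js, breadcrumbs.js, menu.js, getMenu.js) carry a "Typical usage:" comment whose example markup was stripped when the archive was flattened to text. The following is a hedged sketch, not taken from the Lucene skin itself, of how a page that has already loaded getMenu.js and fontsize.js might wire those functions up from script; the element ids ('menu_1', 'menu_1Title', 'fontsize-up') and the skin path '../skin/' are hypothetical.

// Assumes getMenu.js and fontsize.js are loaded and the page contains
// elements with ids "menu_1Title" (the clickable heading) and "menu_1"
// (the collapsible block).
function wireSkinControls() {
  var menuTitle = document.getElementById('menu_1Title');
  if (menuTitle) {
    // getMenu.js: the second argument is the path prefix used to resolve
    // the chapter.gif / chapter_open.gif background images.
    menuTitle.onclick = function() { SwitchMenu('menu_1', '../skin/'); };
  }
  var bigger = document.getElementById('fontsize-up');
  if (bigger) {
    // fontsize.js: grow the base font size by 2px and remember the choice
    // in the "nde-textsize" cookie.
    bigger.onclick = function() { ndeSetTextSize('incr'); return false; };
  }
}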
lucene-2.9.4/src/site/src/documentation/skins/common/images/page.gif0000644000175000017500000000011711474320232026132 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!þCreated with The GIMP!ù , ŒyÀ€ 26…S;lucene-2.9.4/src/site/src/documentation/skins/common/images/poddoc.svg.xslt0000644000175000017500000000447511474320232027524 0ustar janpascaljanpascal POD lucene-2.9.4/src/site/src/documentation/skins/common/images/rc.svg.xslt0000644000175000017500000000230211474320232026643 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/images/corner-imports.svg.xslt0000644000175000017500000000760011474320232031230 0ustar janpascaljanpascal 0 fill:; fill:; stroke:; 1 -1 1 -1 0 - 0 - lucene-2.9.4/src/site/src/documentation/skins/common/images/chapter.gif0000644000175000017500000000006111474320232026642 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù,Diì«T(;lucene-2.9.4/src/site/src/documentation/skins/common/images/dc.svg.xslt0000644000175000017500000000245711474320232026640 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/images/current.gif0000644000175000017500000000006611474320232026703 0ustar janpascaljanpascalGIF89a€¥¶Æÿÿÿ!ù, „¡k`›‰ÒÅ*;lucene-2.9.4/src/site/src/documentation/skins/common/images/README.txt0000644000175000017500000000010611474320232026223 0ustar janpascaljanpascalThe images in this directory are used if the current skin lacks them. lucene-2.9.4/src/site/src/documentation/skins/common/xslt/0000755000175000017500000000000011474320232024255 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/0000755000175000017500000000000011554106562025230 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/document-to-html.xsl0000644000175000017500000003056411474320232031161 0ustar janpascaljanpascal

    ; ^ 15 0 Note Warning Fixme () Notice: _top _blank codefrag by , version v Type:
    true # lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/split.xsl0000644000175000017500000000741011474320232027106 0ustar janpascaljanpascal 40 lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/book-to-menu.xsl0000644000175000017500000001217711474320232030275 0ustar janpascaljanpascal
  • lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/site-to-xhtml.xsl0000644000175000017500000004323111474320232030472 0ustar janpascaljanpascal > > Valid HTML 4.01! Valid HTML 4.01! Valid CSS! PDF
    TXT POD XML 2005 yyyy -
    lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/strip_namespaces.xsl0000644000175000017500000000314711474320232031316 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/renderlogo.xsl0000644000175000017500000000470111474320232030113 0ustar janpascaljanpascal {$name} lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/tabutils.xsl0000644000175000017500000000760311474320232027606 0ustar janpascaljanpascal / / lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/tab-to-menu.xsl0000644000175000017500000001675511474320232030117 0ustar janpascaljanpascal | |
    lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/pathutils.xsl0000644000175000017500000002041711474320232027772 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/xslt/html/dotdots.xsl0000644000175000017500000000546511474320232027443 0ustar janpascaljanpascal ../ lucene-2.9.4/src/site/src/documentation/skins/common/xslt/svg/0000755000175000017500000000000011554106562025063 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/xslt/svg/document-to-svg.xsl0000644000175000017500000000372511474320232030646 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/xslt/fo/0000755000175000017500000000000011554106562024670 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/common/xslt/fo/pdfoutline.xsl0000644000175000017500000000354411474320232027570 0ustar janpascaljanpascal lucene-2.9.4/src/site/src/documentation/skins/common/xslt/fo/document-to-fo.xsl0000644000175000017500000011132011474320232030247 0ustar janpascaljanpascal 841mm 594mm 420mm 297mm 210mm 148mm 7.25in 8.5in 11in 8.5in 8.5in 8.5in 11in 8.5in 1189mm 841mm 594mm 420mm 297mm 210mm 10.5in 13in 17in 14in 10.83in 17in 11in 1189mm 841mm 594mm 420mm 297mm 210mm 10.5in 13in 17in 14in 10.83in 17in 11in 841mm 594mm 420mm 297mm 210mm 148mm 7.25in 8.5in 11in 8.5in 8.5in 8.5in 11in 8.5in start end start NOTICE: 0 . pt   0 by , 6pt" 6pt Warning: Note: FIXME (): () in Table : Table of contents page [] page page lucene-2.9.4/src/site/src/documentation/skins/common/xslt/fo/footerinfo.xsl0000644000175000017500000000551611474320232027572 0ustar janpascaljanpascal Copyright ©   All rights reserved. lucene-2.9.4/src/site/src/documentation/skins/common/skinconf.xsl0000644000175000017500000001767011474320232025640 0ustar janpascaljanpascal true true true true true false false true .at. true Page 1 true Built with Apache Forrest http://forrest.apache.org/ images/built-with-forrest-button.png 88 31 lucene-2.9.4/src/site/src/documentation/skins/lucene/0000755000175000017500000000000011554106562023255 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/lucene/note.txt0000644000175000017500000000255011474320233024757 0ustar janpascaljanpascalNotes for developer: --Legend------------------- TODO -> blocker DONE -> blocker ToDo -> enhancement bug done -> enhancement bug --Issues------------------- - the corner images should be rendered through svg with the header color. -> DONE -> ToDo: get rid of the images and use only divs! - the menu points should be displayed "better". -> DONE -- Use the krysalis-site menu approach for the overall menu display. -> DONE -- Use the old lenya innermenu approch to further enhance the menu . -> DONE - the content area needs some attention. -> DONE -- introduce the heading scheme from krysalis () -> DONE -> ToDo: make box with round corners -> done: make underlined with variable border height -> ToDo: make underline with bottom round corner -- introduce the toc for each html-page -> DONE -- introduce the external-link-images. -> DONE - the publish note should be where now only a border is. Like
    -> DONE , but make it configurable. -> DONE - footer needs some attention -> DONE -- the footer do not have the color profile! Enable it! -> DONE -- the footer should as well contain a feedback link. See http://issues.apache.org/eyebrowse/ReadMsg?listName=forrest-user@xml.apache.org&msgNo=71 -> DONE - introduce credits alternativ location -> DONE - border for published / breadtrail / menu /tab divs -> ToDolucene-2.9.4/src/site/src/documentation/skins/lucene/css/0000755000175000017500000000000011554106562024045 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/lucene/css/print.css0000644000175000017500000000232611474320232025707 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ body { font-family: Georgia, Palatino, serif; font-size: 12pt; background: white; } #tabs, #menu, #content .toc { display: none; } #content { width: auto; padding: 0; float: none !important; color: black; background: inherit; } a:link, a:visited { color: #336699; background: inherit; text-decoration: underline; } #top .logo { padding: 0; margin: 0 0 2em 0; } #footer { margin-top: 4em; } acronym { border: 0; } lucene-2.9.4/src/site/src/documentation/skins/lucene/css/profile.css.xslt0000644000175000017500000001726611474320232027215 0ustar janpascaljanpascal #top { background-color: ;} #top .header .current { background-color: ;} #top .header .current a:link { color: ; } #top .header .current a:visited { color: ; } #top .header .current a:hover { color: ; } #tabs li { background-color: ;} #tabs li a:link { color: ; } #tabs li a:visited { color: ; } #tabs li a:hover { color: ; } #level2tabs a.selected { background-color: ;} #level2tabs a:link { color: ; } #level2tabs a:visited { color: ; } #level2tabs a:hover { color: ; } #level2tabs { background-color: ;} #level2tabs a.unselected:link { color: ; } #level2tabs a.unselected:visited { color: ; } #level2tabs a.unselected:hover { color: ; } .heading { background-color: ;} .boxed { background-color: ;} .underlined_5 {border-bottom: solid 5px ;} .underlined_10 {border-bottom: solid 10px ;} table caption { background-color: ; color: ; } #feedback { color: ; background: ; text-align: ; } #feedback #feedbackto { color: ; } #main .breadtrail { background: ; color: ; } #main .breadtrail a:link { color: ; } #main .breadtrail a:visited { color: ; } #main .breadtrail a:hover { color: ; } #top .breadtrail { background: ; color: ; } #top .breadtrail a:link { color: ; } #top .breadtrail a:visited { color: ; } #top .breadtrail a:hover { color: ; } #publishedStrip { color: ; background: ; } #publishedStrip { color: ; background: ; } #menu .menupagetitle { background-color: ; color: ;} #menu { border-color: ;} #menu .menupagetitle { border-color: ;} #menu .menupageitemgroup { border-color: ;} #menu { background-color: ;} 
#menu { color: ;} #menu a:link { color: ;} #menu a:visited { color: ;} #menu a:hover { background-color: ; color: ;} #menu .menupageitemgroup { background-color: ; } #menu .menupageitem { color: ; } #menu .menupageitem a:link { color: ;} #menu .menupageitem a:visited { color: ;} #menu .menupageitem a:hover { background-color: ; color: ; } #menu h1 { color: ; background-color: ; } #top .searchbox { background-color: ; color: ; } body{ background-color: ; color: ; } a:link { color:} a:visited { color:} a:hover { color:} #footer { background-color: ;} .highlight { background-color: ;} .fixme { border-color: ;} .note { border-color: ;} .warning { border-color: ;} .code { border-color: ;} .ForrestTable { background-color: ;} .ForrestTable td { background-color: ;} lucene-2.9.4/src/site/src/documentation/skins/lucene/css/screen.css0000644000175000017500000003067211474320232026037 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ body { margin: 0px 0px 0px 0px; font-family: Verdana, Helvetica, sans-serif; } h1 { font-size : 160%; margin: 0px 0px 0px 0px; padding: 0px; } h2 { font-size : 140%; margin: 1em 0px 0.8em 0px; padding: 0px; font-weight : bold;} h3 { font-size : 130%; margin: 0.8em 0px 0px 0px; padding: 0px; font-weight : bold; } .h3 { margin: 22px 0px 3px 0px; } h4 { font-size : 120%; margin: 0.7em 0px 0px 0px; padding: 0px; font-weight : normal; text-align: left; } .h4 { margin: 18px 0px 0px 0px; } h4.faq { font-size : 120%; margin: 18px 0px 0px 0px; padding: 0px; font-weight : bold; text-align: left; } h5 { font-size : 100%; margin: 14px 0px 0px 0px; padding: 0px; font-weight : normal; text-align: left; } /** * table */ table .title { background-color: #000000; } .ForrestTable { color: #ffffff; background-color: #7099C5; width: 100%; font-size : 100%; empty-cells: show; } table caption { padding-left: 5px; color: white; text-align: left; font-weight: bold; background-color: #000000; } .ForrestTable td { color: black; background-color: #f0f0ff; } .ForrestTable th { text-align: center; } /** * Page Header */ #top { position: relative; float: left; width: 100%; background: #294563; /* if you want a background in the header, put it here */ } #top .breadtrail { background: #CFDCED; color: black; border-bottom: solid 1px white; padding: 3px 10px; font-size: 75%; } #top .breadtrail a { color: black; } #top .header { float: left; width: 100%; background: url("images/header_white_line.gif") repeat-x bottom; } #top .grouplogo { padding: 7px 0 10px 10px; float: left; text-align: left; } #top .projectlogo { padding: 7px 0 10px 10px; float: left; width: 33%; text-align: right; } #top .projectlogoA1 { padding: 7px 0 10px 10px; float: right; } html>body #top .searchbox { bottom: 0px; } #top .searchbox { position: absolute; right: 10px; height: 42px; 
font-size: 70%; white-space: nowrap; text-align: right; color: white; background-color: #000000; z-index:0; background-image: url(images/rc-t-l-5-1header-2searchbox-3searchbox.png); background-repeat: no-repeat; background-position: top left; bottom: -1px; /* compensate for IE rendering issue */ } #top .searchbox form { padding: 5px 10px; margin: 0; } #top .searchbox p { padding: 0 0 2px 0; margin: 0; } #top .searchbox input { font-size: 100%; } #tabs { clear: both; padding-left: 10px; margin: 0; list-style: none; } /* background: #CFDCED url("images/tab-right.gif") no-repeat right top;*/ #tabs li { float: left; background-image: url(images/rc-t-r-5-1header-2tab-unselected-3tab-unselected.png); background-repeat: no-repeat; background-position: top right; background-color: #000000; margin: 0 3px 0 0; padding: 0; } /*background: url("images/tab-left.gif") no-repeat left top;*/ #tabs li a { float: left; display: block; font-family: verdana, arial, sans-serif; text-decoration: none; color: black; white-space: nowrap; background-image: url(images/rc-t-l-5-1header-2tab-unselected-3tab-unselected.png); background-repeat: no-repeat; background-position: top left; padding: 5px 15px 4px; width: .1em; /* IE/Win fix */ } #tabs li a:hover { cursor: pointer; text-decoration:underline; } #tabs > li a { width: auto; } /* Rest of IE/Win fix */ /* Commented Backslash Hack hides rule from IE5-Mac \*/ #tabs a { float: none; } /* End IE5-Mac hack */ #top .header .current { background-color: #4C6C8F; background-image: url(images/rc-t-r-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top right; } #top .header .current a { font-weight: bold; padding-bottom: 5px; color: white; background-image: url(images/rc-t-l-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top left; } #publishedStrip { padding-right: 10px; padding-left: 20px; padding-top: 3px; padding-bottom:3px; color: #ffffff; font-size : 60%; font-weight: bold; background-color: #4C6C8F; text-align:right; } #level2tabs { margin: 0; float:left; position:relative; } #level2tabs a:hover { cursor: pointer; text-decoration:underline; } #level2tabs a{ cursor: pointer; text-decoration:none; background-image: url('images/chapter.gif'); background-repeat: no-repeat; background-position: center left; padding-left: 6px; margin-left: 6px; } /* * border-top: solid #4C6C8F 15px; */ #main { position: relative; background: white; clear:both; } #main .breadtrail { clear:both; position: relative; background: #CFDCED; color: black; border-bottom: solid 1px black; border-top: solid 1px black; padding: 0px 180px; font-size: 75%; z-index:10; } /** * Round corner */ #roundtop { background-image: url(images/rc-t-r-15-1body-2menu-3menu.png); background-repeat: no-repeat; background-position: top right; } #roundbottom { background-image: url(images/rc-b-r-15-1body-2menu-3menu.png); background-repeat: no-repeat; background-position: top right; } img.corner { width: 15px; height: 15px; border: none; display: block !important; } .roundtopsmall { background-image: url(images/rc-t-r-5-1header-2searchbox-3searchbox.png); background-repeat: no-repeat; background-position: top right; } #roundbottomsmall { background-image: url(images/rc-b-r-5-1header-2tab-selected-3tab-selected.png); background-repeat: no-repeat; background-position: top right; } img.cornersmall { width: 5px; height: 5px; border: none; display: block !important; } /** * Side menu */ #menu a { font-weight: normal; text-decoration: 
none;} #menu a:visited { font-weight: normal; } #menu a:active { font-weight: normal; } #menu a:hover { font-weight: normal; text-decoration:underline;} #menuarea { width:10em;} #menu { position: relative; float: left; width: 160px; padding-top: 0px; top:-18px; left:10px; z-index: 20; background-color: #f90; font-size : 70%; } .menutitle { cursor:pointer; padding: 3px 12px; margin-left: 10px; background-image: url('images/chapter.gif'); background-repeat: no-repeat; background-position: center left; font-weight : bold; } .menutitle:hover{text-decoration:underline;cursor: pointer;} #menu .menuitemgroup { margin: 0px 0px 6px 8px; padding: 0px; font-weight : bold; } #menu .selectedmenuitemgroup{ margin: 0px 0px 0px 8px; padding: 0px; font-weight : normal; } #menu .menuitem { padding: 2px 0px 1px 13px; background-image: url('images/page.gif'); background-repeat: no-repeat; background-position: center left; font-weight : normal; margin-left: 10px; } #menu .menupage { margin: 2px 0px 1px 10px; padding: 0px 3px 0px 12px; background-image: url('images/page.gif'); background-repeat: no-repeat; background-position: center left; font-style : normal; } #menu .menupagetitle { padding: 0px 0px 0px 1px; font-style : normal; border-style: solid; border-width: 1px; margin-right: 10px; } #menu .menupageitemgroup { padding: 3px 0px 4px 6px; font-style : normal; border-bottom: 1px solid ; border-left: 1px solid ; border-right: 1px solid ; margin-right: 10px; } #menu .menupageitem { font-style : normal; font-weight : normal; border-width: 0px; font-size : 90%; } #menu #credit { text-align: center; } #menu #credit2 { text-align: center; padding: 3px 3px 3px 3px; background-color: #ffffff; } #menu .searchbox { text-align: center; } #menu .searchbox form { padding: 3px 3px; margin: 0; } #menu .searchbox input { font-size: 100%; } #content { padding: 20px 20px 20px 180px; margin: 0; font : small Verdana, Helvetica, sans-serif; font-size : 80%; } #content ul { margin: 0; padding: 0 25px; } #content li { padding: 0 5px; } #feedback { color: black; background: #CFDCED; text-align:center; margin-top: 5px; } #feedback #feedbackto { font-size: 90%; color: black; } #footer { clear: both; position: relative; /* IE bugfix (http://www.dracos.co.uk/web/css/ie6floatbug/) */ width: 100%; background: #CFDCED; border-top: solid 1px #4C6C8F; color: black; } #footer .copyright { position: relative; /* IE bugfix cont'd */ padding: 5px; margin: 0; width: 45%; } #footer .lastmodified { position: relative; /* IE bugfix cont'd */ float: right; width: 45%; padding: 5px; margin: 0; text-align: right; } #footer a { color: white; } #footer #logos { text-align: left; } /** * Misc Styles */ acronym { cursor: help; } .boxed { background-color: #a5b6c6;} .underlined_5 {border-bottom: solid 5px #4C6C8F;} .underlined_10 {border-bottom: solid 10px #4C6C8F;} /* ==================== snail trail ============================ */ .trail { position: relative; /* IE bugfix cont'd */ font-size: 70%; text-align: right; float: right; margin: -10px 5px 0px 5px; padding: 0; } #motd-area { position: relative; /* IE bugfix cont'd */ float: right; width: 35%; background-color: #f0f0ff; border-top: solid 1px #4C6C8F; border-bottom: solid 1px #4C6C8F; margin-bottom: 15px; margin-left: 15px; margin-right: 10%; padding-bottom: 5px; padding-top: 5px; } #minitoc-area { border-top: solid 1px #4C6C8F; border-bottom: solid 1px #4C6C8F; margin: 15px 10% 5px 15px; /* margin-bottom: 15px; margin-left: 15px; margin-right: 10%;*/ padding-bottom: 7px; padding-top: 5px; } 
.minitoc { list-style-image: url('images/current.gif'); font-weight: normal; } li p { margin: 0; padding: 0; } .pdflink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .pdflink br { margin-top: -10px; padding-left: 1px; } .pdflink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .pdflink img { display: block; height: 16px; width: 16px; } .xmllink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .xmllink br { margin-top: -10px; padding-left: 1px; } .xmllink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .xmllink img { display: block; height: 16px; width: 16px; } .podlink { position: relative; /* IE bugfix cont'd */ float: right; margin: 0px 5px; padding: 0; } .podlink br { margin-top: -10px; padding-left: 1px; } .podlink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .podlink img { display: block; height: 16px; width: 16px; } .printlink { position: relative; /* IE bugfix cont'd */ float: right; } .printlink br { margin-top: -10px; padding-left: 1px; } .printlink a { display: block; font-size: 70%; text-align: center; margin: 0; padding: 0; } .printlink img { display: block; height: 16px; width: 16px; } p.instruction { display: list-item; list-style-image: url('../images/instruction_arrow.png'); list-style-position: outside; margin-left: 2em; } lucene-2.9.4/src/site/src/documentation/skins/lucene/css/basic.css0000644000175000017500000000564311474320232025641 0ustar janpascaljanpascal/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /** * General */ img { border: 0; } #content table { border: 0; width: 100%; } /*Hack to get IE to render the table at 100%*/ * html #content table { margin-left: -3px; } #content th, #content td { margin: 0; padding: 0; vertical-align: top; } .clearboth { clear: both; } .note, .warning, .fixme { border: solid black 1px; margin: 1em 3em; } .note .label { background: #369; color: white; font-weight: bold; padding: 5px 10px; } .note .content { background: #F0F0FF; color: black; line-height: 120%; font-size: 90%; padding: 5px 10px; } .warning .label { background: #C00; color: white; font-weight: bold; padding: 5px 10px; } .warning .content { background: #FFF0F0; color: black; line-height: 120%; font-size: 90%; padding: 5px 10px; } .fixme .label { background: #C6C600; color: black; font-weight: bold; padding: 5px 10px; } .fixme .content { padding: 5px 10px; } /** * Typography */ body { font-family: verdana, "Trebuchet MS", arial, helvetica, sans-serif; font-size: 100%; } #content { font-family: Georgia, Palatino, Times, serif; font-size: 95%; } #tabs { font-size: 70%; } #menu { font-size: 80%; } #footer { font-size: 70%; } h1, h2, h3, h4, h5, h6 { font-family: "Trebuchet MS", verdana, arial, helvetica, sans-serif; font-weight: bold; margin-top: 1em; margin-bottom: .5em; } h1 { margin-top: 0; margin-bottom: 1em; font-size: 1.4em; } #content h1 { font-size: 160%; margin-bottom: .5em; } #menu h1 { margin: 0; padding: 10px; background: #336699; color: white; } h2 { font-size: 120%; } h3 { font-size: 100%; } h4 { font-size: 90%; } h5 { font-size: 80%; } h6 { font-size: 75%; } p { line-height: 120%; text-align: left; margin-top: .5em; margin-bottom: 1em; } #content li, #content th, #content td, #content li ul, #content li ol{ margin-top: .5em; margin-bottom: .5em; } #content li li, #minitoc-area li{ margin-top: 0em; margin-bottom: 0em; } #content .attribution { text-align: right; font-style: italic; font-size: 85%; margin-top: 1em; } .codefrag { font-family: "Courier New", Courier, monospace; font-size: 110%; } lucene-2.9.4/src/site/src/documentation/skins/lucene/xslt/0000755000175000017500000000000011474320233024241 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/lucene/xslt/html/0000755000175000017500000000000011554106562025213 5ustar janpascaljanpascallucene-2.9.4/src/site/src/documentation/skins/lucene/xslt/html/document-to-html.xsl0000644000175000017500000001223111474320233031134 0ustar janpascaljanpascal
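The .note, .warning and .fixme rules in basic.css above only take effect on a specific two-level structure: an outer container carrying the admonition class, with a .label child for the heading strip and a .content child for the body. The real markup is produced elsewhere by the skin's document-to-html stylesheet, so the snippet below is only a sketch inferred from those selectors; the wrapper element type and the text are assumptions.

<!-- Sketch only: structure inferred from the .note/.warning/.fixme selectors in basic.css. -->
<div class="note">
  <div class="label">Note</div>
  <div class="content">Body text of the note goes here.</div>
</div>

<div class="warning">
  <div class="label">Warning</div>
  <div class="content">Body text of the warning goes here.</div>
</div>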