The contents of this file are subject to the Mozilla Public License Version
1.1 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
for the specific language governing rights and limitations under the
License.
Alternatively, the contents of this file may be used under the terms of
either the GNU General Public License Version 2 or later (the "GPL"), or
the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
in which case the provisions of the GPL or the LGPL are applicable instead
of those above. If you wish to allow use of your version of this file only
under the terms of either the GPL or the LGPL, and not to allow others to
use your version of this file under the terms of the MPL, indicate your
decision by deleting the provisions above and replace them with the notice
and other provisions required by the GPL or the LGPL. If you do not delete
the provisions above, a recipient may use your version of this file under
the terms of any one of the MPL, the GPL or the LGPL.
*/
package org.mozilla.universalchardet;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
/**
 * Static factory for readers that decode a file or byte array with the
 * charset detected by {@link UniversalDetector}, falling back to a
 * caller-supplied (or the platform default) charset when detection fails.
 */
public final class ReaderFactory {

    private ReaderFactory() {
        throw new AssertionError("No instances allowed");
    }

    /**
     * Create a reader from a file with correct encoding.
     *
     * @param file The file to read from
     * @param defaultCharset defaultCharset to use if can't be determined
     * @return BufferedReader for the file with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     * @throws NullPointerException if {@code defaultCharset} is null
     */
    public static BufferedReader createBufferedReader(File file, Charset defaultCharset) throws IOException {
        Charset cs = Objects.requireNonNull(defaultCharset, "defaultCharset must be not null");
        String detectedEncoding = UniversalDetector.detectCharset(file);
        if (detectedEncoding != null) {
            cs = Charset.forName(detectedEncoding);
        }
        Path path = file.toPath();
        if (!cs.name().contains("UTF")) {
            return Files.newBufferedReader(path, cs);
        }
        // UTF-* charsets: wrap the stream so a leading BOM is not handed to the caller.
        InputStream raw = new BufferedInputStream(Files.newInputStream(path));
        try {
            return new BufferedReader(new InputStreamReader(new UnicodeBOMInputStream(raw), cs));
        } catch (IOException | RuntimeException e) {
            // The UnicodeBOMInputStream constructor reads from the stream and may fail;
            // nobody else holds a reference to the file handle yet, so release it here.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Create a reader from a file with correct encoding. If charset cannot be determined,
     * it uses the system default charset.
     *
     * @param file The file to read from
     * @return BufferedReader for the file with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     */
    public static BufferedReader createBufferedReader(File file) throws IOException {
        return createBufferedReader(file, Charset.defaultCharset());
    }

    /**
     * Create a reader from a byte array with correct encoding.
     *
     * @param data The byte[] to read from
     * @param defaultCharset defaultCharset to use if can't be determined
     * @return BufferedReader for the data with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     * @throws NullPointerException if {@code defaultCharset} or {@code data} is null
     */
    public static BufferedReader createBufferedReader(byte[] data, Charset defaultCharset) throws IOException {
        Charset cs = Objects.requireNonNull(defaultCharset, "defaultCharset must be not null");
        Objects.requireNonNull(data, "data must be not null");
        String detectedEncoding;
        try (InputStream is = new ByteArrayInputStream(data)) {
            detectedEncoding = UniversalDetector.detectCharset(is);
        }
        if (detectedEncoding != null) {
            cs = Charset.forName(detectedEncoding);
        }
        // A fresh stream is needed: the one used for detection has been consumed.
        InputStream source = new ByteArrayInputStream(data);
        if (cs.name().contains("UTF")) {
            // UTF-* charsets: strip a leading BOM before decoding.
            source = new UnicodeBOMInputStream(source);
        }
        return new BufferedReader(new InputStreamReader(source, cs));
    }

    /**
     * Create a reader from a byte array with correct encoding. If charset cannot be determined,
     * it uses the system default charset.
     *
     * @param data The byte[] to read from
     * @return BufferedReader for the data with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     */
    public static BufferedReader createBufferedReader(byte[] data) throws IOException {
        return createBufferedReader(data, Charset.defaultCharset());
    }

    /**
     * Create a reader from a file with the correct encoding.
     *
     * @param file The file to read from
     * @param defaultCharset defaultCharset to use if can't be determined
     * @return Reader for the file with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     * @deprecated Use {@link #createBufferedReader(File, Charset)}
     */
    @Deprecated
    public static Reader createReaderFromFile(File file, Charset defaultCharset) throws IOException {
        return createBufferedReader(file, defaultCharset);
    }

    /**
     * Create a reader from a file with the correct encoding. If charset cannot be determined,
     * it uses the system default charset.
     *
     * @param file The file to read from
     * @return Reader for the file with the correct encoding
     * @throws java.io.IOException if some I/O error occurs
     * @deprecated Use {@link #createBufferedReader(File)}
     */
    @Deprecated
    public static Reader createReaderFromFile(File file) throws IOException {
        return createReaderFromFile(file, Charset.defaultCharset());
    }
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/UnicodeBOMInputStream.java 0000664 0000000 0000000 00000017233 14625121707 0032466 0 ustar 00root root 0000000 0000000 // (‑●‑●)> released under the WTFPL v2 license, by Gregory Pakosz (@gpakosz)
package org.mozilla.universalchardet;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
/**
 * The {@code UnicodeBOMInputStream} class wraps any {@code InputStream} and
 * detects the presence of any Unicode BOM (Byte Order Mark) at its beginning,
 * as defined by RFC 3629 - UTF-8, a transformation format of ISO 10646.
 *
 * <p>The Unicode FAQ defines 5 types of BOMs:</p>
 * <ul>
 * <li>{@code 00 00 FE FF} = UTF-32, big-endian</li>
 * <li>{@code FF FE 00 00} = UTF-32, little-endian</li>
 * <li>{@code FE FF} = UTF-16, big-endian</li>
 * <li>{@code FF FE} = UTF-16, little-endian</li>
 * <li>{@code EF BB BF} = UTF-8</li>
 * </ul>
 *
 * <p>Use the {@link #getBOM()} method to know whether a BOM has been detected
 * or not.</p>
 *
 * <p>Use the {@link #skipBOM()} method to remove the detected BOM from the
 * wrapped {@code InputStream} object.</p>
 *
 * @author Gregory Pakosz
 * @version 1.0
 */
public class UnicodeBOMInputStream extends InputStream {

    /**
     * Type safe enumeration class that describes the different types of Unicode
     * BOMs.
     */
    public static final class BOM {

        final byte bytes[];
        private final String description;

        /**
         * NONE.
         */
        public static final BOM NONE = new BOM(new byte[] {}, "NONE");

        /**
         * UTF-8 BOM (EF BB BF).
         */
        public static final BOM UTF_8 = new BOM(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF }, "UTF-8");

        /**
         * UTF-16, little-endian (FF FE).
         */
        public static final BOM UTF_16_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE }, "UTF-16 little-endian");

        /**
         * UTF-16, big-endian (FE FF).
         */
        public static final BOM UTF_16_BE = new BOM(new byte[] { (byte) 0xFE, (byte) 0xFF }, "UTF-16 big-endian");

        /**
         * UTF-32, little-endian (FF FE 00 00).
         */
        public static final BOM UTF_32_LE = new BOM(new byte[] { (byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00 },
                "UTF-32 little-endian");

        /**
         * UTF-32, big-endian (00 00 FE FF).
         */
        public static final BOM UTF_32_BE = new BOM(new byte[] { (byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF },
                "UTF-32 big-endian");

        /**
         * Returns a {@code String} representation of this {@code BOM} value.
         */
        @Override
        public final String toString() {
            return description;
        }

        /**
         * Returns the bytes corresponding to this {@code BOM} value.
         *
         * @return a fresh copy of the bytes corresponding to this {@code BOM} value.
         */
        public final byte[] getBytes() {
            // defensive copy so callers cannot alter the shared constant
            return bytes.clone();
        }

        private BOM(final byte bom[], final String description) {
            assert (bom != null) : "invalid BOM: null is not allowed";
            assert (description != null) : "invalid description: null is not allowed";
            assert (description.length() != 0) : "invalid description: empty string is not allowed";
            this.bytes = bom;
            this.description = description;
        }
    } // BOM

    /** Length in bytes of the longest BOM (UTF-32). */
    private static final int MAX_BOM_LENGTH = 4;

    private final PushbackInputStream in;
    private final BOM bom;
    /** True once the BOM was skipped or any byte was consumed through this stream. */
    private boolean skipped = false;

    /**
     * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
     * {@code InputStream}. By default skip BOM bytes.
     *
     * @param inputStream an {@code InputStream}.
     *
     * @throws NullPointerException when {@code inputStream} is {@code null}.
     * @throws IOException on reading from the specified {@code InputStream}
     *         when trying to detect the Unicode BOM.
     */
    public UnicodeBOMInputStream(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }

    /**
     * Constructs a new {@code UnicodeBOMInputStream} that wraps the specified
     * {@code InputStream}.
     *
     * @param inputStream an {@code InputStream}.
     * @param skipIfFound to automatically skip BOM bytes if found
     *
     * @throws NullPointerException when {@code inputStream} is {@code null}.
     * @throws IOException on reading from the specified {@code InputStream}
     *         when trying to detect the Unicode BOM.
     */
    public UnicodeBOMInputStream(final InputStream inputStream,
            boolean skipIfFound) throws IOException {
        if (inputStream == null) {
            throw new NullPointerException(
                    "invalid input stream: null is not allowed");
        }
        in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

        // A single read() may legally return fewer bytes than requested even
        // though more are coming, so keep reading until the look-ahead buffer
        // is full or the stream ends. Otherwise a BOM delivered in small
        // pieces would go undetected.
        final byte[] head = new byte[MAX_BOM_LENGTH];
        int read = 0;
        while (read < head.length) {
            final int n = in.read(head, read, head.length - read);
            if (n <= 0) {
                break; // end of stream (or a stream that makes no progress)
            }
            read += n;
        }

        this.bom = detectBom(head, read);

        // Give the look-ahead bytes back so the caller sees an untouched stream.
        if (read > 0) {
            in.unread(head, 0, read);
        }
        if (skipIfFound) {
            this.skipBOM();
        }
    }

    /**
     * Matches the first {@code length} bytes of {@code head} against the known
     * BOMs, longest first so that UTF-32 LE (FF FE 00 00) wins over UTF-16 LE (FF FE).
     */
    private static BOM detectBom(final byte[] head, final int length) {
        if (length >= 4) {
            if ((head[0] == (byte) 0xFF) && (head[1] == (byte) 0xFE)
                    && (head[2] == (byte) 0x00) && (head[3] == (byte) 0x00)) {
                return BOM.UTF_32_LE;
            }
            if ((head[0] == (byte) 0x00) && (head[1] == (byte) 0x00)
                    && (head[2] == (byte) 0xFE) && (head[3] == (byte) 0xFF)) {
                return BOM.UTF_32_BE;
            }
        }
        if (length >= 3) {
            if ((head[0] == (byte) 0xEF) && (head[1] == (byte) 0xBB)
                    && (head[2] == (byte) 0xBF)) {
                return BOM.UTF_8;
            }
        }
        if (length >= 2) {
            if ((head[0] == (byte) 0xFF) && (head[1] == (byte) 0xFE)) {
                return BOM.UTF_16_LE;
            }
            if ((head[0] == (byte) 0xFE) && (head[1] == (byte) 0xFF)) {
                return BOM.UTF_16_BE;
            }
        }
        return BOM.NONE;
    }

    /**
     * Returns the {@code BOM} that was detected in the wrapped
     * {@code InputStream} object.
     *
     * @return a {@code BOM} value.
     */
    public final BOM getBOM() {
        // BOM type is immutable.
        return bom;
    }

    /**
     * Skips the {@code BOM} that was found in the wrapped {@code InputStream}
     * object. Does nothing if the BOM was already skipped or if any data has
     * already been consumed through this stream.
     *
     * @return this {@code UnicodeBOMInputStream}.
     *
     * @throws IOException when trying to skip the BOM from the wrapped
     *         {@code InputStream} object.
     */
    public final synchronized UnicodeBOMInputStream skipBOM()
            throws IOException {
        if (!skipped) {
            long bytesToSkip = bom.bytes.length;
            long bytesSkipped = in.skip(bytesToSkip);
            // skip() may skip fewer bytes than asked; consume the rest one by one.
            for (long i = bytesSkipped; i < bytesToSkip; i++) {
                in.read();
            }
            skipped = true;
        }
        return this;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read() throws IOException {
        // Once data has been consumed, a later skipBOM() must not eat payload bytes.
        this.skipped = true;
        return in.read();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte b[]) throws IOException {
        this.skipped = true;
        return in.read(b, 0, b.length);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int read(final byte b[], final int off, final int len) throws IOException {
        this.skipped = true;
        return in.read(b, off, len);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public long skip(final long n) throws IOException {
        this.skipped = true;
        return in.skip(n);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int available() throws IOException {
        return in.available();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public synchronized void mark(final int readlimit) {
        in.mark(readlimit);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public synchronized void reset() throws IOException {
        in.reset();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean markSupported() {
        return in.markSupported();
    }
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/UniversalDetector.java 0000664 0000000 0000000 00000033177 14625121707 0032015 0 ustar 00root root 0000000 0000000 /*
(C) Copyright 2016-2017 Alberto Fernández
(C) Copyright 2006-2007 Kohei TAKETA (Java port)
(C) Copyright 2001 Netscape Communications Corporation.
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
*
*/
package org.mozilla.universalchardet;
import static org.mozilla.universalchardet.Constants.CHARSET_US_ASCII;
import static org.mozilla.universalchardet.Constants.CHARSET_UTF_16BE;
import static org.mozilla.universalchardet.Constants.CHARSET_UTF_16LE;
import static org.mozilla.universalchardet.Constants.CHARSET_UTF_32BE;
import static org.mozilla.universalchardet.Constants.CHARSET_UTF_32LE;
import static org.mozilla.universalchardet.Constants.CHARSET_UTF_8;
import static org.mozilla.universalchardet.Constants.CHARSET_X_ISO_10646_UCS_4_2143;
import static org.mozilla.universalchardet.Constants.CHARSET_X_ISO_10646_UCS_4_3412;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.mozilla.universalchardet.prober.CharsetProber;
import org.mozilla.universalchardet.prober.EscCharsetProber;
import org.mozilla.universalchardet.prober.Latin1Prober;
import org.mozilla.universalchardet.prober.MBCSGroupProber;
import org.mozilla.universalchardet.prober.SBCSGroupProber;
/**
* Charset detector that is fed raw bytes through
* {@link #handleData(byte[], int, int)} and exposes its result through
* {@link #getDetectedCharset()} after {@link #dataEnd()} has been called.
*
* NOTE(review): several spots in this copy of the class have lost text
* (everything between a '<' and a later '>' is missing); they are flagged
* inline and must be restored from upstream before this compiles.
*/
public class UniversalDetector {
////////////////////////////////////////////////////////////////
// constants
////////////////////////////////////////////////////////////////
public static final float SHORTCUT_THRESHOLD = 0.95f;
// dataEnd() only accepts a prober's charset when its confidence exceeds this value.
public static final float MINIMUM_THRESHOLD = 0.20f;
////////////////////////////////////////////////////////////////
// inner types
////////////////////////////////////////////////////////////////
// Classification of the input seen so far; reset() starts at PURE_ASCII.
public enum InputState {
PURE_ASCII, ESC_ASCII, HIGHBYTE
}
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
private InputState inputState;
// Once true, handleData() ignores any further input.
private boolean done;
// True until the first handleData() call, which is the only one that checks for a BOM.
private boolean start;
// Set as soon as handleData() receives a non-empty chunk; dataEnd() is a no-op without it.
private boolean gotData;
// Consulted by dataEnd(): US-ASCII is only reported while this is still true.
private boolean onlyPrintableASCII = true;
private byte lastChar;
private String detectedCharset;
private CharsetProber[] probers;
// Created lazily in handleData() when the input state is ESC_ASCII.
private CharsetProber escCharsetProber;
private CharsetListener listener;
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
* Creates a detector without a listener.
*/
public UniversalDetector() {
this(null);
}
/**
* @param listener a listener object that is notified of
* the detected encoding. Can be null.
*/
public UniversalDetector(CharsetListener listener) {
this.listener = listener;
this.escCharsetProber = null;
this.probers = new CharsetProber[3];
reset();
}
/**
* @return true once a charset has been settled on; further calls to
* handleData are then ignored.
*/
public boolean isDone() {
return this.done;
}
/**
* @return The detected encoding is returned. If the detector couldn't
* determine what encoding was used, null is returned.
*/
public String getDetectedCharset() {
return this.detectedCharset;
}
/**
* @param listener the listener to notify of the detected encoding. Can be null.
*/
public void setListener(CharsetListener listener) {
this.listener = listener;
}
/**
* @return the currently registered listener, possibly null.
*/
public CharsetListener getListener() {
return this.listener;
}
/**
* Feed the detector with more data
* @param buf The buffer containing the data
*/
public void handleData(final byte[] buf) {
handleData(buf, 0, buf.length);
}
/**
* Feed the detector with more data. Does nothing once the detector is done.
* On the first call, and only if more than 3 bytes are supplied, the data is
* checked for a BOM; a recognised BOM settles the charset immediately.
* @param buf Buffer with the data
* @param offset initial position of data in buf
* @param length length of data
*/
public void handleData(final byte[] buf, int offset, int length) {
if (this.done) {
return;
}
if (length > 0) {
this.gotData = true;
}
if (this.start) {
this.start = false;
if (length > 3) {
String detectedBOM = detectCharsetFromBOM(buf, offset);
if (detectedBOM != null) {
this.detectedCharset = detectedBOM;
this.done = true;
return;
}
}
} // if (start) end
int maxPos = offset + length;
// NOTE(review): the loop header and the first part of its body are missing
// from the next line in this copy; restore from upstream.
for (int i=offset; i= 0x20 && c <= 0x7e) // Printable characters
|| c == 0x0A // New Line
|| c == 0x0D // Carriage return
|| c== 0x09; // TAB
}
this.lastChar = buf[i];
}
} // for end
CharsetProber.ProbingState st;
if (this.inputState == InputState.ESC_ASCII) {
// The escape-sequence prober is only created when it is first needed.
if (this.escCharsetProber == null) {
this.escCharsetProber = new EscCharsetProber();
}
st = this.escCharsetProber.handleData(buf, offset, length);
if (st == CharsetProber.ProbingState.FOUND_IT || 0.99f == this.escCharsetProber.getConfidence()) {
this.done = true;
this.detectedCharset = this.escCharsetProber.getCharSetName();
}
} else if (this.inputState == InputState.HIGHBYTE) {
// NOTE(review): the next line is damaged. The remainder of handleData() and
// the header of the BOM-detection helper (called above as
// detectCharsetFromBOM(buf, offset)) appear to be missing; restore from upstream.
for (int i=0; i (offset + 3)) {
// The four leading bytes, widened to unsigned values for comparison.
int b1 = buf[offset] & 0xFF;
int b2 = buf[offset+1] & 0xFF;
int b3 = buf[offset+2] & 0xFF;
int b4 = buf[offset+3] & 0xFF;
switch (b1) {
case 0xEF:
// EF BB BF
if (b2 == 0xBB && b3 == 0xBF) {
return CHARSET_UTF_8;
}
break;
case 0xFE:
// FE FF 00 00 is checked before the shorter FE FF it starts with.
if (b2 == 0xFF && b3 == 0x00 && b4 == 0x00) {
return CHARSET_X_ISO_10646_UCS_4_3412;
} else if (b2 == 0xFF) {
return CHARSET_UTF_16BE;
}
break;
case 0x00:
// 00 00 FE FF and 00 00 FF FE
if (b2 == 0x00 && b3 == 0xFE && b4 == 0xFF) {
return CHARSET_UTF_32BE;
} else if (b2 == 0x00 && b3 == 0xFF && b4 == 0xFE) {
return CHARSET_X_ISO_10646_UCS_4_2143;
}
break;
case 0xFF:
// FF FE 00 00 is checked before the shorter FF FE it starts with.
if (b2 == 0xFE && b3 == 0x00 && b4 == 0x00) {
return CHARSET_UTF_32LE;
} else if (b2 == 0xFE) {
return CHARSET_UTF_16LE;
}
break;
default:
break;
} // switch end
}
return null;
}
/**
* Marks end of data reading. Finish calculations.
* Does nothing if no data was ever fed. If a charset was already settled it
* is reported to the listener; otherwise the result depends on the input state.
*/
public void dataEnd() {
if (!this.gotData) {
return;
}
if (this.detectedCharset != null) {
this.done = true;
if (this.listener != null) {
this.listener.report(this.detectedCharset);
}
return;
}
if (this.inputState == InputState.HIGHBYTE) {
float proberConfidence;
float maxProberConfidence = 0.0f;
int maxProber = 0;
// NOTE(review): the loop header on the next line is damaged in this copy;
// the visible body keeps the index of the prober with the highest confidence.
for (int i=0; i maxProberConfidence) {
maxProberConfidence = proberConfidence;
maxProber = i;
}
}
// Only accept the best prober if it is confident enough.
if (maxProberConfidence > MINIMUM_THRESHOLD) {
this.detectedCharset = this.probers[maxProber].getCharSetName();
if (this.listener != null) {
this.listener.report(this.detectedCharset);
}
}
} else if (this.inputState == InputState.ESC_ASCII) {
// do nothing
} else if (this.inputState == InputState.PURE_ASCII && this.onlyPrintableASCII) {
// The listener is not notified in this branch.
this.detectedCharset = CHARSET_US_ASCII;
}
else {
// do nothing
}
}
/**
* Resets detector to be used again.
*/
public final void reset() {
this.done = false;
this.start = true;
this.detectedCharset = null;
this.gotData = false;
this.inputState = InputState.PURE_ASCII;
this.lastChar = 0;
if (this.escCharsetProber != null) {
this.escCharsetProber.reset();
}
// NOTE(review): the next line is damaged. The end of reset() and the start of
// a helper that reads a stream into a detector appear to be missing; restore
// from upstream.
for (int i=0; i 0 && !detector.isDone()) {
detector.handleData(buf, 0, nread);
}
detector.dataEnd();
String encoding = detector.getDetectedCharset();
detector.reset();
return encoding;
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/ 0000775 0000000 0000000 00000000000 14625121707 0026766 5 ustar 00root root 0000000 0000000 juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/Big5Prober.java 0000664 0000000 0000000 00000011635 14625121707 0031577 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
* Lersh99
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.distributionanalysis.Big5DistributionAnalysis;
import org.mozilla.universalchardet.prober.statemachine.Big5SMModel;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.Constants;
/**
* Prober for the Big5 charset, built on a coding state machine driven by
* {@link Big5SMModel} and a {@link Big5DistributionAnalysis}.
*/
public class Big5Prober extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
private CodingStateMachine codingSM;
private ProbingState state;
private Big5DistributionAnalysis distributionAnalyzer;
// Two-byte scratch buffer, zeroed by reset().
private byte[] lastChar;
// The state machine model is shared by all instances.
private static final SMModel smModel = new Big5SMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
* Creates the prober and puts it into its initial (detecting) state.
*/
public Big5Prober() {
super();
this.codingSM = new CodingStateMachine(smModel);
this.distributionAnalyzer = new Big5DistributionAnalysis();
this.lastChar = new byte[2];
reset();
}
/**
* @return the Big5 charset name constant.
*/
@Override
public String getCharSetName() {
return Constants.CHARSET_BIG5;
}
/**
* @return the confidence reported by the distribution analyzer.
*/
@Override
public float getConfidence() {
return this.distributionAnalyzer.getConfidence();
}
/**
* @return the current probing state.
*/
@Override
public ProbingState getState() {
return this.state;
}
/**
* Feeds a chunk of data to the prober.
* @param buf buffer with the data
* @param offset initial position of data in buf
* @param length length of data
* @return the probing state after handling the data
*/
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
int maxPos = offset + length;
// NOTE(review): the loop on the next line is damaged in this copy (its header
// and body are missing); what remains switches to FOUND_IT once a value
// exceeds SHORTCUT_THRESHOLD. Restore from upstream.
for (int i=offset; i SHORTCUT_THRESHOLD) {
this.state = ProbingState.FOUND_IT;
}
}
return this.state;
}
/**
* Resets the state machine, the distribution analyzer and the scratch
* buffer, and returns to the DETECTING state.
*/
@Override
public final void reset() {
this.codingSM.reset();
this.state = ProbingState.DETECTING;
this.distributionAnalyzer.reset();
java.util.Arrays.fill(this.lastChar, (byte)0);
}
@Override
public void setOption() {
// intentionally empty
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/CharsetProber.java 0000664 0000000 0000000 00000015055 14625121707 0032402 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Shy Shalom
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import java.nio.ByteBuffer;
/**
* Base class for charset probers. Declares the probing contract and offers
* two helpers that filter a byte range into a {@link ByteBuffer}.
*/
public abstract class CharsetProber {
////////////////////////////////////////////////////////////////
// constants
////////////////////////////////////////////////////////////////
public static final float SHORTCUT_THRESHOLD = 0.95f;
public static final int ASCII_A = 0x61; // 'a'
public static final int ASCII_Z = 0x7A; // 'z'
public static final int ASCII_A_CAPITAL = 0x41; // 'A'
public static final int ASCII_Z_CAPITAL = 0x5A; // 'Z'
public static final int ASCII_LT = 0x3C; // '<'
public static final int ASCII_GT = 0x3E; // '>'
public static final int ASCII_SP = 0x20; // ' '
// Probers start out active; see isActive()/setActive().
private boolean active = true;
////////////////////////////////////////////////////////////////
// inner types
////////////////////////////////////////////////////////////////
public enum ProbingState {
DETECTING, FOUND_IT, NOT_ME
}
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
public CharsetProber() {
super();
}
public abstract String getCharSetName();
public abstract ProbingState handleData(final byte[] buf, int offset, int length);
public abstract ProbingState getState();
public abstract void reset();
public abstract float getConfidence();
public abstract void setOption();
/**
* Copies selected segments of the given range into a new buffer of capacity
* {@code length}. ByteBuffer.position() indicates number of bytes written.
* @param buf source bytes
* @param offset first index to examine
* @param length number of bytes to examine
* @return the buffer holding the kept segments
*/
public ByteBuffer filterWithoutEnglishLetters(final byte[] buf, int offset, int length) {
ByteBuffer out = ByteBuffer.allocate(length);
boolean meetMSB = false;
byte c;
int prevPtr = offset;
int curPtr = offset;
int maxPtr = offset + length;
// NOTE(review): the loop header and the start of its body are missing from the
// next line in this copy; restore from upstream.
for (; curPtr prevPtr) {
// this segment contains more than single symbol,
// and it has upper ASCII, we need to keep it
out.put(buf, prevPtr, (curPtr-prevPtr));
out.put((byte)ASCII_SP);
prevPtr = curPtr + 1;
meetMSB = false;
} else {
// ignore current segment.
// (either because it is just a symbol or just an English word)
prevPtr = curPtr + 1;
}
}
}
// Keep a trailing segment that saw a high byte but was not closed by a delimiter.
if (meetMSB && curPtr > prevPtr) {
out.put(buf, prevPtr, (curPtr-prevPtr));
}
return out;
}
/**
* Splits the given range at ASCII bytes that are not letters and copies the
* segments in between into a new buffer of capacity {@code length}, each
* followed by a single space. Segments that end while the scan is inside a
* '<' ... '>' tag are dropped. ByteBuffer.position() indicates number of
* bytes written.
* @param buf source bytes
* @param offset first index to examine
* @param length number of bytes to examine
* @return the buffer holding the kept segments
*/
public ByteBuffer filterWithEnglishLetters(final byte[] buf, int offset, int length) {
ByteBuffer out = ByteBuffer.allocate(length);
boolean isInTag = false;
byte c;
int prevPtr = offset;
int curPtr = offset;
int maxPtr = offset + length;
for (; curPtr < maxPtr; ++curPtr) {
c = buf[curPtr];
// NOTE(review): the tag flag is cleared on '>' before the segment test below,
// so text immediately preceding '>' is treated as outside the tag - confirm
// this is intended.
if (c == ASCII_GT) {
isInTag = false;
} else if (c == ASCII_LT) {
isInTag = true;
}
if (isAscii(c) && isAsciiSymbol(c)) {
if (curPtr > prevPtr && !isInTag) {
// Current segment contains more than just a symbol
// and it is not inside a tag, keep it.
out.put(buf, prevPtr, (curPtr-prevPtr));
out.put((byte)ASCII_SP);
prevPtr = curPtr + 1;
} else {
// Empty segment or inside a tag: drop it and start after the delimiter.
prevPtr = curPtr + 1;
}
}
}
// If the current segment contains more than just a symbol
// and it is not inside a tag then keep it.
if (!isInTag && curPtr > prevPtr) {
out.put(buf, prevPtr, (curPtr-prevPtr));
}
return out;
}
// True when the most significant bit of b is clear.
private boolean isAscii(byte b) {
return ((b & 0x80) == 0);
}
// b must be in ASCII code range (MSB can't be 1).
// True for every code outside 'A'..'Z' and 'a'..'z'.
private boolean isAsciiSymbol(byte b) {
int c = b & 0xFF;
return ((c < ASCII_A_CAPITAL) ||
(c > ASCII_Z_CAPITAL && c < ASCII_A) ||
(c > ASCII_Z));
}
/**
* @return the value of the active flag.
*/
public boolean isActive() {
return active;
}
/**
* @param active new value of the active flag.
*/
public void setActive(boolean active) {
this.active = active;
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/EUCJPProber.java 0000664 0000000 0000000 00000012660 14625121707 0031656 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.EUCJPSMModel;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.prober.contextanalysis.EUCJPContextAnalysis;
import org.mozilla.universalchardet.prober.distributionanalysis.EUCJPDistributionAnalysis;
import org.mozilla.universalchardet.Constants;
/**
* Prober for the EUC-JP charset, built on a coding state machine driven by
* {@link EUCJPSMModel}, an {@link EUCJPContextAnalysis} and an
* {@link EUCJPDistributionAnalysis}.
*/
public class EUCJPProber extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
private CodingStateMachine codingSM;
private ProbingState state;
private EUCJPContextAnalysis contextAnalyzer;
private EUCJPDistributionAnalysis distributionAnalyzer;
// Two-byte scratch buffer, zeroed by reset().
private byte[] lastChar;
// The state machine model is shared by all instances.
private static final SMModel smModel = new EUCJPSMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
* Creates the prober and puts it into its initial (detecting) state.
*/
public EUCJPProber() {
super();
this.codingSM = new CodingStateMachine(smModel);
this.contextAnalyzer = new EUCJPContextAnalysis();
this.distributionAnalyzer = new EUCJPDistributionAnalysis();
this.lastChar = new byte[2];
reset();
}
/**
* @return the EUC-JP charset name constant.
*/
@Override
public String getCharSetName() {
return Constants.CHARSET_EUC_JP;
}
/**
* @return the larger of the context analyzer's and the distribution
* analyzer's confidence.
*/
@Override
public float getConfidence() {
float contextCf = this.contextAnalyzer.getConfidence();
float distribCf = this.distributionAnalyzer.getConfidence();
return Math.max(contextCf, distribCf);
}
/**
* @return the current probing state.
*/
@Override
public ProbingState getState() {
return this.state;
}
/**
* Feeds a chunk of data to the prober.
* @param buf buffer with the data
* @param offset initial position of data in buf
* @param length length of data
* @return the probing state after handling the data
*/
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
int maxPos = offset + length;
// NOTE(review): the loop on the next line is damaged in this copy (its header
// and body are missing); what remains switches to FOUND_IT once a value
// exceeds SHORTCUT_THRESHOLD. Restore from upstream.
for (int i=offset; i SHORTCUT_THRESHOLD) {
this.state = ProbingState.FOUND_IT;
}
}
return this.state;
}
/**
* Resets the state machine, both analyzers and the scratch buffer, and
* returns to the DETECTING state.
*/
@Override
public final void reset() {
this.codingSM.reset();
this.state = ProbingState.DETECTING;
this.contextAnalyzer.reset();
this.distributionAnalyzer.reset();
java.util.Arrays.fill(this.lastChar, (byte)0);
}
@Override
public void setOption() {
// intentionally empty
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/EUCKRProber.java 0000664 0000000 0000000 00000011601 14625121707 0031653 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.distributionanalysis.EUCKRDistributionAnalysis;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.EUCKRSMModel;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.Constants;
/**
 * Charset prober that reports {@link Constants#CHARSET_EUC_KR}.
 * It owns a {@link CodingStateMachine} built on {@link EUCKRSMModel} and a
 * distribution analyzer whose confidence is reported as the prober's own.
 */
public class EUCKRProber extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
// State machine driven by the shared EUC-KR model below.
private CodingStateMachine codingSM;
// Current verdict; set to DETECTING by reset().
private ProbingState state;
private EUCKRDistributionAnalysis distributionAnalyzer;
// Two-byte scratch buffer, zero-filled by reset().
// NOTE(review): presumably carries the trailing byte of the previous buffer
// across handleData() calls -- its use is not visible in this copy; confirm.
private byte[] lastChar;
// One model instance shared by every prober instance.
private static final SMModel smModel = new EUCKRSMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
 * Creates the state machine and the distribution analyzer, allocates the
 * two-byte scratch buffer and puts the prober into its initial state via
 * {@link #reset()}.
 */
public EUCKRProber() {
super();
this.codingSM = new CodingStateMachine(smModel);
this.distributionAnalyzer = new EUCKRDistributionAnalysis();
this.lastChar = new byte[2];
reset();
}
/**
 * @return the constant {@link Constants#CHARSET_EUC_KR}
 */
@Override
public String getCharSetName() {
return Constants.CHARSET_EUC_KR;
}
/**
 * @return the distribution analyzer's confidence, unchanged
 */
@Override
public float getConfidence() {
return this.distributionAnalyzer.getConfidence();
}
/**
 * @return the current probing state of this prober
 */
@Override
public ProbingState getState() {
return this.state;
}
/**
 * Feeds a slice of bytes to the prober and returns the resulting state.
 *
 * @param buf    buffer holding the bytes to examine
 * @param offset index of the first byte to examine
 * @param length number of bytes to examine
 * @return the probing state after the call
 */
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
// End index of the slice (offset + length).
int maxPos = offset + length;
// NOTE(review): the next line is damaged in this copy. Everything between
// the loop condition's "i" and "SHORTCUT_THRESHOLD" is missing (it looks as
// if text from a '<' up to a '>' was stripped), so the loop body and the
// start of the trailing condition are lost and the method does not compile.
// Restore it from a pristine copy of the file rather than by hand.
for (int i=offset; i SHORTCUT_THRESHOLD) {
this.state = ProbingState.FOUND_IT;
}
}
return this.state;
}
/**
 * Returns the prober to its initial condition: resets the state machine and
 * the distribution analyzer, sets the state to DETECTING and zero-fills the
 * scratch buffer.
 */
@Override
public final void reset() {
this.codingSM.reset();
this.state = ProbingState.DETECTING;
this.distributionAnalyzer.reset();
java.util.Arrays.fill(this.lastChar, (byte)0);
}
/**
 * Intentionally empty: this prober has no options to apply.
 */
@Override
public void setOption() {
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/EUCTWProber.java 0000664 0000000 0000000 00000011613 14625121707 0031674 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
* Lersh99
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.distributionanalysis.EUCTWDistributionAnalysis;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.EUCTWSMModel;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.Constants;
/**
 * Charset prober that reports {@link Constants#CHARSET_EUC_TW}.
 * It owns a {@link CodingStateMachine} built on {@link EUCTWSMModel} and a
 * distribution analyzer whose confidence is reported as the prober's own.
 */
public class EUCTWProber extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
// State machine driven by the shared EUC-TW model below.
private CodingStateMachine codingSM;
// Current verdict; set to DETECTING by reset().
private ProbingState state;
private EUCTWDistributionAnalysis distributionAnalyzer;
// Two-byte scratch buffer, zero-filled by reset().
// NOTE(review): presumably carries the trailing byte of the previous buffer
// across handleData() calls -- its use is not visible in this copy; confirm.
private byte[] lastChar;
// One model instance shared by every prober instance.
private static final SMModel smModel = new EUCTWSMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
 * Creates the state machine and the distribution analyzer, allocates the
 * two-byte scratch buffer and puts the prober into its initial state via
 * {@link #reset()}.
 */
public EUCTWProber() {
super();
this.codingSM = new CodingStateMachine(smModel);
this.distributionAnalyzer = new EUCTWDistributionAnalysis();
this.lastChar = new byte[2];
reset();
}
/**
 * @return the constant {@link Constants#CHARSET_EUC_TW}
 */
@Override
public String getCharSetName() {
return Constants.CHARSET_EUC_TW;
}
/**
 * @return the distribution analyzer's confidence, unchanged
 */
@Override
public float getConfidence() {
return this.distributionAnalyzer.getConfidence();
}
/**
 * @return the current probing state of this prober
 */
@Override
public ProbingState getState() {
return this.state;
}
/**
 * Feeds a slice of bytes to the prober and returns the resulting state.
 *
 * @param buf    buffer holding the bytes to examine
 * @param offset index of the first byte to examine
 * @param length number of bytes to examine
 * @return the probing state after the call
 */
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
// End index of the slice (offset + length).
int maxPos = offset + length;
// NOTE(review): the next line is damaged in this copy. Everything between
// the loop condition's "i" and "SHORTCUT_THRESHOLD" is missing (it looks as
// if text from a '<' up to a '>' was stripped), so the loop body and the
// start of the trailing condition are lost and the method does not compile.
// Restore it from a pristine copy of the file rather than by hand.
for (int i=offset; i SHORTCUT_THRESHOLD) {
this.state = ProbingState.FOUND_IT;
}
}
return this.state;
}
/**
 * Returns the prober to its initial condition: resets the state machine and
 * the distribution analyzer, sets the state to DETECTING and zero-fills the
 * scratch buffer.
 */
@Override
public final void reset() {
this.codingSM.reset();
this.state = ProbingState.DETECTING;
this.distributionAnalyzer.reset();
java.util.Arrays.fill(this.lastChar, (byte) 0);
}
/**
 * Intentionally empty: this prober has no options to apply.
 */
@Override
public void setOption()
{}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/EscCharsetProber.java 0000664 0000000 0000000 00000012363 14625121707 0033034 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.prober.statemachine.HZSMModel;
import org.mozilla.universalchardet.prober.statemachine.ISO2022CNSMModel;
import org.mozilla.universalchardet.prober.statemachine.ISO2022JPSMModel;
import org.mozilla.universalchardet.prober.statemachine.ISO2022KRSMModel;
public class EscCharsetProber extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
private CodingStateMachine[] codingSM;
private int activeSM;
private ProbingState state;
private String detectedCharset;
private static final HZSMModel hzsModel = new HZSMModel();
private static final ISO2022CNSMModel iso2022cnModel = new ISO2022CNSMModel();
private static final ISO2022JPSMModel iso2022jpModel = new ISO2022JPSMModel();
private static final ISO2022KRSMModel iso2022krModel = new ISO2022KRSMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
public EscCharsetProber() {
super();
this.codingSM = new CodingStateMachine[4];
this.codingSM[0] = new CodingStateMachine(hzsModel);
this.codingSM[1] = new CodingStateMachine(iso2022cnModel);
this.codingSM[2] = new CodingStateMachine(iso2022jpModel);
this.codingSM[3] = new CodingStateMachine(iso2022krModel);
reset();
}
@Override
public String getCharSetName() {
return this.detectedCharset;
}
@Override
public float getConfidence() {
return 0.99f;
}
@Override
public ProbingState getState() {
return this.state;
}
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
int maxPos = offset + length;
for (int i=offset; i=0; --j) {
codingState = this.codingSM[j].nextState(buf[i]);
if (codingState == SMModel.ERROR) {
--this.activeSM;
if (this.activeSM <= 0) {
this.state = ProbingState.NOT_ME;
return this.state;
} else if (j != this.activeSM) {
CodingStateMachine t;
t = this.codingSM[this.activeSM];
this.codingSM[this.activeSM] = this.codingSM[j];
this.codingSM[j] = t;
}
} else if (codingState == SMModel.ITSME) {
this.state = ProbingState.FOUND_IT;
this.detectedCharset = this.codingSM[j].getCodingStateMachine();
return this.state;
}
}
}
return this.state;
}
@Override
public final void reset() {
this.state = ProbingState.DETECTING;
for (int i = 0; i < this.codingSM.length; ++i) {
this.codingSM[i].reset();
}
this.activeSM = this.codingSM.length;
this.detectedCharset = null;
}
@Override
public void setOption() {
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/GB18030Prober.java 0000664 0000000 0000000 00000011653 14625121707 0031675 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
* Lersh99
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.prober.distributionanalysis.GB2312DistributionAnalysis;
import org.mozilla.universalchardet.prober.statemachine.CodingStateMachine;
import org.mozilla.universalchardet.prober.statemachine.GB18030SMModel;
import org.mozilla.universalchardet.prober.statemachine.SMModel;
import org.mozilla.universalchardet.Constants;
/**
 * Charset prober that reports {@link Constants#CHARSET_GB18030}.
 * It owns a {@link CodingStateMachine} built on {@link GB18030SMModel} and
 * uses a {@link GB2312DistributionAnalysis} whose confidence is reported as
 * the prober's own.
 */
public class GB18030Prober extends CharsetProber {
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
// State machine driven by the shared GB18030 model below.
private CodingStateMachine codingSM;
// Current verdict; set to DETECTING by reset().
private ProbingState state;
// Note the analyzer type: GB2312 distribution analysis, not a GB18030 one.
private GB2312DistributionAnalysis distributionAnalyzer;
// Two-byte scratch buffer, zero-filled by reset().
// NOTE(review): presumably carries the trailing byte of the previous buffer
// across handleData() calls -- its use is not visible in this copy; confirm.
private byte[] lastChar;
// One model instance shared by every prober instance.
private static final SMModel smModel = new GB18030SMModel();
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
 * Creates the state machine and the distribution analyzer, allocates the
 * two-byte scratch buffer and puts the prober into its initial state via
 * {@link #reset()}.
 */
public GB18030Prober() {
super();
this.codingSM = new CodingStateMachine(smModel);
this.distributionAnalyzer = new GB2312DistributionAnalysis();
this.lastChar = new byte[2];
reset();
}
/**
 * @return the constant {@link Constants#CHARSET_GB18030}
 */
@Override
public String getCharSetName() {
return Constants.CHARSET_GB18030;
}
/**
 * @return the distribution analyzer's confidence, unchanged
 */
@Override
public float getConfidence() {
return this.distributionAnalyzer.getConfidence();
}
/**
 * @return the current probing state of this prober
 */
@Override
public ProbingState getState() {
return this.state;
}
/**
 * Feeds a slice of bytes to the prober and returns the resulting state.
 *
 * @param buf    buffer holding the bytes to examine
 * @param offset index of the first byte to examine
 * @param length number of bytes to examine
 * @return the probing state after the call
 */
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
int codingState;
// End index of the slice (offset + length).
int maxPos = offset + length;
// NOTE(review): the next line is damaged in this copy. Everything between
// the loop condition's "i" and "SHORTCUT_THRESHOLD" is missing (it looks as
// if text from a '<' up to a '>' was stripped), so the loop body and the
// start of the trailing condition are lost and the method does not compile.
// Restore it from a pristine copy of the file rather than by hand.
for (int i=offset; i SHORTCUT_THRESHOLD) {
this.state = ProbingState.FOUND_IT;
}
}
return this.state;
}
/**
 * Returns the prober to its initial condition: resets the state machine and
 * the distribution analyzer, sets the state to DETECTING and zero-fills the
 * scratch buffer.
 */
@Override
public final void reset() {
this.codingSM.reset();
this.state = ProbingState.DETECTING;
this.distributionAnalyzer.reset();
java.util.Arrays.fill(this.lastChar, (byte)0);
}
/**
 * Intentionally empty: this prober has no options to apply.
 */
@Override
public void setOption() {
}
}
juniversalchardet-2.5.0/src/main/java/org/mozilla/universalchardet/prober/HebrewProber.java 0000664 0000000 0000000 00000017537 14625121707 0032234 0 ustar 00root root 0000000 0000000 /* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Shy Shalom
* Portions created by the Initial Developer are Copyright (C) 2005
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Kohei TAKETA (Java port)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
package org.mozilla.universalchardet.prober;
import org.mozilla.universalchardet.Constants;
public class HebrewProber extends CharsetProber {
////////////////////////////////////////////////////////////////
// constants
////////////////////////////////////////////////////////////////
public static final int FINAL_KAF = 0xEA;
public static final int NORMAL_KAF = 0xEB;
public static final int FINAL_MEM = 0xED;
public static final int NORMAL_MEM = 0xEE;
public static final int FINAL_NUN = 0xEF;
public static final int NORMAL_NUN = 0xF0;
public static final int FINAL_PE = 0xF3;
public static final int NORMAL_PE = 0xF4;
public static final int FINAL_TSADI = 0xF5;
public static final int NORMAL_TSADI= 0xF6;
public static final byte SPACE = 0x20;
public static final int MIN_FINAL_CHAR_DISTANCE = 5;
public static final float MIN_MODEL_DISTANCE = 0.01f;
////////////////////////////////////////////////////////////////
// fields
////////////////////////////////////////////////////////////////
private int finalCharLogicalScore;
private int finalCharVisualScore;
private byte prev;
private byte beforePrev;
private CharsetProber logicalProber;
private CharsetProber visualProber;
////////////////////////////////////////////////////////////////
// methods
////////////////////////////////////////////////////////////////
/**
 * Creates a prober with no model probers attached and then calls
 * {@code reset()}. Both model probers start out {@code null};
 * {@code getCharSetName()} and {@code getState()} dereference them, so
 * {@code setModalProbers} has to be called before the prober is used.
 */
public HebrewProber() {
super();
this.logicalProber = null;
this.visualProber = null;
reset();
}
/**
 * Attaches the two model probers whose confidences and states this prober
 * consults.
 *
 * @param logicalProber prober stored as the logical model prober
 * @param visualProber  prober stored as the visual model prober
 */
public void setModalProbers(CharsetProber logicalProber, CharsetProber visualProber) {
this.logicalProber = logicalProber;
this.visualProber = visualProber;
}
/**
 * Chooses between {@link Constants#CHARSET_WINDOWS_1255} (logical) and
 * {@link Constants#CHARSET_ISO_8859_8} (visual) in three steps:
 * <ol>
 *   <li>a final-letter score lead of at least
 *       {@code MIN_FINAL_CHAR_DISTANCE} either way decides outright;</li>
 *   <li>otherwise a model-confidence lead larger than
 *       {@code MIN_MODEL_DISTANCE} either way decides;</li>
 *   <li>otherwise the sign of the final-letter lead decides, with a tie
 *       going to the logical charset.</li>
 * </ol>
 *
 * @return the name of the charset judged more likely
 */
@Override
public String getCharSetName() {
    final int finalCharLead = this.finalCharLogicalScore - this.finalCharVisualScore;

    // Step 1: a dominant final-letter lead is trusted on its own.
    if (finalCharLead >= MIN_FINAL_CHAR_DISTANCE) {
        return Constants.CHARSET_WINDOWS_1255;
    } else if (finalCharLead <= -MIN_FINAL_CHAR_DISTANCE) {
        return Constants.CHARSET_ISO_8859_8;
    }

    // Step 2: fall back to how far apart the two model probers are.
    final float logicalCf = this.logicalProber.getConfidence();
    final float visualCf = this.visualProber.getConfidence();
    final float modelLead = logicalCf - visualCf;
    if (modelLead > MIN_MODEL_DISTANCE) {
        return Constants.CHARSET_WINDOWS_1255;
    } else if (modelLead < -MIN_MODEL_DISTANCE) {
        return Constants.CHARSET_ISO_8859_8;
    }

    // Step 3: any visual lead at all wins; everything else defaults to logical.
    return (finalCharLead < 0)
            ? Constants.CHARSET_ISO_8859_8
            : Constants.CHARSET_WINDOWS_1255;
}
/**
 * @return always {@code 0.0f}; this prober reports no confidence of its own
 */
@Override
public float getConfidence() {
return 0.0f;
}
/**
 * Reports NOT_ME only when both model probers have answered NOT_ME;
 * while either of them has any other state this prober stays DETECTING.
 *
 * @return {@code ProbingState.NOT_ME} or {@code ProbingState.DETECTING}
 */
@Override
public ProbingState getState() {
    // The visual prober is consulted only if the logical one has given up.
    if (this.logicalProber.getState() != ProbingState.NOT_ME) {
        return ProbingState.DETECTING;
    }
    if (this.visualProber.getState() != ProbingState.NOT_ME) {
        return ProbingState.DETECTING;
    }
    return ProbingState.NOT_ME;
}
@Override
public ProbingState handleData(byte[] buf, int offset, int length) {
if (getState() == ProbingState.NOT_ME) {
return ProbingState.NOT_ME;
}
byte c;
int maxPos = offset + length;
for (int i=offset; i