charlock-holmes-0.6.9.4/0000755000175000017500000000000012133017375014271 5ustar ondrejondrejcharlock-holmes-0.6.9.4/ext/0000755000175000017500000000000012133017375015071 5ustar ondrejondrejcharlock-holmes-0.6.9.4/ext/charlock_holmes/0000755000175000017500000000000012133017375020226 5ustar ondrejondrejcharlock-holmes-0.6.9.4/ext/charlock_holmes/common.h0000644000175000017500000000170212133017375021667 0ustar ondrejondrej#ifndef CHARLOCK_COMMON_H #define CHARLOCK_COMMON_H // tell rbx not to use it's caching compat layer // by doing this we're making a promize to RBX that // we'll never modify the pointers we get back from RSTRING_PTR #define RSTRING_NOT_MODIFIED #include #ifdef HAVE_RUBY_ENCODING_H #include #endif static VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding); #else return rb_str_new(str, len); #endif } static VALUE charlock_new_str(const char *str, size_t len) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, len, rb_utf8_encoding()); #else return rb_str_new(str, len); #endif } static VALUE charlock_new_str2(const char *str) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding()); #else return rb_str_new2(str); #endif } #endifcharlock-holmes-0.6.9.4/ext/charlock_holmes/transliterator.cpp0000644000175000017500000000627412133017375024020 0ustar ondrejondrej#include "common.h" #undef UChar #include #include extern "C" { #ifdef HAVE_RUBY_ENCODING_H #include static VALUE rb_eEncodingCompatibilityError; static void check_utf8_encoding(VALUE str) { static rb_encoding *_cached[3] = {NULL, NULL, NULL}; rb_encoding *enc; if (_cached[0] == NULL) { _cached[0] = rb_utf8_encoding(); _cached[1] = rb_usascii_encoding(); _cached[2] = rb_ascii8bit_encoding(); } enc = rb_enc_get(str); if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) { 
rb_raise(rb_eEncodingCompatibilityError, "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc)); } } #else static void check_utf8_encoding(VALUE str) {} #endif extern VALUE rb_mCharlockHolmes; static VALUE rb_cTransliterator; static VALUE rb_transliterator_id_list(VALUE self) { UErrorCode status = U_ZERO_ERROR; StringEnumeration *id_list; int32_t id_list_size; const char *curr_id; int32_t curr_id_len; VALUE rb_ary; VALUE rb_curr_id; id_list_size = 0; id_list = Transliterator::getAvailableIDs(status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } status = U_ZERO_ERROR; id_list_size = id_list->count(status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } rb_ary = rb_ary_new2(id_list_size); do { curr_id_len = 0; curr_id = id_list->next(&curr_id_len, status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } if (curr_id != NULL) { rb_curr_id = charlock_new_str(curr_id, curr_id_len); rb_ary_push(rb_ary, rb_curr_id); } } while(curr_id != NULL); delete id_list; return rb_ary; } static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) { UErrorCode status = U_ZERO_ERROR; UParseError p_error; Transliterator *trans; const char *txt; size_t txt_len; const char *id; size_t id_len; UnicodeString *u_txt; std::string result; VALUE rb_out; Check_Type(rb_txt, T_STRING); Check_Type(rb_id, T_STRING); check_utf8_encoding(rb_txt); check_utf8_encoding(rb_id); txt = RSTRING_PTR(rb_txt); txt_len = RSTRING_LEN(rb_txt); id = RSTRING_PTR(rb_id); id_len = RSTRING_LEN(rb_id); trans = Transliterator::createInstance(UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } u_txt = new UnicodeString(txt, txt_len); trans->transliterate(*u_txt); StringByteSink sink(&result); u_txt->toUTF8(sink); delete u_txt; delete trans; rb_out = charlock_new_str(result.data(), result.length()); return rb_out; 
} void _init_charlock_transliterator() { #ifdef HAVE_RUBY_ENCODING_H rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError")); #endif rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject); rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0); rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2); } } charlock-holmes-0.6.9.4/ext/charlock_holmes/src/0000755000175000017500000000000012133017375021015 5ustar ondrejondrejcharlock-holmes-0.6.9.4/ext/charlock_holmes/converter.c0000644000175000017500000000274612133017375022412 0ustar ondrejondrej#include "unicode/ucnv.h" #include "common.h" extern VALUE rb_mCharlockHolmes; static VALUE rb_cConverter; static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) { VALUE rb_out; const char *src_enc; const char *dst_enc; const char *src_txt; char *out_buf; void *rb_enc = NULL; int32_t src_len; int32_t out_len; UErrorCode status = U_ZERO_ERROR; Check_Type(rb_txt, T_STRING); Check_Type(rb_src_enc, T_STRING); Check_Type(rb_dst_enc, T_STRING); src_txt = RSTRING_PTR(rb_txt); src_len = RSTRING_LEN(rb_txt); src_enc = RSTRING_PTR(rb_src_enc); dst_enc = RSTRING_PTR(rb_dst_enc); // first determin the size of the output buffer out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } out_buf = malloc(out_len); // now do the actual conversion status = U_ZERO_ERROR; out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status); if (U_FAILURE(status)) { free(out_buf); rb_raise(rb_eArgError, "%s", u_errorName(status)); } #ifdef HAVE_RUBY_ENCODING_H rb_enc = (void *)rb_enc_find(dst_enc); #endif rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc); free(out_buf); return rb_out; } void 
_init_charlock_converter() { rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject); rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3); } charlock-holmes-0.6.9.4/ext/charlock_holmes/extconf.rb0000644000175000017500000000476012133017375022230 0ustar ondrejondrejrequire 'mkmf' CWD = File.expand_path(File.dirname(__FILE__)) def sys(cmd) puts " -- #{cmd}" unless ret = xsystem(cmd) raise "#{cmd} failed, please report issue on http://github.com/brianmario/charlock_holmes" end ret end if `which make`.strip.empty? STDERR.puts "\n\n" STDERR.puts "***************************************************************************************" STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************" STDERR.puts "***************************************************************************************" exit(1) end ## # ICU dependency # dir_config 'icu' # detect homebrew installs if !have_library 'icui18n' base = if !`which brew`.empty? 
`brew --prefix`.strip elsif File.exists?("/usr/local/Cellar/icu4c") '/usr/local/Cellar' end if base and icu4c = Dir[File.join(base, 'Cellar/icu4c/*')].sort.last $INCFLAGS << " -I#{icu4c}/include " $LDFLAGS << " -L#{icu4c}/lib " end end unless have_library 'icui18n' and have_header 'unicode/ucnv.h' STDERR.puts "\n\n" STDERR.puts "***************************************************************************************" STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********" STDERR.puts "***************************************************************************************" exit(1) end ## # libmagic dependency # src = File.basename('file-5.08.tar.gz') dir = File.basename(src, '.tar.gz') Dir.chdir("#{CWD}/src") do FileUtils.rm_rf(dir) if File.exists?(dir) sys("tar zxvf #{src}") Dir.chdir(dir) do sys("./configure --prefix=#{CWD}/dst/ --disable-shared --enable-static --with-pic") sys("patch -p0 < ../file-soft-check.patch") sys("make -C src install") sys("make -C magic install") end end FileUtils.cp "#{CWD}/dst/lib/libmagic.a", "#{CWD}/libmagic_ext.a" $INCFLAGS[0,0] = " -I#{CWD}/dst/include " $LDFLAGS << " -L#{CWD} " dir_config 'magic' unless have_library 'magic_ext' and have_header 'magic.h' STDERR.puts "\n\n" STDERR.puts "***************************************************************************************" STDERR.puts "********* error compiling and linking libmagic. 
please report issue on github *********" STDERR.puts "***************************************************************************************" exit(1) end $CFLAGS << ' -Wall -funroll-loops' $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG'] create_makefile 'charlock_holmes/charlock_holmes' charlock-holmes-0.6.9.4/ext/charlock_holmes/encoding_detector.c0000644000175000017500000001726612133017375024065 0ustar ondrejondrej#include "unicode/ucsdet.h" #include "magic.h" #include "common.h" extern VALUE rb_mCharlockHolmes; static VALUE rb_cEncodingDetector; typedef struct { UCharsetDetector *csd; magic_t magic; } charlock_detector_t; static VALUE rb_encdec_buildmatch(const UCharsetMatch *match) { UErrorCode status = U_ZERO_ERROR; const char *mname; const char *mlang; int mconfidence; VALUE rb_match; if (!match) return Qnil; mname = ucsdet_getName(match, &status); mlang = ucsdet_getLanguage(match, &status); mconfidence = ucsdet_getConfidence(match, &status); rb_match = rb_hash_new(); rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text"))); rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), charlock_new_str2(mname)); rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence)); if (mlang && mlang[0]) rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang)); return rb_match; } static VALUE rb_encdec_binarymatch() { VALUE rb_match; rb_match = rb_hash_new(); rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary"))); rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100)); return rb_match; } static int detect_binary_content(charlock_detector_t *detector, VALUE rb_str) { const char *binary_result; binary_result = magic_buffer(detector->magic, RSTRING_PTR(rb_str), RSTRING_LEN(rb_str)); if (binary_result) { if (!strstr(binary_result, "text")) return 1; } else { rb_raise(rb_eStandardError, "%s", magic_error(detector->magic)); } return 0; } /* * call-seq: detection_hash = 
EncodingDetector.detect str[, hint_enc] * * Attempt to detect the encoding of this string * * str - a String, what you want to detect the encoding of * hint_enc - an optional String (like "UTF-8"), the encoding name which will * be used as an additional hint to the charset detector * * Returns: a Hash with :encoding, :language, :type and :confidence */ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self) { UErrorCode status = U_ZERO_ERROR; charlock_detector_t *detector; VALUE rb_str; VALUE rb_enc_hint; rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint); Check_Type(rb_str, T_STRING); Data_Get_Struct(self, charlock_detector_t, detector); // first lets see if this is binary content if (detect_binary_content(detector, rb_str)) { return rb_encdec_binarymatch(); } // if we got here - the data doesn't look like binary // lets try to figure out what encoding the text is in ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status); if (!NIL_P(rb_enc_hint)) { Check_Type(rb_enc_hint, T_STRING); ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status); } return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status)); } /* * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc] * * Attempt to detect the encoding of this string, and return * a list with all the possible encodings that match it. 
* * * str - a String, what you want to detect the encoding of * hint_enc - an optional String (like "UTF-8"), the encoding name which will * be used as an additional hint to the charset detector * * Returns: an Array with zero or more Hashes, * each one of them with with :encoding, :language, :type and :confidence */ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self) { UErrorCode status = U_ZERO_ERROR; charlock_detector_t *detector; const UCharsetMatch **csm; VALUE rb_ret; int i, match_count; VALUE rb_str; VALUE rb_enc_hint; VALUE binary_match; rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint); Check_Type(rb_str, T_STRING); Data_Get_Struct(self, charlock_detector_t, detector); rb_ret = rb_ary_new(); // first lets see if this is binary content binary_match = Qnil; if (detect_binary_content(detector, rb_str)) { binary_match = rb_encdec_binarymatch(); } ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status); if (!NIL_P(rb_enc_hint)) { Check_Type(rb_enc_hint, T_STRING); ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status); } csm = ucsdet_detectAll(detector->csd, &match_count, &status); for (i = 0; i < match_count; ++i) { rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i])); } if (!NIL_P(binary_match)) rb_ary_unshift(rb_ret, binary_match); return rb_ret; } /* * call-seq: EncodingDetector#strip_tags? * * Returns whether or not the strip_tags flag is set on this detector * * Returns: Boolean */ static VALUE rb_get_strip_tags(VALUE self) { charlock_detector_t *detector; UBool val; VALUE rb_val; Data_Get_Struct(self, charlock_detector_t, detector); val = ucsdet_isInputFilterEnabled(detector->csd); rb_val = val == 1 ? 
Qtrue : Qfalse; return rb_val; } /* * call-seq: EncodingDetector#strip_tags = true * * Enable or disable the stripping of HTML/XML tags from the input before * attempting any detection * * Returns: Boolean, the value passed */ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val) { charlock_detector_t *detector; UBool val; Data_Get_Struct(self, charlock_detector_t, detector); val = rb_val == Qtrue ? 1 : 0; ucsdet_enableInputFilter(detector->csd, val); return rb_val; } /* * call-seq: detectable_encodings = EncodingDetector.supported_encodings * * The list of detectable encodings supported by this library * * Returns: an Array of Strings */ static VALUE rb_get_supported_encodings(VALUE klass) { UCharsetDetector *csd; UErrorCode status = U_ZERO_ERROR; UEnumeration *encoding_list; VALUE rb_encoding_list; int32_t enc_count; int32_t i; const char *enc_name; int32_t enc_name_len; rb_encoding_list = rb_iv_get(klass, "encoding_list"); // lazily populate the list if (NIL_P(rb_encoding_list)) { csd = ucsdet_open(&status); encoding_list = ucsdet_getAllDetectableCharsets(csd, &status); rb_encoding_list = rb_ary_new(); enc_count = uenum_count(encoding_list, &status); for(i=0; i < enc_count; i++) { enc_name = uenum_next(encoding_list, &enc_name_len, &status); rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len)); } rb_iv_set(klass, "encoding_list", rb_encoding_list); ucsdet_close(csd); } return rb_encoding_list; } static void rb_encdec__free(void *obj) { charlock_detector_t *detector; detector = (charlock_detector_t *)obj; if (detector->csd) ucsdet_close(detector->csd); if (detector->magic) magic_close(detector->magic); free(detector); } static VALUE rb_encdec__alloc(VALUE klass) { charlock_detector_t *detector; UErrorCode status = U_ZERO_ERROR; VALUE obj; detector = calloc(1, sizeof(charlock_detector_t)); obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector); detector->csd = ucsdet_open(&status); if (U_FAILURE(status)) { 
rb_raise(rb_eStandardError, "%s", u_errorName(status)); } detector->magic = magic_open(MAGIC_NO_CHECK_SOFT); if (detector->magic == NULL) { rb_raise(rb_eStandardError, "%s", magic_error(detector->magic)); } return obj; } void _init_charlock_encoding_detector() { rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject); rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc); rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1); rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1); rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0); rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1); rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0); } charlock-holmes-0.6.9.4/ext/charlock_holmes/ext.c0000644000175000017500000000056412133017375021177 0ustar ondrejondrej#include "common.h" extern void _init_charlock_encoding_detector(); extern void _init_charlock_converter(); extern void _init_charlock_transliterator(); VALUE rb_mCharlockHolmes; void Init_charlock_holmes() { rb_mCharlockHolmes = rb_define_module("CharlockHolmes"); _init_charlock_encoding_detector(); _init_charlock_converter(); _init_charlock_transliterator(); }charlock-holmes-0.6.9.4/Rakefile0000644000175000017500000000050612133017375015737 0ustar ondrejondrejrequire 'rake/testtask' Rake::TestTask.new do |t| t.pattern = "test/**/*_test.rb" end task :default => :test gem 'rake-compiler', '>= 0.7.5' require "rake/extensiontask" Rake::ExtensionTask.new 'charlock_holmes' do |ext| ext.lib_dir = File.join 'lib', 'charlock_holmes' end Rake::Task[:test].prerequisites << :compilecharlock-holmes-0.6.9.4/test/0000755000175000017500000000000012133017375015250 5ustar ondrejondrejcharlock-holmes-0.6.9.4/test/string_methods_test.rb0000644000175000017500000000261512133017375021671 0ustar ondrejondrejrequire File.expand_path("../helper", __FILE__) 
require 'charlock_holmes/string' class StringMethodsTest < MiniTest::Unit::TestCase def test_adds_detect_encoding_method str = 'test' str.respond_to? :detect_encoding detected = str.detect_encoding assert_equal 'ISO-8859-1', detected[:encoding] end def test_detect_encoding_accepts_encoding_hint_param str = 'test' str.respond_to? :detect_encoding detected = str.detect_encoding 'UTF-8' assert_equal 'ISO-8859-1', detected[:encoding] end def test_adds_detect_encodings_method str = 'test' str.respond_to? :detect_encodings detected_list = str.detect_encodings assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end def test_detect_encodings_accepts_encoding_hint_param str = 'test' str.respond_to? :detect_encodings detected_list = str.detect_encodings 'UTF-8' assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end if RUBY_VERSION =~ /1.9/ def test_adds_detect_encoding_bang_method str = 'test' str.respond_to? :detect_encoding! str.detect_encoding! 
assert_equal Encoding.find('ISO-8859-1'), str.encoding end end endcharlock-holmes-0.6.9.4/test/converter_test.rb0000644000175000017500000000254412133017375020650 0ustar ondrejondrej# encoding: utf-8 require File.expand_path("../helper", __FILE__) class ConverterTest < MiniTest::Unit::TestCase def test_convert_ascii_from_iso859_1_to_utf16_and_back input = 'test' output = CharlockHolmes::Converter.convert input, 'ISO-8859-1', 'UTF-16' assert input.bytesize < output.bytesize assert input != output output = CharlockHolmes::Converter.convert output, 'UTF-16', 'ISO-8859-1' assert input.bytesize == output.bytesize assert input == output end def test_convert_utf8_to_utf16_and_back input = 'λ, λ, λ' output = CharlockHolmes::Converter.convert input, 'UTF-8', 'UTF-16' assert input.bytesize < output.bytesize assert input != output output = CharlockHolmes::Converter.convert output, 'UTF-16', 'UTF-8' assert input.bytesize == output.bytesize assert input == output end def test_params_must_be_strings assert_raises TypeError do CharlockHolmes::Converter.convert nil, 'UTF-8', 'UTF-16' end assert_raises TypeError do CharlockHolmes::Converter.convert 'lol', nil, 'UTF-16' end assert_raises TypeError do CharlockHolmes::Converter.convert 'lol', 'UTF-8', nil end begin CharlockHolmes::Converter.convert 'lol', 'UTF-8', 'UTF-16' rescue Exception => e assert_nil e, "#{e.class.name} raised, expected nothing" end end endcharlock-holmes-0.6.9.4/test/helper.rb0000644000175000017500000000052112133017375017052 0ustar ondrejondrej# Basic test environment. 
# blah fuck this require 'rubygems' if !defined?(Gem) require 'bundler/setup' require 'charlock_holmes' # bring in minitest require 'minitest/autorun' # put lib and test dirs directly on load path $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) $LOAD_PATH.unshift File.expand_path('..', __FILE__)charlock-holmes-0.6.9.4/test/encoding_detector_test.rb0000644000175000017500000000772112133017375022322 0ustar ondrejondrej# encoding: utf-8 require File.expand_path("../helper", __FILE__) class EncodingDetectorTest < MiniTest::Unit::TestCase def setup @detector = CharlockHolmes::EncodingDetector.new end def test_has_class_level_detect_method CharlockHolmes::EncodingDetector.respond_to? :detect detected = CharlockHolmes::EncodingDetector.detect 'test' assert_equal 'ISO-8859-1', detected[:encoding] end def test_class_level_detect_accepts_encoding_hint CharlockHolmes::EncodingDetector.respond_to? :detect detected = CharlockHolmes::EncodingDetector.detect 'test', 'UTF-8' assert_equal 'ISO-8859-1', detected[:encoding] end def test_has_class_level_detect_all_method CharlockHolmes::EncodingDetector.respond_to? :detect_all detected_list = CharlockHolmes::EncodingDetector.detect_all 'test' assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end def test_class_level_detect_all_method_accepts_encoding_hint CharlockHolmes::EncodingDetector.respond_to? :detect_all detected_list = CharlockHolmes::EncodingDetector.detect_all 'test', 'UTF-8' assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end def test_has_detect_method @detector.respond_to? :detect detected = @detector.detect 'test' assert_equal 'ISO-8859-1', detected[:encoding] end def test_detect_accepts_encoding_hint @detector.respond_to? 
:detect detected = @detector.detect 'test', 'UTF-8' assert_equal 'ISO-8859-1', detected[:encoding] end def test_has_detect_all_method @detector.respond_to? :detect_all detected_list = @detector.detect_all 'test' assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end def test_detect_all_accepts_encoding_hint @detector.respond_to? :detect_all detected_list = @detector.detect_all 'test', 'UTF-8' assert detected_list.is_a? Array encoding_list = detected_list.map {|d| d[:encoding]}.sort assert_equal ['ISO-8859-1', 'ISO-8859-2', 'UTF-8'], encoding_list end def test_strip_tags_flag detector = CharlockHolmes::EncodingDetector.new detector.strip_tags = true assert detector.strip_tags detection = detector.detect "
λ, λ, λ
" assert_equal 'UTF-8', detection[:encoding] detector.strip_tags = false assert !detector.strip_tags detection = detector.detect "
λ, λ, λ
" assert_equal 'UTF-8', detection[:encoding] end def test_has_list_of_supported_encodings CharlockHolmes::EncodingDetector.respond_to? :supported_encodings supported_encodings = CharlockHolmes::EncodingDetector.supported_encodings assert supported_encodings.is_a?(Array) assert supported_encodings.include? 'UTF-8' end MAPPING = [ ['repl2.cljs', 'ISO-8859-1', :text], ['core.rkt', 'UTF-8', :text], ['cl-messagepack.lisp', 'ISO-8859-1', :text], ['TwigExtensionsDate.es.yml', 'UTF-8', :text], ['AnsiGraph.psm1', 'UTF-16LE', :text], ['laholator.py', 'UTF-8', :text], ['hello_world', nil, :binary] ] def test_detection_works_as_expected MAPPING.each do |mapping| file, encoding, type = mapping path = File.expand_path "../fixtures/#{file}", __FILE__ content = File.read path guessed = @detector.detect content assert_equal encoding, guessed[:encoding] assert_equal type, guessed[:type] if content.respond_to?(:force_encoding) && guessed[:type] == :text content.force_encoding guessed[:encoding] assert content.valid_encoding? end end end endcharlock-holmes-0.6.9.4/test/transliterator_test.rb0000644000175000017500000001162112133017375021712 0ustar ondrejondrej# encoding: utf-8 require File.expand_path("../helper", __FILE__) class TransliteratorTest < MiniTest::Unit::TestCase DONT_CONVERT = [ "Vitrum edere possum; mihi non nocet.", # Latin "Je puis mangier del voirre. Ne me nuit.", # Old French "Kristala jan dezaket, ez dit minik ematen.", # Basque "Kaya kong kumain nang bubog at hindi ako masaktan.", # Tagalog "Ich kann Glas essen, ohne mir weh zu tun.", # German "I can eat glass and it doesn't hurt me.", # English ] CONVERT_PAIRS = { "Je peux manger du verre, ça ne me fait pas de mal." => # French "Je peux manger du verre, ca ne me fait pas de mal.", "Pot să mănânc sticlă și ea nu mă rănește." => # Romanian "Pot sa mananc sticla si ea nu ma raneste.", "Ég get etið gler án þess að meiða mig." 
=> # Icelandic "Eg get etid gler an thess ad meida mig.", "Unë mund të ha qelq dhe nuk më gjen gjë." => # Albanian "Une mund te ha qelq dhe nuk me gjen gje.", "Mogę jeść szkło i mi nie szkodzi." => # Polish "Moge jesc szklo i mi nie szkodzi.", # "Я могу есть стекло, оно мне не вредит." => # Russian # "Ia moghu iest' stieklo, ono mnie nie vriedit.", # "Мога да ям стъкло, то не ми вреди." => # Bulgarian # "Mogha da iam stklo, to nie mi vriedi.", # "ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬" => # Anglo-Saxon # "ic.mag.glas.eotacn.ond.hit.ne.heacrmiacth.me:", # "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει" => # Classical Greek # "ualon phagein dunamai; touto ou me blaptei", # "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती" => # Hindi # "maiN kaaNc khaa sktaa huuN aur mujhe usse koii cott nhiiN phuNctii", # "من می توانم بدونِ احساس درد شيشه بخورم" => # Persian # "mn my twnm bdwni Hss drd shyshh bkhwrm", # "أنا قادر على أكل الزجاج و هذا لا يؤلمن" => # Arabic # "'n qdr 'l~ 'kl lzjj w hdh l yw'lmn", # "אני יכול לאכול זכוכית וזה לא מזיק לי" => # Hebrew # "ny ykvl lkvl zkvkyt vzh l mzyq ly", # "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ" => # Thai # "chankinkracchkaid aetmanaimthamaihchanecchb", # "我能吞下玻璃而不伤身体。" => # Chinese # "Wo Neng Tun Xia Bo Li Er Bu Shang Shen Ti . ", # "私はガラスを食べられます。それは私を傷つけません。" => # Japanese # "Si hagarasuwoShi beraremasu. sorehaSi woShang tukemasen. ", # "⠋⠗⠁⠝⠉⠑" => # Braille # "france", "Schloß - Assunção - Łódź" => "Schloss - Assuncao - Lodz", "TÜM GOLLER Fb 4-1 Bursa Maç Özeti Íƶle" => "TUM GOLLER Fb 4-1 Bursa Mac Ozeti Izle", "ßßßßß" => "ssssssssss" } def test_transliterate trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC" DONT_CONVERT.each do |subject| assert_equal subject, trans(subject, trans_id) end CONVERT_PAIRS.each do |before, after| assert_equal after, trans(before, trans_id) end end if "".respond_to? 
:force_encoding def test_transliterate_id_must_be_utf8_or_ascii trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC".force_encoding('big5') txt = "blah blah blah" assert_raises Encoding::CompatibilityError do trans(txt, trans_id) end trans_id.force_encoding('UTF-8') begin trans(txt, trans_id) rescue Encoding::CompatibilityError => e assert_nil e, "#{e.class.name} raised, expected not to" end trans_id.force_encoding('US-ASCII') begin trans(txt, trans_id) rescue Encoding::CompatibilityError => e assert_nil e, "#{e.class.name} raised, expected not to" end end def test_transliterate_text_must_be_utf8_or_ascii trans_id = "Any-NFD; Any-Latin; Latin-ASCII; Any-NFC" txt = "blah blah blah".force_encoding('big5') assert_raises Encoding::CompatibilityError do trans(txt, trans_id) end txt.force_encoding('UTF-8') begin trans(txt, trans_id) rescue Encoding::CompatibilityError => e assert_nil e, "#{e.class.name} raised, expected not to" end txt.force_encoding('US-ASCII') begin trans(txt, trans_id) rescue Encoding::CompatibilityError => e assert_nil e, "#{e.class.name} raised, expected not to" end end end def test_transliterator_id_list_shouldnt_be_empty assert !CharlockHolmes::Transliterator.id_list.empty? 
end def trans(text, id) CharlockHolmes::Transliterator.transliterate(text, id) end end charlock-holmes-0.6.9.4/test/fixtures/0000755000175000017500000000000012133017375017121 5ustar ondrejondrejcharlock-holmes-0.6.9.4/test/fixtures/AnsiGraph.psm10000644000175000017500000000404212133017375021577 0ustar ondrejondrej# # Out-AnsiGraph.psm1 # Author: xcud # History: # v0.1 September 21, 2009 initial version # # PS Example> ps | select -first 5 | sort -property VM | # Out-AnsiGraph ProcessName, VM # AEADISRV %%% 14508032 # audiodg %%%%%%%%% 50757632 # conhost %%%%%%%%%%%%% 73740288 # AppleMobileDeviceService %%%%%%%%%%%%%%%% 92061696 # btdna %%%%%%%%%%%%%%%%%%%%% 126443520 # function Out-AnsiGraph($Parameter1=$null) { BEGIN { $q = new-object Collections.queue $max = 0; $namewidth = 0; } PROCESS { if($_) { $name = $_.($Parameter1[0]); $val = $_.($Parameter1[1]) if($max -lt $val) { $max = $val} if($namewidth -lt $name.length) { $namewidth = $name.length } $q.enqueue(@($name, $val)) } } END { $q | %{ $graph = ""; 0..($_[1]/$max*20) | %{ $graph += "%" } $name = "{0,$namewidth}" -f $_[0] "$name $graph " + $_[1] } } } Export-ModuleMember Out-AnsiGraphcharlock-holmes-0.6.9.4/test/fixtures/cl-messagepack.lisp0000644000175000017500000002213112133017375022670 0ustar ondrejondrej;;;; cl-messagepack.lisp (in-package #:messagepack) (declaim (optimize (debug 3))) (eval-when (:compile-toplevel :load-toplevel :execute) (defun mkstr (&rest args) (format nil "~{~a~}" args)) (defun mksymb (&rest args) (intern (apply #'mkstr args)))) (defmacro signed-unsigned-convertors (size) (let ((speed (if (< size 32) 3 0))) `(progn (defun ,(mksymb 'sb size '-> 'ub size) (sb) (declare (optimize (debug 0) (safety 0) (speed ,speed)) (type (integer ,(- (expt 2 (1- size))) ,(1- (expt 2 (1- size)))) sb)) (if (< sb 0) (ldb (byte ,size 0) sb) sb)) (defun ,(mksymb 'ub size '-> 'sb size) (sb) (declare (optimize (debug 0) (safety 0) (speed ,speed)) (type (mod ,(expt 2 size)) sb)) (if (logbitp (1- ,size) sb) (- 
(1+ (logxor (1- (expt 2 ,size)) sb))) sb))))) (signed-unsigned-convertors 8) (signed-unsigned-convertors 16) (signed-unsigned-convertors 32) (signed-unsigned-convertors 64) (defun write-hex (data) (let (line) (loop for i from 0 to (1- (length data)) do (push (elt data i) line) when (= (length line) 16) do (format t "~{~2,'0x ~}~%" (nreverse line)) (setf line nil)) (when line (format t "~{~2,'0x ~}~%" (nreverse line))))) (defun encode (data) (flexi-streams:with-output-to-sequence (stream) (encode-stream data stream))) (defun make-hash (data) (let ((result (make-hash-table))) (dolist (kv data) (cond ((consp (cdr kv)) (setf (gethash (first kv) result) (second kv))) (t (setf (gethash (car kv) result) (cdr kv))))) result)) (defun is-byte-array (data-type) (and (vectorp data-type) (equal '(unsigned-byte 8) (array-element-type data-type)))) (defun encode-stream (data stream) (cond ((floatp data) (encode-float data stream)) ((numberp data) (encode-integer data stream)) ((null data) (write-byte #xc0 stream)) ((eq data t) (write-byte #xc3 stream)) ((stringp data) (encode-string data stream)) ((is-byte-array data) (encode-raw-bytes data stream)) ((or (consp data) (vectorp data)) (encode-array data stream)) ((hash-table-p data) (encode-hash data stream)) ((symbolp data) (encode-string (symbol-name data) stream)) (t (error "Cannot encode data.")))) (defun encode-string (data stream) (encode-raw-bytes (babel:string-to-octets data) stream)) #+sbcl (defun sbcl-encode-float (data stream) (cond ((equal (type-of data) 'single-float) (write-byte #xca stream) (store-big-endian (sb-kernel:single-float-bits data) stream 4)) ((equal (type-of data) 'double-float) (write-byte #xcb stream) (store-big-endian (sb-kernel:double-float-high-bits data) stream 4) (store-big-endian (sb-kernel:double-float-low-bits data) stream 4))) t) (defun encode-float (data stream) (or #+sbcl (sbcl-encode-float data stream) #-(or sbcl) (error "No floating point support yet."))) (defun encode-each (data stream 
&optional (encoder #'encode-stream)) (cond ((hash-table-p data) (maphash (lambda (key value) (funcall encoder key stream) (funcall encoder value stream)) data)) ((or (vectorp data) (consp data)) (mapc (lambda (subdata) (funcall encoder subdata stream)) (coerce data 'list))) (t (error "Not sequence or hash table.")))) (defun encode-sequence (data stream short-prefix short-length typecode-16 typecode-32 &optional (encoder #'encode-stream)) (let ((len (if (hash-table-p data) (hash-table-count data) (length data)))) (cond ((<= 0 len short-length) (write-byte (+ short-prefix len) stream) (encode-each data stream encoder)) ((<= 0 len 65535) (write-byte typecode-16 stream) (store-big-endian len stream 2) (encode-each data stream encoder)) ((<= 0 len (1- (expt 2 32))) (write-byte typecode-32 stream) (store-big-endian len stream 4) (encode-each data stream encoder))))) (defun encode-hash (data stream) (encode-sequence data stream #x80 15 #xdc #xdd)) (defun encode-array (data stream) (encode-sequence data stream #x90 15 #xdc #xdd)) (defun encode-raw-bytes (data stream) (encode-sequence data stream #xa0 31 #xda #xdb #'write-byte)) (defun encode-integer (data stream) (cond ((<= 0 data 127) (write-byte data stream)) ((<= -32 data -1) (write-byte (sb8->ub8 data) stream)) ((<= 0 data 255) (write-byte #xcc stream) (write-byte data stream)) ((<= 0 data 65535) (write-byte #xcd stream) (store-big-endian data stream 2)) ((<= 0 data (1- (expt 2 32))) (write-byte #xce stream) (store-big-endian data stream 4)) ((<= 0 data (1- (expt 2 64))) (write-byte #xcf stream) (store-big-endian data stream 8)) ((<= -128 data 127) (write-byte #xd0 stream) (write-byte (sb8->ub8 data) stream)) ((<= -32768 data 32767) (write-byte #xd1 stream) (write-byte (sb16->ub16 data) stream)) ((<= (- (expt 2 31)) data (1- (expt 2 31))) (write-byte #xd2 stream) (write-byte (sb32->ub32 data) stream)) ((<= (- (expt 2 63)) data (1- (expt 2 63))) (write-byte #xd3 stream) (write-byte (sb64->ub64 data) stream)) (t (error 
"Integer too large or too small.")))) (defun store-big-endian (number stream byte-count) (let (byte-list) (loop while (> number 0) do (push (rem number 256) byte-list) (setf number (ash number -8))) (loop while (< (length byte-list) byte-count) do (push 0 byte-list)) (when (> (length byte-list) byte-count) (error "Number too large.")) (write-sequence byte-list stream))) (defun decode (byte-array) (flexi-streams:with-input-from-sequence (stream byte-array) (decode-stream stream))) (defun decode-stream (stream) (let ((byte (read-byte stream))) (cond ((= 0 (ldb (byte 1 7) byte)) byte) ((= 7 (ldb (byte 3 5) byte)) (ub8->sb8 byte)) ((= #xcc byte) (read-byte stream)) ((= #xcd byte) (load-big-endian stream 2)) ((= #xce byte) (load-big-endian stream 4)) ((= #xcf byte) (load-big-endian stream 8)) ((= #xd0 byte) (ub8->sb8 (read-byte stream))) ((= #xd1 byte) (ub16->sb16 (load-big-endian stream 2))) ((= #xd2 byte) (ub32->sb32 (load-big-endian stream 4))) ((= #xd3 byte) (ub64->sb64 (load-big-endian stream 8))) ((= #xc0 byte) nil) ((= #xc3 byte) t) ((= #xc2 byte) nil) ((= #xca byte) (or #+sbcl (sb-kernel:make-single-float (load-big-endian stream 4)) #-(or sbcl) (error "No floating point support yet."))) ((= #xcb byte) (or #+sbcl (sb-kernel:make-double-float (load-big-endian stream 4) (load-big-endian stream 4)) #-(or sbcl) (error "No floating point support yet."))) ((= 5 (ldb (byte 3 5) byte)) (decode-raw-sequence (ldb (byte 5 0) byte) stream)) ((= #xda byte) (decode-raw-sequence (load-big-endian stream 2) stream)) ((= #xdb byte) (decode-raw-sequence (load-big-endian stream 4) stream)) ((= 9 (ldb (byte 4 4) byte)) (decode-array (- byte #x90) stream)) ((= #xdc byte) (decode-array (load-big-endian stream 2) stream)) ((= #xdd byte) (decode-array (load-big-endian stream 4) stream)) ((= 8 (ldb (byte 4 4) byte)) (decode-map (- byte #x80) stream)) ((= #xde byte) (decode-map (load-big-endian stream 2) stream)) ((= #xdf byte) (decode-map (load-big-endian stream 4) stream))))) (defun 
decode-map (length stream) (let ((hash-table (make-hash-table :test #'equal))) (loop repeat length do (let ((key (decode-stream stream)) (value (decode-stream stream))) (setf (gethash key hash-table) value))) hash-table)) (defun decode-array (length stream) (let ((array (make-array length))) (dotimes (i length) (setf (aref array i) (decode-stream stream))) array)) (defun decode-raw-sequence (length stream) (let ((seq (make-array length :element-type '(mod 256)))) (read-sequence seq stream) (babel:octets-to-string seq))) (defun load-big-endian (stream byte-count) (let ((result 0)) (loop repeat byte-count do (setf result (+ (ash result 8) (read-byte stream)))) result)) charlock-holmes-0.6.9.4/test/fixtures/repl2.cljs0000644000175000017500000000726312133017375021032 0ustar ondrejondrej; Copyright (c) Rich Hickey. All rights reserved. ; The use and distribution terms for this software are covered by the ; Eclipse Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) ; which can be found in the file epl-v10.html at the root of this distribution. ; By using this software in any fashion, you are agreeing to be bound by ; the terms of this license. ; You must not remove this notice, or any other, from this software. (ns clojure.browser.repl2 (:require [clojure.browser.net :as net] [clojure.browser.event :as event] [goog.json :as gjson])) ;; Notes ;; ===== ;; ;; Using keywords for the service names does not work in Chrome or ;; FireFox. ;; ;; -- (defn log-obj [obj] (.log js/console obj)) ;; Outer/Parent Peer ;; ================= ;; ;; The code in this section will be run in the parent page which ;; exists in the application's domain. This is where code will be ;; evaluated. (def parent-channel (atom nil)) (defn- ensure-string [val] (if (string? val) val (str val))) (defn evaluate-javascript "Given a block of JavaScript, evaluate it and transmit the result to the inner peer of the cross domain channel." 
[block] (log-obj (str "evaluating: " block)) (let [result (pr-str (try {:status :success :value (ensure-string (js* "eval(~{block})"))} (catch js/Error e {:status :exception :value (pr-str e)})))] (log-obj (str "result: " result)) (net/transmit @parent-channel "return-value" result))) (defn create-cross-domain-channel "Create a cross domain channel with an iframe which can communicate with the REPL server." [url] (let [chnl (doto (net/xpc-connection {:peer_uri (str url "/repl")}) (net/register-service "evaluate-javascript" evaluate-javascript) (net/connect document.body (fn [] (log-obj "Parent channel connected.")) (fn [iframe] (set! iframe.style.display "none"))))] (reset! parent-channel chnl))) (defn connect "Connect to a ClojureScript REPL server located at the passed url." [url] (goog.events/listen js/window "load" #(create-cross-domain-channel url))) ;; Inner peer ;; ========= ;; ;; The code in this section will be run in the child iframe and can ;; communicate with REPL server. (def state (atom {:connection nil :url nil})) (def child-channel (atom nil)) (defn transmit-post [connection url data] (net/transmit connection url "POST" data nil 0)) (defn start-repl-connection "Start the REPL loop" [url] (let [connection (net/xhr-connection)] (reset! state {:connection connection :url url}) (event/listen connection :success (fn [e] (net/transmit @child-channel "evaluate-javascript" (.getResponseText e/currentTarget ())))) ;; The server is expecting to see the string "ready" for the ;; initial connection. (transmit-post connection url "ready"))) (defn return-value [val] (log-obj (str "sending: " val)) (transmit-post (:connection @state) (:url @state) val)) ;; I can't get this to work using the clojure.browser.net api. (defn inner-peer-channel "This function will be called from a script in the child iframe." [repl-url] (let [cfg (gjson/parse (.getParameterValue (goog.Uri. window.location.href) "xpc")) chnl (doto (goog.net.xpc.CrossPageChannel. 
cfg) (net/register-service "return-value" return-value) (.connect #(log-obj "Child channel connected.")))] (do (reset! child-channel chnl) (js/setTimeout #(start-repl-connection repl-url) 500)))) charlock-holmes-0.6.9.4/test/fixtures/core.rkt0000644000175000017500000003032612133017375020577 0ustar ondrejondrej#lang racket/base (require (for-syntax syntax/parse racket/syntax racket) ffi/unsafe racket/function racket/string "start.rkt" "c.rkt") (struct jtype (signature tag predicate ctype racket->java java->racket)) (struct jtype/object jtype (class)) (struct jtype/vector jtype/object (element)) (struct jvector (cpointer type length)) (struct jprocedure (args return proc)) (define ((single-compose f1 f2) e) (f1 (f2 e))) (define (make-jtype obj racket->java java->racket) (let ([composed-racket->java (single-compose (jtype-racket->java obj) racket->java)] [composed-java->racket (single-compose java->racket (jtype-java->racket obj))]) ; due to limitation in racket's struct-copy (cond [(jtype/vector? obj) (struct-copy jtype/vector obj [racket->java #:parent jtype composed-racket->java] [java->racket #:parent jtype composed-java->racket])] [(jtype/object? obj) (struct-copy jtype/object obj [racket->java #:parent jtype composed-racket->java] [java->racket #:parent jtype composed-java->racket])] [else (struct-copy jtype obj [racket->java composed-racket->java] [java->racket composed-java->racket])]))) (define (jtype->ctype obj) (make-ctype (jtype-ctype obj) (jtype-racket->java obj) (jtype-java->racket obj))) ; --- signature makers --- (define (make-class-signature c) (string-append "L" c ";")) (define (make-vector-signature s) (string-append "[" s)) (define (make-signature args return) (let ([args-signature (string-append* (map jtype-signature args))] [return-signature (jtype-signature return)]) (string-append "(" args-signature ")" return-signature))) ; --- predicates for java types on racket --- (require (only-in web-server/dispatch/extend make-coerce-safe?) 
srfi/26/cut) (define jboolean? boolean?) (define jbyte? byte?) (define jchar? char?) (define jshort? (make-coerce-safe? (cut < -32768 <> 32767))) (define jint? (make-coerce-safe? (cut < -2147483648 <> 2147483647))) (define jlong? (make-coerce-safe? (cut < -9223372036854775808 <> 9223372036854775807))) (define jfloat? single-flonum?) (define jdouble? flonum?) (define jstring? string?) (define ((make-jobject-predicate clss) o) (instance-of? o clss)) (define ((make-jlist-predicate element?) o) (andmap element? o)) ; --- java types --- (define _jboolean (jtype "Z" 'boolean jboolean? __jboolean #f #f)) (define _jbyte (jtype "B" 'byte jbyte? __jbyte #f #f)) (define _jchar (jtype "C" 'char jchar? __jchar char->integer integer->char)) (define _jshort (jtype "S" 'short jshort? __jshort #f #f)) (define _jint (jtype "I" 'int jint? __jint #f #f)) (define _jlong (jtype "J" 'long jlong? __jlong #f #f)) (define _jfloat (jtype "F" 'float jfloat? __jfloat #f #f)) (define _jdouble (jtype "D" 'double jdouble? __jdouble #f #f)) (define _jvoid (jtype "V" 'void #f __jvoid #f #f)) ; hack for _jobject and _jlist so that they dual as a jtype and function (define _jobject ((λ () (struct _jobject jtype/object () #:property prop:procedure (λ (self class-name [racket->java #f] [java->racket #f] [predicate #f]) (let ([class-id (find-class class-name)]) (struct-copy jtype/object self [signature #:parent jtype (make-class-signature class-name)] [predicate #:parent jtype (or predicate (make-jobject-predicate class-id))] [racket->java #:parent jtype racket->java] [java->racket #:parent jtype java->racket] [class class-id])))) (let ([class-id (find-class "Ljava/lang/Object;")]) (_jobject "Ljava/lang/Object;" 'object (make-jobject-predicate class-id) __jobject #f #f class-id))))) (define _jstring (_jobject "java/lang/String" new-string get-string jstring?)) (define _jlist ((λ () (struct _jlist jtype/vector () #:property prop:procedure (λ (self element) (define-values (make-array array-ref array-set!) 
(tag->array-info (jtype-tag element))) (when (jtype/object? element) (let ([clss (jtype/object-class element)]) (set! make-array (λ (n) (new-object-array n clss #f))))) (let* ([signature (make-vector-signature (jtype-signature element))] [element-racket->java (or (jtype-racket->java element) identity)] [element-java->racket (or (jtype-java->racket element) identity)] [element? (or (jtype-predicate element) (λ (_) #t))]) (struct-copy jtype/vector self [signature #:parent jtype signature] [predicate #:parent jtype (make-jlist-predicate element?)] [ctype #:parent jtype __jobject] [racket->java #:parent jtype (λ (c) (let ([array (make-array (length c))]) (for ([e (in-list c)] [i (in-naturals)]) (array-set! array i (element-racket->java e))) array))] [java->racket #:parent jtype (λ (c) (for/list ([i (in-range (get-array-length c))]) (element-java->racket (array-ref c i))))] [class #:parent jtype/object (find-class signature)] [element element])))) (let ([class-id (find-class "[Ljava/lang/Object;")] [element-class-id (jtype/object-class _jobject)]) (_jlist "[Ljava/lang/Object;" 'object (make-jobject-predicate element-class-id) __jobject (λ (c) (let ([array (new-object-array (length c) element-class-id #f)]) (for ([e (in-list c)] [i (in-naturals)]) (set-object-array-element array i e)) array)) (λ (c) (for/list ([i (in-range (get-array-length c))]) (get-object-array-element c i))) class-id _jobject))))) (define-syntax (_jmethod stx) (define-syntax-class type #:literals (->) (pattern (~and x (~not (~or (~literal ...) ->))))) (syntax-parse stx #:literals (->) [(_ arg:type ... (~optional (~seq farg:type (~literal ...))) (~optional (~seq -> return*))) (with-syntax* ([(arg* ...) (generate-temporaries #'(arg ...))] [(larg ... . marg) #`(arg* ... #,@(if (attribute farg) #'arg-rest #`()))] [(aarg ...) #`(arg* ... #,@(if (attribute farg) #'(arg-rest) #`()))] [return (if (attribute return*) #'return* #'_jvoid)]) #`(let* ([args (list arg ... 
#,@(if (attribute farg) #`((_jlist farg)) #`()))]) (jprocedure args return (λ (type jnienv clss method func) (case type [(constructor) (λ (larg ... . marg) (func jnienv clss method aarg ...))] [(static-method) (λ (larg ... . marg) (func jnienv clss method aarg ...))] [(method) (λ (o larg ... . marg) (func jnienv o method aarg ...))] [else (error '_jmethod "invalid type provided")])))))])) ; dynamic and slower version of _jmethod (define (_jprocedure args return #:repeat-last-arg? [repeat-last-arg? #f]) (define (nest-at lst i) (if (null? lst) (list null) (let loop ([lst lst] [i i]) (cond [(null? lst) null] [(zero? i) (list lst)] [else (cons (car lst) (loop (cdr lst) (sub1 i)))])))) (jprocedure args return (if repeat-last-arg? (let ([repeat-position (sub1 (length args))]) (λ (type jnienv clss method func) (case type [(constructor) (λ larg (apply func jnienv clss method (nest-at larg repeat-position)))] [(static-method) (λ larg (apply func jnienv clss method (nest-at larg repeat-position)))] [(method) (λ (o . larg) (apply func jnienv o method (nest-at larg repeat-position)))]))) (λ (type jnienv clss method func) (case type [(constructor) (λ larg (apply func jnienv clss method larg))] [(static-method) (λ larg (apply func jnienv clss method larg))] [(method) (λ (o . larg) (apply func jnienv o method larg))] [else (error '_jprocedure "invalid type provided")]))))) ; get-jmethod/get-jconstructor pass the following arguments (type jnienv class method func) ; to a function created by _jmethod or _jprocedure ; according to the type the function returns one of the following functions ; | constructor (λ (args ...) ; doesn't need to take in an object and the class is static ; | static-method (λ (args ...) ; same reasoning as above ; | method (λ (object args ...) 
; --- interfacing with java methods --- (define (get-jconstructor class-id type) (let* ([args (jprocedure-args type)] [return (jprocedure-return type)] [proc (jprocedure-proc type)] [signature (make-signature args return)] [method-id (get-method-id class-id "" signature)] [ffi-func (get-jrffi-obj "new-object" (_cprocedure (list* __jnienv __jclass __jmethodID (map jtype->ctype args)) __jobject))]) (proc 'constructor current-jnienv class-id method-id ffi-func))) (define (get-jmethod class-id method-name type #:static? [static? #f]) (let* ([args (jprocedure-args type)] [return (jprocedure-return type)] [proc (jprocedure-proc type)] [signature (make-signature args return)] [method-id (get-method-id class-id method-name signature #:static? static?)] [type (if static? 'static-method 'method)] [ffi-func (get-jrffi-obj (format "call-~a~a-method" (if static? "static-" "") (jtype-tag return)) (_cprocedure (append (list __jnienv (if static? __jclass __jobject) __jmethodID) (map jtype->ctype args)) (jtype->ctype return)))]) (proc type current-jnienv class-id method-id ffi-func))) ; --- interfacing with java fields --- (define (get-jaccessor class-id field-name type #:static? [static? #f]) (let* ([signature (jtype-signature class-id field-name (jtype-signature type))] [field-id (get-field-id class-id field-name signature #:static? static?)] [ffi-func (get-jrffi-obj (format "get-~a~a-field" (if static? "static-" "") (jtype-tag type)) (_cprocedure (list __jnienv (if static? __jclass __jobject) __jfieldID) type))]) (if static? (λ () (ffi-func current-jnienv class-id field-id)) (λ (obj) (ffi-func current-jnienv obj field-id))))) (define (get-jmutator class-id field-name type #:static? [static? #f]) (let* ([signature (jtype-signature class-id field-name (jtype-signature type))] [field-id (get-field-id class-id field-name signature #:static? static?)] [ffi-func (get-jrffi-obj (format "set-~a~a-field" (if static? "static-" "") (jtype-tag type)) (_cprocedure (list __jnienv (if static? 
__jclass __jobject) __jfieldID type) type))]) (if static? (λ (new-value) (ffi-func current-jnienv class-id field-id new-value)) (λ (obj new-value) (ffi-func current-jnienv obj field-id new-value))))) (define (get-jparameter class-id field-name type #:static? [static? #f]) (let* ([accessor (get-jaccessor class-id field-name type #:static? static?)] [mutator (get-jmutator class-id field-name type #:static? static?)]) (if static? (case-lambda [() (accessor)] [(new-value) (mutator new-value)]) (case-lambda [(obj) (accessor obj)] [(obj new-value) (mutator obj new-value)])))) (provide _jboolean _jbyte _jchar _jshort _jint _jlong _jfloat _jdouble _jvoid _jobject _jstring _jlist) (provide get-jconstructor get-jmethod get-jparameter get-jmutator get-jaccessor) ;(provide instance-of? (rename-out [find-class find-class]) get-method-id get-field-id) (provide (all-defined-out) : -> current-jnienv) charlock-holmes-0.6.9.4/test/fixtures/hello_world0000755000175000017500000002076012133017375021366 0ustar ondrejondrej  H__PAGEZERO(__TEXT__text__TEXT`__stubs__TEXT  __stub_helper__TEXT,$,__cstring__TEXTP P__unwind_info__TEXT\P\__eh_frame__TEXTH__DATA__program_vars__DATA(__nl_symbol_ptr__DATA((__la_symbol_ptr__DATA88__common__DATAH H__LINKEDIT  "0   8 x !p Ph! 
/usr/lib/dyld>;9m:$ * 8/usr/lib/libSystem.B.dylib& jHHH}HuHHHH9uH %UHHHAHEH]%%h h LAS%hello world44!4 @zRx ,0  HPX`,6"UBR@dyld_stub_binderQr(r8@_exitr@@_puts_ startK_'mainPNXArgUenvirongmh_execute_headerG_prognamelc]vb    @  H P `%9 XBHNTZ @@ _pvars_NXArgc_NXArgv___progname__mh_execute_header_environ_mainstart_exit_putsdyld_stub_bindercharlock-holmes-0.6.9.4/test/fixtures/TwigExtensionsDate.es.yml0000644000175000017500000000044712133017375024047 0ustar ondrejondrejdate.year: '%year% año|%year% años' date.month: '%month% mes|%month% meses' date.day: '%day% día|%day% días' date.hour: '%hour% hora|%hour% horas' date.minute: '%minute% minuto|%minute% minutos' date.second: '%second% segundo|%second% segundos' date.new: 'menos de un minuto' date.and: ' y 'charlock-holmes-0.6.9.4/.gitignore0000644000175000017500000000017212133017375016261 0ustar ondrejondrej.bundle/ tmp/ vendor/ *.bundle ext/charlock_holmes/dst *.a ext/charlock_holmes/src/file-* ext/charlock_holmes/src/mkmf.logcharlock-holmes-0.6.9.4/charlock_holmes.gemspec0000644000175000017500000000155312133017375020777 0ustar ondrejondrej# encoding: utf-8 require './lib/charlock_holmes/version' unless defined? 
CharlockHolmes::VERSION Gem::Specification.new do |s| s.name = %q{charlock_holmes} s.version = CharlockHolmes::VERSION s.authors = ["Brian Lopez", "Vicent Martí"] s.date = Time.now.utc.strftime("%Y-%m-%d") s.email = %q{seniorlopez@gmail.com} s.extensions = ["ext/charlock_holmes/extconf.rb"] s.files = `git ls-files`.split("\n") s.homepage = %q{http://github.com/brianmario/charlock_holmes} s.rdoc_options = ["--charset=UTF-8"] s.require_paths = ["lib"] s.rubygems_version = %q{1.4.2} s.summary = %q{Character encoding detection, brought to you by ICU} s.test_files = `git ls-files spec`.split("\n") # tests s.add_development_dependency 'rake-compiler', ">= 0.7.5" s.add_development_dependency 'minitest' # benchmarks s.add_development_dependency 'chardet' end charlock-holmes-0.6.9.4/lib/0000755000175000017500000000000012133017375015037 5ustar ondrejondrejcharlock-holmes-0.6.9.4/lib/charlock_holmes/0000755000175000017500000000000012133017375020174 5ustar ondrejondrejcharlock-holmes-0.6.9.4/lib/charlock_holmes/string.rb0000644000175000017500000000210512133017375022025 0ustar ondrejondrejrequire 'charlock_holmes' unless defined? CharlockHolmes class String # Attempt to detect the encoding of this string # # Returns: a Hash with :encoding, :language, :type and :confidence def detect_encoding(hint_enc=nil) detector = CharlockHolmes::EncodingDetector.new detector.detect(self, hint_enc) end # Attempt to detect the encoding of this string, and return # a list with all the possible encodings that match it. 
# # Returns: an Array with zero or more Hashes, # each one of them with with :encoding, :language, :type and :confidence def detect_encodings(hint_enc=nil) detector = CharlockHolmes::EncodingDetector.new detector.detect_all(self, hint_enc) end if RUBY_VERSION =~ /1.9/ # Attempt to detect the encoding of this string # then set the encoding to what was detected ala `force_encoding` # # Returns: self def detect_encoding!(hint_enc=nil) if detected = self.detect_encoding(hint_enc) self.force_encoding(detected[:encoding]) if detected[:encoding] end self end end end charlock-holmes-0.6.9.4/lib/charlock_holmes/encoding_detector.rb0000644000175000017500000000241112133017375024176 0ustar ondrejondrejmodule CharlockHolmes class EncodingDetector alias :strip_tags? :strip_tags # Attempt to detect the encoding of this string # # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call # # str - a String, what you want to detect the encoding of # hint_enc - an optional String (like "UTF-8"), the encoding name which will # be used as an additional hint to the charset detector # # Returns: a Hash with :encoding, :language, :type and :confidence def self.detect(str, hint_enc=nil) new.detect(str, hint_enc) end # Attempt to detect the encoding of this string, and return # a list with all the possible encodings that match it. 
# # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call # # str - a String, what you want to detect the encoding of # hint_enc - an optional String (like "UTF-8"), the encoding name which will # be used as an additional hint to the charset detector # # Returns: an Array with zero or more Hashes, # each one of them with with :encoding, :language, :type and :confidence def self.detect_all(str, hint_enc=nil) new.detect_all(str, hint_enc) end end endcharlock-holmes-0.6.9.4/lib/charlock_holmes/version.rb0000644000175000017500000000006012133017375022202 0ustar ondrejondrejmodule CharlockHolmes VERSION = "0.6.9.4" end charlock-holmes-0.6.9.4/lib/charlock_holmes.rb0000644000175000017500000000037112133017375020522 0ustar ondrejondrejrequire 'charlock_holmes/charlock_holmes' require 'charlock_holmes/encoding_detector' require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION # require this if you want the String monkey patches # require 'charlock_holmes/string' charlock-holmes-0.6.9.4/README.md0000644000175000017500000000640312133017375015553 0ustar ondrejondrej# CharlockHolmes Character encoding detecting library for Ruby using [ICU](http://site.icu-project.org/) ## Usage First you'll need to require it ``` ruby require 'charlock_holmes' ``` ## Encoding detection ``` ruby contents = File.read('test.xml') detection = CharlockHolmes::EncodingDetector.detect(contents) # => {:encoding => 'UTF-8', :confidence => 100, :type => :text} # optionally there will be a :language key as well, but # that's mostly only returned for legacy encodings like ISO-8859-1 ``` NOTE: `CharlockHolmes::EncodingDetector.detect` will return `nil` if it was unable to find an encoding. 
For binary content, `:type` will be set to `:binary`.

Though it's more efficient to reuse one detector instance:

``` ruby
detector = CharlockHolmes::EncodingDetector.new

detection1 = detector.detect(File.read('test.xml'))
detection2 = detector.detect(File.read('test2.json'))

# and so on...
```

### String monkey patch

Alternatively, you can just use the `detect_encoding` method on the `String` class

``` ruby
require 'charlock_holmes/string'

contents = File.read('test.xml')

detection = contents.detect_encoding
```

### Ruby 1.9 specific

NOTE: This method only exists on Ruby 1.9+

If you want to use this library to detect and set the encoding flag on strings, you can use the `detect_encoding!` method on the `String` class

``` ruby
require 'charlock_holmes/string'

contents = File.read('test.xml')

# this will detect and set the encoding of `contents`, then return self
contents.detect_encoding!
```

## Transcoding

Being able to detect the encoding of some arbitrary content is nice, but what you probably want is to be able to transcode that content into an encoding your application is using.

``` ruby
content = File.read('test2.txt')
detection = CharlockHolmes::EncodingDetector.detect(content)
utf8_encoded_content = CharlockHolmes::Converter.convert content, detection[:encoding], 'UTF-8'
```

The first parameter is the content to transcode, the second is the source encoding (the encoding the content is assumed to be in), and the third parameter is the destination encoding.

## Installing If the traditional `gem install charlock_holmes` doesn't work, you may need to specify the path to your installation of ICU using the `--with-icu-dir` option during the gem install or by configuring Bundler to pass those arguments to Gem: Configure Bundler to always use the correct arguments when installing: bundle config build.charlock_holmes --with-icu-dir=/path/to/installed/icu4c Using Gem to install directly without Bundler: gem install charlock_holmes -- --with-icu-dir=/path/to/installed/icu4c ### Homebrew If you're installing on Mac OS X then using [Homebrew](http://mxcl.github.com/homebrew/) is the easiest way to install ICU. However, be warned; it is a Keg-Only (see [homedir issue #167](https://github.com/mxcl/homebrew/issues/167) for more info) install meaning RubyGems won't find it when installing without specifying `--with-icu-dir` To install ICU with Homebrew: brew install icu4c Configure Bundler to always use the correct arguments when installing: bundle config build.charlock_holmes --with-icu-dir=/usr/local/opt/icu4c Using Gem to install directly without Bundler: gem install charlock_holmes -- --with-icu-dir=/usr/local/opt/icu4c charlock-holmes-0.6.9.4/Gemfile.lock0000644000175000017500000000044712133017375016520 0ustar ondrejondrejPATH remote: . specs: charlock_holmes (0.6.9.4) GEM remote: http://rubygems.org/ specs: chardet (0.9.0) minitest (4.6.2) rake (0.9.2) rake-compiler (0.7.9) rake PLATFORMS ruby DEPENDENCIES chardet charlock_holmes! 
minitest rake-compiler (>= 0.7.5) charlock-holmes-0.6.9.4/metadata.yml0000644000175000017500000000612212133017375016575 0ustar ondrejondrej--- !ruby/object:Gem::Specification name: charlock_holmes version: !ruby/object:Gem::Version version: 0.6.9.4 prerelease: platform: ruby authors: - Brian Lopez - Vicent Martí autorequire: bindir: bin cert_chain: [] date: 2013-04-03 00:00:00.000000000 Z dependencies: - !ruby/object:Gem::Dependency name: rake-compiler requirement: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: 0.7.5 type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: 0.7.5 - !ruby/object:Gem::Dependency name: minitest requirement: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' - !ruby/object:Gem::Dependency name: chardet requirement: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' type: :development prerelease: false version_requirements: !ruby/object:Gem::Requirement none: false requirements: - - ! 
'>=' - !ruby/object:Gem::Version version: '0' description: email: seniorlopez@gmail.com executables: [] extensions: - ext/charlock_holmes/extconf.rb extra_rdoc_files: [] files: - .gitignore - Gemfile - Gemfile.lock - MIT-LICENSE - README.md - Rakefile - benchmark/detection.rb - benchmark/test.txt - charlock_holmes.gemspec - ext/charlock_holmes/common.h - ext/charlock_holmes/converter.c - ext/charlock_holmes/encoding_detector.c - ext/charlock_holmes/ext.c - ext/charlock_holmes/extconf.rb - ext/charlock_holmes/src/file-5.08.tar.gz - ext/charlock_holmes/src/file-soft-check.patch - ext/charlock_holmes/transliterator.cpp - lib/charlock_holmes.rb - lib/charlock_holmes/encoding_detector.rb - lib/charlock_holmes/string.rb - lib/charlock_holmes/version.rb - test/converter_test.rb - test/encoding_detector_test.rb - test/fixtures/AnsiGraph.psm1 - test/fixtures/TwigExtensionsDate.es.yml - test/fixtures/cl-messagepack.lisp - test/fixtures/core.rkt - test/fixtures/hello_world - test/fixtures/laholator.py - test/fixtures/repl2.cljs - test/helper.rb - test/string_methods_test.rb - test/transliterator_test.rb homepage: http://github.com/brianmario/charlock_holmes licenses: [] post_install_message: rdoc_options: - --charset=UTF-8 require_paths: - lib required_ruby_version: !ruby/object:Gem::Requirement none: false requirements: - - ! '>=' - !ruby/object:Gem::Version version: '0' required_rubygems_version: !ruby/object:Gem::Requirement none: false requirements: - - ! 
'>=' - !ruby/object:Gem::Version version: '0' requirements: [] rubyforge_project: rubygems_version: 1.8.23 signing_key: specification_version: 3 summary: Character encoding detection, brought to you by ICU test_files: [] charlock-holmes-0.6.9.4/Gemfile0000644000175000017500000000003112133017375015556 0ustar ondrejondrejsource :rubygems gemspeccharlock-holmes-0.6.9.4/benchmark/0000755000175000017500000000000012133017375016223 5ustar ondrejondrejcharlock-holmes-0.6.9.4/benchmark/test.txt0000644000175000017500000020162712133017375017753 0ustar ondrejondrej XML - 维基百科,自由的百科全书

XML

维基百科,自由的百科全书
跳转到: 导航, 搜索

跳过字词转换说明

RecipeBook的例子,一種基於XML語法的烹飪技術書刊。此標籤可使用程式語言XSL轉換為HTML、PDF以及Rich Text Format。

可扩展置标语言英语eXtensible Markup Language,简称:XML),又称可扩展标记语言,是一种置标语言。置标指计算机所能理解的信息符号,通过此种标记,计算机之间可以处理包含各种信息的文章等。如何定义这些标记,既可以选择国际通用的标记语言,比如HTML,也可以使用像XML这样由相关人士自由决定的标记语言,这就是语言的可扩展性。XML是从标准通用置标语言(SGML)中简化修改出来的。它主要用到的有可扩展置标语言、可扩展样式语言(XSL)、XBRLXPath等。

目录

[编辑] 歷史

XML是從1995年開始有其雛形,並向W3C(全球資訊網聯盟)提案,而在1998年二月發佈為W3C的標準(XML1.0)。XML的前身是SGML(The Standard Generalized Markup Language),是自IBM從1960年代就開始發展的GML(Generalized Markup Language)標準化後的名稱。

GML的重要概念:

  • 文件中能夠明確的將標示與內容分開
  • 所有文件的標示使用方法均一致

1978年,ANSI將GML加以整理規範,發佈成為SGML,1986年起為ISO所採用(ISO 8879),並且被廣泛地運用在各種大型的文件計劃中,但是SGML是一種非常嚴謹的文件描述法,導致過於龐大複雜(標準手冊就有500多頁),難以理解和學習,進而影響其推廣與應用。

同時W3C也發現到HTML的問題:

  • 不能解決所有解釋資料的問題 - 像是影音檔或化學公式、音樂符號等其他形態的內容。
  • 效能問題 - 需要下載整份文件,才能開始對文件做搜尋。
  • 擴充性、彈性、易讀性均不佳。

為了解決以上問題,專家們使用SGML精簡製作,並依照HTML的發展經驗,產生出一套使用上規則嚴謹,但是簡單的描述資料語言:XML。 XML是在一個這樣的背景下誕生的——为了有一個更中立的方式,讓消費端自行決定要如何消化、呈現從服務端所提供的資訊。

XML被廣泛用來作為跨平台之間交互數據的形式,主要針對數據的內容,通過不同的格式化描述手段(XSLT,CSS等)可以完成最終的形式表達(生成對應的HTML,PDF或者其他的文件格式)。

[编辑] 用途

XML设计用来传送及携带数据信息,不用来表现或展示数据,HTML语言則用来表现数据,所以XML用途的焦点是它说明数据是什么,以及携带数据信息。

  • 丰富文件(Rich Documents)- 自定文件描述并使其更丰富
    • 属于文件为主的XML技术应用
    • 标记是用来定义一份资料应该如何呈现
  • 元数据(Metadata)- 描述其它文件或网络资讯
    • 属于资料为主的XML技术应用
    • 标记是用来说明一份资料的意义
  • 設定档案(Configuration Files)- 描述软件設定的参数

[编辑]

XML定义结构、存储信息、传送信息。下例為張旭发送给陳貞伶的便条,存储为XML。

 
<小纸条> 
    <收件人>陳貞伶</收件人> 
    <发件人>張旭</发件人> 
    <主题>問候</主题> 
    <具体内容>最近可好?</具体内容> 
</小纸条> 

这XML文档仅是纯粹的信息标签,这些标签意义的展开依赖于应用它的程序。

[编辑] 结构

每个XML文档都由XML序言开始,在前面的代码中的第一行便是XML序言,<?xml version="1.0"?>。这一行代码会告诉解析器和浏览器,这个文件应该按照前面讨论过的XML规则进行解析。第二行代码,<books>,则是文档元素(document element),它是文件中最外面的标签(我们认为元素(element)是起始标签和结束标签之间的内容)。所有其他的标签必须包含在这个标签之内来组成一个有效的XML文件。XML文件的第二行并不一定要包含文档元素;如果有注释或者其他内容,文档元素可以迟些出现。

范例文件中的第三行代码是注释,你会发现它与HTML中使用的注释风格是一样的。这是XML从SGML中继承的语法元素之一。

页面再往下的一些地方,可以发现<desc>标签裡有一些特殊的语法。<![CDATA[ ]]>代码用于表示无需进行解析的文本,允许诸如大于号和小于号之类的特殊字符包含在文本中,而无需担心破坏XML的语法。文本必须出现在<![CDATA[和]]>之间才能合适地避免被解析。这样的文本称为Character Data Section,简称CData Section。

下面的一行就是在第二本书的定义之前的:

<?page render multiple authors ?>

虽然它看上去很像XML序言,但实际上是一种称为处理指令(processing instruction)的不同类型的语法。处理指令(以下简称PI)的目的是为了给处理页面的程序(例如XML解析器)提供额外的信息。PI通常情况下是没有固定格式的,唯一的要求是紧随第一个问号必须至少有一个字母。在此之后,PI可以包含除了小于号和大于号之外的任何字符串序列。

最常见的PI是用来指定XML文件的样式表:

这个PI一般会直接放在XML序言之后,通常由Web浏览器使用,来将XML数据以特殊的样式显示出来。

XML的结构有一个缺陷,那就是不支持分帧(framing)。当多条XML消息在TCP上传输的时候,无法基于XML协议来确定一条XML消息是否已经结束。

[编辑] 参见

[编辑] 外部链接


charlock-holmes-0.6.9.4/benchmark/detection.rb0000644000175000017500000000145312133017375020531 0ustar ondrejondrej$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) RUBY_19 = !!(RUBY_VERSION =~ /1.9/) require 'charlock_holmes' # the chardet gem isn't compatible with 1.9 require 'UniversalDetector' unless RUBY_19 require 'benchmark' CONTENT = File.read(File.expand_path('../test.txt', __FILE__)) TIMES = 100 DETECTOR = CharlockHolmes::EncodingDetector.new Benchmark.bmbm do |x| # new detector every iteration x.report 'singleton call' do TIMES.times do CharlockHolmes::EncodingDetector.detect CONTENT end end # shared detector for all iterations x.report 'reusing a single detector' do TIMES.times do DETECTOR.detect CONTENT end end unless RUBY_19 x.report 'chardet' do TIMES.times do UniversalDetector.chardet CONTENT end end end end charlock-holmes-0.6.9.4/MIT-LICENSE0000644000175000017500000000207512133017375015731 0ustar ondrejondrejCopyright (c) 2011 Brian Lopez - http://github.com/brianmario Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.