charlock_holmes-0.7.6/0000755000004100000410000000000013301507117014707 5ustar www-datawww-datacharlock_holmes-0.7.6/charlock_holmes.gemspec0000644000004100000410000000442613301507117021417 0ustar www-datawww-data######################################################### # This file has been automatically generated by gem2tgz # ######################################################### # -*- encoding: utf-8 -*- # stub: charlock_holmes 0.7.6 ruby lib # stub: ext/charlock_holmes/extconf.rb Gem::Specification.new do |s| s.name = "charlock_holmes".freeze s.version = "0.7.6" s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version= s.require_paths = ["lib".freeze] s.authors = ["Brian Lopez".freeze, "Vicent Mart\u{ed}".freeze] s.date = "2018-03-29" s.description = "charlock_holmes provides binary and text detection as well as text transcoding using libicu".freeze s.email = "seniorlopez@gmail.com".freeze s.extensions = ["ext/charlock_holmes/extconf.rb".freeze] s.files = ["ext/charlock_holmes/common.h".freeze, "ext/charlock_holmes/converter.c".freeze, "ext/charlock_holmes/encoding_detector.c".freeze, "ext/charlock_holmes/ext.c".freeze, "ext/charlock_holmes/extconf.rb".freeze, "ext/charlock_holmes/transliterator.cpp".freeze, "lib/charlock_holmes.rb".freeze, "lib/charlock_holmes/encoding_detector.rb".freeze, "lib/charlock_holmes/string.rb".freeze, "lib/charlock_holmes/version.rb".freeze] s.homepage = "https://github.com/brianmario/charlock_holmes".freeze s.licenses = ["MIT".freeze] s.rdoc_options = ["--charset=UTF-8".freeze] s.required_ruby_version = Gem::Requirement.new(">= 1.9.3".freeze) s.rubygems_version = "2.5.2.1".freeze s.summary = "Character encoding detection, brought to you by ICU".freeze if s.respond_to? :specification_version then s.specification_version = 4 if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then s.add_development_dependency(%q.freeze, ["~> 0.9"]) s.add_development_dependency(%q.freeze, ["~> 5.11"]) s.add_development_dependency(%q.freeze, ["~> 1.0"]) else s.add_dependency(%q.freeze, ["~> 0.9"]) s.add_dependency(%q.freeze, ["~> 5.11"]) s.add_dependency(%q.freeze, ["~> 1.0"]) end else s.add_dependency(%q.freeze, ["~> 0.9"]) s.add_dependency(%q.freeze, ["~> 5.11"]) s.add_dependency(%q.freeze, ["~> 1.0"]) end end charlock_holmes-0.7.6/lib/0000755000004100000410000000000013301507117015455 5ustar www-datawww-datacharlock_holmes-0.7.6/lib/charlock_holmes.rb0000644000004100000410000000037113301507117021140 0ustar www-datawww-datarequire 'charlock_holmes/charlock_holmes' require 'charlock_holmes/encoding_detector' require 'charlock_holmes/version' unless defined? CharlockHolmes::VERSION # require this if you want the String monkey patches # require 'charlock_holmes/string' charlock_holmes-0.7.6/lib/charlock_holmes/0000755000004100000410000000000013301507117020612 5ustar www-datawww-datacharlock_holmes-0.7.6/lib/charlock_holmes/version.rb0000644000004100000410000000005613301507117022625 0ustar www-datawww-datamodule CharlockHolmes VERSION = "0.7.6" end charlock_holmes-0.7.6/lib/charlock_holmes/string.rb0000644000004100000410000000213113301507117022442 0ustar www-datawww-datarequire 'charlock_holmes' unless defined? CharlockHolmes class String # Attempt to detect the encoding of this string # # Returns: a Hash with :encoding, :language, :type and :confidence def detect_encoding(hint_enc=nil) detector = CharlockHolmes::EncodingDetector.new detector.detect(self, hint_enc) end # Attempt to detect the encoding of this string, and return # a list with all the possible encodings that match it. # # Returns: an Array with zero or more Hashes, # each one of them with with :encoding, :language, :type and :confidence def detect_encodings(hint_enc=nil) detector = CharlockHolmes::EncodingDetector.new detector.detect_all(self, hint_enc) end if method_defined? :force_encoding # Attempt to detect the encoding of this string # then set the encoding to what was detected ala `force_encoding` # # Returns: self def detect_encoding!(hint_enc=nil) if detected = self.detect_encoding(hint_enc) self.force_encoding(detected[:ruby_encoding]) if detected[:ruby_encoding] end self end end end charlock_holmes-0.7.6/lib/charlock_holmes/encoding_detector.rb0000644000004100000410000000522513301507117024622 0ustar www-datawww-datamodule CharlockHolmes class EncodingDetector # Default length for which to scan content for NULL bytes DEFAULT_BINARY_SCAN_LEN = 1024*1024 # Length for which to scan content for NULL bytes attr_accessor :binary_scan_length alias :strip_tags? :strip_tags def initialize(scan_len=DEFAULT_BINARY_SCAN_LEN) @binary_scan_length = scan_len end # Attempt to detect the encoding of this string # # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call # as well as use the default binary scan length # # str - a String, what you want to detect the encoding of # hint_enc - an optional String (like "UTF-8"), the encoding name which will # be used as an additional hint to the charset detector # # Returns: a Hash with :encoding, :language, :type and :confidence def self.detect(str, hint_enc=nil) new.detect(str, hint_enc) end # Attempt to detect the encoding of this string, and return # a list with all the possible encodings that match it. # # NOTE: This will create a new CharlockHolmes::EncodingDetector instance on every call # as well as use the default binary scan length # # str - a String, what you want to detect the encoding of # hint_enc - an optional String (like "UTF-8"), the encoding name which will # be used as an additional hint to the charset detector # # Returns: an Array with zero or more Hashes, # each one of them with with :encoding, :language, :type and :confidence def self.detect_all(str, hint_enc=nil) new.detect_all(str, hint_enc) end # A mapping table of supported encoding names from EncodingDetector # which point to the corresponding supported encoding name in Ruby. # Like: {"UTF-8" => "UTF-8", "IBM420_rtl" => "ASCII-8BIT"} # # Note that encodings that can't be mapped between Charlock and Ruby will resolve # to "ASCII-8BIT". @encoding_table = {} def self.encoding_table @encoding_table end BINARY = 'binary' # Builds the ENCODING_TABLE hash by running through the list of supported encodings # in the ICU detection API and trying to map them to supported encodings in Ruby. # This is built dynamically so as to take advantage of ICU upgrades which may have # support for more encodings in the future. # # Returns nothing. def self.build_encoding_table supported_encodings.each do |name| @encoding_table[name] = begin ::Encoding.find(name).name rescue ArgumentError BINARY end end end build_encoding_table end end charlock_holmes-0.7.6/ext/0000755000004100000410000000000013301507117015507 5ustar www-datawww-datacharlock_holmes-0.7.6/ext/charlock_holmes/0000755000004100000410000000000013301507117020644 5ustar www-datawww-datacharlock_holmes-0.7.6/ext/charlock_holmes/converter.c0000644000004100000410000000274613301507117023030 0ustar www-datawww-data#include "unicode/ucnv.h" #include "common.h" extern VALUE rb_mCharlockHolmes; static VALUE rb_cConverter; static VALUE rb_converter_convert(VALUE self, VALUE rb_txt, VALUE rb_src_enc, VALUE rb_dst_enc) { VALUE rb_out; const char *src_enc; const char *dst_enc; const char *src_txt; char *out_buf; void *rb_enc = NULL; int32_t src_len; int32_t out_len; UErrorCode status = U_ZERO_ERROR; Check_Type(rb_txt, T_STRING); Check_Type(rb_src_enc, T_STRING); Check_Type(rb_dst_enc, T_STRING); src_txt = RSTRING_PTR(rb_txt); src_len = RSTRING_LEN(rb_txt); src_enc = RSTRING_PTR(rb_src_enc); dst_enc = RSTRING_PTR(rb_dst_enc); // first determin the size of the output buffer out_len = ucnv_convert(dst_enc, src_enc, NULL, 0, src_txt, src_len, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } out_buf = malloc(out_len); // now do the actual conversion status = U_ZERO_ERROR; out_len = ucnv_convert(dst_enc, src_enc, out_buf, out_len, src_txt, src_len, &status); if (U_FAILURE(status)) { free(out_buf); rb_raise(rb_eArgError, "%s", u_errorName(status)); } #ifdef HAVE_RUBY_ENCODING_H rb_enc = (void *)rb_enc_find(dst_enc); #endif rb_out = charlock_new_enc_str(out_buf, out_len, rb_enc); free(out_buf); return rb_out; } void _init_charlock_converter() { rb_cConverter = rb_define_class_under(rb_mCharlockHolmes, "Converter", rb_cObject); rb_define_singleton_method(rb_cConverter, "convert", rb_converter_convert, 3); } charlock_holmes-0.7.6/ext/charlock_holmes/extconf.rb0000644000004100000410000000400513301507117022636 0ustar www-datawww-datarequire 'mkmf' CWD = File.expand_path(File.dirname(__FILE__)) def sys(cmd) puts " -- #{cmd}" unless ret = xsystem(cmd) raise "#{cmd} failed, please report issue on https://github.com/brianmario/charlock_holmes" end ret end if `which make`.strip.empty? STDERR.puts "\n\n" STDERR.puts "***************************************************************************************" STDERR.puts "*************** make required (apt-get install make build-essential) =( ***************" STDERR.puts "***************************************************************************************" exit(1) end ## # ICU dependency # dir_config 'icu' rubyopt = ENV.delete("RUBYOPT") icuconfig = "" icu4c = "/usr" # detect homebrew installs if !have_library 'icui18n' base = if !`which brew`.empty? `brew --cellar`.strip elsif File.exists?("/usr/local/Cellar/icu4c") '/usr/local/Cellar' end if base and icu4c = Dir[File.join(base, 'icu4c/*')].sort.last $INCFLAGS << " -I#{icu4c}/include " $LDFLAGS << " -L#{icu4c}/lib " icuconfig = "#{icu4c}/bin/icu-config" end end unless have_library 'icui18n' and have_header 'unicode/ucnv.h' STDERR.puts "\n\n" STDERR.puts "***************************************************************************************" STDERR.puts "*********** icu required (brew install icu4c or apt-get install libicu-dev) ***********" STDERR.puts "***************************************************************************************" exit(1) end have_library 'z' or abort 'libz missing' have_library 'icuuc' or abort 'libicuuc missing' have_library 'icudata' or abort 'libicudata missing' # icu4c might be built in C++11 mode, but it also might not have been icuconfig = `which icu-config`.chomp if icuconfig.empty? if File.exist?(icuconfig) && `#{icuconfig} --cxxflags`.include?("c++11") $CXXFLAGS << ' -std=c++11' end $CFLAGS << ' -Wall -funroll-loops' $CFLAGS << ' -Wextra -O0 -ggdb3' if ENV['DEBUG'] ENV['RUBYOPT'] = rubyopt create_makefile 'charlock_holmes/charlock_holmes' charlock_holmes-0.7.6/ext/charlock_holmes/common.h0000644000004100000410000000173013301507117022306 0ustar www-datawww-data#ifndef CHARLOCK_COMMON_H #define CHARLOCK_COMMON_H // tell rbx not to use it's caching compat layer // by doing this we're making a promize to RBX that // we'll never modify the pointers we get back from RSTRING_PTR #define RSTRING_NOT_MODIFIED #include #ifdef HAVE_RUBY_ENCODING_H #include #endif static inline VALUE charlock_new_enc_str(const char *str, size_t len, void *encoding) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, len, (rb_encoding *)encoding); #else return rb_str_new(str, len); #endif } static inline VALUE charlock_new_str(const char *str, size_t len) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, len, rb_utf8_encoding()); #else return rb_str_new(str, len); #endif } static inline VALUE charlock_new_str2(const char *str) { #ifdef HAVE_RUBY_ENCODING_H return rb_external_str_new_with_enc(str, strlen(str), rb_utf8_encoding()); #else return rb_str_new2(str); #endif } #endif charlock_holmes-0.7.6/ext/charlock_holmes/transliterator.cpp0000644000004100000410000000634413301507117024434 0ustar www-datawww-data#include "common.h" #undef UChar #include #include extern "C" { #ifdef HAVE_RUBY_ENCODING_H #include static VALUE rb_eEncodingCompatibilityError; static void check_utf8_encoding(VALUE str) { static rb_encoding *_cached[3] = {NULL, NULL, NULL}; rb_encoding *enc; if (_cached[0] == NULL) { _cached[0] = rb_utf8_encoding(); _cached[1] = rb_usascii_encoding(); _cached[2] = rb_ascii8bit_encoding(); } enc = rb_enc_get(str); if (enc != _cached[0] && enc != _cached[1] && enc != _cached[2]) { rb_raise(rb_eEncodingCompatibilityError, "Input must be UTF-8 or US-ASCII, %s given", rb_enc_name(enc)); } } #else static void check_utf8_encoding(VALUE str) {} #endif extern VALUE rb_mCharlockHolmes; static VALUE rb_cTransliterator; static VALUE rb_transliterator_id_list(VALUE self) { UErrorCode status = U_ZERO_ERROR; icu::StringEnumeration *id_list; int32_t id_list_size; const char *curr_id; int32_t curr_id_len; VALUE rb_ary; VALUE rb_curr_id; id_list_size = 0; id_list = icu::Transliterator::getAvailableIDs(status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } status = U_ZERO_ERROR; id_list_size = id_list->count(status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } rb_ary = rb_ary_new2(id_list_size); do { curr_id_len = 0; curr_id = id_list->next(&curr_id_len, status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } if (curr_id != NULL) { rb_curr_id = charlock_new_str(curr_id, curr_id_len); rb_ary_push(rb_ary, rb_curr_id); } } while(curr_id != NULL); delete id_list; return rb_ary; } static VALUE rb_transliterator_transliterate(VALUE self, VALUE rb_txt, VALUE rb_id) { UErrorCode status = U_ZERO_ERROR; UParseError p_error; icu::Transliterator *trans; const char *txt; size_t txt_len; const char *id; size_t id_len; icu::UnicodeString *u_txt; std::string result; VALUE rb_out; Check_Type(rb_txt, T_STRING); Check_Type(rb_id, T_STRING); check_utf8_encoding(rb_txt); check_utf8_encoding(rb_id); txt = RSTRING_PTR(rb_txt); txt_len = RSTRING_LEN(rb_txt); id = RSTRING_PTR(rb_id); id_len = RSTRING_LEN(rb_id); trans = icu::Transliterator::createInstance(icu::UnicodeString(id, id_len), UTRANS_FORWARD, p_error, status); if(!U_SUCCESS(status)) { rb_raise(rb_eArgError, "%s", u_errorName(status)); } u_txt = new icu::UnicodeString(txt, txt_len); trans->transliterate(*u_txt); icu::StringByteSink sink(&result); u_txt->toUTF8(sink); delete u_txt; delete trans; rb_out = charlock_new_str(result.data(), result.length()); return rb_out; } void _init_charlock_transliterator() { #ifdef HAVE_RUBY_ENCODING_H rb_eEncodingCompatibilityError = rb_const_get(rb_cEncoding, rb_intern("CompatibilityError")); #endif rb_cTransliterator = rb_define_class_under(rb_mCharlockHolmes, "Transliterator", rb_cObject); rb_define_singleton_method(rb_cTransliterator, "id_list", (VALUE(*)(...))rb_transliterator_id_list, 0); rb_define_singleton_method(rb_cTransliterator, "transliterate", (VALUE(*)(...))rb_transliterator_transliterate, 2); } } charlock_holmes-0.7.6/ext/charlock_holmes/encoding_detector.c0000644000004100000410000002256513301507117024501 0ustar www-datawww-data#include "unicode/ucsdet.h" #include "common.h" extern VALUE rb_mCharlockHolmes; static VALUE rb_cEncodingDetector; typedef struct { UCharsetDetector *csd; } charlock_detector_t; static VALUE rb_encdec_buildmatch(const UCharsetMatch *match) { UErrorCode status = U_ZERO_ERROR; const char *mname; const char *mlang; int mconfidence; VALUE rb_match; VALUE enc_tbl; VALUE enc_name; VALUE compat_enc; if (!match) return Qnil; mname = ucsdet_getName(match, &status); mlang = ucsdet_getLanguage(match, &status); mconfidence = ucsdet_getConfidence(match, &status); rb_match = rb_hash_new(); rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("text"))); enc_name = charlock_new_str2(mname); rb_hash_aset(rb_match, ID2SYM(rb_intern("encoding")), enc_name); enc_tbl = rb_iv_get(rb_cEncodingDetector, "@encoding_table"); compat_enc = rb_hash_aref(enc_tbl, enc_name); if (!NIL_P(compat_enc)) { rb_hash_aset(rb_match, ID2SYM(rb_intern("ruby_encoding")), compat_enc); } rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(mconfidence)); if (mlang && mlang[0]) rb_hash_aset(rb_match, ID2SYM(rb_intern("language")), charlock_new_str2(mlang)); return rb_match; } static VALUE rb_encdec_binarymatch() { VALUE rb_match; rb_match = rb_hash_new(); rb_hash_aset(rb_match, ID2SYM(rb_intern("type")), ID2SYM(rb_intern("binary"))); rb_hash_aset(rb_match, ID2SYM(rb_intern("confidence")), INT2NUM(100)); return rb_match; } static int detect_binary_content(VALUE self, VALUE rb_str) { size_t buf_len, scan_len; const char *buf; buf = RSTRING_PTR(rb_str); buf_len = RSTRING_LEN(rb_str); scan_len = NUM2ULL(rb_iv_get(self, "@binary_scan_length")); if (buf_len > 10) { // application/postscript if (!memcmp(buf, "%!PS-Adobe-", 11)) return 0; } if (buf_len > 7) { // image/png if (!memcmp(buf, "\x89PNG\x0D\x0A\x1A\x0A", 8)) return 1; } if (buf_len > 5) { // image/gif if (!memcmp(buf, "GIF87a", 6)) return 1; // image/gif if (!memcmp(buf, "GIF89a", 6)) return 1; } if (buf_len > 4) { // application/pdf if (!memcmp(buf, "%PDF-", 5)) return 1; } if (buf_len > 3) { // UTF-32BE if (!memcmp(buf, "\0\0\xfe\xff", 4)) return 0; // UTF-32LE if (!memcmp(buf, "\xff\xfe\0\0", 4)) return 0; } if (buf_len > 2) { // image/jpeg if (!memcmp(buf, "\xFF\xD8\xFF", 3)) return 1; } if (buf_len > 1) { // UTF-16BE if (!memcmp(buf, "\xfe\xff", 2)) return 0; // UTF-16LE if (!memcmp(buf, "\xff\xfe", 2)) return 0; } /* * If we got this far, any NULL bytes within the `scan_len` * range will likely mean the contents are binary. */ if (scan_len < buf_len) buf_len = scan_len; return !!memchr(buf, 0, buf_len); } /* * call-seq: true/false = EncodingDetector.is_binary? str * * Attempt to detect if a string is binary or text * * str - a String, what you want to perform the binary check on * * Returns: true or false */ static VALUE rb_encdec_is_binary(VALUE self, VALUE str) { if (detect_binary_content(self, str)) return Qtrue; else return Qfalse; } /* * call-seq: detection_hash = EncodingDetector.detect str[, hint_enc] * * Attempt to detect the encoding of this string * * str - a String, what you want to detect the encoding of * hint_enc - an optional String (like "UTF-8"), the encoding name which will * be used as an additional hint to the charset detector * * Returns: a Hash with :encoding, :language, :type and :confidence */ static VALUE rb_encdec_detect(int argc, VALUE *argv, VALUE self) { UErrorCode status = U_ZERO_ERROR; charlock_detector_t *detector; VALUE rb_str; VALUE rb_enc_hint; rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint); Check_Type(rb_str, T_STRING); Data_Get_Struct(self, charlock_detector_t, detector); // first lets see if this is binary content if (detect_binary_content(self, rb_str)) { return rb_encdec_binarymatch(); } // if we got here - the data doesn't look like binary // lets try to figure out what encoding the text is in ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status); if (!NIL_P(rb_enc_hint)) { Check_Type(rb_enc_hint, T_STRING); ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status); } return rb_encdec_buildmatch(ucsdet_detect(detector->csd, &status)); } /* * call-seq: detection_hash_array = EncodingDetector.detect_all str[, hint_enc] * * Attempt to detect the encoding of this string, and return * a list with all the possible encodings that match it. * * * str - a String, what you want to detect the encoding of * hint_enc - an optional String (like "UTF-8"), the encoding name which will * be used as an additional hint to the charset detector * * Returns: an Array with zero or more Hashes, * each one of them with with :encoding, :language, :type and :confidence */ static VALUE rb_encdec_detect_all(int argc, VALUE *argv, VALUE self) { UErrorCode status = U_ZERO_ERROR; charlock_detector_t *detector; const UCharsetMatch **csm; VALUE rb_ret; int i, match_count; VALUE rb_str; VALUE rb_enc_hint; VALUE binary_match; rb_scan_args(argc, argv, "11", &rb_str, &rb_enc_hint); Check_Type(rb_str, T_STRING); Data_Get_Struct(self, charlock_detector_t, detector); rb_ret = rb_ary_new(); // first lets see if this is binary content binary_match = Qnil; if (detect_binary_content(self, rb_str)) { binary_match = rb_encdec_binarymatch(); } ucsdet_setText(detector->csd, RSTRING_PTR(rb_str), (int32_t)RSTRING_LEN(rb_str), &status); if (!NIL_P(rb_enc_hint)) { Check_Type(rb_enc_hint, T_STRING); ucsdet_setDeclaredEncoding(detector->csd, RSTRING_PTR(rb_enc_hint), RSTRING_LEN(rb_enc_hint), &status); } csm = ucsdet_detectAll(detector->csd, &match_count, &status); for (i = 0; i < match_count; ++i) { rb_ary_push(rb_ret, rb_encdec_buildmatch(csm[i])); } if (!NIL_P(binary_match)) rb_ary_unshift(rb_ret, binary_match); return rb_ret; } /* * call-seq: EncodingDetector#strip_tags? * * Returns whether or not the strip_tags flag is set on this detector * * Returns: Boolean */ static VALUE rb_get_strip_tags(VALUE self) { charlock_detector_t *detector; UBool val; VALUE rb_val; Data_Get_Struct(self, charlock_detector_t, detector); val = ucsdet_isInputFilterEnabled(detector->csd); rb_val = val == 1 ? Qtrue : Qfalse; return rb_val; } /* * call-seq: EncodingDetector#strip_tags = true * * Enable or disable the stripping of HTML/XML tags from the input before * attempting any detection * * Returns: Boolean, the value passed */ static VALUE rb_set_strip_tags(VALUE self, VALUE rb_val) { charlock_detector_t *detector; UBool val; Data_Get_Struct(self, charlock_detector_t, detector); val = rb_val == Qtrue ? 1 : 0; ucsdet_enableInputFilter(detector->csd, val); return rb_val; } /* * call-seq: detectable_encodings = EncodingDetector.supported_encodings * * The list of detectable encodings supported by this library * * Returns: an Array of Strings */ static VALUE rb_get_supported_encodings(VALUE klass) { UCharsetDetector *csd; UErrorCode status = U_ZERO_ERROR; UEnumeration *encoding_list; VALUE rb_encoding_list; int32_t enc_count; int32_t i; const char *enc_name; int32_t enc_name_len; rb_encoding_list = rb_iv_get(klass, "encoding_list"); // lazily populate the list if (NIL_P(rb_encoding_list)) { csd = ucsdet_open(&status); encoding_list = ucsdet_getAllDetectableCharsets(csd, &status); rb_encoding_list = rb_ary_new(); enc_count = uenum_count(encoding_list, &status); rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1250")); rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1252")); rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1253")); rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1254")); rb_ary_push(rb_encoding_list, charlock_new_str2("windows-1255")); for(i=0; i < enc_count; i++) { enc_name = uenum_next(encoding_list, &enc_name_len, &status); rb_ary_push(rb_encoding_list, charlock_new_str(enc_name, enc_name_len)); } rb_iv_set(klass, "encoding_list", rb_encoding_list); ucsdet_close(csd); } return rb_encoding_list; } static void rb_encdec__free(void *obj) { charlock_detector_t *detector; detector = (charlock_detector_t *)obj; if (detector->csd) ucsdet_close(detector->csd); free(detector); } static VALUE rb_encdec__alloc(VALUE klass) { charlock_detector_t *detector; UErrorCode status = U_ZERO_ERROR; VALUE obj; detector = calloc(1, sizeof(charlock_detector_t)); obj = Data_Wrap_Struct(klass, NULL, rb_encdec__free, (void *)detector); detector->csd = ucsdet_open(&status); if (U_FAILURE(status)) { rb_raise(rb_eStandardError, "%s", u_errorName(status)); } return obj; } void _init_charlock_encoding_detector() { rb_cEncodingDetector = rb_define_class_under(rb_mCharlockHolmes, "EncodingDetector", rb_cObject); rb_define_alloc_func(rb_cEncodingDetector, rb_encdec__alloc); rb_define_method(rb_cEncodingDetector, "is_binary?", rb_encdec_is_binary, 1); rb_define_method(rb_cEncodingDetector, "detect", rb_encdec_detect, -1); rb_define_method(rb_cEncodingDetector, "detect_all", rb_encdec_detect_all, -1); rb_define_method(rb_cEncodingDetector, "strip_tags", rb_get_strip_tags, 0); rb_define_method(rb_cEncodingDetector, "strip_tags=", rb_set_strip_tags, 1); rb_define_singleton_method(rb_cEncodingDetector, "supported_encodings", rb_get_supported_encodings, 0); } charlock_holmes-0.7.6/ext/charlock_holmes/ext.c0000644000004100000410000000056413301507117021615 0ustar www-datawww-data#include "common.h" extern void _init_charlock_encoding_detector(); extern void _init_charlock_converter(); extern void _init_charlock_transliterator(); VALUE rb_mCharlockHolmes; void Init_charlock_holmes() { rb_mCharlockHolmes = rb_define_module("CharlockHolmes"); _init_charlock_encoding_detector(); _init_charlock_converter(); _init_charlock_transliterator(); }