NLP/0000755000175100001440000000000013741602721010724 5ustar hornikusersNLP/NAMESPACE0000644000175100001440000002124113741573501012146 0ustar hornikusersimportFrom("utils", "head", "tail") export("content", "content<-", "meta", "meta<-") export("String", "as.String", "is.String") S3method("as.String", "String") S3method("as.String", "default") S3method("[", "String") S3method("*", "String") S3method("+", "String") S3method("print", "String") export("Span", "as.Span", "is.Span") S3method("as.Span", "Span") S3method("as.Span", "Annotation") S3method("[", "Span") ## S3method("[<-", "Span") S3method("[[", "Span") ## S3method("[[<-", "Span") S3method("$<-", "Span") S3method("Ops", "Span") S3method("as.data.frame", "Span") S3method("as.list", "Span") S3method("c", "Span") S3method("duplicated", "Span") S3method("format", "Span") S3method("length", "Span") S3method("names", "Span") S3method("print", "Span") S3method("unique", "Span") export("Annotation", "as.Annotation", "is.Annotation") S3method("as.Annotation", "Annotation") S3method("as.Annotation", "Span") S3method("[", "Annotation") ## S3method("[<-", "Annotation") S3method("[[", "Annotation") ## S3method("[[<-", "Annotation") S3method("$<-", "Annotation") S3method("as.data.frame", "Annotation") S3method("as.list", "Annotation") S3method("c", "Annotation") S3method("duplicated", "Annotation") S3method("format", "Annotation") S3method("length", "Annotation") S3method("merge", "Annotation") S3method("meta", "Annotation", .get_meta_if_attr) S3method("meta<-", "Annotation", .set_meta_if_attr) S3method("names", "Annotation") S3method("print", "Annotation", .print_via_format) S3method("subset", "Annotation") S3method("unique", "Annotation") export("annotations_in_spans", "features") export("annotate") export("Annotator") S3method("format", "Annotator") S3method("meta", "Annotator", .get_meta_if_attr) S3method("meta<-", "Annotator", .set_meta_if_attr) S3method("print", "Annotator", .print_via_format) 
export("Simple_Para_Token_Annotator", "Simple_Sent_Token_Annotator", "Simple_Word_Token_Annotator", "Simple_POS_Tag_Annotator", "Simple_Entity_Annotator", "Simple_Chunk_Annotator", "Simple_Stem_Annotator") export("Annotator_Pipeline", "as.Annotator_Pipeline") S3method("as.Annotator_Pipeline", "Annotator_Pipeline") S3method("as.Annotator_Pipeline", "Annotator") S3method("as.Annotator_Pipeline", "list") S3method("[", "Annotator_Pipeline") S3method("as.list", "Annotator_Pipeline") S3method("c", "Annotator_Pipeline") S3method("format", "Annotator_Pipeline") S3method("meta", "Annotator_Pipeline", .get_meta_if_attr) S3method("meta<-", "Annotator_Pipeline", .set_meta_if_attr) S3method("print", "Annotator_Pipeline", .print_via_format) export("next_id", "single_feature") export("Regexp_Tokenizer", "blankline_tokenizer", "whitespace_tokenizer", "wordpunct_tokenizer") export("Tree") S3method("format", "Tree") S3method("print", "Tree", .print_via_format) export("Tree_parse", "Tree_apply") export("words", "sents", "paras", "tagged_words", "tagged_sents", "tagged_paras", "chunked_sents", "parsed_sents", "parsed_paras") S3method("format", "TextDocument", .format_TextDocument) S3method("print", "TextDocument", .print_via_format) export("AnnotatedPlainTextDocument", "annotation") S3method("format", "AnnotatedPlainTextDocument") S3method("print", "AnnotatedPlainTextDocument", .print_via_format) S3method("content", "AnnotatedPlainTextDocument") S3method("content<-", "AnnotatedPlainTextDocument") S3method("meta", "AnnotatedPlainTextDocument", .get_meta_if_slot) S3method("meta<-", "AnnotatedPlainTextDocument", .set_meta_if_slot) S3method("as.character", "AnnotatedPlainTextDocument") S3method("words", "AnnotatedPlainTextDocument") S3method("sents", "AnnotatedPlainTextDocument") S3method("paras", "AnnotatedPlainTextDocument") S3method("tagged_words", "AnnotatedPlainTextDocument") S3method("tagged_sents", "AnnotatedPlainTextDocument") S3method("tagged_paras", "AnnotatedPlainTextDocument") 
S3method("chunked_sents", "AnnotatedPlainTextDocument") S3method("parsed_sents", "AnnotatedPlainTextDocument") S3method("parsed_paras", "AnnotatedPlainTextDocument") export("CoNLLTextDocument") S3method("format", "CoNLLTextDocument") S3method("print", "CoNLLTextDocument", .print_via_format) S3method("content", "CoNLLTextDocument") S3method("meta", "CoNLLTextDocument", .get_meta_if_slot) S3method("meta<-", "CoNLLTextDocument", .set_meta_if_slot) S3method("as.character", "CoNLLTextDocument") S3method("words", "CoNLLTextDocument") S3method("sents", "CoNLLTextDocument") S3method("tagged_words", "CoNLLTextDocument") S3method("tagged_sents", "CoNLLTextDocument") S3method("chunked_sents", "CoNLLTextDocument") export("CoNLLUTextDocument") S3method("format", "CoNLLUTextDocument") S3method("print", "CoNLLUTextDocument", .print_via_format) S3method("content", "CoNLLUTextDocument") S3method("meta", "CoNLLUTextDocument", .get_meta_if_slot) S3method("meta<-", "CoNLLUTextDocument", .set_meta_if_slot) S3method("as.character", "CoNLLUTextDocument") S3method("words", "CoNLLUTextDocument") S3method("sents", "CoNLLUTextDocument") S3method("tagged_words", "CoNLLUTextDocument") S3method("tagged_sents", "CoNLLUTextDocument") export("TaggedTextDocument") S3method("format", "TaggedTextDocument") S3method("print", "TaggedTextDocument", .print_via_format) S3method("content", "TaggedTextDocument") S3method("meta", "TaggedTextDocument", .get_meta_if_slot) S3method("meta<-", "TaggedTextDocument", .set_meta_if_slot) S3method("as.character", "TaggedTextDocument") S3method("words", "TaggedTextDocument") S3method("sents", "TaggedTextDocument") S3method("paras", "TaggedTextDocument") S3method("tagged_words", "TaggedTextDocument") S3method("tagged_sents", "TaggedTextDocument") S3method("tagged_paras", "TaggedTextDocument") export("WordListDocument") S3method("format", "WordListDocument") S3method("print", "WordListDocument", .print_via_format) S3method("content", "WordListDocument") S3method("meta", 
"WordListDocument", .get_meta_if_slot) S3method("meta<-", "WordListDocument", .set_meta_if_slot) S3method("as.character", "WordListDocument") S3method("words", "WordListDocument") export("Penn_Treebank_POS_tags") export("Brown_POS_tags") export("Universal_POS_tags") export("Universal_POS_tags_map") export("parse_IETF_language_tag") export("parse_ISO_8601_datetime") S3method("[", "ISO_8601_datetime") S3method("$", "ISO_8601_datetime") S3method("as.Date", "ISO_8601_datetime") S3method("as.POSIXct", "ISO_8601_datetime") S3method("as.POSIXlt", "ISO_8601_datetime") S3method("as.data.frame", "ISO_8601_datetime") S3method("as.matrix", "ISO_8601_datetime") S3method("print", "ISO_8601_datetime") export("ngrams") export("Tagged_Token", "as.Tagged_Token", "is.Tagged_Token") S3method("as.Tagged_Token", "Tagged_Token") S3method("as.Tagged_Token", "TextDocument") S3method("[", "Tagged_Token") ## S3method("[<-", "Tagged_Token") S3method("[[", "Tagged_Token") ## S3method("[[<-", "Tagged_Token") S3method("$<-", "Tagged_Token") S3method("as.data.frame", "Tagged_Token") S3method("as.list", "Tagged_Token") S3method("c", "Tagged_Token") S3method("duplicated", "Tagged_Token") S3method("format", "Tagged_Token") S3method("length", "Tagged_Token") S3method("names", "Tagged_Token") S3method("print", "Tagged_Token", .print_via_format) S3method("unique", "Tagged_Token") export("Span_Tokenizer", "as.Span_Tokenizer", "is.Span_Tokenizer") S3method("as.Span_Tokenizer", "Span_Tokenizer") S3method("as.Span_Tokenizer", "Token_Tokenizer") S3method("as.Span_Tokenizer", "Annotator") S3method("as.Span_Tokenizer", "Annotator_Pipeline") S3method("format", "Span_Tokenizer") S3method("print", "Span_Tokenizer", .print_via_format) S3method("meta", "Span_Tokenizer", .get_meta_if_attr) S3method("meta<-", "Span_Tokenizer", .set_meta_if_attr) export("Token_Tokenizer", "as.Token_Tokenizer", "is.Token_Tokenizer") S3method("as.Token_Tokenizer", "Token_Tokenizer") S3method("as.Token_Tokenizer", "Span_Tokenizer") 
S3method("as.Token_Tokenizer", "Annotator") S3method("as.Token_Tokenizer", "Annotator_Pipeline") S3method("format", "Token_Tokenizer") S3method("print", "Token_Tokenizer", .print_via_format) S3method("meta", "Token_Tokenizer", .get_meta_if_attr) S3method("meta<-", "Token_Tokenizer", .set_meta_if_attr) S3method("words", "udpipe_connlu") S3method("sents", "udpipe_connlu") S3method("paras", "udpipe_connlu") S3method("tagged_words", "udpipe_connlu") S3method("tagged_sents", "udpipe_connlu") S3method("tagged_paras", "udpipe_connlu") S3method("words", "spacyr_parsed") S3method("sents", "spacyr_parsed") S3method("tagged_words", "spacyr_parsed") S3method("tagged_sents", "spacyr_parsed") S3method("words", "cnlp_annotation") S3method("sents", "cnlp_annotation") ## S3method("paras", "cnlp_annotation") S3method("tagged_words", "cnlp_annotation") S3method("tagged_sents", "cnlp_annotation") ## S3method("tagged_paras", "cnlp_annotation") NLP/man/0000755000175100001440000000000013362123135011473 5ustar hornikusersNLP/man/WordListDocument.Rd0000644000175100001440000000233613333071074015236 0ustar hornikusers\name{WordListDocument} \alias{WordListDocument} \title{Word List Text Documents} \description{ Create text documents from word lists. } \usage{ WordListDocument(con, encoding = "unknown", meta = list()) } \arguments{ \item{con}{a connection object or a character string. See \code{\link{readLines}()} for details. } \item{encoding}{encoding to be assumed for input strings. See \code{\link{readLines}()} for details. } \item{meta}{a named or empty list of document metadata tag-value pairs.} } \details{ \code{WordListDocument()} uses \code{\link{readLines}()} to read collections of words from connections for which each line provides one word, with blank lines ignored, and returns a word list document object which inherits from classes \code{"WordListDocument"} and \code{"\link{TextDocument}"}. 
The methods for generics \code{\link{words}()} and \code{\link{as.character}()} and class \code{"WordListDocument"} can be used to extract the words. } \value{ A word list document object inheriting from \code{"WordListDocument"} and \code{"\link{TextDocument}"}. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. } NLP/man/language.Rd0000644000175100001440000000635513333070446013562 0ustar hornikusers\name{language} \alias{parse_IETF_language_tag} \title{Parse IETF Language Tag} \description{ Extract language, script, region and variant subtags from IETF language tags. } \usage{parse_IETF_language_tag(x, expand = FALSE)} \arguments{ \item{x}{a character vector with IETF language tags.} \item{expand}{a logical indicating whether to expand subtags into their description(s).} } \details{ Internet Engineering Task Force (IETF) language tags are defined by IETF BCP 47, which is currently composed of the normative RFC 5646 (\url{https://tools.ietf.org/html/rfc5646}) and RFC 4647 (\url{https://tools.ietf.org/html/rfc4647}), along with the normative content of the IANA Language Subtag Registry regulated by these RFCs. These tags are used in a number of modern computing standards. Each language tag is composed of one or more \dQuote{subtags} separated by hyphens. Normal language tags have the following subtags: \itemize{ \item a language subtag (optionally, with language extension subtags), \item an optional script subtag, \item an optional region subtag, \item optional variant subtags, \item optional extension subtags, \item an optional private use subtag. } Language subtags are mainly derived from ISO 639-1 and ISO 639-2, script subtags from ISO 15924, and region subtags from ISO 3166-1 alpha-2 and UN M.49, see package \pkg{ISOcodes} for more information about these standards. Variant subtags are not derived from any standard. 
The Language Subtag Registry (\url{https://www.iana.org/assignments/language-subtag-registry}), maintained by the Internet Assigned Numbers Authority (IANA), lists the current valid public subtags, as well as the so-called \dQuote{grandfathered} language tags. See \url{https://en.wikipedia.org/wiki/IETF_language_tag} for more information. } \value{ If \code{expand} is false, a list of character vectors of the form \code{"\var{type}=\var{subtag}"}, where \var{type} gives the type of the corresponding subtag (one of \sQuote{Language}, \sQuote{Extlang}, \sQuote{Script}, \sQuote{Region}, \sQuote{Variant}, or \sQuote{Extension}), or \code{"\var{type}=\var{tag}"} with \var{type} either \sQuote{Privateuse} or \sQuote{Grandfathered}. Otherwise, a list of lists of character vectors obtained by replacing the subtags by their corresponding descriptions (which may be multiple) from the IANA registry. Note that no such descriptions for Extension and Privateuse subtags are available in the registry; on the other hand, empty expansions of the other subtags indicate malformed tags (as these subtags must be available in the registry). } \examples{ ## German as used in Switzerland: parse_IETF_language_tag("de-CH") ## Serbian written using Latin script as used in Serbia and Montenegro: parse_IETF_language_tag("sr-Latn-CS") ## Spanish appropriate to the UN Latin American and Caribbean region: parse_IETF_language_tag("es-419") ## All in one: parse_IETF_language_tag(c("de-CH", "sr-Latn-CS", "es-419")) parse_IETF_language_tag(c("de-CH", "sr-Latn-CS", "es-419"), expand = TRUE) ## Two grandfathered tags: parse_IETF_language_tag(c("i-klingon", "zh-min-nan"), expand = TRUE) } \keyword{utilities} NLP/man/annotations_in_spans.Rd0000644000175100001440000000214213333064566016222 0ustar hornikusers\name{annotations_in_spans} \alias{annotations_in_spans} \title{Annotations contained in character spans} \description{ Extract annotations contained in character spans. 
} \usage{ annotations_in_spans(x, y) } \arguments{ \item{x}{an \code{\link{Annotation}} object.} \item{y}{a \code{\link{Span}} object, or something coercible to this (such as an \code{\link{Annotation}} object).} } \value{ A list with elements the annotations in \code{x} with character spans contained in the respective elements of \code{y}. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** ## Basic sentence and word token annotation for the text. a <- c(Annotation(1 : 2, rep.int("sentence", 2L), c( 3L, 20L), c(17L, 35L)), Annotation(3 : 6, rep.int("word", 4L), c( 3L, 9L, 20L, 27L), c( 7L, 16L, 25L, 34L))) ## Annotation for word tokens according to sentence: annotations_in_spans(a[a$type == "word"], a[a$type == "sentence"]) } NLP/man/CoNLLTextDocument.Rd0000644000175100001440000000736013741575633015262 0ustar hornikusers\name{CoNLLTextDocument} \alias{CoNLLTextDocument} \title{CoNLL-Style Text Documents} \description{ Create text documents from CoNLL-style files. } \usage{ CoNLLTextDocument(con, encoding = "unknown", format = "conll00", meta = list()) } \arguments{ \item{con}{a connection object or a character string. See \code{\link{scan}()} for details. } \item{encoding}{encoding to be assumed for input strings. See \code{\link{scan}()} for details. } \item{format}{a character vector specifying the format. See \bold{Details}. } \item{meta}{a named or empty list of document metadata tag-value pairs.} } \details{ CoNLL-style files use an extended tabular format where empty lines separate sentences, and non-empty lines consist of whitespace separated columns giving the word tokens and annotations for these. 
Such formats were popularized through their use for the shared tasks of CoNLL (Conference on Natural Language Learning), the yearly meeting of the Special Interest Group on Natural Language Learning of the Association for Computational Linguistics (see \url{https://www.signll.org/conll/} for more information about CoNLL). The precise format can vary according to corpus, and must be specified via argument \code{format}, as either a character string giving a pre-defined format, or otherwise a character vector with elements giving the names of the \sQuote{fields} (columns), and names used to give the field \sQuote{types}, with \sQuote{WORD}, \sQuote{POS} and \sQuote{CHUNK} to be used for, respectively, word tokens, POS tags, and chunk tags. For example, \preformatted{ c(WORD = "WORD", POS = "POS", CHUNK = "CHUNK")} would be a format specification appropriate for the CoNLL-2000 chunking task, as also available as the pre-defined \code{"conll00"}, which serves as default format for reasons of back-compatibility. Other pre-defined formats are \code{"conll01"} (for the CoNLL-2001 clause identification task), \code{"conll02"} (for the CoNLL-2002 language-independent named entity recognition task), \code{"conllx"} (for the CoNLL-X format used in at least the CoNLL-2006 and CoNLL-2007 multilingual dependency parsing tasks), and \code{"conll09"} (for the CoNLL-2009 shared task on syntactic and semantic dependencies in multiple languages). The lines are read from the given connection and split into fields using \code{\link{scan}()}. From this, a suitable representation of the provided information is obtained, and returned as a CoNLL text document object inheriting from classes \code{"CoNLLTextDocument"} and \code{"\link{TextDocument}"}. 
There are methods for class \code{"CoNLLTextDocument"} and generics \code{\link{words}()}, \code{\link{sents}()}, \code{\link{tagged_words}()}, \code{\link{tagged_sents}()}, and \code{\link{chunked_sents}()} (as well as \code{\link{as.character}()}), which should be used to access the text in such text document objects. The methods for generics \code{\link{tagged_words}()} and \code{\link{tagged_sents}()} provide a mechanism for mapping POS tags via the \code{map} argument, see section \bold{Details} in the help page for \code{\link{tagged_words}()} for more information. The POS tagset used will be inferred from the \code{POS_tagset} metadata element of the CoNLL-style text document. } \value{ An object inheriting from \code{"CoNLLTextDocument"} and \code{"\link{TextDocument}"}. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. \url{https://www.clips.uantwerpen.be/conll2000/chunking/} for the CoNLL-2000 chunking task, and training and test data sets which can be read in using \code{CoNLLTextDocument()}. } NLP/man/annotate.Rd0000644000175100001440000000442313333064776013613 0ustar hornikusers\name{annotate} \alias{annotate} \title{Annotate text strings} \description{ Compute annotations by iteratively calling the given annotators with the given text and current annotations, and merging the newly computed annotations with the current ones. 
} \usage{ annotate(s, f, a = Annotation()) } \arguments{ \item{s}{a \code{\link{String}} object, or something coercible to this using \code{\link{as.String}} (e.g., a character string with appropriate encoding information).} \item{f}{an \code{\link{Annotator}} or \code{\link{Annotator_Pipeline}} object, or something coercible to the latter via \code{\link{as.Annotator_Pipeline}()} (such as a list of annotator objects).} \item{a}{an \code{\link{Annotation}} object giving the annotations to start with.} } \value{ An \code{\link{Annotation}} object containing the iteratively computed and merged annotations. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** ## A very trivial sentence tokenizer. sent_tokenizer <- function(s) { s <- as.String(s) m <- gregexpr("[^[:space:]][^.]*\\\\.", s)[[1L]] Span(m, m + attr(m, "match.length") - 1L) } ## (Could also use Regexp_Tokenizer() with the above regexp pattern.) ## A simple sentence token annotator based on the sentence tokenizer. sent_token_annotator <- Simple_Sent_Token_Annotator(sent_tokenizer) ## Annotate sentence tokens. a1 <- annotate(s, sent_token_annotator) a1 ## A very trivial word tokenizer. word_tokenizer <- function(s) { s <- as.String(s) ## Remove the last character (should be a period when using ## sentences determined with the trivial sentence tokenizer). s <- substring(s, 1L, nchar(s) - 1L) ## Split on whitespace separators. m <- gregexpr("[^[:space:]]+", s)[[1L]] Span(m, m + attr(m, "match.length") - 1L) } ## A simple word token annotator based on the word tokenizer. word_token_annotator <- Simple_Word_Token_Annotator(word_tokenizer) ## Annotate word tokens using the already available sentence token ## annotations. 
a2 <- annotate(s, word_token_annotator, a1) a2 ## Can also perform sentence and word token annotations in a pipeline: p <- Annotator_Pipeline(sent_token_annotator, word_token_annotator) annotate(s, p) } NLP/man/Tagged_Token.Rd0000644000175100001440000000433712510736106014326 0ustar hornikusers\name{Tagged_Token} \alias{Tagged_Token} \alias{as.Tagged_Token} \alias{is.Tagged_Token} \alias{[.Tagged_Token} %% \alias{[<-.Tagged_Token} \alias{[[.Tagged_Token} %% \alias{[[<-.Tagged_Token} \alias{$<-.Tagged_Token} \alias{as.data.frame.Tagged_Token} \alias{as.list.Tagged_Token} \alias{c.Tagged_Token} \alias{duplicated.Tagged_Token} \alias{format.Tagged_Token} \alias{length.Tagged_Token} \alias{names.Tagged_Token} \alias{print.Tagged_Token} \alias{unique.Tagged_Token} \title{Tagged_Token objects} \description{ Creation and manipulation of tagged token objects. } \usage{ Tagged_Token(token, tag) as.Tagged_Token(x) is.Tagged_Token(x) } \arguments{ \item{token, tag}{character vectors giving tokens and the corresponding tags.} \item{x}{an \R object.} } \details{ A tagged token is a pair with \dQuote{slots} \sQuote{token} and \sQuote{tag}, giving the token and the corresponding tag. Tagged token objects provide sequences (allowing positional access) of single tagged tokens. They have class \code{"Tagged_Token"}. Subscripting tagged token objects via \code{[} extracts subsets of tagged tokens; subscripting via \code{$} extracts character vectors with the sequence of values of the named slot. There are several additional methods for class \code{"Tagged_Token"}: \code{print()} and \code{format()} (which concatenate tokens and tags separated by \samp{/}); \code{c()} combines tagged token objects (or objects coercible to these using \code{as.Tagged_Token()}), and \code{as.list()} and \code{as.data.frame()} coerce, respectively, to lists (of single tagged token objects) and data frames (with tagged tokens and slots corresponding to rows and columns). 
\code{Tagged_Token()} creates tagged token objects from the given sequences of tokens and tags, which must have the same length. \code{as.Tagged_Token()} coerces to tagged token objects, with a method for \code{\link{TextDocument}} objects using \code{\link{tagged_words}()}. \code{is.Tagged_Token()} tests whether an object inherits from class \code{"Tagged_Token"}. } \value{ For \code{Tagged_Token()} and \code{as.Tagged_Token()}, a tagged token object (of class \code{"Tagged_Token"}). For \code{is.Tagged_Token()}, a logical. } NLP/man/viewers.Rd0000644000175100001440000000622713337765004013466 0ustar hornikusers\name{viewers} \alias{sents} \alias{words} \alias{paras} \alias{tagged_sents} \alias{tagged_paras} \alias{tagged_words} \alias{chunked_sents} \alias{parsed_sents} \alias{parsed_paras} \title{Text Document Viewers} \description{ Provide suitable \dQuote{views} of the text contained in text documents. } \usage{ words(x, ...) sents(x, ...) paras(x, ...) tagged_words(x, ...) tagged_sents(x, ...) tagged_paras(x, ...) chunked_sents(x, ...) parsed_sents(x, ...) parsed_paras(x, ...) } \arguments{ \item{x}{a text document object.} \item{...}{further arguments to be passed to or from methods.} } \details{ Methods for extracting POS tagged word tokens (i.e., for generics \code{tagged_words()}, \code{tagged_sents()} and \code{tagged_paras()}) can optionally provide a mechanism for mapping the POS tags via a \code{map} argument. This can give a function, a named character vector (with names and elements the tags to map from and to, respectively), or a named list of such named character vectors, with names corresponding to POS tagsets (see \code{\link{Universal_POS_tags_map}} for an example). If a list, the map used will be the element with name matching the POS tagset used (this information is typically determined from the text document metadata; see the help pages for text document extension classes implementing this mechanism for details). 
In addition to methods for the text document classes provided by package \pkg{NLP} itself (see \link{TextDocument}), package \pkg{NLP} also provides word tokens and POS tagged word tokens for the results of \code{\link[udpipe:udpipe_annotate]{udpipe_annotate}()} from package \CRANpkg{udpipe}, \code{\link[spacyr:spacy_parse]{spacy_parse}()} from package \CRANpkg{spacyr}, and \code{\link[cleanNLP:cnlp_annotate]{cnlp_annotate}()} from package \CRANpkg{cleanNLP}. } \value{ For \code{words()}, a character vector with the word tokens in the document. For \code{sents()}, a list of character vectors with the word tokens in the sentences. For \code{paras()}, a list of lists of character vectors with the word tokens in the sentences, grouped according to the paragraphs. For \code{tagged_words()}, a character vector with the POS tagged word tokens in the document (i.e., the word tokens and their POS tags, separated by \samp{/}). For \code{tagged_sents()}, a list of character vectors with the POS tagged word tokens in the sentences. For \code{tagged_paras()}, a list of lists of character vectors with the POS tagged word tokens in the sentences, grouped according to the paragraphs. For \code{chunked_sents()}, a list of (flat) \code{\link{Tree}} objects giving the chunk trees for the sentences in the document. For \code{parsed_sents()}, a list of \code{\link{Tree}} objects giving the parse trees for the sentences in the document. For \code{parsed_paras()}, a list of lists of \code{\link{Tree}} objects giving the parse trees for the sentences in the document, grouped according to the paragraphs in the document. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. 
} NLP/man/Annotator.Rd0000644000175100001440000000634013112000174013720 0ustar hornikusers\name{Annotator} \alias{Annotator} \alias{Annotator_Pipeline} \alias{as.Annotator_Pipeline} \title{Annotator (pipeline) objects} \description{ Create annotator (pipeline) objects. } \usage{ Annotator(f, meta = list(), classes = NULL) Annotator_Pipeline(..., meta = list()) as.Annotator_Pipeline(x) } \arguments{ \item{f}{an annotator function, which must have formals \code{s} and \code{a} giving, respectively, the string with the natural language text to annotate and an annotation object to start from, and return an annotation object with the computed annotations.} \item{meta}{an empty or named list of annotator (pipeline) metadata tag-value pairs.} \item{classes}{a character vector or \code{NULL} (default) giving classes to be used for the created annotator object in addition to \code{"Annotator"}.} \item{...}{annotator objects.} \item{x}{an \R object.} } \details{ \code{Annotator()} checks that the given annotator function has the appropriate formals, and returns an annotator object which inherits from the given classes and \code{"Annotator"}. There are \code{print()} and \code{format()} methods for such objects, which use the \code{description} element of the metadata if available. \code{Annotator_Pipeline()} creates an annotator pipeline object from the given annotator objects. Such pipeline objects can be used by \code{\link{annotate}()} for successively computing and merging annotations, and can also be obtained by coercion with \code{as.Annotator_Pipeline()}, which currently handles annotator objects and lists of such (and of course, annotator pipeline objects). } \value{ For \code{Annotator()}, an annotator object inheriting from the given classes and class \code{"Annotator"}. For \code{Annotator_Pipeline()} and \code{as.Annotator_Pipeline()}, an annotator pipeline object inheriting from class \code{"Annotator_Pipeline"}. 
} \seealso{ \link{Simple annotator generators} for creating \dQuote{simple} annotator objects based on functions performing simple basic NLP tasks. Package \pkg{StanfordCoreNLP} available from the repository at \url{https://datacube.wu.ac.at} which provides generators for annotator pipelines based on the Stanford CoreNLP tools. } \examples{ ## Use blankline_tokenizer() for a simple paragraph token annotator: para_token_annotator <- Annotator(function(s, a = Annotation()) { spans <- blankline_tokenizer(s) n <- length(spans) ## Need n consecutive ids, starting with the next "free" ## one: from <- next_id(a$id) Annotation(seq(from = from, length.out = n), rep.int("paragraph", n), spans$start, spans$end) }, list(description = "A paragraph token annotator based on blankline_tokenizer().")) para_token_annotator ## Alternatively, use Simple_Para_Token_Annotator(). ## A simple text with two paragraphs: s <- String(paste(" First sentence. Second sentence. ", " Second paragraph. ", sep = "\n\n")) a <- annotate(s, para_token_annotator) ## Annotations for paragraph tokens. a ## Extract paragraph tokens. s[a] } NLP/man/TextDocument.Rd0000644000175100001440000000302313144533560014410 0ustar hornikusers\name{TextDocument} \alias{TextDocument} \title{Text Documents} \description{ Representing and computing on text documents. } \details{ \emph{Text documents} are documents containing (natural language) text. In packages which employ the infrastructure provided by package \pkg{NLP}, such documents are represented via the virtual S3 class \code{"TextDocument"}: such packages then provide S3 text document classes extending the virtual base class (such as the \code{\link{AnnotatedPlainTextDocument}} objects provided by package \pkg{NLP} itself). 
All extension classes must provide an \code{\link{as.character}()} method which extracts the natural language text in documents of the respective classes in a \dQuote{suitable} (not necessarily structured) form, as well as \code{\link{content}()} and \code{\link{meta}()} methods for accessing the (possibly raw) document content and metadata. In addition, the infrastructure features the generic functions \code{\link{words}()}, \code{\link{sents}()}, etc., for which extension classes can provide methods giving a structured view of the text contained in documents of these classes (returning, e.g., a character vector with the word tokens in these documents, and a list of such character vectors). } \seealso{ \code{\link{AnnotatedPlainTextDocument}}, \code{\link{CoNLLTextDocument}}, \code{\link{CoNLLUTextDocument}}, \code{\link{TaggedTextDocument}}, and \code{\link{WordListDocument}} for the text document classes provided by package \pkg{NLP}. } NLP/man/utils.Rd0000644000175100001440000000113212313516303013115 0ustar hornikusers\name{utils} \alias{next_id} \alias{single_feature} \title{Annotation Utilities} \description{Utilities for creating annotation objects.} \usage{ next_id(id) single_feature(value, tag) } \arguments{ \item{id}{an integer vector of annotation ids.} \item{value}{an \R object.} \item{tag}{a character string.} } \details{ \code{next_id()} obtains the next \dQuote{available} id based on the given annotation ids (one more than the maximal non-missing id). \code{single_feature()} creates a single feature from the given value and tag (i.e., a named list with the value named by the tag). } NLP/man/Tokenizer.Rd0000644000175100001440000000544413333070646013751 0ustar hornikusers\name{Tokenizer} \alias{Span_Tokenizer} \alias{as.Span_Tokenizer} \alias{is.Span_Tokenizer} \alias{Token_Tokenizer} \alias{as.Token_Tokenizer} \alias{is.Token_Tokenizer} \title{Tokenizer objects} \description{ Create tokenizer objects. 
} \usage{ Span_Tokenizer(f, meta = list()) as.Span_Tokenizer(x, ...) Token_Tokenizer(f, meta = list()) as.Token_Tokenizer(x, ...) } \arguments{ \item{f}{a tokenizer function taking the string to tokenize as argument, and returning either the tokens (for \code{Token_Tokenizer}) or their spans (for \code{Span_Tokenizer}).} \item{meta}{a named or empty list of tokenizer metadata tag-value pairs.} \item{x}{an \R object.} \item{...}{further arguments passed to or from other methods.} } \details{ Tokenization is the process of breaking a text string up into words, phrases, symbols, or other meaningful elements called tokens. This can be accomplished by returning the sequence of tokens, or the corresponding spans (character start and end positions). We refer to tokenization resources of the respective kinds as \dQuote{token tokenizers} and \dQuote{span tokenizers}. \code{Span_Tokenizer()} and \code{Token_Tokenizer()} return tokenizer objects which are functions with metadata and suitable class information, which in turn can be used for converting between the two kinds using \code{as.Span_Tokenizer()} or \code{as.Token_Tokenizer()}. It is also possible to coerce annotator (pipeline) objects to tokenizer objects, provided that the annotators provide suitable token annotations. By default, word tokens are used; this can be controlled via the \code{type} argument of the coercion methods (e.g., use \code{type = "sentence"} to extract sentence tokens). There are also \code{print()} and \code{format()} methods for tokenizer objects, which use the \code{description} element of the metadata if available. } \seealso{ \code{\link{Regexp_Tokenizer}()} for creating regexp span tokenizers. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. 
") ## ****5****0****5****0****5****0****5** ## Use a pre-built regexp (span) tokenizer: wordpunct_tokenizer wordpunct_tokenizer(s) ## Turn into a token tokenizer: tt <- as.Token_Tokenizer(wordpunct_tokenizer) tt tt(s) ## Of course, in this case we could simply have done s[wordpunct_tokenizer(s)] ## to obtain the tokens from the spans. ## Conversion also works the other way round: package 'tm' provides ## the following token tokenizer function: scan_tokenizer <- function(x) scan(text = as.character(x), what = "character", quote = "", quiet = TRUE) ## Create a token tokenizer from this: tt <- Token_Tokenizer(scan_tokenizer) tt(s) ## Turn into a span tokenizer: st <- as.Span_Tokenizer(tt) st(s) ## Checking tokens from spans: s[st(s)] } NLP/man/String.Rd0000644000175100001440000000440012532125025013224 0ustar hornikusers\name{String} \alias{String} \alias{as.String} \alias{is.String} \title{String objects} \description{ Creation and manipulation of string objects. } \usage{ String(x) as.String(x) is.String(x) } \arguments{ \item{x}{a character vector with the appropriate encoding information for \code{String()}; an arbitrary \R object otherwise. } } \details{ String objects provide character strings encoded in UTF-8 with class \code{"String"}, which currently has a useful \code{[} subscript method: with indices \code{i} and \code{j} of length one, this gives a string object with the substring starting at the position given by \code{i} and ending at the position given by \code{j}; subscripting with a single index which is an object inheriting from class \code{"\link{Span}"} or a list of such objects returns a character vector of substrings with the respective spans, or a list thereof. Additional methods may be added in the future. \code{String()} creates a string object from a given character vector, taking the first element of the vector and converting it to UTF-8 encoding. \code{as.String()} is a generic function to coerce to a string object. 
The default method calls \code{String()} on the result of converting to character and concatenating into a single string with the elements separated by newlines. \code{is.String()} tests whether an object inherits from class \code{"String"}. } \value{ For \code{String()} and \code{as.String()}, a string object (of class \code{"String"}). For \code{is.String()}, a logical. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** ## Basic sentence and word token annotation for the text. a <- c(Annotation(1 : 2, rep.int("sentence", 2L), c( 3L, 20L), c(17L, 35L)), Annotation(3 : 6, rep.int("word", 4L), c( 3L, 9L, 20L, 27L), c( 7L, 16L, 25L, 34L))) ## All word tokens (by subscripting with an annotation object): s[a[a$type == "word"]] ## Word tokens according to sentence (by subscripting with a list of ## annotation objects): s[annotations_in_spans(a[a$type == "word"], a[a$type == "sentence"])] } NLP/man/features.Rd0000644000175100001440000000310713362122624013603 0ustar hornikusers\name{features} \alias{features} \title{Extract Annotation Features} \description{ Conveniently extract features from annotations and annotated plain text documents. } \usage{ features(x, type = NULL, simplify = TRUE) } \arguments{ \item{x}{an object inheriting from class \code{"Annotation"} or \code{"AnnotatedPlainTextDocument"}.} \item{type}{a character vector of annotation types to be used for selecting annotations, or \code{NULL} (default) to use all annotations. When selecting, the elements of \code{type} will partially be matched against the annotation types.} \item{simplify}{a logical indicating whether to simplify feature values to a vector.} } \details{ \code{features()} conveniently gathers all feature tag-value pairs in the selected annotations into a data frame with variables the values for all tags found (using a \code{NULL} value for tags without a value). 
In general, variables will be \emph{lists} of extracted values. By default, variables where all elements are length one atomic vectors are simplified into an atomic vector of values. The values for specific tags can be extracted by suitably subscripting the obtained data frame. } \examples{ ## Use a pre-built annotated plain text document, ## see ? AnnotatedPlainTextDocument. doc <- readRDS(system.file("texts", "stanford.rds", package = "NLP")) ## Extract features of all *word* annotations in doc: x <- features(doc, "word") ## Could also have abbreviated "word" to "w". x ## Only lemmas: x$lemma ## Words together with lemmas: paste(words(doc), x$lemma, sep = "/") } NLP/man/tokenizers.Rd0000644000175100001440000000415113333064645014170 0ustar hornikusers\name{tokenizers} \alias{Regexp_Tokenizer} \alias{blankline_tokenizer} \alias{whitespace_tokenizer} \alias{wordpunct_tokenizer} \title{Regexp tokenizers} \description{ Tokenizers using regular expressions to match either tokens or separators between tokens. } \usage{ Regexp_Tokenizer(pattern, invert = FALSE, ..., meta = list()) blankline_tokenizer(s) whitespace_tokenizer(s) wordpunct_tokenizer(s) } \arguments{ \item{pattern}{a character string giving the regular expression to use for matching.} \item{invert}{a logical indicating whether to match separators between tokens.} \item{...}{further arguments to be passed to \code{\link{gregexpr}()}.} \item{meta}{a named or empty list of tokenizer metadata tag-value pairs.} \item{s}{a \code{\link{String}} object, or something coercible to this using \code{\link{as.String}()} (e.g., a character string with appropriate encoding information).} } \details{ \code{Regexp_Tokenizer()} creates regexp span tokenizers which use the given \code{pattern} and \code{...} arguments to match tokens or separators between tokens via \code{\link{gregexpr}()}, and then transform the results of this into character spans of the tokens found. 
\code{whitespace_tokenizer()} tokenizes by treating any sequence of whitespace characters as a separator. \code{blankline_tokenizer()} tokenizes by treating any sequence of blank lines as a separator. \code{wordpunct_tokenizer()} tokenizes by matching sequences of alphabetic characters and sequences of (non-whitespace) non-alphabetic characters. } \value{ \code{Regexp_Tokenizer()} returns the created regexp span tokenizer. \code{blankline_tokenizer()}, \code{whitespace_tokenizer()} and \code{wordpunct_tokenizer()} return the spans of the tokens found in \code{s}. } \seealso{ \code{\link{Span_Tokenizer}()} for general information on span tokenizer objects. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** spans <- whitespace_tokenizer(s) spans s[spans] spans <- wordpunct_tokenizer(s) spans s[spans] } NLP/man/Annotation.Rd0000644000175100001440000001513013357140760014103 0ustar hornikusers\name{Annotation} \alias{Annotation} \alias{as.Annotation} \alias{as.Annotation.Span} \alias{is.Annotation} \alias{[.Annotation} %% \alias{[<-.Annotation} \alias{[[.Annotation} %% \alias{[[<-.Annotation} \alias{$<-.Annotation} \alias{as.data.frame.Annotation} \alias{as.list.Annotation} \alias{c.Annotation} \alias{duplicated.Annotation} \alias{format.Annotation} \alias{length.Annotation} \alias{merge.Annotation} \alias{meta.Annotation} \alias{meta<-.Annotation} \alias{names.Annotation} \alias{print.Annotation} \alias{subset.Annotation} \alias{unique.Annotation} \title{Annotation objects} \description{ Creation and manipulation of annotation objects. } \usage{ Annotation(id = NULL, type = NULL, start, end, features = NULL, meta = list()) as.Annotation(x, ...) \method{as.Annotation}{Span}(x, id = NULL, type = NULL, ...) 
is.Annotation(x) } \arguments{ \item{id}{an integer vector giving the annotation ids, or \code{NULL} (default) resulting in missing ids.} \item{type}{a character vector giving the annotation types, or \code{NULL} (default) resulting in missing types.} \item{start, end}{integer vectors giving the start and end positions of the character spans the annotations refer to.} \item{features}{a list of (named or empty) feature lists, or \code{NULL} (default), resulting in empty feature lists.} \item{meta}{a named or empty list of annotation metadata tag-value pairs.} \item{x}{an \R object (an object of class \code{"\link{Span}"} for the coercion methods for such objects).} \item{...}{further arguments passed to or from other methods.} } \details{ A single annotation (of natural language text) is a quintuple with \dQuote{slots} \sQuote{id}, \sQuote{type}, \sQuote{start}, \sQuote{end}, and \sQuote{features}. These give, respectively, id and type, the character span the annotation refers to, and a collection of annotation features (tag/value pairs). Annotation objects provide sequences (allowing positional access) of single annotations, together with metadata about these. They have class \code{"Annotation"} and, as they contain character spans, also inherit from class \code{"\link{Span}"}. Span objects can be coerced to annotation objects via \code{as.Annotation()}, which allows specifying ids and types (using the default values sets these to missing), and annotation objects can be coerced to span objects using \code{\link{as.Span}()}. The features of a single annotation are represented as named or empty lists. Subscripting annotation objects via \code{[} extracts subsets of annotations; subscripting via \code{$} extracts the sequence of values of the named slot, i.e., an integer vector for \sQuote{id}, \sQuote{start}, and \sQuote{end}, a character vector for \sQuote{type}, and a list of named or empty lists for \sQuote{features}. 
There are several additional methods for class \code{"Annotation"}: \code{print()} and \code{format()} (which both have a \code{values} argument which if \code{FALSE} suppresses indicating the feature map values); \code{c()} combines annotations (or objects coercible to these using \code{as.Annotation()}); \code{merge()} merges annotations by combining the feature lists of annotations with otherwise identical slots; \code{subset()} allows subsetting by expressions involving the slot names; and \code{as.list()} and \code{as.data.frame()} coerce, respectively, to lists (of single annotation objects) and data frames (with annotations and slots corresponding to rows and columns). \code{Annotation()} creates annotation objects from the given sequences of slot values: those not \code{NULL} must all have the same length (the number of annotations in the object). \code{as.Annotation()} coerces to annotation objects, with a method for span objects. \code{is.Annotation()} tests whether an object inherits from class \code{"Annotation"}. } \value{ For \code{Annotation()} and \code{as.Annotation()}, an annotation object (of class \code{"Annotation"} also inheriting from class \code{"Span"}). For \code{is.Annotation()}, a logical. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** ## Basic sentence and word token annotations for the text. 
a1s <- Annotation(1 : 2, rep.int("sentence", 2L), c( 3L, 20L), c(17L, 35L)) a1w <- Annotation(3 : 6, rep.int("word", 4L), c( 3L, 9L, 20L, 27L), c( 7L, 16L, 25L, 34L)) ## Use c() to combine these annotations: a1 <- c(a1s, a1w) a1 ## Subscripting via '[': a1[3 : 4] ## Subscripting via '$': a1$type ## Subsetting according to slot values, directly: a1[a1$type == "word"] ## or using subset(): subset(a1, type == "word") ## We can subscript string objects by annotation objects to extract the ## annotated substrings: s[subset(a1, type == "word")] ## We can also subscript by lists of annotation objects: s[annotations_in_spans(subset(a1, type == "word"), subset(a1, type == "sentence"))] ## Suppose we want to add the sentence constituents (the ids of the ## words in the respective sentences) to the features of the sentence ## annotations. The basic computation is lapply(annotations_in_spans(a1[a1$type == "word"], a1[a1$type == "sentence"]), function(a) a$id) ## For annotations, we need lists of feature lists: features <- lapply(annotations_in_spans(a1[a1$type == "word"], a1[a1$type == "sentence"]), function(e) list(constituents = e$id)) ## Could add these directly: a2 <- a1 a2$features[a2$type == "sentence"] <- features a2 ## Note how the print() method summarizes the features. ## We could also write a sentence constituent annotator ## (note that annotators should always have formals 's' and 'a', even ## though for computing the sentence constituents s is not needed): sent_constituent_annotator <- Annotator(function(s, a) { i <- which(a$type == "sentence") features <- lapply(annotations_in_spans(a[a$type == "word"], a[i]), function(e) list(constituents = e$id)) Annotation(a$id[i], a$type[i], a$start[i], a$end[i], features) }) sent_constituent_annotator(s, a1) ## Can use merge() to merge the annotations: a2 <- merge(a1, sent_constituent_annotator(s, a1)) a2 ## Equivalently, could have used a2 <- annotate(s, sent_constituent_annotator, a1) a2 ## which merges automatically. 
} NLP/man/datetime.Rd0000644000175100001440000000366413333066244013574 0ustar hornikusers\name{datetime} \alias{parse_ISO_8601_datetime} \title{Parse ISO 8601 Date/Time Strings} \description{ Extract date/time components from strings following one of the six formats specified in the NOTE-datetime ISO 8601 profile (\url{https://www.w3.org/TR/NOTE-datetime}). } \arguments{ \item{x}{a character vector.} } \details{ For character strings in one of the formats in the profile, the corresponding date/time components are extracted, with seconds and decimal fractions of seconds combined. Other (malformed) strings are warned about. The extracted components for each string are gathered into a named list with elements of the appropriate type (integer for year to min; double for sec; character for the time zone designator). The object returned is a (suitably classed) list of such named lists. This internal representation may change in future versions. One can subscript such ISO 8601 date/time objects using \code{[} and extract components using \code{$} (where missing components will result in \code{NA}s), and convert them to the standard R date/time classes using \code{\link{as.Date}()}, \code{\link{as.POSIXct}()} and \code{\link{as.POSIXlt}()} (incomplete elements will convert to suitably missing elements). In addition, there are \code{print()} and \code{as.data.frame()} methods for such objects. } \value{ An object inheriting from class \code{"ISO_8601_datetime"} with the extracted date/time components. } \examples{ ## Use the examples from https://www.w3.org/TR/NOTE-datetime, plus one ## in UTC. x <- c("1997", "1997-07", "1997-07-16", "1997-07-16T19:20+01:00", "1997-07-16T19:20:30+01:00", "1997-07-16T19:20:30.45+01:00", "1997-07-16T19:20:30.45Z") y <- parse_ISO_8601_datetime(x) y ## Conversions: note that "incomplete" elements are converted to ## "missing". 
as.Date(y) as.POSIXlt(y) ## Subscripting and extracting components: head(y, 3) y$mon } NLP/man/Tree.Rd0000644000175100001440000000636213741575543012707 0ustar hornikusers\name{Tree} \alias{Tree} \alias{format.Tree} \alias{print.Tree} \alias{Tree_parse} \alias{Tree_apply} \title{Tree objects} \description{Creation and manipulation of tree objects.} \usage{ Tree(value, children = list()) \method{format}{Tree}(x, width = 0.9 * getOption("width"), indent = 0, brackets = c("(", ")"), ...) Tree_parse(x, brackets = c("(", ")")) Tree_apply(x, f, recursive = FALSE) } \arguments{ \item{value}{a (non-tree) node value of the tree.} \item{children}{a list giving the children of the tree.} \item{x}{a tree object for the \code{format()} method and \code{Tree_apply()}; a character string for \code{Tree_parse()}.} \item{width}{a positive integer giving the target column for a single-line nested bracketting.} \item{indent}{a non-negative integer giving the indentation used for formatting.} \item{brackets}{a character vector of length two giving the pair of opening and closing brackets to be employed for formatting or parsing.} \item{...}{further arguments passed to or from other methods.} \item{f}{a function to be applied to the children nodes.} \item{recursive}{a logical indicating whether to apply \code{f} recursively to the children of the children and so forth.} } \details{ Trees give hierarchical groupings of leaves and subtrees, starting from the root node of the tree. In natural language processing, the syntactic structure of sentences is typically represented by parse trees (e.g., \url{https://en.wikipedia.org/wiki/Concrete_syntax_tree}) and displayed using nested brackettings. The tree objects in package \pkg{NLP} are patterned after the ones in NLTK (\url{https://www.nltk.org}), and primarily designed for representing parse trees. 
A tree object consists of the value of the root node and its children as a list of leaves and subtrees, where the leaves are elements with arbitrary non-tree values (and not subtrees with no children). The value and children can be extracted via \code{$} subscripting using names \code{value} and \code{children}, respectively. There is a \code{format()} method for tree objects: this first tries a nested bracketting in a single line of the given width, and if this is not possible, produces a nested indented bracketting. The \code{print()} method uses the \code{format()} method, and hence its arguments to control the formatting. \code{Tree_parse()} reads nested brackettings into a tree object. } \examples{ x <- Tree(1, list(2, Tree(3, list(4)), 5)) format(x) x$value x$children p <- Tree("VP", list(Tree("V", list("saw")), Tree("NP", list("him")))) p <- Tree("S", list(Tree("NP", list("I")), p)) p ## Force nested indented bracketting: print(p, width = 10) s <- "(S (NP I) (VP (V saw) (NP him)))" p <- Tree_parse(s) p ## Extract the leaves by recursively traversing the children and ## recording the non-tree ones: Tree_leaf_gatherer <- function() { v <- list() list(update = function(e) if(!inherits(e, "Tree")) v <<- c(v, list(e)), value = function() v, reset = function() { v <<- list() }) } g <- Tree_leaf_gatherer() y <- Tree_apply(p, g$update, recursive = TRUE) g$value() } NLP/man/tagsets.Rd0000644000175100001440000000363413741575676013470 0ustar hornikusers\name{tagsets} \alias{Penn_Treebank_POS_tags} \alias{Brown_POS_tags} \alias{Universal_POS_tags} \alias{Universal_POS_tags_map} \title{NLP Tag Sets} \description{ Tag sets frequently used in Natural Language Processing. 
} \usage{ Penn_Treebank_POS_tags Brown_POS_tags Universal_POS_tags Universal_POS_tags_map } \details{ \code{Penn_Treebank_POS_tags} and \code{Brown_POS_tags} provide, respectively, the Penn Treebank POS tags (\url{https://catalog.ldc.upenn.edu/docs/LDC95T7/cl93.html}, Table 2) and the POS tags used for the Brown corpus (\url{http://www.hit.uib.no/icame/brown/bcm.html}), both as data frames with the following variables: \describe{ \item{entry}{a character vector with the POS tags} \item{description}{a character vector with short descriptions of the tags} \item{examples}{a character vector with examples for the tags} } \code{Universal_POS_tags} provides the universal POS tagset introduced by Slav Petrov, Dipanjan Das, and Ryan McDonald (\url{https://arxiv.org/abs/1104.2086}), as a data frame with character variables \code{entry} and \code{description}. \code{Universal_POS_tags_map} is a named list of mappings from language and treebank specific POS tagsets to the universal POS tags, with elements named \samp{en-ptb} and \samp{en-brown} giving the mappings, respectively, for the Penn Treebank and Brown POS tags. } \source{ \url{https://catalog.ldc.upenn.edu/docs/LDC95T7/cl93.html}, \url{http://www.hit.uib.no/icame/brown/bcm.html}, \url{https://github.com/slavpetrov/universal-pos-tags}. } \examples{ ## Penn Treebank POS tags dim(Penn_Treebank_POS_tags) ## Inspect first 20 entries: write.dcf(head(Penn_Treebank_POS_tags, 20L)) ## Brown POS tags dim(Brown_POS_tags) ## Inspect first 20 entries: write.dcf(head(Brown_POS_tags, 20L)) ## Universal POS tags Universal_POS_tags ## Available mappings to universal POS tags names(Universal_POS_tags_map) } NLP/man/TaggedTextDocument.Rd0000644000175100001440000000662113741575523015543 0ustar hornikusers\name{TaggedTextDocument} \alias{TaggedTextDocument} \title{POS-Tagged Word Text Documents} \description{ Create text documents from files containing POS-tagged words. 
} \usage{ TaggedTextDocument(con, encoding = "unknown", word_tokenizer = whitespace_tokenizer, sent_tokenizer = Regexp_Tokenizer("\n", invert = TRUE), para_tokenizer = blankline_tokenizer, sep = "/", meta = list()) } \arguments{ \item{con}{a connection object or a character string. See \code{\link{readLines}()} for details. } \item{encoding}{encoding to be assumed for input strings. See \code{\link{readLines}()} for details. } \item{word_tokenizer}{a function for obtaining the word token spans.} \item{sent_tokenizer}{a function for obtaining the sentence token spans.} \item{para_tokenizer}{a function for obtaining the paragraph token spans, or \code{NULL} in which case no paragraph tokenization is performed.} \item{sep}{the character string separating the word tokens and their POS tags.} \item{meta}{a named or empty list of document metadata tag-value pairs.} } \details{ \code{TaggedTextDocument()} creates documents representing natural language text as suitable collections of POS-tagged words, based on using \code{\link{readLines}()} to read text lines from connections providing such collections. The text read is split into paragraph, sentence and tagged word tokens using the span tokenizers specified by arguments \code{para_tokenizer}, \code{sent_tokenizer} and \code{word_tokenizer}. By default, paragraphs are assumed to be separated by blank lines, sentences by newlines and tagged word tokens by whitespace. Finally, word tokens and their POS tags are obtained by splitting the tagged word tokens according to \code{sep}. From this, a suitable representation of the provided collection of POS-tagged words is obtained, and returned as a tagged text document object inheriting from classes \code{"TaggedTextDocument"} and \code{"\link{TextDocument}"}. 
There are methods for generics \code{\link{words}()}, \code{\link{sents}()}, \code{\link{paras}()}, \code{\link{tagged_words}()}, \code{\link{tagged_sents}()}, and \code{\link{tagged_paras}()} (as well as \code{\link{as.character}()}) and class \code{"TaggedTextDocument"}, which should be used to access the text in such text document objects. The methods for generics \code{\link{tagged_words}()}, \code{\link{tagged_sents}()} and \code{\link{tagged_paras}()} provide a mechanism for mapping POS tags via the \code{map} argument, see section \bold{Details} in the help page for \code{\link{tagged_words}()} for more information. The POS tagset used will be inferred from the \code{POS_tagset} metadata element of the tagged text document. } \value{ A tagged text document object inheriting from \code{"TaggedTextDocument"} and \code{"\link{TextDocument}"}. } \seealso{ \url{https://www.nltk.org/nltk_data/packages/corpora/brown.zip} which provides the W. N. Francis and H. Kucera Brown tagged word corpus as an archive of files which can be read in using \code{TaggedTextDocument()}. Package \pkg{tm.corpus.Brown} available from the repository at \url{https://datacube.wu.ac.at} conveniently provides this corpus as a \pkg{tm} \link[tm:VCorpus]{VCorpus} of tagged text documents. } NLP/man/AnnotatedPlainTextDocument.Rd0000644000175100001440000000676013336501160017237 0ustar hornikusers\name{AnnotatedPlainTextDocument} \alias{AnnotatedPlainTextDocument} \alias{annotation} \title{Annotated Plain Text Documents} \description{ Create annotated plain text documents from plain text and collections of annotations for this text. 
} \usage{ AnnotatedPlainTextDocument(s, a, meta = list()) annotation(x) } \arguments{ \item{s}{a \code{\link{String}} object, or something coercible to this using \code{\link{as.String}()} (e.g., a character string with appropriate encoding information).} \item{a}{an \code{\link{Annotation}} object with annotations for \code{s}.} \item{meta}{a named or empty list of document metadata tag-value pairs.} \item{x}{an object inheriting from class \code{"AnnotatedPlainTextDocument"}.} } \details{ Annotated plain text documents combine plain text with annotations for the text. A typical workflow is to use \code{\link{annotate}()} with suitable annotator pipelines to obtain the annotations, and then use \code{AnnotatedPlainTextDocument()} to combine these with the text being annotated. This yields an object inheriting from \code{"AnnotatedPlainTextDocument"} and \code{"\link{TextDocument}"}, from which the text and annotations can be obtained using, respectively, \code{\link{as.character}()} and \code{annotation()}. There are methods for class \code{"AnnotatedPlainTextDocument"} and generics \code{\link{words}()}, \code{\link{sents}()}, \code{\link{paras}()}, \code{\link{tagged_words}()}, \code{\link{tagged_sents}()}, \code{\link{tagged_paras}()}, \code{\link{chunked_sents}()}, \code{\link{parsed_sents}()} and \code{\link{parsed_paras}()} providing structured views of the text in such documents. These all require the necessary annotations to be available in the annotation object used. The methods for generics \code{\link{tagged_words}()}, \code{\link{tagged_sents}()} and \code{\link{tagged_paras}()} provide a mechanism for mapping POS tags via the \code{map} argument, see section \bold{Details} in the help page for \code{\link{tagged_words}()} for more information. The POS tagset used will be inferred from the \code{POS_tagset} metadata element of the annotation object used. 
} \value{ For \code{AnnotatedPlainTextDocument()}, an annotated plain text document object inheriting from \code{"AnnotatedPlainTextDocument"} and \code{"\link{TextDocument}"}. For \code{annotation()}, an \code{\link{Annotation}} object. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. } \examples{ ## Use a pre-built annotated plain text document obtained by employing an ## annotator pipeline from package 'StanfordCoreNLP', available from the ## repository at https://datacube.wu.ac.at, using the following code: ## require("StanfordCoreNLP") ## s <- paste("Stanford University is located in California.", ## "It is a great university.") ## p <- StanfordCoreNLP_Pipeline(c("pos", "lemma", "parse")) ## doc <- AnnotatedPlainTextDocument(s, p(s)) doc <- readRDS(system.file("texts", "stanford.rds", package = "NLP")) doc ## Extract available annotation: a <- annotation(doc) a ## Structured views: sents(doc) tagged_sents(doc) tagged_sents(doc, map = Universal_POS_tags_map) parsed_sents(doc) ## Add (trivial) paragraph annotation: s <- as.character(doc) a <- annotate(s, Simple_Para_Token_Annotator(blankline_tokenizer), a) doc <- AnnotatedPlainTextDocument(s, a) ## Structured view: paras(doc) } NLP/man/ngrams.Rd0000644000175100001440000000113513333064622013254 0ustar hornikusers\name{ngrams} \alias{ngrams} \title{Compute N-Grams} \description{ Compute the \eqn{n}-grams (contiguous sub-sequences of length \eqn{n}) of a given sequence. } \arguments{ \item{x}{a sequence (vector).} \item{n}{a positive integer giving the length of contiguous sub-sequences to be computed.} } \value{ a list with the computed sub-sequences. 
} \examples{ s <- "The quick brown fox jumps over the lazy dog" ## Split into words: w <- strsplit(s, " ", fixed = TRUE)[[1L]] ## Word tri-grams: ngrams(w, 3L) ## Word tri-grams pasted together: vapply(ngrams(w, 3L), paste, "", collapse = " ") } NLP/man/Span.Rd0000644000175100001440000000450012502573125012665 0ustar hornikusers\name{Span} \alias{Span} \alias{as.Span} \alias{is.Span} \alias{[.Span} %% \alias{[<-.Span} \alias{[[.Span} %% \alias{[[<-.Span} \alias{$<-.Span} \alias{Ops.Span} \alias{as.data.frame.Span} \alias{as.list.Span} \alias{c.Span} \alias{duplicated.Span} \alias{format.Span} \alias{length.Span} \alias{names.Span} \alias{print.Span} \alias{unique.Span} \title{Span objects} \description{ Creation and manipulation of span objects. } \usage{ Span(start, end) as.Span(x) is.Span(x) } \arguments{ \item{start, end}{integer vectors giving the start and end positions of the spans.} \item{x}{an \R object.} } \details{ A single span is a pair with \dQuote{slots} \sQuote{start} and \sQuote{end}, giving the start and end positions of the span. Span objects provide sequences (allowing positional access) of single spans. They have class \code{"Span"}. Span objects can be coerced to annotation objects via \code{\link{as.Annotation}()} (which of course is only appropriate provided that the spans are character spans of the natural language text being annotated), and annotation objects can be coerced to span objects via \code{as.Span()} (giving the character spans of the annotations). Subscripting span objects via \code{[} extracts subsets of spans; subscripting via \code{$} extracts integer vectors with the sequence of values of the named slot. 
There are several additional methods for class \code{"Span"}: \code{print()} and \code{format()}; \code{c()} combines spans (or objects coercible to these using \code{as.Span()}), and \code{as.list()} and \code{as.data.frame()} coerce, respectively, to lists (of single span objects) and data frames (with spans and slots corresponding to rows and columns). Finally, one can add a scalar and a span object (resulting in shifting the start and end positions by the scalar). \code{Span()} creates span objects from the given sequences of start and end positions, which must have the same length. \code{as.Span()} coerces to span objects, with a method for annotation objects. \code{is.Span()} tests whether an object inherits from class \code{"Span"} (and hence returns \code{TRUE} for both span and annotation objects). } \value{ For \code{Span()} and \code{as.Span()}, a span object (of class \code{"Span"}). For \code{is.Span()}, a logical. } NLP/man/annotators.Rd0000644000175100001440000002111312520713754014157 0ustar hornikusers\name{annotators} \alias{Simple_Para_Token_Annotator} \alias{Simple_Sent_Token_Annotator} \alias{Simple_Word_Token_Annotator} \alias{Simple_POS_Tag_Annotator} \alias{Simple_Entity_Annotator} \alias{Simple_Chunk_Annotator} \alias{Simple_Stem_Annotator} \alias{Simple annotator generators} \title{Simple annotator generators} \description{ Create annotator objects for composite basic NLP tasks based on functions performing simple basic tasks. 
} \usage{ Simple_Para_Token_Annotator(f, meta = list(), classes = NULL) Simple_Sent_Token_Annotator(f, meta = list(), classes = NULL) Simple_Word_Token_Annotator(f, meta = list(), classes = NULL) Simple_POS_Tag_Annotator(f, meta = list(), classes = NULL) Simple_Entity_Annotator(f, meta = list(), classes = NULL) Simple_Chunk_Annotator(f, meta = list(), classes = NULL) Simple_Stem_Annotator(f, meta = list(), classes = NULL) } \arguments{ \item{f}{a function performing a \dQuote{simple} basic NLP task (see \bold{Details}).} \item{meta}{an empty or named list of annotator (pipeline) metadata tag-value pairs.} \item{classes}{a character vector or \code{NULL} (default) giving classes to be used for the created annotator object in addition to the default ones (see \bold{Details}).} } \details{ The purpose of these functions is to facilitate the creation of annotators for basic NLP tasks as described below. \code{Simple_Para_Token_Annotator()} creates \dQuote{simple} paragraph token annotators. Argument \code{f} should be a paragraph tokenizer, which takes a string \code{s} with the whole text to be processed, and returns the spans of the paragraphs in \code{s}, or an annotation object with these spans and (possibly) additional features. The generated annotator inherits from the default classes \code{"Simple_Para_Token_Annotator"} and \code{"Annotator"}. It uses the results of the simple paragraph tokenizer to create and return annotations with unique ids and type \sQuote{paragraph}. \code{Simple_Sent_Token_Annotator()} creates \dQuote{simple} sentence token annotators. Argument \code{f} should be a sentence tokenizer, which takes a string \code{s} with the whole text to be processed, and returns the spans of the sentences in \code{s}, or an annotation object with these spans and (possibly) additional features. The generated annotator inherits from the default classes \code{"Simple_Sent_Token_Annotator"} and \code{"Annotator"}. 
It uses the results of the simple sentence tokenizer to create and return annotations with unique ids and type \sQuote{sentence}, possibly combined with sentence constituent features for already available paragraph annotations. \code{Simple_Word_Token_Annotator()} creates \dQuote{simple} word token annotators. Argument \code{f} should be a simple word tokenizer, which takes a string \code{s} giving a sentence to be processed, and returns the spans of the word tokens in \code{s}, or an annotation object with these spans and (possibly) additional features. The generated annotator inherits from the default classes \code{"Simple_Word_Token_Annotator"} and \code{"Annotator"}. It uses already available sentence token annotations to extract the sentences and obtains the results of the word tokenizer for these. It then adds the sentence character offsets and unique word token ids, and word token constituents features for the sentences, and returns the word token annotations combined with the augmented sentence token annotations. \code{Simple_POS_Tag_Annotator()} creates \dQuote{simple} POS tag annotators. Argument \code{f} should be a simple POS tagger, which takes a character vector giving the word tokens in a sentence, and returns either a character vector with the tags, or a list of feature maps with the tags as \sQuote{POS} feature and possibly other features. The generated annotator inherits from the default classes \code{"Simple_POS_Tag_Annotator"} and \code{"Annotator"}. It uses already available sentence and word token annotations to extract the word tokens for each sentence and obtains the results of the simple POS tagger for these, and returns annotations for the word tokens with the features obtained from the POS tagger. \code{Simple_Entity_Annotator()} creates \dQuote{simple} entity annotators. 
Argument \code{f} should be a simple entity detector (\dQuote{named entity recognizer}) which takes a character vector giving the word tokens in a sentence, and returns an annotation object with the \emph{word} token spans, a \sQuote{kind} feature giving the kind of the entity detected, and possibly other features. The generated annotator inherits from the default classes \code{"Simple_Entity_Annotator"} and \code{"Annotator"}. It uses already available sentence and word token annotations to extract the word tokens for each sentence and obtains the results of the simple entity detector for these, transforms word token spans to character spans and adds unique ids, and returns the combined entity annotations. \code{Simple_Chunk_Annotator()} creates \dQuote{simple} chunk annotators. Argument \code{f} should be a simple chunker, which takes as arguments character vectors giving the word tokens and the corresponding POS tags, and returns either a character vector with the chunk tags, or a list of feature lists with the tags as \sQuote{chunk_tag} feature and possibly other features. The generated annotator inherits from the default classes \code{"Simple_Chunk_Annotator"} and \code{"Annotator"}. It uses already available annotations to extract the word tokens and POS tags for each sentence and obtains the results of the simple chunker for these, and returns word token annotations with the chunk features (only). \code{Simple_Stem_Annotator()} creates \dQuote{simple} stem annotators. Argument \code{f} should be a simple stemmer, which takes as argument a character vector giving the word tokens, and returns a character vector with the corresponding word stems. The generated annotator inherits from the default classes \code{"Simple_Stem_Annotator"} and \code{"Annotator"}. It uses already available annotations to extract the word tokens, and returns word token annotations with the corresponding stem features (only). 
In all cases, if the underlying simple processing function returns annotation objects these should not provide their own ids (or use such in the features), as the generated annotators will necessarily provide these (the already available annotations are only available at the annotator level, but not at the simple processing level). } \value{ An annotator object inheriting from the given classes and the default ones. } \seealso{ Package \pkg{openNLP} which provides annotator generators for sentence and word tokens, POS tags, entities and chunks, using processing functions based on the respective Apache OpenNLP MaxEnt processing resources. } \examples{ ## A simple text. s <- String(" First sentence. Second sentence. ") ## ****5****0****5****0****5****0****5** ## A very trivial sentence tokenizer. sent_tokenizer <- function(s) { s <- as.String(s) m <- gregexpr("[^[:space:]][^.]*\\\\.", s)[[1L]] Span(m, m + attr(m, "match.length") - 1L) } ## (Could also use Regexp_Tokenizer() with the above regexp pattern.) sent_tokenizer(s) ## A simple sentence token annotator based on the sentence tokenizer. sent_token_annotator <- Simple_Sent_Token_Annotator(sent_tokenizer) sent_token_annotator a1 <- annotate(s, sent_token_annotator) a1 ## Extract the sentence tokens. s[a1] ## A very trivial word tokenizer. word_tokenizer <- function(s) { s <- as.String(s) ## Remove the last character (should be a period when using ## sentences determined with the trivial sentence tokenizer). s <- substring(s, 1L, nchar(s) - 1L) ## Split on whitespace separators. m <- gregexpr("[^[:space:]]+", s)[[1L]] Span(m, m + attr(m, "match.length") - 1L) } lapply(s[a1], word_tokenizer) ## A simple word token annotator based on the word tokenizer. word_token_annotator <- Simple_Word_Token_Annotator(word_tokenizer) word_token_annotator a2 <- annotate(s, word_token_annotator, a1) a2 ## Extract the word tokens. 
s[subset(a2, type == "word")] ## A simple word token annotator based on wordpunct_tokenizer(): word_token_annotator <- Simple_Word_Token_Annotator(wordpunct_tokenizer, list(description = "Based on wordpunct_tokenizer().")) word_token_annotator a2 <- annotate(s, word_token_annotator, a1) a2 ## Extract the word tokens. s[subset(a2, type == "word")] } NLP/man/CoNLLUTextDocument.Rd0000644000175100001440000000624213741576043015401 0ustar hornikusers\name{CoNLLUTextDocument} \alias{CoNLLUTextDocument} \title{ CoNLL-U Text Documents } \description{ Create text documents from CoNLL-U format files. } \usage{ CoNLLUTextDocument(con, meta = list()) } \arguments{ \item{con}{a connection object or a character string. See \code{\link{scan}()} for details. } \item{meta}{a named or empty list of document metadata tag-value pairs.} } \details{ The CoNLL-U format (see \url{https://universaldependencies.org/format.html}) is a CoNLL-style format for annotated texts popularized and employed by the Universal Dependencies project (see \url{https://universaldependencies.org/}). For each \dQuote{word} in the text, this provides exactly the 10 fields \code{ID}, \code{FORM} (word form or punctuation symbol), \code{LEMMA} (lemma or stem of word form), \code{UPOSTAG} (universal part-of-speech tag, see \url{https://universaldependencies.org/u/pos/index.html}), \code{XPOSTAG} (language-specific part-of-speech tag, may be unavailable), \code{FEATS} (list of morphological features), \code{HEAD}, \code{DEPREL}, \code{DEPS}, and \code{MISC}. The lines with these fields and optional comments are read from the given connection and split into fields using \code{\link{scan}()}. This is combined with consecutive sentence ids into a data frame used for representing the annotation information, and together with the given metadata returned as a CoNLL-U text document inheriting from classes \code{"CoNLLUTextDocument"} and \code{"\link{TextDocument}"}. 
The complete annotation information data frame can be extracted via \code{content()}. CoNLL-U v2 requires providing the complete texts of each sentence (or a reconstruction thereof) in \samp{# text =} comment lines. Where consistently provided, these are made available in the \code{text} attribute of the content data frame. In addition, there are methods for generics \code{\link{as.character}()}, \code{\link{words}()}, \code{\link{sents}()}, \code{\link{tagged_words}()}, and \code{\link{tagged_sents}()} and class \code{"CoNLLUTextDocument"}, which should be used to access the text in such text document objects. The CoNLL-U format allows representing both words and (multiword) tokens (see section \sQuote{Words, Tokens and Empty Nodes} in the format documentation), as distinguished by ids being integers or integer ranges, with the words being annotated further. One can use \code{as.character()} to extract the \emph{tokens}; all other viewers listed above use the \emph{words}. Finally, the viewers incorporating POS tags take a \code{which} argument to specify using the universal or language-specific tags, by giving a substring of \code{"UPOSTAG"} (default) or \code{"XPOSTAG"}. } \value{ An object inheriting from \code{"CoNLLUTextDocument"} and \code{"\link{TextDocument}"}. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. \url{https://universaldependencies.org/} for access to the Universal Dependencies treebanks, which provide annotated texts in \emph{many} different languages using CoNLL-U format. } NLP/man/generics.Rd0000644000175100001440000000273212314546106013570 0ustar hornikusers\name{generics} \alias{content} \alias{content<-} \alias{meta} \alias{meta<-} \title{Access or Modify Content or Metadata} \description{ Access or modify the content or metadata of \R objects. } \usage{ content(x) content(x) <- value meta(x, tag = NULL, ...) meta(x, tag = NULL, ...) 
<- value } \arguments{ \item{x}{an \R object.} \item{value}{a suitable \R object.} \item{tag}{a character string or \code{NULL} (default), indicating to return the single metadata value for the given tag, or all metadata tag/value pairs.} \item{...}{arguments to be passed to or from methods.} } \details{ These are generic functions, with no default methods. Often, classed \R objects (e.g., those representing text documents in packages \pkg{NLP} and \pkg{tm}) contain information that can be grouped into \dQuote{content}, metadata and other components, where content can be arbitrary, and metadata are collections of tag/value pairs represented as named or empty lists. The \code{content()} and \code{meta()} getters and setters aim at providing a consistent high-level interface to the respective information (abstracting from how classes internally represent the information). } \value{ Methods for \code{meta()} should return a named or empty list of tag/value pairs if no tag is given (default), or the value for the given tag. } \seealso{ \code{\link{TextDocument}} for basic information on the text document infrastructure employed by package \pkg{NLP}. } NLP/DESCRIPTION0000644000175100001440000000120713741602721012432 0ustar hornikusersPackage: NLP Version: 0.2-1 Title: Natural Language Processing Infrastructure Authors@R: person("Kurt", "Hornik", role = c("aut", "cre"), email = "Kurt.Hornik@R-project.org", comment = c(ORCID = "0000-0003-4198-9911")) Description: Basic classes and methods for Natural Language Processing. 
License: GPL-3 Imports: utils Depends: R (>= 3.2.0) Enhances: udpipe, spacyr, cleanNLP NeedsCompilation: no Packaged: 2020-10-14 13:54:10 UTC; hornik Author: Kurt Hornik [aut, cre] () Maintainer: Kurt Hornik Repository: CRAN Date/Publication: 2020-10-14 14:06:09 UTC NLP/R/0000755000175100001440000000000013741575236011137 5ustar hornikusersNLP/R/udpipe.R0000644000175100001440000000262413337763133012550 0ustar hornikusers## Viewer methods for objects of class "udpipe_connlu" as obtained by ## udpipe::udpipe_annotate(). ## ## All methods will need the udpipe namespace loaded to use the ## as.data.frame() method for class "udpipe_connlu". ## Should we check for this? ## Perhaps simply call loadNamespace("udpipe") in the methods? ## words.udpipe_connlu <- function(x, ...) { x <- as.data.frame(x) x$token } sents.udpipe_connlu <- function(x, ...) { x <- as.data.frame(x) split(x$token, x$sentence_id) } paras.udpipe_connlu <- function(x, ...) { x <- as.data.frame(x) lapply(split(x, x$paragraph_id), function(e) split(e$token, e$sentence_id)) } tagged_words.udpipe_connlu <- function(x, which = c("upos", "xpos"), ...) { x <- as.data.frame(x) which <- match.arg(which) Tagged_Token(x$token, x[[which]]) } tagged_sents.udpipe_connlu <- function(x, which = c("upos", "xpos"), ...) { x <- as.data.frame(x) which <- match.arg(which) .tagged_sents_from_conllu_frame(x, which) } .tagged_sents_from_conllu_frame <- function(x, which) { lapply(split(x, x$sentence_id), function(e) Tagged_Token(e$token, e[[which]])) } tagged_paras.udpipe_connlu <- function(x, which = c("upos", "xpos"), ...) { x <- as.data.frame(x) which <- match.arg(which) lapply(split(x, x$paragraph_id), .tagged_sents_from_conllu_frame, which) } NLP/R/sysdata.rda0000644000175100001440000027033013741601403013266 0ustar hornikusers7zXZi"6!X壉])TW"nRʟF\ XvӖnMdpR}sMi :dxB,aMO]+kU.nHӜ+y曢)d94٣Jh$kV!t䉮7'2#1AAI5Q_=s^h! 
d=9q<|s$폿"K+gC+RC9_^O& Txf#(5τe8$tBbP /1,MА e7Ae~t38{18!uߵ~PƖ=zyrY-/`Fs \ZM*hA4<(>X}li5=vd{_Yܑb%j:xnm@us3~)n)J8|D)ʺ p=0GS off{ ]Ȍ>_XGJf\2#c4:4]*B%o>q%:*SU=B7t^|Хr"[KR0.NYa= W庁s+ǞsC#=Py7}0OJǡh xt92> o٭6F]l i+.ڲ+60@g/7#3df; rK &罧o90R"N"Y}㛻r}|(e$~l.h%sԹ/VKPɤLX?QX\a)EZ \4=f,0/@./Ezvmy$#Kٌ+rG:2w,h^ӳю'fEk,.[DDy/3my8ÅJB'ׯV;,\} ts~FGL II:R=jlfhJS,nV[f X Z, )o$&A"̞zVa>>)#K{>T%>yD=vbb7DWjaZz"TP:`{:x4"NJLmbRHtDH!ݿ*\lDrS.QglZBկP &!E;yw #:)MxdI\1(< dMb4opxHV g.a=]KoH۝]ogs+ӡ.ʭ AtҗEM[s>Q^ h~;W9YO#fFa#g5eukbӫP4E/}K-(X[&kOgޚA)ʝGoYdա ?U[%-:~u#+YYoC43?'tP`(/Gað9΀&~=2Pqކ̙h;vmR)VE؈!^޷ӜzL- TK ػK1U Akc#\O ׇI4u5SK%$3vƧEJ[zb|R+MT*5ox'(o!p뾑,6iOY-u=X;iLg!_hI!0B,F34Q 'sND4H1`fVRˠ>_Yk"HÉ75b@ qXOW@ ƛń98ISځ24gK6@ $\}8`9cWvCl}*=:%_71R aA,"!Z)\x!#rt@1Z\qm$Yݾ[vt%6c$Ev .%9YX ;K!{omՂzupnӓE7i~*,cbസKr0Ȗ=/o*t'mZt4;I >HLk:єPm 6tu3£ t[IWYr >!'ύy_P;9/t>E{J:`RnrRFARYU#4)縆3RoZdld&x]xf9l:8Fɲ&\Zw[Ts|Hɘt#pwh/E*0!<-XȪsy8g "Y #aחR5`|ADՋMf]5/צH(@c 23[:ЉAѸTߡ:gqDƂ]UAw?#<%tQҸ2f ($Ccy6dj[\2pxxzv pM< g>nABvKfv&᫽#ʤڳK F `5QO'&C0#@43B#N+ETys /״sZP0p-xmWeɢɡM1KwG')UԴFUrGJKngNwoꞻ) ;אEq\,tY(H~xů&Ma%qDjO&> ۚǔmM)ɢoؙT*[!U@e|#BRwZ&/yJM B&H")H僙ahjTF$~ĠQɇkh 0PE:S叩pNʼHx`~] !mdzIWGW`!|j],QNXohT8. 
3-OE_ybv b#9D,筞,M)A`2Z"Ca\ʱ ˸@p&|&C6VB_Ͳ/`F@4ka·ƽp˘5emNB3/О>ۤNSs[{ Zt9y)oD8 oWYiOT)z?nϪ@l4}+\1#{go pZLI s'Z fc*NAymA'g\G` VC~[CMlye"?,Q=$OJרEBHDߙO0{cd{eţ"MYԘ >Aе]M8$6N $6j&:,*zwv^Ȇ8a{TeGB:W0+#g'TŞTH؆DY"em p%5.LbˉeԖny4 {;HFc1EN|MTgxX؍LK7 lt~'3s^?M_He﫢]3/3@W`L:3d˥-aCxP>[^#lj>N2wz3DtU\\B-RŞ\\L g ?$>w7m= p>I% N~S.jV`48 Q_zΔ&!θFHf>3Lğtc^?Ҵ%f"%h0InEkn&1R6 #-^k X |2tU؜&B$MCibi=N,ZA"-Y{_oe'̆'6k8| j::+ˇ{4wMC:k9F)jPj ǀϹwm;->Z$_ՎВ`_Eˤ÷ jǜoyc.p.<Fv,.{.`7ڳ '.DXWZjuAJ-b"o`a8go_ȈK͡n*j ue<P*]Dz uhQિb?)swi(2*zFJ*4@n̞,#E6HŬ L&5~qqXͥ,^^O=\ȓQuL$^t11^l1^{Nfç$wݿ18-g-ʐkNDϥԛFLI=?8a v;Xxe T\aڭYkעQr]261tpkxf/J}`6)PnVf7M,ݩ*nZ42"۝*Eäd _=B#AP:sc[oaլ1l[S+rҬV`$+Yߦ4kQ[,cm SW^@\28\@{,yqFaDLƻd;!ԙhC) mL`hE71iDPo.e쐙QF{>]cep7WgRCZ+GRie8SY ub-{[L}i[,9ʗM~z0S3Ӝ8SV] /r ݃|}̇ܯ+2{ET[%a؀>Z%x\dfee RݕMLߨkbPHDōvs˚6lbӼm٦Sh_@LpS|_jz^2I&P#XA~r7߇VI}*@P qb`:Kwz%cL BV0<3Nm $ĚzL/lk/;Tmuӡ^T yɕ;o"=-en/ C >ǝo$cMc0Rn4JzG],_mԷ^K8-o8]WtEuxPf՘_yzFw$6IT!VCn>]:}hN`|`O~ 8/kmX4!y56 NW~蔝L%L 6 8u??#p|uxxNq̝,GouƴVKZǭ*@W ^:nscQ3;5>&4h(B>"8p s Mb \z+e l,$J򜆐T}z5^$gYM@!0d<7]!,[T_Y* Iv#Ĉ`oB t)z0)â~&je)C4pgWn  1r%[i݅±G#H70L CJJnvVa >@EG &w\?RԞwZ,>',-ӕFL^zLjVt4NEs !/eL;X2D)gKSPA;ќǻLUk4OYrSc/YX Qƾ&- ^c!7z:Šy3uZ6˫.P!$)VͅN&ڎJk ols؂BW \c \ F>T3Yr^* 4FRXckQJ.YkG-&p"Z2p?֋*Pa99knh$S7qX:5 0'Z:tՊ ^IϾI;@8i~1θM%kX SB WDA?ߪFr7,c^oFDxXT8mnYamOؠq& %@c]Lt"åCbA$ s +O΍2=?s2amq/^8@tn/Ʈ Rk\*o Krx?K$ _:9֡ LPyV?] ~t=7b4PJ7s}e]4Gsi[{)%;#Uj1Ŵl3~(lTR$ $`rp3 Z̞w!4C*dJ%ؓ祚WmjP}XoktxPLyP@l|Or/JAuk6.:s·;"|iDF3?ITq%φw˓R3R1%(,t* ~T(YݐyFiıųvh;ژSv"O/uXbL~` SA2 la @c$<:$؊GgO/G ,olk=Bdl%W“!|. ׺$rDvŤ"6RxHh5[ZO{tq"LLahot{ө(Y"vYRe},`tbzvBR1[}ѤS?G z'?>d>ãvI  i:є)%d5ˣ> b $LE h i'+yfܜlc+{>NkƇ#(^Um{^/3]4NqJ`/f䂺T&ߝh|Fh]tyűw办 F 3 9,܎Qx6]`5:[C~ɞC|q([k\~ej'v/9j>ALȶ1ay4ZdOKjkᜥK21 j T3IL`{U\7ƒ<^9 ts r/v^q[~%QnFJwfG i"_Bd6 V&,c`F@~\.}WWGmNF!}q꒛{z ZowEv!|rbi"ƃb_rK[~KmWrNQע̸Sϴg[i>1t6yvS"J|pttr0T-@ahMhmߟ@/2HS7줌Ja62b Ly$BӖVqcjhG',AnАTuv Ka<8G,puw`18M9 U?O#඗{A|xC[،$TPKByMbu\pl^1~. 
ՓB)l[&ϖ 9-l ֧pT3jl}vgJ2=4joT0Н(%LҒgDQj~jqd$şlgZ:KA^E ϐyI)}s"E(:*N]Z]{xa1o{HWbQ%vüd/1#"vދШX5ymi-Ց Qe ܫ|RQA(:(&έbVܟVCBi{_me5pvlMtט?xgf1"@]l>Z%-7޺~|JP-1HkĺSDB z>o )'_/ىAESsnlf3^kg!] ?'Y?{: a{ [וj5̲Tqdr"ch~,]:sT?Gy9y<݉tg^pf@]\͡IHT-  KT1dxCɊ"ϟEo<#?UEeb&nw(͛Y?F-ҢzcpGqԀ2a~^LTqˠ,h` [vst*(Sሒq Cۂ>ih V&n\`29owlqsqؖ)S1oxsj>c5g5O֠Ew6</L=*׊yoA2qV.v +P ֥S Y8E/H(L`Yx sV+E7]s{fFXp/\r0g=S奲:SD9EM3l?m #؋ }:oe(݅l׿moEWd1R Ƅ^nU]#c/)煣gў55TUS_$ bY+~^>]qɁ`F|Q~XҒx ]5a"Pᶙ:J{7GӑO,;G>ws@M1̏,b6bfݤвM BmslGᇚ*kɑ>\)7a{NGPi0c'۞\gA}!.@ORuQ%Z@cpu .f1>qDsp.q0D2wY~{H+Um!ږMJf51~>#63Ȭ\(5dUu̽s )N:s ? RT-tlj|R~u *=,ΩyȢAj2}la6Y/kr[k^P>J)l owŇks>@TȪD2Li]/<&xS_4ldW]]| D밉_b[̈́ǡ{tO&ـrLųqh=ДN[Kyzrc?)A^jw͗)pC4/[2h@Ƚ VR;i CGdEP~+SgڳbAZܪD SB!ggѐ3 $e F9riDW[!g%7|"b@aIpΈ QFlb77ў@2%R]AƜa>10e4ZΛCe9m_r if5JqJrWo/'D. ,\&`.¸8 S] Reو3TSeB1y($ ]P4 QtUWI8~^=RfM2U=ֈ9uJ& 0 W/աa<724RZ|RB&qDvA 9]9&/xrw-c|T>8- PX#, >Ѵۂ&xXGC<^S0F)vPa?7GAq=-o3Hl\,aJ9YϹbF5vpU$Q18r~( ӻ~NAſ|OT$c[a M¡5R5P*0Y'*q뚔 x늷D}vcdž]8b9)2%D䀵oclmy.ml6-xB$oqb0*-7v^͡ؽu1Ze/#-Ҥ1!e2.aqv)K ϱ%yiE1o(AR9)Gb>=x6r'1ٻ븓dioT NƅNQ| )eAZppZ v ]&W?6gC<RieZcJLwU{ Dd4q6%q-'#1W\$v\(Y ٛSH.BnGS;E @'+!u=iQ2Hp< EPX> nq67WzàNQ>%sFut3('Y;"u@4QQrȰ1 Ukjh*ۤ9FmWqo4F͝tmQ:p&3\dVs5kxu-'octi=Ua-Ie Cx͟^MSGK/+R>V${_sV,r8̢lz4#jn&ɀXăk{6ovZS5 ^˽yݓR,ݶȱԺ;ChYZ^hj9}^Y4扳'5cf/48 n)FˌCk6YWlk q$q6;(p')vh }}k ~oB,nH7PR$j^-ʽsv2uv:lC܁*KU(C_Y#A66swhgpg`ozڮͨk k>}+]>*5'uiuudaǐ 3wNkÌLa$%-Og(6xsd\$mCSjs 玆z IAFL,I't}ssXקy3~cc L#R*Y2@ *A!~Eပ %L3XVŧDz[Vb9W.N"C7@ _BǛ2;͏ hvZ-ƖdNۂ*[u[(Zs`lzs+UCE d&ztbkI#iP$lS"2NA T,|ou%ҸaX`Ztʂإ3Iw[L""ѻB&ОTVa졷"tPU)dԿ;cQa~EV-Gl],Ār8N=v5W&aIŁ|ߵѤUǍGJ*eޑMn.56jpF%q̪*&}T| 0|L< u8;u;C k`LF:wuTѺbb^4L`& P >#@:e7Eb/Nrpv@>:RC`/Ii4",\}P+(bvbVa%ݵ:*NTd{ѿ⇥+̢+Z⎒O|9͑|NlA0zߝzf)>BC!T#!k0jU_g $2S&e*}/S AsK,Po7ߐ vm;Rpj[|3{r&Mmr!TN~Yu-n7ofGnM&V%jM';j"?MP`⦍]o%Yqh ͞.7X1YoÜ<6yK~fD|~ K]E |zT$(lֲCac5N5mv3 'Xrl^Y0Sr A*I̞ ˆ5>`^q#9R~~| ͗hy'Or?PMq;$UmAJ"8h`j͞+ ü9ۈy ϩb Y<́YVnĜDlwAlaNڸ @ldFprWE@Ŧ;`M W7/KfyҶ dĩ3;ܗ!O7Εb\+FBPЭ6dV./? 
R^?ZCvK^냯bt߻Re) 3m uw$vqLS7E7ifk,}t e lj{pA!MFmgYv$k .?A'SB|[cѵ#8:!w3uRGВZ ..~ z^W*>2V8ƬE|'\?_U/4`1B,IplO#z)Aے[ƆtxzoJy֎&e]5[ %&)'2(F%/o,l~jGоg,X)Ġ1ĤOԓp(H>FG@;nmxk%xa-(qz(؋DT[=+[bS&HFɇ+)ݚxӍ@u.Zoa v0 1Di3{9] ,~DIх%/Pƭ?sK2 C7[[O*A}Rs!uEB`<\<ь_z|^Sfì~%:.pY96MhK^(K /eݲY,Ӥ7?,϶6**B-wz]h$ Osw}(@Thş-fG<=8A=Hķē<+E"3QughYtũC  E#υQs5,+Q%?9ܷbn!Pu/|\\F+C11YC.̢'!|X֛NFq>M+/g1'!>O8b&A/WF"ph-g c9@9FIzxcֱ;ϭ6~|DPޝ긆t`B-” PY褛һ8'ߣq]婦錵h^Oaeuf[C0n*r/v0AdQ` k3%i0 -aq7 j< )"Z>Zc4;VjDU*h9&Ir5) 3&JL+jI}Uȗk3F&&e y$%-'q~{]Pݞm՜[ZV-×WF }(NDϵYaTJ_F"Ut4w{!fa ZPhd| Ǣ,DVw(:@'aDI.U/Vw~!^ȑQ7)WRG(Þu1Ѷrj(fDFlE8~W.GjdX"MDO}#OLU%-!?$"&0LW47+\.}fZySֽǐ6ίmv EZo,Mh-=z(ј0);6fTTE0lZc:)\ì{P> C]Tye7xes{qvG6\u P"9ͫZ5Ekڱ8l.pS5ɚDui1~dw m>=r,<<  r)d:‹ iԯ&֖rxw7U<|H{=xH*&]iCКH 9">I k/)T1s]҄GILv ?Uݨ-<[ ExEva`|+Ei0o^%x6"}:IҝFX3X?ͤ=&+P9z'}XJ1ph?`cmf 0kSC[p~A"s~I-0ȻR_9CD{,`;0}WS_NklFr8'Jw届l_lxBGWV3]3T<ΉiL_p$4[rꙷHe)R 럍"Uq[zF!SV9'ЄӶ7˿LPt3E`10tU`A 04bFq ZHnnԖ V mdWamMi*yӇ}  01.P5ء&۬EcWυAv-k?6|F-z )>a34sѴC;!42YgH$k p[/:E2Iл`v{raۃzqf-*>fЙ yGRHIqlDz8ea ]h$]nS2fI4ONυJY-ʁDd~gkZ 4Iơ*Vf#uu-<*sWh%R8IUBwx`L)LIK>Mqtkp&J[}xniv$KTp=86;<׮X&Q}#"p<@[s0HWY_.O~VZZӽT\*X rC4)𠲛Ljwo>!_UD )+p{5nO-o%uUqg闷Zb;|\)32 B%$90:)*E_g?6O<.MF|)oJ+_P7k9 Xh4>VjEJSq WT6XvzU20{~Kyղfwp‡D"uUX(C&6pd˵PKڴ%:j`Z!D%ZY#-&cy1*"q;o*mTGqa~m~?-=FR*P@dbKDM='=#! סHo25>.:T:Θ< 8veD 8l?wG@5 |$gHA7ȃE7#EHfR Ł+rtתA0)c:IUZchԼBiY[Iax[ΜTtMg 14&EO_ϷFZAF #{qm!e'bdG ⡭7$0q5 [Y=kWV Ҋ.ꝥQ@.zO5:>9״bP w \g6#( 9Ϝw&6Z({C|p0QBQ|1;eDd˜0tNiJr{q8ZN).2GSYPؼ iB-pP5Kiܺ'!`2`j<pa6NMܸDspն3R jEa4p`F_Vϟ+f-H}m Ɵ\{*({ިPr7- OB~LRC7}A3ݠXkhnf[֒>p*4=X$3+vI!6Z]r/eϢSGMCh{a5igxcL( x"/z`}*TbCO2tT~jvIY(ql í{I$XvϋF!V;U3n:XFwHӑ{yÀWޢϰ#(4 >cewhw1PMG9}{hy8Bi23'd_q*Z~!=toD4a_OZw qw[gi}AMGsJԚk12m5DsP 1?v2Y鬪ӠxX[a}&<)Y ū7iɧ|CuFK3H-6]m56b%F< *{f$ˆ~R> t55.E1$#7wj/Ӹ_~84\*=iP <ͦRkNHZy1IzOG*54ȣ7-.p~H"܆s l `5?+M%l 0 SX{BM6,GÑ !vWWio]jnwh R5?1zE3F0!g_.r:z9uPdۿ)r{(}U*Тx Jz |K/7q}m#ɳa. 
lt};߻}HV£GE$3/K %6=*6DzUka>"tp?s#E>"ֹ:@aީt,U#p6%|#'$ '%ӗ !)M|yߍX1~0gĠҒ,Knn:k-g1˾aw`yn+'X?P,=FYbX⹚^zw8S;ftL]+XK_Q3=_)$Y?+{Q +ނ[[ޝF%30C _|D>=r,}FH1QtQUQ=IкJZur <&][<:tZuH/0K\c"چCZ4h T_.ssmf+a0Wrnă\Fw#x]0 "06epY@˜lx߽ L %J22]PG7x.[% ɋs{it7)T[vQDoRQ,~^;Hi=jݗygWfO ~,[%뙎h<v=|#̉cfFYc1%Oyt!Eamw䔇u)F09꠴h-ߋ}V"{;7b36OШl j ҿxH20h^m.%,=sECW0Xn3l 9@89T%*#Ll1Z%㼶?d$;@x3.O(=dm1Acm)1ۦ2\qxh|e|u_G)1֘lx+r~† >bf/4%5 3%smQ?Z~uO}OUP/$=ga07ah30A-0mxKxvncÆOL5ۄwb?/y3a'bX3OӇ:O3\լ4q%\u J! ZLCLv1sd/+ ߂r +n/0FKjں RT3>YDa@k$7.nJ,_\. Au1lROÍ 7)Rݳu]sEkt\m1_G;[ժ)\>_ q&Z G( "> 20U-7IM賨z}}|Ŕ.+f1dBSDawOH  p2>|֧a}w鋩)Ӽ鎲;JOW9RV!_N3%ѻmpebêsL3wɫΔw&Ǡ-O]HzZLqAzYa2pH $' EZPgQv::ux(D?\<_peƱ4a^scEJZ\#L%L-z}o͑jT)^g?_ /$~?@*_Z N{p䡣oHĒw?ѦF>uHb4DeQ3˿9 B sfS3ۮ *@md g*"^Xǰ07XJgCS\}CY =C!9 *TUu*"ELp1@w qQZp&hEIG%S9fV)wUe1:Tw_2֨>৕Afm"ks ˺T%-ڱI3_q0V زI24C0Ïl -ߺ}{a{xc\7%g >9D;5ms!КRe%C>λqd\ g\N_)c 6`oSM4kr;p􆯮ɍm<<;|c*/r;'_h5蠦ܖI<{ǡĴNϨ?-l?ǰU&*X t5[ss%1[4 wh5KZCӰB͝ݹCƳ70_]kU?<8%xYΑWFҋg 2\pU=mȄ!;:xZ3q^acz'?*MX:Om 0|YV8Ջ\c2ŊI~奝 &*@S7*H8o)f}dN ?gFj[{D*PU;PgYp_TJ9PcOU֏͈qn(9X$ZB&g;ߑw/ӺcJtVz:2lG<;ojda"U0(R>i.#0(=|a֚#003vXF Ns1#-pQ"P Tösx(Iwl2Xuq O/kJO)}9?Z>D1ǦIA ^T?t1VϺn J`IeHyqڒb_}%nI@3~@e*.aC$OC*dX DߖjV~KntUW%FhMWEÀ0CcXI^vH(z`1ʉ8eh %u#?[ۊj2vY^MVCVIfՎ՛!X9uԶ4閗ݵ)u4$JzUlI}-|QT%#1\aA;ڎu+/J4V)됈'%hF=[3PNOUu>GyU;Y`qQn% ˪'J zĎo|R"& HV^;s$ 4i끪StS"@ ifB9'Cu+) -hz= gc^!o`@!Vp<;Xi rF3(J99|Bufx'e-Iyh${_sg9,82!G4HC\%D2E=$U7AFgVn)])|'ډ.]+[]!( ̩sI FL <;l=uYL>wx%ZUvha,6ϖv+H?)g9ThЯ![9꒐_KI˻kYֽcCnz>@}8z6Lh*.Ӱ.dvݰ<=L~ݙBujNo>:z`z WDZ%kHa4QY+f } H?4cDpǥ@ӥ s(/y3%SԒQemr:h=X u:)tQ lʢmUv نEP` {``gI*՚3ޫ]`M V}2cƭ10NW^yig٫%qBc<0ߍif <_$ׄ>^hZP5moj݅r ϣ&]FO Z0@쀱薎^w`~7 faf*>E \`vlDLaQ-l= S}(C D![ޱ^d7{Cpet)#ZPrU;I˙{Oo)]-rUѿE_U}~,ec"?g)cMܫJG$Ap~<r9F k ]Xq+3G:dv][=DvjqS`0/UD8s/ N6\sİcT\+w9 >!o64?Py5F|AX sh_Ԉ{N?j?H.!ʛ0mk}kSߔtZ`K<PE``FzU O_d:I- 23sYo:tJSsjw϶d5P&O㶒Xv,3A~W{K8qq)B2;`I]N"BDT`[3^DB՟Rϝ>JAG$^mK[)k5+N9h){?{ө"/\ɶe ( G/<Uvx9c˿5$#O,瓾&񝍮ek+| }њҩ @\z&Nc9bȄra\ihx<k2szu$樻mfBqPA}  CvHj/!"6.ARORާU> Ghb#eZdsrwc]PVG:x)bj-B 
}ʧң>P`Ii(dP"arܽa;|tS@$ZU-omnqKhOȦ@u3@4#({6,uʈԍϒ7|"[6(b' lv{ +/64N럞PD\edZh]/Wo@x3 ȇYYIbz|Cx҉Yݝǩ4F2Gq`6gXǁe x7ߪ;n!ši%jŐxXOΉg Ylr%eQCQcƚzZ^)GKm!ȐxUկQLna!:0l"795O*248 w*j@-3JM 2=aV$ECe7(xƖv_ʌ94RGCu-įTL{l%r1GF;WJ\ht t8&q̙2]vV_/3SWܥ΀xJi|p|f,  d ʾțG~] D HqhR)F^SF¾,㹀}yɛn&*a;Npï$ aNm*mYErsB+Sayw3s wzیdBkh.+uo &V@y!2mp8vO>SrrPZm~=Mt ,BQ1qWo@ n(r#ͨ3 &P; ֌7gO l?B w|õ{NrP1cY7ԝܻ&́ݏlle|5TDIXZ\%&I6TgIKղ(yU}4S wX앩lхG{*`T-A;쇡dX}™b7nT`1}EӬ0xi.d*⸡U⹝=T!scW p)+g9ǜ3*&{oԮ4@&XьdVn%G1,_P'p]T_#/0%==fVd= ._8E/PJiv,~CwYZZ[zXc̲Im(1]"2+10"=}{|g+B8|0jz"'G?Ѯ>=]n6qd;2b|h@ bl?e · 'ǣQShwE j ^Gk[nO#nͳ/ RGiZJpӇb3!f+HAHt7[dc6ԏX C՞:8ËH҃])jdLҢDTTP&3 M_ҵjtM%nPr9+=Q2dN${j _ǨC*Q| zR-N/ж?W`_9ȝS;J6}W|Kz 62B,{R,|W I#i.CSL*3C3B1#o7L;|nF:N)#!WFoJ=sV̤?'k)BCsCy @"qJҏ)8]I3) Qӌ/Ro0?IJCFKp_,켐 іS}Yx~6tѢW<>UW"v c?8" }U,e) >'<Fږ.*qD@,fՅ3#G^QcQݳ )v=nDkvՏtDP78YBQ[іHT 1B1Q3yu꼑1(T !Ԯ#IQΒ50@2'\YXtF rSJMdB"[L_clr,;nA)^mFFpW#^ln(7_.@%y*NO|n=Fz<09 Ӝ]lg=98腗ad*-QJԂ`9t4dlkcs=(BiW-?rhOOQ"-)s9b[-ؕ9X`?Sʐ x*rMp1DN)AOMp!۶B`sAܑF/,$KE//gkSy_t2mu(֋LCx eڀE/LMlgמ|x5]wFn1T'7:/GHDrFyrCzpoW >OUw4yh+&da*|RJ7k5e C=PUn^v$tAVch),euaLS0[x<^0탗@z_: @ ~yLV:0(]; :PjC:dB3I0e6]fM0mհsk)cV2dӣ9rtDC lPL.0$JowٕL?cE0KxHng v/'StԞ%;b[(Nx,< -`tqr. 
2B=_-lrZN?D n+}v$4TYU Y'>8SŶTbc:^I &ݡ3@˻ä>H7pځq@06me&5)QW)g#(-Y9ϭWz =E!ۭm+jo/Nk8ro1XUJS\%٦ i+TeZus>ƻ:!8 6I*fJ0#H`˛kbS+G 4(zz  oq0˰Nd[١G[ 6hPw΢N^XyBI9A> rfXPø6X˞0Lؿa׆|)CfkYqv*+6nseY;m{Bp9 t=NlA et鰁##p),A_m`= Jҷ~(Gք%Y\0fcFWrQZK#x.f^`˾4rfk{/YPPM[f]0@WU{ Vo0PL^(--v1arBp+:e{BFAfTm˸`:\_9T%!-Dg'dl`E3&HnASa cAeMA.hyo%>ў0k{T$f#[YhiɌx#֝5#5+2iAt(E\&\{`Z<8(b&VcTһ̜90K[YUc/'9GAP(e}l+!O΀ޡ Iax|`]WWΔ˙8)uuGIdJ$GRgK xu?^n)Գ])䆭oyBMu<J݀*F:Ra"#N ;G"׽EgGb1 Y#ܓvPgB(Y j"TtS%1{ǦR栵k' T A$^$,דv{t&mW\yYt' " R6U`Ȣ : %&3nPАx5x&{vyFkQucw'{>֮?clv5WQhn~ؾ?eۀ=ɬ{ {iJJlR<4upz{mP7<hmrJX=x%W\$+:_1v HN(%uA!>nCϮ{`"կ]>1N.|V`{Fzd~HQ|a ֪\zYj0 c"SDx]#_(!&>rDhW="vrxVkc69ube^-_Dl$qQ6;GSJ.(1[Ll98 G.P9'rǰHkiiGa>7JT2]ZtZ  EQQlQr}bM3=j 0W}A:3/>[|T h_h@zEI'q dtUx(H~ uZε" oMtU6ZDE?O#iCI Lx#]/Rt!O7G$_iDR+I30hsotUa ʰN`;imbd&"ѐ=M{W'HW-#`sm\wrT` Lz(X5h/m8)mSQ7V IWxgXW#e {f񀬥u=S]oB1[uͮ=@PvM>+!Z,b#!FqaTl3,yV\Ϩ0445_OA3&ߤq<@^Q,\cU6|å Uwbu}*?/c>DhΆqIuI<꘷7e UJzX$@]14 -#H)l|+.eyޣ77IH.M}?_|')QXe @d=uBR9TW;u>Ghn݉䙫( nyqZV!k}2 ժ]عwM !-܊#CZӾv<3s[jE! !΅/HY@ooZi^xHa+oV% )@k<$]c* _OlrWpp S <ڱX?B-sQЦaǴBz"KգJleDP _J3 Cછa6|\2^a((s„{Dx6zĨI1Dz壇ZL"JYp9ddmoi J>bHxj<(s(s- rir.(]]pT\pLڋu6gy'^*3=JP!=.ː0C` Ӆ| >lIךdXp|L= ]tMG7"RAYe}\6J5i /}~Q9+Dj9]$]+?sl`C{A^8ӄGI1V{3jd}^l;mn!HTv{^iEk pB=rC Eb-?{r璊w&=u^9ڊo5 cUk 6&G:cIw1T< tt ٴ*%rq~M7P'EG{˾($q"d@ k^gr@$7K[j"9.񭦬]/Gi_/c;̏:L>Χ7^rv)͜nAde?]TbSe:5PA>'OZ[#_=?$/66/uf.ބ nPk%GYJnt=7S]/NVpǹD8,6(!\,{ﹾ.("wk9ưEZ?R-tMGng{v̅'㜙s,xSu*kRi- 54~.Yc!2&RE Hk^f/7U$ŧԃL]׍dᥒ~T8!9zYx/N 5G?N]E4lv )'2%uvMb%"g.rMN; ikH~Xt^ Qx\2ubzH#b^(}G kG @n4$L 7>wTD*(ZɋL(&MJResuJ%T#Nmu_pۍ ו7OϺȶi)p)Q:h-e: r?F-!F1:c%Qwp6XGj("ic#V%ˣalO\3?1(+66R3tll@?u.l+K `<EƂŸ c8k} ]\1v- ϯ+c8ܱZ YuJ;ךLCcXؘYSϵ1\0~hکeByJ;5P~M5AeZ~NR@L8!@igHhN`UAĻQ:U/PiGnm]_) Q~ 2yo- #IH{!;-~b6a~cX-ag_/19Vyt1lG #ۮ_yբ`1=l'V.bT [A룸S8F[͂􄥟*T 6աw*vŎxhř<93pR#6_9Q[OYuqf<[LMVjB6^{ˆO)d%dOqJK$үU '_İ}K >DJLv#!PȚly 5lIV+xt#%uDF{;ezca>ț]E!a^n2LRxܫu:qxݼmZQa[nQ r, չ:d^APfZpeطݑfh 2^A ~oYjjiL q=߶C( YL8!П :vRaA Ӡ%ߵ~!NJ6-|vLIZxOsrrK =6AMDXJ Ft ,;uQ@E 
S.w1d}JyR`lLL᧽ήf9路j}ZBs?0])BPq-ÿۜF@wàyR|?DYeEnDA!5/f'IL/6Onp!x~;&";3a?=]!/QFZ5 R~["=r.Ҝz2x\Fn>=-VڢfVivM2mԻ};ؚY4 ͰG#5 2`5* tTIes+st1mԖ8]JKzH 8mgYRuǥ LC/[T 4]&Cq%v4>RB vR5hZ(_jSs|+R"٣,WZ eʊd/dAc)kݣo\c&zjv(d"Ÿ4C ng?d'v!PVJ-*к!a{kZ[43Be83^ލK zzcLU6KxMDԺbo͠ W_ of5'V]>OxkSiq<j|,qF: }Ʌ2\[Aeu1(L2]Aq|YPQ~2|UCH̗߼|fΝמbjC)o{|o+|E;GoJ/d~vmJ|8y |~CJ/_傤ějc8\dQpf,V4!Cwj㭕wRt:x}[ŏbcq.a!u??Qƹ , bO_p9&nEVVOc'xٿ]e(a&GЬؿrǝ@)|[g 2şэ*ԏ ;}'n,-t~ƽDc}`+[[%u>'D }_Uq2Z|:q2e;]"`lݲneM"u,g0 ūUE=RT$|G5/#`Rvf1uf茑 K7@R:ӤV"5H+l{g?SW#ª &Qo @~o IqmuV0IE39㑎?YpA)7{iYwU T/WvḂFW1x{x,(t|w`GG1OFDyĉk7~Kۂӷ(;.}i$ ArS~췣mv/PS$?+Lz D9zݭ{d 9QF9Yw4ۢ"?ԪZR ks; 0i Mtyٸ)sNeh>^kX+VDJ8Þ@(Š5E&xr5N`GTt43Ҭo-G]sv=>2?5d `r@f\;PCz?;hm_% ƪd慺,a>-"eʮ퐼^mp nwn oƪ=ܺ>1)3dϺw (`SQ`;:oq2%ڃhcL:ӟu2rQ̪LR cķxa4ӣUյ/.GGUhB(~Rgn\Pu#_dxYGJ|;2Ռ +W@#^s>]n ), lt4bJ6qt-uz(>`^ u-%X;gP bҏތR[WGHP?@6V1^wMӿum~;4NER czsYI?"MÓl\x +5iU1 : nyǝ- SbJ/|VHuB]8Jddc XYhhWr+HEڳm@ j UKp|Y$yr@3]$&H41OvZ.?n1طݤ+A` xfDvmn#ƠX<s7s7G\7oqRLSWUϳ>F(WO;VB8Dm+(zw?0<6Rx0Y/lo:1ژL#&'i%6DWj74<-I [<f4E=jUA /F[plF c8ps33q\ WC$㭾t/Obϸ ywhlXp!_[6i` /dFgWѢkuEFWOriϫV|e?L ћ3(^ʣ7 r_,LRG蜔=sˣ[ mԘcFcH'A)!Mox+ʯʨTȡtO$t:I4mL9OIԆ p-!$7 )Ȝu3ΰ/2/=ky>'^ĵ]2b, ow4TisZ_UuU>3^b` Il |)%80B٩ JsY2~Lf,3x-,^$bl\}Sp>I5G@<6vZ&{jqiI *=)BV?'& HnXp U내]Ez2w,= nE{`3QbXHƥJdMaa[g}˖X%qo&q?ĉ$a#dwL4)Z&bD6!Rs/gMٰ,~:bD.8>F/?#"Eź앃w*Ad}$RI4-{yV:44/>m“ӿ3fY-xNd;h[$/ X$hOSߩϤv5]O-PGb֭?t-B0QE.GR_x*f[2 ($42!Ǖ2 ]!rOa<Ǝ.#䞥(28 9¼_3UrS,AӁZ"@]ej,.[MPv}?+=RUGV~5F Re z?q"f5 |Q_w0;CLED0q)X3p0|7뚇²t@iHWY1˽E>*)nab/xѱQߟK X&k(b*gn`n lvNEw]Yѫ1oW 1YcסjN'+5?T701̒jB '5Hf`8娚E[Ēj*w |&C J2>T)`fM)VLL؛z-qNgqZdr~h+5@vŇZaAks.o{ E`29Д>OL +qzT}]Xp'$,YǍuF(e d LBցޕSX7 P-JUt&(~iGӢxKCSyEف˲qiyY9,`{ ¡rbaԯΐUB΅p+ľ3y|%gE0F.=+64k%Q"Ê $UH ;C"'Otz,QI#x5鈎_9i:VTfU5B1,Rw 9 4ڭ9}g4w;[zSyxQMFP O(S O ; ;MY2!IFTN vOӦA.7H(+Vn!ʒZ 1K#)[ _Fs]kvZX8q9*MOʟU/H-asО`ZC~{\zYUejϵ%֛~Aԗj9}=w]"fGVibAiΰ?]:H%5=uuO9ZH:`z%v+Up߁c4%\=Clz67"~SWzoUhNtDiI^]!T }ڞi1 6_ i?TTIco{KGuRr=FY-"pݔbkD Rsj 8@Q 3\_֞, ?]%j)Ϙ%cw 냳m`O3k([\A( ; C#O;:qLbUoZ8ca,KC$ `1)XuC_/1 2 mM~${̦,%8w3 P*F -xJHs&n@7` 
g6IfPfG&u7A;㔽[ [H 9$>lcXx@48cLrZ e'Y PO/;Hާ`̃t(|mUwjɇ&rpibclbx !¼g$Gӟ;09b%M7=a:5-юHt`ԌGri>?!o^7`Wu(}{At[.Y՘E۵5Fo )a_*}RqXRFbX QQ)pj@/30yԤrAVraiiRCR-ə4!4ݔ..|ú$RQO&H2#/)~E '7(9GjG0> Y L/{@}M5Qx0Fp):C'EokkFu6h-ɜnPrlgg̰˙ܺ1$fp] /(a(Pqo:e$K 5/bkp06?\n*'ߩr}PX"\W K|̧WY\XJ#l*iS+awmea/!8,D4 ȟCo1ϑv(R34g6F2ʮ 8q/+Xq7tZ[7 Ձd(6cl쁍Kأ]z0)!oWE9ڀxK i<~\߬)KP\V = 昨,"G U9nߦѭ=4?Ci! "A#v"ZJXi~7D5b߄$"4/XaFM ;NmG1ܐ煙7AK#ywԔ>Հ``DO^ YQy7M{+ޢxdA0A4,~ĸk)KwQ8eudB%9ߏDNq )N_5qXGyDU5}.9``G}v 7֪`li"-xX옘1M/{d'UujTz-WMTFsbr~E܃oT,Ɨˣ |;!+ģ{2v , ދ?|1JoW&,Uұs_ڈ@/_XYYrYy%8w vfVG3Vf{dD3O*ぇBJ#:ap7dWweek@gn[̎EVj?ʟw+HnOj?׳U Kp7b!\g(piihGU/xVE R)zȨX䩞2L-}i_"PI^ h0X <c[8Lްؖ D ޲J[OJ++/v:RE# PX+aq{\fŀhil !%skF/hE a(MPlZr2ٻpKnnoہ:>5ɭ$?#ٲFy1vbWY cSO 6bRkO"=OSOupWhe-3znaDM攨`H{m\2w#خ5*\PMhK_(975XtS .´UڊiV8$ Th5'C̀vjM(pf B$Ex$6.7Ft&P*5࠯ ._[7o hBKB8/ӶJI͜g=4j!r G_ [`@L/AiHQ;')ܘw>}w8j""*`MnףjT_00;oҊrǠQ2Af¤W^_ z*wt grwaj%RU/i;[(X)vI5纍ڌS+CYTiAbN>/z,x U*@o[ڹϴؘ{FOP$k?];fC@rΓ^uWx1e}fY-Єb{Aÿbu(@\ۭf׼2jaGiK;t HSAť꛻24)c끱}70Œ$yǬrL}Aw\!7I\;!af8эoA X1w@oO1];Q/ qQx답G!lG, Ês1Z4@sPΊs_$E@wRñ zrkFi.N±pI=Ԩ)ޘfiRQs0c(<4Img OcDXВe,)ɞJ] /%)DzO"]Ah0[ _L;4l;[}i!"P`pid4+"$ڬQ7-anSەޕ\ l~Ҡ; I| G0o@{C NSY4ϵyYx2 drrnwiϘ~6l޿Ņ~2yb֒ L2A~TaAyۈϔͶ.9 }ocKY醀.Dgoa۟,1FN$OTI0`QaAE-!C2$RrU2:aB"Cm!:d1O0 YBYFYv6k_001=ozPRyShb\ kôrqk&ab@c9\I04rΡMZ˦rSd5[V …=qh+vek-DWycŹ{CxGeEmIWpp/z$4%WY/)2V#2j`ɦb DJs`JaM t97WNY`e%% qB 3= ,a³د %< ?!K1O1iim91{4/t #0w?yUlLXN͝v7FMojսԬc&/$E+Ӈ-خ!qJL./KaDCRM3] Fc4=S7g& X~DA ݫ!Uv7|ƞ2t1Mj|kmCL>k8Aչ8'#˟k4,"a݂5ٗGGvg2m(9Osa bUzAl,Vz+) (>e HkEuɐ3Օ?Q}Em/mxhqQL2> JzjWM%)0]h_wbz%[D*3Yl5BGT,*4JFqW"b !KYR4_(O%>ܕտ&OF-قvs5bXm'w$VI}~4R..gTG7J>aVial{^ig?!_ _;MM|zjЉ{[< 6b6hĆOY2e6J7R{ޖc/^'5Ӕt3u 8/,r|vbJ?2sglMg?x-cp[B=k"Fow3y|zT93 `ω`P*B9{׍/);j[2o/#E%W&lvfپ% k F8£Ck-ELxWs8-ѠwMX8cT(-j )3'*MF _2s$j)Ŏ&_~fkIzZms#y' md7ilA 5hBN&bpNZ=4/oD [*v/zIO2n_ iB̒⠭?) 
ϑ&JyV͚ Zx$Ioʳ\uSI}h/@ftf20﵈+0O1SཷRk`2HB2$5-S\n%OD)lQJ@-i[@ 5/SjeHh2T&^z ]^m U7;{=\ayG<n+x ҇IF(.`INۛ $f<- V?O=.U IڒRHt/rʳUHlƵfצʼn $M%bCԲӡU<4{tZ6wJ&*@Nl/IxJRommwVqdMؙr?r= E*6t#θ>@U3?cP~06=1!}D !@҆|m:f>V~r\suQ vpԃve}Ctx"ՙڽKn]P>c by&ើ AR=dC/$߅-NF*0J-qlav*N$n;޲i?Y"I \N$O{C(kvr9ֽʣF֟}it0WܦT󪂊.C&2Sv8M$ mX-҆i v~/<˄MsJuM/^J8{7VY"-^\SE󏯱hLkUi/Ls5 z0E!TP" WgH%>w0TGP/as}Y}?&#EvfF%6D0Z^ZX׮ n</L~"W74RE  Нj> M}'&pTEI Geux@9|%Jt9( C(<TI`΃P;Md磓|Ӗ1%X~1O>eחe8Q_uJx\g'y?A$lM_ Ƀ W@]ɬ |>u.|nffh_x}1cb5v4QӏO# PvUCl;͹r kM0 <|Ә{QRZjeUpyxO5$j`W"j8 : wţ,碭8H}K@뵣[OA ggx1 Y+:!M pRe} }K.H 8S^<>& ÇټML`4&G}õ>7{,o(zNe5:7mcQJPWY!fBWj}G24~(0L= w}vXِS]#&!us&!ooù>SWbLv FdbS퉕> pƯۮ!'EcNlPc4D[6y~N>6 ]LÚݷGbUzߺx!sl3L_1_,.#CA!^p7%Y*TX=('ޖSL.DorjU'AcWޑOo{=إWǀl{A+l +]dijMzTΞk382kORg8 wI|S'%#˿ݑ^!,D`@ ~b^li#י{Z@̮f/Q& nZ@s^c eQP#a:,٦.S3})᪩ Srwi"mkJsyCNstU4Dt2~ӗڗ3}O[G* wcŏ:KJ)]2MzKYMj~xG3'%kvY\shM,{iI@/n"k). "ՌB)RhXL ۤ򹣜>wƋ*$;[mp;WL!;--zzxrMB4>LB"-EN{%%x1ڄFl ItߪH.Ohf9q\+@~ 8 F)K_ɭ8O|-7'ǀy,D)3rŁ/StH k=ٜZ0n4#&"/emVՙCy`tR՚aHMWpw9nS5vY>O8ۘMRk7M|q<,@IlBFOUڥaѽwmWj3̤ZG5;(O8b:f<`]s@ NKd m_ QhQM~rm")%*e6 }/s{e}dT Vav2MQtttsERj/~Z L ;ܢ~|=(2F`b^EQ`IJצ1GCSi5-pG,e*?np:<мqmz̷3Rz:5bKF ܠ@o}&)OR]x}̐Aiޯ-MF{!A4-;"Ɂk@[H#ZgC8G5“AaZQ nra.˃*!KM('y. G5P{ҵ=8Y&[!0m}BDipE Կ} |L#mvGEw*f%OjO^<{{PhiiLzɴf ]} J c(֢u9Sn8uϾsZ4lzB$[!0b@Ĥ+b9k8]h1fh+79>zL4lhKmIgLȭZPb3M1DqI,aS5j]$?R/S ƂOe6UkhEaY?(@;<$!VMl 0'!TW e!u f ͒SD/D +,,F-kƆL) W›̃[v rz׊PTIKr&Sl݅Bl]^tPh#($ՇxͲ`sW@r|xzTe'32Qߨ'볈Yx =PFOfƜ9Zv毹"m]W~ú%h;^GYS\aHxt6_u鉓Nw<`F'Ýҡ8N[U{ HNbE%%HOb܄^%H«ghu)xzG~z8@fd |N(! w5x݃>"O>,$um8ݢ$fqgyqܿ qA6UaA3ʐ.ĆODꗆM2e9vE=ܚ|I` oTE+utq:~\>@RݵT('#hJj H JCeZ" &8\̰|5%NCD!&mdRIN хL%8'g}QiW薓NE\խMbK؅|p҅M kA9#ӿT\J~9bqT?j)l9v[Ds{*ћS8v"fo/TA6.ͫښ*m/ޜDVå6 InaFTw^x0SKj~>Jɜ^f=jY&yrZ. ÏBc'Bh. 
fcF+]urj&t8Q-Gm(0df &4L'MD$ʹ= J @(;;pN=첫EUF4✥RMp\FŪ 9~<*;ww,N:JH=k`X0T5pB%q$p|i٤#),'K4tӱ՝d%N-`䯥",|&Pl%Sݜ vuډ&= b85hAPB3<Ѹ0>TRpwwYU%G !0w1ضI@k; P~{tIș?%hLI <5r@Gm+؏񑽊:I`+eZc"3кxV}V򿠞lGXMTa){+;Ig@[/}f]?~5UɡyHhRs9ОcGN3rXIse|h ,]HZcyeyK a}EBް4x`paTBN]zc [\H޲%8&>eʏ3`+\HpX-M_AȋU9=?2~Cng;itkmvlx"rTnRbY5z͊H0s×Vd@է\GA*:H7CFX+`}GpipFEȆAy4 d-_gR X+X,}~ҵ) ̟q¶lؒ&J"`L*k, ?I_Y4HmzOF3OHvj'~;΂nTJp}EMy(=0=Ymg@ )GwOFτo#{}z'崆d7O!+C:WpG br"H)LH%Qif4,!C$T [lDxWohiKe?|5uͧ7defϵv/Pe|AچJ:톦Ee(Fc6[0`ߢ xYI;f p;Ο :؄Su !kl͹ AN: r ydLz)º\Nz8+PnSiq.ٿ_3}!*!~m8溪NF\{/~ҡ@Kݻ Jl7qn([ۺ@×{j4U2#a"13 IqEG>QE.^hwv\_ 0n9viuy)/Įen)míΡtH@}_ Kn'S}Wgv3[RAROKp׉Ƌ\N}^4Hw>%1|,EJHsdBb9%O6:eupAvBǩ yN-Md,9Y1ؾӋ&P?tA kA 08h'Tw9CZ']͕QcniR:*:P8vkGj2TKrAَ)s]b ,/L/y=&tا=E{7_uph ( 9By-OC,8/(]`z/y% &LOy+B?zd? #8Ƹm0l=B6 %Rk"bpi m3^-=îЗՈohF] }|,^ wlgrb:4KgJXdY%$X 4@fDO=m%ZѤSdj'\0t&eUE}`[1d=!=9%Mp3 !u@my QbHf)TWwrf] {YԄ8KB(48c;dy1'3|||ʻҊo78..o+0p@Qd^q@:5vJd  s{ݩ- p^l9nT5@/  &/uYz#0\ÑEyQDUv݁si]p4~rzrt`En/ܞdqDR}Er4XR*gYV>(@C:.-wgC4N _^u?(VѡoVȷ4f\%,@u2?^`ٗ'tDJvS6] Lf%*f`<V8Oɇ}F=&BΚ ;2.wtH|`ܚps'ƽ> ^ 4UU4%<՗4l },y] !Sj uCXE˺ư](M-1d1 ^.g^߈oL]"X>b_-OH%DUJS-^$VhZ!!~G7#=wv`LSxAy~DlA2Y{@7e[1DuQ23oͰ9 :utJhA7I#X]/j3HSJj'{Wty^? 
1XyN%,n0f"Ғm궠4uAkxC5(fB,~<)bƳ$2$="@tuݙDŽx;B1I!-l),%- ]NEKIP 8rO*) m4۴\sHkcnbMa5<'cV-Sܒ>5z+ίm" Q q u{U^f,ڎ9ҊQ]a !7^%vd:<0S7Rq{U S3;ԽBzdS+ <%a&zץ8 "ĺD'M>+s`#pS}ba/ QC{6ŶC!gw6# MH$Hs^{1Ǎ*4fҎd`M`Ḓ#Ԡ!u42 īEx uKzcds.QfׁaPצɦ}1ƴ `H^t'Ĉ-wf)srolRsz[lꃲ#7`Y I r7 HF㩚`5"1uœ%dfa) 5S;Ȥ#DV7 #8uUqpz`(|z\HF2֥VNn?V2.Y[vQEcކf뱢?_Ov[I#XPl-cc9c0,cM"af,+>pIDMu%PnNs(7DhɆ) $]Wb3 qr.8pkNu:9pDڄ3x REbu=S~}Kjۚ7iX;"VGxRw;]t|5&(Ǝ^>wL 鞋y5 kiJ:$XALd#(7dVH2hTqq^2.Gn;ڤ69Oژ)O2|׎dLhA^yG{V⪷LN0h=#}|'Xt)J4Ŷ#N,wc4z.ST,`ĵ)4ct= 5$RǴt܆ -Sg|cE^ 2ʸNJaN{l ohW!pl0e@2v%,-@TբX-e֍QoF )qzv>|*KYL%g?ZEH>j51AN i_3oTC/);hE"\fJ(WO쨸0\Up+a|O2̙wc9I!Rlv]l^+)HY t>׉SeyD_.ܫjupH7\&Bj;QfEنX(P:Kid h[ Ip*(KL{#lv%ٓr7X/i}o>L<\XD` |F.~db1!$iTkS5v \p> @ؽHI>&AM 0ǚ9z9*8CK'!UKJ/+ q<^3<蜷q8]L|VboizƱvmyq>hI{8fOA3\5|TBrN50MPSuQagU!2j#GMX-6U}Y+۾ !VႅCp$ "ke5msn}*b0#vkPqrܥ2_ .JeSUx'H Feks2w`v g;jwhrD;)1SJ7`R!:֡jDϜΪمqa}p[/Iy774S &*Jvd|`kZjw*] ߅Jo~О,k1hD]0ZM+@%X6 `kgS+dI=塥bǯ o2 -gv<[\mIQGrdPxhsUfMUQ2.>qOÃvep;3|5後(3m-` KmUeb3O09ן닝b`}z&:ϓDŞm|Q͠Rƕo66.Р.ڧP9]Lqi@HsfY˿ nU{kS~5&z>dru=h@hGvT }R@*r/W-u}ixuZXKw;|J<.K1&FZb!;Iwz)Ӱ;:~rw/< ;[\8&ML, ߐb:OD.{j awƱBvyt ^0Pe)6]V<# \ɘ^ 8v{s㈹Y⒇@yL,Y7M:Y~u&\&0fc(Bqкy=6}jP҅/1i>. Ҽ_뒳%3B%HO`A"ė Oza䈱 h*盩P;L|QR[$+*J|;Lfn~ Hf68IjYAAylp\y7qWcrݏqHQ.acP7mdɽ3O ]^,Bί)BJa0.XE#@*Ǒ[6 Y$ſ0te}Cpz#>%%y)DnPx4A-/s{ yD,Ddߵ 0!ܗcy/ mB.~q,K pKJ9%sѨBLF++RƬͯ"`v-*2ǜM]=}k P1V7gy /O]Yg` :YG\j{ c4CFr(Z|2i%>u|ܭ sFRΖT9q}zDGK0(ωӽ*}?vl>:qIsVЇZk@z[:ògn aj36SLv#Tb}~k1-,u7;!=™.6%L[\kۨ6͹.丅DbZ(? `'Ia0 xJu ~ FU Zベ;BN+Nto} ɧj̜;t6uB`\ ͧ~oȫY1 g74 | ;G_s7/1ȭD%/au[aJ۫T2BnD$Y*=IH%S(6ZeMkq_T/ >01:/v>Q>*?;/5a `xʤ焖hC+5M{|ZDeKc/͙p:"7a+~&oug"a.~?t3U[K6sgq~ &DsfT`!o'!ȧ! 
YaV]$:gUG31:qrڞ$àƝs,eg/ӵ}eieM5ukD701ԏh\.w80 2WƟ^ݞЁ' (ko65@z1 *!e rM'D跨 u<$ s[b j..tIL?4P@[tC*VO3zFz;2dx_@Phm4pj bzd k `O'?hh3]@veyIx,͢Lky.&kʳ,YFi?b[2)?,g_ԋ!Q~rnAf_VdBZX +ݯP%RU@aT#D h&gvz,wFm5ʧ,upۛ4Y} ?u̼RAnسSDʀq^9$iu(DYW W &WnBd߹DmWq4s<QP3!^t-ڙD@6ijЌ4lm r'׺8\wSЅO =35bvRyIYouxxڛO_wwUiMDױ63]ť?hHEavaRThbGSCyޕDmPϣ>dž`ԗ&MoqՂ'",+>fzJD"ˌ[=0,)&YJެI$oHQ͗c M3_LVPHN3($-Wky G't pе 187AmjQ"'QۂEhKݹ?2NݷS>a o)Kom,Divcٮڕhvyw: OIt(t;m2-kKۯo>pz~~ @4!{3(a9qieD1yƖ@TIyB$oAC2>D v/BԸ|Nw՟%Z0kDrx =}"i˧K?$(b-!,J`_.C&˹mpJc+5t[ox#*f;xSx"*"4) @wx9kWAj7}u[kԁv1 I?UIse:ѯ쳨4 RO zv'Θ-X9ߘryD' VȨm6{/I(w+㾿/ǎ!IQC&gxk7 ADeQq)n!$¡_j6ͧȅ>?FO.֒9pP @<#ø Z `NA [ڬ3\ZwY3x k7czh@~H^CoYRvj&[(_߱WBx A 6,=SՍ>D!ԟ+hJ`^聁zmMh rOVх8Ѐ$w fsd׋~^(,Qm^uFJ\S_BE,*8BvޙXT=ޔg(ymZmPy`!)wt%3JO C11tX|'(×܏` ;_F_: ZP>/義xGoItr-abGӄ P+="(aoNQƣ` 89-o+HR`-b]HF rBbNKּ-P՗ŕHKg'v u:iG,91&p=_db+hO6Vvt|N̼C:qF>" ŋec$FԸSGu82ns:.rؘq5=W.z WcxߑCJd/WxYqo0u%]+fS߮F$/2 j-l`L+2I8# 8[5XEY g;!Yi\Rj(~8=hǺtwS3SsBoK0-b xH jMahQ=RTZu !3[pKkG1% CHR$JLS Dn;6_p=tu"G0hzju`~k* 0 15deoT_d0iMsShu|D^ vtN]O9Е Ab%oc܄C,3ۙh5wp{ZkhB@*=L[w[ zM4+1QktALH 7)f1(A1 sN,zLn-6b}7yEDy`'%S56a{Gs&O 8 #:@fS~he)Tk0/uãL[Nj8< BHџ_jD6 Mq{MQlxJ|BW\komoaŃvDDxD53!y+lҳ_F7E% Z|l5IQzGE%y]Lz" +ɡ-?:n7&>.c86p^Է;![U&]iʨ9S U=GWf T[pzέOUtn}#3 n.F-67R*7,SU ɧFG%HTP''rZMS2/pzh@dcjWD)? 
}>Xט2zu.?Ef<5昪Δ-$,u{j&oIR2 9w[A$e\r( o>PAӝ63o,<m\g~ E|Bh@}@D^&w OVV1rp{c/4FfZ?k*\%b2L#h?g& BS_ v w@H+Sh&*k{al>H\y⨒Mlv~QIZhl=e;je]DA^/Ԣ+7y׮ 5􄚮%8h;)Nϝ]e/ h<^ 3E,eؖBɚ<:/6/@}/ x˯g8{ͻLKRs*%yb$|]ߴɂnW63q\ 8\hi®ϨUa z\.:yKUȞ0{DLk8M }Sut25Ei{%w;jbtflT'荧|,d /1l;x\30қ&D&d2 Ӻ$ k1!;1X޶Ѳ\ʪlMZ"F^̤Do:5 R0 6IȆQb躞b~W+h/$m68;x5e~k G(+ě& r>2HK \.ҍ-Hkm+W.GKjҩ3' fnY*p<6ƔA:ԋy#}a~zͣ=/ T_uPM qjIN qpWn97pVIp44wΫmd).X6Q2Jh+%aFHK.'oP?K5X|'+HߢMa}'{Z UX9sFڅ~}x׼icZK9*ǜza&#\ ˄1K(αqа&,W \ j9|"ou5<(_yOC ȝRP;I;'PeŅzB>IUsuܟBWMq⾎kҬAW?P*ǂ'F o46Nc^y\MVj : >mfY펛K<'(߽^+&b~ Y­؋?`tfg DC.VcFP+*gJ1[5.ܼgt4vlŮ6e<t3 V~?ewh=R̶y}Ndna™1F Z%[ZW5oMsdU)trVDV$; z+%\[-#aЊqqMx?lzQ*|ˠ\1ݻx`>PAzċ<50Ҿ0~rUެi[urL@9-P-ώ\RP]oe*#%9U"8i]sedlVvXϤohG@AURKS86i6;CPR$\rH.v}ٷ{=n&7&FN/GZrC0;SB٥15LZVV.%ųdz29BP$lwn 3\U70=Nsl @x \HRV%Cnntq+6H2̓"~N ,5zb#V"jh$s 1bn_k<9V+ڎuqX10y.K]1>wy:>W=;/V`,-6'Y]tl}Ud#%$CFX|IR7Q U e6*׶πȾӿ:N[Xu!c6JYzZvm9miFrͷ#/,?&~ ]?j' RJ>ʑ$bӏ5gCu8bpˇ*x#W T>j_j(cWThyS~qrD@-B6fV'ڶt& A62?I& ~UlXI}1H<^?-G=C%"Og7=0Q*i){~{" MhA 9iLͨ}P8gv<+Xl~_ "2;0\%,sBBV(wb keEKY Ps[j=hg!%x"4].j=3胬7ʀ?OlL6WJϪ1BjQb8`0&t,Uia͂:X9 2zSc=ɬD89ܫ eK@QD4|u8(~$g`,I،mRƒ"zMj@sLB^9Y xjrsm1O?!:<4Y04Ad=J:SB'wSƔW8 -= cW:('wR,"{T@2ۚƴ~);_t@ӌo&&&rR6W#_^N`C(TNKh݈7htBrJfŮWS>{" ]bX%+l wUZTїA|9XqUJx㝏o&g֬,]MRՉ7˗}i$żHyw~FykhjcS`T7%6R!Z躬GDzbRc$/16j% Ɂ)EJadLPFwZ4ǒ81^ҳIch|VV[*7'½E r^PVn8N="qئiA)9ٔ9 $H}Crٕ[(>qgaa6 |M*. =#vNxۆbh+7ĚqN[G| I4~Ecw5fnm,-V$\o+cF^z-YİMLg@]})o6վBEK8^H #Uַ}$aMXi"vP866eun'я.\EF6fxf LLQKn+ ju9ν*JS"4P':S$DDھp8Dōk1Ew8~M'ȭ1{?n@Zjq.h%E#@LHSSOX-oøl#Pi>bF;MsBCk6F+#C Z+Tz ; =vo{WCz2F'>Q~86I8z:vYa>G:Et, :ㄾ%0e?XQ잸2{Y!8]n2妎;R5.Ѿqffsy'p2qsPbc|ז< #?uss8~/vyEA'QKT) | "||h 83۠2(ę)/4?x2ѕXP>)¿/ZJ("cafoY8ܩBf@=˶Uυy\vLc'޾% :zomԒA 8 )xݿXs v敽樐rrDQK-G,*K{e>x ]f+?.ELJ1O-}yyJG^Fԍ^ax=2oBp UClYTZd *GLyk'7}sG>Q7CͤB?2D4jVFН6`$t G LR<}@``Z\CM0 mJ}Z:K0@|Jl8'|N$م*3m'ϦENrWB-b%ܲ]< To1|I:@CjX&)kzwH =O|iTw&!_ՒV̌*7NbO9p.N }Kħm5Vƣ W2R2.!ڧ[d9ƥ5Vk2a{`;-F/,OFn؍ܫ;%M q3! 
m>DǛMsL׸Bc1[D[o^o)*w'_ 9"s$ҡٯ _34v,@P&U++\eć;Id=4;Jf\vq}q{/ROIy0hJ=iEj8'a_x2kjQ0|QU[₴^w-n5_!"vVΣd]3Aݿ->܌ڸc'>njzS5N w5a}ars?6}s5ryzk߈C|}ʮ'vҎB(7&npl^pD}0h)% aq#ԣ4St9CGEݲAAg:T3A]˜ ^"g*xF8qm-? O-ʲ {`9w9{qWi;{Nw^%MuL4_ͷP'+^14$(6gwzryt/ɞ|G|Fc߱1[YB[g|$&rQUe]&l{*_lx+nVý~NgG+gZ$/$e[VŸ|Q.AnX >~y|/҄4\a-b@GO=XqPY 5!pdrO"vDQ> = !sN&ՒǕuL sʛwҮicm[3>Baplī>>y|UVɯtGq7wV@s)U${dgnB,~`l2!d =LmfvޟI)7cdi(^$wezѲͬ=fٛIV[kH0>m zޅwJ{A֘)PMV!,o(%rXE!cݑv靛XqC2@W(tݖ}-dDJ|a؟@v1 vm obۘe"aӤ?=qZ h;hVy –X$.KNbEkJVp>-nRr W㼝(t?IN+HuiXUPsrjK- ].zGj;΃DTS7BOF>͈Q{3,.):6\ּxdBR^S^+ 'RB緖|+nM.ӹ4编꒥[x(i9u@\O1!*ek h HOJp.׸e0Wq<.^chI;+. tkqa>t@Z!,>YwPsqgv0uV=T&ZO\,,AVg0%&yϫ4unq绩vn+`}^ CQ-?-S*w)zbR9M.<$.zVx@?YP=ޔ,bDca Nj!ywԠUZ\)Tcbo>.X7kTrHfёZrçp@EđB@>;;BRs]Jp WNmlöc,JQ86ԑRA´gVDj*$ 蓨ۼ0Bs\}ǐ/8L?.BY`\?6Ia`bΏyŗ,ajѠq sް2WqՍbz &yc0hL`CҜm._Pz#AQKPŦ+tޥqƞqeN$Wz1K_սܛZbIG˩V, mf7w7j8+ sn&il|+Wu-/0#ӓ娇,Ӆ\^=EMz/ye^wUν moRꦁ︍^3=ӴI8)@f>[1g wQXgcʴȏm;.dG*B!e Læ:=]n z ȼ+~%PH~aoY+r_΀_Ef?^[`y={Tj7 |̣xRU\.SZ:ck XgK/nINb[c ɌP4!nM^"(TYaqkX[җ#X{V_]6S_cȆ #VP}87Və>k)hnq̪EALolj@n>4t;}a5ci  2 c rx&vdȃYOC\]MZ~p(_'W!U2{I5>L/^/})K}Tp3q!<ȊzNHuPt;8Ԃgdr&שj:'! < j)!~r;y'r2 "7Ml@Ӥf| eBQB ¼%zͥܬ=ULFz_/v9y?-5[E6he;ISRkWW t؁an 'wb1TY0o >u0{냡nY'.A^V3gyN0/ND9H/G0>S]8[NOś!.V\2|4e>m{h1BNiUjɝZtq^[mcAg@^*"wՇdWO^Z.'߇u琶Z#+neĀbTkCns_g-6߸=TJ$<|мsJj7=<{' )[0.:_M;F@MmeGkЍ218?CՌf{A'srآa/l17fPio'mYF2EN!n})|jgz񉶔U:^L.?zuf#TOWOV->]EFF.Cĭ:epC!AJHG+ߞyP 9¯Wsbpq5C_HMäCA\#fx$eB*fU2ZU8rxYoQ5?B*_ %jxaud6F2 ,c ӯ cK&ThK; x|~pfbq 8Mhty3+؃K_Lj3 "IChd+8 )bK$z>/ss8gN֬Üw?PbK,Є$XswX` Yrś:zP Ϝf)n&4ީ[OzgO^ *KSpB~/RHYf@dnouc7Xi"TkvmZX4y(TJ9Wv -ں0 < 5*}Q"3E%u%Lymͅ(]yey*FbCbyRQmپ@7`@,n?20DƖl*!ۍڽB~%L|yP+Z -כ ѳf;SGsHiL[ lӡ۴{15Ś;/&&ûNS;ҝӫ`/L^ |@h;(埄K2eܰ@dQ74|%B/rk>;D 3X]2!-LVsC/#!<5$GO@#6L2mOfi φF㜯Fq/L yB^,F"8%:7$=絇R,7>@r] nq8J uq>-0L7`,^'te[Ys:vo?Q3 5euO/ a5r ?8luiչ_!g vJ&OT_jLM.zz^\gpCN.e9nd`!"d~2C?L\.HXRMM{НH ~]uDѶQ/=)GئL [㥏Os ]إ?juqIIye- ܫ.j< }=;(s/i ׏B'NV.'xޅ'5*BZ}p35U[*]C jus`?#}!loR@#.q `5_7} :! 
'?v] M*qS LZZ[ʰͧ<aqΪ4{ժ#pK٫|ΩyFvbNwƸ.B0˙e{5c~#LINOb "/la2 z1$UQOw֟6,ڸNW_KF-VR4JmR@GdކVQg ݹa_T[C7KMI33 y!3TA ߇_CB-G68-pby( I$-ڙv3:Bs*Z.L͛ZCc3Jȕ}y}qիDwd(n+ŗC%c٥YS :Mcf {]'[ps,"t>aA {~X+>YJ4A?:+߶7PIjq 8fAZCB'\FɞU*aSYҋJ h4Qé]';m\r|IʔCQhv3Z~\H". y6sۖ 2DLsh%"/OMQ'o=b&G t=.oB)UD gS 6Rx j\xc! _z锎|/vGƵD"} +QAS|Gsh4|:J[r*]Z"6v|P19R، ,^懲4P{]bo ]w"7(>X_e0_ݬw܋NGȪ.N:#LMl~8%1ϵ$I5ؿ߅7;A ݂źˤiNe>;n/_vN;"~3{2[Hchܻ 60't+ [YaAE4?ɡȴQ/Zo [K>->''Sh@lEu'? qsƝansؘ@@cD B樑Q3Cyve" B?_  #jtp0L+{le0s{;ڢc XܓmЊt56}t_sl>Aۃ͡Lyk`߆ہ51,'(a,][ehM˒^_d mSl0QTXQn)eP _ &է=qe2qMXk s7}Pˏ1#]j1k;|Iz}'ID'k]0 QA'`je"u"2 "/hK`U}\=U-fz#BI9; Zo)d m4I8߾u-|Ou%:vFY6MiPI(),=a{އ)+@Э6GN]#r2 oMӯO3gȑY qa`ANk{/]icW,b >Bw0W l` -AJi!qlcq[1?uy-`xh2Aj;k]a߃#~a,CAM=F|pT څ'b3 WCݶG#V/_f#7ՙ]C?B~#}e(ײK ȟ畲r'g*[bƦbG6KDxTɨ̓]9u+K]D/c7γp%n_Իs ' !?8vQ'pOd"#a[܏%?Z|WޞW<@3|B:v A%E]/Q}ᣥ+/CpM =H;W(C:mU2=?*ܐf9BP?;V>oFXf)~[-0g#6V});+.gWOhE_*y^Đ?l̿(44 ;Gajޙ7q2e-ԋ~Qqwx_^ݷ̹qu&tjl-)πOTl}׷x}(soZ֥:3'y4 8Q ,H)H i{{[(q9q[%M\B`^3yײC&(L FGf菽S6E\^ =>j /6 屠tk;,("lxqxZ޼!`wI}'eѱ+[ޯQl W'ҟzs%Z;4+p!R[IrKY;k1u{EYyfG" E0Ffĭ'EuN{v[W0kײ3fOl$'#Cc5[F@\!z琶^_A[^T9HI0HXo*ǫV9a h̸ @O ˂\xJ,Ok93%/L$,XKI u/,éM-28ǓZ}0:VP4, ɕب9Y%yn@{KA{M0}{8{rDyĿK h':wOҲ"Hju6[BkE)[p0 [O+g^=+\7CNsP&2_/TU_Mwt1prg9*<"=UL۝U?@5RJ)A& uqPb$~x=2k+;ܭ>/ N O Pjj;ʝHo ʹu , ߘj%+qo:29/WQZkʛ,fԿ] */mxn~) qw~{<\/-q˄q_I;6ڞfPd92Ac |9jp#||G͖M)=>7K8@-t̗Q-"% |1杣!9J_MM 3XYEkSr?9;n9m{2V@3`ZμhjD8]Z `uʥ6fE&jVV=3P??vAhIo⯜f 71'Uej$0^ w^NޏȈ.+z jPcS]x~!ʊKۍxpD-5x(k)kpCoYN"/ƍ p*?xfí}On˦u]]4kGs-:D_+"K?)G@J3{[ԐQ~%5ޟ95t8s!W1˵°׸a'-Z#;8ۃ:g2D27)"9{FDҗ:]鴸?WYVi!b/U>q Rv8 ZK&%|a0H坅p6]П[vR3QgBN|0 0֏h DB{ZM4/n_",ƎYT+f-3TF}ۼzCV[R)v'X[G;hqˎްʙر#9|@EDq $a'>UsFb1ppHRȑh=wvBx}uyeϊD^cqy~dt 3"6T+@s gWs(L꼔MBSsVխ;Ub M\ 3cL]k?e3› e< &9]fyyhǰGþ~Dy;c?K O +OdvidJCNϕ,#pRg0;nPFH(i淃Xˢ5ۄ;77`INOnIlC-e-0 ϮT=.4%T .7]]LgsRNMVx/œu!@jL{! 
˺TL*%yWF !b{ewT,JPdT#La]r*8=uTy#_⪌39Š:K-KӜ ŠC?PU~r)w'ګp5{|C2 ,YTB=&- 8X8HgR(fwEdP?-ERDIc2AT Nzf- JV JM*sr0f!E-Ag3 L>K.ᮠb\﷉zw{7Ѫ۩y/@:5r#Z~ӻ> (UAwr9_VrhUp3-Jc} (mGܦm&pWV2S# $ U8̕_ L8xO&/L5D,:4@s=⢠DM!Vٷ}AVCNTIZ gg [3ӵCFH0;HxoҳM &u3ag*L;Q-gCx<*(Vl^BY(hF,5 ϛyʪ! ǔӫ )wwuU凾9`&u~PGYqc%q1\{`SOř6l4O'^jh$YbkD6@)Ƃ?-"q߆#8b_Hi <+1UqC 9Q2"(INOGH<$.R,p܆h% aU]KͪK 䠬 {>0j .4#P/AJ+GT4$cG([Gm--URҀ~Ϩ!5@Α܋ ⛌\L>Xbus }'1y|ݦvr6bj7!+M{,wt(@8ssb!`^WO {E7~#G[yv[j<ژ$Kͦ='ݒ@bտ,Qd)wtҽTCC5{})\$6g;וUo-(piAu:dAqyHRXF0%IUJbߏeN$ /Mv!C;#ȆW5d-(|4+h?Cs:qvFԓKpjWΩ/rk6Ho5 j"  T!I?^TEjl{p99ڑUc49o~6d6Nb-!f7-G&kMՇ!)(e`IaLb[*Nã2.%~Yck YTS*/ET8e]YRN+URZ m_~"Q +oSS`6 6n`J,TTy/60SqVIU h[99X%se>;:Y5ҎYABGmԖSx[bn&C+9@k \e#"(s%Nl~J&$@8W\Ŀ؉doǟ^4Hh}ZCY_ &r6Fctu(al[Ro9%Ub}jЫjOד}bȠ<-M"/Z,^`1!B:TIODYk3lGD*9E+BNlؑQcr0H}8kR+賁szqXK%jxIK ;6o.‘1BtIVV@2f%h,eɯXn%y[ ۠.2 Lr59r}/@4*LM{8]]0kUl d/d~K{5#Z/9=|I0.F ӳ NsB KGk =sX*v~ L=c ~al\x ihrd?? C:8Vg&kuP]m{LɁ݂MhlSf:ٮrK66l9}[Lh5uȩ>?CsXLhlfN@+()k%M0ͯer6p_Qfpߴ9Bߧ]{=v=бCBxjqc:uϢl H*w 9!r,hmX%̒ AMa.ϑdKa?Zo$+oGSY-`Y8;x{%E%Ѩr *M/qX;Sg٣1E~zPt^W _P1;O7"h tа%LRC9bW1 uҗ/Rs$r}K~nu@}(=Qul]|n06S|rXgU?{)kZ`z?HUmMncWEع54Sm-E"\U玟/k9AXk9iNfZGK5+~:hBk'V{𠓷Eԣ! 
@᪌*=.z}F Vؙ;,#yK]}1%v|}.{=Eż3丯B>eVy][P^cslПCu>4:nS+T%F&⸖}_kQ[Qw@[bl^:wm &j> {́f@ժJkzt3Q.JTCk$4ޜ{Y7(-99T!iб|kxmF/ ONkHvrW>Q3űocyljtz'd6 F#t8#,u7_+!!m$dgXg|=ϽrQ,(}/KbdݼJ 6`C{f)shf%l$S/u|vꏖgc}1`j0+ eYMsaK ӳ 2yK S>38:Z;+i5~˿TkߑU;sYv.x$BQYړ۷0 wbs)$ |<d*6pɳxAޙNkp@" 99fù [JW^6pS9˜}z =*?2N*58n,7`@{כR26+=_Pa~xF_XVo5lZ#&-84#JQޚۆi82FO|^ќ 6&4UwbY zUl Km)&ZC6SR:\ܩK^u|`8PY߼ym| Ps>/*F~·IW3-4dow$HbQMf#[`CMq(o1'%?(u@4]=`{U%ѵR@qcy͔g^†%X`91(!(fp:=<$Jkg8a:yںs0,nx؆r2|RJU/9yїnթ@JS^}Zlkjdx]-t!^[^w*lWi$qZh`dGKN+;Ae[R6.@M|7RȭVy0}QFT9Fo laόeek xs T w"缁~&fk3^5b["ci2G`BλD m[g-p)jY/Qf7JAxs` ,ǟMX[Po| 9wC)*Đ`x@aCax#13s].-?0+P@>_cK e_3\h߀ĀX[uMUd_NL (AƷ{p^ )e{G$1ZON?Ϋ% vٵbVR ~X8> T-`2 3A<:V+Gyh%2x\|)~ o/4ϡ}IJTjâ|2ˁ"dvR'8?b ѻ6yr&憘+ߓi!selmʼ3ib(d!(6z<$!MϋmT_ul=B!kk;L_Yeτ&{AO%]=AgVeWbBNă ?-<5n'ΟRǞ}-UlPss%A@xW=r6`ޑ ejtTJ jvW`qD3H L< ~\Q!5ti9Sc ,qp1s͇Z 3uQ,;k]<:?p}ce4V .N5G;B=Qe Mmdq4{b}Fi_qKZkMƇb@ivh\;b=5y- t.RzŎn[$UFI^x⽽>ސPy|,&| 0{ 6l&[{@5pp 7?jxI\D#W0 YZNLP/R/utils.R0000644000175100001440000000145612521153333012411 0ustar hornikusers### Format and print. .format_TextDocument <- function(x, ...) c(sprintf("<<%s>>", class(x)[1L]), sprintf("Metadata: %d", length(meta(x)))) .print_via_format <- function(x, ...) { writeLines(format(x, ...)) invisible(x) } ### Get and set metadata. .get_meta_if_attr <- function(x, tag = NULL, ...) { m <- attr(x, "meta") if(is.null(tag)) m else m[[tag]] } .set_meta_if_attr <- function(x, tag = NULL, ..., value) { if(is.null(tag)) attr(x, "meta") <- value else attr(x, "meta")[[tag]] <- value x } .get_meta_if_slot <- function(x, tag = NULL, ...) if(is.null(tag)) x$meta else x$meta[[tag]] .set_meta_if_slot <- function(x, tag = NULL, ..., value) { if(is.null(tag)) x$meta <- value else x$meta[[tag]] <- value x } NLP/R/conllu.R0000644000175100001440000000763713144532262012560 0ustar hornikusers## See . 
CoNLLUTextDocument <- function(con, meta = list()) { lines <- readLines(con, encoding = "UTF-8") ind_b <- lines == "" ind_c <- startsWith(lines, "#") ind <- !ind_b & !ind_c ## Now using scan(text = lines[ind]) to read in the records is ## possible but unbearably slow for large documents: instead, try to ## proceed "directly". ## records <- strsplit(lines[ind], "\t", fixed = TRUE) ## records <- as.data.frame(do.call(rbind, records), ## stringsAsFactors = FALSE) ## names(records) <- ## c("ID", "FORM", "LEMMA", "UPOSTAG", "XPOSTAG", "FEATS", "HEAD", ## "DEPREL", "DEPS", "MISC") ## sent <- cumsum(ind_b) + 1L ## tab <- cbind(data.frame(sent = sent), ## as.data.frame(do.call(cbind, records), ## stringsAsFactors = FALSE))[ind , ] sent <- cumsum(ind_b) + 1L tab <- cbind(data.frame(sent[ind]), as.data.frame(do.call(rbind, strsplit(lines[ind], "\t", fixed = TRUE)), stringsAsFactors = FALSE)) names(tab) <- c("sent", "ID", "FORM", "LEMMA", "UPOSTAG", "XPOSTAG", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC") comments <- split(lines[ind_c], sent[ind_c]) attr(tab, "comments") <- comments ## From CoNLL-U v2 on 'sent_id' and 'text' comments are compulsory ## for every sentence. Be defensive and add these as attributes ## only if always available. ind <- startsWith("# sent_id =", lines) if(all(diff(sent[ind]) == 1)) attr(tab, "sent_id") <- sub("^# sent_id = *", "", lines[ind]) ind <- startsWith("# text =", lines) if(all(diff(sent[ind]) == 1)) attr(tab, "text") <- sub("^# text = *", "", lines[ind]) doc <- list(content = tab, meta = meta) class(doc) <- c("CoNLLUTextDocument", "TextDocument") doc } ## CoNLL-U allows to represent both words and (multiword) tokens, which ## both have FORM entries, with ID single integers for words and integer ## ranges for the tokens. We provide the tokens with as.character() and ## the words with the other "viewers", in particular, words(). format.CoNLLUTextDocument <- function(x, ...) 
{ y <- x$content ind <- !grepl("[.-]", y$ID) c(.format_TextDocument(x), sprintf("Content: words: %d, sents: %d", sum(ind), y[NROW(y), "sent"])) } content.CoNLLUTextDocument <- function(x) x$content as.character.CoNLLUTextDocument <- function(x, ...) { y <- x$content ## Drop empty nodes. y <- y[!grepl(".", y$ID, fixed = TRUE), ] ## Expand ranges to determine forms to be skipped for tokens. ind <- grepl("-", y$ID, fixed = TRUE) ids <- y$ID[ind] skip <- Map(seq, sub("-.*", "", ids), sub(".*-", "", ids)) skip <- paste(rep.int(y$sent[ind], lengths(skip)), unlist(skip), sep = ".") y$FORM[is.na(match(paste(y$sent, y$ID, sep = "."), skip))] } words.CoNLLUTextDocument <- function(x, ...) { ind <- !grepl("[.-]", x$content$ID) x$content$FORM[ind] } sents.CoNLLUTextDocument <- function(x, ...) { ind <- !grepl("[.-]", x$content$ID) split(x$content$FORM[ind], x$content$sent[ind]) } tagged_words.CoNLLUTextDocument <- function(x, which = c("UPOSTAG", "XPOSTAG"), ...) { which <- match.arg(which) ind <- !grepl("[.-]", x$content$ID) Tagged_Token(x$content$FORM[ind], x$content[[which]][ind]) } tagged_sents.CoNLLUTextDocument <- function(x, which = c("UPOSTAG", "XPOSTAG"), ...) { which <- match.arg(which) ind <- !grepl("[.-]", x$content$ID) split(Tagged_Token(x$content$FORM[ind], x$content[[which]][ind]), x$content$sent[ind]) } NLP/R/tokenize.R0000644000175100001440000001365313334313212013100 0ustar hornikusers## Tokenizers break text up into words, phrases, symbols, or other ## meaningful elements called tokens, see e.g. ## . ## This can be accomplished by returning the sequence of tokens, or the ## corresponding spans (character start and end positions). ## Apache OpenNLP provides a Tokenizer interface, with methods ## String[] tokenize() and Span[] tokenizePos() for the two variants. ## See e.g. ## . ## NLTK provides an interface class nltk.tokenize.api.TokenizerI, for ## which subclasses must define a tokenize() method, and can define a ## span_tokenize() method. ## See e.g. . 
## In R, this could be mimicked by having two generics for getting the ## tokens or spans, and have a virtual Tokenizer class for which ## extension classes must provide methods for at least one of the ## generics. ## However, it seems more natural to have tokenizers be *functions* ## (instead of interface classes) which can be called directly (instead ## of calling the respective generics), and have two "kinds" of such ## functions: token tokenizers and span tokenizers. We use the class ## information to indicate the kind, which in turn allows to provide a ## generic mechanism for mapping between the two kinds (straightforward ## when going from spans to tokens, doable for the opposite direction). ## This also allows to "extract" both kinds of tokenizers from suitable ## annotators or annotator pipelines. ## For now, there is no underlying virtual Tokenizer class. ### * Span tokenizers Span_Tokenizer <- function(f, meta = list()) { attr(f, "meta") <- meta class(f) <- "Span_Tokenizer" f } as.Span_Tokenizer <- function(x, ...) UseMethod("as.Span_Tokenizer") as.Span_Tokenizer.Span_Tokenizer <- function(x, ...) x ## For now, pass metadata as is. as.Span_Tokenizer.Token_Tokenizer <- function(x, ...) { f <- function(s) { s <- as.String(s) spans_from_tokens(s, x(s)) } Span_Tokenizer(f, meta(x)) } ## For now, do not pass metadata. as.Span_Tokenizer.Annotator <- as.Span_Tokenizer.Annotator_Pipeline <- function(x, type = "word", ...) { f <- function(s) { a <- x(as.String(s)) as.Span(a[a$type == "word", ]) } Span_Tokenizer(f) } is.Span_Tokenizer <- function(x) inherits(x, "Span_Tokenizer") format.Span_Tokenizer <- function(x, ...) { d <- meta(x, "description") if(is.null(d)) { "A span tokenizer." } else { c("A span tokenizer, with description", strwrap(d, indent = 2L, exdent = 2L)) } } ### * Token tokenizers Token_Tokenizer <- function(f, meta = list()) { attr(f, "meta") <- meta class(f) <- "Token_Tokenizer" f } as.Token_Tokenizer <- function(x, ...) 
UseMethod("as.Token_Tokenizer") as.Token_Tokenizer.Token_Tokenizer <- function(x, ...) x ## For now, pass metadata as is. as.Token_Tokenizer.Span_Tokenizer <- function(x, ...) { f <- function(s) { s <- as.String(s) s[x(s)] } Token_Tokenizer(f, meta(x)) } ## For now, do not pass metadata. as.Token_Tokenizer.Annotator <- as.Token_Tokenizer.Annotator_Pipeline <- function(x, type = "word", ...) { f <- function(s) { s <- as.String(s) a <- x(s) s[a[a$type == "word", ]] } Token_Tokenizer(f) } is.Token_Tokenizer <- function(x) inherits(x, "Token_Tokenizer") format.Token_Tokenizer <- function(x, ...) { d <- meta(x, "description") if(is.null(d)) { "A token tokenizer." } else { c("A token tokenizer, with description", strwrap(d, indent = 2L, exdent = 2L)) } } ### Regexp span tokenizers a la NLTK. Regexp_Tokenizer <- function(pattern, invert = FALSE, ..., meta = list()) { force(pattern) args <- list(...) f <- if(invert) { ## Pattern gives the separators. function(s) { s <- as.String(s) if(is.na(s) || !nchar(s)) stop("Need a non-empty string.") m <- do.call(gregexpr, c(list(pattern = pattern, text = s), args))[[1L]] if((length(m) == 1L) && (m == -1L)) return(Span(1L, nchar(s))) start <- c(1L, m + attr(m, "match.length")) end <- c(m - 1L, nchar(s)) ind <- start <= end Span(start[ind], end[ind]) } } else { ## Pattern gives the tokens. 
function(s) { s <- as.String(s) if(is.na(s) || !nchar(s)) stop("Need a non-empty string.") m <- do.call(gregexpr, c(list(pattern = pattern, text = s), args))[[1L]] Span(m, m + attr(m, "match.length") - 1L) } } Span_Tokenizer(f, meta) } whitespace_tokenizer <- Regexp_Tokenizer("\\s+", invert = TRUE, meta = list(description = "Divides strings into substrings by treating any sequence of whitespace characters as a separator.")) blankline_tokenizer <- Regexp_Tokenizer("\\s*\n\\s*\\n\\s*", invert = TRUE, meta = list(description = "Divides strings into substrings by treating any sequence of blank lines as a separator.")) wordpunct_tokenizer <- Regexp_Tokenizer("\\w+|[^\\w\\s]+", perl = TRUE, meta = list(description = "Divides strings into substrings of alphabetic and (non-whitespace) non-alphabetic characters.")) ### * Utilities spans_from_tokens <- function(x, tokens) { start <- end <- integer(length(tokens)) off <- 0L for(i in seq_along(tokens)) { m <- regexpr(tokens[i], x, fixed = TRUE) pos <- m + attr(m, "match.length") x <- substring(x, pos) start[i] <- off + m end[i] <- off <- off + pos - 1L } Span(start, end) } NLP/R/span.R0000644000175100001440000000654612503505414012220 0ustar hornikusers## A simple span class for storing start and end integer offsets. ## Apache OpenNLP optionally allows storing types in spans: we use ## Annotation objects (with no ids or features) for this. ## Conceptually, a single span is a start/end pair and our Span objects ## are sequences (to allow positional access) of spans, i.e., sequences ## of pairs. ## The implementation actually uses a "pair" (named list of length two) ## of "slots" giving the start and end value sequences. ## Subscripting via [ extracts subsets of spans. ## Subscripting via $ extracts one slot. 
Span_slot_names <- c("start", "end") Span <- function(start, end) { start <- as.integer(start) end <- as.integer(end) if(length(start) != length(end)) stop("arguments must have the same length") .Span_from_args(start, end) } .Span_from_args <- function(start, end) .Span_from_list(list(start = start, end = end)) .Span_from_list <- function(x) { class(x) <- "Span" x } as.Span <- function(x) UseMethod("as.Span") as.Span.Span <- identity as.Span.Annotation <- function(x) .Span_from_args(x$start, x$end) is.Span <- function(x) inherits(x, "Span") `[.Span` <- function(x, i) .Span_from_list(lapply(unclass(x), `[`, i)) ## ## Implement eventually ... `[<-.Span` <- function(x, i, value) .NotYetImplemented() ## `[[.Span` <- function(x, i) .Span_from_list(lapply(unclass(x), `[[`, i)) ## ## Implement eventually ... `[[<-.Span` <- function(x, i, value) .NotYetImplemented() ## ## $.Span is not really necessary. `$<-.Span` <- function(x, name, value) { n <- length(x) x <- unclass(x) if(is.na(pos <- pmatch(name, Span_slot_names))) stop("invalid element name") value <- as.integer(value) if(length(value) != n) stop("replacement must have the same length as object") x[[pos]] <- value .Span_from_list(x) } Ops.Span <- function(e1, e2) { ## Allow to add offsets. switch(as.character(.Generic), "+" = { if(is.Span(e1) && is.numeric(e2) && (length(e2) == 1L)) return(Span(e1$start + e2, e1$end + e2)) if(is.Span(e2) && is.numeric(e1) && (length(e1) == 1L)) return(Span(e2$start + e1, e2$end + e1)) stop("Invalid operands.") }, stop(gettextf("'%s' not defined for \"Span\" objects", .Generic), domain = NA)) } as.data.frame.Span <- function(x, row.names = NULL, optional = FALSE, ...) { data.frame(start = x$start, end = x$end, row.names = row.names) } as.list.Span <- function(x, ...) 
lapply(seq_along(x), function(i) x[i]) c.Span <- function(..., recursive = FALSE) { args <- lapply(list(...), function(e) unclass(as.Span(e))) y <- lapply(Span_slot_names, function(e) unlist(lapply(args, `[[`, e))) names(y) <- Span_slot_names .Span_from_list(y) } duplicated.Span <- function(x, incomparables = FALSE, ...) { do.call(`&`, lapply(unclass(x), duplicated)) } format.Span <- function(x, ...) { format(as.data.frame(x), ...) } length.Span <- function(x) length(x$start) names.Span <- function(x) NULL print.Span <- function(x, ...) { print.data.frame(format(x), ..., row.names = FALSE) invisible(x) } unique.Span <- function(x, incomparables = FALSE, ...) x[!duplicated(x)] NLP/R/datetime.R0000644000175100001440000000764513741575236013072 0ustar hornikusersISO_8601_datetime_components <- c("year", "mon", "day", "hour", "min", "sec", "tzd") parse_ISO_8601_datetime <- function(x) { x <- as.character(x) n <- length(x) y <- matrix("", n, 7L) dimnames(y) <- list(x, ISO_8601_datetime_components) pos <- seq_along(x) bad <- (is.na(x) | (x == "") | ((nzchar(x) > 10L) & (substring(x, 11L, 11L) != "T"))) if(any(bad)) { pos <- pos[!bad] x <- x[pos] } dates <- substring(x, 1L, 10L) pat <- "^([[:digit:]]{4})(-[[:digit:]]{2})?(-[[:digit:]]{2})?$" m <- regmatches(dates, regexec(pat, dates)) ind <- lengths(m) > 0L if(!all(ind)) { bad[pos[!ind]] <- TRUE pos <- pos[ind] x <- x[ind] m <- m[ind] } y[pos, 1L : 3L] <- do.call(rbind, m)[, 2L : 4L] ind <- (nchar(x) > 10L) if(any(ind)) { if(!all(ind)) { pos <- pos[ind] x <- x[ind] } times <- substring(x, 12L) pat <- paste("^", "([[:digit:]]{2}):([[:digit:]]{2})", "(:[[:digit:]]{2}([.][[:digit:]]+)?)?", "(Z|[+-][[:digit:]]{2}:[[:digit:]]{2})", "$", sep = "") m <- regmatches(times, regexec(pat, times)) ind <- lengths(m) > 0L if(!all(ind)) bad[pos[!ind]] <- TRUE y[pos[ind], 4L : 7L] <- do.call(rbind, m[ind])[, c(2L, 3L, 4L, 6L)] } y[, c(2L, 3L, 6L)] <- substring(y[, c(2L, 3L, 6L)], 2L) ## Warn about the bad entries. 
if(any(bad)) { warning("Invalid entries:", paste("\n ", rownames(y)[bad], collapse = " ")) y[bad, ] <- "" } ## If we want year to sec as numeric and tzd as character, we need ## to do ## y <- as.data.frame(y, stringsAsFactors = FALSE) ## and convert variables 1 to 6: note that this would turn empty to ## missing ... x <- rownames(y) w <- which(y != "", arr.ind = TRUE) y <- as.data.frame(y, stringsAsFactors = FALSE) y[, 1L : 5L] <- lapply(y[, 1L : 5L], as.integer) y[[6L]] <- as.numeric(y[[6L]]) y <- Map(function(u, v) as.list(u[v]), split(y, seq_len(n)), split(w[, 2L], factor(w[, 1L], seq_len(n)))) names(y) <- x class(y) <- "ISO_8601_datetime" y } `[.ISO_8601_datetime` <- function(x, i) { y <- unclass(x)[i] class(y) <- class(x) y } `$.ISO_8601_datetime` <- function(x, name) { name <- pmatch(name, ISO_8601_datetime_components) as.data.frame(x)[[name]] } as.matrix.ISO_8601_datetime <- function(x, ...) { y <- matrix("", length(x), 7L, dimnames = list(names(x), ISO_8601_datetime_components)) nms <- lapply(x, names) y[cbind(rep.int(seq_along(x), lengths(nms)), match(unlist(nms), ISO_8601_datetime_components))] <- as.character(unlist(x)) y } as.data.frame.ISO_8601_datetime <- function(x, row.names = NULL, optional = FALSE, ...) { y <- as.matrix(x) y[y == ""] <- NA_character_ y <- as.data.frame(y, stringsAsFactors = FALSE) y[, 1L : 5L] <- lapply(y[, 1L : 5L], as.integer) y[[6L]] <- as.numeric(y[[6L]]) y } as.Date.ISO_8601_datetime <- function(x, ...) { y <- as.matrix(x) y[y == ""] <- NA_character_ as.Date(sprintf("%s-%s-%s", y[, 1L], y[, 2L], y[, 3L]), "%Y-%m-%d") } as.POSIXct.ISO_8601_datetime <- function(x, tz = "", ...) as.POSIXct(as.POSIXlt(x)) as.POSIXlt.ISO_8601_datetime <- function(x, tz = "", ...) { y <- as.matrix(x) y[y == ""] <- NA_character_ offsets <- sub(":", "", y[, 7L]) offsets[offsets == "Z"] <- "+0000" y[, 7L] <- offsets strptime(do.call(paste, split(y, col(y))), "%Y %m %d %H %M %OS %z", tz = "UTC") } print.ISO_8601_datetime <- function(x, ...) 
{ y <- as.matrix(x) y <- as.data.frame(y, stringsAsFactors = FALSE) print(y) invisible(x) } NLP/R/tnt.R0000644000175100001440000000613112517713330012055 0ustar hornikusers## A simple class for storing tokens and tags ("tagged tokens"). ## Conceptually, a single tagged token is a token/tag pair and our ## Tagged_Token objects are sequences (to allow positional access) of ## tagged tokens, i.e., sequences of pairs. ## The implementation actually uses a "pair" (named list of length two) ## of "slots" giving the token and tag sequences. ## Subscripting via [ extracts subsets of tagged tokens. ## Subscripting via $ extracts one slot. Tagged_Token_slot_names <- c("token", "tag") Tagged_Token <- function(token, tag) { token <- as.character(token) tag <- as.character(tag) if(length(token) != length(tag)) stop("arguments must have the same length") .Tagged_Token_from_args(token, tag) } .Tagged_Token_from_args <- function(token, tag) { x <- list(token, tag) names(x) <- Tagged_Token_slot_names .Tagged_Token_from_list(x) } .Tagged_Token_from_list <- function(x) { class(x) <- "Tagged_Token" x } as.Tagged_Token <- function(x) UseMethod("as.Tagged_Token") as.Tagged_Token.Tagged_Token <- identity ## ## Should this get a '...'? (And hence the generic, too?) as.Tagged_Token.TextDocument <- function(x) tagged_words(x) ## is.Tagged_Token <- function(x) inherits(x, "Tagged_Token") `[.Tagged_Token` <- function(x, i) .Tagged_Token_from_list(lapply(unclass(x), `[`, i)) ## ## Implement eventually ... `[<-.Tagged_Token` <- function(x, i, value) .NotYetImplemented() ## `[[.Tagged_Token` <- function(x, i) .Tagged_Token_from_list(lapply(unclass(x), `[[`, i)) ## ## Implement eventually ... `[[<-.Tagged_Token` <- function(x, i, value) .NotYetImplemented() ## ## $.Tagged_Token is not really necessary. 
`$<-.Tagged_Token` <- function(x, name, value) { n <- length(x) x <- unclass(x) if(is.na(pos <- pmatch(name, Tagged_Token_slot_names))) stop("invalid element name") value <- as.integer(value) if(length(value) != n) stop("replacement must have the same length as object") x[[pos]] <- value .Tagged_Token_from_list(x) } as.data.frame.Tagged_Token <- function(x, row.names = NULL, optional = FALSE, ...) { data.frame(token = x$token, tag = x$tag, row.names = row.names) } as.list.Tagged_Token <- function(x, ...) lapply(seq_along(x), function(i) x[i]) c.Tagged_Token <- function(..., recursive = FALSE) { args <- lapply(list(...), function(e) unclass(as.Tagged_Token(e))) y <- lapply(Tagged_Token_slot_names, function(e) unlist(lapply(args, `[[`, e))) names(y) <- Tagged_Token_slot_names .Tagged_Token_from_list(y) } duplicated.Tagged_Token <- function(x, incomparables = FALSE, ...) { do.call(`&`, lapply(unclass(x), duplicated)) } format.Tagged_Token <- function(x, ...) { sprintf("%s/%s", x$token, x$tag) } length.Tagged_Token <- function(x) length(x$token) names.Tagged_Token <- function(x) NULL ## print.Tagged_Token <- ## function(x, ...) ## { ## print(format(x, ...)) ## invisible(x) ## } unique.Tagged_Token <- function(x, incomparables = FALSE, ...) x[!duplicated(x)] NLP/R/spacyr.R0000644000175100001440000000110713337763355012564 0ustar hornikusers## Viewer methods for objects of class "spacyr_parsed" as obtained by ## spacyr::spacy_parse(). words.spacyr_parsed <- function(x, ...) { x$token } sents.spacyr_parsed <- function(x, ...) { split(x$token, x$sentence_id) } tagged_words.spacyr_parsed <- function(x, which = c("pos", "tag"), ...) { which <- match.arg(which) Tagged_Token(x$token, x[[which]]) } tagged_sents.spacyr_parsed <- function(x, which = c("pos", "tag"), ...) 
{ which <- match.arg(which) lapply(split(x, x$sentence_id), function(e) Tagged_Token(e$token, e[[which]])) } NLP/R/generics.R0000644000175100001440000000040012314326274013042 0ustar hornikuserscontent <- function(x) UseMethod("content", x) `content<-` <- function(x, value) UseMethod("content<-", x) meta <- function(x, tag = NULL, ...) UseMethod("meta", x) `meta<-` <- function(x, tag = NULL, ..., value) UseMethod("meta<-", x) NLP/R/viewers.R0000644000175100001440000000205512503572502012734 0ustar hornikuserswords <- function(x, ...) UseMethod("words") sents <- function(x, ...) UseMethod("sents") paras <- function(x, ...) UseMethod("paras") tagged_words <- function(x, ...) UseMethod("tagged_words") tagged_sents <- function(x, ...) UseMethod("tagged_sents") tagged_paras <- function(x, ...) UseMethod("tagged_paras") chunked_sents <- function(x, ...) UseMethod("chunked_sents") parsed_sents <- function(x, ...) UseMethod("parsed_sents") parsed_paras <- function(x, ...) UseMethod("parsed_paras") chunk_tree_from_chunk_info <- function(words, ptags, ctags) { ind <- grepl("^[BO]", ctags) ## ## Should this also use Tagged_Token()? chunks <- split(sprintf("%s/%s", words, ptags), cumsum(ind)) ## nms <- sub(".*-", "", ctags[ind]) ind <- nms != "O" chunks[ind] <- Map(Tree, nms[ind], chunks[ind]) Tree("S", chunks) } POS_tag_mapper <- function(map, set) { if(is.function(map)) return(map) if(is.list(map)) map <- map[[set]] function(pos) map[pos] } NLP/R/cleannlp.R0000644000175100001440000000246013741573436013060 0ustar hornikusers## Viewer methods for objects of class "cnlp_annotation" as obtained by ## cleanNLP::cnlp_annotate(). words.cnlp_annotation <- function(x, ...) { x$token$token } sents.cnlp_annotation <- function(x, ...) { x <- x$token split(x$token, x$sid) } ## paras.cnlp_annotation <- ## function(x, ...) 
## {
##     x <- x$token
##     if(is.na(match("pid", names(x))))
##         stop("unavailable paragraph ids")
##     lapply(split(x, x$pid),
##            function(e) split(e$token, e$sid))
## }

## Tagged word tokens for a cnlp annotation.
## Reads the 'token' component of 'x' and pairs its 'token' entries with
## the tags found in the component selected by 'which' ("upos", the
## default, or "pos").
tagged_words.cnlp_annotation <- function(x, which = c("upos", "pos"), ...)
{
    tokens <- x$token
    tag_column <- match.arg(which)
    Tagged_Token(tokens$token, tokens[[tag_column]])
}

## Tagged word tokens for a cnlp annotation, grouped by sentence.
## Arguments as for tagged_words.cnlp_annotation().
tagged_sents.cnlp_annotation <- function(x, which = c("upos", "pos"), ...)
{
    tokens <- x$token
    tag_column <- match.arg(which)
    .tagged_sents_from_cnlp_token_frame(tokens, tag_column)
}

## Split the token table 'x' by its 'sid' component and build one
## Tagged_Token object per group, using the tags in x[[which]].
.tagged_sents_from_cnlp_token_frame <- function(x, which)
{
    tag_one_sentence <- function(e) Tagged_Token(e$token, e[[which]])
    lapply(split(x, x$sid), tag_one_sentence)
}

## tagged_paras.cnlp_annotation <-
## function(x, which = c("upos", "pos"), ...)
## {
##     x <- x$token
##     if(is.na(match("pid", names(x))))
##         stop("unavailable paragraph ids")
##     which <- match.arg(which)
##     lapply(split(x, x$pid),
##            .tagged_sents_from_cnlp_token_frame,
##            which)
## }

## --- archive member header residue (kept verbatim) ---
## NLP/R/aptd.R0000644000175100001440000001516413357324561012215 0ustar hornikusers

## Create an annotated plain text document from text 's' (coerced via
## as.String()), annotation 'a' (coerced via as.Annotation()) and a
## metadata list 'meta'.
AnnotatedPlainTextDocument <- function(s, a, meta = list())
{
    doc <- list(content = as.String(s),
                ## Be nice.
                annotation = as.Annotation(a),
                meta = meta)
    structure(doc,
              class = c("AnnotatedPlainTextDocument",
                        "PlainTextDocument",
                        "TextDocument"))
}

## Character vector describing the document: the generic text document
## lines followed by the annotation length and the content size.
format.AnnotatedPlainTextDocument <- function(x, ...)
{
    header <- .format_TextDocument(x)
    annotation_line <-
        sprintf("Annotations: length: %s", length(x$annotation))
    content_line <-
        sprintf("Content: chars: %d", nchar(x$content))
    c(header, annotation_line, content_line)
}

## The text of the document.
content.AnnotatedPlainTextDocument <- function(x)
{
    x$content
}

## Replacing the text is refused: always signals an error.
`content<-.AnnotatedPlainTextDocument` <- function(x, value)
{
    stop("content modification is not possible for AnnotatedPlainTextDocument objects")
}

## meta.AnnotatedPlainTextDocument <-
## function(x, tag = NULL, ...)
##     if(is.null(tag)) x$meta else x$meta[[tag]]

## `meta<-.AnnotatedPlainTextDocument` <-
## function(x, tag = NULL, ..., value)
## {
##     if(is.null(tag))
##         x$meta <- value
##     else
##         x$meta[[tag]] <- value
##     x
## }

## Same as content(): the 'content' component of the document.
as.character.AnnotatedPlainTextDocument <- function(x, ...)
{
    x$content
}

## The annotation stored in an AnnotatedPlainTextDocument.
## Signals an error for any other kind of object.
annotation <- function(x)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    x$annotation
}

## NLTK style functions for high level access

## Word tokens: the text subscripted by the annotations of type "word".
words.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    ann <- x$annotation
    ## Could check for word token annotations ...
    word_tokens <- ann[ann$type == "word"]
    x$content[word_tokens]
}

## Word tokens grouped by sentence.
sents.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    .sents_from_annotation_and_text(x$annotation, x$content)
}

## For each "sentence" annotation in 'a', the "word" annotations it
## contains, used to subscript the text 's'.
.sents_from_annotation_and_text <- function(a, s)
{
    ## Could check for sentence and word token annotations ...
    word_tokens <- a[a$type == "word"]
    sentences <- a[a$type == "sentence"]
    s[annotations_in_spans(word_tokens, sentences)]
}

## Word tokens grouped by sentence, grouped by paragraph.
paras.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    txt <- x$content
    ann <- x$annotation
    ## Could check for paragraph annotations ...
    per_paragraph <- annotations_in_spans(ann, ann[ann$type == "paragraph"])
    lapply(per_paragraph, .sents_from_annotation_and_text, txt)
}

## Word tokens together with their 'POS' features.  A non-NULL 'map'
## is applied to the tags via .map_POS_tags_Annotation() first.
tagged_words.AnnotatedPlainTextDocument <- function(x, map = NULL, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    txt <- x$content
    ann <- x$annotation
    ## Could check for word token annotations ...
    word_tokens <- ann[ann$type == "word"]
    if(!is.null(map))
        word_tokens <- .map_POS_tags_Annotation(word_tokens, map)
    .tagged_words_from_annotation_and_text(word_tokens, txt)
}

## Pair the text spanned by the annotations in 'a' with their 'POS'
## feature values.
.tagged_words_from_annotation_and_text <- function(a, s)
{
    tags <- .annotation_features_with_template(a, "POS")
    Tagged_Token(s[a], tags)
}

## Tagged word tokens grouped by sentence.
tagged_sents.AnnotatedPlainTextDocument <- function(x, map = NULL, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    txt <- x$content
    ann <- x$annotation
    if(!is.null(map))
        ann <- .map_POS_tags_Annotation(ann, map)
    .tagged_sents_from_annotation_and_text(ann, txt)
}

## One Tagged_Token object per "sentence" annotation in 'a'.
.tagged_sents_from_annotation_and_text <- function(a, s)
{
    ## Could check for word and sentence token annotations ...
    per_sentence <- annotations_in_spans(a[a$type == "word"],
                                         a[a$type == "sentence"])
    lapply(per_sentence, .tagged_words_from_annotation_and_text, s)
}

## Tagged word tokens grouped by sentence, grouped by paragraph.
tagged_paras.AnnotatedPlainTextDocument <- function(x, map = NULL, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    txt <- x$content
    ann <- x$annotation
    if(!is.null(map))
        ann <- .map_POS_tags_Annotation(ann, map)
    ## Could check for paragraph annotations ...
    per_paragraph <- annotations_in_spans(ann, ann[ann$type == "paragraph"])
    lapply(per_paragraph, .tagged_sents_from_annotation_and_text, txt)
}

## Parse trees of the sentences, built from their 'parse' features.
parsed_sents.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    .parsed_sents_from_annotation(x$annotation)
}

## Apply Tree_parse() to the 'parse' feature of every "sentence"
## annotation in 'a'.
.parsed_sents_from_annotation <- function(a)
{
    ## Could check for sentence token annotations ...
    sentences <- a[a$type == "sentence"]
    parse_texts <- .annotation_features_with_template(sentences, "parse")
    lapply(parse_texts, Tree_parse)
}

## Parse trees of the sentences, grouped by paragraph.
parsed_paras.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    ann <- x$annotation
    ## Could check for paragraph annotations ...
    per_paragraph <- annotations_in_spans(ann, ann[ann$type == "paragraph"])
    lapply(per_paragraph, .parsed_sents_from_annotation)
}

## Chunk trees of the sentences.
chunked_sents.AnnotatedPlainTextDocument <- function(x, ...)
{
    if(!inherits(x, "AnnotatedPlainTextDocument"))
        stop("argument 'x' must be an AnnotatedPlainTextDocument object")
    txt <- x$content
    ann <- x$annotation

    ## Require annotations with POS and chunk_tag features, as obtained
    ## e.g. with the Apache OpenNLP POS tag and chunk annotators.  We
    ## could alternatively use annotations with parse features and
    ## flatten the parse trees.

    chunk_one_sentence <- function(a) {
        ## Could check for POS and chunk tag features ...
        pos_tags <- .annotation_features_with_template(a, "POS")
        chunk_tags <- .annotation_features_with_template(a, "chunk_tag")
        tokens <- txt[a]
        chunk_tree_from_chunk_info(tokens, pos_tags, chunk_tags)
    }

    ## Could check for word and sentence token annotations ...
    per_sentence <- annotations_in_spans(ann[ann$type == "word"],
                                         ann[ann$type == "sentence"])
    lapply(per_sentence, chunk_one_sentence)
}

## Rewrite the 'POS' feature of every annotation in 'x' using the mapper
## obtained from POS_tag_mapper() for 'map' and the "POS_tagset"
## metadata of 'x'.  Feature maps without a 'POS' entry are left alone.
.map_POS_tags_Annotation <- function(x, map)
{
    mapper <- POS_tag_mapper(map, meta(x, "POS_tagset"))
    remap <- function(e) {
        pos <- e$POS
        if(!is.null(pos))
            e$POS <- mapper(pos)
        e
    }
    x$features <- lapply(x$features, remap)
    x
}

## Extract feature 'tag' from every feature map in x$features via
## vapply() with template 'FUN.VALUE' (by default, a single string).
## Any failure is reported as incomplete or invalid features.
.annotation_features_with_template <- function(x, tag, FUN.VALUE = "")
{
    on_error <- function(e) {
        stop(sprintf("incomplete or invalid '%s' features", tag),
             call. = FALSE)
    }
    pick <- function(e) e[[tag]]
    tryCatch(vapply(x$features, pick, FUN.VALUE), error = on_error)
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/annotation.R0000644000175100001440000002631613362123103013421 0ustar hornikusers

## Annotations.

## Conceptually, a single annotation is a quintuple with "slots" id,
## type, start, end and features, and our Annotation objects are
## sequences (to allow positional access) of annotations, i.e., sequence
## of such quintuples.

## The implementation actually uses a "quintuple" (named list of length
## five) with slots giving the respective sequences of slot values.

## The feature slot of a single annotation is a feature map which we
## represent as named lists (at least for now also allowing NULL for an
## empty feature map), hence the sequence of feature values is a list of
## named lists.

## Subscripting via [ extracts subsets of annotations.
## Subscripting via $ extracts one slot value sequence.

## As Annotation objects have all slots of Span objects, we take them to
## have class "Annotation" and also inherit from class "Span".
## We allow for ids to be missing, and Annotation(id = NULL) creates
## missing ids as needed.

## Class vector and slot names shared by all Annotation objects.
Annotation_classes <- c("Annotation", "Span")
Annotation_slot_names <- c("id", "type", "start", "end", "features")

## Create an Annotation object.
##
## Arguments:
##   id        integer ids, or NULL for missing (NA) ids.
##   type      character types, or NULL for missing (NA) types.
##   start,end integer character offsets.  If omitted, they default to
##             empty vectors, so that Annotation() and
##             Annotation(meta = m) both give an empty annotation.
##   features  list of feature maps (each coerced via as.list()), or
##             NULL for empty feature maps.
##   meta      list of annotation metadata, stored as attribute "meta".
##
## Signals an error if the slots do not all have the same length.
Annotation <-
function(id = NULL, type = NULL, start, end, features = NULL,
         meta = list())
{
    ## Treat omitted offsets as empty.  This covers the no-argument
    ## call and also lets callers create an empty annotation which
    ## already carries metadata.
    if(missing(start)) start <- integer()
    if(missing(end)) end <- integer()

    start <- as.integer(start)
    end <- as.integer(end)
    n <- length(start)
    id <- if(is.null(id))
        rep.int(NA_integer_, n)
    else
        as.integer(id)
    type <- if(is.null(type))
        rep.int(NA_character_, n)
    else
        as.character(type)
    features <- if(is.null(features))
        rep.int(list(list()), n)
    else
        lapply(features, as.list)
    ## TODO:
    ## Should perhaps check that all elements of 'features' are named or
    ## empty lists.

    .Annotation_from_args(id, type, start, end, features, meta)
}

## Build an Annotation from already coerced slot values, checking that
## all five slots have the same length.
.Annotation_from_args <-
function(id, type, start, end, features, meta)
{
    x <- list(id, type, start, end, features)
    if(any(diff(lengths(x)) != 0L))
        stop("arguments must have the same length")
    names(x) <- Annotation_slot_names

    .Annotation_from_list_and_meta(x, meta)
}

## Attach the Annotation classes and the "meta" attribute to the slot
## list 'x'.  No validity checking is performed.
.Annotation_from_list_and_meta <-
function(x, meta)
{
    class(x) <- Annotation_classes
    attr(x, "meta") <- meta
    x
}

## Generic for coercion to Annotation objects.
as.Annotation <-
function(x, ...)
    UseMethod("as.Annotation")

as.Annotation.Annotation <-
function(x, ...)
    x

## Spans become annotations with the given (or missing) ids and types
## and empty feature maps.
as.Annotation.Span <-
function(x, id = NULL, type = NULL, ...)
{
    ## Call Annotation() so we get coercion and length checking.
    Annotation(id, type, x$start, x$end, NULL)
}

is.Annotation <-
function(x)
    inherits(x, "Annotation")

## Subsetting applies the subscript to every slot and keeps the
## metadata.
`[.Annotation` <-
function(x, i)
    .Annotation_from_list_and_meta(lapply(unclass(x), `[`, i),
                                   attr(x, "meta"))

## TODO:
## Implement eventually ...
`[<-.Annotation` <-
function(x, i, value)
    .NotYetImplemented()

## Extract a single annotation.  The result is again an Annotation
## object (of length one), so the extracted feature map is wrapped in a
## list.
`[[.Annotation` <-
function(x, i)
{
    y <- lapply(unclass(x), `[[`, i)
    y$features <- list(y$features)
    .Annotation_from_list_and_meta(y, attr(x, "meta"))
}

## TODO:
## Implement eventually ...
`[[<-.Annotation` <-
function(x, i, value)
    .NotYetImplemented()

## $.Annotation is not really necessary.

## Replace one slot.  'name' may be a partial slot name; the value is
## coerced to the mode of the slot and must have the same length as the
## annotation.
`$<-.Annotation` <-
function(x, name, value)
{
    n <- length(x)
    x <- unclass(x)
    if(is.na(pos <- pmatch(name, Annotation_slot_names)))
        stop("invalid element name")
    name <- Annotation_slot_names[pos]
    value <- if(name == "type")
        as.character(value)
    else if(name == "features")
        as.list(value)
    else
        as.integer(value)
    ## This is not very elegant: we could record the slot modes as
    ##   Annotation_slot_modes <-
    ##       c("integer", "character", "integer", "integer", "list")
    ## but then coercion via the respective as.MODE functions would need
    ## some trickery ... maybe do this at a later stage, and modify the
    ## Annotation() creator accordingly.
    if(length(value) != n)
        stop("replacement must have the same length as object")
    x[[pos]] <- value
    ## unclass() keeps attributes, so the metadata is still available.
    .Annotation_from_list_and_meta(x, attr(x, "meta"))
}

## Data frame with columns id, type, start, end and a list column
## 'features'.
as.data.frame.Annotation <-
function(x, row.names = NULL, optional = FALSE, ...)
{
    y <- data.frame(id = x$id,
                    type = x$type,
                    start = x$start,
                    end = x$end,
                    stringsAsFactors = FALSE,
                    row.names = row.names)
    y$features <- x$features
    y
}

## List of length-one Annotation objects.
as.list.Annotation <-
function(x, ...)
    lapply(seq_along(x), function(i) x[i])

## Combine annotations slot by slot, and merge their metadata.
c.Annotation <-
function(..., recursive = FALSE)
{
    args <- lapply(list(...), as.Annotation)
    meta <- do.call(c, lapply(args, meta))
    args <- lapply(args, unclass)
    y <- lapply(Annotation_slot_names,
                function(e) unlist(lapply(args, `[[`, e),
                                   recursive = FALSE))
    names(y) <- Annotation_slot_names
    ## Remove *exact* duplicates from metadata:
    if(length(meta)) {
        meta <- tapply(meta, names(meta), unique, simplify = FALSE)
        tags <- rep.int(names(meta), lengths(meta))
        meta <- unlist(meta, recursive = FALSE, use.names = FALSE)
        names(meta) <- tags
    }
    .Annotation_from_list_and_meta(y, meta)
}

## Logical vector flagging annotations which repeat an earlier one in
## *all* slots.
##
## The comparison is made on whole annotations (one id/type/start/end/
## features tuple per annotation).  Combining the slot-wise results of
## duplicated() with '&' is not equivalent: with spans (1,3), (5,8) and
## (1,8), the third start and the third end have each been seen before,
## but the third span has not.
##
## This is at the mercy of duplicated() working well on lists ...
## 'incomparables' is accepted for compatibility with the generic but
## not used.
duplicated.Annotation <-
function(x, incomparables = FALSE, ...)
{
    slots <- unclass(x)
    n <- length(slots$start)
    rows <- lapply(seq_len(n),
                   function(i) lapply(slots, `[[`, i))
    duplicated(rows)
}

## Format an annotation as a table with one line per annotation and a
## header line; feature maps are wrapped to the available width.  With
## values = FALSE, only the feature names are shown.
format.Annotation <-
function(x, values = TRUE, ...)
{
    if(!length(x)) return(character())

    y <- .format_Annotation_without_features(x)
    ## Width of the fixed part, and width left for the features.
    wy <- nchar(y[1L], type = "width")
    wf <- max(16L, 0.95 * getOption("width") - wy)
    ## Continuation lines are indented to the features column.
    collapse <- format("\n", width = wy + 2L)
    features <-
        lapply(x$features,
               function(e) {
                   if(!(l <- length(e))) return("")
                   s <- if(values)
                       .format_feature_map(e)
                   else
                       names(e)
                   s <- paste0(s, c(rep_len(",", l - 1L), ""))
                   ## Wrap a stand-in with every character replaced by
                   ## "X" to find the line widths, then cut the real
                   ## text at the corresponding positions.
                   w <- nchar(strwrap(paste(gsub(".", "X", s),
                                            collapse = " "),
                                      width = wf))
                   v <- c(0L, head(cumsum(w + 1L), -1L))
                   f <- v + 1L
                   t <- v + w
                   paste(substring(paste(s, collapse = " "), f, t),
                         collapse = collapse)
               })
    paste0(y, c("features", features), collapse = "\n")
}

## Write the annotation in DCF format, with one indented line per
## feature.
inspect.Annotation <-
function(x)
{
    x$features <-
        vapply(x$features,
               function(e) {
                   if(length(s <- .format_feature_map(e))) {
                       paste(sprintf("\n %s", s), collapse = "")
                   } else
                       NA_character_
               },
               "")
    write.dcf(x, keep.white = "features")
}

## The number of annotations.
length.Annotation <-
function(x)
    length(x$start)

## Merge annotation 'y' into 'x': annotations of 'y' which agree with
## one in 'x' in id, type, start and end have their features appended
## to those in 'x', all others are added as new annotations.
merge.Annotation <-
function(x, y, ...)
{
    pos <- match(paste(y$id, y$type, y$start, y$end, sep = "\r"),
                 paste(x$id, x$type, x$start, x$end, sep = "\r"),
                 nomatch = 0L)
    ## TODO:
    ## This should really combine the unique tag/value pairs.
    ## In fact, duplicated tags are a problem, but how should they be
    ## handled (take the pair from x or from y)?
    x$features[pos] <- Map(c, x$features[pos], y$features[pos > 0L])
    c(x, y[pos == 0L])
}

## meta.Annotation <-
## function(x, tag = NULL, ...)
## {
##     m <- attr(x, "meta")
##     if(is.null(tag)) m else m[[tag]]
## }

## `meta<-.Annotation` <-
## function(x, tag = NULL, ..., value)
## {
##     if(is.null(tag))
##         attr(x, "meta") <- value
##     else
##         attr(x, "meta")[[tag]] <- value
##     x
## }

## Annotations have no names (the slot names are an implementation
## detail).
names.Annotation <-
function(x)
    NULL

## print.Annotation <-
## function(x, values = TRUE, ...)
## {
##     writeLines(format(x, values = values))
##     invisible(x)
## }

## Subset by a logical expression evaluated with the slots as
## variables; NA results are treated as FALSE.
subset.Annotation <-
function(x, subset, ...)
{
    e <- substitute(subset)
    i <- eval(e, unclass(x), parent.frame())
    if(!is.logical(i)) stop("'subset' must be logical")
    i <- i & !is.na(i)
    x[i]
}

unique.Annotation <-
function(x, incomparables = FALSE, ...)
    x[!duplicated(x)]

## Header line and one line per annotation for the id, type, start and
## end slots.
.format_Annotation_without_features <-
function(x)
{
    sprintf(" %s %s %s %s ",
            .format_values_with_header(x$id, "id", "right"),
            .format_values_with_header(x$type, "type", "left"),
            .format_values_with_header(x$start, "start", "right"),
            .format_values_with_header(x$end, "end", "right"))
}

## Format header 'h' and values 'v' to a common width.  The header is
## always left justified, the values as given by 'justify'.
##
## Missing values are shown as "NA".  They are replaced before the
## width is computed because nchar() gives NA for missing strings,
## which would otherwise end up in the sprintf() format.
.format_values_with_header <-
function(v, h, justify = c("left", "right"))
{
    justify <- match.arg(justify)
    vals <- as.character(v)
    vals[is.na(vals)] <- "NA"
    width <- max(nchar(h), nchar(vals))
    len <- length(vals)
    fmt <- sprintf("%%%s%ds",
                   c("-",
                     rep.int(if(justify == "left") "-" else "", len)),
                   rep.int(width, len + 1L))
    sprintf(fmt, c(h, vals))
}

## Try formatting feature maps nicely.
## Similar to what we do in package 'sets', I guess ...
## Gives one "tag=value" string per feature.
.format_feature_map <-
function(x, ...)
{
    if(!length(x)) return(character())
    sprintf("%s=%s", names(x), vapply(x, .format_feature_value, ""))
}

## Formatter for a single value.
## Gives a short string representation of a single feature value.
## Scalars are shown as such (long strings are abbreviated); other
## values are shown as a <<...>> placeholder giving their kind and size.
.format_feature_value <-
function(x)
{
    ## Could also make this a generic, which currently seems an
    ## overkill, in particular if it is not exported so that no one else
    ## can register methods.
    if(inherits(x, "Stanford_typed_dependencies"))
        sprintf("<<%s,%s>>", class(x)[1L], nrow(x))
    else if(is.object(x))
        sprintf("<<%s>>", class(x)[1L])
    else if(is.array(x))
        ## The format needs a conversion for the dimensions; without
        ## one they are dropped from the result.
        sprintf("<<array,%s>>", paste(dim(x), collapse = ","))
    else if(is.character(x) && (length(x) == 1L)) {
        ## nchar() is NA for a missing string, which cannot be used as
        ## an if() condition, so handle this case first.
        if(is.na(x))
            format(x)
        else if(nchar(x) <= 32L)
            x
        else
            "<<character,1>>"
    }
    else if(is.atomic(x) && (length(x) == 1L)) {
        ## TODO:
        ## Should this take ... args?
        ## Also, might want to ensure this does not get too long.
        format(x)
    }
    else if(is.vector(x))
        sprintf("<<%s,%s>>", typeof(x), length(x))
    else if(is.null(x))
        "NULL"
    else
        "<>"
}

## For each span in 'y', the annotations in 'x' contained in it.
## Returns a list with one Annotation object per span.
annotations_in_spans <-
function(x, y)
{
    y <- as.Span(y)

    ## An annotation node is contained in a span if it does not start
    ## ahead of the span and does not end later than the span.

    ind <- outer(x$start, y$start, ">=") & outer(x$end, y$end, "<=")

    lapply(seq_len(ncol(ind)), function(j) x[ind[, j]])
}

## Feature values of an Annotation (or of the annotation of an
## AnnotatedPlainTextDocument) as a data frame with one column per
## feature tag.  'type' optionally restricts to annotations of the
## given types (partial matching allowed).  With simplify = TRUE,
## columns whose values all have length one are unlisted.
features <-
function(x, type = NULL, simplify = TRUE)
{
    if(inherits(x, "AnnotatedPlainTextDocument"))
        x <- x$annotation
    else if(!is.Annotation(x))
        stop("argument 'x' must be an Annotation object")

    if(!is.null(type)) {
        types <- unique(x$type)
        i <- pmatch(type, types)
        if(any(is.na(i)))
            stop("incomplete or invalid 'type'")
        x <- x[x$type %in% types[i]]
    }

    features <- x$features
    tags <- unique(unlist(lapply(features, names)))
    y <- lapply(tags,
                function(tag) lapply(features, `[[`, tag))
    if(simplify)
        y <- lapply(y, .simplify)
    names(y) <- tags
    ## Build the data frame directly so that list columns are kept.
    class(y) <- "data.frame"
    attr(y, "row.names") <- .set_row_names(length(features))
    y
}

## Unlist 'x' if all its elements have length one.
.simplify <-
function(x)
{
    if((length(len <- unique(lengths(x))) == 1L) && (len == 1L))
        unlist(x, recursive = FALSE)
    else
        x
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/wordlist.R0000644000175100001440000000177612521153434013127 0ustar hornikusers

## Create a word list document by reading one word per line from 'con'.
WordListDocument <-
function(con, encoding = "unknown", meta = list())
{
    words <- readLines(con, encoding = encoding, warn = FALSE)
    doc <- list(content = words, meta = meta)
    class(doc) <- c("WordListDocument", "TextDocument")
    doc
}

format.WordListDocument <-
function(x, ...)
    c(.format_TextDocument(x),
      sprintf("Content: words: %d", length(x$content)))

## print.WordListDocument <-
## function(x, ...)
## {
##     writeLines(sprintf("<>",
##                        length(x$content)))
##     invisible(x)
## }

content.WordListDocument <-
function(x)
    x$content

## meta.WordListDocument <-
## function(x, tag = NULL, ...)
##     if(is.null(tag)) x$meta else x$meta[[tag]]

## `meta<-.WordListDocument` <-
## function(x, tag = NULL, ..., value)
## {
##     if(is.null(tag))
##         x$meta <- value
##     else
##         x$meta[[tag]] <- value
##     x
## }

## For a word list, the character representation and the words are
## both just the content.
as.character.WordListDocument <-
words.WordListDocument <-
function(x, ...)
    x$content

## --- archive member header residue (kept verbatim) ---
## NLP/R/annotate.R0000644000175100001440000000053612517716566013102 0ustar hornikusers

## annotate() can use a single annotator or an annotator pipeline or
## something coercible to this, such as a list of annotators, and
## recursively calls the given annotators and merges annotations.

annotate <-
function(s, f, a = Annotation())
{
    s <- as.String(s)
    for(e in as.Annotator_Pipeline(f))
        a <- merge(a, e(s, a))
    a
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/language.R0000644000175100001440000002314213334312770013035 0ustar hornikusers

## Parse IETF language tags.
##
## Arguments:
##   x       character vector of language tags.  Missing or empty
##           elements are allowed and give empty results.
##   expand  if TRUE, subtags are replaced by their descriptions from
##           the IANA language subtag registry.
##
## Returns a list named by the tags.  With expand = FALSE, each element
## is a character vector of "Kind=subtag" strings, with kinds Language,
## Extension, Script, Region, Variant, Privateuse and Grandfathered.
##
## Signals an error listing all invalid tags.
parse_IETF_language_tag <-
function(x, expand = FALSE)
{
    n <- length(x)
    y <- rep.int(list(character()), n)
    names(y) <- x

    ## Throughout, 'x' holds the not yet parsed remainders of the tags
    ## still being processed, and 'pos' the positions of these tags in
    ## 'y'.  The two are always kept at the same length: tags which are
    ## done are dropped from both.

    ## How nice should we be?
    ## Allow for empty or missing elements ...
    pos <- seq_along(x)
    if(any(ind <- (is.na(x) | (x == "")))) {
        pos <- pos[!ind]
        x <- x[pos]
    }

    ## See the IETF language tag specification (BCP 47).
    ## Language tags can be of the form (in ABNF):
    ##   langtag / privateuse / grandfathered
    ## where
    ##   privateuse    = ("x"/"X") 1*("-" (1*8alphanum))
    ##   grandfathered = 1*3ALPHA 1*2("-" (2*8alphanum))

    re_privateuse <- "[xX]((-[[:alnum:]]{1,8}){1,})"

    ## Grandfathered tags must really be determined by exact matching.

    ind <- !is.na(match(x,
                        IANA_language_subtag_registry_grandfathered_table$Tag))
    if(any(ind)) {
        y[pos[ind]] <- as.list(sprintf("Grandfathered=%s", x[ind]))
        x <- x[!ind]
        pos <- pos[!ind]
    }

    if(length(pos)) {
        pat <- sprintf("^%s$", re_privateuse)
        ind <- grepl(pat, x, perl = TRUE)
        if(any(ind)) {
            y[pos[ind]] <-
                as.list(sprintf("Privateuse=%s", substring(x[ind], 3L)))
            x <- x[!ind]
            pos <- pos[!ind]
        }
    }

    ## Now for the real thing.

    ## Remaining tags should now be as follows:
    ##   (language
    ##    ["-" script]
    ##    ["-" region]
    ##    *(["-" variant])
    ##    *(["-" extension])
    ##    ["-" privateuse]
    ## where
    ##   language  = (2*3ALPHA [-extlang]) ; shortest ISO 639 code
    ##               / 4ALPHA              ; reserved for future use
    ##               / 5*8ALPHA            ; registered language subtag
    ##   extlang   = *3("-" 3*ALPHA)       ; reserved for future use
    ##   script    = 4ALPHA                ; ISO 15924 code
    ##   region    = 2ALPHA                ; ISO 3166 code
    ##               / 3DIGIT              ; UN M.49 code
    ##   variant   = 5*8alphanum           ; registered variants
    ##               / (DIGIT 3alphanum)
    ##   extension = singleton 1*("-" (2*8alphanum))
    ##   singleton = %x41-57 / %x59-5A / %x61-77 / %x79-7A / DIGIT
    ##               ; "a"-"w" / "y"-"z" / "A"-"W" / "Y"-"Z" / "0"-"9"
    ## We handle language/extlang a bit differently (more generously).

    re_extlang <- "[[:alpha:]]{3}"
    re_language <-
        sprintf("[[:alpha:]]{2,3}(-%s){0,3}|[[:alpha:]]{4,8}",
                re_extlang)
    re_script <- "[[:alpha:]]{4}"
    re_region <- "[[:alpha:]]{2}|[[:digit:]]{3}"
    re_variant <- "[[:alnum:]]{5,8}|[[:digit:]][[:alnum:]]{3}"
    re_singleton <-
        "[abcdefghijklmnopqrstuvwyzABCDEFGHIJKLMNOPQRSTUVWYZ0123456789]"
    ## Group 1 is the singleton and group 2 *all* the subtags of the
    ## extension.  (A repeated group only captures its last repetition,
    ## hence the additional enclosing group.)
    re_extension <- sprintf("(%s)((-[[:alnum:]]{2,8}){1,})", re_singleton)

    ## Positions in 'y' of the invalid tags.
    bad <- integer()

    if(length(pos)) {
        pat <- sprintf("^(%s)(-.*|$)", re_language)
        ind <- grepl(pat, x, perl = TRUE)
        if(!all(ind)) {
            bad <- pos[!ind]
            ## Nothing more to do for these: dropped below.
            x[!ind] <- ""
        }
        ## NOTE(review): extlang subtags are recorded with the
        ## "Extension" label, so they take part in the singleton check
        ## below and are looked up as extensions when expanding.
        ## Confirm whether a separate label is wanted.
        y[pos[ind]] <-
            lapply(strsplit(sub(pat, "\\1", x[ind], perl = TRUE),
                            "-", fixed = TRUE),
                   function(e) {
                       c(sprintf("Language=%s", e[1L]),
                         sprintf("Extension=%s", e[-1L]))
                   })
        ## Group 2 is the last extlang (if any), group 3 the rest.
        x[ind] <- sub(pat, "\\3", x[ind], perl = TRUE)
        ind <- nzchar(x)
        pos <- pos[ind]
        x <- x[ind]
    }

    if(length(pos)) {
        repeat {
            ## Use a loop so that we can finally stop when done.

            ## Script.
            pat <- sprintf("^-(%s)(-.*|$)", re_script)
            if(any(ind <- grepl(pat, x, perl = TRUE))) {
                y[pos[ind]] <-
                    Map(c,
                        y[pos[ind]],
                        sprintf("Script=%s",
                                sub(pat, "\\1", x[ind], perl = TRUE)))
                x[ind] <- sub(pat, "\\2", x[ind], perl = TRUE)
                ind <- nzchar(x)
                pos <- pos[ind]
                x <- x[ind]
                if(!length(x)) break
            }

            ## Region.
            pat <- sprintf("^-(%s)(-.*|$)", re_region)
            if(any(ind <- grepl(pat, x, perl = TRUE))) {
                y[pos[ind]] <-
                    Map(c,
                        y[pos[ind]],
                        sprintf("Region=%s",
                                sub(pat, "\\1", x[ind], perl = TRUE)))
                x[ind] <- sub(pat, "\\2", x[ind], perl = TRUE)
                ind <- nzchar(x)
                pos <- pos[ind]
                x <- x[ind]
                if(!length(x)) break
            }

            ## Variant(s).
            pat <- sprintf("^-(%s)(-.*|$)", re_variant)
            while(any(ind <- grepl(pat, x, perl = TRUE))) {
                y[pos[ind]] <-
                    Map(c,
                        y[pos[ind]],
                        sprintf("Variant=%s",
                                sub(pat, "\\1", x[ind], perl = TRUE)))
                x[ind] <- sub(pat, "\\2", x[ind], perl = TRUE)
                ind <- nzchar(x)
                pos <- pos[ind]
                x <- x[ind]
            }
            if(!length(x)) break

            ## Extension(s).
            pat <- sprintf("^-%s(-.*|$)", re_extension)
            while(any(ind <- grepl(pat, x, perl = TRUE))) {
                ## We keep the singleton prefix: this could be used in
                ## expansions of registered extensions: currently,
                ##   BCP 47 Extension U
                ##   BCP 47 Extension T
                y[pos[ind]] <-
                    Map(c,
                        y[pos[ind]],
                        sprintf("Extension=%s",
                                sub(pat, "\\1\\2", x[ind], perl = TRUE)))
                ## Groups: singleton, all subtags, last subtag, rest.
                x[ind] <- sub(pat, "\\4", x[ind], perl = TRUE)
                ind <- nzchar(x)
                pos <- pos[ind]
                x <- x[ind]
            }
            if(!length(x)) break

            ## Private use.
            pat <- sprintf("^-%s(-.*|$)", re_privateuse)
            if(any(ind <- grepl(pat, x, perl = TRUE))) {
                y[pos[ind]] <-
                    Map(c,
                        y[pos[ind]],
                        sprintf("Privateuse=%s",
                                substring(sub(pat, "\\1", x[ind],
                                              perl = TRUE),
                                          2L)))
                ## Groups: all subtags, last subtag, rest.  Keeping
                ## the rest ensures that trailing garbage is reported
                ## as invalid below.
                x[ind] <- sub(pat, "\\3", x[ind], perl = TRUE)
            }

            break
        }
    }

    ## Be a nuisance: singletons for extensions must not be duplicated.
    ind <- as.logical(lapply(y,
                             function(e) {
                                 e <- grep("^Extension=", e, value = TRUE)
                                 if(!length(e)) return(FALSE)
                                 any(duplicated(sub("^Extension=(.).*",
                                                    "\\1", e)))
                             }))
    if(any(ind))
        bad <- c(bad, which(ind))

    ## Whatever could not be parsed makes the tag invalid.
    if(any(ind <- nzchar(x))) {
        bad <- c(bad, pos[ind])
    }

    if(length(bad)) {
        stop("Invalid language tag(s):",
             paste("\n ", names(y)[bad], collapse = " "),
             call. = FALSE)
    }

    if(!expand) return(y)

    x <- tolower(unlist(y))
    pos <- match(x, IANA_language_subtag_registry$Index)
    z <- IANA_language_subtag_registry$Description[pos]
    ## Special case private use ranges.
    if(!all(lengths(z))) {
        private <-
            match(x,
                  IANA_language_subtag_registry_private_use_index_table,
                  nomatch = 0L) > 0L
        z[private | grepl("^privateuse=", x)] <- "Private use"
    }
    ## Regroup by tag.  Using a factor with all positions as levels
    ## keeps (empty) groups for tags without subtags, so the result
    ## stays aligned with 'y'.
    groups <- factor(rep.int(seq_along(y), lengths(y)),
                     levels = seq_along(y))
    z <- Map(`names<-`, split(z, groups), y)
    names(z) <- names(y)
    z
}

## Read the IANA language subtag registry from 'con' into a data frame
## with one row per record, an additional lower case 'Index' variable
## of the form "type=subtag" for lookups, and the file date as
## attribute "File_Date".
get_IANA_language_subtag_registry <-
function(con = "https://www.iana.org/assignments/language-subtag-registry")
{
    ## This is a collection of records in tag-value format, but
    ## unfortunately separated by '%%' lines rather than empty lines, so
    ## we cannot use read.dcf() directly.  Let us keep things simple:
    ## extract the records, write them out as DCF, and call read.dcf().

    lines <- readLines(con)
    ## The first line is something like
    ##   File-Date: 2009-03-13
    ## which we drop for reading the records.
    fdate <- sub(".*: *", "", lines[1L])
    pos <- grep("^%%", lines)
    ## Blank everything up to the first separator and all separators.
    lines[c(seq_len(pos[1L]), pos[-1L])] <- ""
    tcon <- textConnection(lines, encoding = "UTF-8")
    on.exit(close(tcon))
    db <- read.dcf(tcon, all = TRUE)
    ## Add index for lookups.
    subtag <- db$Subtag
    db$Index <-
        tolower(sprintf("%s=%s",
                        db$Type,
                        ifelse(is.na(subtag), db$Tag, subtag)))
    db$Type <- factor(db$Type)
    attr(db, "File_Date") <- fdate
    db
}

## Private use subtags: languages qaa to qtz, scripts Qaaa to Qabx, and
## regions QM to QZ and XA to XZ.
IANA_language_subtag_registry_language_private_use_subtags <-
    outer(letters[1L : 20L], letters,
          function(u, v) sprintf("q%s%s", u, v))

IANA_language_subtag_registry_script_private_use_subtags <-
    outer(c("a", "b"), letters[1L : 24L],
          function(u, v) sprintf("Qa%s%s", u, v))

IANA_language_subtag_registry_region_private_use_subtags <-
    c(sprintf("Q%s", LETTERS[13L : 26L]),
      sprintf("X%s", LETTERS))

IANA_language_subtag_registry_private_use_index_table <-
    tolower(c(sprintf("Language=%s",
                      IANA_language_subtag_registry_language_private_use_subtags),
              sprintf("Script=%s",
                      IANA_language_subtag_registry_script_private_use_subtags),
              sprintf("Region=%s",
                      IANA_language_subtag_registry_region_private_use_subtags)))

## --- archive member header residue (kept verbatim) ---
## NLP/R/tree.R0000644000175100001440000000671712517657432012233 0ustar hornikusers

## Create a tree node with the given value and list of children
## (subtrees or leaves).
Tree <-
function(value, children = list())
{
    y <- list(value = value, children = as.list(children))
    class(y) <- "Tree"
    y
}

## Format a tree in bracketed form.  Trees which fit into 'width'
## (taking 'indent' into account) are formatted on a single line;
## otherwise each child goes on a separate, further indented line.
format.Tree <-
function(x, width = 0.9 * getOption("width"), indent = 0,
         brackets = c("(", ")"), ...)
{
    ## Single line format.
    ffmt <- function(x) {
        sprintf("%s%s %s%s",
                brackets[1L],
                x$value,
                paste(sapply(x$children,
                             function(e) {
                                 if(inherits(e, "Tree"))
                                     ffmt(e)
                                 else
                                     format(e)
                             }),
                      collapse = " "),
                brackets[2L])
    }

    s <- ffmt(x)
    if(nchar(s) + indent < width)
        return(s)

    y <- sapply(x$children,
                function(e) {
                    if(inherits(e, "Tree"))
                        format(e, width = width, indent = indent + 2L,
                               brackets = brackets)
                    else
                        format(e)
                })
    y <- sprintf("\n%s%s",
                 paste(rep.int(" ", indent + 2L), collapse = ""),
                 y)

    sprintf("%s%s%s%s",
            brackets[1L],
            x$value,
            paste(y, collapse = ""),
            brackets[2L])
}

## print.Tree <-
## function(x, ...)
## {
##     writeLines(format(x, ...))
##     invisible(x)
## }

## Parse the bracketed representation of a tree in string 'x'.
##
## 'brackets' gives the opening and closing bracket, each assumed to be
## a single character.  Returns a Tree object; signals an error if the
## string does not consist of exactly one well-formed bracketed tree.
Tree_parse <-
function(x, brackets = c("(", ")"))
{
    errfmt <- function(token, expected) {
        sprintf("expected %s but got %s", expected, token)
    }

    b_open <- brackets[1L]
    b_close <- brackets[2L]

    re_o <- sprintf("\\%s", b_open)             # open
    re_c <- sprintf("\\%s", b_close)            # close
    re_n <- sprintf("[^\\s%s%s]+", re_o, re_c)  # node
    re_l <- sprintf("[^\\s%s%s]+", re_o, re_c)  # leaf
    re <- sprintf("%s\\s*(%s)?|%s|(%s)",
                  re_o, re_n, re_c, re_l)
    m <- gregexpr(re, x, perl = TRUE)
    ## The bottom of the stack collects the top level tree(s) as its
    ## second element; every other stack element is a Tree under
    ## construction, with the children as second element.
    stack <- list(list(NULL, list()))
    for(token in regmatches(x, m)[[1L]]) {
        ## Dispatch on the brackets actually asked for, as used for
        ## tokenizing.
        if(substring(token, 1L, 1L) == b_open) {
            if((length(stack) == 1L) &&
               (length(stack[[1L]][[2L]]) > 0L))
                stop(errfmt(sQuote(token), "end of string"))
            value <- sub("\\s*", "", substring(token, 2L))
            stack <- c(stack, list(Tree(value, list())))
        } else if(token == b_close) {
            if((n <- length(stack)) == 1L) {
                if(!length(stack[[1L]][[2L]]))
                    stop(errfmt(sQuote(token), sQuote(b_open)))
                else
                    stop(errfmt(sQuote(token), "end of string"))
            }
            ## Finished subtree: add to the children of its parent.
            elt <- stack[[n]]
            ## class(elt) <- "Tree"
            stack <- stack[-n]
            n <- n - 1L
            stack[[n]][[2L]] <- c(stack[[n]][[2L]], list(elt))
        } else {
            ## Leaf: add to the children of the current tree.
            if((n <- length(stack)) == 1L)
                stop(errfmt(sQuote(token), sQuote(b_open)))
            stack[[n]][[2L]] <- c(stack[[n]][[2L]], list(token))
        }
    }
    if(length(stack) > 1L)
        stop(errfmt("end of string", sQuote(b_close)))
    else if(!length(stack[[1L]][[2L]]))
        stop(errfmt("end of string", sQuote(b_open)))

    stack[[1L]][[2L]][[1L]]
}

## Apply function 'f' to the children of tree 'x'.  With
## recursive = TRUE, 'f' is also applied to the children of subtrees,
## and for a subtree the result is a list of the value for the subtree
## and the list of results for its children.
Tree_apply <-
function(x, f, recursive = FALSE)
{
    if(!recursive) return(lapply(x$children, f))

    g <- function(e) {
        y <- f(e)
        if(inherits(e, "Tree")) list(y, lapply(e$children, g)) else y
    }

    lapply(x$children, g)
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/ngram.R0000644000175100001440000000061012474110665012354 0ustar hornikusers

## All n-grams of 'x' (subsequences of n consecutive elements) for the
## given n, as a list.  Values of n which are less than one or exceed
## the length of 'x' are ignored.
ngrams <-
function(x, n)
{
    N <- length(x)
    n <- n[(n >= 1L) & (n <= N)]
    lapply(unlist(lapply(n,
                         function(k) {
                             pos <- seq_len(k)
                             lapply(seq.int(0, N - k),
                                    `+`, pos)
                         }),
                  recursive = FALSE),
           function(e) x[e])
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/string.R0000644000175100001440000000470413334576423012572 0ustar hornikusers

## A simple string class.

String <-
function(x)
{
    .String_from_string(as.character(x)[[1L]])
}
## Note subscripting by [[: this insists on the first element, and
## hence gives an error instead of NA_character_ if there is none.

as.String <-
function(x)
    UseMethod("as.String")

as.String.String <- identity

## By default, the elements are pasted together using newlines.
as.String.default <-
function(x)
    String(paste(x, collapse = "\n"))

is.String <-
function(x)
    inherits(x, "String")

## Write the string, and return it invisibly as print methods usually
## do.
print.String <-
function(x, ...)
{
    writeLines(x)
    invisible(x)
}

## Provide a '[' method performing slicing (as we cannot provide S3
## methods for substr, and clearly substrings of strings should be
## strings.

## Note that we have no distinction between spans and span arrays (same
## issue as having no distinction between strings and string arrays in
## base R).  Hence, we take spans to always operate in an array context
## (for now: we could add a drop = FALSE argument to have subscripting
## turn character vectors of length one back to strings again).

## x[i] with a Span object i gives the character vector of the
## substrings for the spans, and with a list of Span objects a list of
## such vectors.  Otherwise, x[i, j] gives the substring from position
## i to position j as a String.
`[.String` <-
function(x, i, j)
{
    mysubstring <- function(x, i, j) {
        ## substring() recycles to max length of args only when this is
        ## positive.
        if(!length(i))
            character()
        else
            substring(x, i, j)
    }

    if(missing(j)) {
        if(is.Span(i))
            return(mysubstring(x, i$start, i$end))
        if(is.list(i)) {
            if(!length(i))
                return(list())
            else if(all(vapply(i, is.Span, NA)))
                return(lapply(i,
                              function(e)
                              mysubstring(x, e$start, e$end)))
        }
    }
    ## Regular slicing operators in a scalar context.
    String(substr(x, i, j))
}

## More methods?
##
## A popular mailing list discussion item is to use a Java style '+'
## operator for concatenating strings (not uniformly liked as the
## corresponding operation is not commutative):

`+.String` <-
function(e1, e2)
    .String_from_string(paste0(as.String(e1), as.String(e2)))

## Also provide Python-style string repetition.

`*.String` <-
function(e1, e2)
{
    if(is.numeric(e1) && (length(e1) == 1L))
        .String_from_string(paste(rep.int(e2, e1), collapse = ""))
    else if(is.numeric(e2) && (length(e2) == 1L))
        .String_from_string(paste(rep.int(e1, e2), collapse = ""))
    else
        stop("Invalid operands.")
}

## What about c.String?

## Convert to UTF-8 and attach the class.
.String_from_string <-
function(x)
{
    y <- enc2utf8(x)
    class(y) <- "String"
    y
}

## --- archive member header residue (kept verbatim) ---
## NLP/R/annotators.R0000644000175100001440000003543313357324620013452 0ustar hornikusers

## All annotators should have formals s and a, giving the string to
## annotate and an annotation to start from, and return "their own"
## annotation.

## Create an annotator from function 'f', which must have formals 's'
## and 'a' (in this order).  'meta' is stored as attribute "meta", and
## the class is obtained from 'classes' via .classes_with_default().
Annotator <-
function(f, meta = list(), classes = NULL)
{
    if(!identical(names(formals(f)), c("s", "a")))
        stop("Annotators must have formals 's' and 'a'.")
    attr(f, "meta") <- meta
    class(f) <- .classes_with_default(classes, "Annotator")
    f
}

is.Annotator <-
function(x)
    inherits(x, "Annotator")

## Gives the classes of the annotator and its "description" metadata,
## if available.
format.Annotator <-
function(x, ...)
{
    d <- meta(x, "description")
    c(sprintf("An annotator inheriting from classes\n %s",
              paste(class(x), collapse = " ")),
      if(is.null(d)) {
          "with no additional description."
      } else {
          c("with description",
            strwrap(d, indent = 2L, exdent = 2L))
      })
}

## Annotator generators.
## Provide annotator generators for composite basic NLP tasks (e.g.,
## obtaining POS tags for the tokens in all sentences) based on
## functions which perform simple tasks (e.g., obtaining POS tags for
## the token in a single sentence) and return spans/features or simple
## annotations (but do not provide ids themselves).

Simple_Para_Token_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple paragraph tokenizer, which takes a string s
    ## representing the whole text, and returns the spans of the
    ## paragraphs in s, or a simple annotation with these spans and
    ## (possibly) additional features.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Para_Token_Annotator")

    annotator <- function(s, a = Annotation()) {
        s <- as.String(s)
        paragraphs <- f(s)
        count <- length(paragraphs)
        ## Ids continue after the ones used in the given annotation.
        ids <- .seq_id(next_id(a$id), count)
        types <- rep.int("paragraph", count)
        if(is.Annotation(paragraphs)) {
            ## Could check whether ids are really missing.
            paragraphs$id <- ids
            paragraphs$type <- types    # Just making sure ...
            paragraphs
        } else if(is.Span(paragraphs))
            as.Annotation(paragraphs, id = ids, type = types)
        else
            stop("Invalid result from underlying paragraph tokenizer.")
    }

    Annotator(annotator, meta, classes)
}

Simple_Sent_Token_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple sentence tokenizer, which takes a string s
    ## representing the whole text, and returns the spans of the
    ## sentences in s, or a simple annotation with these spans and
    ## (possibly) additional features.

    ## Note that in case paragraph annotations are available, we
    ## (currently) do not split the whole text into paragraphs before
    ## performing sentence tokenization.  Instead, we add a sentence
    ## constituents feature for the paragraphs.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Sent_Token_Annotator")

    annotator <- function(s, a = Annotation()) {
        s <- as.String(s)
        sentences <- f(s)
        count <- length(sentences)
        ids <- .seq_id(next_id(a$id), count)
        types <- rep.int("sentence", count)
        if(is.Annotation(sentences)) {
            ## Could check whether ids are really missing.
            sentences$id <- ids
            sentences$type <- types     # Just making sure ...
        } else if(is.Span(sentences)) {
            sentences <- as.Annotation(sentences, id = ids, type = types)
        } else
            stop("Invalid result from underlying sentence tokenizer.")

        para_index <- which(a$type == "paragraph")
        if(!length(para_index))
            return(sentences)

        ## Return the paragraphs as well, with the ids of their
        ## sentences as constituents.
        paragraphs <- a[para_index]
        paragraphs$features <-
            lapply(annotations_in_spans(sentences, paragraphs),
                   function(e) list(constituents = e$id))
        c(sentences, paragraphs)
    }

    Annotator(annotator, meta, classes)
}

Simple_Word_Token_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple "word" tokenizer, which takes a string s
    ## representing a single sentence, and returns the spans of the word
    ## tokens in s, or a simple annotation with these spans and
    ## (possibly) additional features.

    ## The generated annotator adds the sentence offsets and unique
    ## word token ids, and constituents features for the sentences.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Word_Token_Annotator")

    annotator <- function(s, a) {
        s <- as.String(s)

        ## Use the given annotation to extract the sentences.
        sent_index <- which(a$type == "sentence")
        if(!length(sent_index))
            stop("no sentence token annotations found")

        ## Obtain the results of the word tokenizer for these sentences.
        tokens <- lapply(substring(s, a$start[sent_index], a$end[sent_index]),
                         f)

        ## Offsets to add to turn positions within a sentence into
        ## positions within the whole text.
        offsets <- a$start[sent_index] - 1L

        ## Compute ids for the word tokens, and turn results into
        ## annotations.
        ## If m is the maximal id used in a and sentence i has n_i
        ## tokens, then the ids for these start from
        ##   m + 1 + sum(n_j: j < i)
        ## and have length n_i, of course.
        if(all(vapply(tokens, is.Annotation, NA))) {
            tokens <- Map(function(u, v) {
                              u$start <- u$start + v
                              u$end <- u$end + v
                              u
                          },
                          tokens,
                          offsets)
            counts <- lengths(tokens)
            id <- Map(.seq_id,
                      next_id(a$id) + c(0L, cumsum(head(counts, -1L))),
                      counts)
            type <- Map(rep.int, "word", counts)
            tokens <- Map(function(u, id, type) {
                              u$id <- id
                              u$type <- type  # Just making sure ...
                              u
                          },
                          tokens, id, type)
        } else if(all(vapply(tokens, is.Span, NA))) {
            tokens <- Map(`+`, tokens, offsets)   # Add sentence offsets.
            counts <- lengths(tokens)
            id <- Map(.seq_id,
                      next_id(a$id) + c(0L, cumsum(head(counts, -1L))),
                      counts)
            type <- Map(rep.int, "word", counts)
            tokens <- Map(function(u, id, type)
                              as.Annotation(u, id = id, type = type),
                          tokens, id, type)
        } else
            stop("Invalid result from underlying word tokenizer.")

        ## Constituent features for the sentences.
        sentences <- a[sent_index]
        sentences$features <- lapply(id, single_feature, "constituents")

        ## Combine sentence annotation with constituent features and the
        ## word token annotations.
        c(sentences, do.call(c, tokens))
    }

    Annotator(annotator, meta, classes)
}

Simple_POS_Tag_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple POS tagger, which takes a character vector
    ## giving the word tokens in a sentence, and returns either a
    ## character vector with the tags, or a list of feature maps with
    ## the tags as 'POS' feature and possibly other features.

    ## The generated annotator simply computes an annotation for the
    ## word tokens with the features obtained from the POS tagger.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_POS_Tag_Annotator")

    annotator <- function(s, a) {
        s <- as.String(s)

        per_sentence <- annotations_in_spans(a[a$type == "word"],
                                             a[a$type == "sentence"])
        if(!length(per_sentence))
            stop("no sentence token annotations found")
        if(!any(lengths(per_sentence) > 0L))
            stop("no word token annotations found")

        tagged <- lapply(s[per_sentence], f)
        if(all(vapply(tagged, is.character, NA)))
            features <- lapply(unlist(tagged), single_feature, "POS")
        else if(all(vapply(tagged, is.list, NA)))
            features <- unlist(tagged, recursive = FALSE)
        else
            stop("Invalid result from underlying POS tagger.")

        word_tokens <- do.call(c, per_sentence)
        word_tokens$features <- features

        ## As simple POS taggers do not return annotations, information
        ## about the POS tagset cannot be passed as annotation metadata.
        ## Instead, for now we look for a 'POS_tagset' attribute.
        ## Similarly for 'POS_tagset_URL'.
        for(tag in c("POS_tagset", "POS_tagset_URL")) {
            val <- attr(f, tag)
            if(!is.null(val))
                attr(word_tokens, "meta")[[tag]] <- val
        }

        word_tokens
    }

    Annotator(annotator, meta, classes)
}

Simple_Entity_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple entity detector ("named entity recognizer")
    ## which takes a character vector giving the word tokens in a
    ## sentence, and return a simple annotation containing the word
    ## token spans and types of the entities found.

    ## The generated annotator adds ids and transforms word token spans
    ## to character spans.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Entity_Annotator")

    annotator <- function(s, a) {
        s <- as.String(s)

        first_id <- next_id(a$id)

        per_sentence <- annotations_in_spans(a[a$type == "word"],
                                             a[a$type == "sentence"])
        if(!length(per_sentence))
            stop("no sentence token annotations found")
        if(!any(lengths(per_sentence) > 0L))
            stop("no word token annotations found")

        find_entities <- function(e) {
            result <- f(s[e])
            if(!inherits(result, "Annotation"))
                stop("Invalid result from underlying name finder.")
            ## Positions of the first and last word token of an entity
            ## give the character offsets of the entity.
            result$start <- e$start[result$start]
            result$end <- e$end[result$end]
            result
        }

        entities <- do.call(c, lapply(per_sentence, find_entities))
        entities$id <- .seq_id(first_id, length(entities))

        entities
    }

    Annotator(annotator, meta, classes)
}

Simple_Chunk_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple chunker, which takes character vectors
    ## giving the word tokens and the corresponding POS tags as inputs,
    ## and returns either a character vector with the chunk tags, or a
    ## list of feature maps with the tags as 'chunk_tag' feature and
    ## possibly other features.

    ## The generated annotator simply extracts the word token
    ## annotations for the sentences, obtains the chunk features for
    ## these, and returns the word token annotations with these features
    ## (only).

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Chunk_Annotator")

    annotator <- function(s, a) {
        s <- as.String(s)

        per_sentence <- annotations_in_spans(a[a$type == "word"],
                                             a[a$type == "sentence"])
        if(!length(per_sentence))
            stop("no sentence token annotations found")
        if(!any(lengths(per_sentence) > 0L))
            stop("no word token annotations found")

        chunk_one_sentence <- function(e)
            f(s[e], .annotation_features_with_template(e, "POS"))

        chunked <- lapply(per_sentence, chunk_one_sentence)
        if(all(vapply(chunked, is.character, NA)))
            features <- lapply(unlist(chunked), single_feature, "chunk_tag")
        else if(all(vapply(chunked, is.list, NA)))
            features <- unlist(chunked, recursive = FALSE)
        else
            stop("Invalid result from underlying chunker.")

        word_tokens <- do.call(c, per_sentence)
        word_tokens$features <- features
        word_tokens
    }

    Annotator(annotator, meta, classes)
}

Simple_Stem_Annotator <- function(f, meta = list(), classes = NULL)
{
    ## f should be a simple stemmer, which takes a character vector of
    ## word tokens and returns the corresponding word stems.

    ## The generated annotator simply computes an annotation for the
    ## word tokens with the stem features obtained from the stemmer.

    force(f)

    classes <- .classes_with_default(classes,
                                     "Simple_Stem_Annotator")

    annotator <- function(s, a) {
        s <- as.String(s)

        word_tokens <- a[a$type == "word"]
        stems <- f(s[word_tokens])
        word_tokens$features <- lapply(stems, single_feature, "stem")

        word_tokens
    }

    Annotator(annotator, meta, classes)
}

sentence_constituents <-
function(a)
{
    i <- which(a$type == "sentence")
    constituents <- lapply(a$features[i], `[[`, "constituents")
    if(!all(lengths(constituents) > 0L)) {
        ## Looks like we have an annotation with no constituents
        ## features for the sentences ... need to compute these.
        ## Make sure sentences are ordered by character offsets.
        i <- i[order(a$end[i])]
        j <- which(a$type == "word")
        ## Should we also make sure tokens are ordered by character
        ## offsets?
k <- rowSums(outer(a$start[j], a$start[i], ">=")) constituents <- split(a$id[j], k) names(constituents) <- a$id[i][as.integer(names(constituents))] ## Assuming there can not be empty sentences, we could more ## simply do ## names(constituents) <- a$id[i] } else names(constituents) <- a$id[i] constituents } next_id <- function(id) .max_id(id) + 1L single_feature <- function(value, tag) { y <- list(value) names(y) <- tag y } .max_id <- function(id) { id <- id[!is.na(id)] if(!length(id)) 0L else max(id) } .seq_id <- function(f, l) as.integer(seq.int(from = f, length.out = l)) .classes_with_default <- function(classes, default) c(classes[classes != default], default) ## .simple_feature_map <- ## function(x, tag) ## { ## ## Turn a sequence of values x into a list of feature maps with ## ## given tag and respective values in x. ## lapply(x, single_feature, tag) ## } ### * Annotator pipelines Annotator_Pipeline <- function(..., meta = list()) { x <- list(...) if(!all(vapply(x, is.Annotator, FALSE))) stop("all pipeline elements must be annotator objects") .Annotator_Pipeline_from_list_and_meta(x, meta) } ## ## Should we move the is.Annotator checking here, perhaps with a way to ## turn it off? .Annotator_Pipeline_from_list_and_meta <- function(x, meta = list()) { attr(x, "meta") <- meta class(x) <- "Annotator_Pipeline" x } ## as.Annotator_Pipeline <- function(x) UseMethod("as.Annotator_Pipeline") as.Annotator_Pipeline.Annotator_Pipeline <- identity as.Annotator_Pipeline.Annotator <- function(x) .Annotator_Pipeline_from_list_and_meta(list(x)) as.Annotator_Pipeline.list <- function(x) { if(!all(vapply(x, is.Annotator, FALSE))) stop("all pipeline elements must be annotator objects") .Annotator_Pipeline_from_list_and_meta(x) } `[.Annotator_Pipeline` <- function(x, i) .Annotator_Pipeline_from_list_and_meta(unclass(x)[i], meta(x)) as.list.Annotator_Pipeline <- function(x, ...) { x <- unclass(x) attr(x, "meta") <- NULL x } ## No merging of metadata for now. 
c.Annotator_Pipeline <- function(..., recursive = FALSE) { annotators <- unlist(lapply(list(...), as.Annotator_Pipeline), recursive = FALSE) .Annotator_Pipeline_from_list_and_meta(annotators) } format.Annotator_Pipeline <- function(x, ...) sprintf("An annotator pipeline of length %d.", length(x)) NLP/R/conll.R0000644000175100001440000001136513143571336012370 0ustar hornikusersCoNLLTextDocument <- function(con, encoding = "unknown", format = "conll00", meta = list()) { if(length(format) == 1L) { format <- switch(format, conll00 = c(WORD = "WORD", POS = "POS", CHUNK = "CHUNK"), conll01 = c(WORD = "WORD", POS = "POS", CHUNK = "CHUNK", "CLAUSE"), conll02 = c(WORD = "WORD", NE = "NE"), ## conll03 would have different fields for the German ## variant conllx = c("ID", WORD = "FORM", "LEMMA", POS = "CPOSTAG", "POSTAG", "FEATS", "HEAD", "DEPREL", "PHEAD", "PDEPREL"), ## Corresponding to CoNLL X (10) from 2006, also used ## for conll07 conll09 = c("ID", WORD = "FORM", "LEMMA", "PLEMMA", POS = "POS", "PPOS", "FEAT", "PFEAT", "HEAD", "PHEAD", "DEPREL", "PDEPREL", "FILLPRED", "PRED", "APREDs")) } records <- scan(con, what = rep.int(list(""), length(format)), encoding = encoding, quote = NULL, quiet = TRUE, fill = TRUE, blank.lines.skip = FALSE) names(records) <- format ind <- (records[[1L]] == "") tab <- cbind(data.frame(sent = cumsum(ind) + 1L), as.data.frame(do.call(cbind, records), stringsAsFactors = FALSE))[!ind, ] attr(tab, "format") <- c("sent", format) doc <- list(content = tab, meta = meta) class(doc) <- c("CoNLLTextDocument", "TextDocument") doc } format.CoNLLTextDocument <- function(x, ...) { content <- x$content nr <- NROW(content) c(.format_TextDocument(x), sprintf("Content: words: %d, sents: %d", nr, content[[nr, "sent"]])) } ## print.CoNLLTextDocument <- ## function(x, ...) 
## { ## content <- x$content ## nr <- NROW(content) ## writeLines(sprintf("<>", ## nr, content[[nr, "sent"]])) ## invisible(x) ## } content.CoNLLTextDocument <- function(x) x$content ## meta.CoNLLTextDocument <- ## function(x, tag = NULL, ...) ## if(is.null(tag)) x$meta else x$meta[[tag]] ## `meta<-.CoNLLTextDocument` <- ## function(x, tag = NULL, ..., value) ## { ## if(is.null(tag)) ## x$meta <- value ## else ## x$meta[[tag]] <- value ## x ## } as.character.CoNLLTextDocument <- words.CoNLLTextDocument <- function(x, ...) { fmt <- attr(x$content, "format") pos <- .position_of_field(fmt, "WORD") x$content[[pos]] } sents.CoNLLTextDocument <- function(x, ...) { fmt <- attr(x$content, "format") pos <- .position_of_field(fmt, "WORD") split(x$content[[pos]], x$content$sent) } tagged_words.CoNLLTextDocument <- function(x, map = NULL, ...) { if(!is.null(map)) x <- .map_POS_tags_CoNLLTextDocument(x, map) fmt <- attr(x$content, "format") pos_W <- .position_of_field(fmt, "WORD") pos_P <- .position_of_field(fmt, "POS") Tagged_Token(x$content[[pos_W]], x$content[[pos_P]]) } tagged_sents.CoNLLTextDocument <- function(x, map = NULL, ...) { if(!is.null(map)) x <- .map_POS_tags_CoNLLTextDocument(x, map) fmt <- attr(x$content, "format") pos_W <- .position_of_field(fmt, "WORD") pos_P <- .position_of_field(fmt, "POS") split(Tagged_Token(x$content[[pos_W]], x$content[[pos_P]]), x$content$sent) } chunked_sents.CoNLLTextDocument <- function(x, ...) 
{ fmt <- attr(x$content, "format") pos_W <- .position_of_field(fmt, "WORD") pos_P <- .position_of_field(fmt, "POS") pos_C <- .position_of_field(fmt, "CHUNK") Map(chunk_tree_from_chunk_info, split(x$content[[pos_W]], x$content$sent), split(x$content[[pos_P]], x$content$sent), split(x$content[[pos_C]], x$content$sent)) } .map_POS_tags_CoNLLTextDocument <- function(x, map) { map <- POS_tag_mapper(map, meta(x, "POS_tagset")) fmt <- attr(x$content, "format") pos <- .position_of_field(fmt, "POS") x$content[[pos]] <- map(x$content[[pos]]) x } .position_of_field <- function(fmt, kind) { pos <- which(names(fmt) == kind) if(length(pos) != 1L) stop(gettextf("Cannot determine position of '%s'", kind), call. = FALSE, domain = NA) pos } NLP/R/ttd.R0000644000175100001440000001163713333632544012056 0ustar hornikusersTaggedTextDocument <- function(con, encoding = "unknown", word_tokenizer = whitespace_tokenizer, sent_tokenizer = Regexp_Tokenizer("\n", invert = TRUE), para_tokenizer = blankline_tokenizer, sep = "/", meta = list()) { s <- String(paste(readLines(con, encoding = encoding, warn = FALSE), collapse = "\n")) paras <- if(!is.null(para_tokenizer)) s[para_tokenizer(s)] else as.character(s) x <- lapply(paras, function(para) { ## Avoid as.String() coercion. spans <- sent_tokenizer(para) sents <- substring(para, spans$start, spans$end) lapply(sents, function(sent) { spans <- word_tokenizer(sent) words <- substring(sent, spans$start, spans$end) toks <- strsplit(words, sep, fixed = TRUE) one <- vapply(toks, `[[`, "", 1L) two <- vapply(toks, `[[`, "", 2L) data.frame(word = one, POS = toupper(two), stringsAsFactors = FALSE) }) }) ## Use sentence ids which are unique across paras. 
lens <- lapply(x, length) ids <- Map(function(f, l) as.integer(seq.int(from = f, length.out = l)), c(0L, head(cumsum(lens), -1L)) + 1L, lens) x <- Map(function(u, v) { cbind(data.frame(sent = rep.int(u, vapply(v, nrow, 0L))), do.call(rbind, v)) }, ids, x) doc <- list(content = x, meta = meta) class(doc) <- c("TaggedTextDocument", "TextDocument") doc } format.TaggedTextDocument <- function(x, ...) { content <- x$content len <- length(content) c(.format_TextDocument(x), sprintf("Content: words: %d, sents: %d, paras: %d", sum(vapply(content, NROW, 0L)), tail(content[[len]]$sent, 1L), len)) } ## print.TaggedTextDocument <- ## function(x, ...) ## { ## content <- x$content ## len <- length(content) ## writeLines(sprintf("<>", ## sum(vapply(content, NROW, 0L)), ## tail(content[[len]]$sent, 1L), ## len)) ## invisible(x) ## } content.TaggedTextDocument <- function(x) x$content ## meta.TaggedTextDocument <- ## function(x, tag = NULL, ...) ## if(is.null(tag)) x$meta else x$meta[[tag]] ## `meta<-.TaggedTextDocument` <- ## function(x, tag = NULL, ..., value) ## { ## if(is.null(tag)) ## x$meta <- value ## else ## x$meta[[tag]] <- value ## x ## } ## ## It would be nice if the as.character() method could "suitably" ## detokenize the word tokens into sentences. But this requires ## (a) knowing at least the language of the text ## (b) having code to detokenize when knowing the language ... ## as.character.TaggedTextDocument <- words.TaggedTextDocument <- function(x, ...) { unlist(lapply(x$content, `[[`, "word")) } ## ## Could more simply do ## sents.TaggedTextDocument <- function(x, ...) ## unlist(paras(x), recursive = FALSE) ## sents.TaggedTextDocument <- function(x, ...) { unlist(lapply(x$content, function(e) split(e$word, e$sent)), recursive = FALSE) } paras.TaggedTextDocument <- function(x, ...) { lapply(x$content, function(e) split(e$word, e$sent)) } tagged_words.TaggedTextDocument <- function(x, map = NULL, ...) 
{ if(!is.null(map)) { x <- .map_POS_tags_TaggedTextDocument(x, map) } Tagged_Token(unlist(lapply(x$content, `[[`, "word")), unlist(lapply(x$content, `[[`, "POS"))) } ## ## Could more simply do ## tagged_sents.TaggedTextDocument <- function(x, ...) ## unlist(tagged_paras(x), recursive = FALSE) ## tagged_sents.TaggedTextDocument <- function(x, map = NULL, ...) { if(!is.null(map)) { x <- .map_POS_tags_TaggedTextDocument(x, map) } unlist(lapply(x$content, function(e) split(Tagged_Token(e$word, e$POS), e$sent)), recursive = FALSE) } tagged_paras.TaggedTextDocument <- function(x, map = NULL, ...) { if(!is.null(map)) { x <- .map_POS_tags_TaggedTextDocument(x, map) } lapply(x$content, function(e) split(Tagged_Token(e$word, e$POS), e$sent)) } .map_POS_tags_TaggedTextDocument <- function(x, map) { map <- POS_tag_mapper(map, meta(x, "POS_tagset")) x$content <- lapply(x$content, function(e) { e$POS <- map(e$POS) e }) x } NLP/MD50000644000175100001440000000510413741602721011234 0ustar hornikusersd7901002366c4c7b470a09e97b017da8 *DESCRIPTION f4975301de9a037968fc39707341f9e6 *NAMESPACE 68230f5b697cb282615c5f24544fb720 *R/annotate.R b32662f311cc8a7e9713cc7df81c2a99 *R/annotation.R 0ff4e30b9fb2b2c698fd99c16ad5f842 *R/annotators.R 06a8f0e3b35d1087146c0070ca6c4f3c *R/aptd.R cb5569e546faa23d9b4d487d8ca3bc73 *R/cleannlp.R 4d137191ce73c09e783ce10354878782 *R/conll.R ea6bedd7b78dde569080a2b402f29e31 *R/conllu.R 628ed6e68ce4cd8143e831b89bdfcb33 *R/datetime.R 8719adb9925c8f93aab0b0d86c3dcfe6 *R/generics.R 440039df75036e35ffc94f131f043e26 *R/language.R 8379bdef18713b0ecb99b8ac396450a0 *R/ngram.R c6495d9526f33fda1de5b5e3d7279561 *R/spacyr.R 58bcbd30d5d9a8e395a4fda8d478a767 *R/span.R 35d690b3ed8f2afe80711a6d651678a2 *R/string.R fe16b1d62100b2f4ecc48f18f687c84c *R/sysdata.rda 5de59c9b5892f7d8d02528145c59204a *R/tnt.R 765811bd3df5707c9cb346bcc90e03a5 *R/tokenize.R ff6b88b533ee9947d9678898b585b4e1 *R/tree.R 4b2706888e9753775a53ff49a970e36f *R/ttd.R b311a3b5d8975bf6668c2d291285e85c *R/udpipe.R 
6af7bebbf00ecd79f85c99187bc7cefa *R/utils.R b21c0cbc0cc217d035071f33dfeac9da *R/viewers.R c1f351e3ab36cc882b6ae7626f423533 *R/wordlist.R c5b17a0d86d6a55cd79435ecdcb2250b *inst/po/en@quot/LC_MESSAGES/R-NLP.mo 708dff54d1cbd38c26083b534ceec207 *inst/texts/stanford.rds c99abe064b5f0da0ef4966740aa13431 *man/AnnotatedPlainTextDocument.Rd 53f4422eefea39e1d9d23639393ccae6 *man/Annotation.Rd 6e4f79c7f155ed43628104851db52cb5 *man/Annotator.Rd 20e8c3b19582494e65c07967bbf76c9f *man/CoNLLTextDocument.Rd 68945a2d9a572029430ddd6178b46492 *man/CoNLLUTextDocument.Rd f153bffba3e2259288df2f51564e91c1 *man/Span.Rd 97a707a3526f6a0f2166a2c5f56bf6de *man/String.Rd dce373c1a7cafab98f068ac52f350249 *man/TaggedTextDocument.Rd 82a5eb65b8fff6ecb1bda2818873dec5 *man/Tagged_Token.Rd bd0514ec3a9f5f18a422c7e5e7113a05 *man/TextDocument.Rd 424e17206654615bf8686fe16939f363 *man/Tokenizer.Rd cf3d71312c15619fb8bcfa90213c8d26 *man/Tree.Rd 87f52a9560fccb3f7d8591d2f90939b0 *man/WordListDocument.Rd 029f8becf569972e5809e7e5917f7314 *man/annotate.Rd cd795f9471a96beac4c836b1d6ea7d19 *man/annotations_in_spans.Rd 4f1067404a36f11ac97cbb38f9e4acde *man/annotators.Rd 5c3f85fbed985be1eba2a91efb1ac76e *man/datetime.Rd 95ba0807805196d2a5873fedd179ba67 *man/features.Rd b95f45ebfcda3d640a3045daf4bff61a *man/generics.Rd e81356f1c15ef0735cb52a6df4a227a4 *man/language.Rd 1c83d8d4b6d39bd74ce34c8c5f7f3516 *man/ngrams.Rd c8311c7ad8e75b1337c09cccb75ee4af *man/tagsets.Rd cde03cad007456c9cbc598543e16e12b *man/tokenizers.Rd 02e1fc6afdcc13025e034272636e061d *man/utils.Rd 8e60531d555aca515ee2c291ad588c94 *man/viewers.Rd 4d1dfef1a602b0b2cd87100a04c1da7e *po/R-NLP.pot NLP/inst/0000755000175100001440000000000013143661406011702 5ustar hornikusersNLP/inst/texts/0000755000175100001440000000000012252103770013044 5ustar hornikusersNLP/inst/texts/stanford.rds0000644000175100001440000000204113336510534015377 0ustar hornikusersYo0OʀB AUb)b  mZe J(v/@9vM++S/ܞaiXuxZP>kUc/C+'!'0ĵ=j?C=#d#!AŽ!aT͌PUУC(}W@΂{Ioc \7@A3 j95v-~OfY B1QX݂ˋ#CE6&HK=m 
Nfll*$ƗAd߻ [#J>a+Ϫq{{{oo@9֜TS5A:~x'.T]?P@"6S)%plG3ZS uQ Xz F<^:bI Xdp16U%L9焄E)Jek%N IYܤi%.~eL5}iG  Pɕ}pI y44VoAhNf>˨LVAe6݂& 5KLh~S[Y<ѓ xogwT-5|;Љ̭1u}ż%Kɤ e( Q=&fLI L2;">o*(G7cgCI% J4(iP࿧oMet;/?GfِqBIn֖(\'2R ZUuD:8(hV 2u䳡؉!nr歗g[pξ@CRb;2V>Ye'=‰=yt@>6(wa^jN~?!NLP/inst/po/0000755000175100001440000000000013143661406012320 5ustar hornikusersNLP/inst/po/en@quot/0000755000175100001440000000000013143661406013733 5ustar hornikusersNLP/inst/po/en@quot/LC_MESSAGES/0000755000175100001440000000000013143661406015520 5ustar hornikusersNLP/inst/po/en@quot/LC_MESSAGES/R-NLP.mo0000644000175100001440000000535713143661406016717 0ustar hornikusers%@#Ae)~!*'1+Y32./4Hd9#K  We#z/7'&N1k%*'*+R3~2. /- L] = # K X f #{  /     '%s' not defined for "Span" objects'subset' must be logicalAnnotators must have formals 's' and 'a'.Cannot determine position of '%s'Invalid entries:Invalid language tag(s):Invalid operands.Invalid result from underlying POS tagger.Invalid result from underlying chunker.Invalid result from underlying name finder.Invalid result from underlying paragraph tokenizer.Invalid result from underlying sentence tokenizer.Invalid result from underlying word tokenizer.Need a non-empty string.all pipeline elements must be annotator objectsargument 'annotations' must give a positive number of Annotation objectsargument 'x' must be an AnnotatedPlainTextDocument objectarguments must have the same lengthcontent modification is not possible for AnnotatedPlainTextDocument objectsend of stringinvalid element nameno sentence token annotations foundno word token annotations foundreplacement must have the same length as objectProject-Id-Version: NLP 0.1-10.1 POT-Creation-Date: 2017-08-12 22:20 PO-Revision-Date: 2017-08-12 22:20 Last-Translator: Automatically generated Language-Team: none MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Language: en Plural-Forms: nplurals=2; plural=(n != 1); ‘%s’ not defined for "Span" objects‘subset’ must be 
logicalAnnotators must have formals ‘s’ and ‘a’.Cannot determine position of ‘%s’Invalid entries:Invalid language tag(s):Invalid operands.Invalid result from underlying POS tagger.Invalid result from underlying chunker.Invalid result from underlying name finder.Invalid result from underlying paragraph tokenizer.Invalid result from underlying sentence tokenizer.Invalid result from underlying word tokenizer.Need a non-empty string.all pipeline elements must be annotator objectsargument ‘annotations’ must give a positive number of Annotation objectsargument ‘x’ must be an AnnotatedPlainTextDocument objectarguments must have the same lengthcontent modification is not possible for AnnotatedPlainTextDocument objectsend of stringinvalid element nameno sentence token annotations foundno word token annotations foundreplacement must have the same length as objectNLP/po/0000755000175100001440000000000013143661406011343 5ustar hornikusersNLP/po/R-NLP.pot0000644000175100001440000000330313143661406012716 0ustar hornikusersmsgid "" msgstr "" "Project-Id-Version: NLP 0.1-10.1\n" "POT-Creation-Date: 2017-08-12 22:20\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=CHARSET\n" "Content-Transfer-Encoding: 8bit\n" msgid "arguments must have the same length" msgstr "" msgid "invalid element name" msgstr "" msgid "replacement must have the same length as object" msgstr "" msgid "'subset' must be logical" msgstr "" msgid "Annotators must have formals 's' and 'a'." msgstr "" msgid "Invalid result from underlying paragraph tokenizer." msgstr "" msgid "Invalid result from underlying sentence tokenizer." msgstr "" msgid "no sentence token annotations found" msgstr "" msgid "Invalid result from underlying word tokenizer." msgstr "" msgid "no word token annotations found" msgstr "" msgid "Invalid result from underlying POS tagger." 
msgstr "" msgid "Invalid result from underlying name finder." msgstr "" msgid "Invalid result from underlying chunker." msgstr "" msgid "all pipeline elements must be annotator objects" msgstr "" msgid "argument 'annotations' must give a positive number of Annotation objects" msgstr "" msgid "content modification is not possible for AnnotatedPlainTextDocument objects" msgstr "" msgid "argument 'x' must be an AnnotatedPlainTextDocument object" msgstr "" msgid "Cannot determine position of '%s'" msgstr "" msgid "Invalid entries:" msgstr "" msgid "Invalid language tag(s):" msgstr "" msgid "Invalid operands." msgstr "" msgid "'%s' not defined for \"Span\" objects" msgstr "" msgid "Need a non-empty string." msgstr "" msgid "end of string" msgstr ""