tm/tests/testthat/test-TermDocumentMatrix.R
context("Term-document matrices")
test_that("construction works", {
vs <- VectorSource(c("one two two three three three",
"This is a short text with a few words"))
scorpus <- Corpus(vs)
vcorpus <- VCorpus(vs)
ms <- TermDocumentMatrix(scorpus)
mv <- TermDocumentMatrix(vcorpus)
terms <- c("few", "one", "short", "text", "this",
"three", "two", "with", "words")
docs <- c("1", "2")
expect_equal(sort(Terms(ms)), terms)
expect_equal(sort(Terms(mv)), terms)
expect_equal(Docs(ms), docs)
expect_equal(Docs(mv), docs)
m <- matrix(c(0, 1, 0, 0, 0, 3, 2, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1), ncol = 2,
dimnames = list("Terms" = terms, "Docs" = docs))
expect_equal(as.matrix(ms[order(Terms(ms)), ]), m)
expect_equal(as.matrix(mv), m)
})
test_that("construction with control arguments works", {
vs <- VectorSource("one two two three three three")
scorpus <- Corpus(vs)
vcorpus <- VCorpus(vs)
docs <- "1"
ctrl <- list(dictionary = c("three", "two", "zero"))
ms <- TermDocumentMatrix(scorpus, ctrl)
mv <- TermDocumentMatrix(vcorpus, ctrl)
m <- matrix(c(3, 2, 0),
dimnames = list("Terms" = ctrl$dictionary, "Docs" = docs))
expect_equal(as.matrix(ms[order(Terms(ms)), ]), m)
expect_equal(as.matrix(mv), m)
})
test_that("zero matrix works", {
vs <- VectorSource("one two three")
scorpus <- Corpus(vs)
vcorpus <- VCorpus(vs)
ctrl <- list(dictionary = "four", wordLengths = c(1, Inf))
ms <- TermDocumentMatrix(scorpus, ctrl)
mv <- TermDocumentMatrix(vcorpus, ctrl)
m <- matrix(0, dimnames = list("Terms" = ctrl$dictionary, "Docs" = "1"))
expect_equal(as.matrix(ms), m)
expect_equal(as.matrix(mv), m)
})
test_that("empty matrix works", {
docs <- "1"
ds <- DataframeSource(data.frame(doc_id = docs, text = NA))
scorpus <- Corpus(ds)
vcorpus <- VCorpus(ds)
ms <- TermDocumentMatrix(scorpus)
mv <- TermDocumentMatrix(vcorpus)
m <- matrix(numeric(), dimnames = list("Terms" = character(), "Docs" = docs))
expect_equal(as.matrix(ms), m)
expect_equal(as.matrix(mv), m)
})
tm/tests/testthat/test-Transformation.R
context("Transformations")
test_that("removePunctuation works in latin1 locale", {
if (nzchar(suppressWarnings(Sys.setlocale("LC_CTYPE", "en_US.iso88591")))) {
id <- c(73L, 108L, 32L, 115L, 39L, 101L, 120L, 112L, 114L, 105L, 109L, 97L,
105L, 116L, 32L, 101L, 110L, 32L, 117L, 110L, 32L, 108L, 97L, 110L,
103L, 97L, 103L, 101L, 32L, 99L, 104L, 226L, 116L, 105L, 233L)
iu <- intToUtf8(id)
il <- iconv(iu, from = "UTF-8", to = "latin1")
td <- id[-5L]
tu <- intToUtf8(td)
tl <- iconv(tu, from = "UTF-8", to = "latin1")
expect_equal(removePunctuation(iu), tu)
expect_equal(removePunctuation(il), tl)
} else
skip("latin1 locale not available")
})
tm/tests/testthat/test-Tokenizer.R
context("Tokenizers")
test_that("scan_tokenizer works with character vectors", {
tokens <-
c("a", "character", "vector", "consisting", "of", "multiple", "elements")
expect_equal(scan_tokenizer(c(paste0(tokens[1:3], collapse = " "),
paste0(tokens[4:5], collapse = " "),
paste0(tokens[6:7], collapse = " "))), tokens)
})
tm/tests/testthat/test-Source.R
context("Sources")
test_that("DataframeSource works", {
txt <- c("First document.", "Second document.")
dm1 <- 1:2
dm2 <- letters[1:2]
df <- data.frame(doc_id = c("doc_1", "doc_2"), text = txt,
dmeta1 = dm1, dmeta2 = dm2, stringsAsFactors = FALSE)
ds <- DataframeSource(df)
scorpus <- Corpus(ds)
vcorpus <- VCorpus(ds)
expect_equal(as.character(scorpus[[2]]), as.character(vcorpus[[2]]))
expect_equal(as.character(scorpus[[2]]), txt[2])
expect_equal(meta(scorpus), meta(vcorpus))
expect_equal(meta(scorpus),
data.frame(dmeta1 = dm1, dmeta2 = dm2, stringsAsFactors = FALSE))
})
tm/tests/testthat.R
library(testthat)
library(tm)
test_check("tm")
tm/MD5
f471275864f1b29f750384e5665b0562 *DESCRIPTION
9eb5de67c11a7a2ace73df31bec560cc *NAMESPACE
c587e5c09daeb47a355ec8510eb75341 *R/RcppExports.R
881f00e795e17803432949ff05facc96 *R/complete.R
dffd17856a1f0d1ad45fa6e9cc8deaa1 *R/corpus.R
c1ac8a79992c42d3ec695df39b8c3bc9 *R/doc.R
beba1a821bfdf61ece1708123ab71324 *R/filter.R
b205235d27368949ee5ea0dd3a10b9d7 *R/foreign.R
cb5367e831c1be819b9773304985724a *R/hpc.R
32b666ea3b78f2b188cb56c7f3e26790 *R/matrix.R
c36f8ed69c326c2b027a670d2662e1d1 *R/meta.R
07d1407f6cfdbdbb6060ebfb11f97f6f *R/pdftools.R
b9cd19804a89de8eca51394726256e68 *R/plot.R
fd701389b291a843584167ab7385c453 *R/reader.R
5f6ff8b218e7679919b85230b11cdebb *R/score.R
658b904bc1ec319e536ca3844568dabd *R/source.R
dee7e0a8b245fd670436a019c54d904c *R/stopwords.R
e57141f4a63f3dc13b0ef97c6960a41b *R/tokenizer.R
22ebb540c91c1a2d1494967c6c7395a5 *R/transform.R
1c59b79f99cdeb623f387ea378d0331c *R/utils.R
c1de3acc3bc1bc9f64926b93c3be8301 *R/weight.R
76161b65639451c966be75488458b3c3 *build/partial.rdb
5308c469d0f1febee530edbe9c931e3e *build/vignette.rds
687f47ce82c62c4d9dbf26007cb38f0c *data/acq.rda
4127a983c6a0778646eb825d80a01f57 *data/crude.rda
1710cf3dc724c13df75da9f29169d59d *inst/CITATION
68aabe6ff1f89e59ed966800de2ae050 *inst/NEWS.Rd
ad6a6fe44b80541732690af3f36a4c32 *inst/doc/extensions.R
d194109d976d7f242e64a8eab85026f8 *inst/doc/extensions.Rnw
02783e40ac830733bcf24bb5ddfbbdaf *inst/doc/extensions.pdf
fa0d35568c6b1bf9e923168b8118b7d5 *inst/doc/tm.R
788171d86f35f92d96b7320bc5dcad60 *inst/doc/tm.Rnw
aaa1296c199cbfaa36038783aaff0e64 *inst/doc/tm.pdf
98f3b5f3d1f670032af4131a627c18d7 *inst/ghostscript/pdf_info.ps
7ec7b5de9c642afedf1159021c89f12a *inst/stopwords/SMART.dat
4c8fb2c1404c10540c267425fcc005f0 *inst/stopwords/catalan.dat
4e8d44fa90d87908846a2d92c2618b31 *inst/stopwords/danish.dat
a638b876d5cbec644685d12d452a7407 *inst/stopwords/dutch.dat
e181651a30ec45694b7fafc787f357dc *inst/stopwords/english.dat
1094269bf20052a5259983e23c69a552 *inst/stopwords/finnish.dat
29772f7c7dacf306981ad50c5484c4ad *inst/stopwords/french.dat
4a562db64979f200804127c3751a6efa *inst/stopwords/german.dat
1e1f45e67297e049bb22527d7efa8025 *inst/stopwords/hungarian.dat
7dfee49b4660f65f7bb935bef0c773bd *inst/stopwords/italian.dat
4cd3ddc90492cc5a3cbb9f0292d3844d *inst/stopwords/norwegian.dat
d3483742365aa7d477512fd1810452c5 *inst/stopwords/portuguese.dat
f6a262767ae1863b9e8cc92f78e3bb01 *inst/stopwords/romanian.dat
4bf4046fe7701b4940b8eb2c86f19c08 *inst/stopwords/russian.dat
fddb7f14207d2649597b36e22b5eab18 *inst/stopwords/spanish.dat
d3930c86664d4112ae772285dca85fd6 *inst/stopwords/swedish.dat
4dc7bdaa3323e71845cf4c018e871048 *inst/texts/acq/reut-00001.xml
a63b803ca46191dc3a30eda875d95136 *inst/texts/acq/reut-00002.xml
7638d681bcb7d2f3539b8be8a454dff9 *inst/texts/acq/reut-00003.xml
f822ea4bdb0691950284856b51c87e41 *inst/texts/acq/reut-00004.xml
1f8f1f8699bb3883748fa29807477a55 *inst/texts/acq/reut-00005.xml
f44aa9f0b51556f382cf8a91d7f36244 *inst/texts/acq/reut-00006.xml
e0d5ea56a8f42146f5b7d3735da730dc *inst/texts/acq/reut-00007.xml
b7560c91c1f18e919d7548d9d1b59843 *inst/texts/acq/reut-00008.xml
6b2913f0f666d7f84dd38ac05b326726 *inst/texts/acq/reut-00009.xml
5625c064bfff14db909a25a6719dc3f8 *inst/texts/acq/reut-00010.xml
047f38558920a11ebaeab94727465e58 *inst/texts/acq/reut-00011.xml
eb26151fa8a7fcd2c87065b0ad8f0924 *inst/texts/acq/reut-00012.xml
abdbeb14424b6f5994674e604a0a5590 *inst/texts/acq/reut-00013.xml
05b945b892bbb8d575c6ff6193bb17b8 *inst/texts/acq/reut-00014.xml
e5159c22413cae49c015a631df3a74e2 *inst/texts/acq/reut-00015.xml
cd87fc59bfcbe37c847bd1548537effa *inst/texts/acq/reut-00016.xml
75ec08b1337a6035d553f8344ece2c2a *inst/texts/acq/reut-00017.xml
908e51c4b6f9f4e65805adef7029c884 *inst/texts/acq/reut-00018.xml
e67944c5bb9ef8e0fe811b1ead21199b *inst/texts/acq/reut-00020.xml
1d19206cd4478bfc03bc9335316f6816 *inst/texts/acq/reut-00021.xml
621a7e8ba27aac9b8040adc7fc1d11f9 *inst/texts/acq/reut-00022.xml
736bff1fabc3f07b35cd992e8630ed90 *inst/texts/acq/reut-00023.xml
da2ddc7ac585134cb7fe80e812d3ac80 *inst/texts/acq/reut-00024.xml
a04162294ae6ae69f3d1a74f0ad0b9b1 *inst/texts/acq/reut-00025.xml
5e757cb13baa266c292da3ff010f1434 *inst/texts/acq/reut-00026.xml
7974dd802d4ca66b7f7f51c355c8e558 *inst/texts/acq/reut-00027.xml
62368bea00c9a71f01293060708fc6a4 *inst/texts/acq/reut-00028.xml
7e06015b7518b608148002364989c4f7 *inst/texts/acq/reut-00029.xml
f24469e27c9f16266db0e141892e97d1 *inst/texts/acq/reut-00030.xml
acc36dbfdffe0362d39975db07569b85 *inst/texts/acq/reut-00031.xml
7e342636219116a2d428e2188b1dcb0b *inst/texts/acq/reut-00032.xml
c40ce905c6896410a672bee72f132b46 *inst/texts/acq/reut-00034.xml
ead5a03af44fb5cf4e896f039a122e4b *inst/texts/acq/reut-00035.xml
684ddc28a9bb0fbb6f49fa412b54231d *inst/texts/acq/reut-00036.xml
1be33a6347aa406b843132da98286506 *inst/texts/acq/reut-00039.xml
1bdf38586ab43a0f6996d3135ff1f48c *inst/texts/acq/reut-00040.xml
b89e5d9aeba1b0e02cf3bf3fa729e346 *inst/texts/acq/reut-00042.xml
7c3703135baad41765ad1f58fcab0ba5 *inst/texts/acq/reut-00043.xml
d5ab6f6dfe5fefb25422b258bcd339d0 *inst/texts/acq/reut-00045.xml
1af51ea6ba1898d33a84b680c1fa4d09 *inst/texts/acq/reut-00046.xml
cb00fc7833f2eb9e3ac97c12d900dd4f *inst/texts/acq/reut-00047.xml
e5b440d419fa528d4c996cd47e88c0b4 *inst/texts/acq/reut-00048.xml
4ed77929b16a0c6f3264272183b6c951 *inst/texts/acq/reut-00049.xml
7f6df11fcb6617c253921861e217c3c6 *inst/texts/acq/reut-00050.xml
ba0a88d8b9caaa0d0fa8bba01bf2a9d9 *inst/texts/acq/reut-00051.xml
c8b4ee7875ddba1c1d2886c3e32a7cb6 *inst/texts/acq/reut-00052.xml
b0e4f9f398ba4e2ab847e1dc44c2594e *inst/texts/acq/reut-00053.xml
ea25a8bf959fe2769e578474d5f0176f *inst/texts/acq/reut-00054.xml
574a5170c695ad0bbc91055ef8fdd2e9 *inst/texts/acq/reut-00055.xml
66cf87f5587906604d96c3f64ab77a9b *inst/texts/acq/reut-00056.xml
e1c26b346a6683c393b2f420593b02e5 *inst/texts/crude/reut-00001.xml
401049764894ad7b37be02cee2e926f6 *inst/texts/crude/reut-00002.xml
15a57b39a4172799d7926c440548b1fd *inst/texts/crude/reut-00004.xml
95474b7494ce4835ed952374601f921e *inst/texts/crude/reut-00005.xml
e91c3ec329c1f82fc27ea79d33650d32 *inst/texts/crude/reut-00006.xml
5344713574482c3d393766422bd72498 *inst/texts/crude/reut-00007.xml
5803359fee327a77342d4d16bc467271 *inst/texts/crude/reut-00008.xml
c0f88331bbf3da5ec273838ac832e7fa *inst/texts/crude/reut-00009.xml
ed3994f50fa16217a6c62dfae5909a03 *inst/texts/crude/reut-00010.xml
c74f1b54db67c730bcc117536903dc52 *inst/texts/crude/reut-00011.xml
32cf0da1d923fd2aee4fe28200047c3b *inst/texts/crude/reut-00012.xml
42f6d47f40304ddc482e62bf1d1c3c21 *inst/texts/crude/reut-00013.xml
51565e0b464e626cf1db1d812642e295 *inst/texts/crude/reut-00014.xml
8b107465269cd463e8d7deb470423dda *inst/texts/crude/reut-00015.xml
6b69f531b6953be522a58b0456820e04 *inst/texts/crude/reut-00016.xml
5deaf389a9067a5b6090c13195c0d254 *inst/texts/crude/reut-00018.xml
9e745c906a03765fb0b364ae78bbdcd5 *inst/texts/crude/reut-00019.xml
488f96e28466feeac3175f57724a1f8e *inst/texts/crude/reut-00021.xml
da9f871a845a256e2c12ace2a2e2fb36 *inst/texts/crude/reut-00022.xml
2439e7823a1ff6403efd3108fa5ecc45 *inst/texts/crude/reut-00023.xml
7d9482d1fc4a624492dacf584a940b4c *inst/texts/custom.xml
717801d47bc20af5d69340eee342ce21 *inst/texts/loremipsum.txt
e76c36aad136268277f2c036dc1c37cd *inst/texts/rcv1_2330.xml
eda82aaa0c873d62be4905cb32dedb05 *inst/texts/reuters-21578.xml
5901120140c757daf5f21fba990e2bbe *inst/texts/txt/ovid_1.txt
2b5dc16305207ed29df7bbe0cc47abee *inst/texts/txt/ovid_2.txt
08197bca339b621d395220bd7ab719a7 *inst/texts/txt/ovid_3.txt
832ea34c305426cc653701df40750edf *inst/texts/txt/ovid_4.txt
3b3cb14d62de578684d6c59fa6dcba60 *inst/texts/txt/ovid_5.txt
d44474e05cd96e80932106e24ed572a1 *man/Corpus.Rd
6339b0d2bae8c6d1e3a383bdea82d425 *man/DataframeSource.Rd
1c104e63fd71cd63ad6e0da3669fbdf5 *man/DirSource.Rd
5871b5f9883ba4359e269bbfca27db37 *man/Docs.Rd
00fa0c14e4086a140646ad23597ca5eb *man/PCorpus.Rd
8a778ebd67c6b9c7af89a2654e665bf6 *man/PlainTextDocument.Rd
f1c465f51d627af46612833ffcc17f59 *man/Reader.Rd
b4d2dcdc0c2b16f38561637956a7a328 *man/SimpleCorpus.Rd
24c30f62fdf0d4a0219147e947477920 *man/Source.Rd
421f5a82e8adcceeb733dbe31455b8d7 *man/TextDocument.Rd
c82a889b500268683904a4ad7fc9d3b1 *man/URISource.Rd
7c84cd5a42cdac47a1b0301e2b6459a6 *man/VCorpus.Rd
3fb4034c6df0b6277f07a028a958b932 *man/VectorSource.Rd
5a32dfd6e72da8d3c8569803d6761126 *man/WeightFunction.Rd
0b79ee972dac094d6f0ed9c1f4d2685f *man/XMLSource.Rd
0a982a855094b02e983d7c7bf5e60c2b *man/XMLTextDocument.Rd
2d25fcd9863b4ac7128c1d2a521e27f2 *man/ZipSource.Rd
ca38d43ef3a58075443e49cd244bd1ea *man/Zipf_n_Heaps.Rd
f171dc99dd06472ef193979db9f16779 *man/acq.Rd
aa36762f11d31e840ba6115b9b913341 *man/combine.Rd
72567bcd2d6725219799bb32ccdf8ffa *man/content_transformer.Rd
a6f9579086069dbfd44d6931c8f2b66a *man/crude.Rd
f30ebc7d2c9ad750ef0e6037d1669827 *man/findAssocs.Rd
74d7ea8ee4c4ac46492bbc3b52a10dca *man/findFreqTerms.Rd
36e135250b446bbd0e677115bcf1a82a *man/findMostFreqTerms.Rd
b40ce87658be7e505312812d97907627 *man/foreign.Rd
be785d88b0821a06be0b4772868dc37c *man/getTokenizers.Rd
9ad9e3d7afb9815f04529a435f430a53 *man/getTransformations.Rd
571b3a0a81bfffe0e6ebfc34289fb2de *man/hpc.Rd
6a72cef1df5795bb189bd1a0177e5d4d *man/inspect.Rd
ca0d40d80911df57f1e4d825ca645044 *man/matrix.Rd
33870f4b1f105daa8307e58f3ec61fa2 *man/meta.Rd
a67ad1293bc40179efe281cf1faeeb63 *man/plot.Rd
7de11cf5180caee710b5fda07b211eb8 *man/readDOC.Rd
13b3964279323a7d94ccab25ca7afaef *man/readDataframe.Rd
56f162b724f8a1ffd21bd47633bbd068 *man/readPDF.Rd
d625f0434c021f98e4529ce1427703cf *man/readPlain.Rd
b49b3852a0344d682e6bb4f6b30aa6d5 *man/readRCV1.Rd
78b1b12b618650c11e6e3d787f79f807 *man/readReut21578XML.Rd
ec13c14161ee1c95f89ce75237aa3df7 *man/readTagged.Rd
ce6a6feb64dd79693b7ceba7bdb4c6a0 *man/readXML.Rd
295b85ec0a37c83bc105f97ca48dfc9a *man/removeNumbers.Rd
f8e578de76e389cf55176fb546743468 *man/removePunctuation.Rd
ef0d87508b367cdd71f066244605407e *man/removeSparseTerms.Rd
2484a54292458f80e26f2956fc5d7501 *man/removeWords.Rd
5bdcaccf0076e98a2341078e61c59be5 *man/stemCompletion.Rd
6a9b411d93cf0276218bca553f1e37e3 *man/stemDocument.Rd
e787cae6198de27c6c3d457c41f3af95 *man/stopwords.Rd
15b8549fd381105839451d9b15c7efa3 *man/stripWhitespace.Rd
00e5d6599fb3815ce62ef1f5bf6aa744 *man/termFreq.Rd
1dd2e47bdc3ac7481366dc0d359ef94a *man/tm_filter.Rd
0a2583e28333847146cc5168593b140c *man/tm_map.Rd
6eb083c9b6f1b08700065fd58bf1f8be *man/tm_reduce.Rd
458b061071b9b320951c3b48adf16264 *man/tm_term_score.Rd
ca827420b159ae91646b637bb48f75f7 *man/tokenizer.Rd
47bc8704437b53709120add15f205be0 *man/weightBin.Rd
abe06433d8438326d1e03c8367312a59 *man/weightSMART.Rd
4e7d2dd30d4de494ba122cd3aff128ee *man/weightTf.Rd
88fbb7eda2e788887e1fe67cb7fd0855 *man/weightTfIdf.Rd
193b23f2d16e20a4944846725eebd155 *man/writeCorpus.Rd
813f07011de972121885f35821e6426b *src/RcppExports.cpp
1b7544de4c9e45507e82e6e5033819fa *src/copy.c
45b4524bfac392e34ba96c2609c77f7c *src/init.c
706a1d7e181fc2acd829a541bc769478 *src/remove.c
d6c1687f7b18dd1c068ad609f03a888b *src/scan.c
fe9ef490894f8d93571ffb091669c7dd *src/tdm.cpp
b7995b66ea58d9604a6bf61ef68381fb *src/tokenizer.cpp
f280e050264388e7c120d4869357efb7 *tests/testthat.R
7987b16eeb87d6c4e9787b85e5b764a4 *tests/testthat/test-Source.R
ef259599b4562c161bf3e0c4529ebcf5 *tests/testthat/test-TermDocumentMatrix.R
7f1736751d70509612e9a728766fe146 *tests/testthat/test-Tokenizer.R
2003b069d4a811c99d5edf34a42eb2a1 *tests/testthat/test-Transformation.R
d194109d976d7f242e64a8eab85026f8 *vignettes/extensions.Rnw
3641da272a48168ad7b4ffef9fbf7d21 *vignettes/references.bib
788171d86f35f92d96b7320bc5dcad60 *vignettes/tm.Rnw
tm/R/doc.R
c.TextDocument <-
function(..., recursive = FALSE)
{
args <- list(...)
x <- args[[1L]]
if (length(args) == 1L)
return(x)
if (!all(unlist(lapply(args, inherits, class(x)))))
stop("not all arguments are text documents")
v <- list(content = args,
meta = CorpusMeta(),
dmeta = data.frame(row.names = seq_along(args)))
class(v) <- c("VCorpus", "Corpus")
v
}
.format_TextDocument <-
function(x, ...)
c(sprintf("<<%s>>", class(x)[1L]),
sprintf("Metadata: %d", length(meta(x))))
inspect.TextDocument <-
function(x)
{
print(x)
cat("\n")
writeLines(as.character(x))
invisible(x)
}
PlainTextDocument <-
function(x = character(0),
author = character(0),
datetimestamp = as.POSIXlt(Sys.time(), tz = "GMT"),
description = character(0),
heading = character(0),
id = character(0),
language = character(0),
origin = character(0),
...,
meta = NULL,
class = NULL)
{
p <- list(content = as.character(x),
meta = TextDocumentMeta(author, datetimestamp, description,
heading, id, language, origin, ...,
meta = meta))
class(p) <- unique(c(class, "PlainTextDocument", "TextDocument"))
p
}
as.character.PlainTextDocument <-
function(x, ...)
content(x)
content.PlainTextDocument <-
function(x)
x$content
`content<-.PlainTextDocument` <-
function(x, value)
{
x$content <- as.character(value)
x
}
format.PlainTextDocument <-
function(x, ...)
c(.format_TextDocument(x),
sprintf("Content: chars: %d", sum(nchar(x$content))))
meta.PlainTextDocument <-
function(x, tag = NULL, ...)
if (is.null(tag)) x$meta else x$meta[[tag]]
`meta<-.PlainTextDocument` <-
function(x, tag = NULL, ..., value)
{
if (is.null(tag))
x$meta <- value
else
x$meta[[tag]] <- value
x
}
words.character <-
words.PlainTextDocument <-
function(x, ...)
scan_tokenizer(x)
XMLTextDocument <-
function(x = xml_missing(),
author = character(0),
datetimestamp = as.POSIXlt(Sys.time(), tz = "GMT"),
description = character(0),
heading = character(0),
id = character(0),
language = character(0),
origin = character(0),
...,
meta = NULL)
{
d <- list(content = x,
meta = TextDocumentMeta(author, datetimestamp, description,
heading, id, language, origin, ...,
meta = meta))
class(d) <- c("XMLTextDocument", "TextDocument")
d
}
as.character.XMLTextDocument <-
function(x, ...)
xml_text(content(x))
content.XMLTextDocument <-
function(x)
x$content
`content<-.XMLTextDocument` <-
function(x, value)
{
x$content <- value
x
}
format.XMLTextDocument <- .format_TextDocument
meta.XMLTextDocument <- meta.PlainTextDocument
`meta<-.XMLTextDocument` <- `meta<-.PlainTextDocument`
tm/R/plot.R
plot.TermDocumentMatrix <- plot.DocumentTermMatrix <-
function(x,
terms = sample(Terms(x), 20),
corThreshold = 0.7,
weighting = FALSE,
attrs = list(graph = list(rankdir = "BT"),
node = list(shape = "rectangle", fixedsize = FALSE)),
...)
{
if (system.file(package = "Rgraphviz") == "")
stop("Plotting requires package 'Rgraphviz'.")
m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
m <- as.matrix(m[, terms])
c <- cor(m)
c[c < corThreshold] <- 0
c[is.na(c)] <- 0
diag(c) <- 0
p <- Rgraphviz::plot(methods::as(c, "graphNEL"), attrs = attrs, ...)
if (weighting) {
i <- 1
lw <- round(c[lower.tri(c) & c >= corThreshold] * 10)
for (ae in Rgraphviz::AgEdge(p)) {
Rgraphviz::lines(ae, lwd = lw[i], len = 1)
i <- i + 1
}
}
invisible(p)
}
## Plotting functions for Zipf's and Heaps' law contributed by Kurt Hornik
## See http://en.wikipedia.org/wiki/Zipf%27s_law
Zipf_plot <-
function(x, type = "l", ...)
{
if (inherits(x, "TermDocumentMatrix"))
x <- t(x)
y <- log(sort(col_sums(x), decreasing = TRUE))
x <- log(seq_along(y))
m <- lm(y ~ x)
dots <- list(...)
if (is.null(dots$xlab)) dots$xlab <- "log(rank)"
if (is.null(dots$ylab)) dots$ylab <- "log(frequency)"
do.call(plot, c(list(x, y, type = type), dots))
abline(m)
##
## Perhaps this should (invisibly) return the fitted linear model
## instead of just the coefficients?
coef(m)
##
}
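## Illustrative sketch (not part of the package code): Zipf_plot() expects a
## term-document or document-term matrix, e.g. one built from the bundled
## 'crude' corpus:
##   data("crude")
##   Zipf_plot(TermDocumentMatrix(crude))
## The returned coefficients are the intercept and slope of log(frequency)
## fitted against log(rank).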
## http://en.wikipedia.org/wiki/Heaps%27_law
## http://en.wikipedia.org/wiki/Text_corpus
## cum_vocabulary_size <-
## function(m)
## {
## ## Should work in general, but it very slow for large simple triplet
## ## matrices ...
## s <- double(nrow(m))
## v <- double(ncol(m))
## for(i in seq_along(s)) {
## v <- pmax(v, c(m[i, ]))
## s[i] <- sum(v > 0)
## }
## s
## }
cum_vocabulary_size <-
function(m)
{
## Only works for simple triplet matrices.
i <- sapply(split(m$i, m$j), min)
tab <- table(i)
v <- double(nrow(m))
v[as.numeric(names(tab))] <- tab
cumsum(v)
}
Heaps_plot <-
function(x, type = "l", ...)
{
if (inherits(x, "TermDocumentMatrix"))
x <- t(x)
y <- log(cum_vocabulary_size(x))
x <- log(cumsum(row_sums(x)))
m <- lm(y ~ x)
dots <- list(...)
if (is.null(dots$xlab)) dots$xlab <- "log(T)"
if (is.null(dots$ylab)) dots$ylab <- "log(V)"
do.call(plot, c(list(x, y, type = type), dots))
abline(m)
##
## Perhaps this should (invisibly) return the fitted linear model
## instead of just the coefficients?
coef(m)
##
}
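## Illustrative sketch (not part of the package code): Heaps_plot() works
## analogously, fitting log vocabulary size against log text size, e.g.
##   Heaps_plot(DocumentTermMatrix(crude))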
tm/R/source.R
## Author: Ingo Feinerer
## Sources
getSources <-
function()
c("DataframeSource", "DirSource", "URISource", "VectorSource", "XMLSource",
"ZipSource")
SimpleSource <-
function(encoding = "",
length = 0,
position = 0,
reader = readPlain,
...,
class)
{
if (!is.character(encoding))
stop("invalid encoding")
if (!is.numeric(length) || (length < 0))
stop("invalid length entry denoting the number of elements")
if (!is.numeric(position))
stop("invalid position")
if (!is.function(reader))
stop("invalid default reader")
s <- list(encoding = encoding, length = length,
position = position, reader = reader, ...)
class(s) <- unique(c(class, "SimpleSource", "Source"))
s
}
# A data frame where each row is interpreted as a document
DataframeSource <-
function(x)
{
stopifnot(all(!is.na(match(c("doc_id", "text"), names(x)))))
SimpleSource(length = nrow(x), reader = readDataframe,
content = x, class = "DataframeSource")
}
# A directory with files interpreted as documents
DirSource <-
function(directory = ".", encoding = "", pattern = NULL,
recursive = FALSE, ignore.case = FALSE, mode = "text")
{
if (!identical(mode, "text") &&
!identical(mode, "binary") &&
!identical(mode, ""))
stop(sprintf("invalid mode '%s'", mode))
d <- dir(directory, full.names = TRUE, pattern = pattern,
recursive = recursive, ignore.case = ignore.case)
if (!length(d))
stop("empty directory")
isfile <- !file.info(d)[["isdir"]]
if (any(is.na(isfile)))
stop("non-existent or non-readable file(s): ",
paste(d[is.na(isfile)], collapse = " "))
SimpleSource(encoding = encoding, length = sum(isfile),
mode = mode, filelist = d[isfile], class = "DirSource")
}
# Documents identified by a Uniform Resource Identifier
URISource <-
function(x, encoding = "", mode = "text")
{
if (!identical(mode, "text") &&
!identical(mode, "binary") &&
!identical(mode, ""))
stop(sprintf("invalid mode '%s'", mode))
SimpleSource(encoding = encoding, length = length(x), mode = mode, uri = x,
class = "URISource")
}
# A vector where each component is interpreted as a document
VectorSource <-
function(x)
SimpleSource(length = length(x), content = x, class = "VectorSource")
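## Illustrative sketch (not part of the package code): each element of the
## vector becomes one document, e.g.
##   vs <- VectorSource(c("First document.", "Second document."))
##   length(vs)  # 2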
XMLSource <-
function(x, parser = xml_contents, reader)
{
xmldoc <- read_xml(x)
content <- parser(xmldoc)
SimpleSource(length = length(content), reader = reader, content = content,
uri = x, class = "XMLSource")
}
# A ZIP file with its compressed files interpreted as documents
ZipSource <-
function(zipfile, pattern = NULL, recursive = FALSE, ignore.case = FALSE,
mode = "text")
{
if (!identical(mode, "text") &&
!identical(mode, "binary") &&
!identical(mode, ""))
stop(sprintf("invalid mode '%s'", mode))
SimpleSource(exdir = NULL,
files = NULL,
mode = mode,
pattern = pattern,
recursive = recursive,
ignore.case = ignore.case,
zipfile = zipfile,
class = "ZipSource")
}
# tau:::read_all_bytes
read_all_bytes <-
function(con, chunksize = 2 ^ 16)
{
if (is.character(con)) {
return(readBin(con, raw(), file.info(con)$size))
}
if (!isOpen(con)) {
open(con, "rb")
on.exit(close(con))
}
bytes <- list()
repeat {
chunk <- readBin(con, raw(), chunksize)
bytes <- c(bytes, list(chunk))
if (length(chunk) < chunksize) break
}
unlist(bytes)
}
readContent <-
function(x, encoding, mode)
{
if (identical(mode, "text"))
iconv(readLines(x, warn = FALSE), encoding, "UTF-8", "byte")
else if (identical(mode, "binary"))
read_all_bytes(x)
else if (identical(mode, ""))
NULL
else
stop("invalid mode")
}
open.SimpleSource <-
close.SimpleSource <-
function(con, ...)
con
open.ZipSource <-
function(con, ...)
{
x <- con
exdir <- tempfile("ZipSource")
dir.create(exdir, mode = "0700")
destfile <- x$zipfile
if (!file.exists(destfile)) {
destfile <- tempfile()
download.file(x$zipfile, destfile)
on.exit(file.remove(destfile))
}
files <- unzip(destfile, list = TRUE)
## Directories have length 0
files <- files[files$Length > 0, "Name"]
## Idea: Subdirectories contain file separators
if (!x$recursive)
files <- files[!grepl(.Platform$file.sep, files, fixed = TRUE)]
## Idea: pattern and ignore.case refer to the file name (like basename)
## Cf. also ?dir
if (!is.null(x$pattern))
files <- files[grepl(x$pattern, files, ignore.case = x$ignore.case)]
unzip(destfile, files, exdir = exdir)
x$exdir <- exdir
x$files <- files
x$length <- length(files)
x
}
close.ZipSource <-
function(con, ...)
{
x <- con
if (!is.null(x$exdir)) {
unlink(x$exdir, recursive = TRUE)
x$exdir <- NULL
x$files <- NULL
x$length <- 0
}
x
}
eoi <-
function(x)
UseMethod("eoi", x)
eoi.SimpleSource <-
function(x)
x$length <= x$position
getElem <-
function(x)
UseMethod("getElem", x)
getElem.DataframeSource <-
function(x)
list(content = x$content[x$position, ],
uri = NULL)
getElem.DirSource <-
function(x)
{
filename <- x$filelist[x$position]
list(content = readContent(filename, x$encoding, x$mode),
uri = paste0("file://", filename))
}
getElem.URISource <-
function(x)
list(content = readContent(x$uri[x$position], x$encoding, x$mode),
uri = x$uri[x$position])
getElem.VectorSource <-
function(x)
list(content = x$content[x$position],
uri = NULL)
getElem.XMLSource <-
function(x)
list(content = x$content[[x$position]],
uri = x$uri)
getElem.ZipSource <-
function(x)
{
path <- file.path(x$exdir, x$files[x$position])
list(content = readContent(path, x$encoding, x$mode),
uri = paste0("file://", path))
}
getMeta <-
function(x)
UseMethod("getMeta", x)
getMeta.DataframeSource <-
function(x)
list(cmeta = NULL,
dmeta = x$content[, is.na(match(names(x$content),
c("doc_id", "text"))),
drop = FALSE])
length.SimpleSource <-
function(x)
x$length
pGetElem <-
function(x)
UseMethod("pGetElem", x)
pGetElem.DataframeSource <-
function(x)
tm_parLapply(seq_len(x$length),
function(y)
list(content = x$content[y, ],
uri = NULL))
`[.DataframeSource` <- function(x, i, j, ...) x$content[i, j, ...]
`[[.DataframeSource` <- function(x, ...) x$content[[...]]
pGetElem.DirSource <-
function(x)
tm_parLapply(x$filelist,
function(f)
list(content = readContent(f, x$encoding, x$mode),
uri = paste0("file://", f)))
`[.DirSource` <- function(x, i, ...) x$filelist[i, ...]
`[[.DirSource` <- function(x, i, ...) x$filelist[[i, ...]]
pGetElem.URISource <-
function(x)
tm_parLapply(x$uri,
function(uri)
list(content = readContent(uri, x$encoding, x$mode),
uri = uri))
`[.URISource` <- function(x, i, ...) x$uri[i, ...]
`[[.URISource` <- function(x, i, ...) x$uri[[i, ...]]
pGetElem.VectorSource <-
function(x)
tm_parLapply(x$content,
function(y) list(content = y,
uri = NULL))
`[.VectorSource` <- function(x, i, ...) x$content[i, ...]
`[[.VectorSource` <- function(x, i, ...) x$content[[i, ...]]
pGetElem.ZipSource <-
function(x)
tm_parLapply(file.path(x$exdir, x$files),
function(f)
list(content = readContent(f, x$encoding, x$mode),
uri = paste0("file://", f)))
reader <-
function(x)
UseMethod("reader", x)
reader.SimpleSource <-
function(x)
x$reader
stepNext <-
function(x)
UseMethod("stepNext", x)
stepNext.SimpleSource <-
function(x)
{
x$position <- x$position + 1
x
}
tm/R/complete.R
# Author: Ingo Feinerer
stemCompletion <-
function(x, dictionary,
type = c("prevalent", "first", "longest",
"none", "random", "shortest"))
{
if (inherits(dictionary, "Corpus"))
dictionary <- unlist(lapply(dictionary, words))
type <- match.arg(type)
possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s", w),
dictionary,
value = TRUE))
switch(type,
first = {
setNames(sapply(possibleCompletions, "[", 1), x)
},
longest = {
ordering <-
lapply(possibleCompletions,
function(x) order(nchar(x), decreasing = TRUE))
possibleCompletions <-
mapply(function(x, id) x[id], possibleCompletions,
ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
},
none = {
setNames(x, x)
},
prevalent = {
possibleCompletions <-
lapply(possibleCompletions,
function(x) sort(table(x), decreasing = TRUE))
n <- names(sapply(possibleCompletions, "[", 1))
setNames(if (length(n)) n else rep_len(NA, length(x)), x)
},
random = {
setNames(sapply(possibleCompletions, function(x) {
if (length(x)) sample(x, 1) else NA
}), x)
},
shortest = {
ordering <- lapply(possibleCompletions,
function(x) order(nchar(x)))
possibleCompletions <-
mapply(function(x, id) x[id], possibleCompletions,
ordering, SIMPLIFY = FALSE)
setNames(sapply(possibleCompletions, "[", 1), x)
}
)
}
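## Illustrative sketch (not part of the package code): complete stems against a
## dictionary of full words ('dictionary' may also be a corpus, in which case
## its words() are used):
##   stemCompletion(c("complet", "complic"),
##                  dictionary = c("complete", "completion", "complicate"),
##                  type = "prevalent")
## With type = "prevalent" each stem is mapped to its most frequent matching
## completion in the dictionary.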
tm/R/RcppExports.R
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
tdm <- function(strings, remove_puncts, remove_digits, stopwords, dictionary, min_term_freq, max_term_freq, min_word_length, max_word_length) {
.Call(`_tm_tdm`, strings, remove_puncts, remove_digits, stopwords, dictionary, min_term_freq, max_term_freq, min_word_length, max_word_length)
}
Boost_Tokenizer <- function(strings) {
.Call(`_tm_Boost_Tokenizer`, strings)
}
tm/R/transform.R
# Author: Ingo Feinerer
# Transformations
tm_map <-
function(x, FUN, ...)
UseMethod("tm_map", x)
tm_map.VCorpus <-
function(x, FUN, ..., lazy = FALSE)
{
# Lazy mapping
if (lazy) {
fun <- function(x) FUN(x, ...)
if (is.null(x$lazy))
x$lazy <- list(index = rep_len(TRUE, length(x)), maps = list(fun))
else
x$lazy$maps <- c(x$lazy$maps, list(fun))
} else
x$content <- tm_parLapply(content(x), FUN, ...)
x
}
tm_map.SimpleCorpus <-
function(x, FUN, ...)
{
if (inherits(FUN, "content_transformer"))
FUN <- get("FUN", envir = environment(FUN))
n <- names(content(x))
x$content <- FUN(content(x), ...)
if (length(content(x)) != length(n))
warning("transformation drops documents")
else
names(x$content) <- n
x
}
tm_map.PCorpus <-
function(x, FUN, ...)
{
db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
for (i in seq_along(x))
db[[x$content[[i]]]] <- FUN(x[[i]], ...)
filehash::dbReorganize(db)
x
}
# Materialize lazy mappings
materialize <-
function(x, range = seq_along(x))
{
if (!is.null(x$lazy)) {
i <- (seq_along(x) %in% range) & x$lazy$index
if (any(i)) {
x$content[i] <-
tm_parLapply(x$content[i],
function(d) tm_reduce(d, x$lazy$maps))
x$lazy$index[i] <- FALSE
}
# Clean up if everything is materialized
if (!any(x$lazy$index))
x["lazy"] <- list(NULL)
}
x
}
tm_reduce <-
function(x, tmFuns, ...)
Reduce(function(f, ...) f(...), tmFuns, x, right = TRUE)
getTransformations <-
function()
c("removeNumbers", "removePunctuation", "removeWords", "stemDocument",
"stripWhitespace")
content_transformer <-
function(FUN)
{
f <- function(x, ...) {
content(x) <- FUN(content(x), ...)
x
}
class(f) <- c("content_transformer", "function")
f
}
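## Illustrative sketch (not part of the package code): content_transformer()
## lets an ordinary character function be used with tm_map() on text documents,
## e.g. (assuming 'corpus' is an existing corpus):
##   corpus <- tm_map(corpus, content_transformer(tolower))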
removeNumbers <-
function(x, ...)
UseMethod("removeNumbers")
removeNumbers.character <-
function(x, ucp = FALSE, ...)
{
if (ucp)
gsub("\\p{Nd}+", "", x, perl = TRUE)
else
.Call(`_tm_remove_chars`, x, 1L)
}
removeNumbers.PlainTextDocument <-
content_transformer(removeNumbers.character)
removePunctuation <-
function(x, ...)
UseMethod("removePunctuation")
removePunctuation.character <-
function(x,
preserve_intra_word_contractions = FALSE,
preserve_intra_word_dashes = FALSE,
ucp = FALSE,
...)
{
# Assume there are no ASCII 0x01 (SOH) or ASCII 0x02 (STX) characters.
if (preserve_intra_word_contractions)
x <- gsub("(\\w)'(\\w)", "\\1\1\\2", x, perl = TRUE)
if (preserve_intra_word_dashes)
x <- gsub("(\\w)-(\\w)", "\\1\2\\2", x, perl = TRUE)
if (ucp)
x <- gsub("\\p{P}+", "", x, perl = TRUE)
else
x <- .Call(`_tm_remove_chars`, x, 0L)
if (preserve_intra_word_contractions)
x <- gsub("\1", "'", x, fixed = TRUE)
if (preserve_intra_word_dashes)
x <- gsub("\2", "-", x, fixed = TRUE)
x
}
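## Illustrative sketch (not part of the package code):
##   removePunctuation("It's well-known!")
##   ## expected: "Its wellknown"
##   removePunctuation("It's well-known!",
##                     preserve_intra_word_contractions = TRUE,
##                     preserve_intra_word_dashes = TRUE)
##   ## expected: "It's well-known"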
removePunctuation.PlainTextDocument <-
content_transformer(removePunctuation.character)
removeWords <-
function(x, words)
UseMethod("removeWords", x)
# Improvements by Kurt Hornik
removeWords.character <-
function(x, words)
gsub(sprintf("(*UCP)\\b(%s)\\b",
paste(sort(words, decreasing = TRUE), collapse = "|")),
"", x, perl = TRUE)
removeWords.PlainTextDocument <-
content_transformer(removeWords.character)
stemDocument <-
function(x, language = "english")
UseMethod("stemDocument", x)
stemDocument.character <-
function(x, language = "english")
{
s <- unlist(lapply(x, function(line)
paste(SnowballC::wordStem(words(line),
as.character(language)),
collapse = " ")))
if (is.character(s)) s else ""
}
stemDocument.PlainTextDocument <-
function(x, language = meta(x, "language"))
{
language <- as.character(language)
if (identical(language, "") ||
identical(language, character(0)) ||
is.na(language))
language <- "english"
content_transformer(stemDocument.character)(x)
}
stripWhitespace <-
function(x)
UseMethod("stripWhitespace", x)
stripWhitespace.character <-
function(x)
gsub("[[:space:]]+", " ", x)
stripWhitespace.PlainTextDocument <-
content_transformer(stripWhitespace.character)
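## Illustrative sketch (not part of the package code) of a typical cleaning
## pipeline, assuming 'corpus' is an existing VCorpus:
##   corpus <- tm_map(corpus, content_transformer(tolower))
##   corpus <- tm_map(corpus, removePunctuation)
##   corpus <- tm_map(corpus, removeNumbers)
##   corpus <- tm_map(corpus, removeWords, stopwords("english"))
##   corpus <- tm_map(corpus, stripWhitespace)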
tm/R/matrix.R
## Authors: Ingo Feinerer, Kurt Hornik
TermDocumentMatrix_classes <-
c("TermDocumentMatrix", "simple_triplet_matrix")
DocumentTermMatrix_classes <-
c("DocumentTermMatrix", "simple_triplet_matrix")
.TermDocumentMatrix <-
function(x, weighting)
{
x <- as.simple_triplet_matrix(x)
if (!is.null(dimnames(x)))
names(dimnames(x)) <- c("Terms", "Docs")
class(x) <- TermDocumentMatrix_classes
if (is.null(weighting))
weighting <- weightTf
##
## Note that if weighting is a weight function, it already needs to
## know whether we have a term-document or document-term matrix.
##
## Ideally we would require weighting to be a WeightFunction object
## or a character string of length 2. But then
## dtm <- DocumentTermMatrix(crude,
## control = list(weighting =
## function(x)
## weightTfIdf(x, normalize =
## FALSE),
## stopwords = TRUE))
## in example("DocumentTermMatrix") fails [because weightTfIdf() is
## a weight function and not a weight function generator ...]
## Hence, for now, instead of
## if (inherits(weighting, "WeightFunction"))
## x <- weighting(x)
## use
if (is.function(weighting))
x <- weighting(x)
## and hope for the best ...
##
else if (is.character(weighting) && (length(weighting) == 2L))
attr(x, "weighting") <- weighting
x
}
.SimpleTripletMatrix <-
function(i, j, v, terms, corpus)
{
docs <- as.character(meta(corpus, "id", "local"))
if (length(docs) != length(corpus)) {
warning("invalid document identifiers")
docs <- NULL
}
simple_triplet_matrix(i, j, v,
nrow = length(terms),
ncol = length(corpus),
dimnames = list(Terms = terms, Docs = docs))
}
filter_global_bounds <-
function(m, bounds)
{
m <- as.simple_triplet_matrix(m)
if (length(bounds) == 2L && is.numeric(bounds)) {
rs <- row_sums(m > 0)
m <- m[(rs >= bounds[1]) & (rs <= bounds[2]), ]
}
m
}
TermDocumentMatrix <-
function(x, control = list())
UseMethod("TermDocumentMatrix", x)
TermDocumentMatrix.SimpleCorpus <-
function(x, control = list())
{
stopifnot(is.list(control))
if (any(unlist(lapply(control, is.function))))
warning("custom functions are ignored")
if (!is.null(control$tokenize) && !identical(control$tokenize, "Boost"))
warning("custom tokenizer is ignored")
txt <- content(x)
## Conversion to lower case
if (is.null(control$tolower) || isTRUE(control$tolower))
txt <- tolower(txt)
## Stopword filtering
.stopwords <- if (isTRUE(control$stopwords)) stopwords(meta(x, "language"))
else if (is.character(control$stopwords)) control$stopwords
else character(0)
.dictionary <- if (is.null(control$dictionary)) character(0)
else control$dictionary
## Ensure local bounds
bl <- control$bounds$local
min_term_freq <-
if (length(bl) == 2L && is.numeric(bl) && bl[1] >= 0) bl[1] else 0L
max_term_freq <-
if (length(bl) == 2L && is.numeric(bl) && bl[2] >= 0)
min(bl[2], .Machine$integer.max) else .Machine$integer.max
## Filter out too short or too long terms
wl <- control$wordLengths
min_word_length <- if (is.numeric(wl[1]) && wl[1] >= 0) wl[1] else 3L
max_word_length <- if (is.numeric(wl[2]) && wl[2] >= 0)
min(wl[2], .Machine$integer.max) else .Machine$integer.max
m <- tdm(txt,
isTRUE(control$removePunctuation),
isTRUE(control$removeNumbers),
.stopwords, .dictionary,
as.integer(min_term_freq), as.integer(max_term_freq),
as.integer(min_word_length), as.integer(max_word_length))
Encoding(m$terms) <- "UTF-8"
m <- .SimpleTripletMatrix(m$i, m$j, m$v, m$terms, x)
## Stemming
##
## Ideally tdm() could perform stemming as well but there is no easy way to
## access the SnowballC::wordStem() function from C++ (via Rcpp) without
## significant overhead (as SnowballC does not export its internal C
## functions).
##
## Stemming afterwards is still quite performant as we already have
## all terms. However, there is some overhead involved as we need
## to recheck local bounds and word lengths.
##
if (isTRUE(control$stemming)) {
stems <- SnowballC::wordStem(m$dimnames$Terms,
meta(x, "language"))
## Do as.factor(stems) "by hand" for performance reasons.
uniqs <- sort(unique(stems))
stems <- match(stems, uniqs)
attributes(stems) <- list(levels = uniqs, class = "factor")
m <- rollup(m, "Terms", stems)
## Recheck local bounds
## No need to check lower local bound as rollup aggregates frequencies
m[m > max_term_freq] <- 0
## Recheck word lengths
terms_length <- nchar(rownames(m))
m <- m[min_word_length <= terms_length &
terms_length <= max_word_length, ]
}
m <- filter_global_bounds(m, control$bounds$global)
.TermDocumentMatrix(m, control$weighting)
}
TermDocumentMatrix.PCorpus <-
TermDocumentMatrix.VCorpus <-
function(x, control = list())
{
stopifnot(is.list(control))
tflist <- tm_parLapply(unname(content(x)), termFreq, control)
v <- unlist(tflist)
i <- names(v)
terms <- sort(unique(as.character(if (is.null(control$dictionary)) i
else control$dictionary)))
i <- match(i, terms)
j <- rep.int(seq_along(x), lengths(tflist))
m <- .SimpleTripletMatrix(i, j, as.numeric(v), terms, x)
m <- filter_global_bounds(m, control$bounds$global)
.TermDocumentMatrix(m, control$weighting)
}
TermDocumentMatrix.default <-
function(x, control = list())
TermDocumentMatrix(Corpus(VectorSource(x)), control)
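## Illustrative sketch (not part of the package code): construction with a
## control list, assuming 'corpus' is an existing corpus:
##   tdm <- TermDocumentMatrix(corpus,
##                             control = list(removePunctuation = TRUE,
##                                            stopwords = TRUE,
##                                            stemming = TRUE,
##                                            wordLengths = c(3, Inf)))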
DocumentTermMatrix <-
function(x, control = list())
t(TermDocumentMatrix(x, control))
as.TermDocumentMatrix <-
function(x, ...)
UseMethod("as.TermDocumentMatrix")
as.TermDocumentMatrix.TermDocumentMatrix <-
function(x, ...)
x
as.TermDocumentMatrix.DocumentTermMatrix <-
function(x, ...)
t(x)
as.TermDocumentMatrix.term_frequency <-
as.TermDocumentMatrix.textcnt <-
function(x, ...)
{
m <- simple_triplet_matrix(i = seq_along(x),
j = rep_len(1L, length(x)),
v = as.numeric(x),
nrow = length(x),
ncol = 1,
dimnames =
list(Terms = names(x),
Docs = NA_character_))
.TermDocumentMatrix(m, weightTf)
}
as.TermDocumentMatrix.default <-
function(x, weighting, ...)
.TermDocumentMatrix(x, weighting)
as.DocumentTermMatrix <-
function(x, ...)
UseMethod("as.DocumentTermMatrix")
as.DocumentTermMatrix.DocumentTermMatrix <-
function(x, ...)
x
as.DocumentTermMatrix.TermDocumentMatrix <-
function(x, ...)
t(x)
as.DocumentTermMatrix.term_frequency <-
as.DocumentTermMatrix.textcnt <-
function(x, ...)
t(as.TermDocumentMatrix(x))
as.DocumentTermMatrix.default <-
function(x, weighting, ...)
{
x <- as.simple_triplet_matrix(x)
t(.TermDocumentMatrix(t(x), weighting))
}
t.TermDocumentMatrix <-
t.DocumentTermMatrix <-
function(x)
{
m <- NextMethod("t")
attr(m, "weighting") <- attr(x, "weighting")
class(m) <- if (inherits(x, "DocumentTermMatrix"))
TermDocumentMatrix_classes
else
DocumentTermMatrix_classes
m
}
termFreq <-
function(doc, control = list())
{
stopifnot(inherits(doc, "TextDocument") || is.character(doc),
is.list(control))
## Tokenize the corpus
.tokenize <- control$tokenize
if (is.null(.tokenize) || identical(.tokenize, "words"))
.tokenize <- words
else if (identical(.tokenize, "Boost"))
.tokenize <- Boost_tokenizer
else if (identical(.tokenize, "MC"))
.tokenize <- MC_tokenizer
else if (identical(.tokenize, "scan"))
.tokenize <- scan_tokenizer
else if (is.Span_Tokenizer(.tokenize))
.tokenize <- as.Token_Tokenizer(.tokenize)
if (is.function(.tokenize))
txt <- .tokenize(doc)
else
stop("invalid tokenizer")
## Conversion to lower case
.tolower <- control$tolower
if (is.null(.tolower) || isTRUE(.tolower))
.tolower <- tolower
if (is.function(.tolower))
txt <- .tolower(txt)
## Punctuation removal
.removePunctuation <- control$removePunctuation
if (isTRUE(.removePunctuation))
.removePunctuation <- removePunctuation
else if (is.list(.removePunctuation))
.removePunctuation <-
function(x) do.call(removePunctuation,
c(list(x), control$removePunctuation))
## Number removal
.removeNumbers <- control$removeNumbers
if (isTRUE(.removeNumbers))
.removeNumbers <- removeNumbers
.language <- control$language
if (inherits(doc, "TextDocument"))
.language <- meta(doc, "language")
if (is.null(.language))
.language <- "en"
## Stopword filtering
.stopwords <- control$stopwords
if (isTRUE(.stopwords))
.stopwords <- function(x) x[is.na(match(x, stopwords(.language)))]
else if (is.character(.stopwords))
.stopwords <- function(x) x[is.na(match(x, control$stopwords))]
## Stemming
.stemming <- control$stemming
if (isTRUE(.stemming))
.stemming <- function(x) SnowballC::wordStem(x, .language)
## Default order for options which support reordering
or <- c("removePunctuation", "removeNumbers", "stopwords", "stemming")
## Process control options in specified order
nc <- names(control)
n <- nc[!is.na(match(nc, or))]
for (name in sprintf(".%s", c(n, setdiff(or, n)))) {
g <- get(name)
if (is.function(g))
txt <- g(txt)
}
## If dictionary is set tabulate against it
dictionary <- control$dictionary
tab <- .table(if (is.null(dictionary))
txt
else
txt[!is.na(match(txt, dictionary))])
## Ensure local bounds
bl <- control$bounds$local
if (length(bl) == 2L && is.numeric(bl))
tab <- tab[(tab >= bl[1]) & (tab <= bl[2]), drop = FALSE]
## Filter out too short or too long terms
nc <- nchar(names(tab), type = "chars")
wl <- control$wordLengths
lb <- if (is.numeric(wl[1])) wl[1] else 3
ub <- if (is.numeric(wl[2])) wl[2] else Inf
tab <- tab[(nc >= lb) & (nc <= ub), drop = FALSE]
class(tab) <- c("term_frequency", class(tab))
tab
}
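## Illustrative sketch (not part of the package code):
##   termFreq(PlainTextDocument("one two two three three three"))
## yields a named term_frequency vector with counts one = 1, three = 3, two = 2.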
print.TermDocumentMatrix <-
print.DocumentTermMatrix <-
function(x, ...)
{
format <- c("term", "document")
if (inherits(x, "DocumentTermMatrix"))
format <- rev(format)
writeLines(sprintf("<<%s (%ss: %d, %ss: %d)>>",
class(x)[1], format[1L], nrow(x), format[2L], ncol(x)))
writeLines(sprintf("Non-/sparse entries: %d/%.0f",
length(x$v), prod(dim(x)) - length(x$v)))
sparsity <- if (!prod(dim(x))) 100
else round( (1 - length(x$v) / prod(dim(x))) * 100)
writeLines(sprintf("Sparsity : %s%%", sparsity))
writeLines(sprintf("Maximal term length: %s",
max(nchar(Terms(x), type = "chars"), 0)))
writeLines(sprintf("Weighting : %s (%s)",
attr(x, "weighting")[1L], attr(x, "weighting")[2L]))
invisible(x)
}
inspect.TermDocumentMatrix <-
inspect.DocumentTermMatrix <-
function(x)
{
print(x)
cat("Sample :\n")
print(as.matrix(sample.TermDocumentMatrix(x)))
}
`[.TermDocumentMatrix` <-
`[.DocumentTermMatrix` <-
function(x, i, j, ..., drop)
{
m <- NextMethod("[")
attr(m, "weighting") <- attr(x, "weighting")
class(m) <- if (inherits(x, "DocumentTermMatrix"))
DocumentTermMatrix_classes
else
TermDocumentMatrix_classes
m
}
`dimnames<-.DocumentTermMatrix` <-
function(x, value)
{
x <- NextMethod("dimnames<-")
dnx <- x$dimnames
if (!is.null(dnx))
names(dnx) <- c("Docs", "Terms")
x$dimnames <- dnx
x
}
`dimnames<-.TermDocumentMatrix` <-
function(x, value)
{
x <- NextMethod("dimnames<-")
dnx <- x$dimnames
if (!is.null(dnx))
names(dnx) <- c("Terms", "Docs")
x$dimnames <- dnx
x
}
nDocs <-
function(x)
UseMethod("nDocs")
nTerms <-
function(x)
UseMethod("nTerms")
nDocs.DocumentTermMatrix <-
nTerms.TermDocumentMatrix <-
function(x)
x$nrow
nDocs.TermDocumentMatrix <-
nTerms.DocumentTermMatrix <-
function(x)
x$ncol
Docs <-
function(x)
UseMethod("Docs")
Terms <-
function(x)
UseMethod("Terms")
Docs.DocumentTermMatrix <-
Terms.TermDocumentMatrix <-
function(x)
{
s <- x$dimnames[[1L]]
if (is.null(s))
s <- rep.int(NA_character_, x$nrow)
s
}
Docs.TermDocumentMatrix <-
Terms.DocumentTermMatrix <-
function(x)
{
s <- x$dimnames[[2L]]
if (is.null(s))
s <- rep.int(NA_character_, x$ncol)
s
}
c.term_frequency <-
function(..., recursive = FALSE)
{
do.call("c", lapply(list(...), as.TermDocumentMatrix))
}
c.TermDocumentMatrix <-
function(..., recursive = FALSE)
{
m <- lapply(list(...), as.TermDocumentMatrix)
if (length(m) == 1L)
return(m[[1L]])
weighting <- attr(m[[1L]], "weighting")
allTermsNonUnique <- unlist(lapply(m, function(x) Terms(x)[x$i]))
allTerms <- unique(allTermsNonUnique)
allDocs <- unlist(lapply(m, Docs))
cs <- cumsum(lapply(m, nDocs))
cs <- c(0, cs[-length(cs)])
j <- lapply(m, "[[", "j")
m <- simple_triplet_matrix(i = match(allTermsNonUnique, allTerms),
j = unlist(j) + rep.int(cs, lengths(j)),
v = unlist(lapply(m, "[[", "v")),
nrow = length(allTerms),
ncol = length(allDocs),
dimnames =
list(Terms = allTerms,
Docs = allDocs))
##
## - We assume that all arguments have the same weighting
## - Even if all matrices have the same input weighting it might be
## necessary to take additional steps (e.g., normalization for tf-idf or
## check for (0,1)-range for binary tf)
##
.TermDocumentMatrix(m, weighting)
}
c.DocumentTermMatrix <-
function(..., recursive = FALSE)
{
t(do.call("c", lapply(list(...), as.TermDocumentMatrix)))
}
findFreqTerms <-
function(x, lowfreq = 0, highfreq = Inf)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(lowfreq), is.numeric(highfreq))
if (inherits(x, "DocumentTermMatrix")) x <- t(x)
rs <- row_sums(x)
names(rs[rs >= lowfreq & rs <= highfreq])
}
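## Illustrative sketch (not part of the package code): terms occurring at least
## five times in total across the bundled 'crude' corpus (after data("crude")):
##   findFreqTerms(TermDocumentMatrix(crude), lowfreq = 5)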
findAssocs <-
function(x, terms, corlimit)
UseMethod("findAssocs", x)
findAssocs.TermDocumentMatrix <-
function(x, terms, corlimit)
findAssocs(t(x), terms, corlimit)
findAssocs.DocumentTermMatrix <-
function(x, terms, corlimit)
{
stopifnot(is.character(terms), is.numeric(corlimit),
corlimit >= 0, corlimit <= 1)
j <- match(unique(terms), Terms(x), nomatch = 0L)
suppressWarnings(
findAssocs(crossapply_simple_triplet_matrix(x[, j], x[, -j], cor),
terms, rep_len(corlimit, length(terms))))
}
findAssocs.matrix <-
function(x, terms, corlimit)
{
stopifnot(is.numeric(x))
i <- match(terms, rownames(x), nomatch = 0L)
names(i) <- terms
Map(function(i, cl) {
xi <- x[i, ]
t <- sort(round(xi[which(xi >= cl)], 2), TRUE)
if (!length(t))
names(t) <- NULL
t
},
i, corlimit)
}
removeSparseTerms <-
function(x, sparse)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(sparse), sparse > 0, sparse < 1)
m <- if (inherits(x, "DocumentTermMatrix")) t(x) else x
t <- table(m$i) > m$ncol * (1 - sparse)
termIndex <- as.numeric(names(t[t]))
if (inherits(x, "DocumentTermMatrix")) x[, termIndex] else x[termIndex, ]
}
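## Illustrative sketch (not part of the package code): with sparse = 0.4 only
## terms occurring in more than roughly 60% of the documents are kept, e.g.
##   removeSparseTerms(DocumentTermMatrix(crude), sparse = 0.4)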
sample.TermDocumentMatrix <-
function(x, size = 10)
{
stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.numeric(size), size >= 0)
if (length(x$v) == 0L)
return(x)
m <- if (inherits(x, "DocumentTermMatrix")) t(x) else x
terms <- sort(names(sort(row_sums(m), decreasing = TRUE)
[0:min(size, nTerms(m))]))
docs <- sort(names(sort(col_sums(m), decreasing = TRUE)
[0:min(size, nDocs(m))]))
if (inherits(x, "DocumentTermMatrix")) x[docs, terms] else x[terms, docs]
}
CategorizedDocumentTermMatrix <-
function(x, c)
{
if (inherits(x, "TermDocumentMatrix"))
x <- t(x)
else if (!inherits(x, "DocumentTermMatrix"))
stop("wrong class")
if (length(c) != nDocs(x))
stop("invalid category ids")
attr(x, "Category") <- c
class(x) <- c("CategorizedDocumentTermMatrix",
DocumentTermMatrix_classes)
x
}
findMostFreqTerms <-
function(x, n = 6L, ...)
UseMethod("findMostFreqTerms")
findMostFreqTerms.term_frequency <-
function(x, n = 6L, ...)
{
y <- x[order(x, decreasing = TRUE)[seq_len(n)]]
y[y > 0]
}
findMostFreqTerms.DocumentTermMatrix <-
function(x, n = 6L, INDEX = NULL, ...)
{
terms <- Terms(x)
if (!is.null(INDEX))
x <- rollup(x, 1L, INDEX)
f <- factor(x$i, seq_len(x$nrow))
js <- split(x$j, f)
vs <- split(x$v, f)
y <- Map(function(j, v, n) {
p <- order(v, decreasing = TRUE)[seq_len(n)]
v <- v[p]
names(v) <- terms[j[p]]
v
},
js, vs, pmin(lengths(vs), n))
names(y) <- x$dimnames[[1L]]
y
}
findMostFreqTerms.TermDocumentMatrix <-
function(x, n = 6L, INDEX = NULL, ...)
{
terms <- Terms(x)
if (!is.null(INDEX))
x <- rollup(x, 2L, INDEX)
f <- factor(x$j, seq_len(x$ncol))
is <- split(x$i, f)
vs <- split(x$v, f)
y <- Map(function(i, v, n) {
p <- order(v, decreasing = TRUE)[seq_len(n)]
v <- v[p]
names(v) <- terms[i[p]]
v
},
is, vs, pmin(lengths(vs), n))
names(y) <- x$dimnames[[2L]]
y
}
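## Illustrative sketch (not part of the package code): the three most frequent
## terms per document:
##   findMostFreqTerms(DocumentTermMatrix(crude), n = 3L)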
tm/R/corpus.R
# Author: Ingo Feinerer
Corpus <-
function(x, readerControl = list(reader = reader(x), language = "en"))
{
stopifnot(inherits(x, "Source"))
readerControl <- prepareReader(readerControl, reader(x))
if ( (inherits(x, "DataframeSource") || inherits(x, "DirSource") ||
inherits(x, "VectorSource") ) &&
identical(readerControl$reader, reader(x)))
SimpleCorpus(x, readerControl)
else
VCorpus(x, readerControl)
}
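## Illustrative sketch (not part of the package code): with the default reader,
## vector, data frame, and directory sources yield a SimpleCorpus, e.g.
##   Corpus(VectorSource(c("First document.", "Second document.")))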
PCorpus <-
function(x,
readerControl = list(reader = reader(x), language = "en"),
dbControl = list(dbName = "", dbType = "DB1"))
{
stopifnot(inherits(x, "Source"))
readerControl <- prepareReader(readerControl, reader(x))
if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
stop("error in creating database")
db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
x <- open(x)
tdl <- vector("list", length(x))
counter <- 1
while (!eoi(x)) {
x <- stepNext(x)
elem <- getElem(x)
doc <- readerControl$reader(elem,
readerControl$language,
as.character(counter))
filehash::dbInsert(db, meta(doc, "id"), doc)
tdl[[counter]] <- meta(doc, "id")
counter <- counter + 1
}
x <- close(x)
cmeta <- CorpusMeta()
dmeta <- data.frame(row.names = seq_along(tdl))
## Check if metadata retrieval is supported
for(cl in class(x)) {
if (is.function(getS3method("getMeta", cl, TRUE))) {
m <- getMeta(x)
if (!is.null(m$cmeta)) cmeta <- m$cmeta
if (!is.null(m$dmeta)) dmeta <- m$dmeta
break
}
}
p <- list(content = tdl, meta = cmeta, dmeta = dmeta, dbcontrol = dbControl)
class(p) <- c("PCorpus", "Corpus")
p
}
SimpleCorpus <-
function(x, control = list(language = "en"))
{
stopifnot(inherits(x, "Source"))
if (!is.null(control$reader) && !identical(control$reader, reader(x)))
warning("custom reader is ignored")
content <- if (inherits(x, "VectorSource")) {
if (is.character(x$content)) x$content else as.character(x$content)
} else if (inherits(x, "DirSource")) {
setNames(as.character(
lapply(x$filelist,
function(f) paste(readContent(f, x$encoding, "text"),
collapse = "\n"))
),
basename(x$filelist))
} else if (inherits(x, "DataframeSource")) {
setNames(as.character(x$content[, "text"]), x$content[, "doc_id"])
} else
stop("unsupported source type")
dmeta <- if (inherits(x, "DataframeSource"))
x$content[, is.na(match(names(x$content),
c("doc_id", "text"))),
drop = FALSE]
else
data.frame(row.names = seq_along(x))
s <- list(content = content,
meta = CorpusMeta(language = control$language),
dmeta = dmeta)
class(s) <- c("SimpleCorpus", "Corpus")
s
}
VCorpus <-
function(x, readerControl = list(reader = reader(x), language = "en"))
{
stopifnot(inherits(x, "Source"))
readerControl <- prepareReader(readerControl, reader(x))
x <- open(x)
tdl <- vector("list", length(x))
## Check for parallel element access
found <- FALSE
for(cl in class(x)) {
if (is.function(getS3method("pGetElem", cl, TRUE))) {
tdl <- mapply(function(elem, id)
readerControl$reader(elem,
readerControl$language,
id),
pGetElem(x),
id = as.character(seq_along(x)),
SIMPLIFY = FALSE)
found <- TRUE
break
}
}
if(!found) {
counter <- 1
while (!eoi(x)) {
x <- stepNext(x)
elem <- getElem(x)
doc <- readerControl$reader(elem,
readerControl$language,
as.character(counter))
tdl[[counter]] <- doc
counter <- counter + 1
}
}
x <- close(x)
cmeta <- CorpusMeta()
dmeta <- data.frame(row.names = seq_along(tdl))
## Check if metadata retrieval is supported
for(cl in class(x)) {
if (is.function(getS3method("getMeta", cl, TRUE))) {
m <- getMeta(x)
if (!is.null(m$cmeta)) cmeta <- m$cmeta
if (!is.null(m$dmeta)) dmeta <- m$dmeta
break
}
}
v <- as.VCorpus(tdl)
v$meta <- cmeta
v$dmeta <- dmeta
v
}
`[.PCorpus` <-
`[.SimpleCorpus` <-
function(x, i)
{
if (!missing(i)) {
x$content <- x$content[i]
x$dmeta <- x$dmeta[i, , drop = FALSE]
}
x
}
`[.VCorpus` <-
function(x, i)
{
if (!missing(i)) {
x$content <- x$content[i]
x$dmeta <- x$dmeta[i, , drop = FALSE]
if (!is.null(x$lazy))
x$lazy$index <- x$lazy$index[i]
}
x
}
.map_name_index <-
function(x, i)
{
if (is.character(i))
match(i, meta(x, "id", "local"))
else
i
}
`[[.PCorpus` <-
function(x, i)
{
i <- .map_name_index(x, i)
db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
filehash::dbFetch(db, x$content[[i]])
}
`[[.SimpleCorpus` <-
function(x, i)
{
i <- .map_name_index(x, i)
n <- names(x$content)
PlainTextDocument(x$content[[i]],
id = if (is.null(n)) i else n[i],
language = meta(x, "language"))
}
`[[.VCorpus` <-
function(x, i)
{
i <- .map_name_index(x, i)
if (!is.null(x$lazy))
.Call(`_tm_copyCorpus`, x, materialize(x, i))
x$content[[i]]
}
`[[<-.SimpleCorpus` <-
function(x, i, value)
{
x$content[i] <- paste0(as.character(value), collapse = "\n")
x
}
`[[<-.PCorpus` <-
function(x, i, value)
{
i <- .map_name_index(x, i)
db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
db[[x$content[[i]]]] <- value
x
}
`[[<-.VCorpus` <-
function(x, i, value)
{
i <- .map_name_index(x, i)
# Mark new objects as inactive for lazy mapping
if (!is.null(x$lazy))
x$lazy$index[i] <- FALSE
x$content[[i]] <- value
x
}
as.list.PCorpus <- as.list.VCorpus <-
function(x, ...)
setNames(content(x), as.character(lapply(content(x), meta, "id")))
as.list.SimpleCorpus <-
function(x, ...)
as.list(content(x))
as.VCorpus <-
function(x)
UseMethod("as.VCorpus")
as.VCorpus.VCorpus <- identity
as.VCorpus.list <-
function(x)
{
v <- list(content = x,
meta = CorpusMeta(),
dmeta = data.frame(row.names = seq_along(x)))
class(v) <- c("VCorpus", "Corpus")
v
}
outer_union <-
function(x, y, ...)
{
if (nrow(x) > 0L)
x[, setdiff(names(y), names(x))] <- NA
if (nrow(y) > 0L)
y[, setdiff(names(x), names(y))] <- NA
res <- rbind(x, y)
if (ncol(res) == 0L)
res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
res
}
c.VCorpus <-
function(..., recursive = FALSE)
{
args <- list(...)
x <- args[[1L]]
if (length(args) == 1L)
return(x)
if (!all(unlist(lapply(args, inherits, class(x)))))
stop("not all arguments are of the same corpus type")
v <- list(content = do.call("c", lapply(args, content)),
meta = CorpusMeta(meta = do.call("c",
lapply(args, function(a) meta(a, type = "corpus")))),
dmeta = Reduce(outer_union, lapply(args, meta)))
class(v) <- c("VCorpus", "Corpus")
v
}
content.VCorpus <-
function(x)
{
if (!is.null(x$lazy))
.Call(`_tm_copyCorpus`, x, materialize(x))
x$content
}
content.SimpleCorpus <-
function(x)
x$content
content.PCorpus <-
function(x)
{
db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
filehash::dbMultiFetch(db, unlist(x$content))
}
inspect <-
function(x)
UseMethod("inspect", x)
inspect.PCorpus <-
inspect.SimpleCorpus <-
inspect.VCorpus <-
function(x)
{
print(x)
cat("\n")
print(noquote(content(x)))
invisible(x)
}
length.PCorpus <-
length.SimpleCorpus <-
length.VCorpus <-
function(x)
length(x$content)
names.PCorpus <-
names.SimpleCorpus <-
names.VCorpus <-
function(x)
as.character(meta(x, "id", "local"))
`names<-.PCorpus` <- `names<-.VCorpus` <-
function(x, value)
{
meta(x, "id", "local") <- as.character(value)
x
}
format.PCorpus <-
format.SimpleCorpus <-
format.VCorpus <-
function(x, ...)
{
c(sprintf("<<%s>>", class(x)[1L]),
sprintf("Metadata: corpus specific: %d, document level (indexed): %d",
length(meta(x, type = "corpus")),
ncol(meta(x, type = "indexed"))),
sprintf("Content: documents: %d", length(x)))
}
writeCorpus <-
function(x, path = ".", filenames = NULL)
{
filenames <- file.path(path,
if (is.null(filenames))
sprintf("%s.txt", as.character(meta(x, "id", "local")))
else filenames)
stopifnot(length(x) == length(filenames))
mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
invisible(x)
}
tm/R/filter.R 0000644 0001751 0000144 00000000670 13667624327 012544 0 ustar hornik users # Author: Ingo Feinerer
# Filters
tm_filter <-
function(x, FUN, ...)
UseMethod("tm_filter", x)
tm_filter.PCorpus <-
tm_filter.SimpleCorpus <-
tm_filter.VCorpus <-
function(x, FUN, ...)
x[tm_index(x, FUN, ...)]
tm_index <-
function(x, FUN, ...)
UseMethod("tm_index", x)
tm_index.PCorpus <-
tm_index.SimpleCorpus <-
tm_index.VCorpus <-
function(x, FUN, ...)
unlist(tm_parLapply(content(x), function(y) isTRUE(FUN(y, ...))))
tm/R/utils.R 0000644 0001751 0000144 00000003302 13206313007 012365 0 ustar hornik users ## Helper functions
.print_via_format <-
function(x, ...)
{
writeLines(format(x, ...))
invisible(x)
}
## Efficient alternative to table() proposed by Kurt Hornik
.table <- function(x) {
u <- sort(unique(x))
if(!length(u)) return(integer())
v <- tabulate(match(x, u))
names(v) <- u
v
}
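## A minimal illustration (hypothetical input): .table(c("b", "a", "b"))
## returns c(a = 1L, b = 2L), i.e., named counts of the sorted unique values.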
.xml_content <- function(doc, spec) {
switch(spec[[1]],
node = xml_text(xml_find_all(doc, spec[[2]])),
"function" = spec[[2]](doc),
unevaluated = spec[[2]])
}
IETF_Snowball_map <-
list("danish" = c("da", "dan"),
"dutch" = c("nl", "nld", "dut"),
"english" = c("en", "eng"),
"finnish" = c("fi", "fin"),
"french" = c("fr", "fra", "fre"),
"german" = c("de", "deu", "ger"),
"hungarian" = c("hu", "hun"),
"italian" = c("it", "ita"),
"norwegian" = c("no", "nor"),
"portuguese" = c("pt", "por"),
"romanian" = c("ro", "ron", "rum"),
"russian" = c("ru", "rus"),
"spanish" = c("es", "esl", "spa"),
"swedish" = c("sv", "swe"),
## Have stopwords but no SnowballC stemmer ...
"catalan" = c("ca", "cat"),
## Have SnowballC stemmer but no stopwords ...
"turkish" = c("tr", "tur")
)
# Map IETF language tags to languages used by the Snowball stemmer project
# http://en.wikipedia.org/wiki/IETF_language_tag
map_IETF_Snowball <-
local({
codes <- unlist(IETF_Snowball_map, use.names = FALSE)
names <- rep.int(names(IETF_Snowball_map), lengths(IETF_Snowball_map))
function(code) {
code <- as.character(code)
if (identical(code, "") || identical(code, character(0)) || is.na(code))
return("porter")
names[charmatch(gsub("-.*", "", code), codes)]
}
})
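## Minimal illustration (hypothetical inputs): map_IETF_Snowball("en-US")
## yields "english" (the region subtag is dropped before matching), and an
## empty or missing code yields the default "porter" stemmer.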
tm/R/pdftools.R 0000644 0001751 0000144 00000010463 12776627444 013115 0 ustar hornik users pdf_info_via_xpdf <-
function(file, options = NULL)
{
outfile <- tempfile("pdfinfo")
on.exit(unlink(outfile))
status <- system2("pdfinfo", c(options, shQuote(normalizePath(file))),
stdout = outfile)
## Could check the status ...
## This does not work ...
## info <- as.list(read.dcf(outfile)[1L, ])
tags <- c("Title", "Subject", "Keywords", "Author", "Creator",
"Producer", "CreationDate", "ModDate", "Tagged", "Form",
"Pages", "Encrypted", "Page size", "File size",
"Optimized", "PDF version")
re <- sprintf("^(%s)",
paste(sprintf("%-16s", sprintf("%s:", tags)),
collapse = "|"))
lines <- readLines(outfile, warn = FALSE)
ind <- grepl(re, lines)
tags <- sub(": *", "", substring(lines[ind], 1L, 16L))
info <- split(sub(re, "", lines), cumsum(ind))
names(info) <- tags
fmt <- "%a %b %d %X %Y"
if (!is.null(d <- info$CreationDate))
info$CreationDate <- strptime(d, fmt)
if (!is.null(d <- info$ModDate))
info$ModDate <- strptime(d, fmt)
if (!is.null(p <- info$Pages))
info$Pages <- as.integer(p)
info
}
pdf_info_via_gs <-
function(file)
{
file <- normalizePath(file)
gs_cmd <- tools::find_gs_cmd()
out <- system2(gs_cmd,
c("-dNODISPLAY -q",
sprintf("-sFile=%s", shQuote(file)),
system.file("ghostscript", "pdf_info.ps",
package = "tm")),
stdout = TRUE)
out <- out[cumsum(out == "") == 2L][-1L]
val <- sub("^[^:]+:[[:space:]]*", "", out)
names(val) <- sub(":.*", "", out)
val <- as.list(val)
if (!is.null(d <- val$CreationDate))
val$CreationDate <- PDF_Date_to_POSIXt(d)
if (!is.null(d <- val$ModDate))
val$ModDate <- PDF_Date_to_POSIXt(d)
val
}
PDF_Date_to_POSIXt <-
function(s)
{
## Strip optional 'D:' prefix.
s <- sub("^D:", "", s)
## Strip apostrophes in offset spec.
s <- gsub("'", "", s)
if (nchar(s) <= 14L) {
s <- sprintf("%s%s", s,
substring(" 0101000000", nchar(s) + 1L, 14L))
strptime(s, "%Y%m%d%H%M%S")
} else if (substring(s, 15L, 15L) == "Z") {
strptime(substring(s, 1L, 14L), "%Y%m%d%H%M%S")
} else {
strptime(s, "%Y%m%d%H%M%S%z")
}
}
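## Minimal illustration with made-up PDF date strings:
## PDF_Date_to_POSIXt("D:20240131120000+01'00'")  # full date with UTC offset
## PDF_Date_to_POSIXt("D:20240131")               # padded to midnight, Jan 31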
pdf_text_via_gs <-
function(file)
{
file <- normalizePath(file)
gs_cmd <- tools::find_gs_cmd()
tf <- tempfile("pdf")
on.exit(unlink(tf))
## The current mechanism is first converting PDF to Postscript using
## the ps2write device, and then extract text using the ps2ascii.ps
## program. This fails for some files (e.g.,
## /data/rsync/PKGS/AlleleRetain/inst/doc/AlleleRetain_User_Guide.pdf
## which Ghostscript also fails to render. Note that rendering via
## gv works "fine": but this uses the pswrite device which produces
## bitmap (from which no text can be extracted, of course).
## Using the txtwrite device is simply too unstable: e.g.,
## gs -dBATCH -dNOPAUSE -sDEVICE=txtwrite -dQUIET -sOutputFile=- \
## /data/rsync/PKGS/AlleleRetain/inst/doc/AlleleRetain_User_Guide.pdf
## keeps segfaulting.
## An additional nuisance is that there seems no simple way to
## detect a ps2ascii.ps failure.
## Finally, note that we currently use -DSIMPLE: without this, more
## information would be made available, but require post-processing.
## Step 1. Convert PDF to Postscript.
res <- system2(gs_cmd,
c("-q -dNOPAUSE -dBATCH -P- -dSAFER -sDEVICE=ps2write",
sprintf("-sOutputFile=%s", tf),
"-c save pop -f",
shQuote(file)))
## Step 2. Extract text.
txt <- system2(gs_cmd,
c("-q -dNODISPLAY -P- -dSAFER -dDELAYBIND -dWRITESYSTEMDICT -dSIMPLE",
"-c save -f ps2ascii.ps",
tf,
"-c quit"),
stdout = TRUE)
## Argh. How can we catch errors?
## The return values are always 0 ...
if (any(grepl("Error handled by opdfread.ps", txt))) {
stop(paste(c("Ghostscript failed, with output:", txt),
collapse = "\n"))
}
strsplit(paste(txt, collapse = "\n"), "\f")[[1L]]
}
tm/R/score.R 0000644 0001751 0000144 00000001430 13023472115 012343 0 ustar hornik users tm_term_score <-
function(x, terms, FUN)
UseMethod("tm_term_score", x)
tm_term_score.term_frequency <-
function(x, terms, FUN = function(x) sum(x, na.rm = TRUE))
FUN(x[match(terms, names(x), nomatch = 0L)])
tm_term_score.PlainTextDocument <-
function(x, terms, FUN = function(x) sum(x, na.rm = TRUE))
tm_term_score(termFreq(x, control = list(tolower = FALSE,
removePunctuation = TRUE,
wordLengths = c(1, Inf))),
terms, FUN)
tm_term_score.TermDocumentMatrix <-
function(x, terms, FUN = col_sums)
FUN(x[match(terms, Terms(x), nomatch = 0L), ])
tm_term_score.DocumentTermMatrix <-
function(x, terms, FUN = row_sums)
FUN(x[, match(terms, Terms(x), nomatch = 0L)])
tm/R/meta.R 0000644 0001751 0000144 00000010671 13110235234 012161 0 ustar hornik users # Author: Ingo Feinerer
TextDocumentMeta <-
function(author, datetimestamp, description, heading, id, language, origin, ...,
meta = NULL)
{
if (is.null(meta))
meta <- list(author = author, datetimestamp = datetimestamp,
description = description, heading = heading, id = id,
language = language, origin = origin, ...)
stopifnot(is.list(meta))
if (!is.null(meta$author) && !inherits(meta$author, "person"))
meta$author <- as.character(meta$author)
if (!is.null(meta$datetimestamp) && !inherits(meta$datetimestamp, "POSIXt"))
meta$datetimestamp <- as.character(meta$datetimestamp)
if (!is.null(meta$description))
meta$description <- as.character(meta$description)
if (!is.null(meta$heading))
meta$heading <- as.character(meta$heading)
if (!is.null(meta$id))
meta$id <- as.character(meta$id)
if (!is.null(meta$language))
meta$language <- as.character(meta$language)
if (!is.null(meta$origin))
meta$origin <- as.character(meta$origin)
class(meta) <- "TextDocumentMeta"
meta
}
print.TextDocumentMeta <-
function(x, ...)
{
cat(sprintf(" %s: %s",
format(names(x), justify = "left"),
sapply(x, as.character)),
sep = "\n")
invisible(x)
}
CorpusMeta <-
function(..., meta = NULL)
{
if (is.null(meta))
meta <- list(...)
stopifnot(is.list(meta))
class(meta) <- "CorpusMeta"
meta
}
meta.SimpleCorpus <-
function(x, tag = NULL, type = c("indexed", "corpus"), ...)
{
if (identical(tag, "id")) {
n <- names(content(x))
return(if (is.null(n)) as.character(seq_along(x)) else n)
}
if (!is.null(tag) && missing(type))
type <- if (tag %in% names(x$meta)) "corpus" else "indexed"
type <- match.arg(type)
if (identical(type, "indexed"))
if (is.null(tag)) x$dmeta else x$dmeta[tag]
else if (identical(type, "corpus"))
if (is.null(tag)) x$meta else x$meta[[tag]]
else
stop("invalid type")
}
meta.VCorpus <- meta.PCorpus <-
function(x, tag = NULL, type = c("indexed", "corpus", "local"), ...)
{
if (!is.null(tag) && missing(type)) {
type <- if (tag %in% names(x$dmeta)) "indexed"
else if (tag %in% names(x$meta)) "corpus"
else "local"
}
type <- match.arg(type)
if (identical(type, "indexed"))
if (is.null(tag)) x$dmeta else x$dmeta[tag]
else if (identical(type, "corpus"))
if (is.null(tag)) x$meta else x$meta[[tag]]
else if (identical(type, "local"))
lapply(x, meta, tag)
else
stop("invalid type")
}
`meta<-.SimpleCorpus` <-
function(x, tag, type = c("indexed", "corpus"), ..., value)
{
type <- match.arg(type)
if (identical(type, "indexed"))
x$dmeta[, tag] <- value
else if (type == "corpus")
x$meta[[tag]] <- value
else
stop("invalid type")
x
}
`meta<-.VCorpus` <- `meta<-.PCorpus` <-
function(x, tag, type = c("indexed", "corpus", "local"), ..., value)
{
type <- match.arg(type)
if (identical(type, "indexed"))
x$dmeta[, tag] <- value
else if (type == "corpus")
x$meta[[tag]] <- value
else if (identical(type, "local")) {
for (i in seq_along(x))
meta(x[[i]], tag) <- value[i]
} else
stop("invalid type")
x
}
# Simple Dublin Core to tm metadata mapping
# http://en.wikipedia.org/wiki/Dublin_core#Simple_Dublin_Core
Dublin_Core_tm_map <-
list("contributor" = "contributor",
"coverage" = "coverage",
"creator" = "author",
"date" = "datetimestamp",
"description" = "description",
"format" = "format",
"identifier" = "id",
"language" = "language",
"publisher" = "publisher",
"relation" = "relation",
"rights" = "rights",
"source" = "source", # or better "origin"?
"subject" = "subject",
"title" = "heading",
"type" = "type"
)
DublinCore <-
function(x, tag = NULL)
{
tmm <- unlist(Dublin_Core_tm_map, use.names = FALSE)
dcm <- names(Dublin_Core_tm_map)
if (is.null(tag)) {
m <- lapply(tmm, function(t) meta(x, t))
names(m) <- dcm
class(m) <- "TextDocumentMeta"
m
} else
meta(x, tmm[charmatch(tolower(tag), dcm)])
}
`DublinCore<-` <-
function(x, tag, value)
{
tmm <- unlist(Dublin_Core_tm_map, use.names = FALSE)
dcm <- names(Dublin_Core_tm_map)
meta(x, tmm[charmatch(tolower(tag), dcm)]) <- value
x
}
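## Minimal usage sketch (assumes 'doc' is an existing text document, e.g. a
## PlainTextDocument):
## DublinCore(doc, "creator")              # maps to the tm "author" metadatum
## DublinCore(doc, "title") <- "A heading" # maps to the tm "heading" metadatum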
tm/R/weight.R 0000644 0001751 0000144 00000012115 12776627444 012546 0 ustar hornik users # Author: Ingo Feinerer
WeightFunction <- function(x, name, acronym) {
class(x) <- c("WeightFunction", "function")
attr(x, "name") <- name
attr(x, "acronym") <- acronym
x
}
# Actual TermDocumentMatrix weighting functions
weightTf <-
WeightFunction(function(m) {
attr(m, "weighting") <- c("term frequency", "tf")
m
}, "term frequency", "tf")
weightTfIdf <-
WeightFunction(function(m, normalize = TRUE) {
isDTM <- inherits(m, "DocumentTermMatrix")
if (isDTM) m <- t(m)
if (normalize) {
cs <- col_sums(m)
if (any(cs == 0))
warning("empty document(s): ",
paste(Docs(m)[cs == 0], collapse = " "))
names(cs) <- seq_len(nDocs(m))
m$v <- m$v / cs[m$j]
}
rs <- row_sums(m > 0)
if (any(rs == 0))
warning("unreferenced term(s): ",
paste(Terms(m)[rs == 0], collapse = " "))
lnrs <- log2(nDocs(m) / rs)
lnrs[!is.finite(lnrs)] <- 0
m <- m * lnrs
attr(m, "weighting") <-
c(sprintf("%s%s",
"term frequency - inverse document frequency",
if (normalize) " (normalized)" else ""),
"tf-idf")
if (isDTM) t(m) else m
}, "term frequency - inverse document frequency", "tf-idf")
weightSMART <-
WeightFunction(function(m, spec = "nnn", control = list()) {
stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")),
is.character(spec), nchar(spec) == 3L, is.list(control))
term_frequency <-
match.arg(substr(spec, 1L, 1L),
c("n", "l", "a", "b", "L"))
document_frequency <-
match.arg(substr(spec, 2L, 2L),
c("n", "t", "p"))
normalization <-
match.arg(substr(spec, 3L, 3L),
c("n", "c", "u", "b"))
isDTM <- inherits(m, "DocumentTermMatrix")
if (isDTM) m <- t(m)
if (normalization == "b") {
## Need to compute the character lengths of the documents
## before starting the weighting.
charlengths <-
tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum)
}
## Term frequency
m$v <- switch(term_frequency,
## natural
n = m$v,
## logarithm
l = 1 + log2(m$v),
## augmented
a = {
s <- tapply(m$v, m$j, max)
0.5 + (0.5 * m$v) / s[as.character(m$j)]
},
## boolean
b = as.numeric(m$v > 0),
## log ave
L = {
s <- tapply(m$v, m$j, mean)
((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)])))
})
## Document frequency
rs <- row_sums(m > 0)
if (any(rs == 0))
warning("unreferenced term(s): ",
paste(Terms(m)[rs == 0], collapse = " "))
df <- switch(document_frequency,
## natural
n = 1,
## idf
t = log2(nDocs(m) / rs),
## prob idf
p = max(0, log2((nDocs(m) - rs) / rs)))
df[!is.finite(df)] <- 0
## Normalization
cs <- col_sums(m)
if (any(cs == 0))
warning("empty document(s): ",
paste(Docs(m)[cs == 0], collapse = " "))
norm <- switch(normalization,
## none
n = rep.int(1, nDocs(m)),
## cosine
c = sqrt(col_sums(m ^ 2)),
## pivoted unique
u = {
if (is.null(pivot <- control$pivot))
stop("invalid control argument pivot")
if (is.null(slope <- control$slope))
stop("invalid control argument slope")
(slope * sqrt(col_sums(m ^ 2)) +
(1 - slope) * pivot)
},
## byte size
b = {
if (is.null(alpha <- control$alpha))
stop("invalid control argument alpha")
norm <- double(nDocs(m))
norm[match(names(charlengths),
seq_along(norm))] <-
charlengths ^ alpha
norm
})
m <- m * df
m$v <- m$v / norm[m$j]
attr(m, "weighting") <- c(paste("SMART", spec), "SMART")
if (isDTM) t(m) else m
}, "SMART", "SMART")
weightBin <-
WeightFunction(function(m) {
m$v <- rep_len(1L, length(m$v))
attr(m, "weighting") <- c("binary", "bin")
m
}, "binary", "bin")
tm/R/reader.R 0000644 0001751 0000144 00000016305 13177046106 012510 0 ustar hornik users ## Author: Ingo Feinerer
## Readers
FunctionGenerator <-
function(x)
{
class(x) <- c("FunctionGenerator", "function")
x
}
getReaders <-
function()
c("readDataframe", "readDOC", "readPDF", "readPlain", "readRCV1",
"readRCV1asPlain", "readReut21578XML", "readReut21578XMLasPlain",
"readTagged", "readXML")
prepareReader <-
function(readerControl, reader = NULL, ...)
{
if (is.null(readerControl$reader))
readerControl$reader <- reader
if (inherits(readerControl$reader, "FunctionGenerator"))
readerControl$reader <- readerControl$reader(...)
if (is.null(readerControl$language))
readerControl$language <- "en"
readerControl
}
processURI <-
function(uri)
{
uri <- as.character(uri)
if (identical(substr(uri, 1, 7), "file://"))
uri <- substr(uri, 8, nchar(uri))
uri
}
readDataframe <-
function(elem, language, id) {
PlainTextDocument(elem$content[, "text"],
id = elem$content[, "doc_id"],
language = language)
}
# readDOC needs antiword installed to be able to extract the text
readDOC <-
function(engine = c("antiword", "executable"),
AntiwordOptions = "")
{
stopifnot(is.character(engine), is.character(AntiwordOptions))
engine <- match.arg(engine)
antiword <-
switch(engine,
antiword = antiword::antiword,
executable = function(x)
system2("antiword",
c(AntiwordOptions, shQuote(normalizePath(x))),
stdout = TRUE))
if (!is.function(antiword))
stop("invalid function for DOC extraction")
function(elem, language, id) {
uri <- processURI(elem$uri)
content <- antiword(uri)
PlainTextDocument(content, id = basename(elem$uri), language = language)
}
}
class(readDOC) <- c("FunctionGenerator", "function")
readPDF <-
function(engine = c("pdftools", "xpdf", "Rpoppler",
"ghostscript", "Rcampdf", "custom"),
control = list(info = NULL, text = NULL))
{
stopifnot(is.character(engine), is.list(control))
engine <- match.arg(engine)
pdf_info <-
switch(engine,
pdftools = function(x) {
i <- pdftools::pdf_info(x)
c(i$keys, list(CreationDate = i$created))
},
xpdf = function(x) pdf_info_via_xpdf(x, control$info),
Rpoppler = Rpoppler::PDF_info,
ghostscript = pdf_info_via_gs,
Rcampdf = Rcampdf::pdf_info,
custom = control$info)
pdf_text <-
switch(engine,
pdftools = pdftools::pdf_text,
xpdf = function(x) system2("pdftotext",
c(control$text, shQuote(x), "-"),
stdout = TRUE),
Rpoppler = Rpoppler::PDF_text,
ghostscript = pdf_text_via_gs,
Rcampdf = Rcampdf::pdf_text,
custom = control$text)
if (!is.function(pdf_info) || !is.function(pdf_text))
stop("invalid function for PDF extraction")
function(elem, language, id) {
uri <- processURI(elem$uri)
meta <- pdf_info(uri)
content <- pdf_text(uri)
PlainTextDocument(content, meta$Author, meta$CreationDate, meta$Subject,
meta$Title, basename(elem$uri), language,
meta$Creator)
}
}
class(readPDF) <- c("FunctionGenerator", "function")
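## Minimal usage sketch (the file name is hypothetical; the chosen engine
## must be installed):
## reader <- readPDF(engine = "pdftools")
## doc <- reader(elem = list(uri = "paper.pdf"), language = "en", id = "paper.pdf")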
readPlain <-
function(elem, language, id) {
if (!is.null(elem$uri))
id <- basename(elem$uri)
PlainTextDocument(elem$content, id = id, language = language)
}
readXML <-
function(spec, doc)
{
stopifnot(is.list(spec), inherits(doc, "TextDocument"))
function(elem, language, id) {
content <- elem$content
node <- if(inherits(content, "xml_node"))
content
else if(is.character(content))
read_xml(paste(elem$content, collapse = "\n"))
else
read_xml(content)
content(doc) <- if ("content" %in% names(spec))
.xml_content(node, spec[["content"]])
else
node
for (n in setdiff(names(spec), "content"))
meta(doc, n) <- .xml_content(node, spec[[n]])
if (!is.null(elem$uri))
id <- basename(elem$uri)
if (!length(meta(doc, "id")))
meta(doc, "id") <- as.character(id)
if (!length(meta(doc, "language")))
meta(doc, "language") <- as.character(language)
doc
}
}
class(readXML) <- c("FunctionGenerator", "function")
RCV1Spec <-
list(author = list("unevaluated", ""),
datetimestamp = list("function", function(node)
as.POSIXlt(xml_text(xml_find_all(node, "@date")),
tz = "GMT")),
description = list("unevaluated", ""),
heading = list("node", "title"),
id = list("node", "@itemid"),
origin = list("unevaluated", "Reuters Corpus Volume 1"),
publisher = list("node",
"metadata/dc[@element='dc.publisher']/@value"),
topics = list("node",
"metadata/codes[@class='bip:topics:1.0']/code/@code"),
industries = list("node",
"metadata/codes[@class='bip:industries:1.0']/code/@code"),
countries = list("node",
"metadata/codes[@class='bip:countries:1.0']/code/@code"))
readRCV1 <-
readXML(spec = RCV1Spec,
doc = XMLTextDocument())
readRCV1asPlain <-
readXML(spec = c(RCV1Spec,
list(content = list("node", "text"))),
doc = PlainTextDocument())
Reut21578XMLSpec <-
list(author = list("node", "TEXT/AUTHOR"),
datetimestamp = list("function", function(node)
strptime(xml_text(xml_find_all(node, "DATE")),
format = "%d-%B-%Y %H:%M:%S",
tz = "GMT")),
description = list("unevaluated", ""),
heading = list("node", "TEXT/TITLE"),
id = list("node", "@NEWID"),
topics = list("node", "@TOPICS"),
lewissplit = list("node", "@LEWISSPLIT"),
cgisplit = list("node", "@CGISPLIT"),
oldid = list("node", "@OLDID"),
origin = list("unevaluated", "Reuters-21578 XML"),
topics_cat = list("node", "TOPICS/D"),
places = list("node", "PLACES/D"),
people = list("node", "PEOPLE/D"),
orgs = list("node", "ORGS/D"),
exchanges = list("node", "EXCHANGES/D"))
readReut21578XML <-
readXML(spec = Reut21578XMLSpec,
doc = XMLTextDocument())
readReut21578XMLasPlain <-
readXML(spec = c(Reut21578XMLSpec,
list(content = list("node", "TEXT/BODY"))),
doc = PlainTextDocument())
readTagged <-
function(...)
{
args <- list(...)
function(elem, language, id) {
if (!is.null(elem$content)) {
con <- textConnection(elem$content)
on.exit(close(con))
} else
con <- elem$uri
if (!is.null(elem$uri))
id <- basename(elem$uri)
a <- c(list(con = con, meta = list(id = id, language = language)), args)
do.call(TaggedTextDocument, a)
}
}
class(readTagged) <- c("FunctionGenerator", "function")
tm/R/foreign.R 0000644 0001751 0000144 00000003135 13023471774 012677 0 ustar hornik users ## Readers and writers (eventually?) for foreign document-term matrix
## format files.
## CLUTO: as we do not know the weighting, there is no high-level DTM
## reader. If the weighting is weightTf, one can do
## as.DocumentTermMatrix(read_stm_CLUTO(file), weightTf)
## as CLUTO always has rows as documents and cols as terms.
## MC: a simple reader for now, could certainly use more effort to name
## the weightings more properly.
read_dtm_MC <-
function(file, scalingtype = NULL)
{
m <- read_stm_MC(file, scalingtype)
s <- attr(m, "scalingtype")
as.DocumentTermMatrix(m, rep.int(s, 2L))
}
##
## To write a decent writer we would need to be able to turn weighting
## information into MC scaling information, which may not even be
## possible. Alternatively, we could always use 'txx', or use this in
## case we cannot map ...
##
## Data files for the Blei et al LDA and CTM codes are in a List of List
## format, with lines
## n j1: x1 j2: x2 ... jn: xn
## (see http://www.cs.princeton.edu/~blei/lda-c/).
## As they are used for topic models, they *always* contain raw term
## frequencies.
read_dtm_Blei_et_al <-
function(file, vocab = NULL)
{
x <- scan(file, character(), quiet = TRUE)
ind <- grepl(":", x, fixed = TRUE)
counts <- x[!ind]
i <- rep.int(seq_along(counts), counts)
x <- strsplit(x[ind], ":", fixed = TRUE)
j <- as.integer(unlist(lapply(x, `[`, 1L))) + 1L
x <- as.numeric(unlist(lapply(x, `[`, 2L)))
m <- simple_triplet_matrix(i, j, x)
if (!is.null(vocab))
colnames(m) <- readLines(vocab)
as.DocumentTermMatrix(m, weightTf)
}
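## Minimal usage sketch (the file names are hypothetical; 'ap.dat' is assumed
## to be in the List-of-List format described above, 'vocab.txt' one term per
## line):
## dtm <- read_dtm_Blei_et_al("ap.dat", vocab = "vocab.txt")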
tm/R/hpc.R 0000644 0001751 0000144 00000000754 14716601054 012020 0 ustar hornik users tm_parLapply_engine <-
local({
val <- NULL
## Could do some checking on new if given: should inherit from
## "cluster" or have formals (X, FUN, ...).
function(new) {
if (missing(new)) val else val <<- new
}
})
tm_parLapply <-
function(X, FUN, ...)
{
engine <- tm_parLapply_engine()
if (inherits(engine, "cluster"))
parLapply(engine, X, FUN, ...)
else if (is.function(engine))
engine(X, FUN, ...)
else
lapply(X, FUN, ...)
}
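## Minimal usage sketch (assumes the 'parallel' package; the cluster size is
## illustrative):
## cl <- parallel::makeCluster(2L)
## tm_parLapply_engine(cl)    # subsequent tm_parLapply() calls use parLapply()
## tm_parLapply_engine(NULL)  # restore the sequential lapply() default
## parallel::stopCluster(cl)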
tm/R/stopwords.R 0000644 0001751 0000144 00000001032 13034740255 013277 0 ustar hornik users stopwords <- {
function(kind = "en") {
kind <- as.character(kind)
resolved <- map_IETF_Snowball(kind)
base <- if (is.na(resolved))
kind
else if (identical(resolved, "porter"))
"english"
else
resolved
s <- system.file("stopwords", paste0(base, ".dat"),
package = "tm")
if (identical(s, ""))
stop(paste("no stopwords available for '", base, "'", sep = ""))
readLines(s, encoding = "UTF-8")
}
}
tm/R/tokenizer.R 0000644 0001751 0000144 00000002026 13307435131 013246 0 ustar hornik users getTokenizers <-
function()
c("Boost_tokenizer", "MC_tokenizer", "scan_tokenizer")
##
Boost_tokenizer <-
Token_Tokenizer(function(x)
{
y <- Boost_Tokenizer(as.character(x))
Encoding(y) <- "UTF-8"
y
})
##
MC_tokenizer <-
Token_Tokenizer(function(x)
{
x <- as.character(x)
if(!length(x))
return(character())
ASCII_letters <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
id <- sprintf("[%s]+", ASCII_letters)
http <- sprintf("(https?://%s(\\.%s)*)", id, id)
email <- sprintf("(%s@%s(\\.%s)*)", id, id, id)
http_or_email <- sprintf("%s|%s", http, email)
y <- c(unlist(regmatches(x, gregexpr(http_or_email, x)),
FALSE, FALSE),
unlist(strsplit(gsub(http_or_email, "", x),
sprintf("[^%s]", ASCII_letters)),
FALSE, FALSE))
y[nzchar(y)]
})
scan_tokenizer <-
Token_Tokenizer(function(x)
{
.Call(`_tm_scan`, as.character(x), 0L)
})
tm/vignettes/ 0000755 0001751 0000144 00000000000 14755301616 012730 5 ustar hornik users tm/vignettes/extensions.Rnw 0000644 0001751 0000144 00000027271 13177024075 015626 0 ustar hornik users \documentclass[a4paper]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[round]{natbib}
\usepackage{url}
\newcommand{\acronym}[1]{\textsc{#1}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}
\let\code\texttt
%% \VignetteIndexEntry{Extensions}
\begin{document}
<<>>=
library("tm")
library("xml2")
@
\title{Extensions\\How to Handle Custom File Formats}
\author{Ingo Feinerer}
\maketitle
\section*{Introduction}
The possibility to handle custom file formats is a substantial feature in any
modern text mining infrastructure. \pkg{tm} has been designed with this
aspect in mind from the beginning and has modular components which allow for
extensions. A general explanation of \pkg{tm}'s extension mechanism is
described by~\citet[Sec.~3.3]{Feinerer_etal_2008}, with an updated description
as follows.
\section*{Sources}
A source abstracts input locations and provides uniform methods for access.
Each source must provide implementations for the following interface functions:
\begin{description}
\item[close()] closes the source and returns it,
\item[eoi()] returns \code{TRUE} if the end of input of the source is reached,
\item[getElem()] fetches the element at the current position,
\item[length()] gives the number of elements,
\item[open()] opens the source and returns it,
\item[reader()] returns a default reader for processing elements,
\item[pGetElem()] (optional) retrieves all elements in parallel at once, and
\item[stepNext()] increases the position in the source to the next element.
\end{description}
Retrieved elements must be encapsulated in a list with the named components
\code{content} holding the document and \code{uri} pointing to the origin of
the document (e.g., a file path or a \acronym{URL}; \code{NULL} if not
applicable or unavailable).
Custom sources are required to inherit from the virtual base class
\code{Source} and typically do so by extending the functionality provided by the
simple reference implementation \code{SimpleSource}.
E.g., a simple source which accepts an \proglang{R} vector as input could be
defined as
<<>>=
VecSource <- function(x)
SimpleSource(length = length(x), content = as.character(x),
class = "VecSource")
@
which overrides a few defaults (see \code{?SimpleSource} for defaults) and
stores the vector in the \code{content} component. The functions
\code{close()}, \code{eoi()}, \code{open()}, and \code{stepNext()} have
reasonable default methods already for the \code{SimpleSource} class: the
identity function for \code{open()} and \code{close()}, incrementing a position
counter for \code{stepNext()}, and comparing the current position with the
number of available elements as claimed by \code{length()} for \code{eoi()},
respectively. So we only need custom methods for element access:
<<>>=
getElem.VecSource <-
function(x) list(content = x$content[x$position], uri = NULL)
pGetElem.VecSource <-
function(x) lapply(x$content, function(y) list(content = y, uri = NULL))
@
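With these methods in place the custom source can be used wherever \pkg{tm}
expects a source. The following is a minimal sketch (not evaluated here) with
made-up input texts:
<<eval=FALSE>>=
vs <- VecSource(c("This is a text.", "And another one."))
VCorpus(vs)
@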
\section*{Readers}
Readers are functions for extracting textual content and metadata out of
elements delivered by a source and for constructing a text document. Each reader
must accept following arguments in its signature:
\begin{description}
\item[elem] a list with the named components \code{content} and \code{uri} (as
delivered by a source via \code{getElem()} or \code{pGetElem()}),
\item[language] a string giving the language, and
\item[id] a character giving a unique identifier for the created text document.
\end{description}
The element \code{elem} is typically provided by a source whereas the language
and the identifier are normally provided by a corpus constructor (for the case
that \code{elem\$content} does not give information on these two essential
items).
In case a reader expects configuration arguments we can use a function
generator. A function generator is indicated by inheriting from class
\code{FunctionGenerator} and \code{function}. It allows us to process
additional arguments, store them in an environment, return a reader function
with the well-defined signature described above, and still be able to access
the additional arguments via lexical scoping. All corpus constructors in
package \pkg{tm} check the reader function for being a function generator and
if so apply it to yield the reader with the expected signature.
E.g., the reader function \code{readPlain()} is defined as
<<>>=
readPlain <- function(elem, language, id)
PlainTextDocument(elem$content, id = id, language = language)
@
For examples on readers using the function generator please have a look at
\code{?readDOC} or \code{?readPDF}.
However, for many cases, it is not necessary to define each detailed aspect of
how to extend \pkg{tm}. Typical examples are \acronym{XML} files which are very
common but can be rather easily handled via standard conforming \acronym{XML}
parsers. The aim of the remainder in this document is to give an overview on
how simpler, more user-friendly, forms of extension mechanisms can be applied
in \pkg{tm}.
\section*{Custom Data Formats}
A general situation is that you have gathered together some information into a
tabular data structure (like a data frame or a list matrix) that suffices to
describe documents in a corpus. However, you do not have a distinct file format
because you extracted the information out of various resources, e.g., as
delivered by \code{readtext()} in package \pkg{readtext}. Now you want to use
your information to build a corpus which is recognized by \pkg{tm}.
We assume that your information is put together in a data frame. E.g.,
consider the following example:
<<>>=
df <- data.frame(doc_id = c("doc 1" , "doc 2" , "doc 3" ),
text = c("content 1", "content 2", "content 3"),
title = c("title 1" , "title 2" , "title 3" ),
authors = c("author 1" , "author 2" , "author 3" ),
topics = c("topic 1" , "topic 2" , "topic 3" ),
stringsAsFactors = FALSE)
@
We want to map the data frame rows to the relevant entries of a text document.
An entry \code{text} in the mapping will be matched to fill the actual content
of the text document, \code{doc\_id} will be used as document ID, all other
fields will be used as metadata tags. So we can construct a corpus out of the
data frame:
<<>>=
(corpus <- Corpus(DataframeSource(df)))
corpus[[1]]
meta(corpus[[1]])
@
\section*{Custom XML Sources}
Many modern file formats already come in \acronym{XML} format, which
allows information to be extracted with any \acronym{XML}-conforming
parser, e.g., as implemented in \proglang{R} by the \pkg{xml2}
package.
Now assume we have some custom \acronym{XML} format which we want to
access with \pkg{tm}. Then a viable way is to create a custom
\acronym{XML} source which can be configured with only a few
commands. E.g., have a look at the following example:
<<>>=
custom.xml <- system.file("texts", "custom.xml", package = "tm")
print(readLines(custom.xml), quote = FALSE)
@
As you see there is a top-level tag stating that there is a corpus,
and several document tags below. In fact, this structure is very
common in \acronym{XML} files found in text mining applications (e.g.,
both the Reuters-21578 and the Reuters Corpus Volume 1 data sets follow
this general scheme). In \pkg{tm} we expect a source to deliver
self-contained blocks of information to a reader function, each block
containing all information necessary such that the reader can
construct a (subclass of a) \code{TextDocument} from it.
The \code{XMLSource()} function can now be used to construct a custom
\acronym{XML} source. It has three arguments:
\begin{description}
\item[x] a character giving a uniform resource identifier,
\item[parser] a function accepting an \acronym{XML} document (as delivered by
\code{read\_xml()} in package \pkg{xml2}) as input and returning
\acronym{XML} elements/nodes (each element/node will then be
delivered to the reader as a self-contained block),
\item[reader] a reader function capable of turning \acronym{XML}
elements/nodes as returned by the parser into a subclass of
\code{TextDocument}.
\end{description}
E.g., a custom source which can cope with our custom \acronym{XML}
format could be:
<<>>=
mySource <- function(x)
XMLSource(x, parser = xml2::xml_children, reader = myXMLReader)
@
As you notice in this example we also provide a custom reader function
(\code{myXMLReader}). See the next section for details.
\section*{Custom XML Readers}
As we saw in the previous section we often need a custom reader
function to extract information out of \acronym{XML} chunks (typically
as delivered by some source). Fortunately, \pkg{tm} provides an easy
way to define custom \acronym{XML} reader functions. All you need to
do is to provide a so-called \emph{specification}.
Let us start with an example which defines a reader function for the
file format from the previous section:
<<>>=
myXMLReader <- readXML(
spec = list(author = list("node", "writer"),
content = list("node", "description"),
datetimestamp = list("function",
function(x) as.POSIXlt(Sys.time(), tz = "GMT")),
description = list("node", "@short"),
heading = list("node", "caption"),
id = list("function", function(x) tempfile()),
origin = list("unevaluated", "My private bibliography"),
type = list("node", "type")),
doc = PlainTextDocument())
@
Formally, \code{readXML()} is the relevant function which constructs a reader.
The customization is done via the first argument \code{spec}, the second
provides an empty instance of the document which should be returned (augmented
with the extracted information out of the \acronym{XML} chunks). The
specification must consist of a named list of lists each containing two
character vectors. The constructed reader will map each list entry to the
content or a metadatum of the text document as specified by the named list
entry. Valid names include \code{content} to access the document's content, and
character strings which are mapped to metadata entries.
Each list entry must consist of two character vectors: the first
describes the type of the second argument, and the second is the
specification entry. Valid combinations are:
\begin{description}
\item[\code{type = "node", spec = "XPathExpression"}] the XPath (1.0) expression
\code{spec} extracts information out of an \acronym{XML} node (as seen for
\code{author}, \code{content}, \code{description}, \code{heading}, and
\code{type} in our example specification).
\item[\code{type = "function", spec = function(doc) \ldots}] The function
\code{spec} is called, passing over the \acronym{XML} document (as
delivered by \code{read\_xml()} from package \pkg{xml2})
as first argument (as seen for \code{datetimestamp} and \code{id}).
As this example shows, nothing forces us to actually use the document passed
in; instead we can do anything we want (e.g., create a unique character
vector via \code{tempfile()} to have a unique identification string).
\item[\code{type = "unevaluated", spec = "String"}] the character vector
\code{spec} is returned without modification (e.g., \code{origin} in
our specification).
\end{description}
Now that we have all we need to cope with our custom file format, we
can apply the source and reader function at any place in \pkg{tm}
where a source or reader is expected, respectively. E.g.,
<<>>=
corpus <- VCorpus(mySource(custom.xml))
@
constructs a corpus out of the information in our \acronym{XML}
file:
<<>>=
corpus[[1]]
meta(corpus[[1]])
@
\bibliographystyle{abbrvnat}
\bibliography{references}
\end{document}
tm/vignettes/tm.Rnw 0000644 0001751 0000144 00000033544 14656640247 014057 0 ustar hornik users \documentclass[a4paper]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[round]{natbib}
\usepackage{url}
\newcommand{\acronym}[1]{\textsc{#1}}
\newcommand{\class}[1]{\mbox{\textsf{#1}}}
\newcommand{\code}[1]{\mbox{\texttt{#1}}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}
%% \VignetteIndexEntry{Introduction to the tm Package}
%% \VignetteDepends{SnowballC}
\begin{document}
<<>>=
library("tm")
data("crude")
@
\title{Introduction to the \pkg{tm} Package\\Text Mining in \proglang{R}}
\author{Ingo Feinerer}
\maketitle
\section*{Introduction}
This vignette gives a short introduction to text mining in
\proglang{R} utilizing the text mining framework provided by the
\pkg{tm} package. We present methods for data import, corpus handling,
preprocessing, metadata management, and creation of term-document
matrices. Our focus is on the main aspects of getting started with
text mining in \proglang{R}---an in-depth description of the text
mining infrastructure offered by \pkg{tm} was published in the
\emph{Journal of Statistical Software}~\citep{Feinerer_etal_2008}. An
introductory article on text mining in \proglang{R} was published in
\emph{R News}~\citep{Rnews:Feinerer:2008}.
\section*{Data Import}
The main structure for managing documents in \pkg{tm} is a so-called
\class{Corpus}, representing a collection of text documents. A corpus
is an abstract concept, and there can exist several implementations in
parallel. The default implementation is the so-called \class{VCorpus}
(short for \emph{Volatile Corpus}) which realizes a semantics as known
from most \proglang{R} objects: corpora are \proglang{R} objects held
fully in memory. We denote this as volatile since once the
\proglang{R} object is destroyed, the whole corpus is gone. Such a
volatile corpus can be created via the constructor \code{VCorpus(x,
readerControl)}. Another implementation is the \class{PCorpus} which
implements a \emph{Permanent Corpus} semantics, i.e., the documents
are physically stored outside of \proglang{R} (e.g., in a database),
corresponding \proglang{R} objects are basically only pointers to
external structures, and changes to the underlying corpus are
reflected to all \proglang{R} objects associated with it. Compared to
the volatile corpus the corpus encapsulated by a permanent corpus
object is not destroyed if the corresponding \proglang{R} object is
released.
Within the corpus constructor, \code{x} must be a \class{Source}
object which abstracts the input location. \pkg{tm} provides a set of
predefined sources, e.g., \class{DirSource}, \class{VectorSource}, or
\class{DataframeSource}, which handle a directory, a vector
interpreting each component as document, or data frame like structures
(like \acronym{CSV} files), respectively. Except \class{DirSource},
which is designed solely for directories on a file system, and
\class{VectorSource}, which only accepts (character) vectors, most
other implemented sources can take connections as input (a character
string is interpreted as file path). \code{getSources()} lists
available sources, and users can create their own sources.
The second argument \code{readerControl} of the corpus constructor has to be a
list with the named components \code{reader} and \code{language}. The first
component \code{reader} constructs a text document from elements delivered by a
source. The \pkg{tm} package ships with several readers (e.g.,
\code{readPlain()}, \code{readPDF()}, \code{readDOC()}, \ldots). See
\code{getReaders()} for an up-to-date list of available readers. Each source
has a default reader which can be overridden. E.g., for \code{DirSource} the
default just reads in the input files and interprets their content as text.
Finally, the second component \code{language} sets the texts' language
(preferably using \acronym{ISO} 639-2 codes).
In case of a permanent corpus, a third argument \code{dbControl} has
to be a list with the named components \code{dbName} giving the
filename holding the sourced out objects (i.e., the database), and
\code{dbType} holding a valid database type as supported by package
\pkg{filehash}. Activated database support reduces the memory demand,
however, access gets slower since each operation is limited by the
hard disk's read and write capabilities.
So e.g., plain text files in the directory \code{txt} containing Latin
(\code{lat}) texts by the Roman poet \emph{Ovid} can be read in with
the following code:
<<>>=
txt <- system.file("texts", "txt", package = "tm")
(ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
readerControl = list(language = "lat")))
@
For simple examples \code{VectorSource} is quite useful, as it can
create a corpus from character vectors, e.g.:
<<>>=
docs <- c("This is a text.", "This another one.")
VCorpus(VectorSource(docs))
@
Finally we create a corpus for some Reuters documents as example for
later use:
<<>>=
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578, mode = "binary"),
readerControl = list(reader = readReut21578XMLasPlain))
@
\section*{Data Export}
If you have created a corpus by manipulating other objects in
\proglang{R}, and thus do not yet have the texts stored on a hard disk,
you can save the text documents to disk simply by using
\code{writeCorpus()}
<<>>=
writeCorpus(ovid)
@
which writes a character representation of the documents in a corpus to
multiple files on disk.
\section*{Inspecting Corpora}
Custom \code{print()} methods are available which hide the raw amount of
information (consider a corpus could consist of several thousand documents,
like a database). \code{print()} gives a concise overview whereas more details
are displayed with \code{inspect()}.
<<>>=
inspect(ovid[1:2])
@
Individual documents can be accessed via \code{[[}, either via the
position in the corpus, or via their identifier.
<<>>=
meta(ovid[[2]], "id")
identical(ovid[[2]], ovid[["ovid_2.txt"]])
@
A character representation of a document is available via
\code{as.character()} which is also used when inspecting a document:
<<>>=
inspect(ovid[[2]])
lapply(ovid[1:2], as.character)
@
\section*{Transformations}
Once we have a corpus we typically want to modify the documents in it,
e.g., stemming, stopword removal, et cetera. In \pkg{tm}, all this
functionality is subsumed into the concept of a
\emph{transformation}. Transformations are done via the \code{tm\_map()}
function which applies (maps) a function to all elements of the
corpus. Basically, all transformations work on single text documents
and \code{tm\_map()} just applies them to all documents in a corpus.
\subsection*{Eliminating Extra Whitespace}
Extra whitespace is eliminated by:
<<>>=
reuters <- tm_map(reuters, stripWhitespace)
@
\subsection*{Convert to Lower Case}
Conversion to lower case by:
<<>>=
reuters <- tm_map(reuters, content_transformer(tolower))
@
We can use arbitrary character processing functions as transformations as long
as the function returns a text document. In this case we use
\code{content\_transformer()} which provides a convenience wrapper to access and
set the content of a document. Consequently most text manipulation functions
from base \proglang{R} can directly be used with this wrapper. This works for
\code{tolower()} as used here but also e.g.\ for \code{gsub()} which comes quite
handy for a broad range of text manipulation tasks.
\subsection*{Remove Stopwords}
Removal of stopwords by:
<<>>=
reuters <- tm_map(reuters, removeWords, stopwords("english"))
@
\subsection*{Stemming}
Stemming is done by:
<<>>=
tm_map(reuters, stemDocument)
@
\section*{Filters}
Often it is of special interest to filter out documents satisfying
given properties. For this purpose the function \code{tm\_filter} is
designed. It is possible to write custom filter functions which get applied to
each document in the corpus. Alternatively, we can create indices based on
selections and subset the corpus with them. E.g., the following
statement filters out those documents having an \code{ID} equal to \code{"237"}
and the string \code{"INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE"} as
their heading.
<<>>=
idx <- meta(reuters, "id") == '237' &
meta(reuters, "heading") == 'INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE'
reuters[idx]
@
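A custom filter function achieves the same kind of selection via
\code{tm\_filter()}. As a minimal sketch, the following keeps only those
documents whose content mentions the term \code{crude}:
<<>>=
tm_filter(reuters, FUN = function(x) any(grepl("crude", content(x))))
@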
\section*{Metadata Management}
Metadata is used to annotate text documents or whole corpora with
additional information. The easiest way to accomplish this with
\pkg{tm} is to use the \code{meta()} function. A text document has a
few predefined attributes like \code{author} but can be extended with
an arbitrary number of additional user-defined metadata tags. These
additional metadata tags are individually attached to a single text
document. From a corpus perspective these metadata attachments are
locally stored together with each individual text
document. Alternatively to \code{meta()} the function
\code{DublinCore()} provides a full mapping between Simple Dublin Core
metadata and \pkg{tm} metadata structures and can be similarly used
to get and set metadata information for text documents, e.g.:
<<>>=
DublinCore(crude[[1]], "Creator") <- "Ano Nymous"
meta(crude[[1]])
@
For corpora the story is a bit more sophisticated. Corpora in \pkg{tm}
have two types of metadata: one is the metadata on the corpus level
(\code{corpus}), the other is the metadata related to the individual
documents (\code{indexed}) in the form of a data frame. The latter is
often done for performance reasons (hence the name \code{indexed}, for
indexing) or because the metadata forms an entity of its own but still
relates directly to individual text documents, e.g., a classification
result; the classifications directly relate to the documents, but the set
of classification levels forms an entity of its own. Both cases can be handled
with \code{meta()}:
<<>>=
meta(crude, tag = "test", type = "corpus") <- "test meta"
meta(crude, type = "corpus")
meta(crude, "foo") <- letters[1:20]
meta(crude)
@
\section*{Standard Operators and Functions}
Many standard operators and functions (\code{[}, \code{[<-},
\code{[[}, \code{[[<-}, \code{c()}, \code{lapply()}) are available for
corpora with semantics similar to standard \proglang{R}
routines. E.g., \code{c()} concatenates two (or more) corpora. Applied
to several text documents it returns a corpus. The metadata is
automatically updated, if corpora are concatenated (i.e., merged).
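For instance, concatenating two subsets of the Ovid corpus from above yields
a single corpus again (a small illustration):
<<>>=
c(ovid[1:2], ovid[3:4])
@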
\section*{Creating Term-Document Matrices}
A common approach in text mining is to create a term-document matrix
from a corpus. In the \pkg{tm} package the classes
\class{TermDocumentMatrix} and \class{DocumentTermMatrix} (depending
on whether you want terms as rows and documents as columns, or vice
versa) employ sparse matrices for corpora. Inspecting a term-document matrix
displays a sample, whereas \code{as.matrix()} yields the full matrix in
dense format (which can be very memory consuming for large matrices).
<<>>=
dtm <- DocumentTermMatrix(reuters)
inspect(dtm)
@
\section*{Operations on Term-Document Matrices}
Besides the fact that a huge number of \proglang{R} functions (like
clustering, classification, etc.) can be applied to this matrix, this
package brings some shortcuts. Imagine we want to find those
terms that occur at least five times, then we can use the
\code{findFreqTerms()} function:
<<>>=
findFreqTerms(dtm, 5)
@
Or we want to find associations (i.e., terms which correlate) with at
least $0.8$ correlation for the term \code{opec}, then we use
\code{findAssocs()}:
<<>>=
findAssocs(dtm, "opec", 0.8)
@
Term-document matrices tend to get very big even for normal-sized
data sets. Therefore we provide a method to remove \emph{sparse} terms,
i.e., terms occurring only in very few documents. Normally, this
reduces the matrix dramatically without losing significant relations
inherent to the matrix:
<<>>=
inspect(removeSparseTerms(dtm, 0.4))
@
This function call removes those terms for which at least 40 percent of
the entries are sparse (i.e., the term occurs 0 times in the respective
document).
\section*{Dictionary}
A dictionary is a (multi-)set of strings. It is often used to denote relevant
terms in text mining. We represent a dictionary with a character vector which
may be passed to the \code{DocumentTermMatrix()} constructor as a control
argument. Then the created matrix is tabulated against the dictionary, i.e.,
only terms from the dictionary appear in the matrix. This allows one to restrict
the dimension of the matrix a priori and to focus on specific terms for
distinct text mining contexts, e.g.,
<<>>=
inspect(DocumentTermMatrix(reuters,
list(dictionary = c("prices", "crude", "oil"))))
@
\section*{Performance}
Often you do not need all the generality, modularity and full range of features
offered by \pkg{tm} as this sometimes comes at the price of performance.
\class{SimpleCorpus} provides a corpus which is optimized for the most common
usage scenario: importing plain texts from files in a directory or directly
from a vector in \proglang{R}, preprocessing and transforming the texts, and
finally exporting them to a term-document matrix. The aim is to boost
performance and minimize memory pressure. It loads all documents into memory,
and is designed for medium-sized to large data sets.
However, it operates only under the following constraints:
\begin{itemize}
\item only \code{DirSource} and \code{VectorSource} are supported,
\item no custom readers, i.e., each document is read in and stored as plain
text (as a string, i.e., a character vector of length one),
\item transformations applied via \code{tm\_map} must be able to
process strings and return strings,
\item no lazy transformations in \code{tm\_map},
\item no meta data for individual documents (i.e., no \code{"local"} in
\code{meta()}).
\end{itemize}
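Within these constraints usage parallels the other corpus classes, e.g.\ (a
minimal sketch reusing the Ovid texts from above):
<<>>=
(osimple <- SimpleCorpus(DirSource(txt, encoding = "UTF-8"),
control = list(language = "lat")))
@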
\bibliographystyle{abbrvnat}
\bibliography{references}
\end{document}
tm/vignettes/references.bib 0000644 0001751 0000144 00000001313 11704521032 015510 0 ustar hornik users @Article{Feinerer_etal_2008,
author = {Ingo Feinerer and Kurt Hornik and David Meyer},
title = {Text Mining Infrastructure in {R}},
journal = {Journal of Statistical Software},
volume = 25,
number = 5,
pages = {1--54},
month = mar,
year = 2008,
issn = {1548-7660},
coden = {JSSOBK},
url = {http://www.jstatsoft.org/v25/i05}
}
@Article{Rnews:Feinerer:2008,
author = {Ingo Feinerer},
title = {An Introduction to Text Mining in {R}},
journal = {R News},
year = 2008,
volume = 8,
number = 2,
pages = {19--22},
month = oct,
url = {http://CRAN.R-project.org/doc/Rnews/},
pdf =
{http://CRAN.R-project.org/doc/Rnews/Rnews_2008-2.pdf}
}
tm/data/ 0000755 0001751 0000144 00000000000 12315572766 011637 5 ustar hornik users tm/data/crude.rda 0000644 0001751 0000144 00000027012 14755301617 013426 0 ustar hornik users [binary R data file (crude.rda) omitted]