snowball_code/0000755000175000017500000000000012707117052012013 5ustar domdomsnowball_code/GNUmakefile0000644000175000017500000002724612707117052014100 0ustar domdom# -*- makefile -*-

# ---------------------------------------------------------------------------
# Directories for generated sources.
# ---------------------------------------------------------------------------
c_src_dir = src_c
java_src_main_dir = java/org/tartarus/snowball
java_src_dir = $(java_src_main_dir)/ext

# ---------------------------------------------------------------------------
# Algorithm lists.  libstemmer_algorithms are built as UTF-8 stemmers and go
# into libstemmer; the per-charset lists select which of them are also
# generated for that encoding.  other_algorithms are only built as UTF-8 C
# sources/objects by "all".
# ---------------------------------------------------------------------------
libstemmer_algorithms = danish dutch english finnish french german hungarian \
                        italian \
                        norwegian porter portuguese romanian \
                        russian spanish swedish turkish

KOI8_R_algorithms = russian
ISO_8859_1_algorithms = danish dutch english finnish french german hungarian \
                        italian \
                        norwegian porter portuguese spanish swedish
ISO_8859_2_algorithms = romanian

other_algorithms = german2 kraaij_pohlmann lovins

all_algorithms = $(libstemmer_algorithms) $(other_algorithms)

# ---------------------------------------------------------------------------
# Hand-written sources.
# ---------------------------------------------------------------------------
COMPILER_SOURCES = compiler/space.c \
                   compiler/tokeniser.c \
                   compiler/analyser.c \
                   compiler/generator.c \
                   compiler/driver.c \
                   compiler/generator_java.c
COMPILER_HEADERS = compiler/header.h \
                   compiler/syswords.h \
                   compiler/syswords2.h

RUNTIME_SOURCES = runtime/api.c \
                  runtime/utilities.c
RUNTIME_HEADERS = runtime/api.h \
                  runtime/header.h

JAVARUNTIME_SOURCES = java/org/tartarus/snowball/Among.java \
                      java/org/tartarus/snowball/SnowballProgram.java \
                      java/org/tartarus/snowball/SnowballStemmer.java \
                      java/org/tartarus/snowball/TestApp.java

LIBSTEMMER_SOURCES = libstemmer/libstemmer.c
LIBSTEMMER_UTF8_SOURCES = libstemmer/libstemmer_utf8.c
LIBSTEMMER_HEADERS = include/libstemmer.h libstemmer/modules.h libstemmer/modules_utf8.h
LIBSTEMMER_EXTRA = libstemmer/modules.txt libstemmer/modules_utf8.txt libstemmer/libstemmer_c.in

STEMWORDS_SOURCES = examples/stemwords.c

ALL_ALGORITHM_FILES = $(all_algorithms:%=algorithms/%/stem*.sbl)

# ---------------------------------------------------------------------------
# Generated sources (produced by the snowball compiler) and derived objects.
# ---------------------------------------------------------------------------
C_LIB_SOURCES = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c) \
                $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.c) \
                $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.c) \
                $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.c)
C_LIB_HEADERS = $(libstemmer_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h) \
                $(KOI8_R_algorithms:%=$(c_src_dir)/stem_KOI8_R_%.h) \
                $(ISO_8859_1_algorithms:%=$(c_src_dir)/stem_ISO_8859_1_%.h) \
                $(ISO_8859_2_algorithms:%=$(c_src_dir)/stem_ISO_8859_2_%.h)

C_OTHER_SOURCES = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.c)
C_OTHER_HEADERS = $(other_algorithms:%=$(c_src_dir)/stem_UTF_8_%.h)

JAVA_SOURCES = $(libstemmer_algorithms:%=$(java_src_dir)/%Stemmer.java)

COMPILER_OBJECTS=$(COMPILER_SOURCES:.c=.o)
RUNTIME_OBJECTS=$(RUNTIME_SOURCES:.c=.o)
LIBSTEMMER_OBJECTS=$(LIBSTEMMER_SOURCES:.c=.o)
LIBSTEMMER_UTF8_OBJECTS=$(LIBSTEMMER_UTF8_SOURCES:.c=.o)
STEMWORDS_OBJECTS=$(STEMWORDS_SOURCES:.c=.o)
C_LIB_OBJECTS = $(C_LIB_SOURCES:.c=.o)
C_OTHER_OBJECTS = $(C_OTHER_SOURCES:.c=.o)
JAVA_CLASSES = $(JAVA_SOURCES:.java=.class)
JAVA_RUNTIME_CLASSES=$(JAVARUNTIME_SOURCES:.java=.class)

CFLAGS=-Iinclude -O2
CPPFLAGS=-W -Wall -Wmissing-prototypes -Wmissing-declarations

# These targets name actions, not files; declare them so that a stray file
# called e.g. "clean" or "dist" cannot make them appear up to date.
.PHONY: all clean splint check \
        dist dist_snowball dist_libstemmer_c dist_libstemmer_java

# Several recipes write their target through a shell redirection (">$@");
# remove the half-written file if such a recipe fails.
.DELETE_ON_ERROR:

all: snowball libstemmer.o stemwords $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS)

# Remove everything this makefile generates.  The final rmdir only succeeds
# once $(c_src_dir) is empty, hence the "|| true".
clean:
	rm -f $(COMPILER_OBJECTS) $(RUNTIME_OBJECTS) \
	      $(LIBSTEMMER_OBJECTS) $(LIBSTEMMER_UTF8_OBJECTS) $(STEMWORDS_OBJECTS) snowball \
	      libstemmer.o stemwords \
	      libstemmer/modules.h \
	      libstemmer/modules_utf8.h \
	      snowball.splint \
	      $(C_LIB_SOURCES) $(C_LIB_HEADERS) $(C_LIB_OBJECTS) \
	      $(C_OTHER_SOURCES) $(C_OTHER_HEADERS) $(C_OTHER_OBJECTS) \
	      $(JAVA_SOURCES) $(JAVA_CLASSES) $(JAVA_RUNTIME_CLASSES) \
	      libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak \
	      libstemmer/libstemmer.c libstemmer/libstemmer_utf8.c
	rm -rf dist
	rmdir $(c_src_dir) || true

# The snowball compiler itself.
snowball: $(COMPILER_OBJECTS)
	$(CC) -o $@ $^

$(COMPILER_OBJECTS): $(COMPILER_HEADERS)

# libstemmer.c and libstemmer_utf8.c are the same template with a different
# module table header substituted for @MODULES_H@.
libstemmer/libstemmer.c: libstemmer/libstemmer_c.in
	sed 's/@MODULES_H@/modules.h/' $^ >$@

libstemmer/libstemmer_utf8.c: libstemmer/libstemmer_c.in
	sed 's/@MODULES_H@/modules_utf8.h/' $^ >$@

# One run of mkmodules.pl writes both the module table header and the
# makefile fragment.  Judging by the argument order of this invocation, the
# first argument is the header and the fourth is the fragment.  The header is
# named explicitly rather than via $@: this rule has two targets, so $@ would
# expand to the .mak file whenever that is the target being remade, and the
# header path would then collide with the fragment path.
libstemmer/modules.h libstemmer/mkinc.mak: libstemmer/mkmodules.pl libstemmer/modules.txt
	libstemmer/mkmodules.pl libstemmer/modules.h $(c_src_dir) libstemmer/modules.txt libstemmer/mkinc.mak

libstemmer/modules_utf8.h libstemmer/mkinc_utf8.mak: libstemmer/mkmodules.pl libstemmer/modules_utf8.txt
	libstemmer/mkmodules.pl libstemmer/modules_utf8.h $(c_src_dir) libstemmer/modules_utf8.txt libstemmer/mkinc_utf8.mak utf8

libstemmer/libstemmer.o: libstemmer/modules.h $(C_LIB_HEADERS)

# Despite the .o suffix this is an archive built with $(AR).
libstemmer.o: libstemmer/libstemmer.o $(RUNTIME_OBJECTS) $(C_LIB_OBJECTS)
	$(AR) -cru $@ $^

stemwords: $(STEMWORDS_OBJECTS) libstemmer.o
	$(CC) -o $@ $^

# Algorithms without a dedicated Unicode source reuse the ISO-8859-1 one.
algorithms/%/stem_Unicode.sbl: algorithms/%/stem_ISO_8859_1.sbl
	cp $^ $@

# Generate C sources with the snowball compiler.  In each recipe $${l} is the
# algorithm name (the directory holding the .sbl file) and $${o} the output
# path without extension; the command is echoed by hand because the recipe
# itself is silenced with @.
$(c_src_dir)/stem_UTF_8_%.c $(c_src_dir)/stem_UTF_8_%.h: algorithms/%/stem_Unicode.sbl snowball
	@mkdir -p $(c_src_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
	o="$(c_src_dir)/stem_UTF_8_$${l}"; \
	echo "./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u"; \
	./snowball $< -o $${o} -eprefix $${l}_UTF_8_ -r ../runtime -u

$(c_src_dir)/stem_KOI8_R_%.c $(c_src_dir)/stem_KOI8_R_%.h: algorithms/%/stem_KOI8_R.sbl snowball
	@mkdir -p $(c_src_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_KOI8_R.sbl$$!\1!;s!^.*/!!'`; \
	o="$(c_src_dir)/stem_KOI8_R_$${l}"; \
	echo "./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime"; \
	./snowball $< -o $${o} -eprefix $${l}_KOI8_R_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_1_%.c $(c_src_dir)/stem_ISO_8859_1_%.h: algorithms/%/stem_ISO_8859_1.sbl snowball
	@mkdir -p $(c_src_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_1.sbl$$!\1!;s!^.*/!!'`; \
	o="$(c_src_dir)/stem_ISO_8859_1_$${l}"; \
	echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime"; \
	./snowball $< -o $${o} -eprefix $${l}_ISO_8859_1_ -r ../runtime

$(c_src_dir)/stem_ISO_8859_2_%.c $(c_src_dir)/stem_ISO_8859_2_%.h: algorithms/%/stem_ISO_8859_2.sbl snowball
	@mkdir -p $(c_src_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_ISO_8859_2.sbl$$!\1!;s!^.*/!!'`; \
	o="$(c_src_dir)/stem_ISO_8859_2_$${l}"; \
	echo "./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime"; \
	./snowball $< -o $${o} -eprefix $${l}_ISO_8859_2_ -r ../runtime

$(c_src_dir)/stem_%.o: $(c_src_dir)/stem_%.c $(c_src_dir)/stem_%.h
	$(CC) $(CFLAGS) -O2 -c -o $@ $< -Wall

# Generate the Java stemmer classes from the Unicode sources.
$(java_src_dir)/%Stemmer.java: algorithms/%/stem_Unicode.sbl snowball
	@mkdir -p $(java_src_dir)
	@l=`echo "$<" | sed 's!\(.*\)/stem_Unicode.sbl$$!\1!;s!^.*/!!'`; \
	o="$(java_src_dir)/$${l}Stemmer"; \
	echo "./snowball $< -j -o $${o} -p \"org.tartarus.snowball.SnowballStemmer\" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer"; \
	./snowball $< -j -o $${o} -p "org.tartarus.snowball.SnowballStemmer" -eprefix $${l}_ -r ../runtime -n $${l}Stemmer

# Static analysis of the compiler sources.
splint: snowball.splint
snowball.splint: $(COMPILER_SOURCES)
	splint $^ >$@ -weak

# Make a full source distribution
dist: dist_snowball dist_libstemmer_c dist_libstemmer_java

# Make a distribution of all the sources involved in snowball
dist_snowball: $(COMPILER_SOURCES) $(COMPILER_HEADERS) \
               $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
               $(LIBSTEMMER_SOURCES) \
               $(LIBSTEMMER_UTF8_SOURCES) \
               $(LIBSTEMMER_HEADERS) \
               $(LIBSTEMMER_EXTRA) \
               $(ALL_ALGORITHM_FILES) $(STEMWORDS_SOURCES) \
               GNUmakefile README doc/TODO libstemmer/mkmodules.pl
	destname=snowball_code; \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}.tgz && \
	for file in $^; do \
	  dir=`dirname $$file` && \
	  mkdir -p $${dest}/$${dir} && \
	  cp -a $${file} $${dest}/$${dir} || exit 1 ; \
	done && \
	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the C library.
# The tarball is staged under dist/libstemmer_c, given a MANIFEST and a small
# generated Makefile, archived as dist/libstemmer_c.tgz, and the staging
# directory is then removed.
#
# Recipe lines of the generated Makefile are written with printf '\t...' so
# that they start with a real tab whatever happens to the whitespace of this
# file; make rejects recipe lines that start with a space.
dist_libstemmer_c: \
            $(RUNTIME_SOURCES) \
            $(RUNTIME_HEADERS) \
            $(LIBSTEMMER_SOURCES) \
            $(LIBSTEMMER_UTF8_SOURCES) \
            $(LIBSTEMMER_HEADERS) \
            $(LIBSTEMMER_EXTRA) \
            $(C_LIB_SOURCES) \
            $(C_LIB_HEADERS) \
            libstemmer/mkinc.mak \
            libstemmer/mkinc_utf8.mak
	destname=libstemmer_c; \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}.tgz && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_c_README $${dest}/README && \
	mkdir -p $${dest}/examples && \
	cp -a examples/stemwords.c $${dest}/examples && \
	mkdir -p $${dest}/$(c_src_dir) && \
	cp -a $(C_LIB_SOURCES) $(C_LIB_HEADERS) $${dest}/$(c_src_dir) && \
	mkdir -p $${dest}/runtime && \
	cp -a $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) $${dest}/runtime && \
	mkdir -p $${dest}/libstemmer && \
	cp -a $(LIBSTEMMER_SOURCES) $(LIBSTEMMER_UTF8_SOURCES) $(LIBSTEMMER_HEADERS) $(LIBSTEMMER_EXTRA) $${dest}/libstemmer && \
	mkdir -p $${dest}/include && \
	mv $${dest}/libstemmer/libstemmer.h $${dest}/include && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(c_src_dir)/*.c $(c_src_dir)/*.h >> MANIFEST && \
	 ls runtime/*.c runtime/*.h >> MANIFEST && \
	 ls libstemmer/*.c libstemmer/*.h >> MANIFEST && \
	 ls include/*.h >> MANIFEST) && \
	cp -a libstemmer/mkinc.mak libstemmer/mkinc_utf8.mak $${dest}/ && \
	echo 'include mkinc.mak' >> $${dest}/Makefile && \
	echo 'CFLAGS=-Iinclude' >> $${dest}/Makefile && \
	echo 'all: libstemmer.o stemwords' >> $${dest}/Makefile && \
	echo 'libstemmer.o: $$(snowball_sources:.c=.o)' >> $${dest}/Makefile && \
	printf '\t$$(AR) -cru $$@ $$^\n' >> $${dest}/Makefile && \
	echo 'stemwords: examples/stemwords.o libstemmer.o' >> $${dest}/Makefile && \
	printf '\t$$(CC) -o $$@ $$^\n' >> $${dest}/Makefile && \
	echo 'clean:' >> $${dest}/Makefile && \
	printf '\trm -f stemwords *.o $(c_src_dir)/*.o runtime/*.o libstemmer/*.o\n' >> $${dest}/Makefile && \
	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
	rm -rf $${dest}

# Make a distribution of all the sources required to compile the Java library.
# The generated stemmer classes go under $(java_src_dir) and the hand-written
# runtime classes under $(java_src_main_dir); a MANIFEST listing both is
# written, the tree is archived as dist/libstemmer_java.tgz and the staging
# directory is removed.
dist_libstemmer_java: $(RUNTIME_SOURCES) $(RUNTIME_HEADERS) \
            $(LIBSTEMMER_EXTRA) \
            $(JAVA_SOURCES)
	destname=libstemmer_java; \
	dest=dist/$${destname}; \
	rm -rf $${dest} && \
	rm -f $${dest}.tgz && \
	mkdir -p $${dest} && \
	cp -a doc/libstemmer_java_README $${dest}/README && \
	mkdir -p $${dest}/$(java_src_dir) && \
	cp -a $(JAVA_SOURCES) $${dest}/$(java_src_dir) && \
	mkdir -p $${dest}/$(java_src_main_dir) && \
	cp -a $(JAVARUNTIME_SOURCES) $${dest}/$(java_src_main_dir) && \
	(cd $${dest} && \
	 echo "README" >> MANIFEST && \
	 ls $(java_src_dir)/*.java >> MANIFEST && \
	 ls $(java_src_main_dir)/*.java >> MANIFEST) && \
	(cd dist && tar zcf $${destname}.tgz $${destname}) && \
	rm -rf $${dest}

# Run the stemmers against reference data: one aggregate target per character
# set, each expanding to one check_<charset>_<algorithm> target per algorithm.
check: check_utf8 check_iso_8859_1 check_iso_8859_2 check_koi8r

check_utf8: $(libstemmer_algorithms:%=check_utf8_%)

check_iso_8859_1: $(ISO_8859_1_algorithms:%=check_iso_8859_1_%)

check_iso_8859_2: $(ISO_8859_2_algorithms:%=check_iso_8859_2_%)

check_koi8r: $(KOI8_R_algorithms:%=check_koi8r_%)

check_utf8_%: ../data/% stemwords
	@echo "Checking output of `echo $<|sed 's!.*/!!'` stemmer with UTF-8"
	@./stemwords -c UTF_8 -l `echo $<|sed 's!.*/!!'` -i $
#include "api.h"

#define MAXINT INT_MAX
#define MININT INT_MIN

/* A symbol buffer is preceded by HEAD bytes of bookkeeping holding two ints:
 * the capacity at index -2 and the current size at index -1, both counted
 * from the pointer handed out to callers. */
#define HEAD 2*sizeof(int)

#define SIZE(p)        ((int *)(p))[-1]
#define SET_SIZE(p, n) ((int *)(p))[-1] = n
#define CAPACITY(p)    ((int *)(p))[-2]

/* One entry of a table searched by find_among() / find_among_b(). */
struct among
{   int s_size;     /* number of chars in string */
    const symbol * s;       /* search string */
    int substring_i;/* index to longest matching substring */
    int result;     /* result of the lookup */
    int (* function)(struct SN_env *);
};

/* Symbol buffer allocation and release. */
extern symbol * create_s(void);
extern void lose_s(symbol * p);

/* Move over n UTF-8 characters from position c; see the definition. */
extern int skip_utf8(const symbol * p, int c, int lb, int l, int n);

/* Character grouping tests; the _U variants decode UTF-8 and the _b variants
 * work backwards from the cursor. */
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat);
extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max,
int repeat); extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat); extern int eq_s(struct SN_env * z, int s_size, const symbol * s); extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s); extern int eq_v(struct SN_env * z, const symbol * p); extern int eq_v_b(struct SN_env * z, const symbol * p); extern int find_among(struct SN_env * z, const struct among * v, int v_size); extern int find_among_b(struct SN_env * z, const struct among * v, int v_size); extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjustment); extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s); extern int slice_from_v(struct SN_env * z, const symbol * p); extern int slice_del(struct SN_env * z); extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s); extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p); extern symbol * slice_to(struct SN_env * z, symbol * p); extern symbol * assign_to(struct SN_env * z, symbol * p); extern void debug(struct SN_env * z, int number, int line_count); snowball_code/runtime/api.c0000644000175000017500000000255012707117052014415 0ustar domdom #include /* for calloc, free */ #include "header.h" extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size) { struct SN_env * z = (struct SN_env *) calloc(1, sizeof(struct SN_env)); if (z == NULL) return NULL; z->p = create_s(); if (z->p == NULL) goto error; if (S_size) { int i; z->S = (symbol * *) calloc(S_size, sizeof(symbol *)); if (z->S 
== NULL) goto error; for (i = 0; i < S_size; i++) { z->S[i] = create_s(); if (z->S[i] == NULL) goto error; } } if (I_size) { z->I = (int *) calloc(I_size, sizeof(int)); if (z->I == NULL) goto error; } if (B_size) { z->B = (unsigned char *) calloc(B_size, sizeof(unsigned char)); if (z->B == NULL) goto error; } return z; error: SN_close_env(z, S_size); return NULL; } extern void SN_close_env(struct SN_env * z, int S_size) { if (z == NULL) return; if (S_size) { int i; for (i = 0; i < S_size; i++) { lose_s(z->S[i]); } free(z->S); } free(z->I); free(z->B); if (z->p) lose_s(z->p); free(z); } extern int SN_set_current(struct SN_env * z, int size, const symbol * s) { int err = replace_s(z, 0, z->l, size, s, NULL); z->c = 0; return err; } snowball_code/runtime/api.h0000644000175000017500000000137312707117052014424 0ustar domdom typedef unsigned char symbol; /* Or replace 'char' above with 'short' for 16 bit characters. More precisely, replace 'char' with whatever type guarantees the character width you need. Note however that sizeof(symbol) should divide HEAD, defined in header.h as 2*sizeof(int), without remainder, otherwise there is an alignment problem. In the unlikely event of a problem here, consult Martin Porter. 
*/ struct SN_env { symbol * p; int c; int l; int lb; int bra; int ket; symbol * * S; int * I; unsigned char * B; }; extern struct SN_env * SN_create_env(int S_size, int I_size, int B_size); extern void SN_close_env(struct SN_env * z, int S_size); extern int SN_set_current(struct SN_env * z, int size, const symbol * s); snowball_code/runtime/utilities.c0000644000175000017500000003134312707117052015661 0ustar domdom #include #include #include #include "header.h" #define unless(C) if(!(C)) #define CREATE_SIZE 1 extern symbol * create_s(void) { symbol * p; void * mem = malloc(HEAD + (CREATE_SIZE + 1) * sizeof(symbol)); if (mem == NULL) return NULL; p = (symbol *) (HEAD + (char *) mem); CAPACITY(p) = CREATE_SIZE; SET_SIZE(p, CREATE_SIZE); return p; } extern void lose_s(symbol * p) { if (p == NULL) return; free((char *) p - HEAD); } /* new_p = skip_utf8(p, c, lb, l, n); skips n characters forwards from p + c if n +ve, or n characters backwards from p + c - 1 if n -ve. new_p is the new position, or 0 on failure. -- used to implement hop and next in the utf8 case. 
*/

extern int skip_utf8(const symbol * p, int c, int lb, int l, int n) {
    int b;
    if (n >= 0) {
        for (; n > 0; n--) {
            if (c >= l) return -1;
            b = p[c++];
            if (b >= 0xC0) {   /* 1100 0000 */
                /* lead byte: swallow the continuation bytes that follow */
                while (c < l) {
                    b = p[c];
                    if (b >= 0xC0 || b < 0x80) break;
                    /* break unless b is 10------ */
                    c++;
                }
            }
        }
    } else {
        for (; n < 0; n++) {
            if (c <= lb) return -1;
            b = p[--c];
            if (b >= 0x80) {   /* 1000 0000 */
                /* inside a multi-byte character: back up to its lead byte */
                while (c > lb) {
                    b = p[c];
                    if (b >= 0xC0) break; /* 1100 0000 */
                    c--;
                }
            }
        }
    }
    return c;
}

/* Code for character groupings: utf8 cases */

/* Decode the character starting at p[c], reading forwards and never beyond
 * l.  Sequences of one, two and three bytes are handled.  The code point is
 * stored in *slot and the number of bytes used is returned, or 0 if c is
 * already at l.  A sequence cut short by l is decoded from the bytes that
 * are available.
 */
static int get_utf8(const symbol * p, int c, int l, int * slot) {
    int b0, b1;
    if (c >= l) return 0;
    b0 = p[c++];
    if (b0 < 0xC0 || c == l) {   /* 1100 0000 */
        * slot = b0; return 1;
    }
    b1 = p[c++];
    if (b0 < 0xE0 || c == l) {   /* 1110 0000 */
        * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2;
    }
    * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (p[c] & 0x3F); return 3;
}

/* Backward counterpart of get_utf8(): decode the character that ends just
 * before p[c], never reading below lb.  Returns the number of bytes used,
 * or 0 if c is already at lb.
 */
static int get_b_utf8(const symbol * p, int c, int lb, int * slot) {
    int b0, b1;
    if (c <= lb) return 0;
    b0 = p[--c];
    if (b0 < 0x80 || c == lb) {   /* 1000 0000 */
        * slot = b0; return 1;
    }
    b1 = p[--c];
    if (b1 >= 0xC0 || c == lb) {   /* 1100 0000 */
        * slot = (b1 & 0x1F) << 6 | (b0 & 0x3F); return 2;
    }
    * slot = (p[c] & 0xF) << 12 | (b1 & 0x3F) << 6 | (b0 & 0x3F); return 3;
}

/* Grouping tests.  s is a bitmap indexed by (ch - min): a character belongs
 * to the grouping when min <= ch <= max and its bit is set.
 *
 * in_grouping*:  step over characters that belong to the grouping.
 * out_grouping*: step over characters that do not belong to it.
 *
 * Common return convention:
 *   -1  the limit was reached before a character could be read;
 *    0  a character was stepped over and repeat is 0;
 *   >0  the next character failed the test and the cursor was left in front
 *       of it; the value is that character's width (always 1 in the
 *       non-utf8 versions).
 * With repeat non-zero the function keeps stepping until one of the two
 * non-zero results occurs.
 */
extern int in_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        unless (w) return -1;
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

extern int in_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        unless (w) return -1;
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

extern int out_grouping_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_utf8(z->p, z->c, z->l, & ch);
        unless (w) return -1;
        unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c += w;
    } while (repeat);
    return 0;
}

extern int out_grouping_b_U(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        int w = get_b_utf8(z->p, z->c, z->lb, & ch);
        unless (w) return -1;
        unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return w;
        z->c -= w;
    } while (repeat);
    return 0;
}

/* Code for character groupings: non-utf8 cases */

extern int in_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

extern int in_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        if (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

extern int out_grouping(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c >= z->l) return -1;
        ch = z->p[z->c];
        unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c++;
    } while (repeat);
    return 0;
}

extern int out_grouping_b(struct SN_env * z, const unsigned char * s, int min, int max, int repeat) {
    do {
        int ch;
        if (z->c <= z->lb) return -1;
        ch = z->p[z->c - 1];
        unless (ch > max || (ch -= min) < 0 || (s[ch >> 3] & (0X1 << (ch & 0X7))) == 0)
            return 1;
        z->c--;
    } while (repeat);
    return 0;
}

/* If the s_size symbols at s occur at the cursor, advance the cursor past
 * them and return 1; otherwise leave the cursor alone and return 0. */
extern int eq_s(struct SN_env * z, int s_size, const symbol * s) {
    if (z->l - z->c < s_size || memcmp(z->p + z->c, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c += s_size; return 1;
}

/* Backward version of eq_s(): the symbols must end at the cursor, which is
 * moved back over them on a match. */
extern int eq_s_b(struct SN_env * z, int s_size, const symbol * s) {
    if (z->c - z->lb < s_size || memcmp(z->p + z->c - s_size, s, s_size * sizeof(symbol)) != 0) return 0;
    z->c -= s_size; return 1;
}

/* As eq_s(), with the string taken from a symbol buffer p. */
extern int eq_v(struct SN_env * z, const symbol * p) {
    return eq_s(z, SIZE(p), p);
}

/* As eq_s_b(), with the string taken from a symbol buffer p. */
extern int eq_v_b(struct SN_env * z, const symbol * p) {
    return eq_s_b(z, SIZE(p), p);
}

/* Look for an entry of the table v (v_size entries) whose string occurs at
 * the cursor.  The first loop is a binary search, so v is presumably sorted
 * by string -- TODO confirm against the table generator.  The second loop
 * starts from the entry the search ended on and follows substring_i links to
 * shorter candidates.
 *
 * For the first candidate that is fully matched, the cursor is placed after
 * the match; if the entry has no function, or its function returns non-zero,
 * the entry's result is returned.  Returns 0 when the chain of candidates is
 * exhausted.
 */
extern int find_among(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int l = z->l;
    symbol * q = z->p + c;

    const struct among * w;

    /* number of leading symbols already known to match entries i and j */
    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while(1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j; /* smaller */
        w = v + k;
        {
            int i2; for (i2 = common; i2 < w->s_size; i2++) {
                if (c + common == l) { diff = -1; break; }
                diff = q[common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) { j = k; common_j = common; }
                 else { i = k; common_i = common; }
        if (j - i <= 1) {
            if (i > 0) break; /* v->s has been inspected */
            if (j == i) break; /* only one item in v */

            /* - but now we need to go round once more to get
               v->s inspected. This looks messy, but is actually
               the optimal approach.  */

            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while(1) {
        w = v + i;
        if (common_i >= w->s_size) {
            z->c = c + w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                /* the function may have moved the cursor; put it back */
                z->c = c + w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}

/* find_among_b is for backwards processing. Same comments apply */

extern int find_among_b(struct SN_env * z, const struct among * v, int v_size) {

    int i = 0;
    int j = v_size;

    int c = z->c; int lb = z->lb;
    symbol * q = z->p + c - 1;

    const struct among * w;

    int common_i = 0;
    int common_j = 0;

    int first_key_inspected = 0;

    while(1) {
        int k = i + ((j - i) >> 1);
        int diff = 0;
        int common = common_i < common_j ? common_i : common_j;
        w = v + k;
        {
            int i2; for (i2 = w->s_size - 1 - common; i2 >= 0; i2--) {
                if (c - common == lb) { diff = -1; break; }
                diff = q[- common] - w->s[i2];
                if (diff != 0) break;
                common++;
            }
        }
        if (diff < 0) { j = k; common_j = common; }
                 else { i = k; common_i = common; }
        if (j - i <= 1) {
            if (i > 0) break;
            if (j == i) break;
            if (first_key_inspected) break;
            first_key_inspected = 1;
        }
    }
    while(1) {
        w = v + i;
        if (common_i >= w->s_size) {
            z->c = c - w->s_size;
            if (w->function == 0) return w->result;
            {
                int res = w->function(z);
                z->c = c - w->s_size;
                if (res) return w->result;
            }
        }
        i = w->substring_i;
        if (i < 0) return 0;
    }
}


/* Increase the size of the buffer pointed to by p to at least n symbols.
 * If insufficient memory, returns NULL and frees the old buffer.
 */
static symbol * increase_size(symbol * p, int n) {
    symbol * q;
    int new_size = n + 20;   /* 20 symbols of headroom beyond the request */
    void * mem = realloc((char *) p - HEAD,
                         HEAD + (new_size + 1) * sizeof(symbol));
    if (mem == NULL) {
        lose_s(p);
        return NULL;
    }
    q = (symbol *) (HEAD + (char *)mem);
    CAPACITY(q) = new_size;
    return q;
}

/* to replace symbols between c_bra and c_ket in z->p by the
   s_size symbols at s.
   Returns 0 on success, -1 on error.
   Also, frees z->p (and sets it to NULL) on error.
*/ extern int replace_s(struct SN_env * z, int c_bra, int c_ket, int s_size, const symbol * s, int * adjptr) { int adjustment; int len; if (z->p == NULL) { z->p = create_s(); if (z->p == NULL) return -1; } adjustment = s_size - (c_ket - c_bra); len = SIZE(z->p); if (adjustment != 0) { if (adjustment + len > CAPACITY(z->p)) { z->p = increase_size(z->p, adjustment + len); if (z->p == NULL) return -1; } memmove(z->p + c_ket + adjustment, z->p + c_ket, (len - c_ket) * sizeof(symbol)); SET_SIZE(z->p, adjustment + len); z->l += adjustment; if (z->c >= c_ket) z->c += adjustment; else if (z->c > c_bra) z->c = c_bra; } unless (s_size == 0) memmove(z->p + c_bra, s, s_size * sizeof(symbol)); if (adjptr != NULL) *adjptr = adjustment; return 0; } static int slice_check(struct SN_env * z) { if (z->bra < 0 || z->bra > z->ket || z->ket > z->l || z->p == NULL || z->l > SIZE(z->p)) /* this line could be removed */ { #if 0 fprintf(stderr, "faulty slice operation:\n"); debug(z, -1, 0); #endif return -1; } return 0; } extern int slice_from_s(struct SN_env * z, int s_size, const symbol * s) { if (slice_check(z)) return -1; return replace_s(z, z->bra, z->ket, s_size, s, NULL); } extern int slice_from_v(struct SN_env * z, const symbol * p) { return slice_from_s(z, SIZE(p), p); } extern int slice_del(struct SN_env * z) { return slice_from_s(z, 0, 0); } extern int insert_s(struct SN_env * z, int bra, int ket, int s_size, const symbol * s) { int adjustment; if (replace_s(z, bra, ket, s_size, s, &adjustment)) return -1; if (bra <= z->bra) z->bra += adjustment; if (bra <= z->ket) z->ket += adjustment; return 0; } extern int insert_v(struct SN_env * z, int bra, int ket, const symbol * p) { int adjustment; if (replace_s(z, bra, ket, SIZE(p), p, &adjustment)) return -1; if (bra <= z->bra) z->bra += adjustment; if (bra <= z->ket) z->ket += adjustment; return 0; } extern symbol * slice_to(struct SN_env * z, symbol * p) { if (slice_check(z)) { lose_s(p); return NULL; } { int len = z->ket - z->bra; 
if (CAPACITY(p) < len) { p = increase_size(p, len); if (p == NULL) return NULL; } memmove(p, z->p + z->bra, len * sizeof(symbol)); SET_SIZE(p, len); } return p; } extern symbol * assign_to(struct SN_env * z, symbol * p) { int len = z->l; if (CAPACITY(p) < len) { p = increase_size(p, len); if (p == NULL) return NULL; } memmove(p, z->p, len * sizeof(symbol)); SET_SIZE(p, len); return p; } #if 0 extern void debug(struct SN_env * z, int number, int line_count) { int i; int limit = SIZE(z->p); /*if (number >= 0) printf("%3d (line %4d): '", number, line_count);*/ if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); for (i = 0; i <= limit; i++) { if (z->lb == i) printf("{"); if (z->bra == i) printf("["); if (z->c == i) printf("|"); if (z->ket == i) printf("]"); if (z->l == i) printf("}"); if (i < limit) { int ch = z->p[i]; if (ch == 0) ch = '#'; printf("%c", ch); } } printf("'\n"); } #endif snowball_code/libstemmer/0000755000175000017500000000000012707117052014156 5ustar domdomsnowball_code/libstemmer/modules_utf8.txt0000644000175000017500000000465212707117052017344 0ustar domdom# This file contains a list of stemmers to include in the distribution. # The format is a set of space separated lines - on each line: # First item is name of stemmer. # Second item is comma separated list of character sets. # Third item is comma separated list of names to refer to the stemmer by. # # Lines starting with a #, or blank lines, are ignored. # List all the main algorithms for each language, in UTF-8. 
danish          UTF_8   danish,da,dan
dutch           UTF_8   dutch,nl,dut,nld
english         UTF_8   english,en,eng
finnish         UTF_8   finnish,fi,fin
french          UTF_8   french,fr,fre,fra
german          UTF_8   german,de,ger,deu
hungarian       UTF_8   hungarian,hu,hun
italian         UTF_8   italian,it,ita
norwegian       UTF_8   norwegian,no,nor
portuguese      UTF_8   portuguese,pt,por
romanian        UTF_8   romanian,ro,rum,ron
russian         UTF_8   russian,ru,rus
spanish         UTF_8   spanish,es,esl,spa
swedish         UTF_8   swedish,sv,swe
turkish         UTF_8   turkish,tr,tur

# Also include the traditional porter algorithm for english.
# The porter algorithm is included in the libstemmer distribution to assist
# with backwards compatibility, but for new systems the english algorithm
# should be used in preference.
porter          UTF_8   porter

# Some other stemmers in the snowball project are not included in the standard
# distribution. To compile a libstemmer with them in, add them to this list,
# and regenerate the distribution. (You will need a full source checkout for
# this.) They are included in the snowball website as curiosities, but are not
# intended for general use, and use of them is not fully supported.  These
# algorithms are:
#
# german2          - This is a slight modification of the german stemmer.
#german2          UTF_8   german2
#
# kraaij_pohlmann  - This is a different dutch stemmer.
#kraaij_pohlmann  UTF_8   kraaij_pohlmann
#
# lovins           - This is an english stemmer, but fairly outdated, and
#                    only really applicable to a restricted type of input text
#                    (keywords in academic publications).
#lovins UTF_8 lovins snowball_code/libstemmer/libstemmer.c0000644000175000017500000000425312707117052016471 0ustar domdom #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "modules.h" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if (stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; if (stemmer->close == 0) return; stemmer->close(stemmer->env); stemmer->close = 0; free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol 
*)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball_code/libstemmer/modules_utf8.h0000644000175000017500000001466312707117052016757 0ustar domdom/* libstemmer/modules_utf8.h: List of stemming modules. * * This file is generated by mkmodules.pl from a list of module names. * Do not edit manually. * * Modules included by this file are: danish, dutch, english, finnish, french, * german, hungarian, italian, norwegian, porter, portuguese, romanian, * russian, spanish, swedish, turkish */ #include "../src_c/stem_UTF_8_danish.h" #include "../src_c/stem_UTF_8_dutch.h" #include "../src_c/stem_UTF_8_english.h" #include "../src_c/stem_UTF_8_finnish.h" #include "../src_c/stem_UTF_8_french.h" #include "../src_c/stem_UTF_8_german.h" #include "../src_c/stem_UTF_8_hungarian.h" #include "../src_c/stem_UTF_8_italian.h" #include "../src_c/stem_UTF_8_norwegian.h" #include "../src_c/stem_UTF_8_porter.h" #include "../src_c/stem_UTF_8_portuguese.h" #include "../src_c/stem_UTF_8_romanian.h" #include "../src_c/stem_UTF_8_russian.h" #include "../src_c/stem_UTF_8_spanish.h" #include "../src_c/stem_UTF_8_swedish.h" #include "../src_c/stem_UTF_8_turkish.h" typedef enum { ENC_UNKNOWN=0, ENC_UTF_8 } stemmer_encoding_t; struct stemmer_encoding { const char * name; stemmer_encoding_t enc; }; static struct stemmer_encoding encodings[] = { {"UTF_8", ENC_UTF_8}, {0,ENC_UNKNOWN} }; struct stemmer_modules { const char * name; stemmer_encoding_t enc; struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); }; static struct stemmer_modules modules[] = { {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"deu", 
ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"english", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, 
{"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"swedish", 
ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, {0,ENC_UNKNOWN,0,0,0} }; static const char * algorithm_names[] = { "danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "porter", "portuguese", "romanian", "russian", "spanish", "swedish", "turkish", 0 }; snowball_code/libstemmer/libstemmer_utf8.c0000644000175000017500000000426012707117052017435 0ustar domdom #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "modules_utf8.h" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if 
(stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; if (stemmer->close == 0) return; stemmer->close(stemmer->env); stemmer->close = 0; free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol *)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball_code/libstemmer/mkmodules.pl0000755000175000017500000001342312707117052016521 0ustar domdom#!/usr/bin/perl -w use strict; my $progname = $0; if (scalar @ARGV < 4 || scalar @ARGV > 5) { print "Usage: $progname []\n"; exit 1; } my $outname = shift(@ARGV); my $c_src_dir = shift(@ARGV); my $descfile = shift(@ARGV); my $srclistfile = shift(@ARGV); my $extn = ''; if (@ARGV) { $extn = '_'.shift(@ARGV); } my %aliases = (); my %algorithms = (); my %algorithm_encs = (); my %encs = (); sub addalgenc($$) { my $alg = shift(); my $enc = shift(); if (defined $algorithm_encs{$alg}) { my $hashref = $algorithm_encs{$alg}; $$hashref{$enc}=1; } else { my %newhash = ($enc => 1); $algorithm_encs{$alg}=\%newhash; } $encs{$enc} = 1; } sub readinput() { open DESCFILE, $descfile; my $line; while($line = ) { next if $line =~ m/^\s*#/; next if $line =~ m/^\s*$/; my ($alg,$encstr,$aliases) = split(/\s+/, $line); my $enc; my $alias; $algorithms{$alg} = 1; foreach $alias (split(/,/, $aliases)) { foreach $enc (split(/,/, $encstr)) { # print "$alias, $enc\n"; $aliases{$alias} = $alg; addalgenc($alg, $enc); } } } } sub printoutput() { open (OUT, ">$outname") or die "Can't open output file `$outname': $!\n"; print OUT < 77) { print OUT ",\n * "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } 
print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n */\n\n"; foreach $lang (@algorithms) { my $hashref = $algorithm_encs{$lang}; foreach $enc (sort keys (%$hashref)) { print OUT "#include \"../$c_src_dir/stem_${enc}_$lang.h\"\n"; } } print OUT <$srclistfile") or die "Can't open output file `$srclistfile': $!\n"; print OUT < 77) { print OUT ",\n# "; $linelen = 3; } else { print OUT ', '; $linelen += 2; } } print OUT $lang; $linelen += length($lang); $need_sep = 1; } print OUT "\n\nsnowball_sources= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { print OUT " src_c/stem_${enc}_${lang}.c \\\n"; } } $need_sep = 0; for $srcfile ('runtime/api.c', 'runtime/utilities.c', "libstemmer/libstemmer${extn}.c") { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\nsnowball_headers= \\\n"; for $lang (sort keys %aliases) { my $hashref = $algorithm_encs{$lang}; my $enc; foreach $enc (sort keys (%$hashref)) { my $p = "${lang}_${enc}"; print OUT " src_c/stem_${enc}_${lang}.h \\\n"; } } $need_sep = 0; for $srcfile ('include/libstemmer.h', "libstemmer/modules${extn}.h", 'runtime/api.h', 'runtime/header.h') { print OUT " \\\n" if $need_sep; print OUT " $srcfile"; $need_sep = 1; } print OUT "\n\n"; close OUT or die "Can't close ${srclistfile}: $!\n"; } readinput(); printoutput(); printsrclist(); snowball_code/libstemmer/modules.txt0000644000175000017500000000473312707117052016376 0ustar domdom# This file contains a list of stemmers to include in the distribution. # The format is a set of space separated lines - on each line: # First item is name of stemmer. # Second item is comma separated list of character sets. # Third item is comma separated list of names to refer to the stemmer by. # # Lines starting with a #, or blank lines, are ignored. # List all the main algorithms for each language, in UTF-8, and also with # the most commonly used encoding. 
danish UTF_8,ISO_8859_1 danish,da,dan dutch UTF_8,ISO_8859_1 dutch,nl,dut,nld english UTF_8,ISO_8859_1 english,en,eng finnish UTF_8,ISO_8859_1 finnish,fi,fin french UTF_8,ISO_8859_1 french,fr,fre,fra german UTF_8,ISO_8859_1 german,de,ger,deu hungarian UTF_8,ISO_8859_1 hungarian,hu,hun italian UTF_8,ISO_8859_1 italian,it,ita norwegian UTF_8,ISO_8859_1 norwegian,no,nor portuguese UTF_8,ISO_8859_1 portuguese,pt,por romanian UTF_8,ISO_8859_2 romanian,ro,rum,ron russian UTF_8,KOI8_R russian,ru,rus spanish UTF_8,ISO_8859_1 spanish,es,esl,spa swedish UTF_8,ISO_8859_1 swedish,sv,swe turkish UTF_8 turkish,tr,tur # Also include the traditional porter algorithm for english. # The porter algorithm is included in the libstemmer distribution to assist # with backwards compatibility, but for new systems the english algorithm # should be used in preference. porter UTF_8,ISO_8859_1 porter # Some other stemmers in the snowball project are not included in the standard # distribution. To compile a libstemmer with them in, add them to this list, # and regenerate the distribution. (You will need a full source checkout for # this.) They are included in the snowball website as curiosities, but are not # intended for general use, and use of them is not fully supported. These # algorithms are: # # german2 - This is a slight modification of the german stemmer. #german2 UTF_8,ISO_8859_1 german2 # # kraaij_pohlmann - This is a different dutch stemmer. #kraaij_pohlmann UTF_8,ISO_8859_1 kraaij_pohlmann # # lovins - This is an english stemmer, but fairly outdated, and # only really applicable to a restricted type of input text # (keywords in academic publications). #lovins UTF_8,ISO_8859_1 lovins snowball_code/libstemmer/modules.h0000644000175000017500000003105512707117052016003 0ustar domdom/* libstemmer/modules.h: List of stemming modules. * * This file is generated by mkmodules.pl from a list of module names. * Do not edit manually. 
* * Modules included by this file are: danish, dutch, english, finnish, french, * german, hungarian, italian, norwegian, porter, portuguese, romanian, * russian, spanish, swedish, turkish */ #include "../src_c/stem_ISO_8859_1_danish.h" #include "../src_c/stem_UTF_8_danish.h" #include "../src_c/stem_ISO_8859_1_dutch.h" #include "../src_c/stem_UTF_8_dutch.h" #include "../src_c/stem_ISO_8859_1_english.h" #include "../src_c/stem_UTF_8_english.h" #include "../src_c/stem_ISO_8859_1_finnish.h" #include "../src_c/stem_UTF_8_finnish.h" #include "../src_c/stem_ISO_8859_1_french.h" #include "../src_c/stem_UTF_8_french.h" #include "../src_c/stem_ISO_8859_1_german.h" #include "../src_c/stem_UTF_8_german.h" #include "../src_c/stem_ISO_8859_1_hungarian.h" #include "../src_c/stem_UTF_8_hungarian.h" #include "../src_c/stem_ISO_8859_1_italian.h" #include "../src_c/stem_UTF_8_italian.h" #include "../src_c/stem_ISO_8859_1_norwegian.h" #include "../src_c/stem_UTF_8_norwegian.h" #include "../src_c/stem_ISO_8859_1_porter.h" #include "../src_c/stem_UTF_8_porter.h" #include "../src_c/stem_ISO_8859_1_portuguese.h" #include "../src_c/stem_UTF_8_portuguese.h" #include "../src_c/stem_ISO_8859_2_romanian.h" #include "../src_c/stem_UTF_8_romanian.h" #include "../src_c/stem_KOI8_R_russian.h" #include "../src_c/stem_UTF_8_russian.h" #include "../src_c/stem_ISO_8859_1_spanish.h" #include "../src_c/stem_UTF_8_spanish.h" #include "../src_c/stem_ISO_8859_1_swedish.h" #include "../src_c/stem_UTF_8_swedish.h" #include "../src_c/stem_UTF_8_turkish.h" typedef enum { ENC_UNKNOWN=0, ENC_ISO_8859_1, ENC_ISO_8859_2, ENC_KOI8_R, ENC_UTF_8 } stemmer_encoding_t; struct stemmer_encoding { const char * name; stemmer_encoding_t enc; }; static struct stemmer_encoding encodings[] = { {"ISO_8859_1", ENC_ISO_8859_1}, {"ISO_8859_2", ENC_ISO_8859_2}, {"KOI8_R", ENC_KOI8_R}, {"UTF_8", ENC_UTF_8}, {0,ENC_UNKNOWN} }; struct stemmer_modules { const char * name; stemmer_encoding_t enc; struct SN_env * (*create)(void); void 
(*close)(struct SN_env *); int (*stem)(struct SN_env *); }; static struct stemmer_modules modules[] = { {"da", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, {"da", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"dan", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, {"dan", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"danish", ENC_ISO_8859_1, danish_ISO_8859_1_create_env, danish_ISO_8859_1_close_env, danish_ISO_8859_1_stem}, {"danish", ENC_UTF_8, danish_UTF_8_create_env, danish_UTF_8_close_env, danish_UTF_8_stem}, {"de", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, {"de", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"deu", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, {"deu", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"dut", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, {"dut", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"dutch", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, {"dutch", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"en", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, {"en", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"eng", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, {"eng", ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"english", ENC_ISO_8859_1, english_ISO_8859_1_create_env, english_ISO_8859_1_close_env, english_ISO_8859_1_stem}, {"english", 
ENC_UTF_8, english_UTF_8_create_env, english_UTF_8_close_env, english_UTF_8_stem}, {"es", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, {"es", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"esl", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, {"esl", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"fi", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, {"fi", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"fin", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, {"fin", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"finnish", ENC_ISO_8859_1, finnish_ISO_8859_1_create_env, finnish_ISO_8859_1_close_env, finnish_ISO_8859_1_stem}, {"finnish", ENC_UTF_8, finnish_UTF_8_create_env, finnish_UTF_8_close_env, finnish_UTF_8_stem}, {"fr", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, {"fr", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"fra", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, {"fra", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"fre", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, {"fre", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"french", ENC_ISO_8859_1, french_ISO_8859_1_create_env, french_ISO_8859_1_close_env, french_ISO_8859_1_stem}, {"french", ENC_UTF_8, french_UTF_8_create_env, french_UTF_8_close_env, french_UTF_8_stem}, {"ger", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, 
{"ger", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"german", ENC_ISO_8859_1, german_ISO_8859_1_create_env, german_ISO_8859_1_close_env, german_ISO_8859_1_stem}, {"german", ENC_UTF_8, german_UTF_8_create_env, german_UTF_8_close_env, german_UTF_8_stem}, {"hu", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, {"hu", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"hun", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, {"hun", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"hungarian", ENC_ISO_8859_1, hungarian_ISO_8859_1_create_env, hungarian_ISO_8859_1_close_env, hungarian_ISO_8859_1_stem}, {"hungarian", ENC_UTF_8, hungarian_UTF_8_create_env, hungarian_UTF_8_close_env, hungarian_UTF_8_stem}, {"it", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, {"it", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, {"ita", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, {"ita", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, {"italian", ENC_ISO_8859_1, italian_ISO_8859_1_create_env, italian_ISO_8859_1_close_env, italian_ISO_8859_1_stem}, {"italian", ENC_UTF_8, italian_UTF_8_create_env, italian_UTF_8_close_env, italian_UTF_8_stem}, {"nl", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, {"nl", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"nld", ENC_ISO_8859_1, dutch_ISO_8859_1_create_env, dutch_ISO_8859_1_close_env, dutch_ISO_8859_1_stem}, {"nld", ENC_UTF_8, dutch_UTF_8_create_env, dutch_UTF_8_close_env, dutch_UTF_8_stem}, {"no", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, 
norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, {"no", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"nor", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, {"nor", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"norwegian", ENC_ISO_8859_1, norwegian_ISO_8859_1_create_env, norwegian_ISO_8859_1_close_env, norwegian_ISO_8859_1_stem}, {"norwegian", ENC_UTF_8, norwegian_UTF_8_create_env, norwegian_UTF_8_close_env, norwegian_UTF_8_stem}, {"por", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, {"por", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"porter", ENC_ISO_8859_1, porter_ISO_8859_1_create_env, porter_ISO_8859_1_close_env, porter_ISO_8859_1_stem}, {"porter", ENC_UTF_8, porter_UTF_8_create_env, porter_UTF_8_close_env, porter_UTF_8_stem}, {"portuguese", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, {"portuguese", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"pt", ENC_ISO_8859_1, portuguese_ISO_8859_1_create_env, portuguese_ISO_8859_1_close_env, portuguese_ISO_8859_1_stem}, {"pt", ENC_UTF_8, portuguese_UTF_8_create_env, portuguese_UTF_8_close_env, portuguese_UTF_8_stem}, {"ro", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, {"ro", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"romanian", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, {"romanian", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"ron", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, 
romanian_ISO_8859_2_stem}, {"ron", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"ru", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, {"ru", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"rum", ENC_ISO_8859_2, romanian_ISO_8859_2_create_env, romanian_ISO_8859_2_close_env, romanian_ISO_8859_2_stem}, {"rum", ENC_UTF_8, romanian_UTF_8_create_env, romanian_UTF_8_close_env, romanian_UTF_8_stem}, {"rus", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, {"rus", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"russian", ENC_KOI8_R, russian_KOI8_R_create_env, russian_KOI8_R_close_env, russian_KOI8_R_stem}, {"russian", ENC_UTF_8, russian_UTF_8_create_env, russian_UTF_8_close_env, russian_UTF_8_stem}, {"spa", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, {"spa", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"spanish", ENC_ISO_8859_1, spanish_ISO_8859_1_create_env, spanish_ISO_8859_1_close_env, spanish_ISO_8859_1_stem}, {"spanish", ENC_UTF_8, spanish_UTF_8_create_env, spanish_UTF_8_close_env, spanish_UTF_8_stem}, {"sv", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, {"sv", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"swe", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, {"swe", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"swedish", ENC_ISO_8859_1, swedish_ISO_8859_1_create_env, swedish_ISO_8859_1_close_env, swedish_ISO_8859_1_stem}, {"swedish", ENC_UTF_8, swedish_UTF_8_create_env, swedish_UTF_8_close_env, swedish_UTF_8_stem}, {"tr", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, 
turkish_UTF_8_stem}, {"tur", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, {"turkish", ENC_UTF_8, turkish_UTF_8_create_env, turkish_UTF_8_close_env, turkish_UTF_8_stem}, {0,ENC_UNKNOWN,0,0,0} }; static const char * algorithm_names[] = { "danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "porter", "portuguese", "romanian", "russian", "spanish", "swedish", "turkish", 0 }; snowball_code/libstemmer/libstemmer_c.in0000644000175000017500000000425512707117052017161 0ustar domdom #include #include #include "../include/libstemmer.h" #include "../runtime/api.h" #include "@MODULES_H@" struct sb_stemmer { struct SN_env * (*create)(void); void (*close)(struct SN_env *); int (*stem)(struct SN_env *); struct SN_env * env; }; extern const char ** sb_stemmer_list(void) { return algorithm_names; } static stemmer_encoding_t sb_getenc(const char * charenc) { struct stemmer_encoding * encoding; if (charenc == NULL) return ENC_UTF_8; for (encoding = encodings; encoding->name != 0; encoding++) { if (strcmp(encoding->name, charenc) == 0) break; } if (encoding->name == NULL) return ENC_UNKNOWN; return encoding->enc; } extern struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc) { stemmer_encoding_t enc; struct stemmer_modules * module; struct sb_stemmer * stemmer; enc = sb_getenc(charenc); if (enc == ENC_UNKNOWN) return NULL; for (module = modules; module->name != 0; module++) { if (strcmp(module->name, algorithm) == 0 && module->enc == enc) break; } if (module->name == NULL) return NULL; stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer)); if (stemmer == NULL) return NULL; stemmer->create = module->create; stemmer->close = module->close; stemmer->stem = module->stem; stemmer->env = stemmer->create(); if (stemmer->env == NULL) { sb_stemmer_delete(stemmer); return NULL; } return stemmer; } void sb_stemmer_delete(struct sb_stemmer * stemmer) { if (stemmer == 0) return; 
if (stemmer->close == 0) return; stemmer->close(stemmer->env); stemmer->close = 0; free(stemmer); } const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size) { int ret; if (SN_set_current(stemmer->env, size, (const symbol *)(word))) { stemmer->env->l = 0; return NULL; } ret = stemmer->stem(stemmer->env); if (ret < 0) return NULL; stemmer->env->p[stemmer->env->l] = 0; return (const sb_symbol *)(stemmer->env->p); } int sb_stemmer_length(struct sb_stemmer * stemmer) { return stemmer->env->l; } snowball_code/README0000644000175000017500000000023212707117052012670 0ustar domdomThis contains the source code for the snowball compiler and the stemming algorithms on the website. See http://snowball.tartarus.org/ for more details. snowball_code/compiler/0000755000175000017500000000000012707117052013625 5ustar domdomsnowball_code/compiler/syswords2.h0000644000175000017500000000151112707117052015753 0ustar domdom c_among = 4, c_and, c_as, c_assign, c_assignto, c_atleast, c_atlimit, c_atmark, c_attach, c_backwardmode, c_backwards, c_booleans, c_bra, c_comment1, c_comment2, c_cursor, c_debug, c_decimal, c_define, c_delete, c_divide, c_divideassign, c_do, c_dollar, c_eq, c_externals, c_fail, c_false, c_for, c_ge, c_get, c_gopast, c_goto, c_gr, c_groupings, c_hex, c_hop, c_insert, c_integers, c_ket, c_le, c_leftslice, c_limit, c_loop, c_ls, c_maxint, c_minint, c_minus, c_minusassign, c_multiply, c_multiplyassign, c_ne, c_next, c_non, c_not, c_or, c_plus, c_plusassign, c_repeat, c_reverse, c_rightslice, c_routines, c_set, c_setlimit, c_setmark, c_size, c_sizeof, c_slicefrom, c_sliceto, c_stringdef, c_stringescapes, c_strings, c_substring, c_test, c_tolimit, c_tomark, c_true, c_try, c_unset, snowball_code/compiler/space.c0000644000175000017500000001412612707117052015070 0ustar domdom #include /* for printf */ #include /* malloc, free */ #include /* memmove */ #include "header.h" #define HEAD 2*sizeof(int) #define EXTENDER 40 /* This modules 
provides a simple mechanism for arbitrary length writable strings, called 'blocks'. They are 'symbol *' items rather than 'char *' items however. The calls are: symbol * b = create_b(n); - create an empty block b with room for n symbols b = increase_capacity(b, n); - increase the capacity of block b by n symbols (b may change) b2 = copy_b(b) - copy block b into b2 lose_b(b); - lose block b b = move_to_b(b, n, p); - set the data in b to be the n symbols at address p b = add_to_b(b, n, p); - add the n symbols at address p to the end of the data in b SIZE(b) - is the number of symbols in b For example: symbol * b = create_b(0); { int i; char p[10]; for (i = 0; i < 100; i++) { sprintf(p, " %d", i); add_s_to_b(b, p); } } and b contains " 0 1 2 ... 99" spaced out as symbols. */ /* For a block b, SIZE(b) is the number of symbols so far written into it, CAPACITY(b) the total number it can contain, so SIZE(b) <= CAPACITY(b). In fact blocks have 1 extra character over the promised capacity so they can be zero terminated by 'b[SIZE(b)] = 0;' without fear of overwriting. 
*/ extern symbol * create_b(int n) { symbol * p = (symbol *) (HEAD + (char *) MALLOC(HEAD + (n + 1) * sizeof(symbol))); CAPACITY(p) = n; SIZE(p) = 0; return p; } extern void report_b(FILE * out, symbol * p) { int i; for (i = 0; i < SIZE(p); i++) fprintf(out, "%c", p[i]); } extern void lose_b(symbol * p) { if (p == 0) return; FREE((char *) p - HEAD); } extern symbol * increase_capacity(symbol * p, int n) { symbol * q = create_b(CAPACITY(p) + n + EXTENDER); memmove(q, p, CAPACITY(p) * sizeof(symbol)); SIZE(q) = SIZE(p); lose_b(p); return q; } extern symbol * move_to_b(symbol * p, int n, symbol * q) { int x = n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p, q, n * sizeof(symbol)); SIZE(p) = n; return p; } extern symbol * add_to_b(symbol * p, int n, symbol * q) { int x = SIZE(p) + n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); memmove(p + SIZE(p), q, n * sizeof(symbol)); SIZE(p) += n; return p; } extern symbol * copy_b(symbol * p) { int n = SIZE(p); symbol * q = create_b(n); move_to_b(q, n, p); return q; } int space_count = 0; extern void * check_malloc(int n) { space_count++; return malloc(n); } extern void check_free(void * p) { space_count--; free(p); } /* To convert a block to a zero terminated string: */ extern char * b_to_s(symbol * p) { int n = SIZE(p); char * s = (char *)malloc(n + 1); { int i; for (i = 0; i < n; i++) s[i] = (char)p[i]; /* cast to suppress possible warnings */ } s[n] = 0; return s; } /* To add a zero terminated string to a block. If p = 0 the block is created. 
*/ extern symbol * add_s_to_b(symbol * p, const char * s) { int n = strlen(s); int k; if (p == 0) p = create_b(n); k = SIZE(p); { int x = k + n - CAPACITY(p); if (x > 0) p = increase_capacity(p, x); } { int i; for (i = 0; i < n; i++) p[i + k] = s[i]; } SIZE(p) += n; return p; } /* The next section defines string handling capabilities in terms of the lower level block handling capabilities of space.c */ /* -------------------------------------------------------------*/ struct str { symbol * data; }; /* Create a new string. */ extern struct str * str_new() { struct str * output = (struct str *) malloc(sizeof(struct str)); output->data = create_b(0); return output; } /* Delete a string. */ extern void str_delete(struct str * str) { lose_b(str->data); free(str); } /* Append a str to this str. */ extern void str_append(struct str * str, struct str * add) { symbol * q = add->data; str->data = add_to_b(str->data, SIZE(q), q); } /* Append a character to this str. */ extern void str_append_ch(struct str * str, char add) { symbol q[1]; q[0] = add; str->data = add_to_b(str->data, 1, q); } /* Append a low level block to a str. */ extern void str_append_b(struct str * str, symbol * q) { str->data = add_to_b(str->data, SIZE(q), q); } /* Append a (char *, null teminated) string to a str. */ extern void str_append_string(struct str * str, const char * s) { str->data = add_s_to_b(str->data, s); } /* Append an integer to a str. */ extern void str_append_int(struct str * str, int i) { char s[30]; sprintf(s, "%d", i); str_append_string(str, s); } /* Clear a string */ extern void str_clear(struct str * str) { SIZE(str->data) = 0; } /* Set a string */ extern void str_assign(struct str * str, char * s) { str_clear(str); str_append_string(str, s); } /* Copy a string. */ extern struct str * str_copy(struct str * old) { struct str * newstr = str_new(); str_append(newstr, old); return newstr; } /* Get the data stored in this str. 
*/ extern symbol * str_data(struct str * str) { return str->data; } /* Get the length of the str. */ extern int str_len(struct str * str) { return SIZE(str->data); } extern int get_utf8(const symbol * p, int * slot) { int b0, b1; b0 = *p++; if (b0 < 0xC0) { /* 1100 0000 */ * slot = b0; return 1; } b1 = *p++; if (b0 < 0xE0) { /* 1110 0000 */ * slot = (b0 & 0x1F) << 6 | (b1 & 0x3F); return 2; } * slot = (b0 & 0xF) << 12 | (b1 & 0x3F) << 6 | (*p & 0x3F); return 3; } extern int put_utf8(int ch, symbol * p) { if (ch < 0x80) { p[0] = ch; return 1; } if (ch < 0x800) { p[0] = (ch >> 6) | 0xC0; p[1] = (ch & 0x3F) | 0x80; return 2; } p[0] = (ch >> 12) | 0xE0; p[1] = ((ch >> 6) & 0x3F) | 0x80; p[2] = (ch & 0x3F) | 0x80; return 3; } snowball_code/compiler/driver.c0000644000175000017500000002015412707117052015266 0ustar domdom#include /* for main etc */ #include /* for free etc */ #include /* for strlen */ #include "header.h" #define DEFAULT_PACKAGE "org.tartarus.snowball.ext" #define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram" #define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among" #define DEFAULT_STRING_CLASS "java.lang.StringBuilder" static int eq(char * s1, char * s2) { int s1_len = strlen(s1); int s2_len = strlen(s2); return s1_len == s2_len && memcmp(s1, s2, s1_len) == 0; } static void print_arglist(void) { fprintf(stderr, "Usage: snowball [options]\n\n" "options are: [-o[utput] file]\n" " [-s[yntax]]\n" #ifndef DISABLE_JAVA " [-j[ava]]\n" #endif " [-c++]\n" " [-w[idechars]]\n" " [-u[tf8]]\n" " [-n[ame] class name]\n" " [-ep[refix] string]\n" " [-vp[refix] string]\n" " [-i[nclude] directory]\n" " [-r[untime] path to runtime headers]\n" #ifndef DISABLE_JAVA " [-p[arentclassname] fully qualified parent class name]\n" " [-P[ackage] package name for stemmers]\n" " [-S[tringclass] StringBuffer-compatible class]\n" " [-a[mongclass] fully qualified name of the Among class]\n" #endif ); exit(1); } static void check_lim(int i, int argc) { if (i >= argc) { 
fprintf(stderr, "argument list is one short\n"); print_arglist(); } } static FILE * get_output(symbol * b) { char * s = b_to_s(b); FILE * output = fopen(s, "w"); if (output == 0) { fprintf(stderr, "Can't open output %s\n", s); exit(1); } free(s); return output; } static void read_options(struct options * o, int argc, char * argv[]) { char * s; int i = 2; /* set defauts: */ o->output_file = 0; o->syntax_tree = false; o->externals_prefix = ""; o->variables_prefix = 0; o->runtime_path = 0; o->parent_class_name = DEFAULT_BASE_CLASS; o->string_class = DEFAULT_STRING_CLASS; o->among_class = DEFAULT_AMONG_CLASS; o->package = DEFAULT_PACKAGE; o->name = ""; o->make_lang = LANG_C; o->widechars = false; o->includes = 0; o->includes_end = 0; o->utf8 = false; /* read options: */ repeat { if (i >= argc) break; s = argv[i++]; { if (eq(s, "-o") || eq(s, "-output")) { check_lim(i, argc); o->output_file = argv[i++]; continue; } if (eq(s, "-n") || eq(s, "-name")) { check_lim(i, argc); o->name = argv[i++]; continue; } #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; o->widechars = true; continue; } #endif if (eq(s, "-c++")) { o->make_lang = LANG_CPLUSPLUS; continue; } if (eq(s, "-w") || eq(s, "-widechars")) { o->widechars = true; o->utf8 = false; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { o->syntax_tree = true; continue; } if (eq(s, "-ep") || eq(s, "-eprefix")) { check_lim(i, argc); o->externals_prefix = argv[i++]; continue; } if (eq(s, "-vp") || eq(s, "-vprefix")) { check_lim(i, argc); o->variables_prefix = argv[i++]; continue; } if (eq(s, "-i") || eq(s, "-include")) { check_lim(i, argc); { NEW(include, p); symbol * b = add_s_to_b(0, argv[i++]); b = add_s_to_b(b, "/"); p->next = 0; p->b = b; if (o->includes == 0) o->includes = p; else o->includes_end->next = p; o->includes_end = p; } continue; } if (eq(s, "-r") || eq(s, "-runtime")) { check_lim(i, argc); o->runtime_path = argv[i++]; continue; } if (eq(s, "-u") || eq(s, "-utf8")) { o->utf8 = 
true; o->widechars = false; continue; } #ifndef DISABLE_JAVA if (eq(s, "-p") || eq(s, "-parentclassname")) { check_lim(i, argc); o->parent_class_name = argv[i++]; continue; } if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; continue; } if (eq(s, "-S") || eq(s, "-stringclass")) { check_lim(i, argc); o->string_class = argv[i++]; continue; } if (eq(s, "-a") || eq(s, "-amongclass")) { check_lim(i, argc); o->among_class = argv[i++]; continue; } #endif fprintf(stderr, "'%s' misplaced\n", s); print_arglist(); } } } extern int main(int argc, char * argv[]) { NEW(options, o); if (argc == 1) print_arglist(); read_options(o, argc, argv); { symbol * filename = add_s_to_b(0, argv[1]); symbol * u = get_input(filename); if (u == 0) { fprintf(stderr, "Can't open input %s\n", argv[1]); exit(1); } { struct tokeniser * t = create_tokeniser(u); struct analyser * a = create_analyser(t); t->widechars = o->widechars; t->includes = o->includes; a->utf8 = t->utf8 = o->utf8; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); close_tokeniser(t); unless (o->syntax_tree) { struct generator * g; char * s = o->output_file; unless (s) { fprintf(stderr, "Please include the -o option\n"); print_arglist(); exit(1); } if (o->make_lang == LANG_C || o->make_lang == LANG_CPLUSPLUS) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".h"); o->output_h = get_output(b); b[SIZE(b) - 1] = 'c'; if (o->make_lang == LANG_CPLUSPLUS) { b = add_s_to_b(b, "c"); } o->output_c = get_output(b); lose_b(b); g = create_generator_c(a, o); generate_program_c(g); close_generator_c(g); fclose(o->output_c); fclose(o->output_h); } #ifndef DISABLE_JAVA if (o->make_lang == LANG_JAVA) { symbol * b = add_s_to_b(0, s); b = add_s_to_b(b, ".java"); o->output_java = get_output(b); lose_b(b); g = create_generator_java(a, o); generate_program_java(g); close_generator_java(g); fclose(o->output_java); } #endif } close_analyser(a); } lose_b(u); lose_b(filename); } { 
struct include * p = o->includes; until (p == 0) { struct include * q = p->next; lose_b(p->b); FREE(p); p = q; } } FREE(o); unless (space_count == 0) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } snowball_code/compiler/header.h0000644000175000017500000002047112707117052015232 0ustar domdom typedef unsigned char byte; typedef unsigned short symbol; #define true 1 #define false 0 #define repeat while(true) #define unless(C) if(!(C)) #define until(C) while(!(C)) #define MALLOC check_malloc #define FREE check_free #define NEW(type, p) struct type * p = (struct type *) MALLOC(sizeof(struct type)) #define NEWVEC(type, p, n) struct type * p = (struct type *) MALLOC(sizeof(struct type) * n) #define STARTSIZE 10 #define SIZE(p) ((int *)(p))[-1] #define CAPACITY(p) ((int *)(p))[-2] extern symbol * create_b(int n); extern void report_b(FILE * out, symbol * p); extern void lose_b(symbol * p); extern symbol * increase_capacity(symbol * p, int n); extern symbol * move_to_b(symbol * p, int n, symbol * q); extern symbol * add_to_b(symbol * p, int n, symbol * q); extern symbol * copy_b(symbol * p); extern char * b_to_s(symbol * p); extern symbol * add_s_to_b(symbol * p, const char * s); struct str; /* defined in space.c */ extern struct str * str_new(void); extern void str_delete(struct str * str); extern void str_append(struct str * str, struct str * add); extern void str_append_ch(struct str * str, char add); extern void str_append_b(struct str * str, symbol * q); extern void str_append_string(struct str * str, const char * s); extern void str_append_int(struct str * str, int i); extern void str_clear(struct str * str); extern void str_assign(struct str * str, char * s); extern struct str * str_copy(struct str * old); extern symbol * str_data(struct str * str); extern int str_len(struct str * str); extern int get_utf8(const symbol * p, int * slot); extern int put_utf8(int ch, symbol * p); struct m_pair { struct m_pair * next; symbol * name; symbol * value; }; 
/* Saved reading position of an outer source text while a file pulled in by
   'get' is being read.  read_token() fills one of these by copying the
   first sizeof(struct input) bytes of the tokeniser, and copies it back
   when the included text ends. */
struct input {

    struct input * next;
    symbol * p;
    int c;
    int line_number;

};

/* One include prefix from a -i option (driver.c appends a trailing "/").
   read_token() tries each prefix in turn when a 'get' file cannot be opened
   under its plain name. */
struct include {

    struct include * next;
    symbol * b;

};

/* Tokeniser state.  The first four members must keep the same order and
   types as struct input, because read_token() saves and restores them with
   a memmove of sizeof(struct input). */
struct tokeniser {

    struct input * next;
    symbol * p;
    int c;
    int line_number;
    symbol * b;
    symbol * b2;
    int number;
    int m_start;
    int m_end;
    struct m_pair * m_pairs;
    int get_depth;
    int error_count;
    int token;
    int previous_token;
    byte token_held;
    byte widechars;
    byte utf8;

    int omission;
    struct include * includes;

};

extern symbol * get_input(symbol * p);
extern struct tokeniser * create_tokeniser(symbol * b);
extern int read_token(struct tokeniser * t);
extern byte * name_of_token(int code);
extern void close_tokeniser(struct tokeniser * t);

/* Token codes: the system words listed in syswords2.h (starting at
   c_among = 4) followed by the extra codes named here. */
enum token_codes {

#include "syswords2.h"

    c_mathassign,
    c_name,
    c_number,
    c_literalstring,
    c_neg,
    c_call,
    c_grouping,
    c_booltest
};

extern int space_count;
extern void * check_malloc(int n);
extern void check_free(void * p);

struct node;

struct name {

    struct name * next;
    symbol * b;
    int type;                   /* t_string etc */
    int mode;                   /*    )_  for routines, externals */
    struct node * definition;   /*    )                           */
    int count;                  /* 0, 1, 2 for each type */
    struct grouping * grouping; /* for grouping names */
    byte referenced;
    byte used;

};

struct literalstring {

    struct literalstring * next;
    symbol * b;

};

struct amongvec {

    symbol * b;      /* the string giving the case */
    int size;        /* - and its size */
    struct node * p; /* the corresponding command */
    int i;           /* the amongvec index of the longest substring of b */
    int result;      /* the numeric result for the case */
    struct name * function;

};

struct among {

    struct among * next;
    struct amongvec * b;      /* pointer to the amongvec */
    int number;               /* amongs are numbered 0, 1, 2 ... */
    int literalstring_count;  /* in this among */
    int command_count;        /* in this among */
    struct node * starter;    /* i.e. among( (starter) 'string' ... ) */
    struct node * substring;  /* i.e. substring ... among ( ... ) */
};

struct grouping {

    struct grouping * next;
    int number;               /* groupings are numbered 0, 1, 2 ...
*/
    symbol * b;               /* the characters of this group */
    int largest_ch;           /* character with max code */
    int smallest_ch;          /* character with min code */
    byte no_gaps;             /* not used in generator.c after 11/5/05 */
    struct name * name;       /* so g->name->grouping == g */
};

struct node {

    struct node * next;
    struct node * left;
    struct node * aux;     /* used in setlimit */
    struct among * among;  /* used in among */
    struct node * right;
    int type;
    int mode;
    struct node * AE;
    struct name * name;
    symbol * literalstring;
    int number;
    int line_number;
    int amongvar_needed;   /* used in routine definitions */
};

/* t_size is the number of name types; it sizes name_count[] in
   struct analyser. */
enum name_types {

    t_size = 6,

    t_string = 0, t_boolean = 1, t_integer = 2, t_routine = 3, t_external = 4,
    t_grouping = 5

/*  If this list is extended, adjust wvn in generator.c  */
};

/*  In name_count[i] below, remember that
    type   is
    ----+----
      0 |  string
      1 |  boolean
      2 |  integer
      3 |  routine
      4 |  external
      5 |  grouping
*/

struct analyser {

    struct tokeniser * tokeniser;
    struct node * nodes;
    struct name * names;
    struct literalstring * literalstrings;
    int mode;
    byte modifyable;          /* false inside reverse(...) */
    struct node * program;
    struct node * program_end;
    int name_count[t_size];   /* name_count[i] counts the number of names of type i */
    struct among * amongs;
    struct among * amongs_end;
    int among_count;
    int amongvar_needed;      /* used in reading routine definitions */
    struct grouping * groupings;
    struct grouping * groupings_end;
    struct node * substring;  /* pending 'substring' in current routine definition */
    byte utf8;
};

enum analyser_modes {

    m_forward = 0, m_backward /*, m_integer */

};

extern void print_program(struct analyser * a);
extern struct analyser * create_analyser(struct tokeniser * t);
extern void close_analyser(struct analyser * a);

extern void read_program(struct analyser * a);

struct generator {

    struct analyser * analyser;
    struct options * options;
    int unreachable;           /* 0 if code can be reached, 1 if current code
                                * is unreachable. */
    int var_number;            /* Number of next variable to use.
*/
    struct str * outbuf;       /* temporary str to store output */
    struct str * declarations; /* str storing variable declarations */
    int next_label;
    int margin;

    const char * failure_string;     /* String to output in case of a failure. */
    struct str * failure_str;  /* This is used by the java generator instead of failure_string */

    int label_used;     /* Keep track of whether the failure label is used. */
    int failure_label;
    int debug_count;

    const char * S[10];  /* strings */
    symbol * B[10];      /* blocks */
    int I[10];           /* integers */
    struct name * V[5];  /* variables */
    symbol * L[5];       /* literals, used in formatted write */

    int line_count;      /* counts number of lines output */
    int line_labelled;   /* in ANSI C, will need extra ';' if it is a block end */
    int literalstring_count;
    int keep_count;      /* used to number keep/restore pairs to avoid compiler warnings
                            about shadowed variables */
};

/* Settings gathered by read_options() in driver.c, plus the output streams
   that main() opens before running a generator. */
struct options {

    /* for the command line: */

    char * output_file;
    char * name;
    FILE * output_c;
    FILE * output_h;
    FILE * output_java;
    byte syntax_tree;
    byte widechars;
    enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS } make_lang;
    char * externals_prefix;
    char * variables_prefix;
    char * runtime_path;
    char * parent_class_name;
    char * package;
    char * string_class;
    char * among_class;
    struct include * includes;
    struct include * includes_end;
    byte utf8;
};

/* Generator for C code. */
extern struct generator * create_generator_c(struct analyser * a, struct options * o);
extern void close_generator_c(struct generator * g);

extern void generate_program_c(struct generator * g);

/* Generator for Java code.
*/ extern struct generator * create_generator_java(struct analyser * a, struct options * o); extern void close_generator_java(struct generator * g); extern void generate_program_java(struct generator * g); snowball_code/compiler/tokeniser.c0000644000175000017500000003357512707117052016011 0ustar domdom #include /* stderr etc */ #include /* malloc free */ #include /* strlen */ #include /* isalpha etc */ #include "header.h" struct system_word { int s_size; /* size of system word */ byte * s; /* pointer to the system word */ int code; /* it's internal code */ }; /* ASCII collating assumed in syswords.c */ #include "syswords.h" static int smaller(int a, int b) { return a < b ? a : b; } extern symbol * get_input(symbol * p) { char * s = b_to_s(p); { FILE * input = fopen(s, "r"); free(s); if (input == 0) return 0; { symbol * u = create_b(STARTSIZE); int size = 0; repeat { int ch = getc(input); if (ch == EOF) break; if (size >= CAPACITY(u)) u = increase_capacity(u, size/2); u[size++] = ch; } fclose(input); SIZE(u) = size; return u; } } } static void error(struct tokeniser * t, char * s1, int n, symbol * p, char * s2) { if (t->error_count == 20) { fprintf(stderr, "... 
etc\n"); exit(1); } fprintf(stderr, "Line %d", t->line_number); if (t->get_depth > 0) fprintf(stderr, " (of included file)"); fprintf(stderr, ": "); unless (s1 == 0) fprintf(stderr, "%s", s1); unless (p == 0) { int i; for (i = 0; i < n; i++) fprintf(stderr, "%c", p[i]); } unless (s2 == 0) fprintf(stderr, "%s", s2); fprintf(stderr, "\n"); t->error_count++; } static void error1(struct tokeniser * t, char * s) { error(t, s, 0,0, 0); } static void error2(struct tokeniser * t, char * s) { error(t, "unexpected end of text after ", 0,0, s); } static int compare_words(int m, symbol * p, int n, byte * q) { unless (m == n) return m - n; { int i; for (i = 0; i < n; i++) { int diff = p[i] - q[i]; unless (diff == 0) return diff; } } return 0; } static int find_word(int n, symbol * p) { int i = 0; int j = vocab->code; repeat { int k = i + (j - i)/2; struct system_word * w = vocab + k; int diff = compare_words(n, p, w->s_size, w->s); if (diff == 0) return w->code; if (diff < 0) j = k; else i = k; if (j - i == 1) break; } return -1; } static int get_number(int n, symbol * p) { int x = 0; int i; for (i = 0; i < n; i++) x = 10*x + p[i] - '0'; return x; } static int eq_s(struct tokeniser * t, char * s) { int l = strlen(s); if (SIZE(t->p) - t->c < l) return false; { int i; for (i = 0; i < l; i++) if (t->p[t->c + i] != s[i]) return false; } t->c += l; return true; } static int white_space(struct tokeniser * t, int ch) { switch (ch) { case '\n': t->line_number++; case '\r': case '\t': case ' ': return true; } return false; } static symbol * find_in_m(struct tokeniser * t, int n, symbol * p) { struct m_pair * q = t->m_pairs; repeat { if (q == 0) return 0; { symbol * name = q->name; if (n == SIZE(name) && memcmp(name, p, n * sizeof(symbol)) == 0) return q->value; } q = q->next; } } static int read_literal_string(struct tokeniser * t, int c) { symbol * p = t->p; int ch; SIZE(t->b) = 0; repeat { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; if (ch == '\n') { error1(t, "string 
not terminated"); return c; } c++; if (ch == t->m_start) { int c0 = c; int newlines = false; /* no newlines as yet */ int black_found = false; /* no printing chars as yet */ repeat { if (c >= SIZE(p)) { error2(t, "'"); return c; } ch = p[c]; c++; if (ch == t->m_end) break; unless (white_space(t, ch)) black_found = true; if (ch == '\n') newlines = true; if (newlines && black_found) { error1(t, "string not terminated"); return c; } } unless (newlines) { int n = c - c0 - 1; /* macro size */ int firstch = p[c0]; symbol * q = find_in_m(t, n, p + c0); if (q == 0) { if (n == 1 && (firstch == '\'' || firstch == t->m_start)) t->b = add_to_b(t->b, 1, p + c0); else error(t, "string macro '", n, p + c0, "' undeclared"); } else t->b = add_to_b(t->b, SIZE(q), q); } } else { if (ch == '\'') return c; t->b = add_to_b(t->b, 1, p + c - 1); } } } static int next_token(struct tokeniser * t) { symbol * p = t->p; int c = t->c; int ch; int code = -1; repeat { if (c >= SIZE(p)) { t->c = c; return -1; } ch = p[c]; if (white_space(t, ch)) { c++; continue; } if (isalpha(ch)) { int c0 = c; while (c < SIZE(p) && (isalnum(p[c]) || p[c] == '_')) c++; code = find_word(c - c0, p + c0); if (code < 0) { t->b = move_to_b(t->b, c - c0, p + c0); code = c_name; } } else if (isdigit(ch)) { int c0 = c; while (c < SIZE(p) && isdigit(p[c])) c++; t->number = get_number(c - c0, p + c0); code = c_number; } else if (ch == '\'') { c = read_literal_string(t, c + 1); code = c_literalstring; } else { int lim = smaller(2, SIZE(p) - c); int i; for (i = lim; i > 0; i--) { code = find_word(i, p + c); if (code >= 0) { c += i; break; } } } if (code >= 0) { t->c = c; return code; } error(t, "'", 1, p + c, "' unknown"); c++; continue; } } static int next_char(struct tokeniser * t) { if (t->c >= SIZE(t->p)) return -1; return t->p[t->c++]; } static int next_real_char(struct tokeniser * t) { repeat { int ch = next_char(t); if (white_space(t, ch)) continue; return ch; } } static void read_chars(struct tokeniser * t) { int ch = 
next_real_char(t); if (ch < 0) { error2(t, "stringdef"); return; } { int c0 = t->c-1; repeat { ch = next_char(t); if (white_space(t, ch) || ch < 0) break; } t->b2 = move_to_b(t->b2, t->c - c0 - 1, t->p + c0); } } static int decimal_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; return -1; } static int hex_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; return -1; } static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { int c = 0; int d = 0; repeat { while (c < SIZE(p) && p[c] == ' ') c++; if (c == SIZE(p)) break; { int number = 0; repeat { int ch = p[c]; if (c == SIZE(p) || ch == ' ') break; if (base == 10) { ch = decimal_to_num(ch); if (ch < 0) { error1(t, "decimal string contains non-digits"); return; } } else { ch = hex_to_num(tolower(ch)); if (ch < 0) { error1(t, "hex string contains non-hex characters"); return; } } number = base * number + ch; c++; } if (t->widechars || t->utf8) { unless (0 <= number && number <= 0xffff) { error1(t, "character values exceed 64K"); return; } } else { unless (0 <= number && number <= 0xff) { error1(t, "character values exceed 256"); return; } } if (t->utf8) d += put_utf8(number, p + d); else p[d++] = number; } } SIZE(p) = d; } extern int read_token(struct tokeniser * t) { symbol * p = t->p; int held = t->token_held; t->token_held = false; if (held) return t->token; repeat { int code = next_token(t); switch (code) { case c_comment1: /* slash-slash comment */ while (t->c < SIZE(p) && p[t->c] != '\n') t->c++; continue; case c_comment2: /* slash-star comment */ repeat { if (t->c >= SIZE(p)) { error1(t, "/* comment not terminated"); t->token = -1; return -1; } if (p[t->c] == '\n') t->line_number++; if (eq_s(t, "*/")) break; t->c++; } continue; case c_stringescapes: { int ch1 = next_real_char(t); int ch2 = next_real_char(t); if (ch2 < 0) { error2(t, "stringescapes"); continue; } if (ch1 == '\'') { error1(t, "first stringescape 
cannot be '"); continue; } t->m_start = ch1; t->m_end = ch2; } continue; case c_stringdef: { int base = 0; read_chars(t); code = read_token(t); if (code == c_hex) { base = 16; code = read_token(t); } else if (code == c_decimal) { base = 10; code = read_token(t); } unless (code == c_literalstring) { error1(t, "string omitted after stringdef"); continue; } if (base > 0) convert_numeric_string(t, t->b, base); { NEW(m_pair, q); q->next = t->m_pairs; q->name = copy_b(t->b2); q->value = copy_b(t->b); t->m_pairs = q; } } continue; case c_get: code = read_token(t); unless (code == c_literalstring) { error1(t, "string omitted after get"); continue; } t->get_depth++; if (t->get_depth > 10) { fprintf(stderr, "get directives go 10 deep. Looping?\n"); exit(1); } { NEW(input, q); symbol * u = get_input(t->b); if (u == 0) { struct include * r = t->includes; until (r == 0) { symbol * b = copy_b(r->b); b = add_to_b(b, SIZE(t->b), t->b); u = get_input(b); lose_b(b); unless (u == 0) break; r = r->next; } } if (u == 0) { error(t, "Can't get '", SIZE(t->b), t->b, "'"); exit(1); } memmove(q, t, sizeof(struct input)); t->next = q; t->p = u; t->c = 0; t->line_number = 1; } p = t->p; continue; case -1: unless (t->next == 0) { lose_b(p); { struct input * q = t->next; memmove(t, q, sizeof(struct input)); p = t->p; FREE(q); } t->get_depth--; continue; } /* drop through */ default: t->previous_token = t->token; t->token = code; return code; } } } extern byte * name_of_token(int code) { int i; for (i = 1; i < vocab->code; i++) if ((vocab + i)->code == code) return (vocab + i)->s; switch (code) { case c_mathassign: return (byte *) "="; case c_name: return (byte *) "name"; case c_number: return (byte *) "number"; case c_literalstring:return (byte *) "literal"; case c_neg: return (byte *) "neg"; case c_grouping: return (byte *) "grouping"; case c_call: return (byte *) "call"; case c_booltest: return (byte *) "Boolean test"; case -2: return (byte *) "start of text"; case -1: return (byte *) "end of 
text"; default: return (byte *) "?"; } } extern struct tokeniser * create_tokeniser(symbol * p) { NEW(tokeniser, t); t->next = 0; t->p = p; t->c = 0; t->line_number = 1; t->b = create_b(0); t->b2 = create_b(0); t->m_start = -1; t->m_pairs = 0; t->get_depth = 0; t->error_count = 0; t->token_held = false; t->token = -2; t->previous_token = -2; return t; } extern void close_tokeniser(struct tokeniser * t) { lose_b(t->b); lose_b(t->b2); { struct m_pair * q = t->m_pairs; until (q == 0) { struct m_pair * q_next = q->next; lose_b(q->name); lose_b(q->value); FREE(q); q = q_next; } } { struct input * q = t->next; until (q == 0) { struct input * q_next = q->next; FREE(q); q = q_next; } } FREE(t); } snowball_code/compiler/syswords.h0000644000175000017500000000700112707117052015671 0ustar domdomstatic struct system_word vocab[80+1] = { { 0, (byte *)"", 80+1}, { 1, (byte *)"$", c_dollar }, { 1, (byte *)"(", c_bra }, { 1, (byte *)")", c_ket }, { 1, (byte *)"*", c_multiply }, { 1, (byte *)"+", c_plus }, { 1, (byte *)"-", c_minus }, { 1, (byte *)"/", c_divide }, { 1, (byte *)"<", c_ls }, { 1, (byte *)"=", c_assign }, { 1, (byte *)">", c_gr }, { 1, (byte *)"?", c_debug }, { 1, (byte *)"[", c_leftslice }, { 1, (byte *)"]", c_rightslice }, { 2, (byte *)"!=", c_ne }, { 2, (byte *)"*=", c_multiplyassign }, { 2, (byte *)"+=", c_plusassign }, { 2, (byte *)"-=", c_minusassign }, { 2, (byte *)"->", c_sliceto }, { 2, (byte *)"/*", c_comment2 }, { 2, (byte *)"//", c_comment1 }, { 2, (byte *)"/=", c_divideassign }, { 2, (byte *)"<+", c_insert }, { 2, (byte *)"<-", c_slicefrom }, { 2, (byte *)"<=", c_le }, { 2, (byte *)"==", c_eq }, { 2, (byte *)"=>", c_assignto }, { 2, (byte *)">=", c_ge }, { 2, (byte *)"as", c_as }, { 2, (byte *)"do", c_do }, { 2, (byte *)"or", c_or }, { 3, (byte *)"and", c_and }, { 3, (byte *)"for", c_for }, { 3, (byte *)"get", c_get }, { 3, (byte *)"hex", c_hex }, { 3, (byte *)"hop", c_hop }, { 3, (byte *)"non", c_non }, { 3, (byte *)"not", c_not }, { 3, (byte *)"set", 
c_set }, { 3, (byte *)"try", c_try }, { 4, (byte *)"fail", c_fail }, { 4, (byte *)"goto", c_goto }, { 4, (byte *)"loop", c_loop }, { 4, (byte *)"next", c_next }, { 4, (byte *)"size", c_size }, { 4, (byte *)"test", c_test }, { 4, (byte *)"true", c_true }, { 5, (byte *)"among", c_among }, { 5, (byte *)"false", c_false }, { 5, (byte *)"limit", c_limit }, { 5, (byte *)"unset", c_unset }, { 6, (byte *)"atmark", c_atmark }, { 6, (byte *)"attach", c_attach }, { 6, (byte *)"cursor", c_cursor }, { 6, (byte *)"define", c_define }, { 6, (byte *)"delete", c_delete }, { 6, (byte *)"gopast", c_gopast }, { 6, (byte *)"insert", c_insert }, { 6, (byte *)"maxint", c_maxint }, { 6, (byte *)"minint", c_minint }, { 6, (byte *)"repeat", c_repeat }, { 6, (byte *)"sizeof", c_sizeof }, { 6, (byte *)"tomark", c_tomark }, { 7, (byte *)"atleast", c_atleast }, { 7, (byte *)"atlimit", c_atlimit }, { 7, (byte *)"decimal", c_decimal }, { 7, (byte *)"reverse", c_reverse }, { 7, (byte *)"setmark", c_setmark }, { 7, (byte *)"strings", c_strings }, { 7, (byte *)"tolimit", c_tolimit }, { 8, (byte *)"booleans", c_booleans }, { 8, (byte *)"integers", c_integers }, { 8, (byte *)"routines", c_routines }, { 8, (byte *)"setlimit", c_setlimit }, { 9, (byte *)"backwards", c_backwards }, { 9, (byte *)"externals", c_externals }, { 9, (byte *)"groupings", c_groupings }, { 9, (byte *)"stringdef", c_stringdef }, { 9, (byte *)"substring", c_substring }, { 12, (byte *)"backwardmode", c_backwardmode }, { 13, (byte *)"stringescapes", c_stringescapes } }; snowball_code/compiler/analyser.c0000644000175000017500000007135012707117052015615 0ustar domdom #include /* main etc */ #include /* exit */ #include /* memmove */ #include "header.h" /* recursive usage: */ static void read_program_(struct analyser * a, int terminator); static struct node * read_C(struct analyser * a); static struct node * C_style(struct analyser * a, char * s, int token); static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); } static 
void print_node_(struct node * p, int n, char * s) { int i; for (i = 0; i < n; i++) printf(i == n - 1 ? s : " "); printf("%s ", name_of_token(p->type)); unless (p->name == 0) report_b(stdout, p->name->b); unless (p->literalstring == 0) { printf("'"); report_b(stdout, p->literalstring); printf("'"); } printf("\n"); unless (p->AE == 0) print_node_(p->AE, n+1, "# "); unless (p->left == 0) print_node_(p->left, n+1, " "); unless (p->right == 0) print_node_(p->right, n, " "); if (p->aux != 0) print_node_(p->aux, n+1, "@ "); } extern void print_program(struct analyser * a) { print_node_(a->program, 0, " "); } static struct node * new_node(struct analyser * a, int type) { NEW(node, p); p->next = a->nodes; a->nodes = p; p->left = 0; p->right = 0; p->aux = 0; p->AE = 0; p->name = 0; p->literalstring = 0; p->mode = a->mode; p->line_number = a->tokeniser->line_number; p->type = type; return p; } static char * name_of_mode(int n) { switch (n) { default: fault(0); case m_backward: return "string backward"; case m_forward: return "string forward"; /* case m_integer: return "integer"; */ } } static char * name_of_type(int n) { switch (n) { default: fault(1); case 's': return "string"; case 'i': return "integer"; case 'r': return "routine"; case 'R': return "routine or grouping"; case 'g': return "grouping"; } } static void count_error(struct analyser * a) { struct tokeniser * t = a->tokeniser; if (t->error_count >= 20) { fprintf(stderr, "... 
etc\n"); exit(1); } t->error_count++; } static void error2(struct analyser * a, int n, int x) { struct tokeniser * t = a->tokeniser; count_error(a); fprintf(stderr, "Line %d", t->line_number); if (t->get_depth > 0) fprintf(stderr, " (of included file)"); fprintf(stderr, ": "); if (n >= 30) report_b(stderr, t->b); switch (n) { case 0: fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; case 3: fprintf(stderr, "in among(...), "); case 1: fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); if (t->token == c_name) { fprintf(stderr, " "); report_b(stderr, t->b); } break; case 2: fprintf(stderr, "string omitted"); break; case 14: fprintf(stderr, "unresolved substring on line %d", x); break; case 15: fprintf(stderr, "%s not allowed inside reverse(...)", name_of_token(t->token)); break; case 16: fprintf(stderr, "empty grouping"); break; case 17: fprintf(stderr, "backwards used when already in this mode"); break; case 18: fprintf(stderr, "empty among(...)"); break; case 19: fprintf(stderr, "two adjacent bracketed expressions in among(...)"); break; case 20: fprintf(stderr, "substring preceded by another substring on line %d", x); break; case 30: fprintf(stderr, " re-declared"); break; case 31: fprintf(stderr, " undeclared"); break; case 32: fprintf(stderr, " declared as %s mode; used as %s mode", name_of_mode(a->mode), name_of_mode(x)); break; case 33: fprintf(stderr, " not of type %s", name_of_type(x)); break; case 34: fprintf(stderr, " not of type string or integer"); break; case 35: fprintf(stderr, " misplaced"); break; case 36: fprintf(stderr, " redefined"); break; case 37: fprintf(stderr, " mis-used as %s mode", name_of_mode(x)); break; default: fprintf(stderr, " error %d", n); break; } if (n <= 13 && t->previous_token > 0) fprintf(stderr, " after %s", name_of_token(t->previous_token)); fprintf(stderr, "\n"); } static void error(struct analyser * a, int n) { error2(a, n, 0); } static void 
error3(struct analyser * a, struct node * p, symbol * b) { count_error(a); fprintf(stderr, "among(...) on line %d has repeated string '", p->line_number); report_b(stderr, b); fprintf(stderr, "'\n"); } static void error4(struct analyser * a, struct name * q) { count_error(a); report_b(stderr, q->b); fprintf(stderr, " undefined\n"); } static void omission_error(struct analyser * a, int n) { a->tokeniser->omission = n; error(a, 0); } static int check_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; if (t->token != code) { omission_error(a, code); return false; } return true; } static int get_token(struct analyser * a, int code) { struct tokeniser * t = a->tokeniser; read_token(t); { int x = check_token(a, code); unless (x) t->token_held = true; return x; } } static struct name * look_for_name(struct analyser * a) { struct name * p = a->names; symbol * q = a->tokeniser->b; repeat { if (p == 0) return 0; { symbol * b = p->b; int n = SIZE(b); if (n == SIZE(q) && memcmp(q, b, n * sizeof(symbol)) == 0) { p->referenced = true; return p; } } p = p->next; } } static struct name * find_name(struct analyser * a) { struct name * p = look_for_name(a); if (p == 0) error(a, 31); return p; } static void check_routine_mode(struct analyser * a, struct name * p, int mode) { if (p->mode < 0) p->mode = mode; else unless (p->mode == mode) error2(a, 37, mode); } static void check_name_type(struct analyser * a, struct name * p, int type) { switch (type) { case 's': if (p->type == t_string) return; break; case 'i': if (p->type == t_integer) return; break; case 'b': if (p->type == t_boolean) return; break; case 'R': if (p->type == t_grouping) return; case 'r': if (p->type == t_routine || p->type == t_external) return; break; case 'g': if (p->type == t_grouping) return; break; } error2(a, 33, type); } static void read_names(struct analyser * a, int type) { struct tokeniser * t = a->tokeniser; unless (get_token(a, c_bra)) return; repeat { if (read_token(t) != c_name) 
break; if (look_for_name(a) != 0) error(a, 30); else { NEW(name, p); p->b = copy_b(t->b); p->type = type; p->mode = -1; /* routines, externals */ p->count = a->name_count[type]; p->referenced = false; p->used = false; p->grouping = 0; p->definition = 0; a->name_count[type] ++; p->next = a->names; a->names = p; } } unless (check_token(a, c_ket)) t->token_held = true; } static symbol * new_literalstring(struct analyser * a) { NEW(literalstring, p); p->b = copy_b(a->tokeniser->b); p->next = a->literalstrings; a->literalstrings = p; return p->b; } static int read_AE_test(struct analyser * a) { struct tokeniser * t = a->tokeniser; switch (read_token(t)) { case c_assign: return c_mathassign; case c_plusassign: case c_minusassign: case c_multiplyassign: case c_divideassign: case c_eq: case c_ne: case c_gr: case c_ge: case c_ls: case c_le: return t->token; default: error(a, 1); t->token_held = true; return c_eq; } } static int binding(int t) { switch (t) { case c_plus: case c_minus: return 1; case c_multiply: case c_divide: return 2; default: return -2; } } static void name_to_node(struct analyser * a, struct node * p, int type) { struct name * q = find_name(a); unless (q == 0) { check_name_type(a, q, type); q->used = true; } p->name = q; } static struct node * read_AE(struct analyser * a, int B) { struct tokeniser * t = a->tokeniser; struct node * p; struct node * q; switch (read_token(t)) { case c_minus: /* monadic */ p = new_node(a, c_neg); p->right = read_AE(a, 100); break; case c_bra: p = read_AE(a, 0); get_token(a, c_ket); break; case c_name: p = new_node(a, c_name); name_to_node(a, p, 'i'); break; case c_maxint: case c_minint: case c_cursor: case c_limit: case c_size: p = new_node(a, t->token); break; case c_number: p = new_node(a, c_number); p->number = t->number; break; case c_sizeof: p = C_style(a, "s", c_sizeof); break; default: error(a, 1); t->token_held = true; return 0; } repeat { int token = read_token(t); int b = binding(token); unless (binding(token) > B) 
{ t->token_held = true; return p; } q = new_node(a, token); q->left = p; q->right = read_AE(a, b); p = q; } } static struct node * read_C_connection(struct analyser * a, struct node * q, int op) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, op); struct node * p_end = q; p->left = q; repeat { q = read_C(a); p_end->right = q; p_end = q; if (read_token(t) != op) { t->token_held = true; break; } } return p; } static struct node * read_C_list(struct analyser * a) { struct tokeniser * t = a->tokeniser; struct node * p = new_node(a, c_bra); struct node * p_end = 0; repeat { int token = read_token(t); if (token == c_ket) return p; if (token < 0) { omission_error(a, c_ket); return p; } t->token_held = true; { struct node * q = read_C(a); repeat { token = read_token(t); if (token != c_and && token != c_or) { t->token_held = true; break; } q = read_C_connection(a, q, token); } if (p_end == 0) p->left = q; else p_end->right = q; p_end = q; } } } static struct node * C_style(struct analyser * a, char * s, int token) { int i; struct node * p = new_node(a, token); for (i = 0; s[i] != 0; i++) switch(s[i]) { case 'C': p->left = read_C(a); continue; case 'D': p->aux = read_C(a); continue; case 'A': p->AE = read_AE(a, 0); continue; case 'f': get_token(a, c_for); continue; case 'S': { int str_token = read_token(a->tokeniser); if (str_token == c_name) name_to_node(a, p, 's'); else if (str_token == c_literalstring) p->literalstring = new_literalstring(a); else error(a, 2); } continue; case 'b': case 's': case 'i': if (get_token(a, c_name)) name_to_node(a, p, s[i]); continue; } return p; } static struct node * read_literalstring(struct analyser * a) { struct node * p = new_node(a, c_literalstring); p->literalstring = new_literalstring(a); return p; } static void reverse_b(symbol * b) { int i = 0; int j = SIZE(b) - 1; until (i >= j) { int ch1 = b[i]; int ch2 = b[j]; b[i++] = ch2; b[j--] = ch1; } } static int compare_amongvec(const void *pv, const void *qv) { const 
struct amongvec * p = (const struct amongvec*)pv;
    /* qsort() comparison: order entries symbol by symbol; when one string is
       a prefix of the other the shorter one sorts first */
    const struct amongvec * q = (const struct amongvec*)qv;
    symbol * b_p = p->b; int p_size = p->size;
    symbol * b_q = q->b; int q_size = q->size;
    int smaller_size = p_size < q_size ? p_size : q_size;
    int i;
    for (i = 0; i < smaller_size; i++)
        if (b_p[i] != b_q[i]) return b_p[i] - b_q[i];
    return p_size - q_size;
}

/* Build the among table for the among node p (p->number literal strings)
 * and append it to a->amongs.  `substring` is the pending substring node,
 * or 0; when present its mode decides the direction.  Exits with "oh!" if
 * the number of literals found does not match p->number. */
static void make_among(struct analyser * a, struct node * p, struct node * substring) {

    NEW(among, x);
    NEWVEC(amongvec, v, p->number);
    struct node * q = p->left;
    struct amongvec * w0 = v;   /* first entry still waiting for a command */
    struct amongvec * w1 = v;   /* next free entry */
    int result = 1;             /* number given to the next command */

    int direction = substring != 0 ? substring->mode : p->mode;
    int backward = direction == m_backward;

    if (a->amongs == 0) a->amongs = x; else a->amongs_end->next = x;
    a->amongs_end = x;
    x->next = 0;
    x->b = v;
    x->number = a->among_count++;
    x->starter = 0;

    /* a leading bracketed command is kept aside as the starter */
    if (q->type == c_bra) { x->starter = q; q = q->right; }

    until (q == 0) {
        if (q->type == c_literalstring) {
            symbol * b = q->literalstring;
            w1->b = b;           /* pointer to case string */
            w1->p = 0;           /* pointer to corresponding case expression */
            w1->size = SIZE(b);  /* number of characters in string */
            w1->i = -1;          /* index of longest substring */
            w1->result = -1;     /* number of corresponding case expression */
            w1->function = q->left == 0 ? 0 : q->left->name;
            unless (w1->function == 0)
                check_routine_mode(a, w1->function, direction);
            w1++;
        }
        else
        if (q->left == 0)  /* empty command: () */
            w0 = w1;
        else {
            /* give this command to every string read since the last one */
            until (w0 == w1) {
                w0->p = q;
                w0->result = result;
                w0++;
            }
            result++;
        }
        q = q->right;
    }
    unless (w1-v == p->number) { fprintf(stderr, "oh! %d %d\n", (int)(w1-v), p->number); exit(1); }
    /* in backward mode the strings are sorted and compared reversed, then
       put back in their original order afterwards */
    if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);
    qsort(v, w1 - v, sizeof(struct amongvec), compare_amongvec);

    /* the following loop is O(n squared) */
    for (w0 = w1 - 1; w0 >= v; w0--) {
        symbol * b = w0->b;
        int size = w0->size;
        struct amongvec * w;

        for (w = w0 - 1; w >= v; w--) {
            if (w->size < size && memcmp(w->b, b, w->size * sizeof(symbol)) == 0) {
                w0->i = w - v;  /* fill in index of longest substring */
                break;
            }
        }
    }
    if (backward) for (w0 = v; w0 < w1; w0++) reverse_b(w0->b);

    /* after sorting, equal strings are adjacent */
    for (w0 = v; w0 < w1 - 1; w0++)
        if (w0->size == (w0 + 1)->size &&
            memcmp(w0->b, (w0 + 1)->b, w0->size * sizeof(symbol)) == 0) error3(a, p, w0->b);

    x->literalstring_count = p->number;
    x->command_count = result - 1;
    p->among = x;

    x->substring = substring;
    if (substring != 0) substring->among = x;
    unless (x->command_count == 0 && x->starter == 0) a->amongvar_needed = true;
}

/* Read among(...): literal strings, each optionally followed by a routine
 * name, interleaved with bracketed commands.  Takes over the pending
 * substring of the analyser.  The among table is built only when no error
 * has been counted so far. */
static struct node * read_among(struct analyser * a) {
    struct tokeniser * t = a->tokeniser;
    struct node * p = new_node(a, c_among);
    struct node * p_end = 0;
    int previous_token = -1;
    struct node * substring = a->substring;

    a->substring = 0;
    p->number = 0; /* counts the number of literals */
    unless (get_token(a, c_bra)) return p;
    repeat {
        struct node * q;
        int token = read_token(t);
        switch (token) {
            case c_literalstring:
                q = read_literalstring(a);
                if (read_token(t) == c_name) {
                    struct node * r = new_node(a, c_name);
                    name_to_node(a, r, 'r');
                    q->left = r;
                }
                else t->token_held = true;
                p->number++; break;
            case c_bra:
                if (previous_token == c_bra) error(a, 19);
                q = read_C_list(a); break;
            default:
                error(a, 3);
                /* fall through: an unexpected token also ends the among */
            case c_ket:
                if (p->number == 0) error(a, 18);
                if (t->error_count == 0) make_among(a, p, substring);
                return p;
        }
        previous_token = token;
        if (p_end == 0) p->left = q; else p_end->right = q;
        p_end = q;
    }
}

static struct node * read_substring(struct analyser * a) {

    struct node * p = new_node(a, c_substring);
    if (a->substring != 0) error2(a, 20, a->substring->line_number);
a->substring = p; return p; } static void check_modifyable(struct analyser * a) { unless (a->modifyable) error(a, 15); } static struct node * read_C(struct analyser * a) { struct tokeniser * t = a->tokeniser; int token = read_token(t); switch (token) { case c_bra: return read_C_list(a); case c_backwards: { int mode = a->mode; if (a->mode == m_backward) error(a, 17); else a->mode = m_backward; { struct node * p = C_style(a, "C", token); a->mode = mode; return p; } } case c_reverse: { int mode = a->mode; int modifyable = a->modifyable; a->modifyable = false; a->mode = mode == m_forward ? m_backward : m_forward; { struct node * p = C_style(a, "C", token); a->mode = mode; a->modifyable = modifyable; return p; } } case c_not: case c_try: case c_fail: case c_test: case c_do: case c_goto: case c_gopast: case c_repeat: return C_style(a, "C", token); case c_loop: case c_atleast: return C_style(a, "AC", token); case c_setmark: return C_style(a, "i", token); case c_tomark: case c_atmark: case c_hop: return C_style(a, "A", token); case c_delete: check_modifyable(a); case c_next: case c_tolimit: case c_atlimit: case c_leftslice: case c_rightslice: case c_true: case c_false: case c_debug: return C_style(a, "", token); case c_assignto: case c_sliceto: check_modifyable(a); return C_style(a, "s", token); case c_assign: case c_insert: case c_attach: case c_slicefrom: check_modifyable(a); return C_style(a, "S", token); case c_setlimit: return C_style(a, "CfD", token); case c_set: case c_unset: return C_style(a, "b", token); case c_dollar: get_token(a, c_name); { struct node * p; struct name * q = find_name(a); int mode = a->mode; int modifyable = a->modifyable; switch (q ? 
q->type : t_string) /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */ { default: error(a, 34); case t_string: a->mode = m_forward; a->modifyable = true; p = new_node(a, c_dollar); p->left = read_C(a); break; case t_integer: /* a->mode = m_integer; */ p = new_node(a, read_AE_test(a)); p->AE = read_AE(a, 0); break; } p->name = q; a->mode = mode; a->modifyable = modifyable; return p; } case c_name: { struct name * q = find_name(a); struct node * p = new_node(a, c_name); unless (q == 0) { q->used = true; switch (q->type) { case t_boolean: p->type = c_booltest; break; case t_integer: error(a, 35); /* integer name misplaced */ case t_string: break; case t_routine: case t_external: p->type = c_call; check_routine_mode(a, q, a->mode); break; case t_grouping: p->type = c_grouping; break; } } p->name = q; return p; } case c_non: { struct node * p = new_node(a, token); read_token(t); if (t->token == c_minus) read_token(t); unless (check_token(a, c_name)) { omission_error(a, c_name); return p; } name_to_node(a, p, 'g'); return p; } case c_literalstring: return read_literalstring(a); case c_among: return read_among(a); case c_substring: return read_substring(a); default: error(a, 1); return 0; } } static int next_symbol(symbol * p, symbol * W, int utf8) { if (utf8) { int ch; int j = get_utf8(p, & ch); W[0] = ch; return j; } else { W[0] = p[0]; return 1; } } static symbol * alter_grouping(symbol * p, symbol * q, int style, int utf8) { int j = 0; symbol W[1]; int width; if (style == c_plus) { while (j < SIZE(q)) { width = next_symbol(q + j, W, utf8); p = add_to_b(p, 1, W); j += width; } } else { while (j < SIZE(q)) { int i; width = next_symbol(q + j, W, utf8); for (i = 0; i < SIZE(p); i++) { if (p[i] == W[0]) { memmove(p + i, p + i + 1, (SIZE(p) - i - 1) * sizeof(symbol)); SIZE(p)--; } } j += width; } } return p; } static void read_define_grouping(struct analyser * a, struct name * q) { struct tokeniser * t = a->tokeniser; int style = c_plus; { NEW(grouping, p); if 
(a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; q->grouping = p; p->next = 0; p->name = q; p->number = q->count; p->b = create_b(0); repeat { switch (read_token(t)) { case c_name: { struct name * r = find_name(a); unless (r == 0) { check_name_type(a, r, 'g'); p->b = alter_grouping(p->b, r->grouping->b, style, false); } } break; case c_literalstring: p->b = alter_grouping(p->b, t->b, style, a->utf8); break; default: error(a, 1); return; } switch (read_token(t)) { case c_plus: case c_minus: style = t->token; break; default: goto label0; } } label0: { int i; int max = 0; int min = 1<<16; for (i = 0; i < SIZE(p->b); i++) { if (p->b[i] > max) max = p->b[i]; if (p->b[i] < min) min = p->b[i]; } p->largest_ch = max; p->smallest_ch = min; if (min == 1<<16) error(a, 16); } t->token_held = true; return; } } static void read_define_routine(struct analyser * a, struct name * q) { struct node * p = new_node(a, c_define); a->amongvar_needed = false; unless (q == 0) { check_name_type(a, q, 'R'); if (q->definition != 0) error(a, 36); if (q->mode < 0) q->mode = a->mode; else if (q->mode != a->mode) error2(a, 32, q->mode); } p->name = q; if (a->program == 0) a->program = p; else a->program_end->right = p; a->program_end = p; get_token(a, c_as); p->left = read_C(a); unless (q == 0) q->definition = p->left; if (a->substring != 0) { error2(a, 14, a->substring->line_number); a->substring = 0; } p->amongvar_needed = a->amongvar_needed; } static void read_define(struct analyser * a) { unless (get_token(a, c_name)) return; { struct name * q = find_name(a); if (q != 0 && q->type == t_grouping) read_define_grouping(a, q); else read_define_routine(a, q); } } static void read_backwardmode(struct analyser * a) { int mode = a->mode; a->mode = m_backward; if (get_token(a, c_bra)) { read_program_(a, c_ket); check_token(a, c_ket); } a->mode = mode; } static void read_program_(struct analyser * a, int terminator) { struct tokeniser * t = a->tokeniser; 
repeat { switch (read_token(t)) { case c_strings: read_names(a, t_string); break; case c_booleans: read_names(a, t_boolean); break; case c_integers: read_names(a, t_integer); break; case c_routines: read_names(a, t_routine); break; case c_externals: read_names(a, t_external); break; case c_groupings: read_names(a, t_grouping); break; case c_define: read_define(a); break; case c_backwardmode:read_backwardmode(a); break; case c_ket: if (terminator == c_ket) return; default: error(a, 1); break; case -1: unless (terminator < 0) omission_error(a, c_ket); return; } } } extern void read_program(struct analyser * a) { read_program_(a, -1); { struct name * q = a->names; until (q == 0) { switch(q->type) { case t_external: case t_routine: if (q->used && q->definition == 0) error4(a, q); break; case t_grouping: if (q->used && q->grouping == 0) error4(a, q); break; } q = q->next; } } if (a->tokeniser->error_count == 0) { struct name * q = a->names; int warned = false; until (q == 0) { unless (q->referenced) { unless (warned) { fprintf(stderr, "Declared but not used:"); warned = true; } fprintf(stderr, " "); report_b(stderr, q->b); } q = q->next; } if (warned) fprintf(stderr, "\n"); q = a->names; warned = false; until (q == 0) { if (! 
q->used && (q->type == t_routine || q->type == t_grouping)) { unless (warned) { fprintf(stderr, "Declared and defined but not used:"); warned = true; } fprintf(stderr, " "); report_b(stderr, q->b); } q = q->next; } if (warned) fprintf(stderr, "\n"); } } extern struct analyser * create_analyser(struct tokeniser * t) { NEW(analyser, a); a->tokeniser = t; a->nodes = 0; a->names = 0; a->literalstrings = 0; a->program = 0; a->amongs = 0; a->among_count = 0; a->groupings = 0; a->mode = m_forward; a->modifyable = true; { int i; for (i = 0; i < t_size; i++) a->name_count[i] = 0; } a->substring = 0; return a; } extern void close_analyser(struct analyser * a) { { struct node * q = a->nodes; until (q == 0) { struct node * q_next = q->next; FREE(q); q = q_next; } } { struct name * q = a->names; until (q == 0) { struct name * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct literalstring * q = a->literalstrings; until (q == 0) { struct literalstring * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } { struct among * q = a->amongs; until (q == 0) { struct among * q_next = q->next; FREE(q->b); FREE(q); q = q_next; } } { struct grouping * q = a->groupings; until (q == 0) { struct grouping * q_next = q->next; lose_b(q->b); FREE(q); q = q_next; } } FREE(a); } snowball_code/compiler/generator.c0000644000175000017500000012162012707117052015761 0ustar domdom #include /* for INT_MAX */ #include /* for fprintf etc */ #include /* for free etc */ #include /* for strlen */ #include "header.h" /* Define this to get warning messages when optimisations can't be used. 
*/ /* #define OPTIMISATION_WARNINGS */ /* recursive use: */ static void generate(struct generator * g, struct node * p); enum special_labels { x_return = -1 }; static int new_label(struct generator * g) { return g->next_label++; } /* Output routines */ static void output_str(FILE * outfile, struct str * str) { char * s = b_to_s(str_data(str)); fprintf(outfile, "%s", s); free(s); } static void wch(struct generator * g, int ch) { str_append_ch(g->outbuf, ch); /* character */ } static void wnl(struct generator * g) { str_append_ch(g->outbuf, '\n'); /* newline */ g->line_count++; } static void ws(struct generator * g, const char * s) { str_append_string(g->outbuf, s); /* string */ } static void wi(struct generator * g, int i) { str_append_int(g->outbuf, i); /* integer */ } static void wh_ch(struct generator * g, int i) { str_append_ch(g->outbuf, "0123456789ABCDEF"[i & 0xF]); /* hexchar */ } static void wh(struct generator * g, int i) { if (i >> 4) wh(g, i >> 4); wh_ch(g, i); /* hex integer */ } static void wi3(struct generator * g, int i) { if (i < 100) wch(g, ' '); if (i < 10) wch(g, ' '); wi(g, i); /* integer (width 3) */ } static void wvn(struct generator * g, struct name * p) { /* variable name */ int ch = "SBIrxg"[p->type]; switch (p->type) { case t_string: case t_boolean: case t_integer: wch(g, ch); wch(g, '['); wi(g, p->count); wch(g, ']'); return; case t_external: ws(g, g->options->externals_prefix); break; default: wch(g, ch); wch(g, '_'); } str_append_b(g->outbuf, p->b); } static void wv(struct generator * g, struct name * p) { /* reference to variable */ if (p->type < t_routine) ws(g, "z->"); wvn(g, p); } static void wlitarray(struct generator * g, symbol * p) { /* write literal array */ ws(g, "{ "); { int i; for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch < 127) { wch(g, '\''); switch (ch) { case '\'': case '\\': wch(g, '\\'); default: wch(g, ch); } wch(g, '\''); } else { wch(g, '0'); wch(g, 'x'); wh(g, ch); } if (i < SIZE(p) - 1) ws(g, ", 
"); } } ws(g, " }"); } static void wlitref(struct generator * g, symbol * p) { /* write ref to literal array */ if (SIZE(p) == 0) ws(g, "0"); else { struct str * s = g->outbuf; g->outbuf = g->declarations; ws(g, "static const symbol s_"); wi(g, g->literalstring_count); ws(g, "[] = "); wlitarray(g, p); ws(g, ";\n"); g->outbuf = s; ws(g, "s_"); wi(g, g->literalstring_count); g->literalstring_count++; } } static void wm(struct generator * g) { /* margin */ int i; for (i = 0; i < g->margin; i++) ws(g, " "); } static void wc(struct generator * g, struct node * p) { /* comment */ ws(g, " /* "); ws(g, (char *) name_of_token(p->type)); unless (p->name == 0) { ws(g, " "); str_append_b(g->outbuf, p->name->b); } ws(g, ", line "); wi(g, p->line_number); ws(g, " */"); wnl(g); } static void wms(struct generator * g, const char * s) { wm(g); ws(g, s); } /* margin + string */ static void wbs(struct generator * g) { /* block start */ wms(g, "{ "); g->margin++; } static void wbe(struct generator * g) { /* block end */ if (g->line_labelled == g->line_count) { wms(g, ";"); wnl(g); } g->margin--; wms(g, "}"); wnl(g); } static void wk(struct generator * g, struct node * p) { /* keep c */ ++g->keep_count; if (p->mode == m_forward) { ws(g, "int c"); wi(g, g->keep_count); ws(g, " = z->c;"); } else { ws(g, "int m"); wi(g, g->keep_count); ws(g, " = z->l - z->c; (void)m"); wi(g, g->keep_count); ws(g, ";"); } } static void wrestore(struct generator * g, struct node * p, int keep_token) { /* restore c */ if (p->mode == m_forward) { ws(g, "z->c = c"); } else { ws(g, "z->c = z->l - m"); } wi(g, keep_token); ws(g, ";"); } static void winc(struct generator * g, struct node * p) { /* increment c */ ws(g, p->mode == m_forward ? 
"z->c++;" : "z->c--;"); } static void wsetl(struct generator * g, int n) { g->margin--; wms(g, "lab"); wi(g, n); wch(g, ':'); wnl(g); g->line_labelled = g->line_count; g->margin++; } static void wgotol(struct generator * g, int n) { wms(g, "goto lab"); wi(g, n); wch(g, ';'); wnl(g); } static void wf(struct generator * g) { /* fail */ if (g->failure_string != 0) { ws(g, "{ "); ws(g, g->failure_string); wch(g, ' '); } switch (g->failure_label) { case x_return: ws(g, "return 0;"); break; default: ws(g, "goto lab"); wi(g, g->failure_label); wch(g, ';'); g->label_used = 1; } if (g->failure_string != 0) ws(g, " }"); } static void wlim(struct generator * g, struct node * p) { /* if at limit fail */ ws(g, p->mode == m_forward ? "if (z->c >= z->l) " : "if (z->c <= z->lb) "); wf(g); } static void wp(struct generator * g, const char * s, struct node * p) { /* formatted write */ int i = 0; int l = strlen(s); until (i >= l) { int ch = s[i++]; if (ch != '~') wch(g, ch); else switch(s[i++]) { default: wch(g, s[i - 1]); continue; case 'C': wc(g, p); continue; case 'k': wk(g, p); continue; case 'K': /* keep for c_test */ ws(g, p->mode == m_forward ? "int c_test = z->c;" : "int m_test = z->l - z->c;"); continue; case 'R': /* restore for c_test */ ws(g, p->mode == m_forward ? 
"z->c = c_test;" : "z->c = z->l - m_test;"); continue; case 'i': winc(g, p); continue; case 'l': wlim(g, p); continue; case 'f': wf(g); continue; case 'M': wm(g); continue; case 'N': wnl(g); continue; case '{': wbs(g); continue; case '}': wbe(g); continue; case 'S': ws(g, g->S[s[i++] - '0']); continue; case 'I': wi(g, g->I[s[i++] - '0']); continue; case 'J': wi3(g, g->I[s[i++] - '0']); continue; case 'V': wv(g, g->V[s[i++] - '0']); continue; case 'W': wvn(g, g->V[s[i++] - '0']); continue; case 'L': wlitref(g, g->L[s[i++] - '0']); continue; case 'A': wlitarray(g, g->L[s[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case '$': /* insert_s, insert_v etc */ wch(g, p->literalstring == 0 ? 'v' : 's'); continue; case 'p': ws(g, g->options->externals_prefix); continue; } } } static void w(struct generator * g, const char * s) { wp(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { char * s; switch (p->type) { case c_name: wv(g, p->name); break; case c_number: wi(g, p->number); break; case c_maxint: ws(g, "MAXINT"); break; case c_minint: ws(g, "MININT"); break; case c_neg: wch(g, '-'); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: wch(g, '('); generate_AE(g, p->left); ws(g, s); generate_AE(g, p->right); wch(g, ')'); break; case c_sizeof: g->V[0] = p->name; w(g, "SIZE(~V0)"); break; case c_cursor: w(g, "z->c"); break; case c_limit: w(g, p->mode == m_forward ? "z->l" : "z->lb"); break; case c_size: w(g, "SIZE(z->p)"); break; } } /* K_needed() tests to see if we really need to keep c. Not true when the the command does not touch the cursor. This and repeat_score() could be elaborated almost indefinitely. 
*/

static int K_needed(struct generator * g, struct node * p) {
    /* walks p and its right siblings; true as soon as one of them is not in
       the list of commands below */
    until (p == 0) {
        switch (p->type) {
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_mathassign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gr:
            case c_ge:
            case c_ls:
            case c_le:
            case c_sliceto:
            case c_true:
            case c_false:
            case c_debug:
                break;

            case c_call:
                /* a call needs the cursor kept if the routine's body does */
                if (K_needed(g, p->name->definition)) return true;
                break;

            case c_bra:
                if (K_needed(g, p->left)) return true;
                break;

            default: return true;
        }
        p = p->right;
    }
    return false;
}

/* Score p and its right siblings: the commands in the first group add
 * nothing, calls and bracketed lists add the score of their body, the
 * commands in the last group add 1, and any other command sets the score
 * to 2. */
static int repeat_score(struct generator * g, struct node * p) {
    int score = 0;
    until (p == 0) {
        switch (p->type) {
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_mathassign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gr:
            case c_ge:
            case c_ls:
            case c_le:
            case c_sliceto:   /* case c_not: must not be included here! */
            case c_debug:
                break;

            case c_call:
                score += repeat_score(g, p->name->definition);
                break;

            case c_bra:
                score += repeat_score(g, p->left);
                break;

            case c_name:
            case c_literalstring:
            case c_next:
            case c_grouping:
            case c_non:
            case c_hop:
                score = score + 1;
                break;

            default:
                score = 2;
                break;
        }
        p = p->right;
    }
    return score;
}

/* tests if an expression requires cursor reinstatement in a repeat */

static int repeat_restore(struct generator * g, struct node * p) {
    return repeat_score(g, p) >= 2;
}

/* Generate each command of the bracketed list p in turn. */
static void generate_bra(struct generator * g, struct node * p) {
    p = p->left;
    until (p == 0) { generate(g, p); p = p->right; }
}

/* Generate `and`: every operand is generated in turn; when the cursor has
 * to be kept it is restored between operands (not after the last one). */
static void generate_and(struct generator * g, struct node * p) {
    int keep_c = 0;
    if (K_needed(g, p->left)) {
        wp(g, "~{~k~C", p);
        keep_c = g->keep_count;
    } else {
        wp(g, "~M~C", p);
    }
    p = p->left;
    until (p == 0) {
        generate(g, p);
        if (keep_c && p->right != 0) {
            w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
        }
        p = p->right;
    }
    if (keep_c) w(g, "~}");
}

/* Generate `or`: each operand but the last gets a failure label of its own
 * and is followed by a jump to out_lab; the last operand is generated with
 * the caller's failure label and string reinstated. */
static void generate_or(struct generator * g, struct node * p) {
    int keep_c = 0;

    /* the caller's failure state, reinstated before the last operand */
    int used = g->label_used;
    int a0 = g->failure_label;
    const char * a1 = g->failure_string;

    int out_lab = new_label(g);

    if (K_needed(g, p->left)) {
        wp(g, "~{~k~C", p);
        keep_c = g->keep_count;
    } else {
        wp(g, "~M~C", p);
    }
    p = p->left;
    g->failure_string = 0;
    until (p->right == 0) {
        g->failure_label = new_label(g);
        g->label_used = 0;
        generate(g, p);
        wgotol(g, out_lab);
        if (g->label_used)
            wsetl(g, g->failure_label);
        if (keep_c) {
            w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N");
        }
        p = p->right;
    }
    g->label_used = used;
    g->failure_label = a0;
    g->failure_string = a1;

    generate(g, p);
    if (keep_c) w(g, "~}");
    wsetl(g, out_lab);
}

/* Generate `backwards`: the emitted code sets z->lb to the cursor and moves
 * the cursor to z->l before the body, and sets the cursor back to z->lb
 * after it. */
static void generate_backwards(struct generator * g, struct node * p) {
    wp(g,"~Mz->lb = z->c; z->c = z->l;~C~N", p);
    generate(g, p->left);
    w(g, "~Mz->c = z->lb;~N");
}

/* Generate `not`: the body is generated with a fresh failure label; the
 * caller's failure is emitted straight after the body, then the fresh
 * label, then any cursor restore. */
static void generate_not(struct generator * g, struct node * p) {
    int keep_c = 0;

    /* the caller's failure state, reinstated after the body */
    int used = g->label_used;
    int a0 = g->failure_label;
    const char * a1 = g->failure_string;

    if (K_needed(g, p->left)) {
        wp(g, "~{~k~C", p);
        keep_c = g->keep_count;
    } else {
        wp(g, "~M~C", p);
    }

    g->failure_label = new_label(g);
    g->label_used = 0;
    g->failure_string = 0;
    generate(g, p->left);

    {
        int l = g->failure_label;
        int u = g->label_used;

        g->label_used = used;
        g->failure_label = a0;
        g->failure_string = a1;

        w(g, "~M~f~N");
        if (u)
            wsetl(g, l);
    }
    if (keep_c) {
        w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}");
    }
}

/* Generate `try`: the body gets a fresh failure label and, when the cursor
 * has to be kept, a failure string that restores it. */
static void generate_try(struct generator * g, struct node * p) {
    int keep_c = K_needed(g, p->left);

    if (keep_c) {
        if (p->mode == m_forward) {
            wp(g, "~{int c_keep = z->c;~C", p);
            g->failure_string = "z->c = c_keep;";
        } else {
            wp(g, "~{int m_keep = z->l - z->c;/* (void) m_keep;*/~C", p);
            g->failure_string = "z->c = z->l - m_keep;";
        }
    } else {
        wp(g, "~M~C", p);
        g->failure_string = 0;
    }

    g->failure_label = new_label(g);
    g->label_used = 0;
    generate(g, p->left);

    if (g->label_used)
        wsetl(g, g->failure_label);

    if (keep_c) w(g, "~}");
}

static void generate_set(struct generator * g, struct
node * p) { g->V[0] = p->name; wp(g, "~M~V0 = 1;~C", p); } static void generate_unset(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~M~V0 = 0;~C", p); } static void generate_fail(struct generator * g, struct node * p) { generate(g, p->left); wp(g, "~M~f~C", p); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { int keep_c = K_needed(g, p->left); if (keep_c) wp(g, "~{~K~C", p); else wp(g, "~M~C", p); generate(g, p->left); if (keep_c) wp(g, "~M~R~N" "~}", p); } static void generate_do(struct generator * g, struct node * p) { int keep_c = 0; if (K_needed(g, p->left)) { wp(g, "~{~k~C", p); keep_c = g->keep_count; } else { wp(g, "~M~C", p); } g->failure_label = new_label(g); g->label_used = 0; g->failure_string = 0; generate(g, p->left); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N~}"); } } static void generate_next(struct generator * g, struct node * p) { if (g->options->utf8) { if (p->mode == m_forward) w(g, "~{int ret = skip_utf8(z->p, z->c, 0, z->l, 1"); else w(g, "~{int ret = skip_utf8(z->p, z->c, z->lb, 0, -1"); wp(g, ");~N" "~Mif (ret < 0) ~f~N" "~Mz->c = ret;~C" "~}", p); } else wp(g, "~M~l~N" "~M~i~C", p); } static void generate_GO_grouping(struct generator * g, struct node * p, int is_goto, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; g->S[2] = g->options->utf8 ? 
"_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (is_goto) { wp(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1) < 0) ~f /* goto */~C", p); } else { wp(g, "~{ /* gopast */~C" "~Mint ret = ~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 1);~N" "~Mif (ret < 0) ~f~N", p); if (p->mode == m_forward) w(g, "~Mz->c += ret;~N"); else w(g, "~Mz->c -= ret;~N"); w(g, "~}"); } } static void generate_GO(struct generator * g, struct node * p, int style) { int keep_c = 0; int used = g->label_used; int a0 = g->failure_label; const char * a1 = g->failure_string; if (p->left->type == c_grouping || p->left->type == c_non) { /* Special case for "goto" or "gopast" when used on a grouping or an * inverted grouping - the movement of c by the matching action is * exactly what we want! */ #ifdef OPTIMISATION_WARNINGS printf("Optimising %s %s\n", style ? "goto" : "gopast", p->left->type == c_non ? "non" : "grouping"); #endif generate_GO_grouping(g, p->left, style, p->left->type == c_non); return; } w(g, "~Mwhile(1) {"); wp(g, "~C~+", p); if (style == 1 || repeat_restore(g, p->left)) { wp(g, "~M~k~N", p); keep_c = g->keep_count; } g->failure_label = new_label(g); g->label_used = 0; generate(g, p->left); if (style == 1) { /* include for goto; omit for gopast */ w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } w(g, "~Mbreak;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } g->label_used = used; g->failure_label = a0; g->failure_string = a1; /* wp(g, "~M~l~N" "~M~i~N", p); */ generate_next(g, p); w(g, "~}"); } static void generate_loop(struct generator * g, struct node * p) { w(g, "~{int i; for (i = "); generate_AE(g, p->AE); wp(g, "; i > 0; i--)~C" "~{", p); generate(g, p->left); w(g, "~}" "~}"); } static void generate_repeat(struct generator * g, struct node * p, int atleast_case) { int keep_c = 0; wp(g, "~Mwhile(1) {~C~+", p); if (repeat_restore(g, p->left)) { wp(g, "~M~k~N", p); keep_c = 
g->keep_count; } g->failure_label = new_label(g); g->label_used = 0; g->failure_string = 0; generate(g, p->left); if (atleast_case) w(g, "~Mi--;~N"); w(g, "~Mcontinue;~N"); if (g->label_used) wsetl(g, g->failure_label); if (keep_c) { w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); } w(g, "~Mbreak;~N" "~}"); } static void generate_atleast(struct generator * g, struct node * p) { w(g, "~{int i = "); generate_AE(g, p->AE); w(g, ";~N"); { int used = g->label_used; int a0 = g->failure_label; const char * a1 = g->failure_string; generate_repeat(g, p, true); g->label_used = used; g->failure_label = a0; g->failure_string = a1; } w(g, "~Mif (i > 0) ~f~N" "~}"); } static void generate_setmark(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~M~V0 = z->c;~C", p); } static void generate_tomark(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (z->c ~S0 "); generate_AE(g, p->AE); w(g, ") ~f~N"); w(g, "~Mz->c = "); generate_AE(g, p->AE); wp(g, ";~C", p); } static void generate_atmark(struct generator * g, struct node * p) { w(g, "~Mif (z->c != "); generate_AE(g, p->AE); wp(g, ") ~f~C", p); } static void generate_hop(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "+" : "-"; g->S[1] = p->mode == m_forward ? "0" : "z->lb"; if (g->options->utf8) { w(g, "~{int ret = skip_utf8(z->p, z->c, ~S1, z->l, ~S0 "); generate_AE(g, p->AE); w(g, ");~N"); w(g, "~Mif (ret < 0) ~f~N"); } else { w(g, "~{int ret = z->c ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); w(g, "~Mif (~S1 > ret || ret > z->l) ~f~N"); } wp(g, "~Mz->c = ret;~C" "~}", p); } static void generate_delete(struct generator * g, struct node * p) { wp(g, "~{int ret = slice_del(z);~C", p); wp(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_tolimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? 
"" : "b"; wp(g, "~Mz->c = z->l~S0;~C", p); } static void generate_atlimit(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "b"; g->S[1] = p->mode == m_forward ? "<" : ">"; wp(g, "~Mif (z->c ~S1 z->l~S0) ~f~C", p); } static void generate_leftslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "bra" : "ket"; wp(g, "~Mz->~S0 = z->c;~C", p); } static void generate_rightslice(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "ket" : "bra"; wp(g, "~Mz->~S0 = z->c;~C", p); } static void generate_assignto(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~M~V0 = assign_to(z, ~V0);~C" "~Mif (~V0 == 0) return -1;~C", p); } static void generate_sliceto(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~M~V0 = slice_to(z, ~V0);~C" "~Mif (~V0 == 0) return -1;~C", p); } static void generate_data_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { wi(g, SIZE(b)); w(g, ", "); wlitref(g, b); } else wv(g, p->name); } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; if (p->mode == m_backward) keep_c = !keep_c; wp(g, "~{", p); if (keep_c) w(g, "int c_keep = z->c;~N~M"); wp(g, "int ret = insert_~$(z, z->c, z->c, ", p); generate_data_address(g, p); wp(g, ");~C", p); if (keep_c) w(g, "~Mz->c = c_keep;~N"); wp(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ wp(g, "~{", p); if (keep_c) wp(g, "int c_keep = z->c;~N" "~Mret = insert_~$(z, z->c, z->l, ", p); else wp(g, "ret = insert_~$(z, z->lb, z->c, ", p); generate_data_address(g, p); wp(g, ");~C", p); if (keep_c) w(g, "~Mz->c = c_keep;~N"); wp(g, "~Mif (ret < 0) return ret;~N" "~}", p); } /* bugs marked <======= fixed 22/7/02. 
Similar fixes required for Java */ static void generate_slicefrom(struct generator * g, struct node * p) { /* w(g, "~Mslice_from_s(z, "); <============= bug! should be: */ wp(g, "~{int ret = slice_from_~$(z, ", p); generate_data_address(g, p); wp(g, ");~C", p); wp(g, "~Mif (ret < 0) return ret;~N" "~}", p); } static void generate_setlimit(struct generator * g, struct node * p) { int keep_c; wp(g, "~{int mlimit;~C" "~M~k~N" , p); keep_c = g->keep_count; generate(g, p->left); if (p->mode == m_forward) w(g, "~Mmlimit = z->l - z->c; z->l = z->c;~N"); else w(g, "~Mmlimit = z->lb; z->lb = z->c;~N"); w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); g->failure_string = p->mode == m_forward ? "z->l += mlimit;" : "z->lb = mlimit;"; generate(g, p->aux); wms(g, g->failure_string); w(g, "~N" "~}"); } static void generate_dollar(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; const char * a1 = g->failure_string; g->failure_label = new_label(g); g->label_used = 0; g->failure_string = 0; g->V[0] = p->name; wp(g, "~{struct SN_env env = * z;~C" "~Mint failure = 1; /* assume failure */~N" "~Mz->p = ~V0;~N" "~Mz->lb = z->c = 0;~N" "~Mz->l = SIZE(z->p);~N", p); generate(g, p->left); w(g, "~Mfailure = 0; /* mark success */~N"); if (g->label_used) wsetl(g, g->failure_label); g->V[0] = p->name; /* necessary */ g->label_used = used; g->failure_label = a0; g->failure_string = a1; w(g, "~M~V0 = z->p;~N" "~M* z = env;~N" "~Mif (failure) ~f~N~}"); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); w(g, ")) ~f~N"); } static void generate_call(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~{int ret = ~V0(z);~N" "~Mif (ret == 0) ~f~C" "~Mif 
(ret < 0) return ret;~N~}", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; g->S[2] = g->options->utf8 ? "_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; w(g, "~Mif (~S1_grouping~S0~S2(z, ~V0, ~I0, ~I1, 0)) ~f~N"); } static void generate_namedstring(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; wp(g, "~Mif (!(eq_v~S0(z, ~V0))) ~f~C", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = SIZE(b); g->L[0] = b; w(g, "~Mif (!(eq_s~S0(z, ~I0, ~L0))) ~f~N"); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; g->next_label = 0; g->S[0] = q->type == t_routine ? "static" : "extern"; g->V[0] = q; w(g, "~N~S0 int ~V0(struct SN_env * z) {~N~+"); if (p->amongvar_needed) w(g, "~Mint among_var;~N"); g->failure_string = 0; g->failure_label = x_return; g->label_used = 0; g->keep_count = 0; generate(g, p->left); w(g, "~Mreturn 1;~N~}"); } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; int block = -1; unsigned int bitmap = 0; struct amongvec * among_cases = x->b; int c; int empty_case = -1; int n_cases = 0; symbol cases[2]; int shortest_size = INT_MAX; g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; /* In forward mode with non-ASCII UTF-8 characters, the first character * of the string will often be the same, so instead look at the last * common character position. * * In backward mode, we can't match if there are fewer characters before * the current position than the minimum length. 
*/ for (c = 0; c < x->literalstring_count; ++c) { int size = among_cases[c].size; if (size != 0 && size < shortest_size) { shortest_size = size; } } for (c = 0; c < x->literalstring_count; ++c) { symbol ch; if (among_cases[c].size == 0) { empty_case = c; continue; } if (p->mode == m_forward) { ch = among_cases[c].b[shortest_size - 1]; } else { ch = among_cases[c].b[among_cases[c].size - 1]; } if (n_cases == 0) { block = ch >> 5; } else if (ch >> 5 != block) { block = -1; if (n_cases > 2) break; } if (block == -1) { if (ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { ++n_cases; break; } } else { if ((bitmap & (1u << (ch & 0x1f))) == 0) { bitmap |= 1u << (ch & 0x1f); if (n_cases < 2) cases[n_cases] = ch; ++n_cases; } } } if (block != -1 || n_cases <= 2) { char buf[64]; g->I[2] = block; g->I[3] = bitmap; g->I[4] = shortest_size - 1; if (p->mode == m_forward) { sprintf(buf, "z->p[z->c + %d]", shortest_size - 1); g->S[1] = buf; if (shortest_size == 1) { wp(g, "~Mif (z->c >= z->l || ", p); } else { wp(g, "~Mif (z->c + ~I4 >= z->l || ", p); } } else { g->S[1] = "z->p[z->c - 1]"; if (shortest_size == 1) { wp(g, "~Mif (z->c <= z->lb || ", p); } else { wp(g, "~Mif (z->c - ~I4 <= z->lb || ", p); } } if (n_cases == 0) { /* We get this for the degenerate case: among { '' } * This doesn't seem to be a useful construct, but it is * syntactically valid. */ wp(g, "0", p); } else if (n_cases == 1) { g->I[4] = cases[0]; wp(g, "~S1 != ~I4", p); } else if (n_cases == 2) { g->I[4] = cases[0]; g->I[5] = cases[1]; wp(g, "(~S1 != ~I4 && ~S1 != ~I5)", p); } else { wp(g, "~S1 >> 5 != ~I2 || !((~I3 >> (~S1 & 0x1f)) & 1)", p); } ws(g, ") "); if (empty_case != -1) { /* If the among includes the empty string, it can never fail * so not matching the bitmap means we match the empty string. 
*/ g->I[4] = among_cases[empty_case].result; wp(g, "among_var = ~I4; else~N", p); } else { wp(g, "~f~N", p); } } else { #ifdef OPTIMISATION_WARNINGS printf("Couldn't shortcut among %d\n", x->number); #endif } if (x->command_count == 0 && x->starter == 0) wp(g, "~Mif (!(find_among~S0(z, a_~I0, ~I1))) ~f~C", p); else wp(g, "~Mamong_var = find_among~S0(z, a_~I0, ~I1);~C" "~Mif (!(among_var)) ~f~N", p); } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; int case_number = 1; if (x->substring == 0) generate_substring(g, p); if (x->command_count == 0 && x->starter == 0) return; unless (x->starter == 0) generate(g, x->starter); p = p->left; if (p != 0 && p->type != c_literalstring) p = p->right; w(g, "~Mswitch(among_var) {~N~+" "~Mcase 0: ~f~N"); until (p == 0) { if (p->type == c_bra && p->left != 0) { g->I[0] = case_number++; w(g, "~Mcase ~I0:~N~+"); generate(g, p); w(g, "~Mbreak;~N~-"); } p = p->right; } w(g, "~}"); } static void generate_booltest(struct generator * g, struct node * p) { g->V[0] = p->name; wp(g, "~Mif (!(~V0)) ~f~C", p); } static void generate_false(struct generator * g, struct node * p) { wp(g, "~M~f~C", p); } static void generate_debug(struct generator * g, struct node * p) { g->I[0] = g->debug_count++; g->I[1] = p->line_number; wp(g, "~Mdebug(z, ~I0, ~I1);~C", p); } static void generate(struct generator * g, struct node * p) { int used = g->label_used; int a0 = g->failure_label; const char * a1 = g->failure_string; switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: 
generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p, false); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); 
break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } if (g->failure_label != a0) g->label_used = used; g->failure_label = a0; g->failure_string = a1; } static void generate_start_comment(struct generator * g) { w(g, "~N/* This file was generated automatically by the Snowball to ANSI C compiler */~N"); } static void generate_head(struct generator * g) { if (g->options->runtime_path == 0) { w(g, "~N#include \"header.h\"~N~N"); } else { w(g, "~N#include \""); ws(g, g->options->runtime_path); if (g->options->runtime_path[strlen(g->options->runtime_path) - 1] != '/') wch(g, '/'); w(g, "header.h\"~N~N"); } } static void generate_routine_headers(struct generator * g) { struct name * q = g->analyser->names; until (q == 0) { g->V[0] = q; switch (q->type) { case t_routine: w(g, "static int ~W0(struct SN_env * z);~N"); break; case t_external: w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" "extern int ~W0(struct SN_env * z);~N" "#ifdef __cplusplus~N" "}~N" "#endif~N" ); break; } q = q->next; } } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v->size; g->L[0] = v->b; unless (v->size == 0) w(g, "static const symbol s_~I0_~I1[~I2] = ~A0;~N"); v++; } } g->I[1] = x->literalstring_count; w(g, "~N~Mstatic const struct among a_~I0[~I1] =~N{~N"); v = x->b; { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[1] = i; g->I[2] = v->size; g->I[3] = v->i; g->I[4] = v->result; g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; w(g, "/*~J1 */ { ~I2, "); if (v->size == 0) w(g, "0,"); else w(g, "s_~I0_~I1,"); w(g, " ~I3, ~I4, "); if (v->function == 0) w(g, "0"); else wvn(g, v->function); w(g, "}~S0~N"); v++; } } w(g, "};~N~N"); } static void generate_amongs(struct generator * g) { struct among * x = g->analyser->amongs; until (x == 0) { generate_among_table(g, x); x = x->next; } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); { g->V[0] = q->name; w(g, "static const unsigned char ~V0[] = { "); for (i = 0; i < size; i++) { wi(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~N"); } lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q = g->analyser->groupings; until (q == 0) { generate_grouping_table(g, q); q = q->next; } } static void generate_create(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; g->I[1] = p[t_integer]; g->I[2] = p[t_boolean]; w(g, "~N" "extern struct SN_env * ~pcreate_env(void) { return SN_create_env(~I0, ~I1, ~I2); }" "~N"); } static void generate_close(struct generator * g) { int * p = g->analyser->name_count; g->I[0] = p[t_string]; w(g, "~Nextern void ~pclose_env(struct SN_env * z) { SN_close_env(z, ~I0); }~N~N"); } static void generate_create_and_close_templates(struct generator * g) { w(g, "~N" "extern struct SN_env * ~pcreate_env(void);~N" "extern void ~pclose_env(struct SN_env * z);~N" "~N"); } static void generate_header_file(struct generator * g) { struct name * q = g->analyser->names; char * vp = g->options->variables_prefix; g->S[0] = vp; w(g, "~N" "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N"); /* for C++ */ 
generate_create_and_close_templates(g); until (q == 0) { g->V[0] = q; switch (q->type) { case t_external: w(g, "extern int ~W0(struct SN_env * z);~N"); break; case t_string: g->S[1] = "S"; goto label0; case t_integer: g->S[1] = "I"; goto label0; case t_boolean: g->S[1] = "B"; label0: if (vp) { g->I[0] = q->count; w(g, "#define ~S0"); str_append_b(g->outbuf, q->b); w(g, " (~S1[~I0])~N"); } break; } q = q->next; } w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); /* for C++ */ w(g, "~N"); } extern void generate_program_c(struct generator * g) { g->outbuf = str_new(); generate_start_comment(g); generate_head(g); generate_routine_headers(g); w(g, "#ifdef __cplusplus~N" "extern \"C\" {~N" "#endif~N" "~N"); generate_create_and_close_templates(g); w(g, "~N" "#ifdef __cplusplus~N" "}~N" "#endif~N"); generate_amongs(g); generate_groupings(g); g->declarations = g->outbuf; g->outbuf = str_new(); g->literalstring_count = 0; { struct node * p = g->analyser->program; until (p == 0) { generate(g, p); p = p->right; } } generate_create(g); generate_close(g); output_str(g->options->output_c, g->declarations); str_delete(g->declarations); output_str(g->options->output_c, g->outbuf); str_clear(g->outbuf); generate_start_comment(g); generate_header_file(g); output_str(g->options->output_h, g->outbuf); str_delete(g->outbuf); } extern struct generator * create_generator_c(struct analyser * a, struct options * o) { NEW(generator, g); g->analyser = a; g->options = o; g->margin = 0; g->debug_count = 0; g->line_count = 0; return g; } extern void close_generator_c(struct generator * g) { FREE(g); } snowball_code/compiler/generator_java.c0000644000175000017500000011457212707117052016772 0ustar domdom #include /* for exit */ #include /* for strlen */ #include /* for fprintf etc */ #include "header.h" /* prototypes */ static void generate(struct generator * g, struct node * p); static void w(struct generator * g, const char * s); static void writef(struct generator * g, const char * s, struct 
node * p); enum special_labels { x_return = -1 }; static int new_label(struct generator * g) { return g->next_label++; } static struct str * vars_newname(struct generator * g) { struct str * output; g->var_number ++; output = str_new(); str_append_string(output, "v_"); str_append_int(output, g->var_number); return output; } /* Output routines */ static void output_str(FILE * outfile, struct str * str) { char * s = b_to_s(str_data(str)); fprintf(outfile, "%s", s); free(s); } /* Write routines for simple entities */ static void write_char(struct generator * g, int ch) { str_append_ch(g->outbuf, ch); } static void write_newline(struct generator * g) { str_append_string(g->outbuf, "\n"); } static void write_string(struct generator * g, const char * s) { str_append_string(g->outbuf, s); } static void write_b(struct generator * g, symbol * b) { str_append_b(g->outbuf, b); } static void write_str(struct generator * g, struct str * str) { str_append(g->outbuf, str); } static void write_int(struct generator * g, int i) { str_append_int(g->outbuf, i); } /* Write routines for items from the syntax tree */ static void write_varname(struct generator * g, struct name * p) { int ch = "SBIrxg"[p->type]; if (p->type != t_external) { write_char(g, ch); write_char(g, '_'); } str_append_b(g->outbuf, p->b); } static void write_varref(struct generator * g, struct name * p) { /* In java, references look just the same */ write_varname(g, p); } static void write_hexdigit(struct generator * g, int n) { write_char(g, n < 10 ? 
n + '0' : n - 10 + 'A'); } static void write_hex(struct generator * g, int ch) { write_string(g, "\\u"); { int i; for (i = 12; i >= 0; i -= 4) write_hexdigit(g, ch >> i & 0xf); } } static void write_literal_string(struct generator * g, symbol * p) { int i; write_string(g, "\""); for (i = 0; i < SIZE(p); i++) { int ch = p[i]; if (32 <= ch && ch <= 127) { if (ch == '\"' || ch == '\\') write_string(g, "\\"); write_char(g, ch); } else { write_hex(g, ch); } } write_string(g, "\""); } static void write_margin(struct generator * g) { int i; for (i = 0; i < g->margin; i++) write_string(g, " "); } /* Write a variable declaration. */ static void write_declare(struct generator * g, char * declaration, struct node * p) { struct str * temp = g->outbuf; g->outbuf = g->declarations; write_string(g, " "); writef(g, declaration, p); write_string(g, ";"); write_newline(g); g->outbuf = temp; } static void write_comment(struct generator * g, struct node * p) { write_margin(g); write_string(g, "// "); write_string(g, (char *) name_of_token(p->type)); if (p->name != 0) { write_string(g, " "); str_append_b(g->outbuf, p->name->b); } write_string(g, ", line "); write_int(g, p->line_number); write_newline(g); } static void write_block_start(struct generator * g) { w(g, "~M{~+~N"); } static void write_block_end(struct generator * g) /* block end */ { w(g, "~-~M}~N"); } static void write_savecursor(struct generator * g, struct node * p, struct str * savevar) { g->B[0] = str_data(savevar); g->S[1] = ""; if (p->mode != m_forward) g->S[1] = "limit - "; write_declare(g, "int ~B0", p); writef(g, "~M~B0 = ~S1cursor;~N" , p); } static void restore_string(struct node * p, struct str * out, struct str * savevar) { str_clear(out); str_append_string(out, "cursor = "); if (p->mode != m_forward) str_append_string(out, "limit - "); str_append(out, savevar); str_append_string(out, ";"); } static void write_restorecursor(struct generator * g, struct node * p, struct str * savevar) { struct str * temp = 
str_new(); write_margin(g); restore_string(p, temp, savevar); write_str(g, temp); write_newline(g); str_delete(temp); } static void write_inc_cursor(struct generator * g, struct node * p) { write_margin(g); write_string(g, p->mode == m_forward ? "cursor++;" : "cursor--;"); write_newline(g); } static void wsetlab_begin(struct generator * g, int n) { w(g, "~Mlab"); write_int(g, n); w(g, ": do {~+~N"); } static void wsetlab_end(struct generator * g) { w(g, "~-~M} while (false);~N"); } static void wgotol(struct generator * g, int n) { write_margin(g); write_string(g, "break lab"); write_int(g, n); write_string(g, ";"); write_newline(g); } static void write_failure(struct generator * g) { if (str_len(g->failure_str) != 0) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } write_margin(g); switch (g->failure_label) { case x_return: write_string(g, "return false;"); break; default: write_string(g, "break lab"); write_int(g, g->failure_label); write_string(g, ";"); g->unreachable = true; } write_newline(g); } static void write_failure_if(struct generator * g, char * s, struct node * p) { writef(g, "~Mif (", p); writef(g, s, p); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } /* if at limit fail */ static void write_check_limit(struct generator * g, struct node * p) { if (p->mode == m_forward) { write_failure_if(g, "cursor >= limit", p); } else { write_failure_if(g, "cursor <= limit_backward", p); } } /* Formatted write. 
*/ static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); while (i < l) { int ch = input[i++]; if (ch == '~') { switch(input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; case 'f': write_block_start(g); write_failure(g); g->unreachable = false; write_block_end(g); continue; case 'M': write_margin(g); continue; case 'N': write_newline(g); continue; case '{': write_block_start(g); continue; case '}': write_block_end(g); continue; case 'S': write_string(g, g->S[input[i++] - '0']); continue; case 'B': write_b(g, g->B[input[i++] - '0']); continue; case 'I': write_int(g, g->I[input[i++] - '0']); continue; case 'V': write_varref(g, g->V[input[i++] - '0']); continue; case 'W': write_varname(g, g->V[input[i++] - '0']); continue; case 'L': write_literal_string(g, g->L[input[i++] - '0']); continue; case '+': g->margin++; continue; case '-': g->margin--; continue; case 'n': write_string(g, g->options->name); continue; } } else { write_char(g, ch); } } } static void w(struct generator * g, const char * s) { writef(g, s, 0); } static void generate_AE(struct generator * g, struct node * p) { char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; case c_number: write_int(g, p->number); break; case c_maxint: write_string(g, "MAXINT"); break; case c_minint: write_string(g, "MININT"); break; case c_neg: write_string(g, "-"); generate_AE(g, p->right); break; case c_multiply: s = " * "; goto label0; case c_plus: s = " + "; goto label0; case c_minus: s = " - "; goto label0; case c_divide: s = " / "; label0: write_string(g, "("); generate_AE(g, p->left); write_string(g, s); generate_AE(g, p->right); write_string(g, ")"); break; case c_sizeof: g->V[0] = p->name; w(g, "(~V0.length())"); break; case c_cursor: w(g, "cursor"); break; case c_limit: w(g, p->mode == m_forward ? 
"limit" : "limit_backward"); break;
        case c_size:
            w(g, "(current.length())"); break;
    }
}

/* K_needed() tests to see if we really need to keep c. Not true when the
   command does not touch the cursor. This and repeat_score() could be
   elaborated almost indefinitely.

   It walks the sibling chain starting at p (following p->right), recursing
   into called routines and bracketed groups, and returns true as soon as it
   meets a command that is not in the list of cursor-neutral commands below.
*/

static int K_needed(struct generator * g, struct node * p) {
    while (p != 0) {
        switch (p->type) {
            /* commands treated as leaving the cursor where it is */
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_mathassign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gr:
            case c_ge:
            case c_ls:
            case c_le:
            case c_sliceto:
            case c_booltest:
            case c_true:
            case c_false:
            case c_debug:
                break;

            /* a call needs the cursor kept iff the routine's body does */
            case c_call:
                if (K_needed(g, p->name->definition)) return true;
                break;

            /* likewise for the contents of a bracketed group */
            case c_bra:
                if (K_needed(g, p->left)) return true;
                break;

            /* anything else is conservatively assumed to move the cursor */
            default: return true;
        }
        p = p->right;
    }
    return false;
}

/* Score the command list starting at p; repeat_restore() compares the result
   against 2 to decide whether the cursor must be reinstated. */
static int repeat_score(struct generator * g, struct node * p) {
    int score = 0;
    while (p != 0) {
        switch (p->type) {
            case c_dollar:
            case c_leftslice:
            case c_rightslice:
            case c_mathassign:
            case c_plusassign:
            case c_minusassign:
            case c_multiplyassign:
            case c_divideassign:
            case c_eq:
            case c_ne:
            case c_gr:
            case c_ge:
            case c_ls:
            case c_le:
            case c_sliceto: /* case c_not: must not be included here! 
*/ case c_debug: break; case c_call: score += repeat_score(g, p->name->definition); break; case c_bra: score += repeat_score(g, p->left); break; case c_name: case c_literalstring: case c_next: case c_grouping: case c_non: case c_hop: score = score + 1; break; default: score = 2; break; } p = p->right; } return score; } /* tests if an expression requires cursor reinstatement in a repeat */ static int repeat_restore(struct generator * g, struct node * p) { return repeat_score(g, p) >= 2; } static void generate_bra(struct generator * g, struct node * p) { write_comment(g, p); p = p->left; while (p != 0) { generate(g, p); p = p->right; } } static void generate_and(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); p = p->left; while (p != 0) { generate(g, p); if (g->unreachable) break; if (keep_c && p->right != 0) write_restorecursor(g, p, savevar); p = p->right; } str_delete(savevar); } static void generate_or(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); int out_lab = new_label(g); write_comment(g, p); wsetlab_begin(g, out_lab); if (keep_c) write_savecursor(g, p, savevar); p = p->left; str_clear(g->failure_str); if (p == 0) { /* p should never be 0 after an or: there should be at least two * sub nodes. 
*/ fprintf(stderr, "Error: \"or\" node without children nodes."); exit (1); } while (p->right != 0) { g->failure_label = new_label(g); wsetlab_begin(g, g->failure_label); generate(g, p); if (!g->unreachable) wgotol(g, out_lab); wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); p = p->right; } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; generate(g, p); wsetlab_end(g); str_delete(savevar); } static void generate_backwards(struct generator * g, struct node * p) { write_comment(g, p); writef(g,"~Mlimit_backward = cursor; cursor = limit;~N", p); generate(g, p->left); w(g, "~Mcursor = limit_backward;"); } static void generate_not(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); write_comment(g, p); if (keep_c) { write_block_start(g); write_savecursor(g, p, savevar); } g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; if (!g->unreachable) write_failure(g); wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); if (keep_c) write_block_end(g); str_delete(savevar); } static void generate_try(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); if (keep_c) restore_string(p, g->failure_str, savevar); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; str_delete(savevar); } static void generate_set(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = true;~N", p); } static void generate_unset(struct generator * g, struct node * p) { write_comment(g, p); 
g->V[0] = p->name; writef(g, "~M~V0 = false;~N", p); } static void generate_fail(struct generator * g, struct node * p) { write_comment(g, p); generate(g, p->left); if (!g->unreachable) write_failure(g); } /* generate_test() also implements 'reverse' */ static void generate_test(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) { write_savecursor(g, p, savevar); } generate(g, p->left); if (!g->unreachable) { if (keep_c) { write_restorecursor(g, p, savevar); } } str_delete(savevar); } static void generate_do(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); int keep_c = K_needed(g, p->left); write_comment(g, p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); str_delete(savevar); }

/* generate_GO() emits the code for 'goto C' (style == 1) and 'gopast C'
 * (style == 0).
 *
 *   g     - generator state.
 *   p     - the goto/gopast node; p->left is the command C.
 *   style - 1 for goto (cursor put back to where C started matching),
 *           0 for gopast (cursor left where C finished).
 *
 * The generated Java is a labelled 'while(true)' loop: C is tried at the
 * current position; on success the loop is left with 'break golabN', on
 * failure the cursor is restored if necessary, the limit is checked and the
 * cursor is advanced before trying again.
 *
 * Inside the loop a failure of C is caught by a fresh failure label, so the
 * failure string is cleared there, as generate_do(), generate_not() and
 * generate_repeat() do: text stored by an enclosing command (for example
 * the limit restore set up by generate_setlimit()) must only be emitted
 * when the goto/gopast as a whole fails, not on every unsuccessful
 * iteration. The outer label and string are saved in a0/a1 and put back
 * before the limit check, which is the point where the whole command can
 * fail.
 */
static void generate_GO(struct generator * g, struct node * p, int style) {
    int end_unreachable = false;
    struct str * savevar = vars_newname(g);
    int keep_c = style == 1 || repeat_restore(g, p->left);

    /* Outer failure context, reinstated before the limit check below. */
    int a0 = g->failure_label;
    struct str * a1 = str_copy(g->failure_str);

    int golab = new_label(g);
    g->I[0] = golab;
    write_comment(g, p);
    w(g, "~Mgolab~I0: while(true)~N");
    w(g, "~{");
    if (keep_c) write_savecursor(g, p, savevar);

    g->failure_label = new_label(g);
    /* A failed attempt only ends this iteration; drop the outer actions. */
    str_clear(g->failure_str);
    wsetlab_begin(g, g->failure_label);
    generate(g, p->left);

    if (g->unreachable) {
        /* Cannot break out of this loop: therefore the code after the
         * end of the loop is unreachable. */
        end_unreachable = true;
    } else {
        /* include for goto; omit for gopast */
        if (style == 1) write_restorecursor(g, p, savevar);
        g->I[0] = golab;
        w(g, "~Mbreak golab~I0;~N");
    }
    g->unreachable = false;
    wsetlab_end(g);
    if (keep_c) write_restorecursor(g, p, savevar);

    g->failure_label = a0;
    str_delete(g->failure_str);
    g->failure_str = a1;

    write_check_limit(g, p);
    write_inc_cursor(g, p);
    write_block_end(g);

    str_delete(savevar);
    g->unreachable = end_unreachable;
}

static void generate_loop(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); g->B[0] = str_data(loopvar); write_declare(g, "int ~B0", p); w(g, "~Mfor (~B0 = "); generate_AE(g, p->AE); g->B[0] = str_data(loopvar); writef(g, "; ~B0 > 0; ~B0--)~N", p); writef(g, "~{", p); generate(g, p->left); w(g, "~}"); str_delete(loopvar); g->unreachable = false; } static void generate_repeat(struct generator * g, struct node * p, struct str * loopvar) { struct str * savevar = vars_newname(g); int keep_c = repeat_restore(g, p->left); int replab = new_label(g); g->I[0] = replab; write_comment(g, p); writef(g, "~Mreplab~I0: while(true)~N~{", p); if (keep_c) write_savecursor(g, p, savevar); g->failure_label = new_label(g); str_clear(g->failure_str); wsetlab_begin(g, g->failure_label); generate(g, p->left); if (!g->unreachable) { if (loopvar != 0) { g->B[0] = str_data(loopvar); w(g, "~M~B0--;~N"); } g->I[0] = replab; w(g, "~Mcontinue replab~I0;~N"); } wsetlab_end(g); g->unreachable = false; if (keep_c) write_restorecursor(g, p, savevar); g->I[0] = replab; w(g, "~Mbreak replab~I0;~N~}"); str_delete(savevar); } static void generate_atleast(struct generator * g, struct node * p) { struct str * loopvar = vars_newname(g); write_comment(g, p); w(g, "~{"); g->B[0] = str_data(loopvar); w(g, "~Mint ~B0 = "); generate_AE(g, p->AE); w(g, ";~N"); { int a0 = g->failure_label; struct str * a1 = str_copy(g->failure_str); generate_repeat(g, p, loopvar); g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } g->B[0] = str_data(loopvar); write_failure_if(g, "~B0 > 0", p); w(g, "~}"); str_delete(loopvar); } static void generate_setmark(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = cursor;~N", p); } static void generate_tomark(struct generator * g, struct node 
* p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? ">" : "<"; w(g, "~Mif (cursor ~S0 "); generate_AE(g, p->AE); w(g, ")~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; w(g, "~Mcursor = "); generate_AE(g, p->AE); writef(g, ";~N", p); } static void generate_atmark(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mif (cursor != "); generate_AE(g, p->AE); writef(g, ")~N", p); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_hop(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "+" : "-"; w(g, "~{~Mint c = cursor ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); g->S[0] = p->mode == m_forward ? "0" : "limit_backward"; write_failure_if(g, "~S0 > c || c > limit", p); writef(g, "~Mcursor = c;~N", p); writef(g, "~}", p); } static void generate_delete(struct generator * g, struct node * p) { write_comment(g, p); writef(g, "~Mslice_del();~N", p); } static void generate_next(struct generator * g, struct node * p) { write_comment(g, p); write_check_limit(g, p); write_inc_cursor(g, p); } static void generate_tolimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; writef(g, "~Mcursor = ~S0;~N", p); } static void generate_atlimit(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "limit" : "limit_backward"; g->S[1] = p->mode == m_forward ? "<" : ">"; write_failure_if(g, "cursor ~S1 ~S0", p); } static void generate_leftslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "bra" : "ket"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_rightslice(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"ket" : "bra"; writef(g, "~M~S0 = cursor;~N", p); } static void generate_assignto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = assign_to(~V0);~N", p); } static void generate_sliceto(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; writef(g, "~M~V0 = slice_to(~V0);~N", p); } static void generate_address(struct generator * g, struct node * p) { symbol * b = p->literalstring; if (b != 0) { write_literal_string(g, b); } else { write_varref(g, p->name); } } static void generate_insert(struct generator * g, struct node * p, int style) { int keep_c = style == c_attach; write_comment(g, p); if (p->mode == m_backward) keep_c = !keep_c; if (keep_c) w(g, "~{~Mint c = cursor;~N"); writef(g, "~Minsert(cursor, cursor, ", p); generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_assignfrom(struct generator * g, struct node * p) { int keep_c = p->mode == m_forward; /* like 'attach' */ write_comment(g, p); if (keep_c) writef(g, "~{~Mint c = cursor;~N", p); if (p->mode == m_forward) { writef(g, "~Minsert(cursor, limit, ", p); } else { writef(g, "~Minsert(limit_backward, cursor, ", p); } generate_address(g, p); writef(g, ");~N", p); if (keep_c) w(g, "~Mcursor = c;~N~}"); } static void generate_slicefrom(struct generator * g, struct node * p) { write_comment(g, p); w(g, "~Mslice_from("); generate_address(g, p); writef(g, ");~N", p); } static void generate_setlimit(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); struct str * varname = vars_newname(g); write_comment(g, p); write_savecursor(g, p, savevar); generate(g, p->left); if (!g->unreachable) { g->B[0] = str_data(varname); write_declare(g, "int ~B0", p); if (p->mode == m_forward) { w(g, "~M~B0 = limit - cursor;~N"); w(g, "~Mlimit = cursor;~N"); } else { w(g, "~M~B0 = limit_backward;~N"); w(g, "~Mlimit_backward = cursor;~N"); } write_restorecursor(g, p, 
savevar); if (p->mode == m_forward) { str_assign(g->failure_str, "limit += "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } else { str_assign(g->failure_str, "limit_backward = "); str_append(g->failure_str, varname); str_append_ch(g->failure_str, ';'); } generate(g, p->aux); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } } str_delete(varname); str_delete(savevar); } /* dollar sets snowball up to operate on a string variable as if it were the * current string */ static void generate_dollar(struct generator * g, struct node * p) { struct str * savevar = vars_newname(g); write_comment(g, p); g->V[0] = p->name; str_assign(g->failure_str, "copy_from("); str_append(g->failure_str, savevar); str_append_string(g->failure_str, ");"); g->B[0] = str_data(savevar); writef(g, "~{~M~n ~B0 = this;~N" "~Mcurrent = new StringBuffer(~V0.toString());~N" "~Mcursor = 0;~N" "~Mlimit = (current.length());~N", p); generate(g, p->left); if (!g->unreachable) { write_margin(g); write_str(g, g->failure_str); write_newline(g); } w(g, "~}"); str_delete(savevar); } static void generate_integer_assign(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~M~V0 ~S0 "); generate_AE(g, p->AE); w(g, ";~N"); } static void generate_integer_test(struct generator * g, struct node * p, char * s) { g->V[0] = p->name; g->S[0] = s; w(g, "~Mif (!(~V0 ~S0 "); generate_AE(g, p->AE); w(g, "))~N"); write_block_start(g); write_failure(g); write_block_end(g); g->unreachable = false; } static void generate_call(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!~V0()", p); } static void generate_grouping(struct generator * g, struct node * p, int complement) { struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? 
"out" : "in"; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; if (q->no_gaps) write_failure_if(g, "!(~S1_range~S0(~I0, ~I1))", p); else write_failure_if(g, "!(~S1_grouping~S0(~V0, ~I0, ~I1))", p); } static void generate_namedstring(struct generator * g, struct node * p) { write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->V[0] = p->name; write_failure_if(g, "!(eq_v~S0(~V0))", p); } static void generate_literalstring(struct generator * g, struct node * p) { symbol * b = p->literalstring; write_comment(g, p); g->S[0] = p->mode == m_forward ? "" : "_b"; g->I[0] = SIZE(b); g->L[0] = b; write_failure_if(g, "!(eq_s~S0(~I0, ~L0))", p); } static void generate_define(struct generator * g, struct node * p) { struct name * q = p->name; struct str * saved_output = g->outbuf; struct str * saved_declarations = g->declarations; g->S[0] = q->type == t_routine ? "private" : "public"; g->V[0] = q; w(g, "~+~+~N~M~S0 boolean ~V0() {~+~N"); g->outbuf = str_new(); g->declarations = str_new(); g->next_label = 0; g->var_number = 0; if (p->amongvar_needed) write_declare(g, "int among_var", p); str_clear(g->failure_str); g->failure_label = x_return; g->unreachable = false; generate(g, p->left); if (!g->unreachable) w(g, "~Mreturn true;~N"); w(g, "~}~-~-"); str_append(saved_output, g->declarations); str_append(saved_output, g->outbuf); str_delete(g->declarations); str_delete(g->outbuf); g->declarations = saved_declarations; g->outbuf = saved_output; } static void generate_substring(struct generator * g, struct node * p) { struct among * x = p->among; write_comment(g, p); g->S[0] = p->mode == m_forward ? 
"" : "_b"; g->I[0] = x->number; g->I[1] = x->literalstring_count; if (x->command_count == 0 && x->starter == 0) { write_failure_if(g, "find_among~S0(a_~I0, ~I1) == 0", p); } else { writef(g, "~Mamong_var = find_among~S0(a_~I0, ~I1);~N", p); write_failure_if(g, "among_var == 0", p); } } static void generate_among(struct generator * g, struct node * p) { struct among * x = p->among; int case_number = 1; if (x->substring == 0) generate_substring(g, p); if (x->command_count == 0 && x->starter == 0) return; if (x->starter != 0) generate(g, x->starter); p = p->left; if (p != 0 && p->type != c_literalstring) p = p->right; w(g, "~Mswitch(among_var) {~N~+"); w(g, "~Mcase 0:~N~+"); write_failure(g); g->unreachable = false; w(g, "~-"); while (p != 0) { if (p->type == c_bra && p->left != 0) { g->I[0] = case_number++; w(g, "~Mcase ~I0:~N~+"); generate(g, p); if (!g->unreachable) w(g, "~Mbreak;~N"); w(g, "~-"); g->unreachable = false; } p = p->right; } write_block_end(g); } static void generate_booltest(struct generator * g, struct node * p) { write_comment(g, p); g->V[0] = p->name; write_failure_if(g, "!(~V0)", p); } static void generate_false(struct generator * g, struct node * p) { write_comment(g, p); write_failure(g); } static void generate_debug(struct generator * g, struct node * p) { write_comment(g, p); g->I[0] = g->debug_count++; g->I[1] = p->line_number; writef(g, "~Mdebug(~I0, ~I1);~N", p); } static void generate(struct generator * g, struct node * p) { int a0; struct str * a1; if (g->unreachable) return; a0 = g->failure_label; a1 = str_copy(g->failure_str); switch (p->type) { case c_define: generate_define(g, p); break; case c_bra: generate_bra(g, p); break; case c_and: generate_and(g, p); break; case c_or: generate_or(g, p); break; case c_backwards: generate_backwards(g, p); break; case c_not: generate_not(g, p); break; case c_set: generate_set(g, p); break; case c_unset: generate_unset(g, p); break; case c_try: generate_try(g, p); break; case c_fail: 
generate_fail(g, p); break; case c_reverse: case c_test: generate_test(g, p); break; case c_do: generate_do(g, p); break; case c_goto: generate_GO(g, p, 1); break; case c_gopast: generate_GO(g, p, 0); break; case c_repeat: generate_repeat(g, p, 0); break; case c_loop: generate_loop(g, p); break; case c_atleast: generate_atleast(g, p); break; case c_setmark: generate_setmark(g, p); break; case c_tomark: generate_tomark(g, p); break; case c_atmark: generate_atmark(g, p); break; case c_hop: generate_hop(g, p); break; case c_delete: generate_delete(g, p); break; case c_next: generate_next(g, p); break; case c_tolimit: generate_tolimit(g, p); break; case c_atlimit: generate_atlimit(g, p); break; case c_leftslice: generate_leftslice(g, p); break; case c_rightslice: generate_rightslice(g, p); break; case c_assignto: generate_assignto(g, p); break; case c_sliceto: generate_sliceto(g, p); break; case c_assign: generate_assignfrom(g, p); break; case c_insert: case c_attach: generate_insert(g, p, p->type); break; case c_slicefrom: generate_slicefrom(g, p); break; case c_setlimit: generate_setlimit(g, p); break; case c_dollar: generate_dollar(g, p); break; case c_mathassign: generate_integer_assign(g, p, "="); break; case c_plusassign: generate_integer_assign(g, p, "+="); break; case c_minusassign: generate_integer_assign(g, p, "-="); break; case c_multiplyassign:generate_integer_assign(g, p, "*="); break; case c_divideassign: generate_integer_assign(g, p, "/="); break; case c_eq: generate_integer_test(g, p, "=="); break; case c_ne: generate_integer_test(g, p, "!="); break; case c_gr: generate_integer_test(g, p, ">"); break; case c_ge: generate_integer_test(g, p, ">="); break; case c_ls: generate_integer_test(g, p, "<"); break; case c_le: generate_integer_test(g, p, "<="); break; case c_call: generate_call(g, p); break; case c_grouping: generate_grouping(g, p, false); break; case c_non: generate_grouping(g, p, true); break; case c_name: generate_namedstring(g, p); break; case 
c_literalstring: generate_literalstring(g, p); break; case c_among: generate_among(g, p); break; case c_substring: generate_substring(g, p); break; case c_booltest: generate_booltest(g, p); break; case c_false: generate_false(g, p); break; case c_true: break; case c_debug: generate_debug(g, p); break; default: fprintf(stderr, "%d encountered\n", p->type); exit(1); } g->failure_label = a0; str_delete(g->failure_str); g->failure_str = a1; } static void generate_start_comment(struct generator * g) { w(g, "// This file was generated automatically by the Snowball to Java compiler~N"); w(g, "~N"); } static void generate_class_begin(struct generator * g) { w(g, "package " ); w(g, g->options->package); w(g, ";~N~N" ); w(g, "import "); w(g, g->options->among_class ); w(g, ";~N" "~N" " /**~N" " * This class was automatically generated by a Snowball to Java compiler ~N" " * It implements the stemming algorithm defined by a snowball script.~N" " */~N" "~N" "public class ~n extends "); w(g, g->options->parent_class_name); w(g, " {~N" "~N" "private static final long serialVersionUID = 1L;~N" "~N" "~+~+~Mprivate final static ~n methodObject = new ~n ();~N" "~N"); } static void generate_class_end(struct generator * g) { w(g, "~N}"); w(g, "~N~N"); } static void generate_equals(struct generator * g) { w(g, "~N" "~Mpublic boolean equals( Object o ) {~N" "~+~Mreturn o instanceof "); w(g, g->options->name); w(g, ";~N~-~M}~N" "~N" "~Mpublic int hashCode() {~N" "~+~Mreturn "); w(g, g->options->name); w(g, ".class.getName().hashCode();~N" "~-~M}~N"); w(g, "~N~N"); } static void generate_among_table(struct generator * g, struct among * x) { struct amongvec * v = x->b; g->I[0] = x->number; g->I[1] = x->literalstring_count; w(g, "~+~+~Mprivate final static Among a_~I0[] = {~N~+"); { int i; for (i = 0; i < x->literalstring_count; i++) { g->I[0] = i; g->I[1] = v->i; g->I[2] = v->result; g->L[0] = v->b; g->S[0] = i < x->literalstring_count - 1 ? 
"," : ""; w(g, "~Mnew Among ( ~L0, ~I1, ~I2, \""); if (v->function != 0) { write_varname(g, v->function); } w(g, "\", methodObject )~S0~N"); v++; } } w(g, "~-~M};~-~-~N~N"); } static void generate_amongs(struct generator * g) { struct among * x = g->analyser->amongs; while (x != 0) { generate_among_table(g, x); x = x->next; } } static void set_bit(symbol * b, int i) { b[i/8] |= 1 << i%8; } static int bit_is_set(symbol * b, int i) { return b[i/8] & 1 << i%8; } static void generate_grouping_table(struct generator * g, struct grouping * q) { int range = q->largest_ch - q->smallest_ch + 1; int size = (range + 7)/ 8; /* assume 8 bits per symbol */ symbol * b = q->b; symbol * map = create_b(size); int i; for (i = 0; i < size; i++) map[i] = 0; /* Using unicode would require revision here */ for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); q->no_gaps = true; for (i = 0; i < range; i++) unless (bit_is_set(map, i)) q->no_gaps = false; unless (q->no_gaps) { g->V[0] = q->name; w(g, "~+~+~Mprivate static final char ~V0[] = {"); for (i = 0; i < size; i++) { write_int(g, map[i]); if (i < size - 1) w(g, ", "); } w(g, " };~N~-~-~N"); } lose_b(map); } static void generate_groupings(struct generator * g) { struct grouping * q = g->analyser->groupings; until (q == 0) { generate_grouping_table(g, q); q = q->next; } } static void generate_members(struct generator * g) { struct name * q = g->analyser->names; until (q == 0) { g->V[0] = q; switch (q->type) { case t_string: w(g, " private "); w(g, g->options->string_class ); w(g, " ~W0 = new "); w(g, g->options->string_class); w(g, "();~N"); break; case t_integer: w(g, " private int ~W0;~N"); break; case t_boolean: w(g, " private boolean ~W0;~N"); break; } q = q->next; } w(g, "~N"); } static void generate_copyfrom(struct generator * g) { struct name * q; w(g, "~+~+~Mprivate void copy_from(~n other) {~+~N"); for (q = g->analyser->names; q != 0; q = q->next) { g->V[0] = q; switch (q->type) { case t_string: case t_integer: 
case t_boolean: w(g, "~M~W0 = other.~W0;~N"); break; } } w(g, "~Msuper.copy_from(other);~N"); w(g, "~-~M}~-~-~N"); } static void generate_methods(struct generator * g) { struct node * p = g->analyser->program; while (p != 0) { generate(g, p); g->unreachable = false; p = p->right; } } extern void generate_program_java(struct generator * g) { g->outbuf = str_new(); g->failure_str = str_new(); generate_start_comment(g); generate_class_begin(g); generate_amongs(g); generate_groupings(g); generate_members(g); generate_copyfrom(g); generate_methods(g); generate_equals(g); generate_class_end(g); output_str(g->options->output_java, g->outbuf); str_delete(g->failure_str); str_delete(g->outbuf); } extern struct generator * create_generator_java(struct analyser * a, struct options * o) { NEW(generator, g); g->analyser = a; g->options = o; g->margin = 0; g->debug_count = 0; g->unreachable = false; return g; } extern void close_generator_java(struct generator * g) { FREE(g); } snowball_code/doc/0000755000175000017500000000000012707117052012560 5ustar domdomsnowball_code/doc/TODO0000644000175000017500000000140512707117052013250 0ustar domdomThings to do: - Write documentation for how to use libstemmer (as opposed to how stemming algorithms themselves work). Currently, the documentation in the include/libstemmer.h header file is pretty clear and comprehensive, but an overview document wouldn't go amiss. Things that would be nice to include at some point. - Add version numbers to each stemming algorithm, and allow the interface to request a specific version of the stemming algorithms. Default to providing the latest version of the algorithm. - Make mkmodules.pl generate the build system, instead of being called from it. This would allow it to generate the list of modules to be built, so that it's not necessary to change things in more than one place to add a new algorithm. 
snowball_code/include/0000755000175000017500000000000012707117052013436 5ustar domdomsnowball_code/include/libstemmer.h0000644000175000017500000000554412707117052015762 0ustar domdom /* Make header file work when included from C++ */ #ifdef __cplusplus extern "C" { #endif struct sb_stemmer; typedef unsigned char sb_symbol; /* FIXME - should be able to get a version number for each stemming * algorithm (which will be incremented each time the output changes). */ /** Returns an array of the names of the available stemming algorithms. * Note that these are the canonical names - aliases (ie, other names for * the same algorithm) will not be included in the list. * The list is terminated with a null pointer. * * The list must not be modified in any way. */ const char ** sb_stemmer_list(void); /** Create a new stemmer object, using the specified algorithm, for the * specified character encoding. * * All algorithms will usually be available in UTF-8, but may also be * available in other character encodings. * * @param algorithm The algorithm name. This is either the English * name of the algorithm, or the 2 or 3 letter ISO 639 codes for the * language. Note that case is significant in this parameter - the * value should be supplied in lower case. * * @param charenc The character encoding. NULL may be passed as * this value, in which case UTF-8 encoding will be assumed. Otherwise, * the argument may be one of "UTF_8", "ISO_8859_1" (ie, Latin 1), * "ISO_8859_2" (ie, Latin 2) or "KOI8_R" (Russian). Note that * case is significant in this parameter. * * @return NULL if the specified algorithm is not recognised, or the * algorithm is not available for the requested encoding. Otherwise, * returns a pointer to a newly created stemmer for the requested algorithm. * The returned pointer must be deleted by calling sb_stemmer_delete(). * * @note NULL will also be returned if an out of memory error occurs. 
*/ struct sb_stemmer * sb_stemmer_new(const char * algorithm, const char * charenc); /** Delete a stemmer object. * * This frees all resources allocated for the stemmer. After calling * this function, the supplied stemmer may no longer be used in any way. * * It is safe to pass a null pointer to this function - this will have * no effect. */ void sb_stemmer_delete(struct sb_stemmer * stemmer); /** Stem a word. * * The return value is owned by the stemmer - it must not be freed or * modified, and it will become invalid when the stemmer is called again, * or if the stemmer is freed. * * The length of the return value can be obtained using sb_stemmer_length(). * * If an out-of-memory error occurs, this will return NULL. */ const sb_symbol * sb_stemmer_stem(struct sb_stemmer * stemmer, const sb_symbol * word, int size); /** Get the length of the result of the last stemmed word. * This should not be called before sb_stemmer_stem() has been called. */ int sb_stemmer_length(struct sb_stemmer * stemmer); #ifdef __cplusplus } #endif snowball_code/algorithms/0000755000175000017500000000000012707117052014164 5ustar domdomsnowball_code/algorithms/portuguese/0000755000175000017500000000000012707117052016366 5ustar domdomsnowball_code/algorithms/portuguese/stem_MS_DOS_Latin_I.sbl0000644000175000017500000001275412707117052022554 0ustar domdomroutines ( prelude postlude mark_regions RV R1 R2 standard_suffix verb_suffix residual_suffix residual_form ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a' hex 'A0' // a-acute stringdef a^ hex '83' // a-circumflex e.g. 
'bota^nico stringdef e' hex '82' // e-acute stringdef e^ hex '88' // e-circumflex stringdef i' hex 'A1' // i-acute stringdef o^ hex '93' // o-circumflex stringdef o' hex 'A2' // o-acute stringdef u' hex 'A3' // u-acute stringdef c, hex '87' // c-cedilla stringdef a~ hex 'C6' // a-tilde stringdef o~ hex 'E4' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' define prelude as repeat ( [substring] among( '{a~}' (<- 'a~') '{o~}' (<- 'o~') '' (next) ) //or next ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'a~' (<- '{a~}') 'o~' (<- '{o~}') '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'eza' 'ezas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' '{a'}vel' '{i'}vel' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amento' 'amentos' 'imento' 'imentos' 'adora' 'ador' 'a{c,}a~o' 'adoras' 'adores' 'a{c,}o~es' // no -ic test 'ante' 'antes' '{a^}ncia' // Note 1 ( R2 delete ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' ( R2 <- 'u' ) '{e^}ncia' '{e^}ncias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'avel' '{i'}vel' (R2 delete) ) ) ) 'idade' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) 'ira' 'iras' ( RV 'e' // -eira -eiras usually non-verbal <- 'ir' ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 
'este' 'iste' 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' 'ira' 'iras' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) ) ) define residual_form as ( [substring] among( 'e' '{e'}' '{e^}' ( RV delete [('u'] test 'g') or ('i'] test 'c') RV delete ) '{c,}' (<-'c') ) ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or verb_suffix ) and do ( ['i'] test 'c' RV delete ) ) or residual_suffix ) do residual_form ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball_code/algorithms/portuguese/stem_ISO_8859_1.sbl0000644000175000017500000001275112707117052021475 0ustar domdomroutines ( prelude postlude mark_regions RV R1 R2 standard_suffix verb_suffix residual_suffix residual_form ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a' hex 'E1' // a-acute stringdef a^ hex 'E2' // a-circumflex e.g. 
'bota^nico stringdef e' hex 'E9' // e-acute stringdef e^ hex 'EA' // e-circumflex stringdef i' hex 'ED' // i-acute stringdef o^ hex 'F4' // o-circumflex stringdef o' hex 'F3' // o-acute stringdef u' hex 'FA' // u-acute stringdef c, hex 'E7' // c-cedilla stringdef a~ hex 'E3' // a-tilde stringdef o~ hex 'F5' // o-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}' define prelude as repeat ( [substring] among( '{a~}' (<- 'a~') '{o~}' (<- 'o~') '' (next) ) //or next ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'a~' (<- '{a~}') 'o~' (<- '{o~}') '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'eza' 'ezas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' '{a'}vel' '{i'}vel' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amento' 'amentos' 'imento' 'imentos' 'adora' 'ador' 'a{c,}a~o' 'adoras' 'adores' 'a{c,}o~es' // no -ic test 'ante' 'antes' '{a^}ncia' // Note 1 ( R2 delete ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' ( R2 <- 'u' ) '{e^}ncia' '{e^}ncias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'avel' '{i'}vel' (R2 delete) ) ) ) 'idade' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) 'ira' 'iras' ( RV 'e' // -eira -eiras usually non-verbal <- 'ir' ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}' 'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 
'este' 'iste' 'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam' 'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem' 'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o' 'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias' 'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras' 'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres' 'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is' 'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis' '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis' '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos' '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos' 'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos' 'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos' '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou' 'ira' 'iras' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'i' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) ) ) define residual_form as ( [substring] among( 'e' '{e'}' '{e^}' ( RV delete [('u'] test 'g') or ('i'] test 'c') RV delete ) '{c,}' (<-'c') ) ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or verb_suffix ) and do ( ['i'] test 'c' RV delete ) ) or residual_suffix ) do residual_form ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball_code/algorithms/swedish/0000755000175000017500000000000012707117052015632 5ustar domdomsnowball_code/algorithms/swedish/stem_MS_DOS_Latin_I.sbl0000644000175000017500000000273012707117052022011 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a" hex '84' stringdef ao hex '86' stringdef o" hex '94' define v 'aeiouy{a"}{ao}{o"}' define s_ending 'bcdfghjklmnoprtvy' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) 
backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' 'hetens' 'erns' 'at' 'andet' 'het' 'ast' (delete) 's' (s_ending delete) ) ) define consonant_pair as setlimit tomark p1 for ( among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') and ([next] delete) ) define other_suffix as setlimit tomark p1 for ( [substring] among( 'lig' 'ig' 'els' (delete) 'l{o"}st' (<-'l{o"}s') 'fullt' (<-'full') ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball_code/algorithms/swedish/stem_ISO_8859_1.sbl0000644000175000017500000000272512707117052020741 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a" hex 'E4' stringdef ao hex 'E5' stringdef o" hex 'F6' define v 'aeiouy{a"}{ao}{o"}' define s_ending 'bcdfghjklmnoprtvy' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'arna' 'erna' 'heterna' 'orna' 'ad' 'e' 'ade' 'ande' 'arne' 'are' 'aste' 'en' 'anden' 'aren' 'heten' 'ern' 'ar' 'er' 'heter' 'or' 'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes' 'ens' 'arens' 'hetens' 'erns' 'at' 'andet' 'het' 'ast' (delete) 's' (s_ending delete) ) ) define consonant_pair as setlimit tomark p1 for ( among('dd' 'gd' 'nn' 'dt' 'gt' 'kt' 'tt') and ([next] delete) ) define other_suffix as setlimit tomark p1 for ( [substring] among( 'lig' 'ig' 'els' (delete) 'l{o"}st' (<-'l{o"}s') 'fullt' (<-'full') ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) 
snowball_code/algorithms/spanish/0000755000175000017500000000000012707117052015631 5ustar domdomsnowball_code/algorithms/spanish/stem_MS_DOS_Latin_I.sbl0000644000175000017500000001336412707117052022015 0ustar domdomroutines ( postlude mark_regions RV R1 R2 attached_pronoun standard_suffix y_verb_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a' hex 'A0' // a-acute stringdef e' hex '82' // e-acute stringdef i' hex 'A1' // i-acute stringdef o' hex 'A2' // o-acute stringdef u' hex 'A3' // u-acute stringdef u" hex '81' // u-diaeresis stringdef n~ hex 'A4' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( '{a'}' (<- 'a') '{e'}' (<- 'e') '{i'}' (<- 'i') '{o'}' (<- 'o') '{u'}' (<- 'u') // and possibly {u"}->u here, or in prelude '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' 'las' 'les' 'los' 'nos' ) substring RV among( 'i{e'}ndo' (] <- 'iendo') '{a'}ndo' (] <- 'ando') '{a'}r' (] <- 'ar') '{e'}r' (] <- 'er') '{i'}r' (] <- 'ir') 'ando' 'iendo' 'ar' 'er' 'ir' (delete) 'yendo' ('u' delete) ) ) define standard_suffix as ( [substring] among( 'anza' 'anzas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' 'able' 'ables' 'ible' 'ibles' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amiento' 'amientos' 'imiento' 'imientos' ( R2 delete ) 'adora' 'ador' 'aci{o'}n' 'adoras' 'adores' 'aciones' 'ante' 'antes' 'ancia' 'ancias'// Note 1 ( R2 delete try ( ['ic'] R2 delete ) ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 
'uci{o'}n' 'uciones' ( R2 <- 'u' ) 'encia' 'encias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'able' 'ible' (R2 delete) ) ) ) 'idad' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) ) ) define y_verb_suffix as ( setlimit tomark pV for ([substring]) among( 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' 'yas' 'yes' 'yais' 'yamos' ('u' delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among( 'en' 'es' '{e'}is' 'emos' (try ('u' test 'g') ] delete) 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' 'ar{e'}' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) 'e' '{e'}' ( RV delete try( ['u'] test 'g' RV delete ) ) ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or y_verb_suffix or verb_suffix ) do residual_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ 
snowball_code/algorithms/spanish/stem_ISO_8859_1.sbl0000644000175000017500000001336112707117052020736 0ustar domdomroutines ( postlude mark_regions RV R1 R2 attached_pronoun standard_suffix y_verb_suffix verb_suffix residual_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a' hex 'E1' // a-acute stringdef e' hex 'E9' // e-acute stringdef i' hex 'ED' // i-acute stringdef o' hex 'F3' // o-acute stringdef u' hex 'FA' // u-acute stringdef u" hex 'FC' // u-diaeresis stringdef n~ hex 'F1' // n-tilde define v 'aeiou{a'}{e'}{i'}{o'}{u'}{u"}' define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( '{a'}' (<- 'a') '{e'}' (<- 'e') '{i'}' (<- 'i') '{o'}' (<- 'o') '{u'}' (<- 'u') // and possibly {u"}->u here, or in prelude '' (next) ) //or next ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'me' 'se' 'sela' 'selo' 'selas' 'selos' 'la' 'le' 'lo' 'las' 'les' 'los' 'nos' ) substring RV among( 'i{e'}ndo' (] <- 'iendo') '{a'}ndo' (] <- 'ando') '{a'}r' (] <- 'ar') '{e'}r' (] <- 'er') '{i'}r' (] <- 'ir') 'ando' 'iendo' 'ar' 'er' 'ir' (delete) 'yendo' ('u' delete) ) ) define standard_suffix as ( [substring] among( 'anza' 'anzas' 'ico' 'ica' 'icos' 'icas' 'ismo' 'ismos' 'able' 'ables' 'ible' 'ibles' 'ista' 'istas' 'oso' 'osa' 'osos' 'osas' 'amiento' 'amientos' 'imiento' 'imientos' ( R2 delete ) 'adora' 'ador' 'aci{o'}n' 'adoras' 'adores' 'aciones' 'ante' 'antes' 'ancia' 'ancias'// Note 1 ( R2 delete try ( ['ic'] R2 delete ) ) 'log{i'}a' 'log{i'}as' ( R2 <- 'log' ) 'uci{o'}n' 'uciones' ( R2 <- 'u' ) 'encia' 'encias' ( R2 <- 'ente' ) 'amente' ( R1 delete try ( 
[substring] R2 delete among( 'iv' (['at'] R2 delete) 'os' 'ic' 'ad' ) ) ) 'mente' ( R2 delete try ( [substring] among( 'ante' // Note 1 'able' 'ible' (R2 delete) ) ) ) 'idad' 'idades' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'iva' 'ivo' 'ivas' 'ivos' ( R2 delete try ( ['at'] R2 delete // but not a further ['ic'] R2 delete ) ) ) ) define y_verb_suffix as ( setlimit tomark pV for ([substring]) among( 'ya' 'ye' 'yan' 'yen' 'yeron' 'yendo' 'yo' 'y{o'}' 'yas' 'yes' 'yais' 'yamos' ('u' delete) ) ) define verb_suffix as ( setlimit tomark pV for ([substring]) among( 'en' 'es' '{e'}is' 'emos' (try ('u' test 'g') ] delete) 'ar{i'}an' 'ar{i'}as' 'ar{a'}n' 'ar{a'}s' 'ar{i'}ais' 'ar{i'}a' 'ar{e'}is' 'ar{i'}amos' 'aremos' 'ar{a'}' 'ar{e'}' 'er{i'}an' 'er{i'}as' 'er{a'}n' 'er{a'}s' 'er{i'}ais' 'er{i'}a' 'er{e'}is' 'er{i'}amos' 'eremos' 'er{a'}' 'er{e'}' 'ir{i'}an' 'ir{i'}as' 'ir{a'}n' 'ir{a'}s' 'ir{i'}ais' 'ir{i'}a' 'ir{e'}is' 'ir{i'}amos' 'iremos' 'ir{a'}' 'ir{e'}' 'aba' 'ada' 'ida' '{i'}a' 'ara' 'iera' 'ad' 'ed' 'id' 'ase' 'iese' 'aste' 'iste' 'an' 'aban' '{i'}an' 'aran' 'ieran' 'asen' 'iesen' 'aron' 'ieron' 'ado' 'ido' 'ando' 'iendo' 'i{o'}' 'ar' 'er' 'ir' 'as' 'abas' 'adas' 'idas' '{i'}as' 'aras' 'ieras' 'ases' 'ieses' '{i'}s' '{a'}is' 'abais' '{i'}ais' 'arais' 'ierais' 'aseis' 'ieseis' 'asteis' 'isteis' 'ados' 'idos' 'amos' '{a'}bamos' '{i'}amos' 'imos' '{a'}ramos' 'i{e'}ramos' 'i{e'}semos' '{a'}semos' (delete) ) ) define residual_suffix as ( [substring] among( 'os' 'a' 'o' '{a'}' '{i'}' '{o'}' ( RV delete ) 'e' '{e'}' ( RV delete try( ['u'] test 'g' RV delete ) ) ) ) ) define stem as ( do mark_regions backwards ( do attached_pronoun do ( standard_suffix or y_verb_suffix or verb_suffix ) do residual_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball_code/algorithms/hungarian/0000755000175000017500000000000012707117052016140 5ustar 
domdomsnowball_code/algorithms/hungarian/stem_ISO_8859_1.sbl0000644000175000017500000001225712707117052021250 0ustar domdom/* Hungarian Stemmer Removes noun inflections */ routines ( mark_regions R1 v_ending case case_special case_other plural owned sing_owner plur_owner instrum factive undouble double ) externals ( stem ) integers ( p1 ) groupings ( v ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a' hex 'E1' //a-acute stringdef e' hex 'E9' //e-acute stringdef i' hex 'ED' //i-acute stringdef o' hex 'F3' //o-acute stringdef o" hex 'F6' //o-umlaut stringdef oq hex 'F5' //o-double acute stringdef u' hex 'FA' //u-acute stringdef u" hex 'FC' //u-umlaut stringdef uq hex 'FB' //u-double acute define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}' define mark_regions as ( $p1 = limit (v goto non-v among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next setmark p1) or (non-v gopast v setmark p1) ) backwardmode ( define R1 as $p1 <= cursor define v_ending as ( [substring] R1 among( '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define double as ( test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm' 'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs') ) define undouble as ( next [hop 1] delete ) define instrum as( [substring] R1 among( 'al' (double) 'el' (double) ) delete undouble ) define case as ( [substring] R1 among( 'ban' 'ben' 'ba' 'be' 'ra' 're' 'nak' 'nek' 'val' 'vel' 't{o'}l' 't{oq}l' 'r{o'}l' 'r{oq}l' 'b{o'}l' 'b{oq}l' 'hoz' 'hez' 'h{o"}z' 'n{a'}l' 'n{e'}l' 'ig' 'at' 'et' 'ot' '{o"}t' '{e'}rt' 'k{e'}pp' 'k{e'}ppen' 'kor' 'ul' '{u"}l' 'v{a'}' 'v{e'}' 'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt' 'k{e'}nt' 'en' 'on' 'an' '{o"}n' 'n' 't' ) delete v_ending ) define case_special as( [substring] R1 among( '{e'}n' (<- 'e') '{a'}n' (<- 'a') '{a'}nk{e'}nt' (<- 'a') ) ) define case_other as( [substring] R1 among( 'astul' 'est{u"}l' (delete) 'stul' 'st{u"}l' (delete) '{a'}stul' (<- 'a') '{e'}st{u"}l' (<- 'e') ) ) define factive as( [substring] R1 
among( '{a'}' (double) '{e'}' (double) ) delete undouble ) define plural as ( [substring] R1 among( '{a'}k' (<- 'a') '{e'}k' (<- 'e') '{o"}k' (delete) 'ak' (delete) 'ok' (delete) 'ek' (delete) 'k' (delete) ) ) define owned as ( [substring] R1 among ( 'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete) '{e'}k{e'}' (<- 'e') '{a'}k{e'}' (<- 'a') 'k{e'}' (delete) '{e'}{e'}i' (<- 'e') '{a'}{e'}i' (<- 'a') '{e'}i' (delete) '{e'}{e'}' (<- 'e') '{e'}' (delete) ) ) define sing_owner as ( [substring] R1 among( '{u"}nk' 'unk' (delete) '{a'}nk' (<- 'a') '{e'}nk' (<- 'e') 'nk' (delete) '{a'}juk' (<- 'a') '{e'}j{u"}k' (<- 'e') 'juk' 'j{u"}k' (delete) 'uk' '{u"}k' (delete) 'em' 'om' 'am' (delete) '{a'}m' (<- 'a') '{e'}m' (<- 'e') 'm' (delete) 'od' 'ed' 'ad' '{o"}d' (delete) '{a'}d' (<- 'a') '{e'}d' (<- 'e') 'd' (delete) 'ja' 'je' (delete) 'a' 'e' 'o' (delete) '{a'}' (<- 'a') '{e'}' (<- 'e') ) ) define plur_owner as ( [substring] R1 among( 'jaim' 'jeim' (delete) '{a'}im' (<- 'a') '{e'}im' (<- 'e') 'aim' 'eim' (delete) 'im' (delete) 'jaid' 'jeid' (delete) '{a'}id' (<- 'a') '{e'}id' (<- 'e') 'aid' 'eid' (delete) 'id' (delete) 'jai' 'jei' (delete) '{a'}i' (<- 'a') '{e'}i' (<- 'e') 'ai' 'ei' (delete) 'i' (delete) 'jaink' 'jeink' (delete) 'eink' 'aink' (delete) '{a'}ink' (<- 'a') '{e'}ink' (<- 'e') 'ink' 'jaitok' 'jeitek' (delete) 'aitok' 'eitek' (delete) '{a'}itok' (<- 'a') '{e'}itek' (<- 'e') 'itek' (delete) 'jeik' 'jaik' (delete) 'aik' 'eik' (delete) '{a'}ik' (<- 'a') '{e'}ik' (<- 'e') 'ik' (delete) ) ) ) define stem as ( do mark_regions backwards ( do instrum do case do case_special do case_other do factive do owned do sing_owner do plur_owner do plural ) ) snowball_code/algorithms/romanian/0000755000175000017500000000000012707117052015770 5ustar domdomsnowball_code/algorithms/romanian/stem_Unicode.sbl0000644000175000017500000001410212707117052021106 0ustar domdom routines ( prelude postlude mark_regions RV R1 R2 step_0 standard_suffix combo_suffix verb_suffix vowel_suffix ) externals 
( stem ) integers ( pV p1 p2 ) groupings ( v ) booleans ( standard_suffix_removed ) stringescapes {} /* special characters */ stringdef a^ hex '0E2' // a circumflex stringdef i^ hex '0EE' // i circumflex stringdef a+ hex '103' // a breve stringdef s, hex '15F' // s cedilla stringdef t, hex '163' // t cedilla define v 'aeiou{a^}{i^}{a+}' define prelude as ( repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define step_0 as ( [substring] R1 among( 'ul' 'ului' ( delete ) 'aua' ( <-'a' ) 'ea' 'ele' 'elor' ( <-'e' ) 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' ( <-'i') 'ile' ( not 'ab' <- 'i' ) 'atei' ( <- 'at' ) 'a{t,}ie' 'a{t,}ia' ( <- 'a{t,}i' ) ) ) define combo_suffix as test ( [substring] R1 ( among( /* 'IST'. 
alternative: include the following 'alism' 'alisme' 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( <- 'al' ) */ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( <- 'abil' ) 'ibilitate' ( <- 'ibil' ) 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( <- 'iv' ) 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' 'icator' 'icatori' 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( <- 'ic' ) 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' 'atoare' 'ator' 'atori' '{a+}toare' '{a+}tor' '{a+}tori' ( <- 'at' ) 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' 'itoare' 'itor' 'itori' ( <- 'it' ) ) set standard_suffix_removed ) ) define standard_suffix as ( unset standard_suffix_removed repeat combo_suffix [substring] R2 ( among( // past participle is treated here, rather than // as a verb ending: 'at' 'ata' 'at{a+}' 'ati' 'ate' 'ut' 'uta' 'ut{a+}' 'uti' 'ute' 'it' 'ita' 'it{a+}' 'iti' 'ite' 'ic' 'ica' 'ice' 'ici' 'ic{a+}' 'abil' 'abila' 'abile' 'abili' 'abil{a+}' 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' 'ant' 'anta' 'ante' 'anti' 'ant{a+}' 'ator' 'atori' 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( delete ) 'iune' 'iuni' ( '{t,}'] <- 't' ) 'ism' 'isme' 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( <- 'ist' /* 'IST'. 
alternative: remove with <- '' */ ) ) set standard_suffix_removed ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( // 'long' infinitive: 'are' 'ere' 'ire' '{a^}re' // gerund: 'ind' '{a^}nd' 'indu' '{a^}ndu' 'eze' 'easc{a+}' // present: 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' 'e{s,}te' '{a+}sc' '{a+}{s,}ti' '{a+}{s,}te' // imperfect: 'am' 'ai' 'au' 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' // past: // (not 'ii') 'ui' 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' // pluferfect: 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' '{a^}ser{a+}' 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' ( non-v or 'u' delete ) // present: '{a+}m' 'a{t,}i' 'em' 'e{t,}i' 'im' 'i{t,}i' '{a^}m' '{a^}{t,}i' // past: 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' 'sei' 'se' // pluperfect: 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' (delete) ) ) define vowel_suffix as ( [substring] RV among ( 'a' 'e' 'i' 'ie' '{a+}' ( delete ) ) ) ) define stem as ( do prelude do mark_regions backwards ( do step_0 do standard_suffix do ( standard_suffix_removed or verb_suffix ) do vowel_suffix ) do postlude ) snowball_code/algorithms/romanian/stem_ISO_8859_2.sbl0000644000175000017500000001407512707117052021101 0ustar domdom routines ( prelude postlude mark_regions RV R1 R2 step_0 standard_suffix combo_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v ) booleans ( standard_suffix_removed ) stringescapes {} /* special characters */ stringdef a^ hex 'E2' // a circumflex stringdef i^ hex 'EE' // i circumflex stringdef a+ hex 'E3' // a breve stringdef s, hex 'BA' // s cedilla stringdef t, hex 'FE' // t cedilla 
define v 'aeiou{a^}{i^}{a+}' define prelude as ( repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define step_0 as ( [substring] R1 among( 'ul' 'ului' ( delete ) 'aua' ( <-'a' ) 'ea' 'ele' 'elor' ( <-'e' ) 'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor' ( <-'i') 'ile' ( not 'ab' <- 'i' ) 'atei' ( <- 'at' ) 'a{t,}ie' 'a{t,}ia' ( <- 'a{t,}i' ) ) ) define combo_suffix as test ( [substring] R1 ( among( /* 'IST'. alternative: include the following 'alism' 'alisme' 'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' ( <- 'al' ) */ 'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' ( <- 'abil' ) 'ibilitate' ( <- 'ibil' ) 'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' ( <- 'iv' ) 'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i' 'icator' 'icatori' 'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}' 'ical' 'icala' 'icale' 'icali' 'ical{a+}' ( <- 'ic' ) 'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune' 'atoare' 'ator' 'atori' '{a+}toare' '{a+}tor' '{a+}tori' ( <- 'at' ) 'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune' 'itoare' 'itor' 'itori' ( <- 'it' ) ) set standard_suffix_removed ) ) define standard_suffix as ( unset standard_suffix_removed repeat combo_suffix [substring] R2 ( among( // past participle is treated here, rather than // as a verb ending: 'at' 'ata' 'at{a+}' 'ati' 'ate' 'ut' 'uta' 'ut{a+}' 'uti' 'ute' 'it' 'ita' 'it{a+}' 'iti' 'ite' 'ic' 'ica' 'ice' 'ici' 'ic{a+}' 'abil' 'abila' 'abile' 'abili' 'abil{a+}' 'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}' 'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i' 'ant' 'anta' 'ante' 'anti' 
'ant{a+}' 'ator' 'atori' 'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i' 'iv' 'iva' 'ive' 'ivi' 'iv{a+}' ( delete ) 'iune' 'iuni' ( '{t,}'] <- 't' ) 'ism' 'isme' 'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' ( <- 'ist' /* 'IST'. alternative: remove with <- '' */ ) ) set standard_suffix_removed ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( // 'long' infinitive: 'are' 'ere' 'ire' '{a^}re' // gerund: 'ind' '{a^}nd' 'indu' '{a^}ndu' 'eze' 'easc{a+}' // present: 'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti' 'e{s,}te' '{a+}sc' '{a+}{s,}ti' '{a+}{s,}te' // imperfect: 'am' 'ai' 'au' 'eam' 'eai' 'ea' 'ea{t,}i' 'eau' 'iam' 'iai' 'ia' 'ia{t,}i' 'iau' // past: // (not 'ii') 'ui' 'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}' 'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}' 'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}' '{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}' // pluferfect: 'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}' 'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}' '{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i' '{a^}ser{a+}' 'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}' ( non-v or 'u' delete ) // present: '{a+}m' 'a{t,}i' 'em' 'e{t,}i' 'im' 'i{t,}i' '{a^}m' '{a^}{t,}i' // past: 'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}' 'sei' 'se' // pluperfect: 'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}' (delete) ) ) define vowel_suffix as ( [substring] RV among ( 'a' 'e' 'i' 'ie' '{a+}' ( delete ) ) ) ) define stem as ( do prelude do mark_regions backwards ( do step_0 do standard_suffix do ( standard_suffix_removed or verb_suffix ) do vowel_suffix ) do postlude ) snowball_code/algorithms/turkish/0000755000175000017500000000000012707117052015655 5ustar domdomsnowball_code/algorithms/turkish/stem_Unicode.sbl0000644000175000017500000003103412707117052020776 0ustar domdom/* Stemmer for Turkish * author: Evren (Kapusuz) Çilden * email: evren.kapusuz at gmail.com * version: 1.0 
(15.01.2007) * stems nominal verb suffixes * stems nominal inflections * more than one syllable word check * (y,n,s,U) context check * vowel harmony check * last consonent check and conversion (b, c, d, ğ to p, ç, t, k) * The stemming algorithm is based on the paper "An Affix Stripping * Morphological Analyzer for Turkish" by Gülşen Eryiğit and * Eşref Adalı (Proceedings of the IAESTED International Conference * ARTIFICIAL INTELLIGENCE AND APPLICATIONS, February 16-18,2004, * Innsbruck, Austria * Turkish is an agglutinative language and has a very rich morphological * structure. In Turkish, you can form many different words from a single stem * by appending a sequence of suffixes. Eg. The word "doktoruymuşsunuz" means * "You had been the doctor of him". The stem of the word is "doktor" and it * takes three different suffixes -sU, -ymUs, and -sUnUz. The rules about * the append order of suffixes can be clearly described as FSMs. * The paper referenced above defines some FSMs for right to left * morphological analysis. I generated a method for constructing snowball * expressions from right to left FSMs for stemming suffixes. 
*/ routines ( append_U_to_stems_ending_with_d_or_g // for preventing some overstemmings check_vowel_harmony // tests vowel harmony for suffixes is_reserved_word // tests whether current string is a reserved word ('ad','soyad') mark_cAsInA // nominal verb suffix mark_DA // noun suffix mark_DAn // noun suffix mark_DUr // nominal verb suffix mark_ki // noun suffix mark_lAr // noun suffix, nominal verb suffix mark_lArI // noun suffix mark_nA // noun suffix mark_ncA // noun suffix mark_ndA // noun suffix mark_ndAn // noun suffix mark_nU // noun suffix mark_nUn // noun suffix mark_nUz // nominal verb suffix mark_sU // noun suffix mark_sUn // nominal verb suffix mark_sUnUz // nominal verb suffix mark_possessives // -(U)m,-(U)n,-(U)mUz,-(U)nUz, mark_yA // noun suffix mark_ylA // noun suffix mark_yU // noun suffix mark_yUm // nominal verb suffix mark_yUz // nominal verb suffix mark_yDU // nominal verb suffix mark_yken // nominal verb suffix mark_ymUs_ // nominal verb suffix mark_ysA // nominal verb suffix mark_suffix_with_optional_y_consonant mark_suffix_with_optional_U_vowel mark_suffix_with_optional_n_consonant mark_suffix_with_optional_s_consonant more_than_one_syllable_word post_process_last_consonants postlude stem_nominal_verb_suffixes stem_noun_suffixes stem_suffix_chain_before_ki ) /* Special characters in Unicode Latin-1 and Latin Extended-A */ stringdef c. hex 'E7' // LATIN SMALL LETTER C WITH CEDILLA stringdef g~ hex '011F' // LATIN SMALL LETTER G WITH BREVE stringdef i' hex '0131' // LATIN SMALL LETTER I WITHOUT DOT stringdef o" hex 'F6' // LATIN SMALL LETTER O WITH DIAERESIS stringdef s. 
hex '015F' // LATIN SMALL LETTER S WITH CEDILLA stringdef u" hex 'FC' // LATIN SMALL LETTER U WITH DIAERESIS stringescapes { } integers ( strlen ) // length of a string booleans ( continue_stemming_noun_suffixes ) groupings ( vowel U vowel1 vowel2 vowel3 vowel4 vowel5 vowel6) define vowel 'ae{i'}io{o"}u{u"}' define U '{i'}iu{u"}' // the vowel grouping definitions below are used for checking vowel harmony define vowel1 'a{i'}ou' // vowels that can end with suffixes containing 'a' define vowel2 'ei{o"}{u"}' // vowels that can end with suffixes containing 'e' define vowel3 'a{i'}' // vowels that can end with suffixes containing 'i'' define vowel4 'ei' // vowels that can end with suffixes containing 'i' define vowel5 'ou' // vowels that can end with suffixes containing 'o' or 'u' define vowel6 '{o"}{u"}' // vowels that can end with suffixes containing 'o"' or 'u"' externals ( stem ) backwardmode ( // checks vowel harmony for possible suffixes, // helps to detect whether the candidate for suffix applies to vowel harmony // this rule is added to prevent over stemming define check_vowel_harmony as ( test ( (goto vowel) // if there is a vowel ( ('a' goto vowel1) or ('e' goto vowel2) or ('{i'}' goto vowel3) or ('i' goto vowel4) or ('o' goto vowel5) or ('{o"}' goto vowel6) or ('u' goto vowel5) or ('{u"}' goto vowel6) ) ) ) // if the last consonant before suffix is vowel and n then advance and delete // if the last consonant before suffix is non vowel and n do nothing // if the last consonant before suffix is not n then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_n_consonant as ( ((test 'n') next (test vowel)) or ((not(test 'n')) test(next (test vowel))) ) // if the last consonant before suffix is vowel and s then advance and delete // if the last consonant before suffix is non vowel and s do nothing // if the last consonant before suffix is not s then only delete the suffix // assumption: slice beginning is set 
correctly define mark_suffix_with_optional_s_consonant as ( ((test 's') next (test vowel)) or ((not(test 's')) test(next (test vowel))) ) // if the last consonant before suffix is vowel and y then advance and delete // if the last consonant before suffix is non vowel and y do nothing // if the last consonant before suffix is not y then only delete the suffix // assumption: slice beginning is set correctly define mark_suffix_with_optional_y_consonant as ( ((test 'y') next (test vowel)) or ((not(test 'y')) test(next (test vowel))) ) define mark_suffix_with_optional_U_vowel as ( ((test U) next (test non-vowel)) or ((not(test U)) test(next (test non-vowel))) ) define mark_possessives as ( among ('m{i'}z' 'miz' 'muz' 'm{u"}z' 'n{i'}z' 'niz' 'nuz' 'n{u"}z' 'm' 'n') (mark_suffix_with_optional_U_vowel) ) define mark_sU as ( check_vowel_harmony U (mark_suffix_with_optional_s_consonant) ) define mark_lArI as ( among ('leri' 'lar{i'}') ) define mark_yU as ( check_vowel_harmony U (mark_suffix_with_optional_y_consonant) ) define mark_nU as ( check_vowel_harmony among ('n{i'}' 'ni' 'nu' 'n{u"}') ) define mark_nUn as ( check_vowel_harmony among ('{i'}n' 'in' 'un' '{u"}n') (mark_suffix_with_optional_n_consonant) ) define mark_yA as ( check_vowel_harmony among('a' 'e') (mark_suffix_with_optional_y_consonant) ) define mark_nA as ( check_vowel_harmony among('na' 'ne') ) define mark_DA as ( check_vowel_harmony among('da' 'de' 'ta' 'te') ) define mark_ndA as ( check_vowel_harmony among('nda' 'nde') ) define mark_DAn as ( check_vowel_harmony among('dan' 'den' 'tan' 'ten') ) define mark_ndAn as ( check_vowel_harmony among('ndan' 'nden') ) define mark_ylA as ( check_vowel_harmony among('la' 'le') (mark_suffix_with_optional_y_consonant) ) define mark_ki as ( 'ki' ) define mark_ncA as ( check_vowel_harmony among('ca' 'ce') (mark_suffix_with_optional_n_consonant) ) define mark_yUm as ( check_vowel_harmony among ('{i'}m' 'im' 'um' '{u"}m') (mark_suffix_with_optional_y_consonant) ) define 
mark_sUn as ( check_vowel_harmony among ('s{i'}n' 'sin' 'sun' 's{u"}n' ) ) define mark_yUz as ( check_vowel_harmony among ('{i'}z' 'iz' 'uz' '{u"}z') (mark_suffix_with_optional_y_consonant) ) define mark_sUnUz as ( among ('s{i'}n{i'}z' 'siniz' 'sunuz' 's{u"}n{u"}z') ) define mark_lAr as ( check_vowel_harmony among ('ler' 'lar') ) define mark_nUz as ( check_vowel_harmony among ('n{i'}z' 'niz' 'nuz' 'n{u"}z') ) define mark_DUr as ( check_vowel_harmony among ('t{i'}r' 'tir' 'tur' 't{u"}r' 'd{i'}r' 'dir' 'dur' 'd{u"}r') ) define mark_cAsInA as ( among ('cas{i'}na' 'cesine') ) define mark_yDU as ( check_vowel_harmony among ('t{i'}m' 'tim' 'tum' 't{u"}m' 'd{i'}m' 'dim' 'dum' 'd{u"}m' 't{i'}n' 'tin' 'tun' 't{u"}n' 'd{i'}n' 'din' 'dun' 'd{u"}n' 't{i'}k' 'tik' 'tuk' 't{u"}k' 'd{i'}k' 'dik' 'duk' 'd{u"}k' 't{i'}' 'ti' 'tu' 't{u"}' 'd{i'}' 'di' 'du' 'd{u"}') (mark_suffix_with_optional_y_consonant) ) // does not fully obey vowel harmony define mark_ysA as ( among ('sam' 'san' 'sak' 'sem' 'sen' 'sek' 'sa' 'se') (mark_suffix_with_optional_y_consonant) ) define mark_ymUs_ as ( check_vowel_harmony among ('m{i'}{s.}' 'mi{s.}' 'mu{s.}' 'm{u"}{s.}') (mark_suffix_with_optional_y_consonant) ) define mark_yken as ( 'ken' (mark_suffix_with_optional_y_consonant) ) define stem_nominal_verb_suffixes as ( [ set continue_stemming_noun_suffixes (mark_ymUs_ or mark_yDU or mark_ysA or mark_yken) or (mark_cAsInA (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_) or ( mark_lAr ] delete try([(mark_DUr or mark_yDU or mark_ysA or mark_ymUs_)) unset continue_stemming_noun_suffixes ) or (mark_nUz (mark_yDU or mark_ysA)) or ((mark_sUnUz or mark_yUz or mark_sUn or mark_yUm) ] delete try([ mark_ymUs_)) or (mark_DUr ] delete try([ (mark_sUnUz or mark_lAr or mark_yUm or mark_sUn or mark_yUz or true) mark_ymUs_)) ]delete ) // stems noun suffix chains ending with -ki define stem_suffix_chain_before_ki as ( [ mark_ki ( (mark_DA] delete try([ (mark_lAr] delete 
try(stem_suffix_chain_before_ki)) or (mark_possessives] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) )) or (mark_nUn] delete try([ (mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) or (mark_ndA ( (mark_lArI] delete) or ((mark_sU] delete try([mark_lAr]delete stem_suffix_chain_before_ki))) or (stem_suffix_chain_before_ki) )) ) ) define stem_noun_suffixes as ( ([mark_lAr] delete try(stem_suffix_chain_before_ki)) or ([mark_ncA] delete try( ([mark_lArI] delete) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or ([mark_lAr] delete stem_suffix_chain_before_ki) ) ) or ([(mark_ndA or mark_nA) ( (mark_lArI] delete) or (mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) ) ) or ([(mark_ndAn or mark_nU) ((mark_sU ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lArI))) or ( [mark_DAn] delete try ([ ( (mark_possessives ] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or (mark_lAr] delete try(stem_suffix_chain_before_ki)) or (stem_suffix_chain_before_ki) )) ) or ([mark_nUn or mark_ylA] delete try( ([mark_lAr] delete stem_suffix_chain_before_ki) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) or stem_suffix_chain_before_ki ) ) or ([mark_lArI] delete) or (stem_suffix_chain_before_ki) or ([mark_DA or mark_yU or mark_yA] delete try([((mark_possessives] delete try([mark_lAr)) or mark_lAr) ] delete [ stem_suffix_chain_before_ki)) or ([mark_possessives or mark_sU] delete try([mark_lAr] delete stem_suffix_chain_before_ki)) ) define post_process_last_consonants as ( [substring] among ( 'b' (<- 'p') 'c' (<- '{c.}') 'd' (<- 't') '{g~}' (<- 'k') ) ) // after stemming if the word ends with 'd' or 'g' most probably last U is overstemmed // like in 'kedim' -> 'ked' // Turkish words don't usually end with 'd' or 
'g' // some very well known words are ignored (like 'ad' 'soyad') // appends U to stems ending with d or g, decides which vowel to add // based on the last vowel in the stem define append_U_to_stems_ending_with_d_or_g as ( test('d' or 'g') (test((goto vowel) 'a' or '{i'}') <+ '{i'}') or (test((goto vowel) 'e' or 'i') <+ 'i') or (test((goto vowel) 'o' or 'u') <+ 'u') or (test((goto vowel) '{o"}' or '{u"}') <+ '{u"}') ) ) // Tests if there is more than one syllable // In Turkish each vowel indicates a distinct syllable define more_than_one_syllable_word as ( test (atleast 2 (gopast vowel)) ) define is_reserved_word as ( test(gopast 'ad' ($strlen = 2) ($strlen == limit)) or test(gopast 'soyad' ($strlen = 5) ($strlen == limit)) ) define postlude as ( not(is_reserved_word) backwards ( do append_U_to_stems_ending_with_d_or_g do post_process_last_consonants ) ) define stem as ( (more_than_one_syllable_word) ( backwards ( do stem_nominal_verb_suffixes continue_stemming_noun_suffixes do stem_noun_suffixes ) postlude ) ) snowball_code/algorithms/russian/0000755000175000017500000000000012707117052015650 5ustar domdomsnowball_code/algorithms/russian/stem_Unicode.sbl0000644000175000017500000001360112707117052020771 0ustar domdomstringescapes {} /* the 32 Cyrillic letters in Unicode */ stringdef a hex '430' stringdef b hex '431' stringdef v hex '432' stringdef g hex '433' stringdef d hex '434' stringdef e hex '435' stringdef zh hex '436' stringdef z hex '437' stringdef i hex '438' stringdef i` hex '439' stringdef k hex '43A' stringdef l hex '43B' stringdef m hex '43C' stringdef n hex '43D' stringdef o hex '43E' stringdef p hex '43F' stringdef r hex '440' stringdef s hex '441' stringdef t hex '442' stringdef u hex '443' stringdef f hex '444' stringdef kh hex '445' stringdef ts hex '446' stringdef ch hex '447' stringdef sh hex '448' stringdef shch hex '449' stringdef " hex '44A' stringdef y hex '44B' stringdef ' hex '44C' stringdef e` hex '44D' stringdef iu hex '44E' stringdef ia
hex '44F'
// (the value above completes the last stringdef, which starts on the previous line)

routines ( mark_regions R2
           perfective_gerund
           adjective
           adjectival
           reflexive
           verb
           noun
           derivational
           tidy_up
)

externals ( stem )

integers ( pV p2 )

groupings ( v )

// The vowel grouping; each {..} escape is one of the letters defined by the
// stringdefs above.
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'

// mark_regions: sets the two marks used by the suffix routines.
//   pV - position just after the first vowel of the word.
//   p2 - position reached after then skipping past a non-vowel, a vowel and
//        another non-vowel.
// Both marks default to `limit` (the end of the word); `do` leaves those
// defaults in place when the word runs out before a mark can be set.
define mark_regions as (

    $pV = limit
    $p2 = limit
    do (
        gopast v  setmark pV  gopast non-v
        gopast v  gopast non-v  setmark p2
       )
)

backwardmode (

    // R2 holds when the cursor is at or after the p2 mark.
    define R2 as $p2 <= cursor

    // perfective_gerund: the longest listed ending is located by `substring`.
    // Endings in the first group are deleted only when the letter before the
    // ending is {a} or {ia}; endings in the second group are always deleted.
    define perfective_gerund as (
        [substring] among (
            '{v}'
            '{v}{sh}{i}'
            '{v}{sh}{i}{s}{'}'
                ('{a}' or '{ia}' delete)
            '{i}{v}'
            '{i}{v}{sh}{i}'
            '{i}{v}{sh}{i}{s}{'}'
            '{y}{v}'
            '{y}{v}{sh}{i}'
            '{y}{v}{sh}{i}{s}{'}'
                (delete)
        )
    )

    // adjective: deletes the longest listed ending; fails (signals f) when
    // none of them matches.
    define adjective as (
        [substring] among (
            '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
            '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
            '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
            '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
            '{ia}{ia}'
                        // and -
            '{o}{iu}'   // - which is somewhat archaic
            '{e}{iu}'   // - soft form of {o}{iu}
                (delete)
        )
    )

    define adjectival as (
        adjective

        /* of the participle forms, em, vsh, ivsh, yvsh are readily removable.
           nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of
           errors. Removing im, uem, enn creates too many errors.
*/ try ( [substring] among ( '{e}{m}' // present passive participle '{n}{n}' // adjective from past passive participle '{v}{sh}' // past active participle '{iu}{shch}' '{shch}' // present active participle ('{a}' or '{ia}' delete) //but not '{i}{m}' '{u}{e}{m}' // present passive participle //or '{e}{n}{n}' // adjective from past passive participle '{i}{v}{sh}' '{y}{v}{sh}'// past active participle '{u}{iu}{shch}' // present active participle (delete) ) ) ) define reflexive as ( [substring] among ( '{s}{ia}' '{s}{'}' (delete) ) ) define verb as ( [substring] among ( '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' '{n}{y}' '{t}{'}' '{e}{sh}{'}' '{n}{n}{o}' ('{a}' or '{ia}' delete) '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' (delete) /* note the short passive participle tests: '{n}{a}' '{n}' '{n}{o}' '{n}{y}' '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' */ ) ) define noun as ( [substring] among ( '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' (delete) /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' omitted - they only occur on 12 words. 
*/ ) ) define derivational as ( [substring] R2 among ( '{o}{s}{t}' '{o}{s}{t}{'}' (delete) ) ) define tidy_up as ( [substring] among ( '{e}{i`}{sh}' '{e}{i`}{sh}{e}' // superlative forms (delete ['{n}'] '{n}' delete ) '{n}' ('{n}' delete) // e.g. -nno endings '{'}' (delete) // with some slight false conflations ) ) ) define stem as ( do mark_regions backwards setlimit tomark pV for ( do ( perfective_gerund or ( try reflexive adjectival or verb or noun ) ) try([ '{i}' ] delete) // because noun ending -i{iu} is being treated as verb ending -{iu} do derivational do tidy_up ) ) snowball_code/algorithms/russian/stem_KOI8_R.sbl0000644000175000017500000001375412707117052020407 0ustar domdomstringescapes {} /* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented in Latin characters following the conventions of the standard Library of Congress transliteration: */ stringdef a hex 'C1' stringdef b hex 'C2' stringdef v hex 'D7' stringdef g hex 'C7' stringdef d hex 'C4' stringdef e hex 'C5' stringdef zh hex 'D6' stringdef z hex 'DA' stringdef i hex 'C9' stringdef i` hex 'CA' stringdef k hex 'CB' stringdef l hex 'CC' stringdef m hex 'CD' stringdef n hex 'CE' stringdef o hex 'CF' stringdef p hex 'D0' stringdef r hex 'D2' stringdef s hex 'D3' stringdef t hex 'D4' stringdef u hex 'D5' stringdef f hex 'C6' stringdef kh hex 'C8' stringdef ts hex 'C3' stringdef ch hex 'DE' stringdef sh hex 'DB' stringdef shch hex 'DD' stringdef " hex 'DF' stringdef y hex 'D9' stringdef ' hex 'D8' stringdef e` hex 'DC' stringdef iu hex 'C0' stringdef ia hex 'D1' routines ( mark_regions R2 perfective_gerund adjective adjectival reflexive verb noun derivational tidy_up ) externals ( stem ) integers ( pV p2 ) groupings ( v ) define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}' define mark_regions as ( $pV = limit $p2 = limit do ( gopast v setmark pV gopast non-v gopast v gopast non-v setmark p2 ) ) backwardmode ( define R2 as $p2 <= cursor define perfective_gerund as ( [substring] among ( '{v}' 
'{v}{sh}{i}' '{v}{sh}{i}{s}{'}' ('{a}' or '{ia}' delete) '{i}{v}' '{i}{v}{sh}{i}' '{i}{v}{sh}{i}{s}{'}' '{y}{v}' '{y}{v}{sh}{i}' '{y}{v}{sh}{i}{s}{'}' (delete) ) ) define adjective as ( [substring] among ( '{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}' '{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}' '{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}' '{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}' '{ia}{ia}' // and - '{o}{iu}' // - which is somewhat archaic '{e}{iu}' // - soft form of {o}{iu} (delete) ) ) define adjectival as ( adjective /* of the participle forms, em, vsh, ivsh, yvsh are readily removable. nn, {iu}shch, shch, u{iu}shch can be removed, with a small proportion of errors. Removing im, uem, enn creates too many errors. */ try ( [substring] among ( '{e}{m}' // present passive participle '{n}{n}' // adjective from past passive participle '{v}{sh}' // past active participle '{iu}{shch}' '{shch}' // present active participle ('{a}' or '{ia}' delete) //but not '{i}{m}' '{u}{e}{m}' // present passive participle //or '{e}{n}{n}' // adjective from past passive participle '{i}{v}{sh}' '{y}{v}{sh}'// past active participle '{u}{iu}{shch}' // present active participle (delete) ) ) ) define reflexive as ( [substring] among ( '{s}{ia}' '{s}{'}' (delete) ) ) define verb as ( [substring] among ( '{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}' '{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}' '{n}{y}' '{t}{'}' '{e}{sh}{'}' '{n}{n}{o}' ('{a}' or '{ia}' delete) '{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}' '{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}' '{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}' '{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}' '{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}' '{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}' (delete) /* note the short passive participle tests: '{n}{a}' '{n}' '{n}{o}' '{n}{y}' '{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}' */ 
) ) define noun as ( [substring] among ( '{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}' '{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}' '{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}' '{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}' '{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}' '{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}' (delete) /* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}' '{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{x}' omitted - they only occur on 12 words. */ ) ) define derivational as ( [substring] R2 among ( '{o}{s}{t}' '{o}{s}{t}{'}' (delete) ) ) define tidy_up as ( [substring] among ( '{e}{i`}{sh}' '{e}{i`}{sh}{e}' // superlative forms (delete ['{n}'] '{n}' delete ) '{n}' ('{n}' delete) // e.g. -nno endings '{'}' (delete) // with some slight false conflations ) ) ) define stem as ( do mark_regions backwards setlimit tomark pV for ( do ( perfective_gerund or ( try reflexive adjectival or verb or noun ) ) try([ '{i}' ] delete) // because noun ending -i{iu} is being treated as verb ending -{iu} do derivational do tidy_up ) ) snowball_code/algorithms/french/0000755000175000017500000000000012707117052015431 5ustar domdomsnowball_code/algorithms/french/stem_MS_DOS_Latin_I.sbl0000644000175000017500000001363612707117052021617 0ustar domdomroutines ( prelude postlude mark_regions RV R1 R2 standard_suffix i_verb_suffix verb_suffix residual_suffix un_double un_accent ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v keep_with_s ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a^ hex '83' // a-circumflex stringdef a` hex '85' // a-grave stringdef c, hex '87' // c-cedilla stringdef e" hex '89' // e-diaeresis (rare) stringdef e' hex '82' // e-acute stringdef e^ hex '88' // e-circumflex stringdef e` hex '8A' // e-grave stringdef i" hex '8B' // i-diaeresis stringdef i^ hex '8C' // i-circumflex stringdef o^ hex '93' // o-circumflex stringdef u^ hex 
'96' // u-circumflex
stringdef u`   hex '97'  // u-grave

// Vowel grouping: the plain vowels plus the accented vowels defined by the
// stringdefs above.
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'

// prelude: rewrites certain u, i and y as upper case 'U', 'I', 'Y' (u or i
// between two vowels, y after or before a vowel, u after q). The upper case
// forms are not members of `v`; postlude turns them back to lower case.
define prelude as repeat goto (

    (  v [ ('u' ] v <- 'U') or
           ('i' ] v <- 'I') or
           ('y' ] <- 'Y')
       )
    or
    (  ['y'] v <- 'Y' )
    or
    (  'q' ['u'] <- 'U' )
)

// mark_regions: sets pV, p1 and p2, the marks tested by RV, R1 and R2.
// All three default to `limit` (the end of the word).
//   pV - after the third letter if the word starts with two vowels or with
//        one of the listed exception prefixes; otherwise just after the
//        first vowel that is not the first letter of the word.
//   p1 - after the first non-vowel that follows a vowel.
//   p2 - the same rule applied again, starting from p1.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit  // defaults

    do (
        ( v v next )
        or
        among ( // exception list, kept identical to the ISO Latin I variant
                // of this stemmer so both encodings give the same stems
            'par'  // paris, parie, pari
            'col'  // colis
            'tap'  // tapis
        )
        or
        ( next gopast v )

        setmark pV
    )
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// postlude: restores the lower case form of the letters marked by prelude.
define postlude as repeat (

    [substring] among(
        'I' (<- 'i')
        'U' (<- 'u')
        'Y' (<- 'y')
        ''  (next)
    )
)

backwardmode (

    // Region tests: each holds when the cursor is at or after its mark.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    define standard_suffix as (
        [substring] among(

            'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
            'ances' 'iqUes' 'ismes' 'ables' 'istes'
               ( R2 delete )
            'atrice' 'ateur' 'ation'
            'atrices' 'ateurs' 'ations'
               ( R2 delete
                 try ( ['ic'] (R2 delete) or <-'iqU' )
               )
            'logie'
            'logies'
               ( R2 <- 'log' )
            'usion' 'ution'
            'usions' 'utions'
               ( R2 <- 'u' )
            'ence'
            'ences'
               ( R2 <- 'ent' )
            'ement'
            'ements'
            (
                RV delete
                try (
                    [substring] among(
                        'iv'   (R2 delete ['at'] R2 delete)
                        'eus'  ((R2 delete) or (R1<-'eux'))
                        'abl' 'iqU'
                               (R2 delete)
                        'i{e`}r' 'I{e`}r'       //)
                               (RV <-'i')       //)--new 2 Sept 02
                    )
                )
            )
            'it{e'}'
            'it{e'}s'
            (
                R2 delete
                try (
                    [substring] among(
                        'abil' ((R2 delete) or <-'abl')
                        'ic'   ((R2 delete) or <-'iqU')
                        'iv'   (R2 delete)
                    )
                )
            )
            'if' 'ive'
            'ifs' 'ives'
            (
                R2 delete
                try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
            )
            'eaux' (<- 'eau')
            'aux'  (R1 <- 'al')
            'euse'
            'euses'((R2 delete) or (R1<-'eux'))

            'issement'
            'issements'(R1 non-v delete) // verbal

            // fail(...) below forces entry to verb_suffix. -ment typically
            // follows the p.p., e.g 'confus{e'}ment'.
'amment' (RV fail(<- 'ant')) 'emment' (RV fail(<- 'ent')) 'ment' 'ments' (test(v RV) fail(delete)) // v is e,i,u,{e'},I or U ) ) define i_verb_suffix as setlimit tomark pV for ( [substring] among ( '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' (non-v delete) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among ( 'ions' (R2 delete) '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' 'erons' 'eront' 'ez' 'iez' // 'ons' //-best omitted (delete) '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' 'assions' (delete try(['e'] delete) ) ) ) define keep_with_s 'aiou{e`}s' define residual_suffix as ( try(['s'] test non-keep_with_s delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) '{e"}' ('gu' delete) ) ) ) define un_double as ( test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete ) define un_accent as ( atleast 1 non-v [ '{e'}' or '{e`}' ] <-'e' ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or i_verb_suffix or verb_suffix ) and try( [ ('Y' ] <- 'i' ) or ('{c,}'] <- 'c' ) ) ) or residual_suffix ) // try(['ent'] RV delete) // is best omitted do un_double do un_accent ) do postlude ) snowball_code/algorithms/french/stem_ISO_8859_1.sbl0000644000175000017500000001417712707117052020544 0ustar domdomroutines ( prelude postlude mark_regions RV R1 R2 standard_suffix i_verb_suffix verb_suffix residual_suffix un_double un_accent ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v keep_with_s ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a^ hex 'E2' 
// a-circumflex stringdef a` hex 'E0' // a-grave stringdef c, hex 'E7' // c-cedilla stringdef e" hex 'EB' // e-diaeresis (rare) stringdef e' hex 'E9' // e-acute stringdef e^ hex 'EA' // e-circumflex stringdef e` hex 'E8' // e-grave stringdef i" hex 'EF' // i-diaeresis stringdef i^ hex 'EE' // i-circumflex stringdef o^ hex 'F4' // o-circumflex stringdef u^ hex 'FB' // u-circumflex stringdef u` hex 'F9' // u-grave define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}' define prelude as repeat goto ( ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') or ('y' ] <- 'Y') ) or ( ['y'] v <- 'Y' ) or ( 'q' ['u'] <- 'U' ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v v next ) or among ( // this exception list begun Nov 2006 'par' // paris, parie, pari 'col' // colis 'tap' // tapis // extensions possible here ) or ( next gopast v ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') 'Y' (<- 'y') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( [substring] among( 'ance' 'iqUe' 'isme' 'able' 'iste' 'eux' 'ances' 'iqUes' 'ismes' 'ables' 'istes' ( R2 delete ) 'atrice' 'ateur' 'ation' 'atrices' 'ateurs' 'ations' ( R2 delete try ( ['ic'] (R2 delete) or <-'iqU' ) ) 'logie' 'logies' ( R2 <- 'log' ) 'usion' 'ution' 'usions' 'utions' ( R2 <- 'u' ) 'ence' 'ences' ( R2 <- 'ent' ) 'ement' 'ements' ( RV delete try ( [substring] among( 'iv' (R2 delete ['at'] R2 delete) 'eus' ((R2 delete) or (R1<-'eux')) 'abl' 'iqU' (R2 delete) 'i{e`}r' 'I{e`}r' //) (RV <-'i') //)--new 2 Sept 02 ) ) ) 'it{e'}' 'it{e'}s' ( R2 delete try ( [substring] among( 'abil' ((R2 delete) or <-'abl') 'ic' ((R2 delete) or <-'iqU') 'iv' (R2 delete) ) ) ) 'if' 'ive' 'ifs' 'ives' ( R2 delete try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' ) ) 'eaux' (<- 'eau') 'aux' (R1 <- 'al') 'euse' 
'euses'((R2 delete) or (R1<-'eux')) 'issement' 'issements'(R1 non-v delete) // verbal // fail(...) below forces entry to verb_suffix. -ment typically // follows the p.p., e.g 'confus{e'}ment'. 'amment' (RV fail(<- 'ant')) 'emment' (RV fail(<- 'ent')) 'ment' 'ments' (test(v RV) fail(delete)) // v is e,i,u,{e'},I or U ) ) define i_verb_suffix as setlimit tomark pV for ( [substring] among ( '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai' 'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez' 'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait' 'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses' 'issez' 'issiez' 'issions' 'issons' 'it' (non-v delete) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among ( 'ions' (R2 delete) '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai' 'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions' 'erons' 'eront' 'ez' 'iez' // 'ons' //-best omitted (delete) '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant' 'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez' 'assions' (delete try(['e'] delete) ) ) ) define keep_with_s 'aiou{e`}s' define residual_suffix as ( try(['s'] test non-keep_with_s delete) setlimit tomark pV for ( [substring] among( 'ion' (R2 's' or 't' delete) 'ier' 'i{e`}re' 'Ier' 'I{e`}re' (<-'i') 'e' (delete) '{e"}' ('gu' delete) ) ) ) define un_double as ( test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete ) define un_accent as ( atleast 1 non-v [ '{e'}' or '{e`}' ] <-'e' ) ) define stem as ( do prelude do mark_regions backwards ( do ( ( ( standard_suffix or i_verb_suffix or verb_suffix ) and try( [ ('Y' ] <- 'i' ) or ('{c,}'] <- 'c' ) ) ) or residual_suffix ) // try(['ent'] RV delete) // is best omitted do un_double do un_accent ) do postlude ) snowball_code/algorithms/lovins/0000755000175000017500000000000012707117052015476 5ustar domdomsnowball_code/algorithms/lovins/stem_ISO_8859_1.sbl0000644000175000017500000002000312707117052020572 0ustar 
domdom stringescapes {}
// NOTE(review): the leading 'domdom' looks like archive-header residue rather
// than Snowball source - confirm before compiling this text.

routines (
   A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC

   endings

   undouble respell
)

externals ( stem )

backwardmode (

  /* Lovins' conditions A, B ... CC, as given in her Appendix B, where
     a test for a two letter prefix ('test hop 2') is implicitly
     assumed. Note that 'e' next 'u' corresponds to her u*e because
     Snowball is scanning backwards. */

  // In each condition `hop n` succeeds only if at least n characters lie
  // between the cursor and the start of the word. The letter tests that
  // follow look at the characters immediately before the cursor; `endings`
  // invokes a condition right after matching a suffix, so those are the
  // letters just before the suffix.

  define A as ( hop 2 )                             // at least 2 letters remain
  define B as ( hop 3 )                             // at least 3 letters remain
  define C as ( hop 4 )                             // at least 4 letters remain
  define D as ( hop 5 )                             // at least 5 letters remain
  define E as ( test hop 2 not 'e' )                // not after e
  define F as ( test hop 3 not 'e' )                // at least 3 letters, not after e
  define G as ( test hop 3 'f' )                    // at least 3 letters, only after f
  define H as ( test hop 2 't' or 'll' )            // only after t or ll
  define I as ( test hop 2 not 'o' not 'e' )        // not after o or e
  define J as ( test hop 2 not 'a' not 'e' )        // not after a or e
  define K as ( test hop 3 'l' or 'i' or ('e' next 'u') )  // at least 3 letters, only after l, i or u*e
  define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )  // not after u or x, nor after s unless that s follows o
  define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )    // not after a, c, e or m
  define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) )    // at least 3 letters; at least 4 when the third letter back is s
  define O as ( test hop 2 'l' or 'i' )             // only after l or i
  define P as ( test hop 2 not 'c' )                // not after c
  define Q as ( test hop 2 test hop 3 not 'l' not 'n' )    // at least 3 letters, not after l or n
  define R as ( test hop 2 'n' or 'r' )             // only after n or r
  define S as ( test hop 2 'dr' or ('t' not 't') )  // only after dr, or after t not itself preceded by t
  define T as ( test hop 2 's' or ('t' not 'o') )   // only after s, or after t not preceded by o
  define U as ( test hop 2 'l' or 'm' or 'n' or 'r' )      // only after l, m, n or r
  define V as ( test hop 2 'c' )                    // only after c
  define W as ( test hop 2 not 's' not 'u' )        // not after s or u
  define X as ( test hop 2 'l' or 'i' or ('e' next 'u') )  // only after l, i or u*e
  define Y as ( test hop 2 'in' )                   // only after in
  define Z as ( test hop 2 not 'f' )                // not after f
  define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
                                    'es' 't' ) )    // only after one of the listed letter groups
  define BB as ( test hop 3 not 'met' not 'ryst' )  // at least 3 letters, not after met or ryst
  define CC as ( test hop 2 'l' )                   // only after l


  /* The system of endings, as given in Appendix A.
*/ define endings as ( [substring] among( 'alistically' B 'arizability' A 'izationally' B 'antialness' A 'arisations' A 'arizations' A 'entialness' A 'allically' C 'antaneous' A 'antiality' A 'arisation' A 'arization' A 'ationally' B 'ativeness' A 'eableness' E 'entations' A 'entiality' A 'entialize' A 'entiation' A 'ionalness' A 'istically' A 'itousness' A 'izability' A 'izational' A 'ableness' A 'arizable' A 'entation' A 'entially' A 'eousness' A 'ibleness' A 'icalness' A 'ionalism' A 'ionality' A 'ionalize' A 'iousness' A 'izations' A 'lessness' A 'ability' A 'aically' A 'alistic' B 'alities' A 'ariness' E 'aristic' A 'arizing' A 'ateness' A 'atingly' A 'ational' B 'atively' A 'ativism' A 'elihood' E 'encible' A 'entally' A 'entials' A 'entiate' A 'entness' A 'fulness' A 'ibility' A 'icalism' A 'icalist' A 'icality' A 'icalize' A 'ication' G 'icianry' A 'ination' A 'ingness' A 'ionally' A 'isation' A 'ishness' A 'istical' A 'iteness' A 'iveness' A 'ivistic' A 'ivities' A 'ization' F 'izement' A 'oidally' A 'ousness' A 'aceous' A 'acious' B 'action' G 'alness' A 'ancial' A 'ancies' A 'ancing' B 'ariser' A 'arized' A 'arizer' A 'atable' A 'ations' B 'atives' A 'eature' Z 'efully' A 'encies' A 'encing' A 'ential' A 'enting' C 'entist' A 'eously' A 'ialist' A 'iality' A 'ialize' A 'ically' A 'icance' A 'icians' A 'icists' A 'ifully' A 'ionals' A 'ionate' D 'ioning' A 'ionist' A 'iously' A 'istics' A 'izable' E 'lessly' A 'nesses' A 'oidism' A 'acies' A 'acity' A 'aging' B 'aical' A 'alist' A 'alism' B 'ality' A 'alize' A 'allic'BB 'anced' B 'ances' B 'antic' C 'arial' A 'aries' A 'arily' A 'arity' B 'arize' A 'aroid' A 'ately' A 'ating' I 'ation' B 'ative' A 'ators' A 'atory' A 'ature' E 'early' Y 'ehood' A 'eless' A 'elity' A 'ement' A 'enced' A 'ences' A 'eness' E 'ening' E 'ental' A 'ented' C 'ently' A 'fully' A 'ially' A 'icant' A 'ician' A 'icide' A 'icism' A 'icist' A 'icity' A 'idine' I 'iedly' A 'ihood' A 'inate' A 'iness' A 'ingly' B 'inism' J 'inity'CC 
'ional' A 'ioned' A 'ished' A 'istic' A 'ities' A 'itous' A 'ively' A 'ivity' A 'izers' F 'izing' F 'oidal' A 'oides' A 'otide' A 'ously' A 'able' A 'ably' A 'ages' B 'ally' B 'ance' B 'ancy' B 'ants' B 'aric' A 'arly' K 'ated' I 'ates' A 'atic' B 'ator' A 'ealy' Y 'edly' E 'eful' A 'eity' A 'ence' A 'ency' A 'ened' E 'enly' E 'eous' A 'hood' A 'ials' A 'ians' A 'ible' A 'ibly' A 'ical' A 'ides' L 'iers' A 'iful' A 'ines' M 'ings' N 'ions' B 'ious' A 'isms' B 'ists' A 'itic' H 'ized' F 'izer' F 'less' A 'lily' A 'ness' A 'ogen' A 'ward' A 'wise' A 'ying' B 'yish' A 'acy' A 'age' B 'aic' A 'als'BB 'ant' B 'ars' O 'ary' F 'ata' A 'ate' A 'eal' Y 'ear' Y 'ely' E 'ene' E 'ent' C 'ery' E 'ese' A 'ful' A 'ial' A 'ian' A 'ics' A 'ide' L 'ied' A 'ier' A 'ies' P 'ily' A 'ine' M 'ing' N 'ion' Q 'ish' C 'ism' B 'ist' A 'ite'AA 'ity' A 'ium' A 'ive' A 'ize' F 'oid' A 'one' R 'ous' A 'ae' A 'al'BB 'ar' X 'as' B 'ed' E 'en' F 'es' E 'ia' A 'ic' A 'is' A 'ly' B 'on' S 'or' T 'um' U 'us' V 'yl' R '{'}s' A 's{'}' A 'a' A 'e' A 'i' A 'o' A 's' W 'y' B (delete) ) ) /* Undoubling is rule 1 of appendix C. */ define undouble as ( test substring among ('bb' 'dd' 'gg' 'll' 'mm' 'nn' 'pp' 'rr' 'ss' 'tt') [next] delete ) /* The other appendix C rules can be done together. */ define respell as ( [substring] among ( 'iev' (<-'ief') 'uct' (<-'uc') 'umpt' (<-'um') 'rpt' (<-'rb') 'urs' (<-'ur') 'istr' (<-'ister') 'metr' (<-'meter') 'olv' (<-'olut') 'ul' (not 'a' not 'i' not 'o' <-'l') 'bex' (<-'bic') 'dex' (<-'dic') 'pex' (<-'pic') 'tex' (<-'tic') 'ax' (<-'ac') 'ex' (<-'ec') 'ix' (<-'ic') 'lux' (<-'luc') 'uad' (<-'uas') 'vad' (<-'vas') 'cid' (<-'cis') 'lid' (<-'lis') 'erid' (<-'eris') 'pand' (<-'pans') 'end' (not 's' <-'ens') 'ond' (<-'ons') 'lud' (<-'lus') 'rud' (<-'rus') 'her' (not 'p' not 't' <-'hes') 'mit' (<-'mis') 'ent' (not 'm' <-'ens') /* 'ent' was 'end' in the 1968 paper - a typo. 
*/ 'ert' (<-'ers') 'et' (not 'n' <-'es') 'yt' (<-'ys') 'yz' (<-'ys') ) ) ) define stem as ( backwards ( do endings do undouble do respell ) ) snowball_code/algorithms/dutch/0000755000175000017500000000000012707117052015273 5ustar domdomsnowball_code/algorithms/dutch/stem_MS_DOS_Latin_I.sbl0000644000175000017500000000601512707117052021452 0ustar domdomroutines ( prelude postlude e_ending en_ending mark_regions R1 R2 undouble standard_suffix ) externals ( stem ) booleans ( e_found ) integers ( p1 p2 ) groupings ( v v_I v_j ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a" hex '84' stringdef e" hex '89' stringdef i" hex '8B' stringdef o" hex '94' stringdef u" hex '81' stringdef a' hex 'A0' stringdef e' hex '82' stringdef i' hex 'A1' stringdef o' hex 'A2' stringdef u' hex 'A3' stringdef e` hex '8A' define v 'aeiouy{e`}' define v_I v + 'I' define v_j v + 'j' define prelude as ( test repeat ( [substring] among( '{a"}' '{a'}' (<- 'a') '{e"}' '{e'}' (<- 'e') '{i"}' '{i'}' (<- 'i') '{o"}' '{o'}' (<- 'o') '{u"}' '{u'}' (<- 'u') '' (next) ) //or next ) try(['y'] <- 'Y') repeat goto ( v [('i'] v <- 'I') or ('y'] <- 'Y') ) ) define mark_regions as ( $p1 = limit $p2 = limit gopast v gopast non-v setmark p1 try($p1 < 3 $p1 = 3) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'I' (<- 'i') '' (next) ) //or next ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define undouble as ( test among('kk' 'dd' 'tt') [next] delete ) define e_ending as ( unset e_found ['e'] R1 test non-v delete set e_found undouble ) define en_ending as ( R1 non-v and not 'gem' delete undouble ) define standard_suffix as ( do ( [substring] among( 'heden' ( R1 <- 'heid' ) 'en' 'ene' ( en_ending ) 's' 'se' ( R1 non-v_j delete ) ) ) do e_ending do ( ['heid'] R2 not 'c' delete ['en'] en_ending ) do ( [substring] among( 'end' 'ing' ( R2 delete (['ig'] R2 not 'e' delete) or undouble ) 'ig' ( R2 not 'e' 
delete ) 'lijk' ( R2 delete e_ending ) 'baar' ( R2 delete ) 'bar' ( R2 e_found delete ) ) ) do ( non-v_I test ( among ('aa' 'ee' 'oo' 'uu') non-v ) [next] delete ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball_code/algorithms/dutch/stem_ISO_8859_1.sbl0000644000175000017500000000601212707117052020373 0ustar domdomroutines ( prelude postlude e_ending en_ending mark_regions R1 R2 undouble standard_suffix ) externals ( stem ) booleans ( e_found ) integers ( p1 p2 ) groupings ( v v_I v_j ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a" hex 'E4' stringdef e" hex 'EB' stringdef i" hex 'EF' stringdef o" hex 'F6' stringdef u" hex 'FC' stringdef a' hex 'E1' stringdef e' hex 'E9' stringdef i' hex 'ED' stringdef o' hex 'F3' stringdef u' hex 'FA' stringdef e` hex 'E8' define v 'aeiouy{e`}' define v_I v + 'I' define v_j v + 'j' define prelude as ( test repeat ( [substring] among( '{a"}' '{a'}' (<- 'a') '{e"}' '{e'}' (<- 'e') '{i"}' '{i'}' (<- 'i') '{o"}' '{o'}' (<- 'o') '{u"}' '{u'}' (<- 'u') '' (next) ) //or next ) try(['y'] <- 'Y') repeat goto ( v [('i'] v <- 'I') or ('y'] <- 'Y') ) ) define mark_regions as ( $p1 = limit $p2 = limit gopast v gopast non-v setmark p1 try($p1 < 3 $p1 = 3) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'I' (<- 'i') '' (next) ) //or next ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define undouble as ( test among('kk' 'dd' 'tt') [next] delete ) define e_ending as ( unset e_found ['e'] R1 test non-v delete set e_found undouble ) define en_ending as ( R1 non-v and not 'gem' delete undouble ) define standard_suffix as ( do ( [substring] among( 'heden' ( R1 <- 'heid' ) 'en' 'ene' ( en_ending ) 's' 'se' ( R1 non-v_j delete ) ) ) do e_ending do ( ['heid'] R2 not 'c' delete ['en'] en_ending ) do ( [substring] among( 'end' 'ing' ( R2 delete (['ig'] R2 not 'e' delete) or undouble ) 'ig' ( R2 
not 'e' delete ) 'lijk' ( R2 delete e_ending ) 'baar' ( R2 delete ) 'bar' ( R2 e_found delete ) ) ) do ( non-v_I test ( among ('aa' 'ee' 'oo' 'uu') non-v ) [next] delete ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball_code/algorithms/german2/0000755000175000017500000000000012707117052015517 5ustar domdomsnowball_code/algorithms/german2/stem_ISO_8859_1.sbl0000644000175000017500000000537112707117052020626 0ustar domdom /* Extra rule for -nisse ending added 11 Dec 2009 */ routines ( prelude postlude mark_regions R1 R2 standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v s_ending st_ending ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a" hex 'E4' stringdef o" hex 'F6' stringdef u" hex 'FC' stringdef ss hex 'DF' define v 'aeiouy{a"}{o"}{u"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) repeat ( [substring] among( '{ss}' (<- 'ss') 'ae' (<- '{a"}') 'oe' (<- '{o"}') 'ue' (<- '{u"}') 'qu' (hop 2) '' (next) ) ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' 'ern' 'er' ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) ) ) 
define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball_code/algorithms/english/0000755000175000017500000000000012707117052015615 5ustar domdomsnowball_code/algorithms/english/stem_ISO_8859_1.sbl0000644000175000017500000001165212707117052020723 0ustar domdomintegers ( p1 p2 ) booleans ( Y_found ) routines ( prelude postlude mark_regions shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5 exception1 exception2 ) externals ( stem ) groupings ( v v_WXY valid_LI ) stringescapes {} define v 'aeiouy' define v_WXY v + 'wxY' define valid_LI 'cdeghkmnrt' define prelude as ( unset Y_found do ( ['{'}'] delete) do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) ) define mark_regions as ( $p1 = limit $p2 = limit do( among ( 'gener' 'commun' // added May 2005 'arsen' // added Nov 2006 (arsenic/arsenal) // ... extensions possible here ... ) or (gopast v gopast non-v) setmark p1 gopast v gopast non-v setmark p2 ) ) backwardmode ( define shortv as ( ( non-v_WXY v non-v ) or ( non-v v atlimit ) ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( try ( [substring] among ( '{'}' '{'}s' '{'}s{'}' (delete) ) ) [substring] among ( 'sses' (<-'ss') 'ied' 'ies' ((hop 2 <-'i') or <-'ie') 's' (next gopast v delete) 'us' 'ss' ) ) define Step_1b as ( [substring] among ( 'eed' 'eedly' (R1 <-'ee') 'ed' 'edly' 'ing' 'ingly' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (<+ 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv <+ 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] non-v not atlimit <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alism' 'aliti' 'alli' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 
'biliti' 'bli' (<-'ble') 'ogi' ('l' <-'og') 'fulli' (<-'ful') 'lessli' (<-'less') 'li' (valid_LI delete) ) ) define Step_3 as ( [substring] R1 among ( 'tional' (<- 'tion') 'ational' (<- 'ate') 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ful' 'ness' (delete) 'ative' (R2 delete) // 'R2' added Dec 2001 ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5 as ( [substring] among ( 'e' (R2 or (R1 not shortv) delete) 'l' (R2 'l' delete) ) ) define exception2 as ( [substring] atlimit among( 'inning' 'outing' 'canning' 'herring' 'earring' 'proceed' 'exceed' 'succeed' // ... extensions possible here ... ) ) ) define exception1 as ( [substring] atlimit among( /* special changes: */ 'skis' (<-'ski') 'skies' (<-'sky') 'dying' (<-'die') 'lying' (<-'lie') 'tying' (<-'tie') /* special -LY cases */ 'idly' (<-'idl') 'gently' (<-'gentl') 'ugly' (<-'ugli') 'early' (<-'earli') 'only' (<-'onli') 'singly' (<-'singl') // ... extensions possible here ... /* invariant forms: */ 'sky' 'news' 'howe' 'atlas' 'cosmos' 'bias' 'andes' // not plural forms // ... extensions possible here ... 
) ) define postlude as (Y_found repeat(goto (['Y']) <-'y')) define stem as ( exception1 or not hop 3 or ( do prelude do mark_regions backwards ( do Step_1a exception2 or ( do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5 ) ) do postlude ) ) snowball_code/algorithms/kraaij_pohlmann/0000755000175000017500000000000012707117052017321 5ustar domdomsnowball_code/algorithms/kraaij_pohlmann/stem_ISO_8859_1.sbl0000644000175000017500000001305012707117052022421 0ustar domdomstrings ( ch ) integers ( x p1 p2 ) booleans ( Y_found stemmed GE_removed ) routines ( R1 R2 C V VX lengthen_V Step_1 Step_2 Step_3 Step_4 Step_7 Step_6 Step_1c Lose_prefix Lose_infix measure ) externals ( stem ) groupings ( v v_WX AOU AIOU ) stringescapes {} stringdef ' hex '27' // yuk define v 'aeiouy' define v_WX v + 'wx' define AOU 'aou' define AIOU 'aiou' backwardmode ( define R1 as (setmark x $x >= p1) define R2 as (setmark x $x >= p2) define V as test (v or 'ij') define VX as test (next v or 'ij') define C as test (not 'ij' non-v) define lengthen_V as do ( non-v_WX [ (AOU] test (non-v or atlimit)) or ('e'] test (non-v or atlimit not AIOU not (next AIOU non-v))) ->ch insert ch ) define Step_1 as ( [among ( (]) '{'}s' (delete) 's' (R1 not ('t' R1) C delete) 'ies' (R1 <-'ie') 'es' (('ar' R1 C ] delete lengthen_V) or ('er' R1 C ] delete) or (R1 C <-'e')) 'aus' (R1 V <-'au') 'en' (('hed' R1 ] <-'heid') or ('nd' delete) or ('d' R1 C ] delete) or ('i' or 'j' V delete) or (R1 C delete lengthen_V)) 'nde' (<-'nd') ) ) define Step_2 as ( [among ( (]) 'je' (('{'}t' ] delete) or ('et' ] R1 C delete) or ('rnt' ] <-'rn') or ('t' ] R1 VX delete) or ('ink' ] <-'ing') or ('mp' ] <-'m') or ('{'}' ] R1 delete) or (] R1 C delete)) 'ge' (R1 <-'g') 'lijke'(R1 <-'lijk') 'ische'(R1 <-'isch') 'de' (R1 C delete) 'te' (R1 <-'t') 'se' (R1 <-'s') 're' (R1 <-'r') 'le' (R1 delete attach 'l' lengthen_V) 'ene' (R1 C delete attach 'en' lengthen_V) 'ieve' (R1 C <-'ief') ) ) define Step_3 as ( [among ( (]) 'atie' (R1 
<-'eer') 'iteit' (R1 delete lengthen_V) 'heid' 'sel' 'ster' (R1 delete) 'rder' (<-'r') 'ing' 'isme' 'erij' (R1 delete lengthen_V) 'arij' (R1 C <-'aar') 'fie' (R2 delete attach 'f' lengthen_V) 'gie' (R2 delete attach 'g' lengthen_V) 'tst' (R1 C <-'t') 'dst' (R1 C <-'d') ) ) define Step_4 as ( ( [among ( (]) 'ioneel' (R1 <-'ie') 'atief' (R1 <-'eer') 'baar' (R1 delete) 'naar' (R1 V <-'n') 'laar' (R1 V <-'l') 'raar' (R1 V <-'r') 'tant' (R1 <-'teer') 'lijker' 'lijkst' (R1 <-'lijk') 'achtig' 'achtiger' 'achtigst'(R1 delete) 'eriger' 'erigst' 'erig' 'end' (R1 C delete lengthen_V) ) ) or ( [among ( (]) 'iger' 'igst' 'ig' (R1 C delete lengthen_V) ) ) ) define Step_7 as ( [among ( (]) 'kt' (<-'k') 'ft' (<-'f') 'pt' (<-'p') ) ) define Step_6 as ( [among ( (]) 'bb' (<-'b') 'cc' (<-'c') 'dd' (<-'d') 'ff' (<-'f') 'gg' (<-'g') 'hh' (<-'h') 'jj' (<-'j') 'kk' (<-'k') 'll' (<-'l') 'mm' (<-'m') 'nn' (<-'n') 'pp' (<-'p') 'qq' (<-'q') 'rr' (<-'r') 'ss' (<-'s') 'tt' (<-'t') 'vv' (<-'v') 'ww' (<-'w') 'xx' (<-'x') 'zz' (<-'z') 'v' (<-'f') 'z' (<-'s') ) ) define Step_1c as ( [among ( (] R1 C) 'd' (not ('n' R1) delete) 't' (not ('h' R1) delete) ) ) ) define Lose_prefix as ( ['ge'] test hop 3 (goto v goto non-v) set GE_removed delete ) define Lose_infix as ( next gopast (['ge']) test hop 3 (goto v goto non-v) set GE_removed delete ) define measure as ( do ( tolimit setmark p1 setmark p2 ) do( repeat non-v atleast 1 ('ij' or v) non-v setmark p1 repeat non-v atleast 1 ('ij' or v) non-v setmark p2 ) ) define stem as ( unset Y_found unset stemmed do ( ['y'] <-'Y' set Y_found ) do repeat(goto (v ['y'])<-'Y' set Y_found ) measure backwards ( do (Step_1 set stemmed ) do (Step_2 set stemmed ) do (Step_3 set stemmed ) do (Step_4 set stemmed ) ) unset GE_removed do (Lose_prefix and measure) backwards ( do (GE_removed Step_1c) ) unset GE_removed do (Lose_infix and measure) backwards ( do (GE_removed Step_1c) ) backwards ( do (Step_7 set stemmed ) do (stemmed or GE_removed Step_6) ) do(Y_found 
repeat(goto (['Y']) <-'y')) ) snowball_code/algorithms/german/0000755000175000017500000000000012707117052015435 5ustar domdomsnowball_code/algorithms/german/stem_MS_DOS_Latin_I.sbl0000644000175000017500000000514012707117052021612 0ustar domdom /* Extra rule for -nisse ending added 11 Dec 2009 */ routines ( prelude postlude mark_regions R1 R2 standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v s_ending st_ending ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a" hex '84' stringdef o" hex '94' stringdef u" hex '81' stringdef ss hex 'E1' define v 'aeiouy{a"}{o"}{u"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat ( ( ['{ss}'] <- 'ss' ) or next ) repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' 'ern' 'er' ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball_code/algorithms/german/stem_ISO_8859_1.sbl0000644000175000017500000000513512707117052020542 0ustar domdom /* Extra rule for -nisse ending added 11 Dec 2009 */ routines ( prelude postlude mark_regions R1 R2 
standard_suffix ) externals ( stem ) integers ( p1 p2 x ) groupings ( v s_ending st_ending ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a" hex 'E4' stringdef o" hex 'F6' stringdef u" hex 'FC' stringdef ss hex 'DF' define v 'aeiouy{a"}{o"}{u"}' define s_ending 'bdfghklmnrt' define st_ending s_ending - 'r' define prelude as ( test repeat ( ( ['{ss}'] <- 'ss' ) or next ) repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) ) define mark_regions as ( $p1 = limit $p2 = limit test(hop 3 setmark x) gopast v gopast non-v setmark p1 try($p1 < x $p1 = x) // at least 3 gopast v gopast non-v setmark p2 ) define postlude as repeat ( [substring] among( 'Y' (<- 'y') 'U' (<- 'u') '{a"}' (<- 'a') '{o"}' (<- 'o') '{u"}' (<- 'u') '' (next) ) ) backwardmode ( define R1 as $p1 <= cursor define R2 as $p2 <= cursor define standard_suffix as ( do ( [substring] R1 among( 'em' 'ern' 'er' ( delete ) 'e' 'en' 'es' ( delete try (['s'] 'nis' delete) ) 's' ( s_ending delete ) ) ) do ( [substring] R1 among( 'en' 'er' 'est' ( delete ) 'st' ( st_ending hop 3 delete ) ) ) do ( [substring] R2 among( 'end' 'ung' ( delete try (['ig'] not 'e' R2 delete) ) 'ig' 'ik' 'isch' ( not 'e' delete ) 'lich' 'heit' ( delete try ( ['er' or 'en'] R1 delete ) ) 'keit' ( delete try ( [substring] R2 among( 'lich' 'ig' ( delete ) ) ) ) ) ) ) ) define stem as ( do prelude do mark_regions backwards do standard_suffix do postlude ) snowball_code/algorithms/norwegian/0000755000175000017500000000000012707117052016155 5ustar domdomsnowball_code/algorithms/norwegian/stem_MS_DOS_Latin_I.sbl0000644000175000017500000000301012707117052022324 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef ae hex '91' stringdef ao hex '86' stringdef o/ hex '9B' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'bcdfghjlmnoprtvyz' define mark_regions as ( $p1 = 
limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' 'hetens' 'ers' 'ets' 'et' 'het' 'ast' (delete) 's' (s_ending or ('k' non-v) delete) 'erte' 'ert' (<-'er') ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'dt' 'vt' ) ) next] delete ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' 'hetslov' (delete) ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix ) ) snowball_code/algorithms/norwegian/stem_ISO_8859_1.sbl0000644000175000017500000000300512707117052021254 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix ) externals ( stem ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef ae hex 'E6' stringdef ao hex 'E5' stringdef o/ hex 'F8' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'bcdfghjlmnoprtvyz' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'a' 'e' 'ede' 'ande' 'ende' 'ane' 'ene' 'hetene' 'en' 'heten' 'ar' 'er' 'heter' 'as' 'es' 'edes' 'endes' 'enes' 'hetenes' 'ens' 'hetens' 'ers' 'ets' 'et' 'het' 'ast' (delete) 's' (s_ending or ('k' non-v) delete) 'erte' 'ert' (<-'er') ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'dt' 'vt' ) ) next] delete ) define other_suffix as ( setlimit tomark p1 for ([substring]) among( 'leg' 'eleg' 'ig' 'eig' 'lig' 'elig' 'els' 'lov' 'elov' 'slov' 'hetslov' (delete) ) ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do 
other_suffix ) ) snowball_code/algorithms/italian/0000755000175000017500000000000012707117052015605 5ustar domdomsnowball_code/algorithms/italian/stem_MS_DOS_Latin_I.sbl0000644000175000017500000001144312707117052021765 0ustar domdom routines ( prelude postlude mark_regions RV R1 R2 attached_pronoun standard_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v AEIO CG ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef a' hex 'A0' stringdef a` hex '85' stringdef e' hex '82' stringdef e` hex '8A' stringdef i' hex 'A1' stringdef i` hex '8D' stringdef o' hex 'A2' stringdef o` hex '95' stringdef u' hex 'A3' stringdef u` hex '97' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' define prelude as ( test repeat ( [substring] among( '{a'}' (<- '{a`}') '{e'}' (<- '{e`}') '{i'}' (<- '{i`}') '{o'}' (<- '{o`}') '{u'}' (<- '{u`}') 'qu' (<- 'qU') '' (next) ) ) repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi' // the compound forms are: 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' 'mela' 'mele' 'meli' 'melo' 'mene' 'tela' 'tele' 'teli' 'telo' 'tene' 'cela' 'cele' 'celi' 'celo' 'cene' 'vela' 'vele' 'veli' 'velo' 'vene' ) among( (RV) 'ando' 'endo' (delete) 'ar' 'er' 'ir' (<- 'e') ) ) define standard_suffix as ( [substring] among( 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 
'ose' 'mente' 'atrice' 'atrici' 'ante' 'anti' // Note 1 ( R2 delete ) 'azione' 'azioni' 'atore' 'atori' ( R2 delete try ( ['ic'] R2 delete ) ) 'logia' 'logie' ( R2 <- 'log' ) 'uzione' 'uzioni' 'usione' 'usioni' ( R2 <- 'u' ) 'enza' 'enze' ( R2 <- 'ente' ) 'amento' 'amenti' 'imento' 'imenti' ( RV delete ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' ( ['at'] R2 delete ) 'os' 'ic' 'abil' ) ) ) 'it{a`}' ( R2 delete try ( [substring] among( 'abil' 'ic' 'iv' (R2 delete) ) ) ) 'ivo' 'ivi' 'iva' 'ive' ( R2 delete try ( ['at'] R2 delete ['ic'] R2 delete ) ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' 'ono' 'uta' 'ute' 'uti' 'uto' 'ar' 'ir' // but 'er' is problematical (delete) ) ) define AEIO 'aeio{a`}{e`}{i`}{o`}' define CG 'cg' define vowel_suffix as ( try ( [AEIO] RV delete ['i'] RV delete ) try ( ['h'] CG RV delete ) ) ) define stem as ( do prelude do mark_regions backwards ( do attached_pronoun do (standard_suffix or verb_suffix) do vowel_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball_code/algorithms/italian/stem_ISO_8859_1.sbl0000644000175000017500000001144012707117052020706 0ustar domdom routines ( prelude postlude mark_regions RV R1 R2 attached_pronoun standard_suffix verb_suffix vowel_suffix ) externals ( stem ) integers ( pV p1 p2 ) groupings ( v AEIO CG ) stringescapes {} /* special characters (in ISO Latin 
I) */ stringdef a' hex 'E1' stringdef a` hex 'E0' stringdef e' hex 'E9' stringdef e` hex 'E8' stringdef i' hex 'ED' stringdef i` hex 'EC' stringdef o' hex 'F3' stringdef o` hex 'F2' stringdef u' hex 'FA' stringdef u` hex 'F9' define v 'aeiou{a`}{e`}{i`}{o`}{u`}' define prelude as ( test repeat ( [substring] among( '{a'}' (<- '{a`}') '{e'}' (<- '{e`}') '{i'}' (<- '{i`}') '{o'}' (<- '{o`}') '{u'}' (<- '{u`}') 'qu' (<- 'qU') '' (next) ) ) repeat goto ( v [ ('u' ] v <- 'U') or ('i' ] v <- 'I') ) ) define mark_regions as ( $pV = limit $p1 = limit $p2 = limit // defaults do ( ( v (non-v gopast v) or (v gopast non-v) ) or ( non-v (non-v gopast v) or (v next) ) setmark pV ) do ( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) ) define postlude as repeat ( [substring] among( 'I' (<- 'i') 'U' (<- 'u') '' (next) ) ) backwardmode ( define RV as $pV <= cursor define R1 as $p1 <= cursor define R2 as $p2 <= cursor define attached_pronoun as ( [substring] among( 'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi' // the compound forms are: 'sene' 'gliela' 'gliele' 'glieli' 'glielo' 'gliene' 'mela' 'mele' 'meli' 'melo' 'mene' 'tela' 'tele' 'teli' 'telo' 'tene' 'cela' 'cele' 'celi' 'celo' 'cene' 'vela' 'vele' 'veli' 'velo' 'vene' ) among( (RV) 'ando' 'endo' (delete) 'ar' 'er' 'ir' (<- 'e') ) ) define standard_suffix as ( [substring] among( 'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo' 'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti' 'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente' 'atrice' 'atrici' 'ante' 'anti' // Note 1 ( R2 delete ) 'azione' 'azioni' 'atore' 'atori' ( R2 delete try ( ['ic'] R2 delete ) ) 'logia' 'logie' ( R2 <- 'log' ) 'uzione' 'uzioni' 'usione' 'usioni' ( R2 <- 'u' ) 'enza' 'enze' ( R2 <- 'ente' ) 'amento' 'amenti' 'imento' 'imenti' ( RV delete ) 'amente' ( R1 delete try ( [substring] R2 delete among( 'iv' ( ['at'] R2 delete ) 'os' 'ic' 'abil' ) ) ) 'it{a`}' ( R2 delete try ( [substring] among( 'abil' 
'ic' 'iv' (R2 delete) ) ) ) 'ivo' 'ivi' 'iva' 'ive' ( R2 delete try ( ['at'] R2 delete ['ic'] R2 delete ) ) ) ) define verb_suffix as setlimit tomark pV for ( [substring] among( 'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi' 'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate' 'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai' 'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo' 'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete' 'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo' 'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei' 'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono' 'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita' 'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo' 'ono' 'uta' 'ute' 'uti' 'uto' 'ar' 'ir' // but 'er' is problematical (delete) ) ) define AEIO 'aeio{a`}{e`}{i`}{o`}' define CG 'cg' define vowel_suffix as ( try ( [AEIO] RV delete ['i'] RV delete ) try ( ['h'] CG RV delete ) ) ) define stem as ( do prelude do mark_regions backwards ( do attached_pronoun do (standard_suffix or verb_suffix) do vowel_suffix ) do postlude ) /* Note 1: additions of 15 Jun 2005 */ snowball_code/algorithms/porter/0000755000175000017500000000000012707117052015477 5ustar domdomsnowball_code/algorithms/porter/stem_ISO_8859_1.sbl0000644000175000017500000000564412707117052020611 0ustar domdomintegers ( p1 p2 ) booleans ( Y_found ) routines ( shortv R1 R2 Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b ) externals ( stem ) groupings ( v v_WXY ) define v 'aeiouy' define v_WXY v + 'wxY' backwardmode ( define shortv as ( non-v_WXY v non-v ) define R1 as $p1 <= cursor define R2 as $p2 <= cursor define Step_1a as ( [substring] among ( 'sses' (<-'ss') 'ies' (<-'i') 'ss' () 's' (delete) ) ) define Step_1b as ( [substring] among ( 'eed' (R1 <-'ee') 'ed' 'ing' ( test gopast v delete test substring among( 'at' 'bl' 'iz' (<+ 'e') 'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt' // 
ignoring double c, h, j, k, q, v, w, and x ([next] delete) '' (atmark p1 test shortv <+ 'e') ) ) ) ) define Step_1c as ( ['y' or 'Y'] gopast v <-'i' ) define Step_2 as ( [substring] R1 among ( 'tional' (<-'tion') 'enci' (<-'ence') 'anci' (<-'ance') 'abli' (<-'able') 'entli' (<-'ent') 'eli' (<-'e') 'izer' 'ization' (<-'ize') 'ational' 'ation' 'ator' (<-'ate') 'alli' (<-'al') 'alism' 'aliti' (<-'al') 'fulness' (<-'ful') 'ousli' 'ousness' (<-'ous') 'iveness' 'iviti' (<-'ive') 'biliti' (<-'ble') ) ) define Step_3 as ( [substring] R1 among ( 'alize' (<-'al') 'icate' 'iciti' 'ical' (<-'ic') 'ative' 'ful' 'ness' (delete) ) ) define Step_4 as ( [substring] R2 among ( 'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement' 'ment' 'ent' 'ou' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize' (delete) 'ion' ('s' or 't' delete) ) ) define Step_5a as ( ['e'] R2 or (R1 not shortv) delete ) define Step_5b as ( ['l'] R2 'l' delete ) ) define stem as ( unset Y_found do ( ['y'] <-'Y' set Y_found) do repeat(goto (v ['y']) <-'Y' set Y_found) $p1 = limit $p2 = limit do( gopast v gopast non-v setmark p1 gopast v gopast non-v setmark p2 ) backwards ( do Step_1a do Step_1b do Step_1c do Step_2 do Step_3 do Step_4 do Step_5a do Step_5b ) do(Y_found repeat(goto (['Y']) <-'y')) ) snowball_code/algorithms/finnish/0000755000175000017500000000000012707117052015622 5ustar domdomsnowball_code/algorithms/finnish/stem_ISO_8859_1.sbl0000644000175000017500000001212712707117052020726 0ustar domdom /* Finnish stemmer. Numbers in square brackets refer to the sections in Fred Karlsson, Finnish: An Essential Grammar. 
Routledge, 1999 ISBN 0-415-20705-3 */ routines ( mark_regions R2 particle_etc possessive LONG VI case_ending i_plural t_plural other_endings tidy ) externals ( stem ) integers ( p1 p2 ) strings ( x ) booleans ( ending_removed ) groupings ( AEI V1 V2 particle_end ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef a" hex 'E4' stringdef o" hex 'F6' define AEI 'a{a"}ei' define V1 'aeiouy{a"}{o"}' define V2 'aeiou{a"}{o"}' define particle_end V1 + 'nt' define mark_regions as ( $p1 = limit $p2 = limit goto V1 gopast non-V1 setmark p1 goto V1 gopast non-V1 setmark p2 ) backwardmode ( define R2 as $p2 <= cursor define particle_etc as ( setlimit tomark p1 for ([substring]) among( 'kin' 'kaan' 'k{a"}{a"}n' 'ko' 'k{o"}' 'han' 'h{a"}n' 'pa' 'p{a"}' // Particles [91] (particle_end) 'sti' // Adverb [87] (R2) ) delete ) define possessive as ( // [36] setlimit tomark p1 for ([substring]) among( 'si' (not 'k' delete) // take 'ksi' as the Comitative case 'ni' (delete ['kse'] <- 'ksi') // kseni = ksi + ni 'nsa' 'ns{a"}' 'mme' 'nne' (delete) /* Now for Vn possessives after case endings: [36] */ 'an' (among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete) '{a"}n' (among('t{a"}' 'ss{a"}' 'st{a"}' 'll{a"}' 'lt{a"}' 'n{a"}') delete) 'en' (among('lle' 'ine') delete) ) ) define LONG as among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}') define VI as ('i' V2) define case_ending as ( setlimit tomark p1 for ([substring]) among( 'han' ('a') //-. 'hen' ('e') // | 'hin' ('i') // | 'hon' ('o') // | 'h{a"}n' ('{a"}') // Illative [43] 'h{o"}n' ('{o"}') // | 'siin' VI // | 'seen' LONG //-' 'den' VI 'tten' VI // Genitive plurals [34] () 'n' // Genitive or Illative ( try ( LONG // Illative or 'ie' // Genitive and next ] ) /* otherwise Genitive */ ) 'a' '{a"}' //-. 
(V1 non-V1) // | 'tta' 'tt{a"}' // Partitive [32] ('e') // | 'ta' 't{a"}' //-' 'ssa' 'ss{a"}' // Inessive [41] 'sta' 'st{a"}' // Elative [42] 'lla' 'll{a"}' // Adessive [44] 'lta' 'lt{a"}' // Ablative [51] 'lle' // Allative [46] 'na' 'n{a"}' // Essive [49] 'ksi' // Translative[50] 'ine' // Comitative [51] /* Abessive and Instructive are too rare for inclusion [51] */ ) delete set ending_removed ) define other_endings as ( setlimit tomark p2 for ([substring]) among( 'mpi' 'mpa' 'mp{a"}' 'mmi' 'mma' 'mm{a"}' // Comparative forms [85] (not 'po') //-improves things 'impi' 'impa' 'imp{a"}' 'immi' 'imma' 'imm{a"}' // Superlative forms [86] 'eja' 'ej{a"}' // indicates agent [93.1B] ) delete ) define i_plural as ( // [26] setlimit tomark p1 for ([substring]) among( 'i' 'j' ) delete ) define t_plural as ( // [26] setlimit tomark p1 for ( ['t'] test V1 delete ) setlimit tomark p2 for ([substring]) among( 'mma' (not 'po') //-mmat endings 'imma' //-immat endings ) delete ) define tidy as ( setlimit tomark p1 for ( do ( LONG and ([next] delete ) ) // undouble vowel do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i do ( ['j'] 'o' or 'u' delete ) do ( ['o'] 'j' delete ) ) goto non-V1 [next] -> x x delete // undouble consonant ) ) define stem as ( do mark_regions unset ending_removed backwards ( do particle_etc do possessive do case_ending do other_endings (ending_removed do i_plural) or do t_plural do tidy ) ) snowball_code/algorithms/danish/0000755000175000017500000000000012707117052015432 5ustar domdomsnowball_code/algorithms/danish/stem_MS_DOS_Latin_I.sbl0000644000175000017500000000343112707117052021610 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix undouble ) externals ( stem ) strings ( ch ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in MS-DOS Latin I) */ stringdef ae hex '91' stringdef ao hex '86' stringdef o/ hex '9B' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'abcdfghjklmnoprtvyz{ao}' define 
mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' 'erets' 'et' 'eret' (delete) 's' (s_ending delete) ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'gd' // significant in the call from other_suffix 'dt' 'gt' 'kt' ) ) next] delete ) define other_suffix as ( do ( ['st'] 'ig' delete ) setlimit tomark p1 for ([substring]) among( 'ig' 'lig' 'elig' 'els' (delete do consonant_pair) 'l{o/}st' (<-'l{o/}s') ) ) define undouble as ( setlimit tomark p1 for ([non-v] ->ch) ch delete ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix do undouble ) ) snowball_code/algorithms/danish/stem_ISO_8859_1.sbl0000644000175000017500000000342612707117052020540 0ustar domdomroutines ( mark_regions main_suffix consonant_pair other_suffix undouble ) externals ( stem ) strings ( ch ) integers ( p1 x ) groupings ( v s_ending ) stringescapes {} /* special characters (in ISO Latin I) */ stringdef ae hex 'E6' stringdef ao hex 'E5' stringdef o/ hex 'F8' define v 'aeiouy{ae}{ao}{o/}' define s_ending 'abcdfghjklmnoprtvyz{ao}' define mark_regions as ( $p1 = limit test ( hop 3 setmark x ) goto v gopast non-v setmark p1 try ( $p1 < x $p1 = x ) ) backwardmode ( define main_suffix as ( setlimit tomark p1 for ([substring]) among( 'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere' 'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes' 'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets' 'erets' 'et' 'eret' (delete) 's' (s_ending delete) ) ) define consonant_pair as ( test ( setlimit tomark p1 for ([substring]) among( 'gd' // significant in the call from 
other_suffix 'dt' 'gt' 'kt' ) ) next] delete ) define other_suffix as ( do ( ['st'] 'ig' delete ) setlimit tomark p1 for ([substring]) among( 'ig' 'lig' 'elig' 'els' (delete do consonant_pair) 'l{o/}st' (<-'l{o/}s') ) ) define undouble as ( setlimit tomark p1 for ([non-v] ->ch) ch delete ) ) define stem as ( do mark_regions backwards ( do main_suffix do consonant_pair do other_suffix do undouble ) ) snowball_code/examples/0000755000175000017500000000000012707117052013631 5ustar domdomsnowball_code/examples/stemwords.c0000644000175000017500000001234212707117052016026 0ustar domdom
/* This is a simple program which uses libstemmer to provide a command
 * line interface for stemming using any of the algorithms provided.
 */

#include <stdio.h>
#include <stdlib.h> /* for malloc, realloc, free, exit */
#include <string.h> /* for strcmp */
#include <ctype.h>  /* for isupper, tolower */

#include "libstemmer.h"

/* Name the program was invoked as (argv[0]); used in the usage message. */
const char * progname;

/* Output layout: 0 = stem only, 1 = "word -> stem", 2 = two columns. */
static int pretty = 1;

/** Stem every line of f_in and write the result to f_out.
 *
 * Each input line (terminated by '\n' or end of file) is treated as one
 * word.  Bytes for which isupper() is true are lower-cased before the word
 * is passed to the stemmer.  What is written depends on `pretty`:
 *   0 - the stemmed word only;
 *   1 - the (lower-cased) input word, " -> ", then the stemmed word;
 *   2 - the input word, padded with spaces to column 30 (or followed by a
 *       newline and 30 spaces when it is 30 or more characters long), then
 *       the stemmed word.  No padding is written for an empty stem.
 *
 * @param stemmer The stemmer to apply to each word.
 * @param f_in    Stream to read words from.
 * @param f_out   Stream to write results to.
 *
 * Does not return if memory runs out: prints a message to stderr and
 * exits with status 1.
 */
static void
stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
{
#define INC 10
    int lim = INC;
    sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));

    /* Without this check the first b[i] store would dereference NULL. */
    if (b == NULL) goto error;

    while (1) {
        int ch = getc(f_in);
        if (ch == EOF) {
            free(b);
            return;
        }
        {
            int i = 0;      /* number of bytes stored in b */
            int inlen = 0;  /* number of characters, for column padding */

            while (1) {
                if (ch == '\n' || ch == EOF) break;
                if (i == lim) {
                    /* Grow the buffer by a fixed increment. */
                    sb_symbol * newb;
                    newb = (sb_symbol *)
                            realloc(b, (lim + INC) * sizeof(sb_symbol));
                    if (newb == 0) goto error;
                    b = newb;
                    lim = lim + INC;
                }
                /* Update count of utf-8 characters: every byte except a
                 * continuation byte (0x80..0xBF) starts a new character. */
                if (ch < 0x80 || ch > 0xBF) inlen += 1;
                /* force lower case: */
                if (isupper(ch)) ch = tolower(ch);

                b[i] = ch;
                i++;
                ch = getc(f_in);
            }

            {
                const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
                if (stemmed == NULL) goto error;

                if (pretty == 1) {
                    fwrite(b, i, 1, f_out);
                    fputs(" -> ", f_out);
                } else if (pretty == 2) {
                    fwrite(b, i, 1, f_out);
                    if (sb_stemmer_length(stemmer) > 0) {
                        int j;
                        if (inlen < 30) {
                            for (j = 30 - inlen; j > 0; j--)
                                fputs(" ", f_out);
                        } else {
                            /* Word too wide for the first column: put the
                             * stem on the next line, still at column 30. */
                            fputs("\n", f_out);
                            for (j = 30; j > 0; j--)
                                fputs(" ", f_out);
                        }
                    }
                }

                fputs((char *)stemmed, f_out);
                putc('\n', f_out);
            }
        }
    }

error:
    /* Every path here is an allocation failure.  Report it and fail rather
     * than returning quietly, which would truncate the output while the
     * program still exits with status 0. */
    free(b);
    fprintf(stderr, "Out of memory\n");
    exit(1);
#undef INC
}

/** Display the command line syntax, and then exit.
 *  @param n The value to exit with.
 */
static void
usage(int n)
{
    printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
          "\n"
          "The input file consists of a list of words to be stemmed, one per\n"
          "line. Words should be in lower case, but (for English) A-Z letters\n"
          "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
          "used.\n"
          "\n"
          "If -c is given, the argument is the character encoding of the input\n"
          "and output files.  If it is omitted, the UTF-8 encoding is used.\n"
          "\n"
          "If -p is given the output file consists of each word of the input\n"
          "file followed by \"->\" followed by its stemmed equivalent.\n"
          "If -p2 is given the output file is a two column layout containing\n"
          "the input words in the first column and the stemmed equivalents in\n"
          "the second column.\n"
          "Otherwise, the output file consists of the stemmed words, one per\n"
          "line.\n"
          "\n"
          "-h displays this help\n",
          progname);
    exit(n);
}

/** Parse the command line, open the files and run the stemmer over them.
 *
 * Options: -l language (default "english"), -i input file (default stdin),
 * -o output file (default stdout), -c character encoding (default: passed
 * to sb_stemmer_new as NULL), -p / -p2 output layout, -h help.
 *
 * @return 0 on success, 1 if writing the output failed.  Exits with
 *         status 1 on bad arguments, unopenable files or an unavailable
 *         language/encoding.
 */
int
main(int argc, char * argv[])
{
    char * in = 0;
    char * out = 0;
    FILE * f_in;
    FILE * f_out;
    struct sb_stemmer * stemmer;

    const char * language = "english";
    const char * charenc = NULL;

    char * s;
    int i = 1;
    int status = 0;
    pretty = 0;

    progname = argv[0];

    while (i < argc) {
        s = argv[i++];
        if (s[0] == '-') {
            if (strcmp(s, "-o") == 0) {
                if (i >= argc) {
                    fprintf(stderr, "%s requires an argument\n", s);
                    exit(1);
                }
                out = argv[i++];
            } else if (strcmp(s, "-i") == 0) {
                if (i >= argc) {
                    fprintf(stderr, "%s requires an argument\n", s);
                    exit(1);
                }
                in = argv[i++];
            } else if (strcmp(s, "-l") == 0) {
                if (i >= argc) {
                    fprintf(stderr, "%s requires an argument\n", s);
                    exit(1);
                }
                language = argv[i++];
            } else if (strcmp(s, "-c") == 0) {
                if (i >= argc) {
                    fprintf(stderr, "%s requires an argument\n", s);
                    exit(1);
                }
                charenc = argv[i++];
            } else if (strcmp(s, "-p2") == 0) {
                pretty = 2;
            } else if (strcmp(s, "-p") == 0) {
                pretty = 1;
            } else if (strcmp(s, "-h") == 0) {
                usage(0);
            } else {
                fprintf(stderr, "option %s unknown\n", s);
                usage(1);
            }
        } else {
            fprintf(stderr, "unexpected parameter %s\n", s);
            usage(1);
        }
    }

    /* prepare the files */
    f_in = (in == 0) ? stdin : fopen(in, "r");
    if (f_in == 0) {
        fprintf(stderr, "file %s not found\n", in);
        exit(1);
    }
    f_out = (out == 0) ? stdout : fopen(out, "w");
    if (f_out == 0) {
        fprintf(stderr, "file %s cannot be opened\n", out);
        exit(1);
    }

    /* do the stemming process: */
    stemmer = sb_stemmer_new(language, charenc);
    if (stemmer == 0) {
        if (charenc == NULL) {
            fprintf(stderr, "language `%s' not available for stemming\n", language);
            exit(1);
        } else {
            fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
            exit(1);
        }
    }
    stem_file(stemmer, f_in, f_out);
    sb_stemmer_delete(stemmer);

    if (in != 0) (void) fclose(f_in);

    /* Flush before closing so that a failed write (full disk, closed pipe)
     * is reported through the exit status instead of being lost. */
    if (ferror(f_out) || fflush(f_out) != 0) {
        fprintf(stderr, "error writing output\n");
        status = 1;
    }
    if (out != 0) (void) fclose(f_out);

    return status;
}