RediSearch-1.0.8/.circleci/config.yml
version: 2
jobs:
  build:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - checkout
      - run:
          name: Build
          command: make -j 8
      - run:
          name: Test
          command: make test
      - run:
          name: Persist Artifacts
          command: >-
            mkdir -p /workspace/build && cp src/$MODULE_ARTIFACT /workspace/ &&
            cp ramp.yml /workspace/
      - persist_to_workspace:
          root: /workspace
          paths:
            - '*.so'
            - ramp.yml
            - build
  package_branch:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - attach_workspace:
          at: /workspace
      - run:
          name: Package
          command: >-
            ramp pack -m /workspace/ramp.yml -o
            /workspace/build/$PACKAGE_NAME.{os}-{architecture}.$CIRCLE_BRANCH.zip
            /workspace/$MODULE_ARTIFACT
      - persist_to_workspace:
          root: /workspace
          paths:
            - build
      - store_artifacts:
          path: /workspace/build
  package_release:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - attach_workspace:
          at: /workspace
      - run:
          name: Package
          command: >-
            ramp pack -m /workspace/ramp.yml -o
            /workspace/build/$PACKAGE_NAME.{os}-{architecture}.{semantic_version}.zip
            /workspace/$MODULE_ARTIFACT
      - run:
          name: Package
          command: >-
            ramp pack -m /workspace/ramp.yml -o
            /workspace/build/$PACKAGE_NAME.{os}-{architecture}.latest.zip
            /workspace/$MODULE_ARTIFACT
      - persist_to_workspace:
          root: /workspace
          paths:
            - build
      - store_artifacts:
          path: /workspace/build
  deploy_branch:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - attach_workspace:
          at: /workspace
      - run:
          name: Deploy to S3
          command: >-
            aws s3 cp /workspace/build/ s3://redismodules/$PACKAGE_NAME/ --acl
            public-read --recursive --exclude "*" --include "*.zip"
  deploy_release:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - attach_workspace:
          at: /workspace
      - run:
          name: Deploy to S3
          command: >-
            aws s3 cp /workspace/build/ s3://redismodules/$PACKAGE_NAME/ --acl
            public-read --recursive --exclude "*" --include "*.zip"
  deploy_docs:
    docker:
      - image: 'redislabsmodules/rmbuilder:latest'
    steps:
      - checkout
      - run:
          name: Build Docs
          command: mkdocs build
      - run:
          name: Deploy Docs to S3
          command: >-
            aws s3 cp site s3://modules.redislabs.com/redisearch/ --acl
            public-read --recursive
workflows:
  version: 2
  build_and_package:
    jobs:
      - build:
          filters:
            tags:
              only: /.*/
      - package_branch:
          requires:
            - build
          filters:
            branches:
              only: master
      - package_release:
          requires:
            - build
          filters:
            branches:
              ignore: /.*/
            tags:
              only: '/^v[0-9].*/'
      - deploy_branch:
          requires:
            - package_branch
      - deploy_release:
          filters:
            tags:
              only: '/^v[0-9].*/'
          requires:
            - package_release
      - deploy_docs:
          filters:
            branches:
              only: master

RediSearch-1.0.8/.clang-format
---
Language: Cpp
AccessModifierOffset: -1
AlignAfterOpenBracket: true
AlignConsecutiveAssignments: false
AlignEscapedNewlinesLeft: true
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 4
UseTab: Never
SortIncludes: false
...

RediSearch-1.0.8/.dockerignore
**/*.rdb*
**/*.aof*
**/*.rdb
**/*.out
**/*.md
**/*.zip
**/*.bz2
**/*.gz
**/*.so
**/*.o
**/*.a
.git
site

RediSearch-1.0.8/.gitignore
*
!/**/
!**/*.h
!**/*.c
!**/*.py
!**/*.md
!.gitignore
!**/Makefile
!**/*.yml
!**/*.yaml
!**/*.mak
!/cndict
/site
!/debian/*
src/dep/snowball/libstemmer/mkinc.mak

RediSearch-1.0.8/Dockerfile
FROM redislabsmodules/rmbuilder:latest as builder
# Build the source
ADD ./src /src
WORKDIR /src
RUN set -ex;\
    deps="$DEPS";\
    make all -j 4; \
    make test;
# Package the runner
FROM redis:latest
ENV LIBDIR /usr/lib/redis/modules
WORKDIR /data
RUN set -ex;\
    mkdir -p "$LIBDIR";
COPY --from=builder /src/redisearch.so "$LIBDIR"
CMD ["redis-server", "--loadmodule", "/usr/lib/redis/modules/redisearch.so"]

RediSearch-1.0.8/LICENSE
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year>  <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.

RediSearch-1.0.8/Makefile
all:
	$(MAKE) -C ./src all

test:
	$(MAKE) -C ./src $@

clean:
	$(MAKE) -C ./src $@

distclean:
	$(MAKE) -C ./src $@
.PHONY: distclean

package: all
	$(MAKE) -C ./src package
.PHONY: package

buildall:
	$(MAKE) -C ./src $@

deploydocs:
	mkdocs gh-deploy
.PHONY: deploydocs

staticlib:
	$(MAKE) -C ./src $@

# Builds a small utility that outputs the current version
print_version:
	$(MAKE) -C ./src print_version

docker: distclean print_version
	docker build . -t redislabs/redisearch

docker_push: docker
	docker push redislabs/redisearch:latest
	docker tag redislabs/redisearch:latest redislabs/redisearch:`./src/print_version`
	docker push redislabs/redisearch:`./src/print_version`

RediSearch-1.0.8/README.md
[![Build Status](https://circleci.com/gh/RedisLabsModules/RediSearch/tree/master.svg?style=svg)](https://circleci.com/gh/RedisLabsModules/RediSearch/tree/master)
# RediSearch
### Full-Text search over redis by RedisLabs

### See Full Documentation at [http://redisearch.io](http://redisearch.io)
### Latest Release: [1.0.7](https://github.com/RedisLabsModules/RediSearch/releases)
# Overview
Redisearch implements a search engine on top of redis, but unlike other redis
search libraries, it does not use internal data structures like sorted sets.
Inverted indexes are stored as a special compressed data type that allows for fast
indexing, fast search, and a low memory footprint.
This also enables more advanced features, like exact phrase matching and numeric filtering for text queries,
that are not possible or efficient with traditional redis search approaches.
# Docker Image
[https://hub.docker.com/r/redislabs/redisearch/](https://hub.docker.com/r/redislabs/redisearch/)
```sh
$ docker run -p 6379:6379 redislabs/redisearch:latest
```
# Mailing List / Forum
Got questions? Feel free to ask at the [RediSearch mailing list](https://groups.google.com/forum/#!forum/redisearch).
# Client Libraries
Official (Redis Labs) and community Clients:
| Language | Library | Author | License | Comments |
|---|---|---|---|---|
|Python | [redisearch-py](https://github.com/RedisLabs/redisearch-py) | Redis Labs | BSD | Usually the most up-to-date client library |
| Java | [JRediSearch](https://github.com/RedisLabs/JRediSearch) | Redis Labs | BSD | |
| Go | [redisearch-go](https://github.com/RedisLabs/redisearch-go) | Redis Labs | BSD | Incomplete API |
| JavaScript | [RedRediSearch](https://github.com/stockholmux/redredisearch) | Kyle J. Davis | MIT | Partial API, compatible with [Reds](https://github.com/tj/reds) |
| C# | [NRediSearch](https://libraries.io/nuget/NRediSearch) | Marc Gravell | MIT | Part of StackExchange.Redis |
| PHP | [redisearch-php](https://github.com/ethanhann/redisearch-php) | Ethan Hann | MIT | |
| Ruby on Rails | [redi_search_rails](https://github.com/dmitrypol/redi_search_rails) | Dmitry Polyakovsky | MIT | |
| Ruby | [redisearch-rb](https://github.com/vruizext/redisearch-rb) | Victor Ruiz | MIT | |
## Primary Features:
* Full-Text indexing of multiple fields in documents.
* Incremental indexing without performance loss.
* Document ranking (provided manually by the user at index time).
* Field weights.
* Complex boolean queries with AND, OR, NOT operators between sub-queries.
* Prefix matching in full-text queries.
* Auto-complete suggestions (with fuzzy prefix suggestions)
* Exact Phrase Search.
* Stemming based query expansion in [many languages](http://redisearch.io/Stemming/) (using [Snowball](http://snowballstem.org/)).
* Support for logographic (Chinese, etc.) tokenization and querying (using [Friso](https://github.com/lionsoul2014/friso))
* Limiting searches to specific document fields (up to 32 fields supported).
* Numeric filters and ranges.
* Geographical search utilizing redis' own GEO commands.
* Supports any utf-8 encoded text.
* Retrieve full document content or just ids.
* Automatically index existing HASH keys as documents.
* Document Deletion (Update can be done by deletion and then re-insertion).
* Sortable properties (i.e. sorting users by age or name).
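The following minimal session shows several of these features together; the index name, fields and document values here are made up for the example:

```
FT.CREATE myIdx SCHEMA title TEXT WEIGHT 5.0 body TEXT url TEXT
FT.ADD myIdx doc1 1.0 FIELDS title "hello world" body "lorem ipsum" url "http://redis.io"
FT.SEARCH myIdx "hello world" LIMIT 0 10
```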
## Cluster Support
RediSearch has a distributed cluster version that can scale to billions of documents and hundreds of servers. However, at the moment it is only available as part of Redis Labs Enterprise. See the [Redis Labs Website](https://redislabs.com/modules/redisearch/) for more info and contact information.
### Not *yet* supported:
* Spelling correction
* Aggregations
### License: AGPL
This basically means that you can freely use it for your own projects without "virality" to your code,
as long as you're not modifying the module itself. See [This Blog Post](https://redislabs.com/blog/why-redis-labs-modules-are-agpl/) for more details.

RediSearch-1.0.8/docs/CNAME
redisearch.io

RediSearch-1.0.8/docs/Chinese.md
# Chinese support in Redis Search
Support for adding documents in Chinese is available starting at version 0.99.0.
Chinese support allows Chinese documents to be added and tokenized using segmentation
rather than simple tokenization using whitespace and/or punctuation.
Indexing a Chinese document is different than indexing a document in most other
languages because of how tokens are extracted. While most languages can have
their tokens distinguished by separation characters and whitespace, this
is not common in Chinese.
Chinese tokenization is done by scanning the input text and checking every
character or sequence of characters against a dictionary of predefined terms
and determining the most likely (based on the surrounding terms and characters)
match.
Redis Search makes use of the [Friso](https://github.com/lionsoul2014/friso)
Chinese tokenization library for this purpose. This is largely transparent to
the user and often no additional configuration is required.
## Example: Using Chinese in Redis Search
In pseudo-code:
```
FT.CREATE idx SCHEMA txt TEXT
FT.ADD idx docCn 1.0 LANGUAGE chinese FIELDS txt "Redis支持主从同步。数据可以从主服务器向任意数量的从服务器上同步,从服务器可以是关联其他从服务器的主服务器。这使得Redis可执行单层树复制。从盘可以有意无意的对数据进行写操作。由于完全实现了发布/订阅机制,使得从数据库在任何地方同步树时,可订阅一个频道并接收主服务器完整的消息发布记录。同步对读取操作的可扩展性和数据冗余很有帮助。[8]"
FT.SEARCH idx "数据" LANGUAGE chinese HIGHLIGHT SUMMARIZE
# Outputs:
# 数据?... 数据进行写操作。由于完全实现了发布... 数据冗余很有帮助。[8...
```
Using the Python Client:
```
# -*- coding: utf-8 -*-
from redisearch.client import Client, Query
from redisearch import TextField
client = Client('idx')
try:
    client.drop_index()
except:
    pass
client.create_index([TextField('txt')])

# Add a document
client.add_document('docCn1',
                    txt='Redis支持主从同步。数据可以从主服务器向任意数量的从服务器上同步从服务器可以是关联其他从服务器的主服务器。这使得Redis可执行单层树复制。从盘可以有意无意的对数据进行写操作。由于完全实现了发布/订阅机制,使得从数据库在任何地方同步树时,可订阅一个频道并接收主服务器完整的消息发布记录。同步对读取操作的可扩展性和数据冗余很有帮助。[8]',
                    language='chinese')
print client.search(Query('数据').summarize().highlight().language('chinese')).docs[0].txt
```
Prints:
```
数据?... 数据进行写操作。由于完全实现了发布... 数据冗余很有帮助。[8...
```
## Using custom dictionaries
If you wish to use a custom dictionary, you can do so at the module level when
loading the module. The `FRISOINI` setting can point to the location of a
`friso.ini` file which contains the relevant settings and paths to the dictionary
files.
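For example, the module could be loaded with a custom configuration like this (the paths are illustrative):

```
redis-server --loadmodule ./redisearch.so FRISOINI /usr/share/friso/friso.ini
```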
Note that there is no "default" friso.ini file location. Redis Search comes with
its own `friso.ini` and dictionary files which are compiled into the module
binary at build-time.

RediSearch-1.0.8/docs/Clients.md
# RediSearch Client Libraries
RediSearch has several client libraries, written by the module authors and community members - abstracting the API in different programming languages.
While it is possible and simple to use the raw redis commands API, in most cases it's easier to just use a client library abstracting it.
## Currently available Libraries
| Language | Library | Author | License | Comments |
|---|---|---|---|---|
|Python | [redisearch-py](https://github.com/RedisLabs/redisearch-py) | Redis Labs | BSD | Usually the most up-to-date client library |
| Java | [JRediSearch](https://github.com/RedisLabs/JRediSearch) | Redis Labs | BSD | |
| Go | [redisearch-go](https://github.com/RedisLabs/redisearch-go) | Redis Labs | BSD | Incomplete API |
| JavaScript | [RedRediSearch](https://github.com/stockholmux/redredisearch) | Kyle J. Davis | MIT | Partial API, compatible with [Reds](https://github.com/tj/reds) |
| C# | [NRediSearch](https://libraries.io/nuget/NRediSearch) | Marc Gravell | MIT | Part of StackExchange.Redis |
| PHP | [redisearch-php](https://github.com/ethanhann/redisearch-php) | Ethan Hann | MIT | |
| Ruby on Rails | [redi_search_rails](https://github.com/dmitrypol/redi_search_rails) | Dmitry Polyakovsky | MIT | |
| Ruby | [redisearch-rb](https://github.com/vruizext/redisearch-rb) | Victor Ruiz | MIT | |

RediSearch-1.0.8/docs/Commands.md
# RediSearch Full Command Documentation
## FT.CREATE
### Format:
```
FT.CREATE {index}
[NOOFFSETS] [NOFIELDS]
[STOPWORDS {num} {stopword} ...]
SCHEMA {field} [TEXT [NOSTEM] [WEIGHT {weight}] | NUMERIC | GEO] [SORTABLE] [NOINDEX] ...
```
### Description:
Creates an index with the given spec. The index name will be used in all the key names
so keep it short!
!!! warning "Note on field number limits"
RediSearch supports up to 1024 fields per schema, out of which at most 128 can be TEXT fields.
On 32 bit builds, at most 64 fields can be TEXT fields.
Notice that the more fields you have, the larger your index will be, as each additional 8 fields require one extra byte per index record to encode.
You can always use the NOFIELDS option and not encode field information into the index, for saving space, if you do not need filtering by text fields. This will still allow filtering by numeric and geo fields.
### Parameters:
* **index**: the index name to create. If it exists the old spec will be overwritten
* **NOOFFSETS**: If set, we do not store term offsets for documents (saves memory, does not allow exact searches or highlighting).
Implies `NOHL`
* **NOHL**: Conserves storage space and memory by disabling highlighting support. If set, we do
not store corresponding byte offsets for term positions. `NOHL` is also implied by `NOOFFSETS`.
* **NOFIELDS**: If set, we do not store field bits for each term. Saves memory, does not allow filtering by specific fields.
* **NOFREQS**: If set, we avoid saving the term frequencies in the index. This saves
memory but does not allow sorting based on the frequencies of a given term within
the document.
* **STOPWORDS**: If set, we set the index with a custom stopword list, to be ignored during indexing and search time. {num} is the number of stopwords, followed by a list of stopword arguments exactly the length of {num}.
If not set, we take the default list of stopwords.
If **{num}** is set to 0, the index will not have stopwords.
* **SCHEMA {field} {options...}**: After the SCHEMA keyword we define the index fields.
They can be numeric, textual or geographical. For textual fields we optionally specify a weight. The default weight is 1.0.
### Field Options
* **SORTABLE**
Numeric or text field can have the optional SORTABLE argument that allows the user to later [sort the results by the value of this field](/Sorting) (this adds memory overhead so do not declare it on large text fields).
* **NOSTEM**
Text fields can have the NOSTEM argument which will disable stemming when indexing its values.
This may be ideal for things like proper names.
* **NOINDEX**
Fields can have the `NOINDEX` option, which means they will not be indexed.
This is useful in conjunction with `SORTABLE`, to create fields whose update using PARTIAL will not cause full reindexing of the document. If a field has NOINDEX and doesn't have SORTABLE, it will just be ignored by the index.
### Complexity
O(1)
### Returns:
OK or an error
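For example, the following call (the index and field names are illustrative) creates an index with a weighted text field, an unstemmed text field, a sortable numeric field and a geo field:

```
FT.CREATE myIdx SCHEMA title TEXT WEIGHT 5.0 body TEXT NOSTEM price NUMERIC SORTABLE loc GEO
```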
---
## FT.ADD
### Format:
```
FT.ADD {index} {docId} {score}
[NOSAVE]
[REPLACE [PARTIAL]]
[LANGUAGE {language}]
[PAYLOAD {payload}]
FIELDS {field} {value} [{field} {value}...]
```
### Description
Add a document to the index.
### Parameters:
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **docId**: The document's id that will be returned from searches.
Note that the same docId cannot be added twice to the same index
- **score**: The document's rank based on the user's ranking. This must be between 0.0 and 1.0.
If you don't have a score just set it to 1
- **NOSAVE**: If set to true, we will not save the actual document in the index and only index it.
- **REPLACE**: If set, we will do an UPSERT style insertion - and delete an older version of the document if it exists.
- **PARTIAL** (only applicable with REPLACE): If set, you do not have to specify all fields for reindexing. Fields not given to the command will be loaded from the current version of the document. Also, if only non indexable fields, score or payload are set - we do not do a full reindexing of the document, and this will be a lot faster.
- **FIELDS**: Following the FIELDS specifier, we are looking for pairs of `{field} {value}` to be indexed.
Each field will be scored based on the index spec given in FT.CREATE.
Passing fields that are not in the index spec will cause them to be stored as part of the document, or ignored if NOSAVE is set
- **PAYLOAD {payload}**: Optionally set a binary safe payload string to the document,
that can be evaluated at query time by a custom scoring function, or retrieved to the client.
- **LANGUAGE language**: If set, we use a stemmer for the supplied language during indexing. Defaults to English.
If an unsupported language is sent, the command returns an error.
The supported languages are:
> "arabic", "danish", "dutch", "english", "finnish", "french",
> "german", "hungarian", "italian", "norwegian", "portuguese", "romanian",
> "russian", "spanish", "swedish", "tamil", "turkish"
> "chinese"
If indexing a Chinese-language document, you must set the language to `chinese`
in order for the chinese characters to be tokenized properly.
### Adding Chinese Documents
When adding Chinese-language documents, `LANGUAGE chinese` should be set in
order for the indexer to properly tokenize the terms. If the default language
is used then search terms will be extracted based on punctuation characters and
whitespace. The Chinese language tokenizer makes use of a segmentation algorithm
(via [Friso](https://github.com/lionsoul2014/friso)) which segments texts and
checks it against a predefined dictionary. See [Stemming](/Stemming) for more
information.
### Complexity
O(n), where n is the number of tokens in the document
### Returns
OK on success, or an error if something went wrong.
!!! warning "FT.ADD with REPLACE and PARTIAL"
By default, FT.ADD does not allow updating the document, and will fail if it already exists in the index.
However, updating the document is possible with the REPLACE and REPLACE PARTIAL options.
**REPLACE**: On its own, sets the document to the new values, and reindexes it. Any fields not given will not be loaded from the current version of the document.
**REPLACE PARTIAL**: When both arguments are used, we can update just part of the document fields, and the rest will be loaded before reindexing. Not only that, but if only the score, payload and non-indexed fields (using NOINDEX) are updated, we will not actually reindex the document, just update its metadata internally, which is a lot faster and does not create index garbage.
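For example (all names and values are illustrative), an initial insertion followed by a partial update of a single field:

```
FT.ADD myIdx doc1 1.0 FIELDS title "hello world" body "lorem ipsum"
FT.ADD myIdx doc1 1.0 REPLACE PARTIAL FIELDS body "dolor sit amet"
```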
----
## FT.ADDHASH
### Format
```
FT.ADDHASH {index} {docId} {score} [LANGUAGE language] [REPLACE]
```
### Description
Add a document to the index from an existing HASH key in Redis.
### Parameters:
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **docId**: The document's id. This has to be an existing HASH key in redis that will hold the fields
the index needs.
- **score**: The document's rank based on the user's ranking. This must be between 0.0 and 1.0.
If you don't have a score just set it to 1
- **REPLACE**: If set, we will do an UPSERT style insertion - and delete an older version of the document if it exists.
- **LANGUAGE language**: If set, we use a stemmer for the supplied language during indexing. Defaults to English.
If an unsupported language is sent, the command returns an error.
The supported languages are:
> "arabic", "danish", "dutch", "english", "finnish", "french",
> "german", "hungarian", "italian", "norwegian", "portuguese", "romanian",
> "russian", "spanish", "swedish", "tamil", "turkish"
### Complexity
O(n), where n is the number of tokens in the document
### Returns
OK on success, or an error if something went wrong.
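For example, indexing an existing hash whose fields match the index schema (key and field names are illustrative):

```
HSET doc2 title "hello world"
FT.ADDHASH myIdx doc2 1.0 LANGUAGE english
```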
----
## FT.INFO
### Format
```
FT.INFO {index}
```
### Description
Return information and statistics on the index. Returned values include:
* Number of documents.
* Number of distinct terms.
* Average bytes per record.
* Size and capacity of the index buffers.
Example:
```
127.0.0.1:6379> ft.info wik{0}
1) index_name
2) wikipedia
3) fields
4) 1) 1) title
      2) type
      3) FULLTEXT
      4) weight
      5) "1"
   2) 1) body
      2) type
      3) FULLTEXT
      4) weight
      5) "1"
5) num_docs
6) "502694"
7) num_terms
8) "439158"
9) num_records
10) "8098583"
11) inverted_sz_mb
12) "45.58
13) inverted_cap_mb
14) "56.61
15) inverted_cap_ovh
16) "0.19
17) offset_vectors_sz_mb
18) "9.27
19) skip_index_size_mb
20) "7.35
21) score_index_size_mb
22) "30.8
23) records_per_doc_avg
24) "16.1
25) bytes_per_record_avg
26) "5.90
27) offsets_per_term_avg
28) "1.20
29) offset_bits_per_record_avg
30) "8.00
```
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
### Complexity
O(1)
### Returns
Array Response. A nested array of keys and values.
---
## FT.SEARCH
### Format
```
FT.SEARCH {index} {query} [NOCONTENT] [VERBATIM] [NOSTOPWORDS] [WITHSCORES] [WITHPAYLOADS] [WITHSORTKEYS]
[FILTER {numeric_field} {min} {max}] ...
[GEOFILTER {geo_field} {lon} {lat} {radius} m|km|mi|ft]
[INKEYS {num} {key} ... ]
[INFIELDS {num} {field} ... ]
[RETURN {num} {field} ... ]
[SUMMARIZE [FIELDS {num} {field} ... ] [FRAGS {num}] [LEN {fragsize}] [SEPARATOR {separator}]]
[HIGHLIGHT [FIELDS {num} {field} ... ] [TAGS {open} {close}]]
[SLOP {slop}] [INORDER]
[LANGUAGE {language}]
[EXPANDER {expander}]
[SCORER {scorer}]
[PAYLOAD {payload}]
[SORTBY {field} [ASC|DESC]]
[LIMIT offset num]
```
### Description
Search the index with a textual query, returning either documents or just ids.
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **query**: the text query to search. If it's more than a single word, put it in quotes.
See below for documentation on query syntax.
- **NOCONTENT**: If it appears after the query, we only return the document ids and not
the content. This is useful if RediSearch is only an index on an external document collection.
- **RETURN {num} {field} ...**: Use this keyword to limit which fields from the document are returned.
`num` is the number of fields following the keyword. If `num` is 0, it acts like `NOCONTENT`.
- **SUMMARIZE ...**: Use this option to return only the sections of the field which contain the matched text.
See [Highlighting](/Highlight) for more details.
- **HIGHLIGHT ...**: Use this option to format occurrences of matched text. See [Highlighting](/Highlight) for more
details
- **LIMIT first num**: If the parameters appear after the query, we limit the results to
the offset and number of results given. The default is 0 10.
- **INFIELDS {num} {field} ...**: If set, filter the results to ones appearing only in specific
fields of the document, like title or url. num is the number of specified field arguments
- **INKEYS {num} {key} ...**: If set, we limit the result to a given set of keys specified in the list.
The first argument must be the length of the list, and greater than zero.
Non-existent keys are ignored - unless all the keys are non-existent.
- **SLOP {slop}**: If set, we allow a maximum of N intervening unmatched offsets between phrase terms (i.e. the slop for exact phrases is 0).
- **INORDER**: If set, and usually used in conjunction with SLOP, we make sure the query terms appear in the same order in the document as in the query, regardless of the offsets between them.
- **FILTER numeric_field min max**: If set, and numeric_field is defined as a numeric field in
FT.CREATE, we will limit results to those having numeric values ranging between min and max.
min and max follow ZRANGE syntax, and can be **-inf**, **+inf** and use `(` for exclusive ranges.
Multiple numeric filters for different fields are supported in one query.
- **GEOFILTER {geo_field} {lon} {lat} {radius} m|km|mi|ft**: If set, we filter the results to a given radius
from lon and lat. Radius is given as a number and units. See [GEORADIUS](https://redis.io/commands/georadius) for more details.
- **NOSTOPWORDS**: If set, we do not filter stopwords from the query.
- **WITHSCORES**: If set, we also return the relative internal score of each document. This can be
used to merge results from multiple instances.
- **WITHSORTKEYS**: Only relevant in conjunction with **SORTBY**. Returns the value of the sorting key, right after the id and score and/or payload if requested. This is usually not needed by users, and exists for distributed search coordination purposes.
- **VERBATIM**: if set, we do not try to use stemming for query expansion but search the query terms verbatim.
- **LANGUAGE {language}**: If set, we use a stemmer for the supplied language during search for query expansion.
If querying documents in Chinese, this should be set to `chinese` in order to
properly tokenize the query terms.
Defaults to English. If an unsupported language is sent, the command returns an error. See FT.ADD for the list of languages.
- **EXPANDER {expander}**: If set, we will use a custom query expander instead of the stemmer. [See Extensions](/Extensions).
- **SCORER {scorer}**: If set, we will use a custom scoring function defined by the user. [See Extensions](/Extensions).
- **PAYLOAD {payload}**: Add an arbitrary, binary safe payload that will be exposed to custom scoring functions. [See Extensions](/Extensions).
- **WITHPAYLOADS**: If set, we retrieve optional document payloads (see FT.ADD).
the payloads follow the document id, and if `WITHSCORES` was set, follow the scores.
- **SORTBY {field} [ASC|DESC]**: If specified, and field is a [sortable field](/Sorting), the results are ordered by the value of this field. This applies to both text and numeric fields.
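A combined example, with hypothetical index and field names (`price` is assumed to be a sortable numeric field):
```
127.0.0.1:6379> FT.SEARCH products "hd tv" FILTER price 100 300 INFIELDS 1 title SORTBY price ASC LIMIT 0 5
```
This matches "hd tv" in the `title` field only, keeps results whose price is between 100 and 300, and returns the five cheapest matches.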
### Complexity
O(n) for single word queries (though for popular words we save a cache of the top 50 results).
Complexity for complex queries changes, but in general it's proportional to the number of words and the number of intersection points between them.
### Returns
**Array reply,** where the first element is the total number of results, and then pairs of document id, and a nested array of field/value.
If **NOCONTENT** was given, we return an array where the first element is the total number of results, and the rest of the members are document ids.
---
## FT.EXPLAIN
### Format
```
FT.EXPLAIN {index} {query}
```
### Description
Return the execution plan for a complex query
Example:
```sh
$ redis-cli --raw
127.0.0.1:6379> FT.EXPLAIN rd "(foo bar)|(hello world) @date:[100 200]|@date:[500 +inf]"
INTERSECT {
  UNION {
    INTERSECT {
      foo
      bar
    }
    INTERSECT {
      hello
      world
    }
  }
  UNION {
    NUMERIC {100.000000 <= x <= 200.000000}
    NUMERIC {500.000000 <= x <= inf}
  }
}
```
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **query**: The query string, as if sent to FT.SEARCH
### Complexity
O(1)
### Returns
String Response. A string representing the execution plan (see above example).
**Note**: You should use `redis-cli --raw` to properly read line-breaks in the returned response.
---
## FT.DEL
### Format
```
FT.DEL {index} {doc_id}
```
### Description
Delete a document from the index. Returns 1 if the document was in the index, or 0 if not.
After deletion, the document can be re-added to the index. It will get a different internal id and will be a new document from the index's POV.
!!! warning "FT.DEL does NOT delete the actual document!"
Since RediSearch regards documents as separate entities to the index, and allows things like adding existing documents or indexing without saving the document - FT.DEL only deletes the reference to the document from the index, not the actual Redis HASH key where the document is stored.
If you want to delete the HASH storing the documents, a further **DEL {doc_id}** needs to be issued. You can run both of them in a MULTI transaction.
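For example, removing both the index reference and the underlying hash atomically (index and document names here are hypothetical):
```
127.0.0.1:6379> MULTI
OK
127.0.0.1:6379> FT.DEL idx doc1
QUEUED
127.0.0.1:6379> DEL doc1
QUEUED
127.0.0.1:6379> EXEC
1) (integer) 1
2) (integer) 1
```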
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **doc_id**: the id of the document to be deleted. It does not actually delete the HASH key in which the document is stored. Use DEL to do that manually if needed.
### Complexity
O(1)
### Returns
Integer Reply: 1 if the document was deleted, 0 if not.
---
## FT.GET
### Format
```
FT.GET {index} {doc id}
```
### Description
Returns the full contents of a document.
Currently it is equivalent to HGETALL, but this is future proof and will allow us to change the internal representation of documents inside redis in the future. In addition, it allows simpler implementation of fetching documents in clustered mode.
If the document does not exist or is not a HASH object, we return a NULL reply.
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **documentId**: The id of the document as inserted to the index
### Returns
Array Reply: Key-value pairs of field names and values of the document
---
## FT.MGET
### Format
```
FT.MGET {index} {docId} ...
```
### Description
Returns the full contents of multiple documents.
Currently it is equivalent to calling multiple HGETALL commands, although faster.
This command is also future proof, and will allow us to change the internal representation of documents inside redis in the future.
In addition, it allows simpler implementation of fetching documents in clustered mode.
We return an array with exactly the same number of elements as the number of keys sent to the command.
Each element in turn is an array of key-value pairs representing the document.
If a document is not found or is not a valid HASH object, its place in the parent array is filled with a Null reply object.
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **documentIds**: The ids of the requested documents as inserted to the index
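For example, assuming `doc1` exists in the index and `doc2` does not:
```
127.0.0.1:6379> FT.MGET idx doc1 doc2
1) 1) "title"
   2) "hello world"
2) (nil)
```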
### Returns
Array Reply: An array with exactly the same number of elements as the number of keys sent to the command. Each element in it is either an array representing the document, or Null if it was not found.
---
## FT.DROP
### Format
```
FT.DROP {index}
```
### Description
Deletes all the keys associated with the index.
If no other data is on the redis instance, this is equivalent to FLUSHDB, apart from the fact
that the index specification is not deleted.
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
### Returns
Status Reply: OK on success.
---
## FT.TAGVALS
### Format
```
FT.TAGVALS {index} {field_name}
```
### Description
Return the distinct tags indexed in a [Tag field](/Tags/).
This is useful if your tag field indexes things like cities, categories, etc.
!!! warning "Limitations"
There is no paging or sorting; the tags are not alphabetically sorted.
This command only operates on [Tag fields](/Tags/).
The strings are returned lower-cased and stripped of whitespace, but otherwise unchanged.
### Parameters
- **index**: The Fulltext index name. The index must be first created with FT.CREATE
- **field_name**: The name of a Tag field defined in the schema.
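For example, assuming a `cities` tag field (note that the returned tags are lower-cased):
```
127.0.0.1:6379> FT.TAGVALS idx cities
1) "barcelona"
2) "new york"
```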
### Returns
Array Reply: All the distinct tags in the tag index.
### Complexity
O(n), n being the cardinality of the tag field.
---
## FT.SUGADD
### Format
```
FT.SUGADD {key} {string} {score} [INCR] [PAYLOAD {payload}]
```
### Description
Add a suggestion string to an auto-complete suggestion dictionary. This is disconnected from the
index definitions, and leaves creating and updating suggestion dictionaries to the user.
### Parameters
- **key**: the suggestion dictionary key.
- **string**: the suggestion string we index
- **score**: a floating point number of the suggestion string's weight
- **INCR**: if set, we increment the existing entry of the suggestion by the given score, instead of replacing the score. This is useful for updating the dictionary based on user queries in real time
- **PAYLOAD {payload}**: If set, we save an extra payload with the suggestion, that can be fetched by adding the `WITHPAYLOADS` argument to `FT.SUGGET`.
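For example, adding a weighted suggestion with a payload, then bumping its score (the key name `ac` is hypothetical):
```
127.0.0.1:6379> FT.SUGADD ac "hello world" 1 PAYLOAD "doc1"
(integer) 1
127.0.0.1:6379> FT.SUGADD ac "hello world" 1 INCR
(integer) 1
```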
### Returns:
Integer Reply: the current size of the suggestion dictionary.
---
## FT.SUGGET
### Format
```
FT.SUGGET {key} {prefix} [FUZZY] [WITHSCORES] [WITHPAYLOADS] [MAX num]
```
### Description
Get completion suggestions for a prefix
### Parameters:
- **key**: the suggestion dictionary key.
- **prefix**: the prefix to complete on
- **FUZZY**: if set, we do a fuzzy prefix search, including prefixes at a Levenshtein distance of 1 from the prefix sent
- **MAX num**: If set, we limit the results to a maximum of `num`. (**Note**: The default is 5, and the number cannot be greater than 10).
- **WITHSCORES**: If set, we also return the score of each suggestion. This can be used to merge results from multiple instances
- **WITHPAYLOADS**: If set, we return optional payloads saved along with the suggestions. If no payload is present for an entry, we return a Null Reply.
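For example, continuing the `FT.SUGADD` sketch above:
```
127.0.0.1:6379> FT.SUGGET ac "hel" MAX 5 WITHPAYLOADS
1) "hello world"
2) "doc1"
```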
### Returns:
Array Reply: a list of the top suggestions matching the prefix, optionally with score after each entry
---
## FT.SUGDEL
### Format
```
FT.SUGDEL {key} {string}
```
### Description
Delete a string from a suggestion index.
### Parameters
- **key**: the suggestion dictionary key.
- **string**: the string to delete
### Returns:
Integer Reply: 1 if the string was found and deleted, 0 otherwise.
----
## FT.SUGLEN
### Format
```
FT.SUGLEN {key}
```
### Description
Get the size of an auto-complete suggestion dictionary
### Parameters
* **key**: the suggestion dictionary key.
### Returns:
Integer Reply: the current size of the suggestion dictionary.
----
## FT.OPTIMIZE **DEPRECATED**
### Format
```
FT.OPTIMIZE {index}
```
### Description
This command is deprecated. Index optimizations are done by the internal garbage collector in the background. Client libraries should not implement this command, and should remove it if they haven't already.
---
# Run-Time Configuration
RediSearch supports a few run-time configuration options that should be determined when loading the module. In time more options will be added.
!!! tip "Passing Configuration Options"
In general, passing configuration options is done by appending arguments after the `--loadmodule` argument on the command line, after the `loadmodule` configuration directive in a Redis config file, or after `MODULE LOAD` when loading the module from the command line. For example:
In redis.conf:
```
loadmodule redisearch.so OPT1 OPT2
```
In redis-cli:
```
127.0.0.1:6379> MODULE LOAD redisearch.so OPT1 OPT2
```
In command-line:
```
$ redis-server --loadmodule ./redisearch.so OPT1 OPT2
```
# RediSearch Configuration Options
## TIMEOUT
The maximum amount of time **in milliseconds** that a search query is allowed to run. If this time is exceeded, we return the top results accumulated so far.
The default is 500ms.
**NOTE**: This works only in concurrent mode, so enabling SAFEMODE disables this option.
### Default:
500
### Example:
```
$ redis-server --loadmodule ./redisearch.so TIMEOUT 100
```
## SAFEMODE
If present in the argument list, RediSearch will turn off concurrency for query processing, and work in a single thread.
This is useful if data consistency is extremely important, and avoids a situation where deletion of documents while querying them can cause momentarily inconsistent results (i.e. documents that were valid during the invocation of the query are not returned because they were deleted during query processing).
### Default:
Off (not present)
### Example
```
$ redis-server --loadmodule ./redisearch.so SAFEMODE
```
---
## EXTLOAD {file_name}
If present, we try to load a redisearch extension dynamic library from the specified file path. See [Extensions](/Extensions) for details.
### Default:
None
### Example:
```
$ redis-server --loadmodule ./redisearch.so EXTLOAD ./ext/my_extension.so
```
---
## NOGC
If set, we turn off Garbage Collection for all indexes. This is used mainly for debugging and testing, and should not be set by users.
### Default:
Not set
### Example:
```
$ redis-server --loadmodule ./redisearch.so NOGC
```
---
## MINPREFIX
The minimum number of characters we allow for prefix queries (e.g. `hel*`). Setting it to 1 can hurt performance.
### Default:
2
### Example:
```
$ redis-server --loadmodule ./redisearch.so MINPREFIX 3
```
---
## MAXEXPANSIONS
The maximum number of expansions we allow for query prefixes. Setting it too high can cause performance issues.
### Default:
200
### Example:
```
$ redis-server --loadmodule ./redisearch.so MAXEXPANSIONS 1000
```
# RediSearch internal design
RediSearch implements inverted indexes on top of redis, but unlike previous implementations of redis inverted indexes,
it uses a custom data encoding that allows more memory- and CPU-efficient searches, and more advanced search features.
This document details some of the design choices and how these features are implemented.
## Intro: Redis String DMA
The main feature that this module takes advantage of, is Redis Modules Strings DMA, or Direct Memory Access.
This feature is simple yet very powerful. It basically allows modules to allocate data on Redis string keys,
then get a direct pointer to the data allocated by this key, without copying or serializing it.
This allows very fast access to huge amounts of memory, and since from the module's perspective, the string
value is exposed simply as `char *`, it can be cast to any data structure.
You simply call `RedisModule_StringTruncate` to resize a memory chunk to the size needed, and `RedisModule_StringDMA`
to get direct access to the memory in that key.
See [https://github.com/RedisLabs/RedisModulesSDK/blob/master/FUNCTIONS.md#redismodule_stringdma](https://github.com/RedisLabs/RedisModulesSDK/blob/master/FUNCTIONS.md#redismodule_stringdma)
We use this API in the module mainly to encode inverted indexes, and for other auxiliary data structures besides that.
A generic "Buffer" implementation using DMA strings can be found in [redis_buffer.c](https://github.com/RedisLabsModules/RediSearch/blob/master/src/redis_buffer.c). It automatically resizes
the redis string it uses as raw memory, when the capacity needs to grow.
## Inverted index encoding
An [Inverted Index](https://en.wikipedia.org/wiki/Inverted_index) is the data structure at the heart of all search engines. The idea is simple - for each
word or search term, we save a list of all the documents it appears in, and other data, such as term frequency,
the offsets where the term appeared in the document, and more. Offsets are used for "exact match" type searches,
or for ranking of results.
When a search is performed, we need to either traverse such an index, or intersect or union two or more indexes.
Classic redis implementations of search engines use sorted sets as inverted indexes. This works, but has a large memory
overhead, and also does not allow for encoding of offsets, as explained above.
RediSearch uses String DMA (see above) to efficiently encode inverted indexes.
It combines [Delta Encoding](https://en.wikipedia.org/wiki/Delta_encoding) and
[Varint Encoding](https://developers.google.com/protocol-buffers/docs/encoding#varints) to encode entries,
minimizing space used for indexes, while keeping decompression and traversal efficient.
For each "hit" (document/word entry), we encode:
* The document Id as a delta from the previous document.
* The term frequency, factored by the document's rank (see below)
* Flags, that can be used to filter only specific fields or other user defined properties.
* An Offset Vector, of all the document offsets of the word.
> Note: document ids as entered by the user are converted to internal incremental document ids, that allow
> delta encoding to be efficient, and let the inverted indexes be sorted by document id.
This allows for a single index hit entry to be encoded in as little as 6 bytes
(Note that this is the best case; depending on the number of occurrences of the word in the document, this can get much higher).
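As a rough illustration of the delta + varint idea only - this is not the actual RediSearch encoding or layout - consider the following Python sketch:
```py
def encode_varint(n):
    # encode an unsigned integer as a base-128 varint:
    # 7 bits per byte, with the high bit set on all but the last byte
    out = bytearray()
    while True:
        byte = n & 0x7f
        n >>= 7
        if n:
            out.append(byte | 0x80)
        else:
            out.append(byte)
            return bytes(out)

def encode_doc_ids(doc_ids):
    # delta-encode sorted internal doc ids, then varint-encode each delta
    out = bytearray()
    prev = 0
    for doc_id in sorted(doc_ids):
        out += encode_varint(doc_id - prev)
        prev = doc_id
    return bytes(out)

# consecutive internal ids produce tiny deltas, so each entry here costs one byte
print(encode_doc_ids([1, 2, 3, 130]).hex())  # 0101017f
```
The real index applies the same principle to the other per-hit fields as well, which is why small, monotonically increasing internal document ids matter for compression.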
To optimize searches, we keep two additional auxiliary data structures in different DMA string keys:
1. **Skip Index**: We keep a table of the index offset of 1/50 of the index entries. This allows faster lookup when intersecting inverted indexes, since the entire list need not be traversed.
2. **Score Index**: In simple single-word searches, there is no real need to traverse all the results, just the top N results the user is interested in.
So we keep an auxiliary index of the top 20 or so entries for each term, and use them when applicable.
## Document and result ranking
Each document entered into the engine using `FT.ADD` has a user-assigned rank between 0 and 1.0. This is used in
combination with [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) scoring of each word, to rank the results.
As an optimization, each inverted index hit is encoded with TF*Document_rank as its score, and only IDF is applied
during searches. This may change in the future.
On top of that, in the case of intersection queries, we take the minimal distance between the terms in the query,
and factor that into the ranking. The closer the terms are to each other, the better the result.
When searching, we keep a priority queue of the top N results requested, and eventually return them, sorted by rank.
## Index Specs and field weights
When creating an "index" using `FT.CREATE`, the user specifies the fields to be indexed, and their respective weights.
This can be used to give some document fields, like a title, more weight in ranking results.
For example:
```
FT.CREATE my_index title 10.0 body 1.0 url 2.0
```
Will create an index on fields named title, body and url, with scores of 10, 1 and 2 respectively.
When documents are indexed, the weights are taken from the saved *Index Spec*, that is stored in a special redis key,
and only fields that are specified in this spec are indexed.
## Document data storage
It is not mandatory to save the document data when indexing a document (specifying `NOSAVE` for `FT.ADD` will cause
the document to be indexed but not saved).
If the user does save the document, we simply create a HASH key in redis, containing all fields (including ones not indexed),
and upon search, we simply perform an `HGETALL` query on each retrieved document, returning its entire data.
**TODO**: Document snippets should be implemented down the road.
## Query Execution Engine
We use a chained-iterator based approach to query execution, similar to [Python generators](https://wiki.python.org/moin/Generators) in concept.
We simply chain iterators that yield index hits. Those can be:
1. **Read Iterators**, reading hits one by one from an inverted index. i.e. `hello`
2. **Intersect Iterators**, aggregating two or more iterators, yielding only their intersection points. i.e. `hello AND world`
3. **Exact Intersect Iterators** - same as above, but yielding results only if the intersection is an exact phrase. i.e. `hello NEAR world`
4. **Union Iterators** - combining two or more iterators, and yielding a union of their hits. i.e. `hello OR world`
These are combined based on the query as an execution plan that is evaluated lazily. For example:
```
hello ==> read("hello")
hello world ==> intersect( read("hello"), read("world") )
"hello world" ==> exact_intersect( read("hello"), read("world") )
"hello world" foo ==> intersect(
exact_intersect(
read("hello"),
read("world")
),
read("foo")
)
```
All these iterators are lazy evaluated, entry by entry, with constant memory overhead.
The "root" iterator is read by the query execution engine, and filtered for the top N results in it.
## Numeric Filters
We support defining a field in the index schema as "NUMERIC", meaning you will be able to limit search results only to ones where the given value falls within a specific range. Filtering is done by adding `FILTER` predicates (more than one is supported) to your query. e.g.:
```
FT.SEARCH products "hd tv" FILTER price 100 (300
```
The filter syntax follows the ZRANGEBYSCORE semantics of redis, meaning `-inf` and `+inf` are supported, and prepending `(` to a number means an exclusive range.
As of release 0.6, the implementation uses a multi-level range tree, saving ranges at multiple resolutions, to allow efficient range scanning.
Adding numeric filters can accelerate slow queries if the numeric range is small relative to the entire span of the filtered field.
For example, a filter on dates focusing on a few days out of years of data, can speed a heavy query by an order of magnitude.
## Auto-Complete and Fuzzy Suggestions
Another important feature for RediSearch is its auto-complete or suggest commands. It allows you to create dictionaries of weighted terms, and then query them for completion suggestions to a given user prefix. For example, if we put the term “lcd tv” into a dictionary, sending the prefix “lc” will return it as a result. The dictionary is modelled as a compressed trie (prefix tree) with weights, that is traversed to find the top suffixes of a prefix.
RediSearch also allows for Fuzzy Suggestions, meaning you can get suggestions to user prefixes even if the user has a typo in the prefix. This is enabled using a Levenshtein Automaton, allowing efficient searching of the dictionary for all terms within a maximal Levenshtein Distance of a term or prefix. Suggestions are then weighted based on both their original score and their distance from the prefix typed by the user. Currently we support (for performance reasons) only suggestions where the prefix is up to 1 Levenshtein Distance away from the typed prefix.
However, since searching for fuzzy prefixes, especially very short ones, will traverse an enormous amount of suggestions (in fact, fuzzy suggestions for any single letter will traverse the entire dictionary!), it is recommended to use this feature carefully, and only when considering the performance penalty it incurs. Since redis is single threaded, blocking it for any amount of time means no other queries can be processed at that time.
To support unicode fuzzy matching, we use 16-bit "runes" inside the trie, and not bytes. This increases memory consumption if the text is purely ascii, but allows completion with the same level of support to all modern languages. This is done in the following manner:
1. We assume all input to FT.SUG* commands is valid utf-8.
2. We convert the input strings to 32-bit unicode, optionally normalizing, case-folding and removing accents on the way. If the conversion fails it's because the input is not valid utf-8.
3. We trim the 32-bit runes to 16-bit runes using the lower 16 bits. These can be used for insertion, deletion and search.
4. We convert the output of searches back to utf-8.
# Extending RediSearch
RediSearch supports an extension mechanism, much like Redis supports modules. The API is very minimal at the moment, and it does not yet support dynamic loading of extensions in run-time. Instead, extensions must be written in C (or a language that has an interface with C) and compiled into dynamic libraries that will be loaded at run-time.
There are two kinds of extension APIs at the moment:
1. **Query Expanders**, whose role is to expand query tokens (i.e. stemmers).
2. **Scoring Functions**, whose role is to rank search results at query time.
## Registering and Loading Extensions
Extensions should be compiled into .so files, and loaded into RediSearch on initialization of the module.
* Compiling
Extensions should be compiled and linked as dynamic libraries. An example Makefile for an extension [can be found here](https://github.com/RedisLabsModules/RediSearch/blob/master/src/tests/ext-example/Makefile).
That folder also contains an example extension that is used for testing, and can be taken as a skeleton for implementing your own extension.
* Loading
Loading an extension is done by appending `EXTLOAD {path/to/ext.so}` after the `loadmodule` configuration directive when loading RediSearch. For example:
```sh
$ redis-server --loadmodule ./redisearch.so EXTLOAD ./ext/my_extension.so
```
This causes RediSearch to automatically load the extension and register its expanders and scorers.
## Initializing an Extension
The entry point of an extension is a function with the signature:
```c
int RS_ExtensionInit(RSExtensionCtx *ctx);
```
When loading the extension, RediSearch looks for this function and calls it. This function is responsible for registering and initializing the expanders and scorers.
It should return REDISEARCH_ERR on error or REDISEARCH_OK on success.
### Example Init Function
```c
#include <redisearch.h> // must be in the include path

int RS_ExtensionInit(RSExtensionCtx *ctx) {

  /* Register a scoring function with an alias my_scorer and no special private data and free function */
  if (ctx->RegisterScoringFunction("my_scorer", MyCustomScorer, NULL, NULL) == REDISEARCH_ERR) {
    return REDISEARCH_ERR;
  }

  /* Register a query expander */
  if (ctx->RegisterQueryExpander("my_expander", MyExpander, NULL, NULL) ==
      REDISEARCH_ERR) {
    return REDISEARCH_ERR;
  }

  return REDISEARCH_OK;
}
```
## Calling your custom functions
When performing a query, you can tell RediSearch to use your scorers or expanders by specifying the SCORER or EXPANDER arguments, with the given alias.
e.g.:
```
FT.SEARCH my_index "foo bar" EXPANDER my_expander SCORER my_scorer
```
**NOTE**: Expander and scorer aliases are **case sensitive**.
## The Query Expander API
At the moment, we only support basic query expansion, one token at a time. An expander can decide to expand any given token with as many tokens as it wishes, which will be union-merged at query time.
The API for an expander is the following:
```c
#include <redisearch.h> // must be in the include path

void MyQueryExpander(RSQueryExpanderCtx *ctx, RSToken *token) {
  ...
}
```
### RSQueryExpanderCtx
RSQueryExpanderCtx is a context that contains private data of the extension, and a callback method to expand the query. It is defined as:
```c
typedef struct RSQueryExpanderCtx {

  /* Opaque query object used internally by the engine, and should not be accessed */
  struct RSQuery *query;

  /* Opaque query node object used internally by the engine, and should not be accessed */
  struct RSQueryNode **currentNode;

  /* Private data of the extension, set on extension initialization */
  void *privdata;

  /* The language of the query, defaults to "english" */
  const char *language;

  /* ExpandToken allows the user to add an expansion of the token in the query, that will be
   * union-merged with the given token in query time. str is the expanded string, len is its length,
   * and flags is a 32 bit flag mask that can be used by the extension to set private information on
   * the token */
  void (*ExpandToken)(struct RSQueryExpanderCtx *ctx, const char *str, size_t len,
                      RSTokenFlags flags);

  /* SetPayload allows the query expander to set GLOBAL payload on the query (not unique per token)
   */
  void (*SetPayload)(struct RSQueryExpanderCtx *ctx, RSPayload payload);

} RSQueryExpanderCtx;
```
### RSToken
RSToken represents a single query token to be expanded, and is defined as:
```c
/* A token in the query. The expanders receive query tokens and can expand the query with more query
 * tokens */
typedef struct {
  /* The token string - which may or may not be NULL terminated */
  const char *str;

  /* The token length */
  size_t len;

  /* 1 if the token is the result of query expansion */
  uint8_t expanded:1;

  /* Extension specific token flags that can be examined later by the scoring function */
  RSTokenFlags flags;
} RSToken;
```
## The Scoring Function API
A scoring function receives each document being evaluated by the query, for final ranking.
It has access to all the query terms that brought up the document, and to metadata about the
document such as its a priori score, length, etc.
Since the scoring function is evaluated for each document, potentially millions of times, and since
redis is single threaded - it is important that it works as fast as possible and is heavily optimized.
A scoring function is applied to each potential result (per document) and is implemented with the following signature:
```c
double MyScoringFunction(RSScoringFunctionCtx *ctx, RSIndexResult *res,
                         RSDocumentMetadata *dmd, double minScore);
```
RSScoringFunctionCtx is a context that implements some helper methods.
RSIndexResult is the result information - containing the document id, frequency, terms and offsets.
RSDocumentMetadata is an object holding global information about the document, such as its a priori score.
minScore is the minimal score that will yield a result relevant to the search. It can be used to stop processing mid-way, or before we even start.
The return value of the function is double representing the final score of the result.
Returning 0 causes the result to be counted, but if there are results with a score greater than 0, they will appear above it.
To completely filter out a result and not count it in the totals, the scorer should return the special value `RS_SCORE_FILTEROUT` (which is internally set to negative infinity).
### RSScoringFunctionCtx
This is an object containing the following members:
* **void *privdata**: a pointer to an object set by the extension on initialization time.
* **RSPayload payload**: A Payload object set either by the query expander or the client.
* **int GetSlop(RSIndexResult *res)**: A callback method that yields the total minimal distance between the query terms. This can be used to prefer results where the "slop" is smaller and the terms are nearer to each other.
### RSIndexResult
This is an object holding the information about the current result in the index, which is an aggregate of all the terms that resulted in the current document being considered a valid result.
See redisearch.h for details
### RSDocumentMetadata
This is an object describing global information, unrelated to the current query, about the document being evaluated by the scoring function.
## Example Query Expander
This example query expander expands each token with the term foo:
```c
#include <redisearch.h> // must be in the include path

void DummyExpander(RSQueryExpanderCtx *ctx, RSToken *token) {
  ctx->ExpandToken(ctx, strdup("foo"), strlen("foo"), 0x1337);
}
```
## Example Scoring Function
This is an actual scoring function, calculating TF-IDF for the document, multiplying that by the document score, and dividing that by the slop:
```c
#include <redisearch.h> // must be in the include path

double TFIDFScorer(RSScoringFunctionCtx *ctx, RSIndexResult *h, RSDocumentMetadata *dmd,
                   double minScore) {
  // no need to evaluate documents with score 0
  if (dmd->score == 0) return 0;

  // calculate sum(tf-idf) for each term in the result
  double tfidf = 0;
  for (int i = 0; i < h->numRecords; i++) {
    // take the term frequency and multiply by the term IDF, add that to the total
    tfidf += (float)h->records[i].freq * (h->records[i].term ? h->records[i].term->idf : 0);
  }
  // normalize by the maximal frequency of any term in the document
  tfidf /= (double)dmd->maxFreq;
  // multiply by the document score (between 0 and 1)
  tfidf *= dmd->score;

  // no need to factor the slop if tfidf is already below minimal score
  if (tfidf < minScore) {
    return 0;
  }

  // get the slop and divide the result by it, making sure we prefer results with closer terms
  tfidf /= (double)ctx->GetSlop(h);

  return tfidf;
}
```
# Highlighting API
The highlighting API allows you to have only the relevant portions of document matching a search query returned as a result.
This allows users to quickly see how a document relates to their query, with the search terms highlighted, usually in bold letters.
RediSearch implements high-performance highlighting and summarization algorithms, with the following API:
### Command Syntax
```
FT.SEARCH ...
SUMMARIZE [FIELDS {num} {field}] [FRAGS {numFrags}] [LEN {fragLen}] [SEPARATOR {sepstr}]
HIGHLIGHT [FIELDS {num} {field}] [TAGS {openTag} {closeTag}]
```
There are two sub-commands used for highlighting. One is `HIGHLIGHT`
which surrounds matching text with an open and/or close tag; and the other is
`SUMMARIZE` which splits a field into contextual fragments surrounding the
found terms. It is possible to summarize a field, highlight a field, or perform
both actions in the same query.
#### Summarization
```
FT.SEARCH ...
SUMMARIZE [FIELDS {num} {field}] [FRAGS {numFrags}] [LEN {fragLen}] [SEPARATOR {sepStr}]
```
Summarization will fragment the text into smaller sized snippets; each snippet will contain the found term(s) and some additional
surrounding context.
RediSearch can perform summarization using the `SUMMARIZE` keyword. If no additional arguments are passed,
all _returned fields_ are summarized using built-in defaults.
The `SUMMARIZE` keyword accepts the following arguments:
* **`FIELDS`**: If present, must be the first argument. This should be followed
by the number of fields to summarize, which itself is followed by a list of
fields. Each field present is summarized. If no `FIELDS` directive is passed,
then *all* fields returned are summarized.
* **`FRAGS`**: How many fragments should be returned. If not specified, a default of 3 is used.
* **`LEN`** The number of context words each fragment should contain. Context
words surround the found term. A higher value will return a larger block of
text.
* **`SEPARATOR`** The string used to divide between individual summary snippets.
The default is `... ` which is common among search engines; but you may
override this with any other string if you desire to programmatically divide them
later on. You may use a newline sequence, as newlines are stripped from the
result body anyway (thus, it will not be conflated with an embedded newline
in the text)
#### Highlighting
```
FT.SEARCH ... HIGHLIGHT [FIELDS {num} {field}] [TAGS {openTag} {closeTag}]
```
Highlighting will highlight the found term (and its variants) with a user-defined
tag. This may be used to display the matched text in a different typeface using
a markup language, or to otherwise make the text appear differently.
RediSearch can perform highlighting using the `HIGHLIGHT` keyword. If no
additional arguments are passed, all _returned fields_ are highlighted using built-in defaults.
The `HIGHLIGHT` keyword accepts the following arguments:
* **`FIELDS`** If present, must be the first argument. This should be followed
by the number of fields to highlight, which itself is followed by a list of
fields. Each field present is highlighted. If no `FIELDS` directive is passed,
then *all* fields returned are highlighted.
* **`TAGS`** If present, must be followed by two strings; the first is prepended
to each term match, and the second is appended to it. If no `TAGS` are
specified, a built-in tag value is appended and prepended.
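As a combined sketch (index and field names here are hypothetical), summarizing and highlighting the same field:
```
FT.SEARCH books "redis" SUMMARIZE FIELDS 1 body FRAGS 2 LEN 20 HIGHLIGHT FIELDS 1 body TAGS "<b>" "</b>"
```
This returns up to two fragments of `body` per result, each about 20 context words long, with matched terms wrapped in `<b>` and `</b>`.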
#### Field Selection
If no specific fields are passed to the `RETURN`, `SUMMARIZE`, or `HIGHLIGHT`
keywords, then all of a document's fields are returned. However, if any of these
keywords contain a `FIELD` directive, then the `SEARCH` command will only return
the sum total of all fields enumerated in any of those directives.
The `RETURN` keyword is treated specially, as it overrides any fields specified
in `SUMMARIZE` or `HIGHLIGHT`.
In the command `RETURN 1 foo SUMMARIZE FIELDS 1 bar HIGHLIGHT FIELDS 1 baz`,
the field `foo` is returned as-is, while `bar` and `baz` are not returned, because
`RETURN` was specified, but did not include those fields.
In the command `SUMMARIZE FIELDS 1 bar HIGHLIGHT FIELDS 1 baz`, `bar` is returned
summarized and `baz` is returned highlighted.
# Search Query Syntax:
We support a simple syntax for complex queries with the following rules:
* Multi-word phrases are simply lists of tokens, e.g. `foo bar baz`, and imply intersection (AND) of the terms.
* Exact phrases are wrapped in quotes, e.g `"hello world"`.
* OR Unions (i.e `word1 OR word2`), are expressed with a pipe (`|`), e.g. `hello|hallo|shalom|hola`.
* NOT negation (i.e. `word1 NOT word2`) of expressions or sub-queries. e.g. `hello -world`. As of version 0.19.3, purely negative queries (i.e. `-foo` or `-@title:(foo|bar)`) are supported.
* Prefix matches (all terms starting with a prefix) are expressed with a `*` following a 3-letter or longer prefix.
* Selection of specific fields using the syntax `@field:hello world`.
* Numeric Range matches on numeric fields with the syntax `@field:[{min} {max}]`.
* Geo radius matches on geo fields with the syntax `@field:[{lon} {lat} {radius} {m|km|mi|ft}]`
* Tag field filters with the syntax `@field:{tag | tag | ...}`. See the full documentation on tag fields.
* Optional terms or clauses: `foo ~bar` means bar is optional but documents with bar in them will rank higher.
* An expression in a query can be wrapped in parentheses to resolve ambiguity, e.g. `(hello|hella) (world|werld)`.
* Combinations of the above can be used together, e.g `hello (world|foo) "bar baz" bbbb`
## Pure Negative Queries
As of version 0.19.3 it is possible to have a query consisting of just a negative expression, e.g. `-hello` or `-(@title:foo|bar)`. The results will be all the documents *NOT* containing the query terms.
**Warning**: Any complex expression can be negated this way, however caution should be taken here: if a negative expression has little or no results, this is equivalent to traversing and ranking all the documents in the index, which can be slow and cause high CPU consumption.
## Field modifiers
As of version 0.12 it is possible to specify field modifiers in the query and not just using the INFIELDS global keyword.
Per query expression or sub-expression, it is possible to specify which fields it matches, by prepending the expression with the `@` symbol, the field name and a `:` (colon) symbol.
If a field modifier precedes multiple words, they are considered to be a phrase with the same modifier.
If a field modifier precedes an expression in parentheses, it applies only to the expression inside the parentheses.
Multiple modifiers can be combined to create complex filtering on several fields. For example, if we have an index of car models, with a vehicle class, country of origin and engine type, we can search for SUVs made in Korea with hybrid or diesel engines - with the following query:
```
FT.SEARCH cars "@country:korea @engine:(diesel|hybrid) @class:suv"
```
Multiple modifiers can be applied to the same term or grouped terms. e.g.:
```
FT.SEARCH idx "@title|body:(hello world) @url|image:mydomain"
```
This will search for documents that have "hello world" either in the body or the title, and the term "mydomain" in their url or image fields.
## Numeric Filters in Query
If a field in the schema is defined as NUMERIC, it is possible to either use the FILTER argument in the redis request, or filter with it by specifying filtering rules in the query. The syntax is `@field:[{min} {max}]` - e.g. `@price:[100 200]`.
### A few notes on numeric predicates:
1. It is possible to specify a numeric predicate as the entire query, whereas it is impossible to do it with the FILTER argument.
2. It is possible to intersect or union multiple numeric filters in the same query, be it for the same field or different ones.
3. `-inf`, `inf` and `+inf` are acceptable numbers in range. Thus greater-than 100 is expressed as `[(100 inf]`.
4. Numeric filters are inclusive. Exclusive min or max are expressed with `(` prepended to the number, e.g. `[(100 (200]`.
5. It is possible to negate a numeric filter by prepending a `-` sign to the filter, e.g. returning a result where price differs from 100 is expressed as: `@title:foo -@price:[100 100]`.
## Tag Filters
RediSearch (starting with version 0.91) allows a special field type called "tag field", with simpler tokenization and encoding in the index. The values in these fields cannot be accessed by general field-less search, and can be used only with a special syntax:
```
@field:{ tag | tag | ...}
e.g.
@cities:{ New York | Los Angeles | Barcelona }
```
Tags can have multiple words, or include other punctuation marks other than the field's separator (`,` by default). Punctuation marks in tags should be escaped with a backslash (`\`). It is also recommended (but not mandatory) to escape spaces; The reason is that if a multi-word tag includes stopwords, it will create a syntax error. So tags like "to be or not to be" should be escaped as "to\ be\ or\ not\ to\ be". For good measure, you can escape all spaces within tags.
Notice that multiple tags in the same clause create a union of documents containing any of the tags. To create an intersection of documents containing *all* tags, you should repeat the tag filter several times, e.g.:
```
# This will return all documents containing all three cities as tags:
@cities:{ New York } @cities:{Los Angeles} @cities:{ Barcelona }
# This will return all documents containing either city:
@cities:{ New York | Los Angeles | Barcelona }
```
Tag clauses can be combined into any sub clause, used as negative expressions, optional expressions, etc.
## Geo Filters in Query
As of version 0.21, it is possible to add geo radius queries directly into the query language with the syntax `@field:[{lon} {lat} {radius} {m|km|mi|ft}]`. This filters the result to a given radius from a lon,lat point, defined in meters, kilometers, miles or feet. See Redis' own GEORADIUS command for more details (internally we use GEORADIUS for that).
Radius filters can be added into the query just like numeric filters. For example, in a database of businesses, looking for Chinese restaurants near San Francisco (within a 5km radius) would be expressed as: `chinese restaurant @location:[-122.41 37.77 5 km]`.
## Prefix Matching
On index updating, we maintain a dictionary of all terms in the index. This can be used to match all terms starting with a given prefix. Selecting prefix matches is done by appending `*` to a prefix token. For example:
```
hel* world
```
Will be expanded to cover `(hello|help|helm|...) world`.
### A few notes on prefix searches:
1. As prefixes can be expanded into very many terms, use them with caution. There is no magic going on; the expansion will create a Union operation of all matching suffixes.
2. As a protective measure to avoid selecting too many terms, and block redis, which is single threaded, there are two limitations on prefix matching:
* Prefixes are limited to 3 letters or more.
* Expansion is limited to 200 terms or less.
3. Prefix matching fully supports unicode and is case insensitive.
4. Currently there is no sorting or bias based on suffix popularity, but this is on the near-term roadmap.
## A Few Query Examples
* Simple phrase query - hello AND world
hello world
* Exact phrase query - **hello** FOLLOWED BY **world**
"hello world"
* Union: documents containing either **hello** OR **world**
hello|world
* Not: documents containing **hello** but not **world**
hello -world
* Intersection of unions
(hello|halo) (world|werld)
* Negation of union
hello -(world|werld)
* Union inside phrase
(barack|barrack) obama
* Optional terms with higher priority to ones containing more matches:
obama ~barack ~michelle
* Exact phrase in one field, one word in another field:
@title:"barack obama" @job:president
* Combined AND, OR with field specifiers:
@title:hello world @body:(foo bar) @category:(articles|biographies)
* Prefix Queries:
hello worl*
hel* worl*
hello -worl*
* Numeric Filtering - products named "tv" with a price range of 200-500:
@name:tv @price:[200 500]
* Numeric Filtering - users with age greater than 18:
@age:[(18 +inf]
## Mapping Common SQL Predicates to RediSearch
| SQL Condition | RediSearch Equivalent | Comments |
|---------------|-----------------------|----------|
| WHERE x='foo' AND y='bar' | @x:foo @y:bar | for less ambiguity use (@x:foo) (@y:bar) |
| WHERE x='foo' AND y!='bar' | @x:foo -@y:bar |
| WHERE x='foo' OR y='bar' | (@x:foo)\|(@y:bar) |
| WHERE x IN ('foo', 'bar','hello world') | @x:(foo\|bar\|"hello world") | quotes mean exact phrase |
| WHERE y='foo' AND x NOT IN ('foo','bar') | @y:foo (-@x:foo) (-@x:bar) |
| WHERE x NOT IN ('foo','bar') | -@x:(foo\|bar) |
| WHERE num BETWEEN 10 AND 20 | @num:[10 20] |
| WHERE num >= 10 | @num:[10 +inf] |
| WHERE num > 10 | @num:[(10 +inf] |
| WHERE num < 10 | @num:[-inf (10] |
| WHERE num <= 10 | @num:[-inf 10] |
| WHERE num < 10 OR num > 20 | @num:[-inf (10] \| @num:[(20 +inf] |
| WHERE name LIKE 'john%' | @name:john* |
## Technical Note
The query parser is built using the Lemon Parser Generator and a Ragel based lexer. You can see the grammar definition [at the git repo.](https://github.com/RedisLabsModules/RediSearch/blob/master/src/query_parser/parser.y)
# Quick Start Guide for RediSearch:
## Running with Docker
```sh
docker run -p 6379:6379 redislabs/redisearch:latest
```
## Building and running from source:
```sh
git clone https://github.com/RedisLabsModules/RediSearch.git
cd RediSearch/src
make all
# Assuming you have a redis build from the unstable branch:
/path/to/redis-server --loadmodule ./redisearch.so
```
## Creating an index with fields and weights (default weight is 1.0):
```
127.0.0.1:6379> FT.CREATE myIdx SCHEMA title TEXT WEIGHT 5.0 body TEXT url TEXT
OK
```
## Adding documents to the index:
```
127.0.0.1:6379> FT.ADD myIdx doc1 1.0 FIELDS title "hello world" body "lorem ipsum" url "http://redis.io"
OK
```
## Searching the index:
```
127.0.0.1:6379> FT.SEARCH myIdx "hello world" LIMIT 0 10
1) (integer) 1
2) "doc1"
3) 1) "title"
2) "hello world"
3) "body"
4) "lorem ipsum"
5) "url"
6) "http://redis.io"
```
> **NOTE**: Input is expected to be valid utf-8 or ascii. The engine cannot handle wide character unicode at the moment.
## Dropping the index:
```
127.0.0.1:6379> FT.DROP myIdx
OK
```
## Adding and getting Auto-complete suggestions:
```
127.0.0.1:6379> FT.SUGADD autocomplete "hello world" 100
OK
127.0.0.1:6379> FT.SUGGET autocomplete "he"
1) "hello world"
```
# Scoring In RediSearch
RediSearch comes with a few very basic scoring functions to evaluate document relevance. They are all based on document scores and term frequency. This is regardless of the ability to use [sortable fields](/Sorting/). Scoring functions are specified by adding the `SCORER {scorer_name}` argument to a search query.
If you prefer a custom scoring function, it is possible to add more functions using the [Extension API](/Extensions).
These are the pre-bundled scoring functions available in RediSearch and how they work. Each function is listed by its registered name, which can be passed as a SCORER argument in FT.SEARCH.
## TFIDF (Default)
Basic [TF-IDF scoring](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) with a few extra features thrown inside:
1. For each term in each result we calculate the TF-IDF score of that term to that document. Frequencies are weighted based on field weights that are pre-determined, and each term's frequency is **normalized by the highest term frequency in each document**.
2. We multiply the total TF-IDF for the query term by the a priori document score given on `FT.ADD`.
3. We give a penalty to each result based on "slop" or cumulative distance between the search terms: exact matches will get no penalty, but matches where the search terms are distant see their score reduced significantly. For each 2-gram of consecutive terms, we find the minimal distance between them. The penalty is the inverse of the square root of the sum of the squared distances: `1/sqrt(d(t2-t1)^2 + d(t3-t2)^2 + ...)`.
So for N terms in a document D, `T1...Tn`, the resulting score could be described with this python function:
```py
def get_score(terms, doc):
    # the sum of tf-idf
    score = 0

    # the distance penalty for all terms
    dist_penalty = 0

    for i, term in enumerate(terms):
        # tf normalized by maximum frequency
        tf = doc.freq(term) / doc.max_freq
        # idf is global for the index, and not calculated each time in real life
        idf = log2(1 + total_docs / docs_with_term(term))

        score += tf*idf

        # sum up the distance penalty
        if i > 0:
            dist_penalty += min_distance(term, terms[i-1])**2

    # multiply the score by the document score
    score *= doc.score

    # divide the score by the root of the cumulative distance
    if len(terms) > 1:
        score /= sqrt(dist_penalty)

    return score
```
## TFIDF.DOCNORM
Identical to the default TFIDF scorer, with one important distinction:
Term frequencies are normalized by the length of the document (in number of terms). The length is weighted, so that if a document contains two terms, one in a field that has a weight of 1 and one in a field with a weight of 5, the total frequency is 6, not 2.
```
FT.SEARCH myIndex "foo" SCORER TFIDF.DOCNORM
```
## BM25
A variation on the basic TF-IDF scorer; see [this Wikipedia article for more info](https://en.wikipedia.org/wiki/Okapi_BM25).
We also multiply the relevance score for each document by the a priori document score, and apply a penalty based on slop as in TFIDF.
```
FT.SEARCH myIndex "foo" SCORER BM25
```
## DISMAX
A simple scorer that sums up the frequencies of the matched terms; in the case of union clauses, it will give the maximum value of those matches. No other penalties or factors are applied.
It is not a 1 to 1 implementation of [Solr's DISMAX algorithm](https://wiki.apache.org/solr/DisMax), but follows it in broad terms.
```
FT.SEARCH myIndex "foo" SCORER DISMAX
```
## DOCSCORE
A scoring function that just returns the a priori score of the document without applying any calculations to it. Since document scores can be updated, this can be useful if you'd like to use an external score and nothing further.
```
FT.SEARCH myIndex "foo" SCORER DOCSCORE
```
# Sorting By Indexed Fields
As of RediSearch 0.15, it is possible to bypass the scoring function mechanism, and order search results by the value of different document properties (fields) directly - even if the sorting field is not used by the query. For example, you can search for first name and sort by last name.
## Declaring Sortable Fields
When creating the index with FT.CREATE, you can declare `TEXT` and `NUMERIC` properties to be `SORTABLE`. When a property is sortable, we can later decide to order the results by its values. For example, in the following schema:
```
> FT.CREATE users SCHEMA first_name TEXT last_name TEXT SORTABLE age NUMERIC SORTABLE
```
The fields `last_name` and `age` are sortable, but `first_name` isn't. This means we can search by either first and/or last name, and sort by last name or age.
### Note on sortable TEXT fields
In the current implementation, when declaring a sortable field, its content gets copied into a special location in the index, for fast access on sorting. This means that making long text fields sortable is very expensive, and you should be careful with it.
Also note that text fields get normalized and lowercased in a unicode-safe way when stored for sorting, and currently there is no way to change this behavior. This means that `America` and `america` are considered equal in terms of sorting.
## Specifying SORTBY
If an index includes sortable fields, you can add the `SORTBY` parameter to the search request (outside the query body), and order the results by it. This overrides the scoring function mechanism, and the two cannot be combined. If `WITHSCORES` is specified along with `SORTBY`, the scores returned are simply the relative position of each result in the result set.
The syntax for SORTBY is:
```
SORTBY {field_name} [ASC|DESC]
```
* field_name must be a sortable field defined in the schema.
* ASC means the order will be ascending, DESC that it will be descending.
* The default ordering is ASC if not specified otherwise.
## Quick Example
```
> FT.CREATE users SCHEMA first_name TEXT SORTABLE last_name TEXT age NUMERIC SORTABLE
# Add some users
> FT.ADD users user1 1.0 FIELDS first_name "alice" last_name "jones" age 35
> FT.ADD users user2 1.0 FIELDS first_name "bob" last_name "jones" age 36
# Searching while sorting
# Searching by last name and sorting by first name
> FT.SEARCH users "@last_name:jones" SORTBY first_name DESC
# Searching by both first and last name, and sorting by age
> FT.SEARCH users "alice jones" SORTBY age ASC
```
# Stemming Support
RediSearch supports stemming - that is, adding the base form of a word to the index. This allows
the query for "going" to also return results for "go" and "gone", for example.
The current stemming support is based on the Snowball stemmer library, which supports most European
languages, as well as Arabic and others. We hope to include more languages soon (if you need support for a specific
language, please open an issue).
For further details see the [Snowball Stemmer website](http://snowballstem.org/).
## Supported languages:
The following languages are supported, and can be passed to the engine
when indexing or querying, with lowercase letters (see the example after the list):
* arabic
* danish
* dutch
* english
* finnish
* french
* german
* hungarian
* italian
* norwegian
* portuguese
* romanian
* russian
* spanish
* swedish
* tamil
* turkish
* chinese (see below)
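For example, a hypothetical Italian-language index, passing `LANGUAGE` at both indexing and query time (the index, document and field names here are made up for illustration):
```
> FT.CREATE libri SCHEMA titolo TEXT
> FT.ADD libri doc1 1.0 LANGUAGE italian FIELDS titolo "mangiare bene"
> FT.SEARCH libri "mangiare" LANGUAGE italian
```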
## Chinese support
Indexing a Chinese document is different than indexing a document in most other
languages because of how tokens are extracted. While most languages can have
their tokens distinguished by separation characters and whitespace, this
is not common in Chinese.
Chinese tokenization is done by scanning the input text and checking every
character or sequence of characters against a dictionary of predefined terms
and determining the most likely (based on the surrounding terms and characters)
match.
RediSearch makes use of the [Friso](https://github.com/lionsoul2014/friso)
Chinese tokenization library for this purpose. This is largely transparent to
the user, and often no additional configuration is required.
## Using custom dictionaries
If you wish to use a custom dictionary, you can do so at the module level when
loading the module. The `FRISOINI` setting can point to the location of a
`friso.ini` file which contains the relevant settings and paths to the dictionary
files.
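For example (the file path here is just an illustration):
```
redis-server --loadmodule redisearch.so FRISOINI /usr/share/friso/friso.ini
```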
Note that there is no "default" friso.ini file location. RediSearch comes with
its own `friso.ini` and dictionary files which are compiled into the module
binary at build-time.
# Stop-Words
RediSearch has a pre-defined default list of [stop-words](https://en.wikipedia.org/wiki/Stop_words). These are words that are usually so common that they do not add much information to search, but take up a lot of space and CPU time in the index.
When indexing, stop-words are discarded and not indexed. When searching, they are also ignored and treated as if they were not sent to the query processor. This is done when parsing the query.
At the moment, the default stop-word list applies to all full-text indexes in all languages, and can be overridden manually at index creation time.
## Default Stop-Word List
The following words are treated as stop-words by default:
```
a, is, the, an, and, are, as, at, be, but, by, for,
if, in, into, it, no, not, of, on, or, such, that, their,
then, there, these, they, this, to, was, will, with
```
## Overriding The Default Stop-Words
Stop-words for an index can be defined (or disabled completely) on index creation using the `STOPWORDS` argument in the [FT.CREATE](/Commands/#ftcreate) command.
The format is `STOPWORDS {number} {stopword} ...` where number is the number of stopwords given. The `STOPWORDS` argument must come before the `SCHEMA` argument. For example:
```
FT.CREATE myIndex STOPWORDS 3 foo bar baz SCHEMA title TEXT body TEXT
```
## Disabling Stop-Words Completely
Disabling stopwords completely can be done by passing `STOPWORDS 0` on `FT.CREATE`.
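For example:
```
FT.CREATE myIndex STOPWORDS 0 SCHEMA title TEXT body TEXT
```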
## Avoiding Stop-Word Detection In Search Queries
In rare use cases, where queries are very long and are guaranteed by the client application to not contain stopwords, it is possible to avoid checking for them when parsing the query. This saves some CPU time, and is only worth it if the query has dozens or more terms in it. Using this without verifying that the query doesn't contain stop-words might result in empty queries.
# Tag Fields
RediSearch 0.91 adds a new kind of field - the Tag field. Tag fields are similar to full-text fields, but use simpler tokenization and encoding in the index. The values in these fields cannot be accessed by general field-less search, and can be used only with a special syntax.
The main differences between tag fields and full-text fields are:
1. An entire tag field index resides in a single Redis key, unlike full-text indexes, which keep a key per term.
2. We do not perform stemming on tag indexes.
3. The tokenization is simpler: The user can determine a separator (defaults to a comma) for multiple tags,
and we only do whitespace trimming at the end of tags.
Thus, tags can contain spaces, punctuation marks, accents, etc. The only two transformations
we perform are lower-casing (for latin languages only as of now), and whitespace trimming.
4. Tags cannot be found from a general full-text search. If a document has a field called
"tags" with the values "foo" and "bar", searching for foo or bar without a special tag
modifier (see below) will not return this document.
5. The index is much simpler and more compressed: we do not store frequencies, offset vectors, or
field flags. The index contains only document IDs, encoded as deltas. This means that an entry in a tag index
is usually one or two bytes long (see the illustration after this list), which makes tag indexes very memory-efficient and fast.
6. An unlimited number of tag fields can be created per index, as long as the overall number of fields is under 1024.
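As a simplified illustration of the delta encoding mentioned in point 5 (the actual byte-level layout is internal to the engine):
```
document IDs in a tag's entry:  150, 153, 160, 161
stored in the index as deltas:  150, 3, 7, 1      (each small delta fits in 1-2 bytes)
```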
## Creating A Tag Field
Tag fields can be added to the schema in FT.CREATE with the following syntax:
```
FT.CREATE ... SCHEMA ... {field_name} TAG [SEPARATOR {sep}]
```
SEPARATOR defaults to a comma (`,`), and can be any printable ASCII character. For example:
```
FT.CREATE idx SCHEMA tags TAG SEPARATOR ";"
```
## Querying Tag Fields
As mentioned above, just searching for a tag without any modifiers will not retrieve documents
containing it.
The syntax for matching tags in a query is as follows (the curly braces are part of the syntax in
this case):
```
@<field_name>:{ <tag> | <tag> | ...}
```
e.g.
```
@tags:{hello world | foo bar}
```
Tag clauses can be combined into any sub clause, used as negative expressions, optional expressions, etc. For example:
```
FT.SEARCH idx "@title:hello @price:[0 100] @tags:{ foo bar | hello world }"
```
## Multiple Tags In A Single Filter
Notice that multiple tags in the same clause create a union of documents containing any of the tags. To create an intersection of documents containing *all* of the tags, you should repeat the tag filter several times.
For example, imagine an index of travellers, with a tag field for the cities each traveller has visited:
```
FT.CREATE myIndex SCHEMA name TEXT cities TAG
FT.ADD myIndex user1 1.0 FIELDS name "John Doe" cities "New York, Barcelona, San Francisco"
```
For this index, the following query will return all the people who visited **at least one** of the following cities:
```
FT.SEARCH myIndex "@cities:{ New York | Los Angeles | Barcelona }"
```
But the next query will return all people who have visited **all three cities**:
```
@cities:{ New York } @cities:{Los Angeles} @cities:{ Barcelona }
```
## Multi-Word Tags And Escaping
Tags can be composed of multiple words, or include punctuation marks other than the field's separator (`,` by default). Punctuation marks in tags should be escaped with a backslash (`\`).
It is also recommended (but not mandatory) to escape spaces; the reason is that if a multi-word tag includes stopwords, it will create a syntax error, so tags like "to be or not to be" should be escaped as "to\ be\ or\ not\ to\ be". For good measure, you can escape all spaces within tags.
The following are identical:
```
@tags:{foo\ bar\ baz | hello\ world}
@tags:{foo bar baz | hello world }
```
# Multi-Threading in RediSearch
*By Dvir Volk, July 2017*
## 1. One Thread To Rule Them All
Redis has been, from its inception, single threaded - and will remain so at least in 4.0. I'm not going to get into the reasons of why that is - but up until now it has been a reality that Redis apps, and more recently Redis Modules such as RediSearch - have to deal with.
While keeping things single-threaded makes Redis simple and fast - the down-side is that long running commands block the entire server for the duration of the query's execution. Most Redis commands are fast so that is not a problem, but commands like [ZUNIONSTORE](https://Redis.io/commands/zunionstore), [LRANGE](https://Redis.io/commands/lrange), [SINTER](https://Redis.io/commands/sinter) and of course the infamous [KEYS](https://Redis.io/commands/keys), can block Redis for seconds or minutes, depending on the size of data they are handling.
## 2. RediSearch and the Single Thread Issue
[RediSearch](https://Redisearch.io) is a new search engine module written at Redis Labs. It leverages Redis' powerful infrastructure with efficient data structures to create a fast, feature-rich, real-time search engine.
While it is extremely fast and uses highly optimized data structures and algorithms, it was facing the same problem with regards to concurrency: depending on the size of your data-set and the cardinality of search queries, they can internally take anywhere from a few microseconds to hundreds of milliseconds, or even seconds in extreme cases. And when that happens - the entire Redis server that the engine is running on - is blocked.
Think, for example, of a full-text query intersecting the terms "hello" and "world", each with, let's say, a million entries, and half a million common intersection points. To do that in a millisecond, you would have to scan, intersect and rank each result in one nanosecond, [which is impossible with current hardware](https://gist.github.com/jboner/2841832). The same goes for indexing a 1000 word document. It blocks Redis entirely for that duration.
So we have to take into account that, in the real world, search queries may not behave like your average Redis O(1) command, and may block the entire server for long periods of time. Of course, you could and should split your search index into a cluster, and a cluster version of RediSearch will soon be available as part of Redis Labs Enterprise cluster - but even if we distribute the data across cluster nodes, some queries will be slow.
## 3. Enter the Redis GIL
Luckily, Redis BDFL [Salvatore Sanfilippo](https://twitter.com/antirez) has added a revolutionary change just near the finish line of Redis 4.0 and the release of the modules API - **Thread Safe Contexts** and the **Global Lock**.
The idea is simple - while Redis in itself still remains single threaded, a module can run many threads - and any one of them can acquire the **Global Lock** when it needs to access Redis data, operate on it, and release it.
We still cannot really query Redis in parallel - only one thread can acquire the lock, including the Redis main thread - but we can make sure that a long running query will give other queries time to properly run by yielding this lock from time to time. Note that this limitation applies to our use case only - in other use cases, such as training machine learning models, actual parallel processing in the background is achievable and easy.
## 4. Making Search Concurrent
Up until now, the flow of a search query was simple - the query would arrive at a **Command Handler** callback in the Redis Module, and it would be the only thing running inside Redis at that moment. Then it would parse the query, execute it - taking as long as it takes - and return the result.
To allow concurrency, we adapted the following design:
1. RediSearch has a thread pool for running concurrent search queries.
2. When a search request arrives, it gets to the handler, gets parsed on the main thread, and a request object is passed to the thread pool via a queue.
3. The thread pool runs a query processing function in its own thread.
4. The function locks the Redis Global lock, and starts executing the query.
5. Since the search execution is basically an iterator running in a cycle, we simply sample the elapsed time every several iterations (sampling on each iteration would slow things down as it has a cost of its own).
6. If enough time has elapsed, the query processor releases the Global Lock, and immediately tries to acquire it again. When the lock is released, the kernel will schedule another thread to run - be it Redis' main thread, or another query thread.
7. When the lock is acquired again - we reopen all Redis resources we were holding before releasing the lock (keys might have been deleted while the thread has been "sleeping"), and continue work from the previous state.
Thus the operating system's scheduler makes sure all query threads get CPU time to run. While one is running the rest wait idly, but since execution is yielded about 5,000 times a second, it creates the effect of concurrency. Fast queries will finish in one go without yielding execution; slow ones will take many iterations to finish, but will allow other queries to run concurrently.
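A minimal sketch of this run/yield loop, in C, as a module query thread might implement it. `RedisModule_ThreadSafeContextLock`/`Unlock` are real Modules API calls; the `Query` type, `Query_NextResult`, `Query_ReopenKeys` and the timing constants are hypothetical stand-ins for RediSearch internals, not the engine's actual code:
```c
#include <time.h>
#include "redismodule.h"

typedef struct Query Query;      /* hypothetical query iterator state */
int Query_NextResult(Query *q);  /* hypothetical: returns 0 when exhausted */
void Query_ReopenKeys(Query *q); /* hypothetical: reopen previously held keys */

#define SAMPLE_EVERY_N_ITERATIONS 100
#define YIELD_AFTER_NS 200000LL /* ~200us: roughly 5,000 yields per second */

void RunQueryConcurrently(RedisModuleCtx *ctx, Query *q) {
  struct timespec start, now;
  clock_gettime(CLOCK_MONOTONIC, &start);
  RedisModule_ThreadSafeContextLock(ctx);
  for (long long iter = 1; Query_NextResult(q); iter++) {
    /* Sampling the clock on every iteration has a cost of its own,
     * so we only check every few iterations. */
    if (iter % SAMPLE_EVERY_N_ITERATIONS != 0) continue;
    clock_gettime(CLOCK_MONOTONIC, &now);
    long long elapsedNs = (now.tv_sec - start.tv_sec) * 1000000000LL +
                          (now.tv_nsec - start.tv_nsec);
    if (elapsedNs >= YIELD_AFTER_NS) {
      /* Yield: let the kernel schedule another thread (possibly Redis'
       * main thread), then immediately try to retake the Global Lock. */
      RedisModule_ThreadSafeContextUnlock(ctx);
      RedisModule_ThreadSafeContextLock(ctx);
      /* Keys may have been deleted while we were "sleeping", so we
       * reopen all Redis resources held before yielding. */
      Query_ReopenKeys(q);
      start = now;
    }
  }
  RedisModule_ThreadSafeContextUnlock(ctx);
}
```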
> ### Figure 1: Serial vs. Concurrent Search
> 
>
> **On the left-hand side, all queries are handled one after the other. On the right side, each query is given its time-slice to run. Notice that while the total time for all queries remains the same, queries 3 and 4 finish much faster.**
The same approach is applied to indexing. If a document is big and tokenizing and indexing it will block Redis for a long time - we break that into many smaller iterations and allow Redis to do other things instead of blocking for a very long time. In fact, in the case of indexing there is enough work to be done in parallel using multiple cores - namely tokenizing and normalizing the document. This is especially effective for very big documents.
As a side note - this could have been implemented with a single thread switching between all the query execution loops, but the code refactoring required for that was much larger, and the effect with reasonable load would have remained similar, so we opted to keep this for a future release.
## 5. The Effect of Concurrency
While this is not magic - if all your queries are slow they will remain slow, and no real parallel processing is done here - this is revolutionary in Redis terms. Think about the old problem of running `KEYS *` in a busy Redis instance. In single threaded operation, this will cause the instance to hang for seconds if not minutes. Now it is possible to implement a concurrent version of KEYS in a module that will hardly affect performance. In fact, Salvatore has already implemented one!
There is, however, a negative effect as well: we sacrifice atomicity of reads and writes for concurrency to a degree. Consider the following situation: One thread is processing a query that should retrieve document A, then yields the execution context; At the same time, another thread deletes or changes document A. The result - the query run by the first thread will not be able to retrieve the document, as it has already been changed or deleted while the thread was "asleep".
This is of course only relevant to high update/delete loads, and relatively slow and complex queries. In our view, for most use cases this is a sacrifice worth making, and usually query processing is fast enough that the probability of this happening is very low. However, this can be overcome easily: if strong atomicity of the operations is important, it is possible to have RediSearch operate in "safe mode", making all searches and updates atomic, thus making sure that each query refers to the state of the index at the moment of its invocation.
To enable safe mode and disable query concurrency, you can configure RediSearch at load time: `redis-server --loadmodule redisearch.so SAFEMODE` in command line, or by adding `loadmodule redisearch.so SAFEMODE` to your redis.conf - depending on how you load the module.
## 6. Some Numbers!
I've benchmarked both versions of the module - the simple single-threaded one and the concurrent multi-threaded one - over the same setup.
!!! note "Benchmark Setup"
* The data-set consists of about 1,000,000 Reddit comments.
* Two clients using Redis-benchmark were running - first separately, then in parallel:
* One client doing a very intensive query - "i", which has 200,000 results - with 5 concurrent connections.
* One client is doing a very light query - "Obama", which has about 500 results - with 10 concurrent connections (we assume in a normal situation there will be more lightweight queries than heavy queries).
* Both clients and the server running on my personal laptop - MacBook Pro with an Intel Quad Core i7 @ 2.2Ghz.
### The Results:


!!! note
While we can see that light queries are significantly slower when running in concurrent mode without contention, they are still very fast. But under contention, we see that lightweight queries run about 40 times faster in concurrent mode, since they are not blocked by the slow queries as they are in single-threaded mode, where we are only as fast as the slowest queries.
## 7. Parting Words
This little Global Lock feature and Thread Safe Contexts, is perhaps the most powerful thing that the Modules API offers. We touched only the problem of concurrency here, but it also enables background tasks, real parallel processing of data that does not touch the Redis keyspace, and more.
For RediSearch, it makes the difference between being a nice engine for small-ish use cases and being a real beast that can handle huge data-sets at high loads. Combined with the up-and-coming distributed version of RediSearch (which also leverages the threading API, but that's a story for another post), it will make RediSearch a very powerful search and indexing engine.
FT.CREATE:
  summary: Create an index with the given name and schema
  complexity: O(1)
  arguments:
    -
      comment: the index name
      name: index_name
      type: key
    -
      name:
        - field
        - score | NUMERIC
      type:
        - string
        - double | enum
        - enum:
            - NUMERIC
      comment:
        pairs of field name and relative weight in scoring.
        The weight is a double, but does not need to be normalized.
  since: 0.1
  returns:
    type: status
    value: OK on success, error otherwise
FT.ADD:
  complexity: O(1)
  arguments:
    -
      comment: the index name
      name: index_name
      type: key
    -
      name: docId
      type: string
    -
      name: score
      type: double
      comment: The document's score, between 0.0 and 1.0
    -
      command: LANGUAGE
      name:
        - lang
      type:
        - string
      optional: true
    -
      name: nosave
      type: enum
      enum: [NOSAVE]
  # overall syntax: FT.ADD index docId score [LANGUAGE lang] [NOSAVE] FIELDS ....