neighbor-0.2.3/0000755000175100017510000000000014452262121013472 5ustar vivekdebvivekdebneighbor-0.2.3/neighbor.gemspec0000644000175100017510000000302614452262121016635 0ustar vivekdebvivekdeb######################################################### # This file has been automatically generated by gem2tgz # ######################################################### # -*- encoding: utf-8 -*- # stub: neighbor 0.2.3 ruby lib Gem::Specification.new do |s| s.name = "neighbor".freeze s.version = "0.2.3" s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version= s.require_paths = ["lib".freeze] s.authors = ["Andrew Kane".freeze] s.date = "2023-04-03" s.email = "andrew@ankane.org".freeze s.files = ["CHANGELOG.md".freeze, "LICENSE.txt".freeze, "README.md".freeze, "lib/generators/neighbor/cube_generator.rb".freeze, "lib/generators/neighbor/templates/cube.rb.tt".freeze, "lib/generators/neighbor/templates/vector.rb.tt".freeze, "lib/generators/neighbor/vector_generator.rb".freeze, "lib/neighbor.rb".freeze, "lib/neighbor/model.rb".freeze, "lib/neighbor/railtie.rb".freeze, "lib/neighbor/vector.rb".freeze, "lib/neighbor/version.rb".freeze] s.homepage = "https://github.com/ankane/neighbor".freeze s.licenses = ["MIT".freeze] s.required_ruby_version = Gem::Requirement.new(">= 2.6".freeze) s.rubygems_version = "3.3.15".freeze s.summary = "Nearest neighbor search for Rails and Postgres".freeze if s.respond_to? :specification_version then s.specification_version = 4 end if s.respond_to? :add_runtime_dependency then s.add_runtime_dependency(%q.freeze, [">= 5.2"]) else s.add_dependency(%q.freeze, [">= 5.2"]) end end neighbor-0.2.3/lib/0000755000175100017510000000000014452262121014240 5ustar vivekdebvivekdebneighbor-0.2.3/lib/neighbor/0000755000175100017510000000000014452262121016035 5ustar vivekdebvivekdebneighbor-0.2.3/lib/neighbor/version.rb0000644000175100017510000000005014452262121020042 0ustar vivekdebvivekdebmodule Neighbor VERSION = "0.2.3" end neighbor-0.2.3/lib/neighbor/vector.rb0000644000175100017510000000345114452262121017667 0ustar vivekdebvivekdebmodule Neighbor class Vector < ActiveRecord::Type::Value def initialize(dimensions:, normalize:, model:, attribute_name:) super() @dimensions = dimensions @normalize = normalize @model = model @attribute_name = attribute_name end def self.cast(value, dimensions:, normalize:, column_info:) value = value.to_a.map(&:to_f) dimensions ||= column_info[:dimensions] raise Error, "Expected #{dimensions} dimensions, not #{value.size}" if dimensions && value.size != dimensions raise Error, "Values must be finite" unless value.all?(&:finite?) if normalize norm = Math.sqrt(value.sum { |v| v * v }) # store zero vector as all zeros # since NaN makes the distance always 0 # could also throw error # safe to update in-place since earlier map dups value.map! { |v| v / norm } if norm > 0 end value end def self.column_info(model, attribute_name) attribute_name = attribute_name.to_s column = model.columns.detect { |c| c.name == attribute_name } { type: column.try(:type), dimensions: column.try(:limit) } end # need to be careful to avoid loading column info before needed def column_info @column_info ||= self.class.column_info(@model, @attribute_name) end def cast(value) self.class.cast(value, dimensions: @dimensions, normalize: @normalize, column_info: column_info) unless value.nil? end def serialize(value) unless value.nil? if column_info[:type] == :vector "[#{cast(value).join(", ")}]" else "(#{cast(value).join(", ")})" end end end def deserialize(value) value[1..-1].split(",").map(&:to_f) unless value.nil? end end end neighbor-0.2.3/lib/neighbor/railtie.rb0000644000175100017510000000075214452262121020017 0ustar vivekdebvivekdebmodule Neighbor class Railtie < Rails::Railtie generators do # rails generate model Item embedding:vector{3} if defined?(Rails::Generators::GeneratedAttribute) Rails::Generators::GeneratedAttribute.singleton_class.prepend(Neighbor::GeneratedAttribute) end end end module GeneratedAttribute def parse_type_and_options(type, *, **) if type =~ /\A(vector)\{(\d+)\}\z/ return $1, limit: $2.to_i end super end end end neighbor-0.2.3/lib/neighbor/model.rb0000644000175100017510000001102114452262121017455 0ustar vivekdebvivekdebmodule Neighbor module Model def has_neighbors(attribute_name = :neighbor_vector, dimensions: nil, normalize: nil) attribute_name = attribute_name.to_sym class_eval do @neighbor_attributes ||= {} if @neighbor_attributes.empty? def self.neighbor_attributes parent_attributes = if superclass.respond_to?(:neighbor_attributes) superclass.neighbor_attributes else {} end parent_attributes.merge(@neighbor_attributes || {}) end end raise Error, "has_neighbors already called for #{attribute_name.inspect}" if neighbor_attributes[attribute_name] @neighbor_attributes[attribute_name] = {dimensions: dimensions, normalize: normalize} attribute attribute_name, Neighbor::Vector.new(dimensions: dimensions, normalize: normalize, model: self, attribute_name: attribute_name) return if @neighbor_attributes.size != 1 scope :nearest_neighbors, ->(attribute_name, vector = nil, distance:) { if vector.nil? && !attribute_name.nil? && attribute_name.respond_to?(:to_a) vector = attribute_name attribute_name = :neighbor_vector end attribute_name = attribute_name.to_sym options = neighbor_attributes[attribute_name] raise ArgumentError, "Invalid attribute" unless options normalize = options[:normalize] dimensions = options[:dimensions] return none if vector.nil? distance = distance.to_s quoted_attribute = "#{connection.quote_table_name(table_name)}.#{connection.quote_column_name(attribute_name)}" column_info = klass.type_for_attribute(attribute_name).column_info operator = if column_info[:type] == :vector case distance when "inner_product" "<#>" when "cosine" "<=>" when "euclidean" "<->" end else case distance when "taxicab" "<#>" when "chebyshev" "<=>" when "euclidean", "cosine" "<->" end end raise ArgumentError, "Invalid distance: #{distance}" unless operator # ensure normalize set (can be true or false) if distance == "cosine" && column_info[:type] == :cube && normalize.nil? raise Neighbor::Error, "Set normalize for cosine distance with cube" end vector = Neighbor::Vector.cast(vector, dimensions: dimensions, normalize: normalize, column_info: column_info) # important! neighbor_vector should already be typecast # but use to_f as extra safeguard against SQL injection query = if column_info[:type] == :vector connection.quote("[#{vector.map(&:to_f).join(", ")}]") else "cube(array[#{vector.map(&:to_f).join(", ")}])" end order = "#{quoted_attribute} #{operator} #{query}" # https://stats.stackexchange.com/questions/146221/is-cosine-similarity-identical-to-l2-normalized-euclidean-distance # with normalized vectors: # cosine similarity = 1 - (euclidean distance)**2 / 2 # cosine distance = 1 - cosine similarity # this transformation doesn't change the order, so only needed for select neighbor_distance = if column_info[:type] != :vector && distance == "cosine" "POWER(#{order}, 2) / 2.0" elsif column_info[:type] == :vector && distance == "inner_product" "(#{order}) * -1" else order end # for select, use column_names instead of * to account for ignored columns select(*column_names, "#{neighbor_distance} AS neighbor_distance") .where.not(attribute_name => nil) .order(Arel.sql(order)) } def nearest_neighbors(attribute_name = :neighbor_vector, **options) attribute_name = attribute_name.to_sym # important! check if neighbor attribute before calling send raise ArgumentError, "Invalid attribute" unless self.class.neighbor_attributes[attribute_name] self.class .where.not(self.class.primary_key => send(self.class.primary_key)) .nearest_neighbors(attribute_name, send(attribute_name), **options) end end end end end neighbor-0.2.3/lib/neighbor.rb0000644000175100017510000000343014452262121016362 0ustar vivekdebvivekdeb# dependencies require "active_support" # modules require "neighbor/version" module Neighbor class Error < StandardError; end module RegisterTypes def initialize_type_map(m = type_map) super m.register_type "cube", ActiveRecord::ConnectionAdapters::PostgreSQL::OID::SpecializedString.new(:cube) m.register_type "vector" do |_, _, sql_type| limit = extract_limit(sql_type) ActiveRecord::ConnectionAdapters::PostgreSQL::OID::SpecializedString.new(:vector, limit: limit) end end end end ActiveSupport.on_load(:active_record) do require "neighbor/model" require "neighbor/vector" extend Neighbor::Model require "active_record/connection_adapters/postgresql_adapter" # ensure schema can be dumped ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:cube] = {name: "cube"} ActiveRecord::ConnectionAdapters::PostgreSQLAdapter::NATIVE_DATABASE_TYPES[:vector] = {name: "vector"} # ensure schema can be loaded if ActiveRecord::VERSION::MAJOR >= 6 ActiveRecord::ConnectionAdapters::TableDefinition.send(:define_column_methods, :cube, :vector) else ActiveRecord::ConnectionAdapters::TableDefinition.define_method :cube do |*args, **options| args.each { |name| column(name, :cube, options) } end ActiveRecord::ConnectionAdapters::TableDefinition.define_method :vector do |*args, **options| args.each { |name| column(name, :vector, options) } end end # prevent unknown OID warning if ActiveRecord::VERSION::MAJOR >= 7 ActiveRecord::ConnectionAdapters::PostgreSQLAdapter.singleton_class.prepend(Neighbor::RegisterTypes) else ActiveRecord::ConnectionAdapters::PostgreSQLAdapter.prepend(Neighbor::RegisterTypes) end end require "neighbor/railtie" if defined?(Rails::Railtie) neighbor-0.2.3/lib/generators/0000755000175100017510000000000014452262121016411 5ustar vivekdebvivekdebneighbor-0.2.3/lib/generators/neighbor/0000755000175100017510000000000014452262121020206 5ustar vivekdebvivekdebneighbor-0.2.3/lib/generators/neighbor/vector_generator.rb0000644000175100017510000000100514452262121024077 0ustar vivekdebvivekdebrequire "rails/generators/active_record" module Neighbor module Generators class VectorGenerator < Rails::Generators::Base include ActiveRecord::Generators::Migration source_root File.join(__dir__, "templates") def copy_migration migration_template "vector.rb", "db/migrate/install_neighbor_vector.rb", migration_version: migration_version end def migration_version "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" end end end end neighbor-0.2.3/lib/generators/neighbor/templates/0000755000175100017510000000000014452262121022204 5ustar vivekdebvivekdebneighbor-0.2.3/lib/generators/neighbor/templates/vector.rb.tt0000644000175100017510000000021114452262121024453 0ustar vivekdebvivekdebclass <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %> def change enable_extension "vector" end end neighbor-0.2.3/lib/generators/neighbor/templates/cube.rb.tt0000644000175100017510000000020714452262121024074 0ustar vivekdebvivekdebclass <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %> def change enable_extension "cube" end end neighbor-0.2.3/lib/generators/neighbor/cube_generator.rb0000644000175100017510000000077714452262121023532 0ustar vivekdebvivekdebrequire "rails/generators/active_record" module Neighbor module Generators class CubeGenerator < Rails::Generators::Base include ActiveRecord::Generators::Migration source_root File.join(__dir__, "templates") def copy_migration migration_template "cube.rb", "db/migrate/install_neighbor_cube.rb", migration_version: migration_version end def migration_version "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]" end end end end neighbor-0.2.3/README.md0000644000175100017510000001431514452262121014755 0ustar vivekdebvivekdeb# Neighbor Nearest neighbor search for Rails and Postgres [![Build Status](https://github.com/ankane/neighbor/workflows/build/badge.svg?branch=master)](https://github.com/ankane/neighbor/actions) ## Installation Add this line to your application’s Gemfile: ```ruby gem "neighbor" ``` ## Choose An Extension Neighbor supports two extensions: [cube](https://www.postgresql.org/docs/current/cube.html) and [vector](https://github.com/pgvector/pgvector). cube ships with Postgres, while vector supports approximate nearest neighbor search. For cube, run: ```sh rails generate neighbor:cube rails db:migrate ``` For vector, [install pgvector](https://github.com/pgvector/pgvector#installation) and run: ```sh rails generate neighbor:vector rails db:migrate ``` ## Getting Started Create a migration ```ruby class AddNeighborVectorToItems < ActiveRecord::Migration[7.0] def change add_column :items, :embedding, :cube # or add_column :items, :embedding, :vector, limit: 3 # dimensions end end ``` Add to your model ```ruby class Item < ApplicationRecord has_neighbors :embedding end ``` Update the vectors ```ruby item.update(embedding: [1.0, 1.2, 0.5]) ``` Get the nearest neighbors to a record ```ruby item.nearest_neighbors(:embedding, distance: "euclidean").first(5) ``` Get the nearest neighbors to a vector ```ruby Item.nearest_neighbors(:embedding, [0.9, 1.3, 1.1], distance: "euclidean").first(5) ``` ## Distance Supported values are: - `euclidean` - `cosine` - `taxicab` (cube only) - `chebyshev` (cube only) - `inner_product` (vector only) For cosine distance with cube, vectors must be normalized before being stored. ```ruby class Item < ApplicationRecord has_neighbors :embedding, normalize: true end ``` For inner product with cube, see [this example](examples/disco_user_recs_cube.rb). Records returned from `nearest_neighbors` will have a `neighbor_distance` attribute ```ruby nearest_item = item.nearest_neighbors(:embedding, distance: "euclidean").first nearest_item.neighbor_distance ``` ## Dimensions The cube data type can have up to 100 dimensions by default. See the [Postgres docs](https://www.postgresql.org/docs/current/cube.html) for how to increase this. The vector data type can have up to 16,000 dimensions, and vectors with up to 2,000 dimensions can be indexed. For cube, it’s a good idea to specify the number of dimensions to ensure all records have the same number. ```ruby class Item < ApplicationRecord has_neighbors :embedding, dimensions: 3 end ``` ## Indexing For vector, add an approximate index to speed up queries. Create a migration with: ```ruby class AddIndexToItemsNeighborVector < ActiveRecord::Migration[7.0] def change add_index :items, :embedding, using: :ivfflat, opclass: :vector_l2_ops end end ``` Use `:vector_cosine_ops` for cosine distance and `:vector_ip_ops` for inner product. Set the number of probes ```ruby Item.connection.execute("SET ivfflat.probes = 3") ``` ## Examples - [OpenAI Embeddings](#openai-embeddings) - [Disco Recommendations](#disco-recommendations) ### OpenAI Embeddings Generate a model ```sh rails generate model Article content:text embedding:vector{1536} rails db:migrate ``` And add `has_neighbors` ```ruby class Article < ApplicationRecord has_neighbors :embedding end ``` Create a method to call the [embeddings API](https://platform.openai.com/docs/guides/embeddings) ```ruby def fetch_embeddings(input) url = "https://api.openai.com/v1/embeddings" headers = { "Authorization" => "Bearer #{ENV.fetch("OPENAI_API_KEY")}", "Content-Type" => "application/json" } data = { input: input, model: "text-embedding-ada-002" } response = Net::HTTP.post(URI(url), data.to_json, headers) JSON.parse(response.body)["data"].map { |v| v["embedding"] } end ``` Pass your input ```ruby input = [ "The dog is barking", "The cat is purring", "The bear is growling" ] embeddings = fetch_embeddings(input) ``` Store the embeddings ```ruby articles = [] input.zip(embeddings) do |content, embedding| articles << {content: content, embedding: embedding} end Article.insert_all!(articles) # use create! for Active Record < 6 ``` And get similar articles ```ruby article = Article.first article.nearest_neighbors(:embedding, distance: "inner_product").first(5).map(&:content) ``` See the [complete code](examples/openai_embeddings.rb) ### Disco Recommendations You can use Neighbor for online item-based recommendations with [Disco](https://github.com/ankane/disco). We’ll use MovieLens data for this example. Generate a model ```sh rails generate model Movie name:string factors:cube rails db:migrate ``` And add `has_neighbors` ```ruby class Movie < ApplicationRecord has_neighbors :factors, dimensions: 20, normalize: true end ``` Fit the recommender ```ruby data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) ``` Store the item factors ```ruby movies = [] recommender.item_ids.each do |item_id| movies << {name: item_id, factors: recommender.item_factors(item_id)} end Movie.insert_all!(movies) # use create! for Active Record < 6 ``` And get similar movies ```ruby movie = Movie.find_by(name: "Star Wars (1977)") movie.nearest_neighbors(:factors, distance: "cosine").first(5).map(&:name) ``` See the complete code for [cube](examples/disco_item_recs_cube.rb) and [vector](examples/disco_item_recs_vector.rb) ## Upgrading ### 0.2.0 The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default. If you use cosine distance, set: ```ruby class Item < ApplicationRecord has_neighbors normalize: true end ``` ## History View the [changelog](https://github.com/ankane/neighbor/blob/master/CHANGELOG.md) ## Contributing Everyone is encouraged to help improve this project. Here are a few ways you can help: - [Report bugs](https://github.com/ankane/neighbor/issues) - Fix bugs and [submit pull requests](https://github.com/ankane/neighbor/pulls) - Write, clarify, or fix documentation - Suggest or add new features To get started with development: ```sh git clone https://github.com/ankane/neighbor.git cd neighbor bundle install createdb neighbor_test # cube bundle exec rake test # vector EXT=vector bundle exec rake test ``` neighbor-0.2.3/LICENSE.txt0000644000175100017510000000207314452262121015317 0ustar vivekdebvivekdebThe MIT License (MIT) Copyright (c) 2021-2023 Andrew Kane Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. neighbor-0.2.3/CHANGELOG.md0000644000175100017510000000144414452262121015306 0ustar vivekdebvivekdeb## 0.2.3 (2023-04-02) - Added support for dimensions to model generator ## 0.2.2 (2022-07-13) - Added support for configurable attribute name - Added support for multiple attributes per model ## 0.2.1 (2021-12-15) - Added support for Active Record 7 ## 0.2.0 (2021-04-21) - Added support for pgvector - Added `normalize` option - Made `dimensions` optional - Raise an error if `nearest_neighbors` already defined - Raise an error for non-finite values - Fixed NaN with zero vectors and cosine distance Breaking changes - The `distance` option has been moved from `has_neighbors` to `nearest_neighbors`, and there is no longer a default ## 0.1.2 (2021-02-21) - Added `nearest_neighbors` scope ## 0.1.1 (2021-02-16) - Fixed `Could not dump table` error ## 0.1.0 (2021-02-15) - First release