diff --git a/Gemfile b/Gemfile index 5a4c8f2..ce0b0b1 100644 --- a/Gemfile +++ b/Gemfile @@ -10,5 +10,7 @@ gem "rake", "~> 13.0" gem "rspec", "~> 3.0" gem "rubocop", "~> 1.21" +gem "rubocop-rake" +gem "rubocop-rspec" gem "debug" diff --git a/Gemfile.lock b/Gemfile.lock index 2b567ad..bd0ad4f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -147,6 +147,19 @@ GEM unicode-display_width (>= 2.4.0, < 3.0) rubocop-ast (1.31.2) parser (>= 3.3.0.4) + rubocop-capybara (2.20.0) + rubocop (~> 1.41) + rubocop-factory_bot (2.25.1) + rubocop (~> 1.41) + rubocop-rake (0.6.0) + rubocop (~> 1.0) + rubocop-rspec (2.29.1) + rubocop (~> 1.40) + rubocop-capybara (~> 2.17) + rubocop-factory_bot (~> 2.22) + rubocop-rspec_rails (~> 2.28) + rubocop-rspec_rails (2.28.3) + rubocop (~> 1.40) ruby-progressbar (1.13.0) safe_yaml (1.0.5) sass-embedded (1.75.0-arm64-darwin) @@ -183,6 +196,8 @@ DEPENDENCIES rake (~> 13.0) rspec (~> 3.0) rubocop (~> 1.21) + rubocop-rake + rubocop-rspec BUNDLED WITH 2.5.6 diff --git a/lib/jekyll_ai_related_posts.rb b/lib/jekyll_ai_related_posts.rb index 6f1b576..ed7d370 100644 --- a/lib/jekyll_ai_related_posts.rb +++ b/lib/jekyll_ai_related_posts.rb @@ -7,7 +7,7 @@ loader.setup module JekyllAiRelatedPosts - GEM_ROOT = File.expand_path('..', __dir__) + GEM_ROOT = File.expand_path("..", __dir__) class Error < StandardError; end end diff --git a/lib/jekyll_ai_related_posts/generator.rb b/lib/jekyll_ai_related_posts/generator.rb index 024a775..e82a8b5 100644 --- a/lib/jekyll_ai_related_posts/generator.rb +++ b/lib/jekyll_ai_related_posts/generator.rb @@ -1,18 +1,19 @@ -require 'active_record' -require 'sqlite3' -require 'sqlite_vss' -require 'jekyll' -require 'json' +# frozen_string_literal: true + +require "active_record" +require "sqlite3" +require "sqlite_vss" +require "jekyll" +require "json" module JekyllAiRelatedPosts class Generator < Jekyll::Generator - def generate(site) @site = site setup_database if fetch_enabled? - Jekyll.logger.info '[ai_related_posts] Generating related posts...' + Jekyll.logger.info "[ai_related_posts] Generating related posts..." @embeddings_fetcher = new_fetcher @site.posts.docs.each do |p| @@ -28,7 +29,7 @@ def generate(site) find_related(p) end else - Jekyll.logger.info '[ai_related_posts] Using cached related posts data...' + Jekyll.logger.info "[ai_related_posts] Using cached related posts data..." @site.posts.docs.each do |p| fallback_generate_related(p) @@ -40,30 +41,30 @@ def generate(site) def fetch_enabled? enabled = true - if @site.config['ai_related_posts']['fetch_enabled'].is_a? String - enabled = ENV['JEKYLL_ENV'] == @site.config['ai_related_posts']['fetch_enabled'] - elsif [true, false].include? @site.config['ai_related_posts']['fetch_enabled'] - enabled = @site.config['ai_related_posts']['fetch_enabled'] + if @site.config["ai_related_posts"]["fetch_enabled"].is_a? String + enabled = ENV["JEKYLL_ENV"] == @site.config["ai_related_posts"]["fetch_enabled"] + elsif [true, false].include? @site.config["ai_related_posts"]["fetch_enabled"] + enabled = @site.config["ai_related_posts"]["fetch_enabled"] end - enabled + enabled end def fallback_generate_related(post) existing = Models::Post.find_by(relative_path: post.relative_path) if existing.nil? - post.data['ai_related_posts'] = post.related_posts + post.data["ai_related_posts"] = post.related_posts else find_related(post) end end def new_fetcher - case @site.config['ai_related_posts']['embeddings_source'] - when 'mock' + case @site.config["ai_related_posts"]["embeddings_source"] + when "mock" MockEmbeddings.new else - OpenAiEmbeddings.new(@site.config['ai_related_posts']['openai_api_key']) + OpenAiEmbeddings.new(@site.config["ai_related_posts"]["openai_api_key"]) end end @@ -72,25 +73,27 @@ def ensure_embedding_cached(post) # Clear cache if post has been updated if !existing.nil? && existing.embedding_text != embedding_text(post) - sql = 'DELETE FROM vss_posts WHERE rowid = (SELECT rowid FROM posts WHERE relative_path = :relative_path);' - ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, relative_path: post.relative_path])) + sql = "DELETE FROM vss_posts WHERE rowid = (SELECT rowid FROM posts WHERE relative_path = :relative_path);" + ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, + { relative_path: post.relative_path }])) existing.destroy! existing = nil end - if existing.nil? - Models::Post.create!( - relative_path: post.relative_path, - embedding_text: embedding_text(post), - embedding: embedding_for(post).to_json - ) + return unless existing.nil? + + Models::Post.create!( + relative_path: post.relative_path, + embedding_text: embedding_text(post), + embedding: embedding_for(post).to_json + ) - sql = <<-SQL + sql = <<-SQL INSERT INTO vss_posts (rowid, post_embedding) SELECT rowid, embedding FROM posts WHERE relative_path = :relative_path; - SQL - ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, relative_path: post.relative_path])) - end + SQL + ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, + { relative_path: post.relative_path }])) end def find_related(post) @@ -104,9 +107,11 @@ def find_related(post) LIMIT 10000; SQL - results = ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, relative_path: post.relative_path])) + results = ActiveRecord::Base.connection.execute(ActiveRecord::Base.sanitize_sql([sql, { + relative_path: post.relative_path + }])) # The first result is the post itself, with a distance of 0. - rowids = results.sort_by { |r| r['distance'] }.drop(1).first(3).map { |r| r['rowid'] } + rowids = results.sort_by { |r| r["distance"] }.drop(1).first(3).map { |r| r["rowid"] } posts_by_rowid = {} rowids.each do |rowid| @@ -118,21 +123,17 @@ def find_related(post) end related_posts = rowids.map do |rowid| - relative_path = posts_by_rowid[rowid]['relative_path'] + relative_path = posts_by_rowid[rowid]["relative_path"] @indexed_posts[relative_path] end - post.data['ai_related_posts'] = related_posts + post.data["ai_related_posts"] = related_posts end def embedding_text(post) - text = "Title: #{post.data['title']}" - unless post.data['categories'].empty? - text += "; Categories: #{post.data['categories'].join(', ')}" - end - unless post.data['tags'].empty? - text += "; Tags: #{post.data['tags'].join(', ')}" - end + text = "Title: #{post.data["title"]}" + text += "; Categories: #{post.data["categories"].join(", ")}" unless post.data["categories"].empty? + text += "; Tags: #{post.data["tags"].join(", ")}" unless post.data["tags"].empty? text end @@ -143,14 +144,14 @@ def embedding_for(post) @embeddings_fetcher.embedding_for(input) end - + def setup_database ActiveRecord::Base.establish_connection( - adapter: 'sqlite3', - database: @site.in_source_dir('.ai_related_posts_cache.sqlite3') + adapter: "sqlite3", + database: @site.in_source_dir(".ai_related_posts_cache.sqlite3") ) # We don't need WAL mode for this. - ActiveRecord::Base.connection.execute('PRAGMA journal_mode=DELETE;') + ActiveRecord::Base.connection.execute("PRAGMA journal_mode=DELETE;") # Enable sqlite-vss vector extension db = ActiveRecord::Base.connection.raw_connection diff --git a/lib/jekyll_ai_related_posts/models/post.rb b/lib/jekyll_ai_related_posts/models/post.rb index 725a620..9235eab 100644 --- a/lib/jekyll_ai_related_posts/models/post.rb +++ b/lib/jekyll_ai_related_posts/models/post.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module JekyllAiRelatedPosts module Models class Post < ActiveRecord::Base diff --git a/lib/jekyll_ai_related_posts/open_ai_embeddings.rb b/lib/jekyll_ai_related_posts/open_ai_embeddings.rb index 9c20673..30593be 100644 --- a/lib/jekyll_ai_related_posts/open_ai_embeddings.rb +++ b/lib/jekyll_ai_related_posts/open_ai_embeddings.rb @@ -1,34 +1,35 @@ -require 'faraday' +# frozen_string_literal: true + +require "faraday" module JekyllAiRelatedPosts class OpenAiEmbeddings DIMENSIONS = 1536 def initialize(api_key, connection: nil) - if connection.nil? - @connection = Faraday.new(url: 'https://api.openai.com') do |builder| - builder.request :authorization, 'Bearer', api_key - builder.request :json - builder.response :json - builder.response :raise_error - end - else - @connection = connection - end + @connection = if connection.nil? + Faraday.new(url: "https://api.openai.com") do |builder| + builder.request :authorization, "Bearer", api_key + builder.request :json + builder.response :json + builder.response :raise_error + end + else + connection + end end def embedding_for(text) - res = @connection.post('/v1/embeddings') do |req| + res = @connection.post("/v1/embeddings") do |req| req.body = { input: text, - model: 'text-embedding-3-small' + model: "text-embedding-3-small" } end - - res.body['data'].first['embedding'] + res.body["data"].first["embedding"] rescue Faraday::Error => e - Jekyll.logger.error 'Error response from OpanAI API!' + Jekyll.logger.error "Error response from OpanAI API!" Jekyll.logger.error e.inspect raise diff --git a/spec/jekyll_ai_related_posts/generator_spec.rb b/spec/jekyll_ai_related_posts/generator_spec.rb index 3eda4be..c616f1b 100644 --- a/spec/jekyll_ai_related_posts/generator_spec.rb +++ b/spec/jekyll_ai_related_posts/generator_spec.rb @@ -1,14 +1,15 @@ # frozen_string_literal: true -require 'debug' -require 'ostruct' + +require "debug" +require "ostruct" RSpec.describe JekyllAiRelatedPosts::Generator do let(:config_overrides) do { - 'ai_related_posts' => { - 'openai_api_key' => 'my_key', - 'embeddings_source' => 'mock' - }, + "ai_related_posts" => { + "openai_api_key" => "my_key", + "embeddings_source" => "mock" + } } end let(:site) do @@ -16,52 +17,55 @@ end before(:each) do - File.delete(site.in_source_dir('.ai_related_posts_cache.sqlite3')) + File.delete(site.in_source_dir(".ai_related_posts_cache.sqlite3")) rescue Errno::ENOENT end - it 'generates related posts' do + it "generates related posts" do site.process - wifi_upgrades = File.read(dest_dir("2023", "12", "22", "home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.html")) - expect(wifi_upgrades).to include('1:::Analyzing Static Website Logs with AWStats') - expect(wifi_upgrades).to include('2:::Catching Mew: A Playable Game Boy Quote') + wifi_upgrades = File.read(dest_dir("2023", "12", "22", + "home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.html")) + expect(wifi_upgrades).to include("1:::Analyzing Static Website Logs with AWStats") + expect(wifi_upgrades).to include("2:::Catching Mew: A Playable Game Boy Quote") end - it 'regenerates when posts are edited' do + it "regenerates when posts are edited" do # Create the cache site.process - contents = File.read('spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md') - contents.gsub!(/title:.+/, 'title: How to Catch Pokemon') - File.open('spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md', 'w') do |file| + contents = File.read("spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md") + contents.gsub!(/title:.+/, "title: How to Catch Pokemon") + File.open("spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md", + "w") do |file| file.write(contents) end expect_any_instance_of(MockEmbeddings) .to receive(:embedding_for) - .with('Title: How to Catch Pokemon; Tags: Technology') + .with("Title: How to Catch Pokemon; Tags: Technology") .and_call_original site.process ensure contents.gsub!(/title:.+/, 'title: "Home WiFi Upgrades: Adding an Access Point with Wired Backhaul"') - File.open('spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md', 'w') do |file| + File.open("spec/source/_posts/2023-12-22-home-wifi-upgrades-adding-an-access-point-with-wired-backhaul.md", + "w") do |file| file.write(contents) end end - context 'fetch disabled' do + context "fetch disabled" do let(:config_overrides) do { - 'ai_related_posts' => { - 'openai_api_key' => 'my_key', - 'embeddings_source' => 'mock', - 'fetch_enabled' => false - }, + "ai_related_posts" => { + "openai_api_key" => "my_key", + "embeddings_source" => "mock", + "fetch_enabled" => false + } } end - it 'does not fetch embeddings from the API' do + it "does not fetch embeddings from the API" do expect_any_instance_of(MockEmbeddings).not_to receive(:embedding_for) site.process diff --git a/spec/jekyll_ai_related_posts/open_ai_embeddings_spec.rb b/spec/jekyll_ai_related_posts/open_ai_embeddings_spec.rb index 2b8a1b4..3b5746c 100644 --- a/spec/jekyll_ai_related_posts/open_ai_embeddings_spec.rb +++ b/spec/jekyll_ai_related_posts/open_ai_embeddings_spec.rb @@ -1,38 +1,39 @@ # frozen_string_literal: true -require 'json' + +require "json" RSpec.describe JekyllAiRelatedPosts::OpenAiEmbeddings do let(:stubs) { Faraday::Adapter::Test::Stubs.new } let(:conn) do Faraday.new do |builder| builder.adapter :test, stubs - builder.request :authorization, 'Bearer', 'my_key' + builder.request :authorization, "Bearer", "my_key" builder.request :json builder.response :json builder.response :raise_error end end subject do - JekyllAiRelatedPosts::OpenAiEmbeddings.new('my_key', connection: conn) + JekyllAiRelatedPosts::OpenAiEmbeddings.new("my_key", connection: conn) end - it 'makes a request to OpenAI API' do - stubs.post('/v1/embeddings') do |env| + it "makes a request to OpenAI API" do + stubs.post("/v1/embeddings") do |_env| [ 200, - { 'Content-Type' => 'application/json' }, - { data: [ { embedding: [ 0.01, 0.02] } ] }.to_json + { "Content-Type" => "application/json" }, + { data: [{ embedding: [0.01, 0.02] }] }.to_json ] end - expect(subject.embedding_for('My test')).to eq([0.01, 0.02]) + expect(subject.embedding_for("My test")).to eq([0.01, 0.02]) end - it 'handles an error response' do - stubs.post('/v1/embeddings') do |env| + it "handles an error response" do + stubs.post("/v1/embeddings") do |_env| [ 429, - { 'Content-Type' => 'application/json' }, + { "Content-Type" => "application/json" }, { error: { message: "You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.", @@ -44,6 +45,6 @@ ] end - expect { capture_output { subject.embedding_for('My test') } }.to raise_error Faraday::Error + expect { capture_output { subject.embedding_for("My test") } }.to raise_error Faraday::Error end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 7cf7d83..3ab83be 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -5,12 +5,12 @@ class MockEmbeddings def embedding_for(text) file = - if text.include?('Mew') - 'catching_mew_embedding.json' - elsif text.include?('AWStats') - 'awstats_embedding.json' + if text.include?("Mew") + "catching_mew_embedding.json" + elsif text.include?("AWStats") + "awstats_embedding.json" else - 'home_wifi_embedding.json' + "home_wifi_embedding.json" end JSON.parse(File.read("spec/fixtures/#{file}")) @@ -41,7 +41,7 @@ def test_dir(*subdirs) def temp_dir(*subdirs) if Jekyll::Utils::Platforms.vanilla_windows? - drive = Dir.pwd.sub(%r!^([^/]+).*!, '\1') + drive = Dir.pwd.sub(%r{^([^/]+).*}, '\1') temp_root = File.join(drive, "tmp") else temp_root = "/tmp" @@ -60,8 +60,8 @@ def fixture_document(relative_path) site = fixture_site( "collections" => { "methods" => { - "output" => true, - }, + "output" => true + } } ) site.read