mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 09:24:23 +08:00
Reported through a few user tests, the agent was unreliable in three ways: - `tags.tag_name` instead of `tags.name` - did not use `current_user_id` for "my posts" prompts - plural nouns as singular - and used unparse-able date defaults like "today". Few issues: - `DbSchema` tool was returning a dense one-line-per-table comma that qwen was unable to deal with. Now line-per-column so schema accuracy originally flaky is now 5/5 PASSING on qwen and Gemini. - The prompt was teaching the wrong thing where the `-- null boolean :opt_flag = #null` example made models use `#null` as a default value. We now have a "Parameter rules" section, ISO date examples that match the "no natural-language defaults" rule below them, explicit `current_user_id` guidance for first-person prompts, and a plural-noun rule that applies to each plural noun independently in the same prompt (e.g. "categories and tags" → BOTH list params, not one of each). - Eval runner now captures `name` and `description` separately, not just `sql`. The description text is graded directly rather than grading the SQL string. Tested against qwen 3.5 122B (our hosted model) + Gemini 3.1 Flash Lite (judge GPT-5.2): 20/20 each. New eval cases ship in this PR https://github.com/discourse/discourse-ai-evals/pull/18
69 lines
2.3 KiB
Ruby
Vendored
69 lines
2.3 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
RSpec.describe DiscourseAi::Agents::Tools::DbSchema do
|
|
fab!(:llm_model)
|
|
let(:bot_user) { DiscourseAi::AiBot::EntryPoint.find_user_from_model(llm_model.name) }
|
|
let(:llm) { DiscourseAi::Completions::Llm.proxy(llm_model) }
|
|
|
|
before do
|
|
enable_current_plugin
|
|
SiteSetting.ai_bot_enabled = true
|
|
end
|
|
|
|
def invoke(tables)
|
|
described_class.new({ tables: tables }, bot_user: bot_user, llm: llm).invoke
|
|
end
|
|
|
|
describe "#invoke" do
|
|
it "returns one column per line with table headers" do
|
|
info = invoke("posts,topics")[:schema_info]
|
|
|
|
expect(info).to match(/^TABLE posts$/)
|
|
expect(info).to match(/^TABLE topics$/)
|
|
expect(info).to match(/^ id integer, PK$/)
|
|
end
|
|
|
|
it "marks foreign-key columns using Discourse's curated map" do
|
|
info = invoke("topic_tags,topics")[:schema_info]
|
|
|
|
expect(info).to match(/^ topic_id integer, FK → topics/)
|
|
expect(info).to match(/^ user_id integer, FK → users/)
|
|
expect(info).to match(/^ last_post_user_id integer, FK → users/) # via *_user_id rule
|
|
end
|
|
|
|
it "does not invent FK targets for non-FK _id columns" do
|
|
info = invoke("topics")[:schema_info]
|
|
|
|
# Previous heuristic produced misleading targets like "FK → externals" / "FK → featured_user1s".
|
|
# Now we only mark FKs the Discourse map actually knows about.
|
|
expect(info).not_to include("FK → externals")
|
|
expect(info).not_to include("FK → featured_user1s")
|
|
end
|
|
|
|
it "simplifies long postgres type names" do
|
|
info = invoke("topics")[:schema_info]
|
|
|
|
expect(info).to include("varchar")
|
|
expect(info).to include("timestamp")
|
|
expect(info).not_to include("character varying")
|
|
expect(info).not_to include("timestamp without time zone")
|
|
end
|
|
|
|
it "marks nullable columns" do
|
|
info = invoke("topics")[:schema_info]
|
|
|
|
expect(info).to match(/^ deleted_at timestamp, null$/)
|
|
end
|
|
|
|
it "reports missing tables instead of silently dropping them" do
|
|
info = invoke("topics,not_a_real_table")[:schema_info]
|
|
|
|
expect(info).to include("TABLE topics")
|
|
expect(info).to include("TABLES NOT FOUND: not_a_real_table")
|
|
end
|
|
|
|
it "returns the requested tables string" do
|
|
expect(invoke("posts,topics")[:tables]).to eq("posts,topics")
|
|
end
|
|
end
|
|
end
|