/* * Princip jednoduchého fulltextu s příklady v SQL a PHP * SQL kód k článku, verze 1.0, 21. ledna 2005 Autor: Dalibor Šrámek Umístění: http://www.insula.cz/dali/ Testováno na PostgreSQL 8.0. */ ---- Jednoduchá funkčnost - vytvoření a naplnění tabulek -- Kostra tabulky pro uložení článků CREATE TABLE Article ( IdArticle INTEGER NOT NULL, ArticleText TEXT NOT NULL, -- pokud DB nepodporuje typ TEXT postačí VARCHAR PRIMARY KEY (IdArticle) ); -- Tabulka pro uložení slovníku CREATE TABLE FT_Word ( IdWord INTEGER NOT NULL, Word VARCHAR(50), PRIMARY KEY (IdWord) ); CREATE SEQUENCE FT_Word_IdWord_SEQ; CREATE UNIQUE INDEX FT_Word_Word_IX ON FT_Word (Word); -- Vazební tabulka realizující vazbu "slovo se vyskytuje v článku" CREATE TABLE FT_Index ( IdWord INTEGER NOT NULL, IdArticle INTEGER NOT NULL, PRIMARY KEY (IdWord, IdArticle) ); ALTER TABLE FT_Index ADD FOREIGN KEY (IdWord) REFERENCES FT_Word (IdWord); ALTER TABLE FT_Index ADD FOREIGN KEY (IdArticle) REFERENCES Article (IdArticle); -- Cvičné naplnění tabulky článků INSERT INTO ARTICLE VALUES( 1, 'jahoda borůvka malina'); INSERT INTO ARTICLE VALUES( 2, 'jahoda jahoda banán'); INSERT INTO ARTICLE VALUES( 3, 'banán borůvka borůvka'); INSERT INTO ARTICLE VALUES( 4, 'malina smetana'); INSERT INTO ARTICLE VALUES( 5, 'malinová zmrzlina'); INSERT INTO ARTICLE VALUES( 6, 'malinkatá jahoda'); INSERT INTO ARTICLE VALUES( 7, 'banán smetanová vanilková zmrzlina'); INSERT INTO ARTICLE VALUES( 8, 'malinký banán'); INSERT INTO ARTICLE VALUES( 9, 'jahodový cep'); INSERT INTO ARTICLE VALUES(10, 'malinkatý čep'); -- Cvičné naplnění tabulky slovníku INSERT INTO FT_Word VALUES( 1, 'banán'); INSERT INTO FT_Word VALUES( 2, 'borůvka'); INSERT INTO FT_Word VALUES( 3, 'jahoda'); INSERT INTO FT_Word VALUES( 4, 'jahodová'); INSERT INTO FT_Word VALUES( 5, 'jahodový'); INSERT INTO FT_Word VALUES( 6, 'malina'); INSERT INTO FT_Word VALUES( 7, 'malinkatá'); INSERT INTO FT_Word VALUES( 8, 'malinkatý'); INSERT INTO FT_Word VALUES( 9, 'malinká'); INSERT INTO FT_Word VALUES(10, 'malinký'); INSERT INTO FT_Word VALUES(11, 'malinová'); INSERT INTO FT_Word VALUES(12, 'malinový'); INSERT INTO FT_Word VALUES(13, 'smetana'); INSERT INTO FT_Word VALUES(14, 'smetanová'); INSERT INTO FT_Word VALUES(15, 'smetanové'); INSERT INTO FT_Word VALUES(16, 'vanilka'); INSERT INTO FT_Word VALUES(17, 'vanilková'); INSERT INTO FT_Word VALUES(18, 'zmrzlina'); INSERT INTO FT_Word VALUES(19, 'cep'); INSERT INTO FT_Word VALUES(20, 'čep'); -- Cvičné naplnění vazební tabulky (odpovídá naplnění článků a slovníku) INSERT INTO FT_Index VALUES( 3, 1); INSERT INTO FT_Index VALUES( 2, 1); INSERT INTO FT_Index VALUES( 6, 1); INSERT INTO FT_Index VALUES( 3, 2); INSERT INTO FT_Index VALUES( 1, 2); INSERT INTO FT_Index VALUES( 1, 3); INSERT INTO FT_Index VALUES( 2, 3); INSERT INTO FT_Index VALUES( 6, 4); INSERT INTO FT_Index VALUES(13, 4); INSERT INTO FT_Index VALUES(11, 5); INSERT INTO FT_Index VALUES(18, 5); INSERT INTO FT_Index VALUES( 7, 6); INSERT INTO FT_Index VALUES( 3, 6); INSERT INTO FT_Index VALUES( 1, 7); INSERT INTO FT_Index VALUES(14, 7); INSERT INTO FT_Index VALUES(17, 7); INSERT INTO FT_Index VALUES(18, 7); INSERT INTO FT_Index VALUES(10, 8); INSERT INTO FT_Index VALUES( 1, 8); INSERT INTO FT_Index VALUES( 5, 9); INSERT INTO FT_Index VALUES(19, 9); INSERT INTO FT_Index VALUES( 8, 10); INSERT INTO FT_Index VALUES(20, 10); ---- Jednoduchá funkčnost - příklady dotazů -- Dotaz: jahoda SELECT I.IdArticle FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'jahoda'; -- Dotaz: jahoda OR borůvka OR malina SELECT I.IdArticle, COUNT(*) FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word IN ('jahoda', 'borůvka', 'malina') GROUP BY I.IdArticle; -- Dotaz: jahoda AND borůvka AND malina SELECT I.IdArticle, COUNT(*) FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word IN ('jahoda', 'borůvka', 'malina') GROUP BY I.IdArticle HAVING COUNT(*) = 3; -- Dotaz: jahoda AND NOT borůvka SELECT I.IdArticle FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'jahoda' AND I.IdArticle NOT IN ( SELECT I.IdArticle FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'borůvka' ); -- Slova ve slovníku začínající na "malink" SELECT * FROM FT_Word WHERE Word >= 'malink' AND Word < 'malinl'; -- Dotaz: malink* SELECT I.IdArticle FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word >= 'malink' AND W.Word < 'malinl'; ---- Pokročilejší funkčnost - úprava tabulek -- Data pro vyhledávání bez diakritiky (vynechte na jiných DB než PostgreSQL) ALTER TABLE FT_Word ADD WordND VARCHAR(50) NULL; UPDATE FT_Word SET WordND = to_ascii(Word, 'iso_8859_2'); ALTER TABLE FT_Word ALTER COLUMN WordND SET NOT NULL; CREATE INDEX FT_Word_WordND_IX ON FT_Word (WordND); -- Úprava invertovaného souboru tak, aby obsahoval pozici slova v článku DROP TABLE FT_Index; CREATE TABLE FT_Index ( IdWord INTEGER NOT NULL, IdArticle INTEGER NOT NULL, WordPosition INTEGER NOT NULL, PRIMARY KEY (IdWord, IdArticle, WordPosition) ); ALTER TABLE FT_Index ADD FOREIGN KEY (IdWord) REFERENCES FT_Word (IdWord); ALTER TABLE FT_Index ADD FOREIGN KEY (IdArticle) REFERENCES Article (IdArticle); -- Opakované naplnění vazební tabulky včetně pozice slov INSERT INTO FT_Index VALUES( 3, 1, 1); INSERT INTO FT_Index VALUES( 2, 1, 2); INSERT INTO FT_Index VALUES( 6, 1, 3); INSERT INTO FT_Index VALUES( 3, 2, 1); INSERT INTO FT_Index VALUES( 3, 2, 2); INSERT INTO FT_Index VALUES( 1, 2, 3); INSERT INTO FT_Index VALUES( 1, 3, 1); INSERT INTO FT_Index VALUES( 2, 3, 2); INSERT INTO FT_Index VALUES( 2, 3, 3); INSERT INTO FT_Index VALUES( 6, 4, 1); INSERT INTO FT_Index VALUES(13, 4, 2); INSERT INTO FT_Index VALUES(11, 5, 1); INSERT INTO FT_Index VALUES(18, 5, 2); INSERT INTO FT_Index VALUES( 7, 6, 1); INSERT INTO FT_Index VALUES( 3, 6, 2); INSERT INTO FT_Index VALUES( 1, 7, 1); INSERT INTO FT_Index VALUES(14, 7, 2); INSERT INTO FT_Index VALUES(17, 7, 3); INSERT INTO FT_Index VALUES(18, 7, 4); INSERT INTO FT_Index VALUES(10, 8, 1); INSERT INTO FT_Index VALUES( 1, 8, 2); INSERT INTO FT_Index VALUES( 5, 9, 1); INSERT INTO FT_Index VALUES(19, 9, 2); INSERT INTO FT_Index VALUES( 8, 10, 1); INSERT INTO FT_Index VALUES(20, 10, 2); ---- Pokročilejší funkčnost - příklady dotazů -- Vyhledávání bez diakritiky SELECT I.IdArticle FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.WordND = 'cep'; -- Dotaz s řazením výsledků podle relevance: jahoda SELECT I.IdArticle, COUNT(*), MIN(WordPosition) FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'jahoda' GROUP BY I.IdArticle ORDER BY COUNT(*) DESC, MIN(WordPosition); -- Dotaz s řazením výsledků podle relevance: jahoda OR borůvka OR malina SELECT IdArticle, COUNT(IdWord), SUM(WordCount), AVG(WordPos) FROM ( SELECT I.IdArticle, I.IdWord, COUNT(*) AS WordCount, MIN(WordPosition) AS WordPos FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word IN ('jahoda', 'borůvka', 'malina') GROUP BY I.IdArticle, I.IdWord ) ArtWord GROUP BY IdArticle ORDER BY COUNT(IdWord) DESC, SUM(WordCount) DESC, AVG(WordPos); -- Dotaz s řazením výsledků podle relevance: jahoda AND borůvka AND malina SELECT IdArticle, COUNT(IdWord), SUM(WordCount), AVG(WordPos) FROM ( SELECT I.IdArticle, I.IdWord, COUNT(*) AS WordCount, MIN(WordPosition) AS WordPos FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word IN ('jahoda', 'borůvka', 'malina') GROUP BY I.IdArticle, I.IdWord ) ArtWord GROUP BY IdArticle HAVING COUNT(IdWord) = 3 ORDER BY COUNT(IdWord) DESC, SUM(WordCount) DESC, AVG(WordPos); -- Nástin dotazu na frázi: "jahoda banán" SELECT Word1.IdArticle FROM ( SELECT I.IdArticle, WordPosition FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'jahoda' ) Word1, ( SELECT I.IdArticle, WordPosition FROM FT_Word W, FT_Index I WHERE W.IdWord = I.IdWord AND W.Word = 'banán' ) Word2 WHERE Word1.IdArticle = Word2.IdArticle AND Word2.WordPosition = Word1.WordPosition + 1;