Filestream and Full-Text – Full Solution for Document Indexing in SQL Server

This post has been republished via RSS; it originally appeared at: SQL Server Support articles.

First published on MSDN on Oct 23, 2017
Recently someone was asking if there is a simple SQL Server solution where you would generate text documents and simply pass them to SQL Server to catalog without writing an application to do this. Also, would like to be able to index and search the context of these documents.

The solution in SQL Server would involve Filestream and Full-Text technologies. There are two alternatives:

    1. Use FileTable

 

    1. Use a regular table



Below is a sample script of what each scenario would look like:
/***********************************************
Using FileTable
************************************************/
use master

exec sp_configure 'filestream access level', 2
go

--AT THIS POINT, YOU NEED TO RESTART SQL SERVER

--See https://docs.microsoft.com/en-us/sql/relational-databases/blob/enable-and-configure-filestream?view=sql-server-2017


drop database FileTableDB_forTextDocs
go
CREATE DATABASE FileTableDB_forTextDocs
ON PRIMARY (
NAME = N'FileTableDB',
FILENAME = N'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\FileTableFileTableDB.mdf' ),
FILEGROUP FilestreamFG CONTAINS FILESTREAM (
NAME = FileStreamGroup1,
FILENAME= 'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\FileTableData' )
LOG ON (
NAME = N'FileTableDB_Log',
FILENAME = N'C:\Program Files\Microsoft SQL Server\MSSQL13.MSSQLSERVER\MSSQL\DATA\FileTableFileTableDB_log.ldf' )
WITH FILESTREAM (
NON_TRANSACTED_ACCESS = FULL,
DIRECTORY_NAME = N'TextesFileTable')
go

--validate the db with directory name
SELECT DB_NAME ( database_id ), directory_name, non_transacted_access, non_transacted_access_desc
FROM sys.database_filestream_options
where directory_name is not null
GO

use FileTableDB_forTextDocs
GO
CREATE TABLE dbo.TextDocuments AS FileTable
WITH (
FileTable_Directory = 'TextesFileTable',
FileTable_Collate_Filename = database_default);
GO

--validate the table is there
SELECT * FROM sys.filetables;
GO
SELECT * FROM sys.tables WHERE is_filetable = 1;
GO
--check if there is something in there - NOTHING YET
select * from dbo.TextDocuments
GO
--find the location of the filetable root path (DIRECTORY_NAME)
SELECT FileTableRootPath('dbo.TextDocuments');

--Copy or create 2-3 simple text files in the share discovered via the above command
--MAKE SURE YOU CLOSE THE SHARE BECAUSE KEEPING IT OPEN IN WINDOWS EXPLORER CAN CAUSE BLOCKING
--Now check if there is something in there - should see 2-3 documents

select * from dbo.TextDocuments
GO

--now create FT index on this table
--first, the catalog

CREATE FULLTEXT CATALOG [FTCat1] WITH ACCENT_SENSITIVITY = ON
GO

--get the primary/unique key name
exec sp_help Textdocuments

--create the FT index
CREATE FULLTEXT INDEX ON [dbo].[TextDocuments] (file_stream TYPE COLUMN [name]) KEY INDEX [PK__TextDocum__5A5B77D58E492DCE] ON ([FTCat1]) WITH (CHANGE_TRACKING AUTO)
GO
--Now search using Full-Text Contains()
select * from dbo.TextDocuments
where contains (file_stream, 'word_in_document')


2. Using a regular table
/*********************************************
Using Filestream and regular table
**********************************************/
--Create a simple text document in a text editor and save it like this 'C:\temp\NewTextDocument.txt'
--Now create a table
drop table TextDocumentsRegular
GO
create table TextDocumentsRegular(
Guid UNIQUEIDENTIFIER ROWGUIDCOL NOT NULL UNIQUE DEFAULT NEWSEQUENTIALID(),
documentID int not null ,
documentType varchar(10) not null,
TextContent varbinary(max) FILESTREAM not null ,
dateinserted datetime)
GO
INSERT INTO TextDocumentsRegular (documentID, documentType, TextContent, dateinserted)
SELECT 101, '.txt', *, GETDATE() FROM OPENROWSET(BULK N'C:\temp\NewTextDocument.txt', SINGLE_BLOB) rs
GO
select * from TextDocumentsRegular

--get the primary/unique key name
exec sp_help TextDocumentsRegular

--create full-text index using the unique key discovered with above command
CREATE FULLTEXT INDEX ON [dbo].TextDocumentsRegular
(Textcontent type column documentType) KEY INDEX UQ__TextDocum__A2B5777D37D0C9EB ON ([FTCat1])
WITH (CHANGE_TRACKING AUTO)

--search using full-text Contains() clause

select * from dbo.TextDocumentsRegular
where contains (TextContent, 'word_in_document')


Namaste,

Joseph

Leave a Reply

Your email address will not be published. Required fields are marked *

*

This site uses Akismet to reduce spam. Learn how your comment data is processed.