How to create textual index with phoneme

 

First of all you need create one user (schema to test), in this example the user is scott and the language example is PORTUGUESE.

1. Grant permissions to scott with sys user :

GRANT EXECUTE ON CTXSYS.CTX_ADM TO SCOTT;
GRANT EXECUTE ON CTXSYS.CTX_CLS TO SCOTT;
GRANT EXECUTE ON CTXSYS.CTX_DDL TO SCOTT;
GRANT EXECUTE ON CTXSYS.CTX_OUTPUT TO SCOTT;

 

2. Create Preference LEXER :

Login with user scott and run :

BEGIN
 ctx_ddl.create_preference ('SCOTT_LEXER', 'BASIC_LEXER');
 ctx_ddl.set_attribute ('SCOTT_LEXER', 'base_letter', 'YES');
 ctx_ddl.set_attribute('SCOTT_LEXER', 'printjoins', '_-');
 ctx_ddl.set_attribute('SCOTT_LEXER', 'base_letter_type', 'GENERIC');
 ctx_ddl.set_attribute ('SCOTT_LEXER', 'index_themes', 'NO');
 ctx_ddl.set_attribute ('SCOTT_LEXER', 'index_themes', 'YES');
 ctx_ddl.set_attribute ('SCOTT_LEXER', 'theme_language', 'PORTUGESE');
 END;

3. Create Preference WORDLIST :

Login with user scott and run :

begin
  ctx_ddl.create_preference('SCOTT_LEXER_FUZZY', 'BASIC_WORDLIST');
  ctx_ddl.set_attribute('SCOTT_LEXER_FUZZY','FUZZY_MATCH','AUTO');
  ctx_ddl.set_attribute('SCOTT_LEXER_FUZZY','FUZZY_SCORE','60');
  ctx_ddl.set_attribute('SCOTT_LEXER_FUZZY','FUZZY_NUMRESULTS','100');
  ctx_ddl.set_attribute('SCOTT_LEXER_FUZZY','SUBSTRING_INDEX','TRUE');
  ctx_ddl.set_attribute('SCOTT_LEXER_FUZZY','STEMMER','AUTO');
end;

 

4. Create Preference STOPLIST :

The stop list you can found in the database machine :
$ORACLE_HOME/ctx/admin/defaults
For Portuguese-Brazil the file is drdefptb.sql. Search your language.

Login with user scott and run :

declare
  db_charset VARCHAR2(500);

  procedure add_utf8_stopword(hexstring in VARCHAR2) is
  begin
    CTX_DDL.add_stopword('SCOTT_LEXER_STOPLIST', UTL_RAW.cast_to_varchar2(
      UTL_RAW.convert(HEXTORAW(hexstring), db_charset,
                                           'AMERICAN_AMERICA.UTF8')));
  end add_utf8_stopword;

begin
  SELECT 'AMERICAN_AMERICA.' || value
    INTO db_charset
    FROM v$nls_parameters
    WHERE parameter = 'NLS_CHARACTERSET';

  /* Why the extra spaces around the comments?  If the client character set
   * (as identified by NLS_LANG) is AL32UTF8 (or possibly others as well)
   * then the accented characters in the comments, which are in ISO8859-1,
   * are interpreted as multibyte characters.  Thus up to 3 characters after
   * the accented character are mis-interpreted.  If one of these characters
   * happens to be the end comment marker, then the following line or lines
   * is commented out, which leads to missing stopwords and/or PL/SQL parse
   * errors.  End result - the extra spaces before the end comment markers
   * are necessary to ensure that the marker is processed correctly.
   */
  ctx_ddl.create_stoplist('SCOTT_LEXER_STOPLIST');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','a');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','abaixo');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','adiante');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','agora');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','ali');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','antes');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','aqui');
  add_utf8_stopword('6174C3A9'); /* até   */
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','atras');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','bastante');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','bem');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','com');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','como');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','contra');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','debaixo');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','demais');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','depois');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','depressa');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','devagar');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','direito');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','e');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','ela');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','elas');
  add_utf8_stopword('C3AA6C65'); /* êle   */
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','eles');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','em');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','entre');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','eu');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','fora');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','junto');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','longe');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','mais');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','menos');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','muito');
  add_utf8_stopword('6EC3A36F'); /* não   */
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','ninguem');
  add_utf8_stopword('6EC3B373'); /* nós   */
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','nunca');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','onde');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','ou');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','para');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','por');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','porque');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','pouco');
  add_utf8_stopword('7072C3B378696D6F'); /* próximo   */
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','qual');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','quando');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','quanto');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','que');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','quem');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','se');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','sem');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','sempre');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','sim');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','sob');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','sobre');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','talvez');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','todas');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','todos');
  ctx_ddl.add_stopword('SCOTT_LEXER_STOPLIST','vagarosamente');
  add_utf8_stopword('766F63C3AA'); /* você   */
  add_utf8_stopword('766F63C3AA73'); /* vocês   */
end;
/

 

5. Create Index :

To see the preferences :

select a.*
from CTXSYS.CTX_PREFERENCES a
where a.pre_name like '%SCOTT%';

Login with user scott and run :

CREATE INDEX SCOTT.IX_TXT_TABLE ON SCOTT.TB_TABLE
 (TXT_NAME)
 INDEXTYPE IS CTXSYS.CONTEXT
 PARAMETERS('LEXER SCOTT_LEXER Wordlist SCOTT_LEXER_FUZZY Stoplist SCOTT_LEXER_STOPLIST')
 NOPARALLEL;

To see the index :

select a.*
from ctxsys.dr$index a
where a.idx_name like '%IX_TXT%'

 

6. How to use the index :

6.1 Score :

SELECT SCORE(1) score,TXT_NAME
from scott.TB_TABLE
where CONTAINS(TXT_NAME, 'fuzzy(WANDERLEI, 0, , weight)',1) > 0
order by score desc, TXT_NAME asc

 

6.2 Filter for 70% :

SELECT SCORE(1) score,TXT_NAME
from scott.TB_TABLE
where CONTAINS(TXT_NAME, 'fuzzy(WANDERLEI, 70, , weight)',1) > 0
order by score desc, TXT_NAME asc

 

6.3 Only exact 100% :

SELECT SCORE(1) score,TXT_NAME
from scott.TB_TABLE
where CONTAINS(TXT_NAME, 'fuzzy(WANDERLEI, 0, 1, weight)',1) > 0
order by score desc, TXT_NAME asc

 

Good Luck !

 

Leave a Reply