*** xapian-core-1.3.0_svn/configure.ac 2012-03-28 18:11:06.000000000 +0800 --- xapian-core-1.3.0_scws/configure.ac 2012-03-30 12:31:10.000000000 +0800 *************** *** 997,1002 **** --- 997,1058 ---- [Define if you want a log of methods called and other debug messages]) fi + dnl ********************** + dnl * Check scws library * + dnl ********************** + dnl hightman.20110411: See if we want to use scws as default tokenizer + SCWS_DIR="" + AC_MSG_CHECKING(for scws) + AC_ARG_WITH(scws, + [AS_HELP_STRING([--with-scws@<:@=DIR@:>@], [use scws as default tokenizer, DIR is the install PREFIX of scws])], + [ ],[ with_scws=no ] + ) + + if test "$with_scws" = "no"; then + AC_MSG_RESULT(no) + else + # Check header file + if test "$with_scws" = "yes"; then + searchdirs="/usr /usr/local /usr/local/scws /opt/local" + for tmpdir in $searchdirs ; do + if test -f $tmpdir/include/scws/scws.h ; then + SCWS_DIR=$tmpdir + break + fi + done + if test "$SCWS_DIR" = ""; then + AC_MSG_RESULT(no) + AC_MSG_ERROR([scws not found on default search directories, specify DIR plz...]) + fi + elif test -f $withval/include/scws/scws.h ; then + SCWS_DIR=$withval + else + AC_MSG_RESULT(no) + AC_MSG_ERROR([Invalid scws directory, unable to find the scws.h under $withval/include/scws]) + fi + AC_MSG_RESULT([yes: $SCWS_DIR]) + + # Etc directory + if test "$SCWS_DIR" = "/usr"; then + SCWS_ETCDIR="/etc" + else + SCWS_ETCDIR="$SCWS_DIR/etc" + fi + + # Check scws library + AC_CHECK_LIB(scws, scws_new, [ + LIBS="$LIBS -L$SCWS_DIR/lib -lscws" + XAPIAN_LDFLAGS="$XAPIAN_LDFLAGS -L$SCWS_DIR/lib -lscws" + CPPFLAGS="$CPPFLAGS -I$SCWS_DIR/include" + AC_DEFINE(HAVE_SCWS, 1, [Define to 1 if you want to use scws as default tokenizer]) + AC_DEFINE_UNQUOTED(SCWS_ETCDIR, "$SCWS_ETCDIR", [Resources directory of scws to load dictionary and rules]) + ],[ + AC_MSG_ERROR([scws_new() NOT found in libscws, please check it first.]) + ],[ + -L$SCWS_DIR/lib + ]) + fi + dnl ****************************** dnl * Set special compiler flags * dnl ****************************** *** xapian-core-1.3.0_svn/include/xapian/queryparser.h 2012-03-24 20:31:02.000000000 +0800 --- xapian-core-1.3.0_scws/include/xapian/queryparser.h 2012-03-30 12:31:10.000000000 +0800 *************** *** 499,504 **** --- 499,513 ---- */ void set_max_wildcard_expansion(Xapian::termcount limit); + #if 1 /* HAVE_SCWS */ + /** hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS. + * @param fpath path for dict file and rule file (char *) + * @param xmem whether to load whold dict into memory(default to false) + * @param multi multiset (int 0~15) + */ + void load_libscws(const char *fpath, bool xmem = false, int multi = 0); + #endif + /** Parse a query. * * @param query_string A free-text query as entered by a user *** xapian-core-1.3.0_svn/include/xapian/termgenerator.h 2011-11-07 11:11:05.000000000 +0800 --- xapian-core-1.3.0_scws/include/xapian/termgenerator.h 2012-03-30 12:31:10.000000000 +0800 *************** *** 82,87 **** --- 82,92 ---- /// Set the database to index spelling data to. void set_database(const Xapian::WritableDatabase &db); + #if 1 /* HAVE_SCWS */ + /// hightman.20070706: Specify the dict and rules file for scws, only used when HAVE_SCWS. + void load_libscws(const char *fpath, bool xmem = false, int multi = 0); + #endif + /// Flags to OR together and pass to TermGenerator::set_flags(). enum flags { /// Index data required for spelling correction. *** xapian-core-1.3.0_svn/queryparser/queryparser_internal.h 2012-03-24 20:31:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/queryparser_internal.h 2012-03-30 12:38:27.000000000 +0800 *************** *** 29,34 **** --- 29,39 ---- #include #include + /// hightman.20070701: use scws as default tokneizer + #ifdef HAVE_SCWS + #include + #endif + #include #include *************** *** 63,68 **** --- 68,79 ---- Stem stemmer; stem_strategy stem_action; const Stopper * stopper; + #ifdef HAVE_SCWS + scws_t scws; + scws_res_t rptr, rcur; + const char *qptr; + int last_off; + #endif Query::op default_op; const char * errmsg; Database db; *************** *** 88,94 **** --- 99,112 ---- public: Internal() : stem_action(STEM_SOME), stopper(NULL), + #ifdef HAVE_SCWS + scws(NULL), rptr(NULL), rcur(NULL), + #endif default_op(Query::OP_OR), errmsg(NULL), max_wildcard_expansion(0) { } + #ifdef HAVE_SCWS + ~Internal(); + void load_libscws(const char *fpath, bool xmem, int multi); + #endif Query parse_query(const string & query_string, unsigned int flags, const string & default_prefix); }; *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.h 2011-07-03 20:31:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.h 2012-03-30 12:33:15.000000000 +0800 *************** *** 26,31 **** --- 26,35 ---- #include #include #include + /// hightman.20070701: use scws as default tokneizer + #ifdef HAVE_SCWS + #include + #endif namespace Xapian { *************** *** 37,48 **** --- 41,62 ---- const Stopper * stopper; Document doc; termcount termpos; + #ifdef HAVE_SCWS + scws_t scws; + #endif TermGenerator::flags flags; WritableDatabase db; public: Internal() : stopper(NULL), termpos(0), + #ifdef HAVE_SCWS + scws(NULL), + #endif flags(TermGenerator::flags(0)) { } + #ifdef HAVE_SCWS + ~Internal(); + void load_libscws(const char *fpath, bool xmem, int multi); + #endif void index_text(Utf8Iterator itor, termcount weight, const std::string & prefix, *** xapian-core-1.3.0_svn/queryparser/queryparser.cc 2011-12-26 20:31:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/queryparser.cc 2012-03-30 12:33:15.000000000 +0800 *************** *** 136,141 **** --- 136,152 ---- internal->max_wildcard_expansion = max; } + #if 1 /* HAVE_SCWS */ + /// hightman.20070701: load the specified dict file for scws + void + QueryParser::load_libscws(const char *fpath, bool xmem, int multi) + { + #ifdef HAVE_SCWS + internal->load_libscws(fpath, xmem, multi); + #endif + } + #endif + Query QueryParser::parse_query(const string &query_string, unsigned flags, const string &default_prefix) *** xapian-core-1.3.0_svn/queryparser/queryparser.lemony 2012-01-26 20:51:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/queryparser.lemony 2012-03-30 13:46:27.000000000 +0800 *************** *** 166,171 **** --- 166,174 ---- string unstemmed; QueryParser::stem_strategy stem; termpos pos; + #ifdef HAVE_SCWS + vector multi; + #endif Term(const string &name_, termpos pos_) : name(name_), stem(QueryParser::STEM_NONE), pos(pos_) { } Term(const string &name_) : name(name_), stem(QueryParser::STEM_NONE), pos(0) { } *************** *** 501,513 **** vector prefix_cjk; const list & prefixes = prefix_info->prefixes; list::const_iterator piter; ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) { for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { string cjk = *piter; cjk += *tk; prefix_cjk.push_back(Query(cjk, 1, pos)); } } Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); delete this; return q; --- 504,546 ---- vector prefix_cjk; const list & prefixes = prefix_info->prefixes; list::const_iterator piter; ! /* hightman.20111223: used CJKTERM for multi segment */ ! #ifdef HAVE_SCWS ! for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { ! Query org = Query(*piter + name, 1, pos); ! termpos mpos = pos + 88; ! ! /* hightman.20120104: get synonyms */ ! if (state->flags & QueryParser::FLAG_AUTO_SYNONYMS) { ! Xapian::Database db = state->get_database(); ! Xapian::TermIterator syn = db.synonyms_begin(name); ! Xapian::TermIterator end = db.synonyms_end(name); ! while (syn != end) { ! org = Query(Query::OP_SYNONYM, org, Query(*piter + *syn, 1, mpos++)); ! ++syn; ! } ! } ! if (!multi.empty()) { ! vector::const_iterator mi; ! vector multi_cjk; ! for (mi = multi.begin(); mi != multi.end(); ++mi) { ! // hightman: force to sort behind for get_terms() ! multi_cjk.push_back(Query(*piter + *mi, 1, mpos++)); ! } ! Query syn = Query(state->default_op(), multi_cjk.begin(), multi_cjk.end()); ! org = Query(Query::OP_SYNONYM, org, syn); ! } ! prefix_cjk.push_back(org); ! } ! #else ! for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) { for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { string cjk = *piter; cjk += *tk; prefix_cjk.push_back(Query(cjk, 1, pos)); } } + #endif /* HAVE_SCWS */ Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); delete this; return q; *************** *** 618,629 **** --- 651,728 ---- } } + /// hightman.20110701: load libscws + #ifdef HAVE_SCWS + QueryParser::Internal::~Internal() + { + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } + if (scws != NULL) { + scws_free(scws); + scws = NULL; + } + } + + void + QueryParser::Internal::load_libscws(const char *fpath, bool xmem, int multi) + { + if (scws == NULL) { + string temp; + + scws = scws_new(); + scws_set_charset(scws, "utf8"); + scws_set_ignore(scws, SCWS_NA); + scws_set_duality(scws, SCWS_YEA); + + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini"); + scws_set_rule(scws, temp.data()); + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb"); + scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); + /* hightman.20111209: custom dict support */ + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict_user.txt"); + scws_add_dict(scws, temp.data(), SCWS_XDICT_TXT); + } + if (multi >= 0 && multi < 0x10) + scws_set_multi(scws, (multi<<12)); + } + #endif /* HAVE_SCWS */ + string QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, bool cjk_ngram, bool & is_cjk_term, bool &was_acronym) { string term; + #ifdef HAVE_SCWS + int off = it.raw() - qptr; + while (rcur && (off > rcur->off)) { + rcur = rcur->next; + } + was_acronym = false; + if (rcur == NULL) { + it = end; + term.resize(0); + } else { + // sometimes, auto_duality + word-end single word char will be repeated + // 说明几句 => 说明/几/几句 + if (rcur->next && rcur->next->off == rcur->off && rcur->next->len > rcur->len) + rcur = rcur->next; + + term.append(qptr + rcur->off, rcur->len); + was_acronym = (rcur->attr[0] == 'n' && rcur->attr[1] == 'z') ? true : false; + is_cjk_term = CJK::codepoint_is_cjk(*it); + last_off = off = rcur->off + rcur->len; + rcur = rcur->next; + + // sometimes, auto duality or multisegment + // 几句说搞笑 => 几句/句说/搞笑 + if (rcur && off > rcur->off && (rcur->off + rcur->len) > off) + off = rcur->off; + while ((it.raw() - qptr) < off) it++; + } + #else /* HAVE_SCWS */ // Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E). // Don't worry if there's a trailing '.' or not. if (U_isupper(*it)) { *************** *** 708,713 **** --- 807,813 ---- } } } + #endif /* HAVE_SCWS */ return term; } *************** *** 759,764 **** --- 859,890 ---- ParserHandler pParser(ParseAlloc()); + #ifdef HAVE_SCWS + /// Pre segmentation use scws + scws_res_t res; + + if (!scws) { + load_libscws(NULL, false, 3); + } + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } + qptr = qs.data(); + scws_send_text(scws, qptr, qs.size()); + while ((res = scws_get_result(scws)) != NULL) { + if (rptr == NULL) { + rcur = rptr = res; + } else { + rcur->next = res; + } + while (rcur->next != NULL) { + rcur = rcur->next; + } + } + rcur = rptr; + #endif /* HAVE_SCWS */ + unsigned newprev = ' '; main_lex_loop: enum { *************** *** 1162,1167 **** --- 1288,1298 ---- if (!stemmer.internal.get()) { // No stemmer is set. stem_term = STEM_NONE; + #ifdef HAVE_SCWS + } else if (is_cjk_term) { + // Don't stem CJK terms. + stem_term = STEM_NONE; + #endif } else if (stem_term == STEM_SOME) { if (!should_stem(unstemmed_term) || (it != end && is_stem_preventer(*it))) { *************** *** 1175,1180 **** --- 1306,1322 ---- unstemmed_term, stem_term, term_pos++); if (is_cjk_term) { + #ifdef HAVE_SCWS + /* multi scws handler */ + term_obj->multi.clear(); + while (rcur && (rcur->off + rcur->len) <= last_off) { + if (rcur->len > 3) + term_obj->multi.push_back(string(qptr + rcur->off, rcur->len)); + rcur = rcur->next; + } + if (mode == IN_GROUP || mode == IN_GROUP2) + mode = DEFAULT; + #endif Parse(pParser, CJKTERM, term_obj, &state); if (it == end) break; continue; *************** *** 1305,1310 **** --- 1447,1459 ---- } } done: + #ifdef HAVE_SCWS + /// Free all segmented terms/words + if (rptr != NULL) { + scws_free_result(rptr); + rptr = NULL; + } + #endif if (!state.error) { // Implicitly close any unclosed quotes... if (mode == IN_QUOTES || mode == IN_PREFIXED_QUOTES) *************** *** 1656,1661 **** --- 1805,1815 ---- void Term::as_positional_cjk_term(Terms * terms) const { + #ifdef HAVE_SCWS + // Add SCWS term only + Term * c = new Term(state, name, prefix_info, unstemmed, stem, pos); + terms->add_positional_term(c); + #else // Add each individual CJK character to the phrase. string t; for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) { *************** *** 1664,1669 **** --- 1818,1824 ---- terms->add_positional_term(c); t.resize(0); } + #endif /* HAVE_SCWS */ // FIXME: we want to add the n-grams as filters too for efficiency. *** xapian-core-1.3.0_svn/queryparser/termgenerator.cc 2011-07-03 20:31:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/termgenerator.cc 2012-03-30 12:33:15.000000000 +0800 *************** *** 74,79 **** --- 74,90 ---- internal->db = db; } + #if 1 /* HAVE_SCWS */ + /// hightman.20070701: load the specified dict file for scws + void + TermGenerator::load_libscws(const char *fpath, bool xmem, int multi) + { + #ifdef HAVE_SCWS + internal->load_libscws(fpath, xmem, multi); + #endif + } + #endif + TermGenerator::flags TermGenerator::set_flags(flags toggle, flags mask) { *** xapian-core-1.3.0_svn/queryparser/termgenerator_internal.cc 2011-08-24 20:51:02.000000000 +0800 --- xapian-core-1.3.0_scws/queryparser/termgenerator_internal.cc 2012-03-30 13:51:10.000000000 +0800 *************** *** 125,130 **** --- 125,164 ---- #define STOPWORDS_IGNORE 1 #define STOPWORDS_INDEX_UNSTEMMED_ONLY 2 + /// hightman.20070701: load libscws + #ifdef HAVE_SCWS + TermGenerator::Internal::~Internal() + { + if (scws != NULL) { + scws_free(scws); + scws = NULL; + } + } + + void + TermGenerator::Internal::load_libscws(const char *fpath, bool xmem, int multi) + { + if (scws == NULL) { + string temp; + + scws = scws_new(); + scws_set_charset(scws, "utf8"); + scws_set_ignore(scws, SCWS_NA); + scws_set_duality(scws, SCWS_YEA); + + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/rules.utf8.ini"); + scws_set_rule(scws, temp.data()); + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict.utf8.xdb"); + scws_set_dict(scws, temp.data(), xmem == true ? SCWS_XDICT_MEM : SCWS_XDICT_XDB); + /* hightman.20111209: custom dict support */ + temp = string(fpath ? fpath : SCWS_ETCDIR) + string("/dict_user.txt"); + scws_add_dict(scws, temp.data(), SCWS_XDICT_TXT); + } + if (multi >= 0 && multi < 0x10) + scws_set_multi(scws, (multi<<12)); + } + #endif + void TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, const string & prefix, bool with_positions) *************** *** 135,140 **** --- 169,205 ---- if (!stopper) stop_mode = STOPWORDS_NONE; + #ifdef HAVE_SCWS + int last_endpos = 0, last_off = 0; + scws_res_t res, cur; + Utf8Iterator iterm; + const char *text = itor.raw(); + + if (!scws) load_libscws(NULL, false, 3); + scws_send_text(scws, text, itor.left()); + while ((res = cur = scws_get_result(scws)) != NULL) { while (cur != NULL) { + string term; + + iterm.assign(text + cur->off, cur->len); + if (!Unicode::is_wordchar(*iterm)) { + cur = cur->next; + continue; + } + term = Unicode::tolower(string(text + cur->off, cur->len)); + if (with_positions) { + /// for part word(short, duality) + if ((cur->off + cur->len) <= last_endpos) + --termpos; + else { + /// for dualities' first single word + if (cur->off == last_off) + --termpos; + last_endpos = cur->off + cur->len; + } + } + last_off = cur->off; + cur = cur->next; + #else while (true) { // Advance to the start of the next term. unsigned ch; *************** *** 254,259 **** --- 319,325 ---- } endofterm: + #endif /* HAVE_SCWS */ if (term.size() > MAX_PROB_TERM_LENGTH) continue; if (stop_mode == STOPWORDS_IGNORE && (*stopper)(term)) continue; *************** *** 263,268 **** --- 329,338 ---- } else { doc.add_term(prefix + term, wdf_inc); } + #ifdef HAVE_SCWS + /// hightman: Term start with CJK character needn't spell & stem + if (CJK::codepoint_is_cjk(*iterm)) continue; + #endif if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(term); if (!stemmer.internal.get()) continue; *************** *** 280,285 **** --- 350,358 ---- stem += stemmer(term); doc.add_term(stem, wdf_inc); } + #ifdef HAVE_SCWS + scws_free_result(res); } + #endif } }