Crawl Tutorial
From CCIL
Revision as of 11:40, 18 May 2017 by Atanas.ilchev (Talk | contribs) (Created page with "TBA == The application context == <pre> context.title = Simple crawl tutorial context.description = Introducing the very basics of CCIL - simple crawl context.author = Atanas...")
TBA
The application context
# Application context for the "Simple crawl tutorial" view.
#
# Layout, as far as this file itself shows:
#   * context.<name>.pipeline = A,B,C   names a pipeline as a comma-separated
#     list of component names.
#   * NAME = <type> declares a component; NAME.<key> = <value> sets one of its
#     options.
#   * Values written as @key appear to refer to the bare "key = value" settings
#     at the end of the file (e.g. @xfer -> xfer, @cstr -> cstr) - TODO confirm
#     against the loader; the exact semantics of ${@key} are not visible here.
#
# One property per line: the format is line-oriented, so a '#' only comments
# out the line it starts.

context.title = Simple crawl tutorial
context.description = Introducing the very basics of CCIL - simple crawl
context.author = Atanas Ilchev
context.comment = You can clone this view to generate custom views.
context.thumbnail = media/logo.png

# collection
context.step1.pipeline = COL_START,COL_EVENT,COL_POPULATE,COL_END
context.step2.pipeline = INITLANG,CRAWL_INDEX,BUILDSV
context.step3.pipeline = LIST,PRINT
context.step4.pipeline = SPLIT,STEM,JOIN,SEARCH

# old
context.collect.pipeline = CORPUS_ENUM,CORPUS_BLIST,CORPUS_INDEX
context.crawl.pipeline = CRAWL_INIT,CRAWL
context.process.pipeline = BUILDSV
context.docsim.pipeline = FIX_SPACE,TO_V,V_SEARCH,PRINT
context.keywords.pipeline = FIX_SPACE,TO_V,KW-SEARCH,PRINT
context.list.pipeline = LIST,PRINT

# DB-related ------
context.dbpopulate.pipeline = VDB_ENUM
context.dblinks.pipeline = TEMS_ENUM_LINKS
context.dbclear.pipeline = CLEAN_TABLES

# golden corpus
context.golden.pipeline = CORPUS_ENUM,CORPUS_BLIST,CORPUS_INDEX,CORPUS_BUILDSV
context.filter.pipeline = FILTER_GSIM,FILTER_LANG,FILTER_LUCENE,FILTER_CONTENT,FILTER_TITLE
context.fgsim.pipeline = FILTER_GSIM
context.flang.pipeline = FILTER_LANG
context.flucene.pipeline = FILTER_LUCENE
context.fcontent.pipeline = FILTER_CONTENT
# NOTE(review): this entry was garbled in the page export ("context.ftitle=default");
# restored to match the four single-filter pipelines above - confirm.
context.ftitle.pipeline = FILTER_TITLE

# statistics
context.stats.pipeline = STATS_BUILD

# nierika
context.tfidf.pipeline = LUCENE_TFIDF,RDB_EXEC_QUERY_GROUP_KEYWORDS
context.sentences.pipeline = START_LUCENE,RDB_SENTENCES,RDB_ENUM,END_LUCENE
context.nierikasv.pipeline = NIE_BUILDSV
context.similarity.pipeline = SV_ENUM
context.clustering.pipeline = CLUSTERING

# WHIP -----------
context.search_a.pipeline = SEARCH_A_WEB
whip.search.azure = search_a

SEARCH_A_WEB = search-bing
SEARCH_A_WEB.appid = @appid
SEARCH_A_WEB.input = @cquery
SEARCH_A_WEB.output = @xfer
SEARCH_A_WEB.limit = @search_result_size
SEARCH_A_WEB.size = 50

# NIERIKA ------------------
START_LUCENE = lucene-start
START_LUCENE.lucene = collections/${@col_id}/nierika/@lucene
START_LUCENE.language = @language

RDB_SENTENCES = collect-rdb
RDB_SENTENCES.cstr = @cstr
RDB_SENTENCES.query = select sentence_id, sentence from sentences join documents on sentences.doc_id = documents.doc_id where col_id = ${@col_id}
RDB_SENTENCES.user = @user
RDB_SENTENCES.pass = @pass
RDB_SENTENCES.output = @xfer

RDB_ENUM = enum-rdb
RDB_ENUM.input = @xfer
RDB_ENUM.output = @xfer
RDB_ENUM.process = RDBRESULT,NIE_SPLIT,IGNORELIST,NIE_JOIN,LUCENE_ADD

RDBRESULT = field-rdb-result
RDBRESULT.input = @xfer
RDBRESULT.output = @xfer

NIE_SPLIT = split
NIE_SPLIT.mode = split
NIE_SPLIT.input = @xfer
NIE_SPLIT.output = @xfer

IGNORELIST = ignorelist
IGNORELIST.language = @language
IGNORELIST.input = @xfer
IGNORELIST.output = @xfer

NIE_JOIN = split
NIE_JOIN.mode = join
NIE_JOIN.input = @xfer
NIE_JOIN.output = @xfer

LUCENE_ADD = lucene-add
LUCENE_ADD.lucene = collections/${@col_id}/nierika/@lucene
LUCENE_ADD.input = @xfer
LUCENE_ADD.docidfield = @docidfield
LUCENE_ADD.contentfield = @contentfield
LUCENE_ADD.commit = 1000

END_LUCENE = lucene-end
END_LUCENE.lucene = collections/${@col_id}/nierika/@lucene

NIE_BUILDSV = index-sv
NIE_BUILDSV.lucene = collections/${@col_id}/nierika/@lucene
NIE_BUILDSV.termfile = collections/${@col_id}/nierika/termvectors.bin
NIE_BUILDSV.docfile = collections/${@col_id}/nierika/docvectors.bin
NIE_BUILDSV.dimension = 1000
NIE_BUILDSV.seedlength = 500
NIE_BUILDSV.maxnonalphabetchars = 10
NIE_BUILDSV.trainingcycles = 10
NIE_BUILDSV.docindexing = inmemory
NIE_BUILDSV.contentsfields = @contentfield
NIE_BUILDSV.docidfield = @docidfield
NIE_BUILDSV.initialtermvectors = 5000

SV_ENUM = enum-sv
SV_ENUM.input = @xfer
SV_ENUM.output = @xfer
SV_ENUM.docidfield = @docidfield
SV_ENUM.file = collections/${@col_id}/nierika/docvectors.bin
SV_ENUM.process = SV_SEARCH,SEARCH_RESULT_ENUM
SV_ENUM.limit = -1

SV_SEARCH = vsearch
SV_SEARCH.input = @xfer
SV_SEARCH.output = @xfer
SV_SEARCH.file = collections/${@col_id}/nierika/docvectors.bin
SV_SEARCH.maxdistance = 50
SV_SEARCH.limit = -1

SEARCH_RESULT_ENUM = enum-search
SEARCH_RESULT_ENUM.input = @xfer
SEARCH_RESULT_ENUM.output = @xfer
SEARCH_RESULT_ENUM.limit = -1
SEARCH_RESULT_ENUM.process = DOCSIM_INSERT

DOCSIM_INSERT = db-insert-docsim
DOCSIM_INSERT.input = @xfer
DOCSIM_INSERT.output = @xfer
DOCSIM_INSERT.cstr = @cstr
DOCSIM_INSERT.user = @user
DOCSIM_INSERT.pass = @pass
DOCSIM_INSERT.docidfield = @docidfield

CLUSTERING = clustering
CLUSTERING.clustnum = 10
CLUSTERING.cycles = 5
CLUSTERING.colid = ${@col_id}

# -------------------------------
STATS_BUILD = stats-build
STATS_BUILD.lucene = collections/${@col_id}/@lucene
STATS_BUILD.colid = ${@col_id}
STATS_BUILD.context = default
STATS_BUILD.contentfield = @contentfield
STATS_BUILD.cstr = @cstr
STATS_BUILD.user = @user
STATS_BUILD.pass = @pass

COL_POPULATE = run
COL_POPULATE.name = crawl,process,stats,tfidf,sentences,nierikasv,similarity,clustering
COL_POPULATE.input = @xfer
COL_POPULATE.output = @xfer

COL_EVENT = col-event
COL_EVENT.cstr = @cstr
COL_EVENT.user = @user
COL_EVENT.pass = @pass
COL_EVENT.text = some test event
COL_EVENT.input = @col_id

COL_START = col-start
COL_START.context = default
COL_START.cstr = @cstr
COL_START.user = @user
COL_START.pass = @pass
COL_START.output = @col_id

COL_END = col-end
COL_END.cstr = @cstr
COL_END.user = @user
COL_END.pass = @pass
COL_END.input = @col_id

CLEAN_TABLES = db-clear
CLEAN_TABLES.names = documents,dictionary,docsim,filter_tags,links,stems,collections
CLEAN_TABLES.cstr = @cstr
CLEAN_TABLES.user = @user
CLEAN_TABLES.pass = @pass

FILTER_TITLE = filter-sql
FILTER_TITLE.cstr = @cstr
FILTER_TITLE.user = @user
FILTER_TITLE.pass = @pass
# NOTE(review): the selector was truncated in the page export
# ("... WHERE length(title=default"). Rebuilt by analogy with
# FILTER_CONTENT.selector; the original length threshold is unknown, "> 0" is a
# placeholder - confirm before use.
FILTER_TITLE.selector = SELECT doc_id FROM documents WHERE length(title) > 0 AND col_id = ${@col_id}
FILTER_TITLE.tag = title
FILTER_TITLE.idfield = doc_id

FILTER_CONTENT = filter-sql
FILTER_CONTENT.cstr = @cstr
FILTER_CONTENT.user = @user
FILTER_CONTENT.pass = @pass
FILTER_CONTENT.selector = SELECT doc_id FROM documents WHERE length(doc_text) > 512 AND col_id = ${@col_id}
FILTER_CONTENT.tag = content
FILTER_CONTENT.idfield = doc_id

FILTER_LUCENE = filter-lucene
FILTER_LUCENE.failsafe = true
FILTER_LUCENE.tag = lucene
FILTER_LUCENE.query = wind OR green OR solar OR energy OR water
FILTER_LUCENE.cstr = @cstr
FILTER_LUCENE.user = @user
FILTER_LUCENE.pass = @pass
FILTER_LUCENE.lucene = collections/${@col_id}/@lucene
FILTER_LUCENE.contentfield = @contentfield
FILTER_LUCENE.docidfield = @docidfield
FILTER_LUCENE.language = @language
FILTER_LUCENE.leadwildcard = false
FILTER_LUCENE.posincrements = false
FILTER_LUCENE.lcexpandedterms = false
FILTER_LUCENE.defaultoperator = or
FILTER_LUCENE.similarity = default
FILTER_LUCENE.threshold = 0.009
FILTER_LUCENE.colid = ${@col_id}

FILTER_LANG = filter-lang
FILTER_LANG.failsafe = true
FILTER_LANG.processor = langdetect
FILTER_LANG.cstr = @cstr
FILTER_LANG.user = @user
FILTER_LANG.pass = @pass
FILTER_LANG.colid = ${@col_id}

FILTER_GSIM = filter-gsim
FILTER_GSIM.file = collections/${@col_id}/docvectors.bin
FILTER_GSIM.golden = corpus_sv/docvectors.bin
FILTER_GSIM.cstr = @cstr
FILTER_GSIM.user = @user
FILTER_GSIM.pass = @pass
FILTER_GSIM.failsafe = true
FILTER_GSIM.tag = gsim
FILTER_GSIM.colid = ${@col_id}

CORPUS_ENUM = enum-dir
CORPUS_ENUM.recurse = true
CORPUS_ENUM.regex = .*
CORPUS_ENUM.root = @corpus_dir
CORPUS_ENUM.process = CORPUS_BAPPEND,PRINT

CORPUS_INDEX = index-files
CORPUS_INDEX.parser = parse-tika
CORPUS_INDEX.docidfield = id
CORPUS_INDEX.contentfield = @contentfield
CORPUS_INDEX.language = @language
CORPUS_INDEX.lucene = @glucene
CORPUS_INDEX.process = NOP
CORPUS_INDEX.idprocess = NOP
CORPUS_INDEX.input = @xfer
CORPUS_INDEX.encoding = utf-8
CORPUS_INDEX.skipempty = true

CORPUS_BAPPEND = append-buffer
CORPUS_BAPPEND.name = @input_list
CORPUS_BAPPEND.input = @xfer

CORPUS_BLIST = list-buffer
CORPUS_BLIST.name = @input_list
CORPUS_BLIST.output = @xfer
CORPUS_BLIST.input = @xfer

CORPUS_BUILDSV = index-sv
CORPUS_BUILDSV.lucene = @glucene
CORPUS_BUILDSV.termfile = collections/${@col_id}/corpus_sv/termvectors.bin
CORPUS_BUILDSV.docfile = collections/${@col_id}/corpus_sv/docvectors.bin
CORPUS_BUILDSV.dimension = 200
CORPUS_BUILDSV.seedlength = 10
CORPUS_BUILDSV.maxnonalphabetchars = 0
CORPUS_BUILDSV.trainingcycles = 0
CORPUS_BUILDSV.docindexing = inmemory
CORPUS_BUILDSV.contentsfields = @contentfield,title
CORPUS_BUILDSV.docidfield = @docidfield
CORPUS_BUILDSV.initialtermvectors = 100

DB_LINKS = populate-db
DB_LINKS.idvar = cvector
DB_LINKS.input = @xfer
DB_LINKS.cstr = @cstr
DB_LINKS.user = @user
DB_LINKS.pass = @pass
DB_LINKS.task = link
DB_LINKS.export = LUCENE_QUERY
DB_LINKS.colid = ${@col_id}
DB_LINKS.failsafe = true

TEMS_ENUM_LINKS = enum-sv
TEMS_ENUM_LINKS.file = collections/${@col_id}/termvectors.bin
TEMS_ENUM_LINKS.limit = 0
TEMS_ENUM_LINKS.process = FORK_CVECTOR,VDB_SEARCH,PRINT,DB_LINKS
TEMS_ENUM_LINKS.docidfield = @docidfield

FORK_CVECTOR = assign
FORK_CVECTOR.input = @xfer
FORK_CVECTOR.output = cvector

# NOTE(review): TEMS_ENUM is declared a second time further down with
# TEMS_ENUM.file pointing at termvectors.bin instead of docvectors.bin. Which
# declaration takes effect depends on the loader - confirm and drop one.
TEMS_ENUM = enum-sv
TEMS_ENUM.file = collections/${@col_id}/docvectors.bin
TEMS_ENUM.limit = 0
TEMS_ENUM.process = KW-SEARCH,PRINT,DB_DOCSIM

LUCENE_QUERY = lucene-query
LUCENE_QUERY.searchfield = id
LUCENE_QUERY.lucene = collections/${@col_id}/@lucene
LUCENE_QUERY.input = @xfer
LUCENE_QUERY.output = @xfer

VDB_SEARCH = vsearch
VDB_SEARCH.file = collections/${@col_id}/docvectors.bin
VDB_SEARCH.output = @xfer
VDB_SEARCH.input = cvector
VDB_SEARCH.limit = 0
VDB_SEARCH.maxdistance = 100

VDB_ENUM = enum-sv
VDB_ENUM.file = collections/${@col_id}/docvectors.bin
VDB_ENUM.limit = 0
VDB_ENUM.process = VDB_SEARCH,DB_DOCSIM
VDB_ENUM.maxdistance = 100
VDB_ENUM.docidfield = @docidfield

TEMS_ENUM = enum-sv
TEMS_ENUM.file = collections/${@col_id}/termvectors.bin
#TEMS_ENUM.file = collections/${@col_id}/docvectors.bin
TEMS_ENUM.limit = 0
TEMS_ENUM.process = KW-SEARCH,PRINT,DB_DOCSIM

DB_DOCSIM = populate-db
DB_DOCSIM.idvar = cvector
DB_DOCSIM.input = @xfer
DB_DOCSIM.cstr = @cstr
DB_DOCSIM.user = @user
DB_DOCSIM.pass = @pass
DB_DOCSIM.task = docsim
DB_DOCSIM.colnames = title,site,tstamp
DB_DOCSIM.colvars = ctitle,csite,cstamp
DB_DOCSIM.process = LUCENE_QUERY,LUCENE_GET_TITLE,LUCENE_GET_SITE,LUCENE_GET_STAMP,LUCENE_GET_CONTENT
DB_DOCSIM.colid = ${@col_id}
DB_DOCSIM.failsafe = true

LUCENE_GET_SITE = getfield
LUCENE_GET_SITE.field = site
LUCENE_GET_SITE.input = @xfer
LUCENE_GET_SITE.output = csite

LUCENE_GET_STAMP = getfield
LUCENE_GET_STAMP.field = tstamp
LUCENE_GET_STAMP.input = @xfer
LUCENE_GET_STAMP.output = cstamp

LUCENE_GET_TITLE = getfield
LUCENE_GET_TITLE.field = title
LUCENE_GET_TITLE.input = @xfer
LUCENE_GET_TITLE.output = ctitle

LUCENE_GET_CONTENT = getfield
LUCENE_GET_CONTENT.field = original_content
LUCENE_GET_CONTENT.input = @xfer
LUCENE_GET_CONTENT.output = @xfer

CRAWL_INIT = copy-dir
CRAWL_INIT.name = @crawl
CRAWL_INIT.target = collections/${@col_id}/@crawl
CRAWL_INIT.failsafe = false
CRAWL_INIT.overwrite = true
# Intentionally empty value.
CRAWL_INIT.skip =

CRAWL_N = nutch-run
CRAWL_N.crawl = collections/${@col_id}/@crawl
CRAWL_N.depth=100
CRAWL_N.topn=10

CRAWL = sourcegreed-run
CRAWL.delay = 500
CRAWL.indexpath = collections/${@col_id}/@lucene
CRAWL.width=2
CRAWL.depth=200
CRAWL.threads=1
CRAWL.collection = ${@col_id}

CRAWL_INDEX = index-nutch
CRAWL_INDEX.docidfield = url
CRAWL_INDEX.contentfield = content
CRAWL_INDEX.language = ${@clang}
CRAWL_INDEX.lucene = collections/${@col_id}/@lucene
CRAWL_INDEX.crawl = collections/${@col_id}/@crawl
CRAWL_INDEX.tokenprocess = LANGDETECT,SPLIT,STEM,JOIN
CRAWL_INDEX.docprocess = BOILERPIPE

INITLANG = assign-str
INITLANG.value = en
INITLANG.output = @clang

LANGDETECT = langdetect
LANGDETECT.default = en
LANGDETECT.failsafe = true
LANGDETECT.input = @xfer
LANGDETECT.output = @clang

# FIXME: not working, to be added between STEM and JOIN in the CRAWL and CRAWL_INDEX stages
CLEAN = filter-ignorelist
CLEAN.language = @language
CLEAN.input = tokens
CLEAN.output = tokens

BOILERPIPE = filter-boilerpipe
BOILERPIPE.input = @xfer
BOILERPIPE.output = @xfer

FIX_SPACE = replace-text
FIX_SPACE.locator = {SPACE}
FIX_SPACE.text = %20
FIX_SPACE.input = search.initialQuery
FIX_SPACE.output = search.initialQuery

BUILDSV = index-sv
BUILDSV.lucene = collections/${@col_id}/@lucene
BUILDSV.termfile = collections/${@col_id}/termvectors.bin
BUILDSV.docfile = collections/${@col_id}/docvectors.bin
BUILDSV.dimension = 200
BUILDSV.seedlength = 10
BUILDSV.maxnonalphabetchars = 0
BUILDSV.trainingcycles = 0
BUILDSV.docindexing = inmemory
BUILDSV.contentsfields = @contentfield,title
BUILDSV.docidfield = @docidfield
BUILDSV.initialtermvectors = 100

TO_V = id-vector
TO_V.file = collections/${@col_id}/docvectors.bin
TO_V.output = @xfer
TO_V.input = search.initialQuery

KW-SEARCH = vsearch
KW-SEARCH.file = collections/${@col_id}/termvectors.bin
KW-SEARCH.output = @xfer
KW-SEARCH.input = @xfer
KW-SEARCH.limit = 10

V_SEARCH = vsearch
V_SEARCH.file = collections/${@col_id}/docvectors.bin
V_SEARCH.output = @xfer
V_SEARCH.input = @xfer
V_SEARCH.limit = 10

LIST = list-sv
LIST.file = collections/${@col_id}/docvectors.bin
LIST.limit = 20
LIST.output = @xfer
LIST.input = @xfer

SEARCH = search-lucene
SEARCH.input = @xfer
SEARCH.output = @xfer
SEARCH.lucene = @lucene
SEARCH.contentfield = @contentfield
SEARCH.docidfield = @docidfield
SEARCH.language = @language
SEARCH.leadwildcard = false
SEARCH.posincrements = false
SEARCH.lcexpandedterms = false
SEARCH.defaultoperator = or
SEARCH.similarity = default

JOIN = split-basic
JOIN.mode = join
JOIN.input = stemmed_tokens
JOIN.output = @xfer

STEM = stem-lucene
STEM.input = tokens
STEM.output = stemmed_tokens
STEM.export = NOP
STEM.language= @language

NOP = nop
NOP.input = @xfer
NOP.output = @xfer

SPLIT = split-basic
SPLIT.mode = split
SPLIT.input = @xfer
SPLIT.output = tokens

PRINT = print
PRINT.input = @xfer
PRINT.output = @xfer

LUCENE_TFIDF = lucene-tfidf
LUCENE_TFIDF.output = @xfer
LUCENE_TFIDF.lucene = collections/${@col_id}/@lucene
LUCENE_TFIDF.docidfield = internalId
LUCENE_TFIDF.contentfield = @contentfield
LUCENE_TFIDF.lettersonly = @lettersonly
LUCENE_TFIDF.cyrilliconly = @cyrilliconly
LUCENE_TFIDF.mintokenlen = @mintokenlen
LUCENE_TFIDF.process = KEYWORDS_INSERT

KEYWORDS_INSERT = db-insert-keywords
KEYWORDS_INSERT.input = @xfer
KEYWORDS_INSERT.output = @xfer
KEYWORDS_INSERT.cstr = @cstr
KEYWORDS_INSERT.user = @user
KEYWORDS_INSERT.pass = @pass
KEYWORDS_INSERT.docidfield = internalId

RDB_EXEC_QUERY_GROUP_KEYWORDS = sql-exec
RDB_EXEC_QUERY_GROUP_KEYWORDS.cstr = @cstr
RDB_EXEC_QUERY_GROUP_KEYWORDS.user = @user
RDB_EXEC_QUERY_GROUP_KEYWORDS.pass = @pass
RDB_EXEC_QUERY_GROUP_KEYWORDS.query = INSERT INTO sentences(doc_id, sentence) SELECT keywords.doc_id, substring_index(GROUP_CONCAT(keywords.word ORDER BY keywords.score DESC SEPARATOR ' '), ' ', 30) AS sentence FROM keywords JOIN documents ON keywords.doc_id = documents.doc_id WHERE col_id = ${@col_id} GROUP BY keywords.doc_id

col_id = col_id
#task = docsim
task = link
cstr = jdbc:mysql://localhost/vssp?useUnicode=yes&characterEncoding=UTF-8&autoReconnect=true
user = vssp
pass = vssp
input_list = input_files
glucene = corpus_lucene
corpus_dir = corpus
xfer = xfer
clang = clang
lucene = lucene
input_list = input_files
language = bg
lettersonly = true
cyrilliconly = false
mintokenlen = 3
contentfield = content
docidfield = id
corpus_dir = corpus
xfer = xfer
crawl = crawl
# NOTE(review): this looks like a real search API key published on a wiki page -
# consider rotating it and documenting a placeholder instead.
appid = SRnW9tkIrMB/aXLqxwB8WHtcIUn/wlNxzwepf8XxvRk
search_query_pause = 500
search_result_size = 100
cquery = cquery