Crawl Tutorial

https://sourceforge.net/p/ccil/code/HEAD/tree/trunk/src/tutorials/files/crawl/

This tutorial introduces the very basics of CCIL through a simple crawl application: the configuration below crawls a set of pages into a collection, indexes them with Lucene, builds a semantic-vector model, and wires up listing, keyword and similarity search pipelines. The complete tutorial files are in the SourceForge tree linked above.
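
The file is a flat properties list. A context.NAME.pipeline entry names the stages to run, in order; each stage is declared as STAGE = processor-type and tuned with STAGE.property = value lines; @name references (and ${@name} inside paths) resolve to the variables defined at the end of the file. A minimal sketch of the pattern, with a hypothetical pipeline name that is not part of this tutorial:

context.hello.pipeline = HELLO
HELLO = print
HELLO.input = @xfer
HELLO.output = @xfer

Running the hello pipeline would simply print whatever is currently in the xfer variable.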

The application context

context.title = Simple crawl tutorial
context.description = Introducing the very basics of CCIL - simple crawl
context.author = Atanas Ilchev
context.comment = You can clone this view to generate custom views.
context.thumbnail = media/logo.png

# collection
context.step1.pipeline = COL_START,COL_EVENT,COL_POPULATE,COL_END
context.step2.pipeline = INITLANG,CRAWL_INDEX,BUILDSV
context.step3.pipeline = LIST,PRINT
context.step4.pipeline = SPLIT,STEM,JOIN,SEARCH
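
# The four steps above are intended to run in order: step1 creates the
# collection, logs an event and runs the populate pipeline; step2 seeds the
# language, indexes the crawl and builds the semantic-vector model; step3
# lists the resulting document vectors; step4 stems a query and searches
# the Lucene index.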

# old
context.collect.pipeline = CORPUS_ENUM,CORPUS_BLIST,CORPUS_INDEX
context.crawl.pipeline = CRAWL_INIT,CRAWL
context.process.pipeline = BUILDSV
context.docsim.pipeline = FIX_SPACE,TO_V,V_SEARCH,PRINT
context.keywords.pipeline = FIX_SPACE,TO_V,KW-SEARCH,PRINT
context.list.pipeline = LIST,PRINT

# DB-related ------
context.dbpopulate.pipeline = VDB_ENUM
context.dblinks.pipeline = TEMS_ENUM_LINKS
context.dbclear.pipeline = CLEAN_TABLES

# golden corpus
context.golden.pipeline = CORPUS_ENUM,CORPUS_BLIST,CORPUS_INDEX,CORPUS_BUILDSV
context.filter.pipeline = FILTER_GSIM,FILTER_LANG,FILTER_LUCENE,FILTER_CONTENT,FILTER_TITLE
context.fgsim.pipeline = FILTER_GSIM
context.flang.pipeline = FILTER_LANG
context.flucene.pipeline = FILTER_LUCENE
context.fcontent.pipeline = FILTER_CONTENT
context.ftitle.pipeline = FILTER_TITLE
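
# The golden pipeline indexes a reference corpus (@corpus_dir) and builds its
# semantic-vector model; FILTER_GSIM later compares crawled documents against
# that model (corpus_sv/docvectors.bin) to tag documents that resemble the
# golden corpus.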

# statistics
context.stats.pipeline = STATS_BUILD

# nierika
context.tfidf.pipeline = LUCENE_TFIDF,RDB_EXEC_QUERY_GROUP_KEYWORDS
context.sentences.pipeline = START_LUCENE,RDB_SENTENCES,RDB_ENUM,END_LUCENE
context.nierikasv.pipeline = NIE_BUILDSV
context.similarity.pipeline = SV_ENUM
context.clustering.pipeline = CLUSTERING

# WHIP	-----------

context.search_a.pipeline = SEARCH_A_WEB
whip.search.azure = search_a

SEARCH_A_WEB = search-bing
SEARCH_A_WEB.appid = @appid
SEARCH_A_WEB.input = @cquery
SEARCH_A_WEB.output = @xfer
SEARCH_A_WEB.limit = @search_result_size
SEARCH_A_WEB.size = 50
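
# SEARCH_A_WEB queries Bing with @cquery using the @appid key; limit caps the
# total number of results (@search_result_size, 100 by default below), while
# size is presumably the page size per request.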

# NIERIKA  ------------------

START_LUCENE = lucene-start
START_LUCENE.lucene = collections/${@col_id}/nierika/@lucene
START_LUCENE.language = @language

RDB_SENTENCES = collect-rdb
RDB_SENTENCES.cstr = @cstr
RDB_SENTENCES.query = select sentence_id, sentence from sentences join documents on sentences.doc_id = documents.doc_id where col_id = ${@col_id}
RDB_SENTENCES.user = @user
RDB_SENTENCES.pass = @pass
RDB_SENTENCES.output = @xfer

RDB_ENUM = enum-rdb
RDB_ENUM.input = @xfer
RDB_ENUM.output = @xfer
RDB_ENUM.process = RDBRESULT,NIE_SPLIT,IGNORELIST,NIE_JOIN,LUCENE_ADD
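
# RDB_ENUM runs its subpipeline once per row fetched by RDB_SENTENCES:
# RDBRESULT extracts the result fields, NIE_SPLIT tokenises the sentence,
# IGNORELIST drops stopwords for @language, NIE_JOIN reassembles the text,
# and LUCENE_ADD indexes it into the nierika Lucene index, committing every
# 1000 documents.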

RDBRESULT = field-rdb-result
RDBRESULT.input = @xfer
RDBRESULT.output = @xfer

NIE_SPLIT = split
NIE_SPLIT.mode = split
NIE_SPLIT.input = @xfer
NIE_SPLIT.output = @xfer

IGNORELIST = ignorelist
IGNORELIST.language = @language
IGNORELIST.input = @xfer
IGNORELIST.output = @xfer

NIE_JOIN = split
NIE_JOIN.mode = join
NIE_JOIN.input = @xfer
NIE_JOIN.output = @xfer

LUCENE_ADD = lucene-add
LUCENE_ADD.lucene = collections/${@col_id}/nierika/@lucene
LUCENE_ADD.input = @xfer
LUCENE_ADD.docidfield = @docidfield
LUCENE_ADD.contentfield = @contentfield
LUCENE_ADD.commit = 1000

END_LUCENE = lucene-end
END_LUCENE.lucene = collections/${@col_id}/nierika/@lucene

NIE_BUILDSV = index-sv
NIE_BUILDSV.lucene = collections/${@col_id}/nierika/@lucene
NIE_BUILDSV.termfile = collections/${@col_id}/nierika/termvectors.bin
NIE_BUILDSV.docfile = collections/${@col_id}/nierika/docvectors.bin
NIE_BUILDSV.dimension = 1000
NIE_BUILDSV.seedlength = 500
NIE_BUILDSV.maxnonalphabetchars = 10
NIE_BUILDSV.trainingcycles = 10
NIE_BUILDSV.docindexing = inmemory
NIE_BUILDSV.contentsfields = @contentfield
NIE_BUILDSV.docidfield = @docidfield
NIE_BUILDSV.initialtermvectors = 5000
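
# These settings appear to map onto the Semantic Vectors random-indexing
# flags: 1000-dimensional vectors with seed length 500, ten training cycles,
# in-memory document indexing, and terms with more than 10 non-alphabetic
# characters skipped.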

SV_ENUM = enum-sv
SV_ENUM.input = @xfer
SV_ENUM.output = @xfer
SV_ENUM.docidfield = @docidfield
SV_ENUM.file = collections/${@col_id}/nierika/docvectors.bin
SV_ENUM.process = SV_SEARCH,SEARCH_RESULT_ENUM
SV_ENUM.limit = -1

SV_SEARCH = vsearch
SV_SEARCH.input = @xfer
SV_SEARCH.output = @xfer
SV_SEARCH.file = collections/${@col_id}/nierika/docvectors.bin
SV_SEARCH.maxdistance = 50
SV_SEARCH.limit = -1

SEARCH_RESULT_ENUM = enum-search
SEARCH_RESULT_ENUM.input = @xfer
SEARCH_RESULT_ENUM.output = @xfer
SEARCH_RESULT_ENUM.limit = -1
SEARCH_RESULT_ENUM.process = DOCSIM_INSERT

DOCSIM_INSERT = db-insert-docsim
DOCSIM_INSERT.input = @xfer
DOCSIM_INSERT.output = @xfer
DOCSIM_INSERT.cstr = @cstr
DOCSIM_INSERT.user = @user
DOCSIM_INSERT.pass = @pass
DOCSIM_INSERT.docidfield = @docidfield
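
# The similarity pipeline enumerates every document vector (SV_ENUM), finds
# its neighbours within maxdistance 50 (SV_SEARCH), walks the hit list
# (SEARCH_RESULT_ENUM) and stores each document pair in the database via
# DOCSIM_INSERT.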

CLUSTERING = clustering
CLUSTERING.clustnum = 10
CLUSTERING.cycles = 5
CLUSTERING.colid = ${@col_id}

# -------------------------------

STATS_BUILD = stats-build
STATS_BUILD.lucene = collections/${@col_id}/@lucene
STATS_BUILD.colid = ${@col_id}
STATS_BUILD.context = default
STATS_BUILD.contentfield = @contentfield
STATS_BUILD.cstr = @cstr
STATS_BUILD.user = @user
STATS_BUILD.pass = @pass

COL_POPULATE = run
COL_POPULATE.name = crawl,process,stats,tfidf,sentences,nierikasv,similarity,clustering
COL_POPULATE.input = @xfer
COL_POPULATE.output = @xfer
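
# The run processor executes the named context pipelines in turn, so
# COL_POPULATE chains the crawl, indexing, statistics, keyword, sentence,
# semantic-vector, similarity and clustering pipelines defined elsewhere in
# this file.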

COL_EVENT = col-event
COL_EVENT.cstr = @cstr
COL_EVENT.user = @user
COL_EVENT.pass = @pass
COL_EVENT.text = some test event
COL_EVENT.input = @col_id

COL_START = col-start
COL_START.context = default
COL_START.cstr = @cstr
COL_START.user = @user
COL_START.pass = @pass
COL_START.output = @col_id

COL_END = col-end
COL_END.cstr = @cstr
COL_END.user = @user
COL_END.pass = @pass
COL_END.input = @col_id

CLEAN_TABLES = db-clear
CLEAN_TABLES.names = documents,dictionary,docsim,filter_tags,links,stems,collections
CLEAN_TABLES.cstr = @cstr
CLEAN_TABLES.user = @user
CLEAN_TABLES.pass = @pass
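
# dbclear is destructive: it clears every table listed above, so run it only
# to reset the database.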

FILTER_TITLE = filter-sql
FILTER_TITLE.cstr = @cstr
FILTER_TITLE.user = @user
FILTER_TITLE.pass = @pass
# NOTE: the original selector was truncated; the length threshold and the
# col_id clause below are assumptions by analogy with FILTER_CONTENT.
FILTER_TITLE.selector = SELECT doc_id FROM documents WHERE length(title) > 0 AND col_id = ${@col_id}
FILTER_TITLE.tag = title
FILTER_TITLE.idfield = doc_id

FILTER_CONTENT = filter-sql
FILTER_CONTENT.cstr = @cstr
FILTER_CONTENT.user = @user
FILTER_CONTENT.pass = @pass
FILTER_CONTENT.selector = SELECT doc_id FROM documents WHERE length(doc_text) > 512 AND col_id = ${@col_id}
FILTER_CONTENT.tag = content
FILTER_CONTENT.idfield = doc_id

FILTER_LUCENE = filter-lucene
FILTER_LUCENE.failsafe = true
FILTER_LUCENE.tag = lucene
FILTER_LUCENE.query = wind OR green OR solar OR energy OR water
FILTER_LUCENE.cstr = @cstr
FILTER_LUCENE.user = @user
FILTER_LUCENE.pass = @pass
FILTER_LUCENE.lucene = collections/${@col_id}/@lucene
FILTER_LUCENE.contentfield = @contentfield
FILTER_LUCENE.docidfield = @docidfield
FILTER_LUCENE.language = @language
FILTER_LUCENE.leadwildcard = false
FILTER_LUCENE.posincrements = false
FILTER_LUCENE.lcexpandedterms = false
FILTER_LUCENE.defaultoperator = or
FILTER_LUCENE.similarity = default
FILTER_LUCENE.threshold = 0.009
FILTER_LUCENE.colid = ${@col_id}

FILTER_LANG = filter-lang
FILTER_LANG.failsafe = true
FILTER_LANG.processor = langdetect
FILTER_LANG.cstr = @cstr
FILTER_LANG.user = @user
FILTER_LANG.pass = @pass
FILTER_LANG.colid = ${@col_id}

FILTER_GSIM = filter-gsim
FILTER_GSIM.file = collections/${@col_id}/docvectors.bin
FILTER_GSIM.golden = corpus_sv/docvectors.bin
FILTER_GSIM.cstr = @cstr
FILTER_GSIM.user = @user
FILTER_GSIM.pass = @pass
FILTER_GSIM.failsafe = true
FILTER_GSIM.tag = gsim
FILTER_GSIM.colid = ${@col_id}
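
# The filters appear to tag matching documents (see the filter_tags table in
# CLEAN_TABLES) rather than remove them: gsim marks documents similar to the
# golden corpus, lang marks documents in the detected language, lucene marks
# documents scoring above 0.009 on the energy-related query, content marks
# documents longer than 512 characters, and title applies the title check.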

CORPUS_ENUM = enum-dir
CORPUS_ENUM.recurse = true
CORPUS_ENUM.regex = .*
CORPUS_ENUM.root = @corpus_dir
CORPUS_ENUM.process = CORPUS_BAPPEND,PRINT

CORPUS_INDEX = index-files
CORPUS_INDEX.parser = parse-tika
CORPUS_INDEX.docidfield = id
CORPUS_INDEX.contentfield = @contentfield
CORPUS_INDEX.language = @language
CORPUS_INDEX.lucene = @glucene
CORPUS_INDEX.process = NOP
CORPUS_INDEX.idprocess = NOP
CORPUS_INDEX.input = @xfer
CORPUS_INDEX.encoding = utf-8
CORPUS_INDEX.skipempty = true

CORPUS_BAPPEND = append-buffer
CORPUS_BAPPEND.name = @input_list
CORPUS_BAPPEND.input = @xfer

CORPUS_BLIST = list-buffer
CORPUS_BLIST.name = @input_list
CORPUS_BLIST.output = @xfer
CORPUS_BLIST.input = @xfer

CORPUS_BUILDSV = index-sv
CORPUS_BUILDSV.lucene = @glucene
CORPUS_BUILDSV.termfile = collections/${@col_id}/corpus_sv/termvectors.bin
CORPUS_BUILDSV.docfile = collections/${@col_id}/corpus_sv/docvectors.bin
CORPUS_BUILDSV.dimension = 200
CORPUS_BUILDSV.seedlength = 10
CORPUS_BUILDSV.maxnonalphabetchars = 0
CORPUS_BUILDSV.trainingcycles = 0
CORPUS_BUILDSV.docindexing = inmemory
CORPUS_BUILDSV.contentsfields = @contentfield,title
CORPUS_BUILDSV.docidfield = @docidfield
CORPUS_BUILDSV.initialtermvectors = 100


DB_LINKS = populate-db
DB_LINKS.idvar = cvector
DB_LINKS.input = @xfer
DB_LINKS.cstr = @cstr
DB_LINKS.user = @user
DB_LINKS.pass = @pass
DB_LINKS.task = link
DB_LINKS.export = LUCENE_QUERY
DB_LINKS.colid = ${@col_id}
DB_LINKS.failsafe = true

TEMS_ENUM_LINKS = enum-sv
TEMS_ENUM_LINKS.file = collections/${@col_id}/termvectors.bin
TEMS_ENUM_LINKS.limit = 0 
TEMS_ENUM_LINKS.process = FORK_CVECTOR,VDB_SEARCH,PRINT,DB_LINKS
TEMS_ENUM_LINKS.docidfield = @docidfield

FORK_CVECTOR = assign
FORK_CVECTOR.input = @xfer
FORK_CVECTOR.output = cvector
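
# dblinks iterates the term vectors: FORK_CVECTOR saves the current vector in
# cvector, VDB_SEARCH finds the documents nearest to it, and DB_LINKS writes
# the term-to-document links (task = link), resolving each document through
# LUCENE_QUERY.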

LUCENE_QUERY = lucene-query
LUCENE_QUERY.searchfield = id
LUCENE_QUERY.lucene = collections/${@col_id}/@lucene
LUCENE_QUERY.input = @xfer
LUCENE_QUERY.output = @xfer

VDB_SEARCH = vsearch
VDB_SEARCH.file = collections/${@col_id}/docvectors.bin
VDB_SEARCH.output = @xfer
VDB_SEARCH.input = cvector
VDB_SEARCH.limit = 0
VDB_SEARCH.maxdistance = 100

VDB_ENUM = enum-sv
VDB_ENUM.file = collections/${@col_id}/docvectors.bin 
VDB_ENUM.limit = 0
VDB_ENUM.process = VDB_SEARCH,DB_DOCSIM
VDB_ENUM.maxdistance = 100
VDB_ENUM.docidfield = @docidfield
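
# dbpopulate enumerates every document vector, searches for neighbours within
# maxdistance 100 (VDB_SEARCH) and stores each pair through DB_DOCSIM, which
# also pulls the hit's title, site and timestamp out of the Lucene index.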

TEMS_ENUM = enum-sv
TEMS_ENUM.file = collections/${@col_id}/termvectors.bin
#TEMS_ENUM.file = collections/${@col_id}/docvectors.bin 
TEMS_ENUM.limit = 0 
TEMS_ENUM.process = KW-SEARCH,PRINT,DB_DOCSIM

DB_DOCSIM = populate-db
DB_DOCSIM.idvar = cvector
DB_DOCSIM.input = @xfer
DB_DOCSIM.cstr = @cstr
DB_DOCSIM.user = @user
DB_DOCSIM.pass = @pass
DB_DOCSIM.task = docsim
DB_DOCSIM.colnames = title,site,tstamp
DB_DOCSIM.colvars = ctitle,csite,cstamp
DB_DOCSIM.process = LUCENE_QUERY,LUCENE_GET_TITLE,LUCENE_GET_SITE,LUCENE_GET_STAMP,LUCENE_GET_CONTENT
DB_DOCSIM.colid = ${@col_id}
DB_DOCSIM.failsafe = true

LUCENE_GET_SITE = getfield
LUCENE_GET_SITE.field = site
LUCENE_GET_SITE.input = @xfer
LUCENE_GET_SITE.output = csite

LUCENE_GET_STAMP = getfield
LUCENE_GET_STAMP.field = tstamp
LUCENE_GET_STAMP.input = @xfer
LUCENE_GET_STAMP.output = cstamp

LUCENE_GET_TITLE = getfield
LUCENE_GET_TITLE.field = title
LUCENE_GET_TITLE.input = @xfer
LUCENE_GET_TITLE.output = ctitle

LUCENE_GET_CONTENT = getfield
LUCENE_GET_CONTENT.field = original_content
LUCENE_GET_CONTENT.input = @xfer
LUCENE_GET_CONTENT.output = @xfer

CRAWL_INIT = copy-dir
CRAWL_INIT.name = @crawl
CRAWL_INIT.target = collections/${@col_id}/@crawl
CRAWL_INIT.failsafe = false
CRAWL_INIT.overwrite = true
CRAWL_INIT.skip = 

CRAWL_N = nutch-run
CRAWL_N.crawl = collections/${@col_id}/@crawl
CRAWL_N.depth=100
CRAWL_N.topn=10

CRAWL = sourcegreed-run
CRAWL.delay = 500
CRAWL.indexpath = collections/${@col_id}/@lucene
CRAWL.width=2
CRAWL.depth=200
CRAWL.threads=1
CRAWL.collection = ${@col_id}

CRAWL_INDEX = index-nutch
CRAWL_INDEX.docidfield = url
CRAWL_INDEX.contentfield = content
CRAWL_INDEX.language = ${@clang}
CRAWL_INDEX.lucene = collections/${@col_id}/@lucene
CRAWL_INDEX.crawl = collections/${@col_id}/@crawl
CRAWL_INDEX.tokenprocess = LANGDETECT,SPLIT,STEM,JOIN
CRAWL_INDEX.docprocess = BOILERPIPE
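
# CRAWL_INDEX feeds each fetched page through BOILERPIPE to strip page
# furniture, then runs the token chain: LANGDETECT picks the language, SPLIT
# tokenises, STEM stems for that language, and JOIN reassembles the text
# before indexing.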

INITLANG = assign-str
INITLANG.value = en
INITLANG.output = @clang

LANGDETECT = langdetect
LANGDETECT.default = en
LANGDETECT.failsafe = true
LANGDETECT.input = @xfer
LANGDETECT.output = @clang
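
# INITLANG seeds @clang with "en"; LANGDETECT then overrides it per document,
# and failsafe keeps indexing with the default when detection fails.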

# FIXME: not working, to be added between STEM and JOIN in the CRAWL and CRAWL_INDEX stages
CLEAN = filter-ignorelist
CLEAN.language = @language
CLEAN.input = tokens
CLEAN.output = tokens

BOILERPIPE = filter-boilerpipe
BOILERPIPE.input = @xfer
BOILERPIPE.output = @xfer

FIX_SPACE = replace-text
FIX_SPACE.locator = {SPACE}
FIX_SPACE.text = %20
FIX_SPACE.input = search.initialQuery
FIX_SPACE.output = search.initialQuery

BUILDSV = index-sv
BUILDSV.lucene = collections/${@col_id}/@lucene
BUILDSV.termfile = collections/${@col_id}/termvectors.bin
BUILDSV.docfile = collections/${@col_id}/docvectors.bin
BUILDSV.dimension = 200
BUILDSV.seedlength = 10
BUILDSV.maxnonalphabetchars = 0
BUILDSV.trainingcycles = 0
BUILDSV.docindexing = inmemory
BUILDSV.contentsfields = @contentfield,title
BUILDSV.docidfield = @docidfield
BUILDSV.initialtermvectors = 100

TO_V = id-vector
TO_V.file = collections/${@col_id}/docvectors.bin
TO_V.output = @xfer
TO_V.input = search.initialQuery

KW-SEARCH = vsearch
KW-SEARCH.file = collections/${@col_id}/termvectors.bin
KW-SEARCH.output = @xfer
KW-SEARCH.input = @xfer
KW-SEARCH.limit = 10

V_SEARCH = vsearch
V_SEARCH.file = collections/${@col_id}/docvectors.bin
V_SEARCH.output = @xfer
V_SEARCH.input = @xfer
V_SEARCH.limit = 10

LIST = list-sv
LIST.file = collections/${@col_id}/docvectors.bin
LIST.limit = 20
LIST.output = @xfer
LIST.input = @xfer

SEARCH = search-lucene
SEARCH.input = @xfer
SEARCH.output = @xfer
SEARCH.lucene = @lucene
SEARCH.contentfield = @contentfield
SEARCH.docidfield = @docidfield
SEARCH.language = @language
SEARCH.leadwildcard = false
SEARCH.posincrements = false
SEARCH.lcexpandedterms = false
SEARCH.defaultoperator = or
SEARCH.similarity = default

JOIN = split-basic
JOIN.mode = join
JOIN.input = stemmed_tokens
JOIN.output = @xfer

STEM = stem-lucene
STEM.input = tokens
STEM.output = stemmed_tokens
STEM.export = NOP
STEM.language= @language

NOP = nop
NOP.input = @xfer
NOP.output = @xfer

SPLIT = split-basic
SPLIT.mode = split
SPLIT.input = @xfer
SPLIT.output = tokens
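
# In the step4 pipeline the query flows through SPLIT, STEM, JOIN and SEARCH:
# it is tokenised into tokens, stemmed into stemmed_tokens, joined back into
# @xfer, and finally matched against the Lucene index.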

PRINT = print
PRINT.input = @xfer
PRINT.output = @xfer

LUCENE_TFIDF = lucene-tfidf
LUCENE_TFIDF.output = @xfer
LUCENE_TFIDF.lucene = collections/${@col_id}/@lucene
LUCENE_TFIDF.docidfield = internalId
LUCENE_TFIDF.contentfield = @contentfield
LUCENE_TFIDF.lettersonly = @lettersonly
LUCENE_TFIDF.cyrilliconly = @cyrilliconly
LUCENE_TFIDF.mintokenlen = @mintokenlen
LUCENE_TFIDF.process = KEYWORDS_INSERT

KEYWORDS_INSERT = db-insert-keywords
KEYWORDS_INSERT.input = @xfer
KEYWORDS_INSERT.output = @xfer
KEYWORDS_INSERT.cstr = @cstr
KEYWORDS_INSERT.user = @user
KEYWORDS_INSERT.pass = @pass
KEYWORDS_INSERT.docidfield = internalId

RDB_EXEC_QUERY_GROUP_KEYWORDS = sql-exec
RDB_EXEC_QUERY_GROUP_KEYWORDS.cstr = @cstr
RDB_EXEC_QUERY_GROUP_KEYWORDS.user = @user
RDB_EXEC_QUERY_GROUP_KEYWORDS.pass = @pass
RDB_EXEC_QUERY_GROUP_KEYWORDS.query = INSERT INTO sentences(doc_id, sentence) SELECT keywords.doc_id, substring_index(GROUP_CONCAT(keywords.word ORDER BY keywords.score DESC SEPARATOR ' '), ' ', 30) AS sentence FROM keywords JOIN documents ON keywords.doc_id = documents.doc_id WHERE col_id = ${@col_id} GROUP BY keywords.doc_id
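
# The query above condenses each document's keywords into a pseudo-sentence:
# it concatenates the top 30 keywords by score (GROUP_CONCAT ordered by score,
# truncated with substring_index) and inserts one row per document into
# sentences, which the sentences pipeline then indexes.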

col_id = col_id

#task = docsim
task = link
cstr = jdbc:mysql://localhost/vssp?useUnicode=yes&characterEncoding=UTF-8&autoReconnect=true
user = vssp
pass = vssp

input_list = input_files
glucene = corpus_lucene
corpus_dir = corpus
xfer = xfer
clang = clang
lucene = lucene
language = bg
lettersonly = true
cyrilliconly = false
mintokenlen = 3
contentfield = content
docidfield = id
crawl = crawl

appid = SRnW9tkIrMB/aXLqxwB8WHtcIUn/wlNxzwepf8XxvRk
search_query_pause = 500
search_result_size = 100
cquery = cquery