Difference between revisions of "Dictionary"
From CCIL
(→Parsing the text) |
(→Project) |
||
Line 25: | Line 25: | ||
=== Project === | === Project === | ||
TBA | TBA | ||
+ | |||
+ | ==== pom.xml ==== | ||
+ | |||
+ | <pre> | ||
+ | <?xml version="1.0" encoding="UTF-8"?> | ||
+ | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
+ | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||
+ | <modelVersion>4.0.0</modelVersion> | ||
+ | <artifactId>dcmpc-services-dictionary-distribution</artifactId> | ||
+ | <name>dcmpc-services-dictionary-distribution</name> | ||
+ | <url>http://wiki.datacraftmagic.com/display/SFIND/%23Find+Home</url> | ||
+ | <packaging>pom</packaging> | ||
+ | <properties> | ||
+ | <sharpfind.version>1.0.3</sharpfind.version> | ||
+ | </properties> | ||
+ | <parent> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <artifactId>dcmpc-services-dictionary</artifactId> | ||
+ | <version>1.3.7-SNAPSHOT</version> | ||
+ | </parent> | ||
+ | <dependencies> | ||
+ | <!-- App --> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-services-dictionary-stages</artifactId> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <version>${ccil.version}</version> | ||
+ | <artifactId>ccil-parse-tika</artifactId> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <version>${ccil.version}</version> | ||
+ | <artifactId>ccil-process-filter</artifactId> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>ccil-app</artifactId> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>cybercore</groupId> | ||
+ | <version>${cybercore.version}</version> | ||
+ | <artifactId>cybercore-util</artifactId> | ||
+ | </dependency> | ||
+ | <!-- Server --> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-services-dictionary-generic</artifactId> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-v_index-semanticvectors</artifactId> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>dcmpc-clustering</artifactId> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>dcmpc-ninko-stages</artifactId> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>dcmpc-ninko-server</artifactId> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>dcmpc-ninko-api</artifactId> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <artifactId>dcmpc-messanger-server</artifactId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>dcmpc-sql-stages</artifactId> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-k_index-legacy-api</artifactId> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <!-- <dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-k_index-solr</artifactId> <scope>runtime</scope> </dependency> | ||
+ | <dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-k_index-elastic</artifactId> <scope>runtime</scope> </dependency> --> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <artifactId>dcmpc-license-server</artifactId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <artifactId>sharpfind-ui</artifactId> | ||
+ | <version>${sharpfind.version}</version> | ||
+ | <type>war</type> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <artifactId>sharpfind-server</artifactId> | ||
+ | <version>${sharpfind.version}</version> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <artifactId>ccil-common-generic</artifactId> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <version>${ccil.version}</version> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-app</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-index-sv</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-common-split</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-common-sql</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <!-- newly added dependencies --> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-filter-ignorelist</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-collect-rdb</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>mysql</groupId> | ||
+ | <artifactId>mysql-connector-java</artifactId> | ||
+ | <version>5.1.17</version> | ||
+ | </dependency> | ||
+ | <!-- filters --> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-filters-gsim</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-filters-lang</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-filters-lucene</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-search-engines</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <version>${ccil.version}</version> | ||
+ | <artifactId>ccil-services-generic-server</artifactId> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <!-- generator --> | ||
+ | <dependency> | ||
+ | <groupId>net.ccil</groupId> | ||
+ | <artifactId>ccil-generation</artifactId> | ||
+ | <version>${ccil.version}</version> | ||
+ | </dependency> | ||
+ | <!-- distributions --> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-connectors-http</artifactId> | ||
+ | <type>jar</type> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>com.datacraftmagic</groupId> | ||
+ | <version>${dcmpc.version}</version> | ||
+ | <artifactId>dcmpc-connectors-json</artifactId> | ||
+ | <type>war</type> | ||
+ | <scope>runtime</scope> | ||
+ | </dependency> | ||
+ | <dependency> | ||
+ | <groupId>mysql</groupId> | ||
+ | <artifactId>mysql-connector-java</artifactId> | ||
+ | <version>5.1.17</version> | ||
+ | </dependency> | ||
+ | </dependencies> | ||
+ | <build> | ||
+ | <finalName>${project.name}</finalName> | ||
+ | <plugins> | ||
+ | <plugin> | ||
+ | <artifactId>maven-assembly-plugin</artifactId> | ||
+ | <configuration> | ||
+ | <descriptors> | ||
+ | <descriptor>bin.xml</descriptor> | ||
+ | </descriptors> | ||
+ | </configuration> | ||
+ | <executions> | ||
+ | <execution> | ||
+ | <id>make-assembly</id> | ||
+ | <phase>package</phase> | ||
+ | <goals> | ||
+ | <goal>attached</goal> | ||
+ | </goals> | ||
+ | </execution> | ||
+ | </executions> | ||
+ | </plugin> | ||
+ | </plugins> | ||
+ | </build> | ||
+ | |||
+ | <organization> | ||
+ | <name>Data Craft and Magic ltd.</name> | ||
+ | <url>http://datacraftmagic.com/</url> | ||
+ | </organization> | ||
+ | </project> | ||
+ | </pre> | ||
=== Startup script === | === Startup script === |
Revision as of 04:14, 17 May 2017
Contents
Goal
The goal of this tutorial is to create a simple dictionary - a database of words from one or more specific languages. It will be created in a very simple manner: we supply some text to the pipeline (in PDF, TXT or any other popular format), which parses it and inserts each distinct word into a database exactly once.
What do we have to do?
- Parse text which comes in an arbitrary format
- Insert every token from it that satisfies the 'word' criteria into a database, with no duplicates
Setup
First, we need to set up a context. It has a very simple structure; for the purpose of this tutorial we will name it "dictionary":
context
\- apps
   \- dictionary
      |- languages
      |  \- en
      |     \- source.pdf
      \- context.properties
You can use any file in place of source.pdf. It is just an ordinary text downloaded from the Internet. Of course, the more words it contains, the better.
Project
TBA
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Distribution module for the dictionary tutorial.

  Packaging is "pom": the module produces no artifact of its own. It only
  declares the dependencies to bundle and runs maven-assembly-plugin with the
  bin.xml descriptor during the package phase.

  NOTE(review): the properties dcmpc.version, ccil.version and
  cybercore.version are referenced below but not defined in this file;
  presumably they are inherited from the parent POM. Confirm.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<artifactId>dcmpc-services-dictionary-distribution</artifactId>
	<name>dcmpc-services-dictionary-distribution</name>
	<url>http://wiki.datacraftmagic.com/display/SFIND/%23Find+Home</url>
	<packaging>pom</packaging>
	<properties>
		<!-- Version shared by the sharpfind-ui and sharpfind-server dependencies. -->
		<sharpfind.version>1.0.3</sharpfind.version>
	</properties>
	<!-- groupId and version of this module are inherited from the parent. -->
	<parent>
		<groupId>com.datacraftmagic</groupId>
		<artifactId>dcmpc-services-dictionary</artifactId>
		<version>1.3.7-SNAPSHOT</version>
	</parent>
	<dependencies>
		<!-- App -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-services-dictionary-stages</artifactId>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<version>${ccil.version}</version>
			<artifactId>ccil-parse-tika</artifactId>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<version>${ccil.version}</version>
			<artifactId>ccil-process-filter</artifactId>
		</dependency>
		<dependency>
			<artifactId>ccil-app</artifactId>
			<groupId>net.ccil</groupId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>cybercore</groupId>
			<version>${cybercore.version}</version>
			<artifactId>cybercore-util</artifactId>
		</dependency>
		<!-- Server -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-services-dictionary-generic</artifactId>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-v_index-semanticvectors</artifactId>
		</dependency>
		<dependency>
			<artifactId>dcmpc-clustering</artifactId>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<artifactId>dcmpc-ninko-stages</artifactId>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<artifactId>dcmpc-ninko-server</artifactId>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<artifactId>dcmpc-ninko-api</artifactId>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-messanger-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<artifactId>dcmpc-sql-stages</artifactId>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-k_index-legacy-api</artifactId>
			<scope>runtime</scope>
		</dependency>
		<!-- <dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
			<artifactId>dcmpc-k_index-solr</artifactId> <scope>runtime</scope> </dependency>
			<dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
			<artifactId>dcmpc-k_index-elastic</artifactId> <scope>runtime</scope> </dependency> -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-license-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-ui</artifactId>
			<version>${sharpfind.version}</version>
			<type>war</type>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-server</artifactId>
			<version>${sharpfind.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<artifactId>ccil-common-generic</artifactId>
			<groupId>net.ccil</groupId>
			<version>${ccil.version}</version>
			<scope>runtime</scope>
		</dependency>
		<!-- net.ccil:ccil-app is already declared in the App group above; the
			second, identical declaration that used to be here was removed because
			Maven requires each dependency key to be unique within a POM. -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-index-sv</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-split</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-sql</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- newly added dependencies -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filter-ignorelist</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-collect-rdb</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- JDBC driver; declared once here (a duplicate at the end of the
			dependency list was removed). -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.17</version>
		</dependency>
		<!-- filters -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-gsim</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lang</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lucene</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-search-engines</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<version>${ccil.version}</version>
			<artifactId>ccil-services-generic-server</artifactId>
			<scope>runtime</scope>
		</dependency>
		<!-- generator -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-generation</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- distributions -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-connectors-http</artifactId>
			<type>jar</type>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<version>${dcmpc.version}</version>
			<artifactId>dcmpc-connectors-json</artifactId>
			<type>war</type>
			<scope>runtime</scope>
		</dependency>
	</dependencies>
	<build>
		<finalName>${project.name}</finalName>
		<plugins>
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptors>
						<!-- Assembly descriptor, resolved relative to this module's base directory. -->
						<descriptor>bin.xml</descriptor>
					</descriptors>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<!-- "single" replaces the deprecated "attached" goal and is the
								goal intended for binding the assembly to a lifecycle phase. -->
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

	<organization>
		<name>Data Craft and Magic ltd.</name>
		<url>http://datacraftmagic.com/</url>
	</organization>
</project>
Startup script
tutorials-dictionary-app.sh
#!/bin/bash
#
# Starts the CCIL console application for the dictionary tutorial.
#
# CCIL_HOME is taken to be the parent of the current working directory, so the
# script has to be started from a direct subdirectory of the installation.
# Any arguments given to this script are passed through to CcilConsoleApp.
#
# All expansions are quoted so that an installation path containing spaces or
# glob characters is passed to java as a single, unmodified argument.

CCIL_HOME="$(dirname "$PWD")"
CCIL_CONTEXT="$CCIL_HOME/context"

echo -------------------
echo "CCIL_HOME = $CCIL_HOME"
echo -------------------

# The classpath wildcards stay inside the quotes on purpose: the JVM, not the
# shell, expands "dir/*" to the jars in that directory.
java -cp "$CCIL_HOME/lib/*:$CCIL_HOME/config:$CCIL_HOME/launcher/*" \
    -Dserver.config.file=tutorials-dictionary-app.ttl \
    -Dserver.home.dir="$CCIL_HOME" \
    -Xmx1024M \
    -Dserver.context.dir="$CCIL_CONTEXT" \
    -Dserver.jmx.enabled=false \
    net.ccil.execution.CcilConsoleApp -execute -root "$CCIL_HOME/context/apps" "$@"
Parsing the text
TBA
Insert into database
TBA
Further steps
TBA