Difference between revisions of "Dictionary"

From CCIL
Jump to: navigation, search
(Parsing the text)
(Project)
Line 25: Line 25:
 
=== Project ===
 
=== Project ===
 
TBA
 
TBA
 +
 +
==== pom.xml ====
 +
 +
<pre>
 +
<?xml version="1.0" encoding="UTF-8"?>
 +
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 +
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 +
<modelVersion>4.0.0</modelVersion>
 +
<artifactId>dcmpc-services-dictionary-distribution</artifactId>
 +
<name>dcmpc-services-dictionary-distribution</name>
 +
<url>http://wiki.datacraftmagic.com/display/SFIND/%23Find+Home</url>
 +
<packaging>pom</packaging>
 +
<properties>
 +
<sharpfind.version>1.0.3</sharpfind.version>
 +
</properties>
 +
<parent>
 +
<groupId>com.datacraftmagic</groupId>
 +
<artifactId>dcmpc-services-dictionary</artifactId>
 +
<version>1.3.7-SNAPSHOT</version>
 +
</parent>
 +
<dependencies>
 +
<!-- App -->
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-services-dictionary-stages</artifactId>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<version>${ccil.version}</version>
 +
<artifactId>ccil-parse-tika</artifactId>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<version>${ccil.version}</version>
 +
<artifactId>ccil-process-filter</artifactId>
 +
</dependency>
 +
<dependency>
 +
<artifactId>ccil-app</artifactId>
 +
<groupId>net.ccil</groupId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>cybercore</groupId>
 +
<version>${cybercore.version}</version>
 +
<artifactId>cybercore-util</artifactId>
 +
</dependency>
 +
<!-- Server -->
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-services-dictionary-generic</artifactId>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-v_index-semanticvectors</artifactId>
 +
</dependency>
 +
<dependency>
 +
<artifactId>dcmpc-clustering</artifactId>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
</dependency>
 +
<dependency>
 +
<artifactId>dcmpc-ninko-stages</artifactId>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
</dependency>
 +
<dependency>
 +
<artifactId>dcmpc-ninko-server</artifactId>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
</dependency>
 +
<dependency>
 +
<artifactId>dcmpc-ninko-api</artifactId>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<artifactId>dcmpc-messanger-server</artifactId>
 +
<version>${dcmpc.version}</version>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<artifactId>dcmpc-sql-stages</artifactId>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-k_index-legacy-api</artifactId>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<!-- <dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-k_index-solr</artifactId> <scope>runtime</scope> </dependency>
 +
<dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-k_index-elastic</artifactId> <scope>runtime</scope> </dependency> -->
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<artifactId>dcmpc-license-server</artifactId>
 +
<version>${dcmpc.version}</version>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<artifactId>sharpfind-ui</artifactId>
 +
<version>${sharpfind.version}</version>
 +
<type>war</type>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<artifactId>sharpfind-server</artifactId>
 +
<version>${sharpfind.version}</version>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<artifactId>ccil-common-generic</artifactId>
 +
<groupId>net.ccil</groupId>
 +
<version>${ccil.version}</version>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-app</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-index-sv</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-common-split</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-common-sql</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<!-- newly added dependencies -->
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-filter-ignorelist</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-collect-rdb</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>mysql</groupId>
 +
<artifactId>mysql-connector-java</artifactId>
 +
<version>5.1.17</version>
 +
</dependency>
 +
<!-- filters -->
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-filters-gsim</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-filters-lang</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-filters-lucene</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-search-engines</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<version>${ccil.version}</version>
 +
<artifactId>ccil-services-generic-server</artifactId>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<!-- generator -->
 +
<dependency>
 +
<groupId>net.ccil</groupId>
 +
<artifactId>ccil-generation</artifactId>
 +
<version>${ccil.version}</version>
 +
</dependency>
 +
<!-- distributions -->
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-connectors-http</artifactId>
 +
<type>jar</type>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<groupId>com.datacraftmagic</groupId>
 +
<version>${dcmpc.version}</version>
 +
<artifactId>dcmpc-connectors-json</artifactId>
 +
<type>war</type>
 +
<scope>runtime</scope>
 +
</dependency>
 +
<dependency>
 +
<groupId>mysql</groupId>
 +
<artifactId>mysql-connector-java</artifactId>
 +
<version>5.1.17</version>
 +
</dependency>
 +
</dependencies>
 +
<build>
 +
<finalName>${project.name}</finalName>
 +
<plugins>
 +
<plugin>
 +
<artifactId>maven-assembly-plugin</artifactId>
 +
<configuration>
 +
<descriptors>
 +
<descriptor>bin.xml</descriptor>
 +
</descriptors>
 +
</configuration>
 +
<executions>
 +
<execution>
 +
<id>make-assembly</id>
 +
<phase>package</phase>
 +
<goals>
 +
<goal>attached</goal>
 +
</goals>
 +
</execution>
 +
</executions>
 +
</plugin>
 +
</plugins>
 +
</build>
 +
 +
<organization>
 +
<name>Data Craft and Magic ltd.</name>
 +
<url>http://datacraftmagic.com/</url>
 +
</organization>
 +
</project>
 +
</pre>
  
 
=== Startup script ===
 
=== Startup script ===

Revision as of 04:14, 17 May 2017

Goal

The goal of this tutorial is to create a simple dictionary - a database of words from one or more specific languages. It is built in a very simple manner: we supply some text to the pipeline (in PDF, TXT or any other popular format), and the pipeline parses it and inserts each distinct word into a database exactly once.


What do we have to do?

  1. Parse text which comes in an arbitrary format
  2. Insert all tokens from it that satisfy the 'word' criteria into a database, without duplicates

Setup

Obviously, we will need to set up a context. It has a very simple structure; for the purpose of this tutorial we will name it "dictionary":

context
\- apps
   \- dictionary
      |- languages
      |  \- en
      |     \- source.pdf
      \- context.properties

You can use any file in place of source.pdf. It is just an ordinary text downloaded from the Internet. Of course, the more words it contains, the better.

Project

TBA

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!--
	Distribution module of the dictionary service.

	Packaging is "pom": the module compiles nothing itself. It lists the
	artifacts that make up the distribution and, in the package phase, runs
	maven-assembly-plugin with the bin.xml descriptor.

	The properties dcmpc.version, ccil.version and cybercore.version are not
	defined in this file; presumably they come from the parent POM (TODO confirm).
	Only sharpfind.version is defined here.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<!-- groupId and version are not declared here, so they are inherited from the parent. -->
	<artifactId>dcmpc-services-dictionary-distribution</artifactId>
	<name>dcmpc-services-dictionary-distribution</name>
	<url>http://wiki.datacraftmagic.com/display/SFIND/%23Find+Home</url>
	<packaging>pom</packaging>
	<properties>
		<sharpfind.version>1.0.3</sharpfind.version>
	</properties>
	<parent>
		<groupId>com.datacraftmagic</groupId>
		<artifactId>dcmpc-services-dictionary</artifactId>
		<version>1.3.7-SNAPSHOT</version>
	</parent>
	<!-- Each groupId:artifactId:type combination is declared exactly once; Maven warns on repeated declarations. -->
	<dependencies>
		<!-- App -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-services-dictionary-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-parse-tika</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-process-filter</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-app</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>cybercore</groupId>
			<artifactId>cybercore-util</artifactId>
			<version>${cybercore.version}</version>
		</dependency>
		<!-- Server -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-services-dictionary-generic</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-v_index-semanticvectors</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-clustering</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-server</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-api</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-messanger-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-sql-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-k_index-legacy-api</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<!-- Currently disabled (kept for reference):
			<dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
			<artifactId>dcmpc-k_index-solr</artifactId> <scope>runtime</scope> </dependency>
			<dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version>
			<artifactId>dcmpc-k_index-elastic</artifactId> <scope>runtime</scope> </dependency> -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-license-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-ui</artifactId>
			<version>${sharpfind.version}</version>
			<type>war</type>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-server</artifactId>
			<version>${sharpfind.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-generic</artifactId>
			<version>${ccil.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-index-sv</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-split</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-sql</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- newly added dependencies -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filter-ignorelist</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-collect-rdb</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.17</version>
		</dependency>
		<!-- filters -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-gsim</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lang</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lucene</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-search-engines</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-services-generic-server</artifactId>
			<version>${ccil.version}</version>
			<scope>runtime</scope>
		</dependency>
		<!-- generator -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-generation</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- distributions -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-connectors-http</artifactId>
			<version>${dcmpc.version}</version>
			<type>jar</type>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-connectors-json</artifactId>
			<version>${dcmpc.version}</version>
			<type>war</type>
			<scope>runtime</scope>
		</dependency>
	</dependencies>
	<build>
		<!-- Build output is named after the project name rather than artifactId plus version. -->
		<finalName>${project.name}</finalName>
		<plugins>
			<plugin>
				<!-- Builds the distribution archive described by bin.xml during the package phase. -->
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptors>
						<descriptor>bin.xml</descriptor>
					</descriptors>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<!-- NOTE(review): the "attached" goal is deprecated in favour of "single" and is
								absent from maven-assembly-plugin 3.x. The plugin version is not pinned in
								this file, so confirm the inherited version before switching. -->
							<goal>attached</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

	<organization>
		<name>Data Craft and Magic ltd.</name>
		<url>http://datacraftmagic.com/</url>
	</organization>
</project>

Startup script

tutorials-dictionary-app.sh

#!/bin/bash
# Starts the CCIL console application with the tutorials-dictionary-app.ttl
# configuration.
#
# CCIL_HOME is computed as the parent of the current working directory, so the
# script must be started from a directory located directly below the directory
# that holds lib/, config/, launcher/ and context/.
# All arguments given to this script are forwarded unchanged to the application.

CCIL_HOME="$(dirname "$PWD")"
CCIL_CONTEXT="$CCIL_HOME/context"

echo -------------------
echo "CCIL_HOME = $CCIL_HOME"
echo -------------------

# Every expansion is quoted so that a path containing spaces reaches the JVM as
# a single argument instead of being split into several words.
java -cp "$CCIL_HOME/lib/*:$CCIL_HOME/config:$CCIL_HOME/launcher/*" \
  -Dserver.config.file=tutorials-dictionary-app.ttl \
  "-Dserver.home.dir=$CCIL_HOME" \
  -Xmx1024M \
  "-Dserver.context.dir=$CCIL_CONTEXT" \
  -Dserver.jmx.enabled=false \
  net.ccil.execution.CcilConsoleApp -execute -root "$CCIL_HOME/context/apps" "$@"

Parsing the text

TBA

Insert into database

TBA

Further steps

TBA