Difference between revisions of "Dictionary"

From CCIL
Jump to: navigation, search
(Project)
(pom.xml)
Line 32: Line 32:
  
 
==== Distribution ====
 
==== Distribution ====
==== pom.xml ====
+
===== pom.xml =====
  
 
<pre>
 
<pre>
Line 272: Line 272:
 
</organization>
 
</organization>
 
</project>
 
</project>
 +
</pre>
 +
 +
===== bin.xml =====
 +
 +
<pre>
 +
<assembly>
 +
<id>bin</id>
 +
<includeBaseDirectory>false</includeBaseDirectory>
 +
<formats>
 +
<format>zip</format>
 +
<format>dir</format>
 +
</formats>
 +
<fileSets>
 +
<fileSet>
 +
<directory>files/bin</directory>
 +
<useDefaultExcludes>true</useDefaultExcludes>
 +
<outputDirectory>bin</outputDirectory>
 +
<fileMode>0755</fileMode>
 +
</fileSet>
 +
<fileSet>
 +
<directory>files/config</directory>
 +
<useDefaultExcludes>true</useDefaultExcludes>
 +
<outputDirectory>config</outputDirectory>
 +
</fileSet>
 +
<fileSet>
 +
<directory>files/context</directory>
 +
<useDefaultExcludes>true</useDefaultExcludes>
 +
<outputDirectory>context</outputDirectory>
 +
</fileSet>
 +
<fileSet>
 +
<directory>files/services</directory>
 +
<useDefaultExcludes>true</useDefaultExcludes>
 +
<outputDirectory>services</outputDirectory>
 +
</fileSet>
 +
<fileSet>
 +
<directory>files/sql</directory>
 +
<useDefaultExcludes>true</useDefaultExcludes>
 +
<outputDirectory>sql</outputDirectory>
 +
</fileSet>
 +
</fileSets>
 +
<dependencySets>
 +
<!-- lib folder -->
 +
<dependencySet>
 +
<useProjectArtifact>false</useProjectArtifact>
 +
<useProjectAttachments>false</useProjectAttachments>
 +
<outputDirectory>lib</outputDirectory>
 +
<useTransitiveDependencies>true</useTransitiveDependencies>
 +
 +
<excludes>
 +
<exclude>org.eclipse.jetty:*</exclude>
 +
<exclude>org.slf4j:*</exclude>
 +
<exclude>ch.qos.logback:*</exclude>
 +
<!-- only JARs here -->
 +
<exclude>*:war:*</exclude>
 +
<exclude>*:pom:*</exclude>
 +
<exclude>*:zip:*</exclude>
 +
<exclude>*:zip:*</exclude>
 +
</excludes>
 +
</dependencySet>
 +
<!-- populate the launcher folder -->
 +
<dependencySet>
 +
<useProjectArtifact>false</useProjectArtifact>
 +
<useProjectAttachments>false</useProjectAttachments>
 +
<outputDirectory>launcher</outputDirectory>
 +
<useTransitiveDependencies>true</useTransitiveDependencies>
 +
<includes>
 +
<!-- server -->
 +
<include>cybercore:cybercore-launcher</include>
 +
<!-- common -->
 +
<include>ch.qos.logback:logback*</include>
 +
<include>org.slf4j:jcl-over-slf4j</include>
 +
<include>org.slf4j:slf4j-api</include>
 +
<include>log4j:log4j</include>
 +
</includes>
 +
</dependencySet>
 +
<!-- web -->
 +
<dependencySet>
 +
<unpack>false</unpack>
 +
<useProjectArtifact>false</useProjectArtifact>
 +
<useProjectAttachments>false</useProjectAttachments>
 +
<outputDirectory>services/httpd/lib</outputDirectory>
 +
<useTransitiveDependencies>true</useTransitiveDependencies>
 +
<includes>
 +
<include>org.eclipse.jetty:*</include>
 +
<include>com.datacraftmagic:dcmpc-connectors-http:jar:*</include>
 +
</includes>
 +
</dependencySet>
 +
<dependencySet>
 +
<unpack>false</unpack>
 +
<outputFileNameMapping>services.war</outputFileNameMapping>
 +
<useProjectArtifact>false</useProjectArtifact>
 +
<useProjectAttachments>false</useProjectAttachments>
 +
<outputDirectory>services/httpd/webapps</outputDirectory>
 +
<useTransitiveDependencies>true</useTransitiveDependencies>
 +
<includes>
 +
<include>com.datacraftmagic:dcmpc-connectors-json:war:*</include>
 +
</includes>
 +
</dependencySet>
 +
<!-- ui -->
 +
<dependencySet>
 +
<unpack>false</unpack>
 +
<useProjectArtifact>false</useProjectArtifact>
 +
<useProjectAttachments>false</useProjectAttachments>
 +
<outputDirectory>services/httpd/webapps</outputDirectory>
 +
<useTransitiveDependencies>false</useTransitiveDependencies>
 +
<scope>runtime</scope>
 +
<includes>
 +
<include>com.datacraftmagic:dcmpc-dictionary-ui:war:*</include>
 +
</includes>
 +
</dependencySet>
 +
</dependencySets>
 +
</assembly>
 
</pre>
 
</pre>
  

Revision as of 04:18, 17 May 2017

Goal

The goal of this tutorial is to create a simple dictionary - a database of words from one or more specific languages. It is built in a very simple manner: we supply some text to the pipeline (in PDF, TXT or any other popular format), and the pipeline parses it and inserts each distinct word into a database exactly once.


What do we have to do?

  1. Parse text which comes in an arbitrary format
  2. Insert all tokens from it that satisfy the 'word' criteria into a database, with no duplicates

Setup

Obviously, we will need to set up a context. It has a very simple structure; for the purpose of the tutorial we will name it "dictionary":

context
\- apps
   \- dictionary
      |- languages
      |  \- en
      |     \- source.pdf
      \- context.properties

You can use any file in place of source.pdf. It is just an ordinary text downloaded from the Internet. Of course, the more words it contains, the better.

Project

tutorials-dictionary
|- distribution
|- stages
\- pom.xml

Distribution

pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
	Distribution module of the dictionary tutorial.

	This POM produces no artifact of its own (packaging "pom"). It declares
	every dependency that has to end up in the distribution and runs the
	maven-assembly-plugin with the bin.xml descriptor during the "package"
	phase to lay them out.

	NOTE(review): ${dcmpc.version}, ${ccil.version} and ${cybercore.version}
	are not defined in this file; presumably they are inherited from the
	parent POM. Confirm before building this module on its own.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>com.datacraftmagic</groupId>
		<artifactId>dcmpc-services-dictionary</artifactId>
		<version>1.3.7-SNAPSHOT</version>
	</parent>
	<artifactId>dcmpc-services-dictionary-distribution</artifactId>
	<name>dcmpc-services-dictionary-distribution</name>
	<url>http://wiki.datacraftmagic.com/display/SFIND/%23Find+Home</url>
	<packaging>pom</packaging>
	<properties>
		<sharpfind.version>1.0.3</sharpfind.version>
	</properties>
	<dependencies>
		<!-- App -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-services-dictionary-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-parse-tika</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-process-filter</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- ccil-app is declared once only; Maven requires each
			groupId:artifactId:type:classifier to be unique within a POM -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-app</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>cybercore</groupId>
			<artifactId>cybercore-util</artifactId>
			<version>${cybercore.version}</version>
		</dependency>
		<!-- Server -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-services-dictionary-generic</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-v_index-semanticvectors</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-clustering</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-server</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-ninko-api</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-messanger-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-sql-stages</artifactId>
			<version>${dcmpc.version}</version>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-k_index-legacy-api</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<!-- <dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version> 
			<artifactId>dcmpc-k_index-solr</artifactId> <scope>runtime</scope> </dependency> 
			<dependency> <groupId>com.datacraftmagic</groupId> <version>${dcmpc.version}</version> 
			<artifactId>dcmpc-k_index-elastic</artifactId> <scope>runtime</scope> </dependency> -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-license-server</artifactId>
			<version>${dcmpc.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-ui</artifactId>
			<version>${sharpfind.version}</version>
			<type>war</type>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>sharpfind-server</artifactId>
			<version>${sharpfind.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-generic</artifactId>
			<version>${ccil.version}</version>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-index-sv</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-split</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-common-sql</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- newly added dependencies -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filter-ignorelist</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-collect-rdb</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- JDBC driver; declared once only (see the uniqueness note on ccil-app) -->
		<dependency>
			<groupId>mysql</groupId>
			<artifactId>mysql-connector-java</artifactId>
			<version>5.1.17</version>
		</dependency>
		<!-- filters -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-gsim</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lang</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-filters-lucene</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-search-engines</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-services-generic-server</artifactId>
			<version>${ccil.version}</version>
			<scope>runtime</scope>
		</dependency>
		<!-- generator -->
		<dependency>
			<groupId>net.ccil</groupId>
			<artifactId>ccil-generation</artifactId>
			<version>${ccil.version}</version>
		</dependency>
		<!-- distributions: selected by explicit include patterns in bin.xml -->
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-connectors-http</artifactId>
			<version>${dcmpc.version}</version>
			<type>jar</type>
			<scope>runtime</scope>
		</dependency>
		<dependency>
			<groupId>com.datacraftmagic</groupId>
			<artifactId>dcmpc-connectors-json</artifactId>
			<version>${dcmpc.version}</version>
			<type>war</type>
			<scope>runtime</scope>
		</dependency>
	</dependencies>
	<build>
		<finalName>${project.name}</finalName>
		<plugins>
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<descriptors>
						<!-- resolved relative to this module's base directory -->
						<descriptor>bin.xml</descriptor>
					</descriptors>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<!-- "single" replaces the deprecated "attached" goal,
								which no longer exists in current plugin releases -->
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>

	<organization>
		<name>Data Craft and Magic ltd.</name>
		<url>http://datacraftmagic.com/</url>
	</organization>
</project>
bin.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--
	Assembly descriptor "bin" for the dictionary distribution.

	Produces both a zip archive and an exploded directory, without a wrapping
	base directory, laid out as:
	  bin/, config/, context/, services/, sql/   copied from files/...
	  lib/                                        dependency JARs
	  launcher/                                   launcher and logging JARs
	  services/httpd/lib/                         Jetty and the HTTP connector
	  services/httpd/webapps/                     web applications (WARs)
-->
<assembly>
	<id>bin</id>
	<includeBaseDirectory>false</includeBaseDirectory>
	<formats>
		<format>zip</format>
		<format>dir</format>
	</formats>
	<fileSets>
		<fileSet>
			<directory>files/bin</directory>
			<useDefaultExcludes>true</useDefaultExcludes>
			<outputDirectory>bin</outputDirectory>
			<!-- start-up scripts must be executable -->
			<fileMode>0755</fileMode>
		</fileSet>
		<fileSet>
			<directory>files/config</directory>
			<useDefaultExcludes>true</useDefaultExcludes>
			<outputDirectory>config</outputDirectory>
		</fileSet>
		<fileSet>
			<directory>files/context</directory>
			<useDefaultExcludes>true</useDefaultExcludes>
			<outputDirectory>context</outputDirectory>
		</fileSet>
		<fileSet>
			<directory>files/services</directory>
			<useDefaultExcludes>true</useDefaultExcludes>
			<outputDirectory>services</outputDirectory>
		</fileSet>
		<fileSet>
			<directory>files/sql</directory>
			<useDefaultExcludes>true</useDefaultExcludes>
			<outputDirectory>sql</outputDirectory>
		</fileSet>
	</fileSets>
	<dependencySets>
		<!-- lib folder -->
		<dependencySet>
			<useProjectArtifact>false</useProjectArtifact>
			<useProjectAttachments>false</useProjectAttachments>
			<outputDirectory>lib</outputDirectory>
			<useTransitiveDependencies>true</useTransitiveDependencies>

			<excludes>
				<!-- Jetty and the slf4j/logback JARs are placed by the
					"launcher" and "web" sets below instead -->
				<exclude>org.eclipse.jetty:*</exclude>
				<exclude>org.slf4j:*</exclude>
				<exclude>ch.qos.logback:*</exclude>
				<!-- only JARs here -->
				<exclude>*:war:*</exclude>
				<exclude>*:pom:*</exclude>
				<exclude>*:zip:*</exclude>
			</excludes>
		</dependencySet>
		<!-- populate the launcher folder -->
		<dependencySet>
			<useProjectArtifact>false</useProjectArtifact>
			<useProjectAttachments>false</useProjectAttachments>
			<outputDirectory>launcher</outputDirectory>
			<useTransitiveDependencies>true</useTransitiveDependencies>
			<includes>
				<!-- server -->
				<include>cybercore:cybercore-launcher</include>
				<!-- common -->
				<include>ch.qos.logback:logback*</include>
				<include>org.slf4j:jcl-over-slf4j</include>
				<include>org.slf4j:slf4j-api</include>
				<include>log4j:log4j</include>
			</includes>
		</dependencySet>
		<!-- web -->
		<dependencySet>
			<unpack>false</unpack>
			<useProjectArtifact>false</useProjectArtifact>
			<useProjectAttachments>false</useProjectAttachments>
			<outputDirectory>services/httpd/lib</outputDirectory>
			<useTransitiveDependencies>true</useTransitiveDependencies>
			<includes>
				<include>org.eclipse.jetty:*</include>
				<include>com.datacraftmagic:dcmpc-connectors-http:jar:*</include>
			</includes>
		</dependencySet>
		<dependencySet>
			<unpack>false</unpack>
			<!-- the JSON connector WAR is deployed under the fixed name services.war -->
			<outputFileNameMapping>services.war</outputFileNameMapping>
			<useProjectArtifact>false</useProjectArtifact>
			<useProjectAttachments>false</useProjectAttachments>
			<outputDirectory>services/httpd/webapps</outputDirectory>
			<useTransitiveDependencies>true</useTransitiveDependencies>
			<includes>
				<include>com.datacraftmagic:dcmpc-connectors-json:war:*</include>
			</includes>
		</dependencySet>
		<!-- ui -->
		<dependencySet>
			<unpack>false</unpack>
			<useProjectArtifact>false</useProjectArtifact>
			<useProjectAttachments>false</useProjectAttachments>
			<outputDirectory>services/httpd/webapps</outputDirectory>
			<useTransitiveDependencies>false</useTransitiveDependencies>
			<scope>runtime</scope>
			<includes>
				<!-- NOTE(review): the distribution pom.xml shown in this tutorial
					declares sharpfind-ui as its only UI war and no
					dcmpc-dictionary-ui; confirm this pattern matches a real
					dependency, otherwise this set copies nothing -->
				<include>com.datacraftmagic:dcmpc-dictionary-ui:war:*</include>
			</includes>
		</dependencySet>
	</dependencySets>
</assembly>

Startup script

tutorials-dictionary-app.sh

#!/bin/bash
# Launches the dictionary tutorial pipeline through net.ccil.execution.CcilConsoleApp.
#
# CCIL_HOME is taken to be the parent of the current working directory, so the
# script has to be started from a direct sub-directory of the distribution
# root (for example its bin/ folder). Any arguments given to this script are
# passed on to the application unchanged.
#
# All path expansions are quoted so that an installation path containing
# spaces is not split into several words.
CCIL_HOME=$(dirname "$PWD")
CCIL_CONTEXT="$CCIL_HOME/context"

echo -------------------
echo "CCIL_HOME = $CCIL_HOME"
echo -------------------

# The classpath wildcards stay inside the quotes on purpose: they are expanded
# by the JVM, not by the shell.
java -cp "$CCIL_HOME/lib/*:$CCIL_HOME/config:$CCIL_HOME/launcher/*" \
	-Dserver.config.file=tutorials-dictionary-app.ttl \
	-Dserver.home.dir="$CCIL_HOME" \
	-Xmx1024M \
	-Dserver.context.dir="$CCIL_CONTEXT" \
	-Dserver.jmx.enabled=false \
	net.ccil.execution.CcilConsoleApp -execute -root "$CCIL_CONTEXT/apps" "$@"

Parsing the text

TBA

Insert into database

TBA

Further steps

TBA