Explorar el Código

Added OCR configuration (not yet working)

Harsh Parikh hace 2 años
padre
commit
a2536f9e15
Se han modificado 2 ficheros con 27 adiciones y 0 borrados
  1. 9 0
      docker/Dockerfile
  2. 18 0
      docker/install.sh

+ 9 - 0
docker/Dockerfile

@@ -0,0 +1,9 @@
+# Image that runs the Apache Tika server jar produced by install.sh.
+# A pinned tag is used instead of :latest so rebuilds are reproducible.
+# NOTE(review): the official `java` repository on Docker Hub is deprecated;
+# consider moving to a maintained Debian-based JDK image (install.sh relies
+# on apt-get) -- confirm the replacement tag before switching.
+FROM java:8-jdk
+
+# LABEL takes key=value pairs; the bare "name <e-mail>" form does not yield
+# a meaningful maintainer label.
+LABEL maintainer="Matt Fullerton <matt.fullerton@gmail.com>"
+
+# Copy the script before touching it: the previous chmod ran before the file
+# existed in the image (and against a relative path), which fails the build.
+# COPY creates /setup itself, so no separate mkdir step is needed.
+COPY install.sh /setup/install.sh
+RUN chmod a+x /setup/install.sh && /setup/install.sh
+
+EXPOSE 9998
+
+# The jar name contains the build version, so a shell is still required to
+# expand the glob; `exec` replaces that shell so java itself receives the
+# signals sent by `docker stop`.
+ENTRYPOINT ["/bin/sh", "-c", "exec java -jar /srv/tika-server-1.*-SNAPSHOT.jar -host 0.0.0.0"]

+ 18 - 0
docker/install.sh

@@ -0,0 +1,18 @@
+
+echo "# Installing Maven"
+apt-get update
+apt-get -y -q install default-jdk maven unzip
+
+echo "# Installing Tika"
+mkdir install
+curl https://codeload.github.com/apache/tika/zip/trunk -o trunk.zip
+unzip trunk.zip
+cd tika-trunk
+mvn -DskipTests=true clean install
+cp tika-server/target/tika-server-1.*-SNAPSHOT.jar /srv/tika-server-1.*-SNAPSHOT.jar
+
+echo "#Installing tesseract"
+apt-get -y -q install tesseract-ocr tesseract-ocr-deu tesseract-ocr-eng
+
+echo "# Cleaning up"
+apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* /setup /build