Index: pom.xml =================================================================== --- pom.xml (revision 36390) +++ pom.xml (working copy) @@ -73,6 +73,12 @@ provided + org.apache.tika + tika-parsers + ${tika.version} + test + + com.drewnoakes metadata-extractor ${drewnoakes-extractor.version} Index: src/main/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelper.java =================================================================== --- src/main/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelper.java (revision 36390) +++ src/main/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelper.java (working copy) @@ -30,8 +30,10 @@ import org.apache.commons.io.IOUtils; import org.apache.jackrabbit.JcrConstants; +import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; import org.apache.tika.utils.ParseUtils; import org.hippoecm.repository.api.HippoNodeType; import org.slf4j.Logger; @@ -191,7 +193,12 @@ String nodePath = null; try { nodePath = node.getPath(); - String content = ParseUtils.getStringContent(inputStream, getTikaConfig(), MIME_TYPE_PDF); + Tika tika = new Tika(); + // Some sensible memory safe value here, or make it configurable + int length = 5 * 1024 * 1024; + tika.setMaxStringLength(length); + Metadata metadata = new Metadata(); + String content = tika.parseToString(inputStream, metadata); byteInputStream = new ByteArrayInputStream(content.getBytes()); node.setProperty(HippoNodeType.HIPPO_TEXT, getValueFactory(node).createBinary(byteInputStream)); } catch (IOException e) { Index: src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java =================================================================== --- src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java (revision 0) +++ src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java 
(working copy)
@@ -0,0 +1,81 @@
+/*
+ * Copyright 2012 Hippo.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.hippoecm.frontend.editor.plugins.resource;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Exercises the Tika calls ({@code setMaxStringLength} followed by
+ * {@code parseToString}) that {@code ResourceHelper} uses to extract PDF text.
+ *
+ * @author obourgeois
+ */
+public class ResourceHelperTest {
+
+    /** Classpath location of the PDF fixture. */
+    private static final String TEST_PDF = "/test-tika.pdf";
+
+    /**
+     * Test of handlePdfAndSetHippoTextProperty method, of class ResourceHelper.
+     * CMS7-6516 (exceptions logged in terminal): extraction with a length limit
+     * should not throw a SAXParseException.
+     *
+     * @throws IOException   if the fixture cannot be read
+     * @throws TikaException if Tika cannot parse the fixture
+     */
+    @Test
+    public void testHandlePdf() throws IOException, TikaException {
+        // With a 10-character limit the result is expected to be exactly 10 characters.
+        String truncated = extractContent(TEST_PDF, 10);
+        assertEquals(10, truncated.length());
+
+        // Same 5 * 1024 * 1024 character limit that ResourceHelper configures.
+        String full = extractContent(TEST_PDF, 5 * 1024 * 1024);
+        assertNotNull(full);
+    }
+
+    /**
+     * Extracts the text of a classpath resource with Tika.
+     *
+     * @param resource  classpath location of the document
+     * @param maxLength value passed to {@code Tika#setMaxStringLength}
+     * @return the text returned by {@code Tika#parseToString}
+     * @throws IOException   if the document cannot be read
+     * @throws TikaException if the document cannot be parsed
+     */
+    private String extractContent(String resource, int maxLength) throws IOException, TikaException {
+        InputStream in = getClass().getResourceAsStream(resource);
+        assertNotNull("Test resource not found on classpath: " + resource, in);
+        try {
+            Tika tika = new Tika();
+            tika.setMaxStringLength(maxLength);
+            return tika.parseToString(in, new Metadata());
+        } finally {
+            // Cleanup only: a return inside finally would discard any pending failure.
+            IOUtils.closeQuietly(in);
+        }
+    }
+}
Index: src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java
===================================================================
--- src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java (revision 0)
+++ src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java (working copy)
Property changes on: src/test/java/org/hippoecm/frontend/editor/plugins/resource/ResourceHelperTest.java
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+Id
\ No newline at end of property
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Index: src/test/resources/test-tika.pdf
=================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/pdf Index: src/test/resources/test-tika.pdf =================================================================== --- src/test/resources/test-tika.pdf (revision 0) +++ src/test/resources/test-tika.pdf (working copy) Property changes on: src/test/resources/test-tika.pdf ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +application/pdf \ No newline at end of property