CODE WITH SIBIN

Solving Real Problems with Real Code


Integrating Spring Boot with Spring AI PDF Document Reader Using Apache PdfBox

Introduction

This guide will walk you through integrating Spring Boot with Spring AI PDF Document Reader, which utilizes Apache PdfBox to extract text from PDF documents and convert them into Spring AI Document objects.

We will cover:

  • Setting up a Spring Boot project with required dependencies
  • Implementing a PDF document reader service
  • Building a REST API to handle PDF uploads
  • Testing with Postman
  • Writing unit tests for validation

1. Setting Up the Spring Boot Project

Create a Maven Project

Generate a Spring Boot project with the following dependencies:

  • Spring Web: To expose RESTful endpoints
  • Spring AI PDF Document Reader: For text extraction from PDFs

Your pom.xml should look like this:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the PDF Reader sample application. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Inherit plugin and dependency management from the Spring Boot parent.
         The empty relativePath makes Maven resolve the parent from the repository
         instead of looking for it in the parent directory. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.4.4</version>
        <relativePath/>
    </parent>

    <!-- Coordinates of this project -->
    <groupId>com.example</groupId>
    <artifactId>pdf-reader</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>PDF Reader</name>

    <!-- spring-ai.version is referenced by the BOM import in dependencyManagement below. -->
    <properties>
        <java.version>17</java.version>
        <spring-ai.version>1.0.0-M6</spring-ai.version>
    </properties>

    <dependencies>
        <!-- Spring Web -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Spring AI PDF Document Reader -->
        <!-- No version here: it is supplied by the spring-ai-bom import below. -->
        <dependency>
            <groupId>org.springframework.ai</groupId>
            <artifactId>spring-ai-pdf-document-reader</artifactId>
        </dependency>

        <!-- Testing -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!-- Import the Spring AI bill of materials so Spring AI artifacts can be
         declared without an explicit version. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.springframework.ai</groupId>
                <artifactId>spring-ai-bom</artifactId>
                <version>${spring-ai.version}</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <!-- The Spring Boot Maven plugin; step 4 of the guide uses its spring-boot:run goal. -->
    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>

2. Implementing PDF Document Reader Service

Create a service class to extract text from PDFs.

PdfDocumentService.java

package com.example.pdfreader.service;

import org.springframework.ai.document.Document;
import org.springframework.ai.pdf.PdfDocumentReader;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Service that converts an uploaded PDF into a list of extracted text fragments.
 *
 * <p>The actual parsing is delegated to the injected {@link PdfDocumentReader}; this class
 * only manages the upload's input stream and unwraps the resulting {@link Document}s.
 *
 * <p>NOTE(review): confirm that {@code org.springframework.ai.pdf.PdfDocumentReader} and
 * {@code Document#getContent()} exist in the Spring AI version pinned in the POM. The
 * published PDF reader module appears to expose {@code PagePdfDocumentReader} /
 * {@code ParagraphPdfDocumentReader}, and newer releases use {@code Document#getText()}.
 */
@Service
public class PdfDocumentService {

    private final PdfDocumentReader pdfDocumentReader;

    /**
     * Creates the service.
     *
     * @param pdfDocumentReader reader used to turn a PDF stream into documents
     */
    public PdfDocumentService(PdfDocumentReader pdfDocumentReader) {
        this.pdfDocumentReader = pdfDocumentReader;
    }

    /**
     * Reads the uploaded file and returns the content of every document the reader produces.
     *
     * @param file the uploaded PDF
     * @return the content of each extracted document, in the order the reader returned them
     * @throws IOException if the upload's stream cannot be opened, read, or closed
     */
    public List<String> extractText(MultipartFile file) throws IOException {
        // Close the upload stream deterministically instead of leaking it. The mapping to
        // strings happens inside the try block so nothing touches the stream after close.
        try (var pdfStream = file.getInputStream()) {
            List<Document> documents = pdfDocumentReader.read(pdfStream);
            return documents.stream()
                    .map(Document::getContent)
                    .collect(Collectors.toList());
        }
    }
}

3. Implementing the PDF Upload Controller

Create a REST controller to handle PDF file uploads and return extracted text.

PdfController.java

package com.example.pdfreader.controller;

import com.example.pdfreader.service.PdfDocumentService;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.util.List;

/**
 * REST endpoint that accepts a PDF upload and returns the text extracted from it.
 *
 * <p>Exposed under {@code /api/pdf}; extraction is delegated to {@link PdfDocumentService}.
 */
@RestController
@RequestMapping("/api/pdf")
public class PdfController {

    // java.lang.System.Logger needs no extra import or dependency.
    private static final System.Logger LOG = System.getLogger(PdfController.class.getName());

    private final PdfDocumentService pdfDocumentService;

    /**
     * Creates the controller.
     *
     * @param pdfDocumentService service that performs the text extraction
     */
    public PdfController(PdfDocumentService pdfDocumentService) {
        this.pdfDocumentService = pdfDocumentService;
    }

    /**
     * Handles {@code POST /api/pdf/extract-text}.
     *
     * @param file the multipart part named {@code file}
     * @return 200 with the extracted text fragments; 400 with an empty body when the
     *         uploaded file is empty; 500 with an empty body when extraction fails with
     *         an {@link IOException}
     */
    @PostMapping("/extract-text")
    public ResponseEntity<List<String>> extractText(@RequestParam("file") MultipartFile file) {
        // An empty upload is a client error; reject it before handing it to the parser.
        if (file.isEmpty()) {
            return ResponseEntity.badRequest().build();
        }
        try {
            List<String> text = pdfDocumentService.extractText(file);
            return ResponseEntity.ok(text);
        } catch (IOException e) {
            // Keep the 500 response, but record the cause instead of silently dropping it.
            LOG.log(System.Logger.Level.ERROR, "Failed to extract text from uploaded PDF", e);
            return ResponseEntity.internalServerError().build();
        }
    }
}

4. Testing with Postman

  1. Run your Spring Boot application: mvn spring-boot:run
  2. Open Postman and send a POST request to http://localhost:8080/api/pdf/extract-text with a PDF file attached in the file field.
  3. You should receive a JSON response with extracted text.

5. Writing Unit Tests

PdfDocumentServiceTest.java

package com.example.pdfreader.service;

import org.junit.jupiter.api.Test;
import org.springframework.ai.pdf.PdfDocumentReader;
import org.springframework.mock.web.MockMultipartFile;

import java.io.IOException;
import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.*;

/**
 * Unit test for {@link PdfDocumentService} using a mocked {@link PdfDocumentReader}.
 */
class PdfDocumentServiceTest {

    /**
     * Verifies that the service returns the content of the documents produced by the reader.
     */
    @Test
    void testExtractText() throws IOException {
        PdfDocumentReader mockReader = mock(PdfDocumentReader.class);
        PdfDocumentService service = new PdfDocumentService(mockReader);

        // Explicit charset so the fixture bytes do not depend on the platform default.
        MockMultipartFile file = new MockMultipartFile(
                "file",
                "test.pdf",
                "application/pdf",
                "Dummy content".getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // The reader is mocked, so the dummy bytes are never parsed as a real PDF.
        when(mockReader.read(any())).thenReturn(List.of(new org.springframework.ai.document.Document("Extracted text")));

        List<String> result = service.extractText(file);

        // The single stubbed document must map to exactly one string, nothing more.
        assertThat(result).containsExactly("Extracted text");
    }
}

PdfControllerTest.java

package com.example.pdfreader.controller;

import com.example.pdfreader.service.PdfDocumentService;
import org.junit.jupiter.api.Test;
import org.springframework.http.MediaType;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.MockMvc;
import org.springframework.test.web.servlet.setup.MockMvcBuilders;

import java.util.List;

import static org.mockito.Mockito.*;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.multipart;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*;

/**
 * Unit test for {@link PdfController} using a standalone {@link MockMvc} and a mocked
 * {@link PdfDocumentService}.
 */
class PdfControllerTest {

    /**
     * Verifies that a multipart upload to {@code /api/pdf/extract-text} is passed to the
     * service and that the service result is returned as a JSON array with status 200.
     */
    @Test
    void testExtractText() throws Exception {
        PdfDocumentService service = mock(PdfDocumentService.class);
        PdfController controller = new PdfController(service);
        MockMvc mockMvc = MockMvcBuilders.standaloneSetup(controller).build();

        // Explicit charset so the fixture bytes do not depend on the platform default.
        MockMultipartFile file = new MockMultipartFile(
                "file",
                "test.pdf",
                "application/pdf",
                "Dummy content".getBytes(java.nio.charset.StandardCharsets.UTF_8));

        when(service.extractText(file)).thenReturn(List.of("Extracted text"));

        mockMvc.perform(multipart("/api/pdf/extract-text").file(file))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$[0]").value("Extracted text"));

        // Make the delegation explicit: the controller must hand the uploaded file to the service.
        verify(service).extractText(file);
    }
}

Conclusion

You've successfully integrated Spring Boot with Spring AI PDF Document Reader using Apache PdfBox!

Now, your application:

🔵 Extracts text from PDFs
🔵 Provides a REST API to upload PDFs
🔵 Supports unit testing for service and controller

📄 Integrating Spring Boot with Spring AI PDF Document Reader

Build an AI-powered PDF document reader using Spring Boot and Apache PdfBox for intelligent text extraction.

📂 Clone on GitHub

Leave a Reply

Your email address will not be published. Required fields are marked *