Introduction
This guide will walk you through integrating Spring Boot with the Spring AI PDF Document Reader, which uses Apache PDFBox to extract text from PDF documents and convert it into Spring AI Document
objects.
We will cover:
- Setting up a Spring Boot project with required dependencies
- Implementing a PDF document reader service
- Building a REST API to handle PDF uploads
- Testing with Postman
- Writing unit tests for validation
1. Setting Up the Spring Boot Project
Create a Maven Project
Generate a Spring Boot project with the following dependencies:
- Spring Web: To expose RESTful endpoints
- Spring AI PDF Document Reader: For text extraction from PDFs
Your pom.xml should look like this:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the PDF Reader sample application. -->
<modelVersion>4.0.0</modelVersion>
<!-- Parent POM: Spring Boot starter parent 3.4.4. The Spring Boot artifacts below declare no
     version of their own, so their versions are presumably supplied through this parent. -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.4.4</version>
<!-- Empty relativePath: the parent is looked up in the repositories, not in ../pom.xml. -->
<relativePath/>
</parent>
<!-- Coordinates of this project. -->
<groupId>com.example</groupId>
<artifactId>pdf-reader</artifactId>
<version>0.0.1-SNAPSHOT</version>
<name>PDF Reader</name>
<properties>
<java.version>17</java.version>
<!-- Referenced below as ${spring-ai.version} by the Spring AI BOM import. -->
<spring-ai.version>1.0.0-M6</spring-ai.version>
</properties>
<dependencies>
<!-- Spring Web: used by the REST controller and multipart upload handling. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Spring AI PDF Document Reader: no version here; expected to come from the spring-ai-bom
     imported in dependencyManagement below. -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
</dependency>
<!-- Testing: test scope only. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- Imports the Spring AI bill of materials at ${spring-ai.version} (type pom, scope import). -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-bom</artifactId>
<version>${spring-ai.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<!-- Spring Boot Maven plugin; provides the spring-boot:run goal used later in this guide. -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
2. Implementing PDF Document Reader Service
Create a service class to extract text from PDFs.
PdfDocumentService.java
package com.example.pdfreader.service;
import org.springframework.ai.document.Document;
import org.springframework.ai.pdf.PdfDocumentReader;
import org.springframework.stereotype.Service;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Service that extracts the text content of an uploaded PDF.
 *
 * <p>The upload is handed to the injected {@link PdfDocumentReader}; the content of every
 * {@link Document} it returns is collected into a list of strings.
 *
 * <p>NOTE(review): this class relies on a {@code PdfDocumentReader} type exposing
 * {@code read(InputStream)} and on {@code Document#getContent()}. Confirm both against the
 * Spring AI version declared in the build; the published PDF reader module appears to ship
 * page/paragraph readers under {@code org.springframework.ai.reader.pdf}, and newer milestones
 * appear to name the accessor {@code getText()}.
 */
@Service
public class PdfDocumentService {

    private final PdfDocumentReader pdfDocumentReader;

    /**
     * Creates the service.
     *
     * @param pdfDocumentReader reader used to turn a PDF stream into documents
     */
    public PdfDocumentService(PdfDocumentReader pdfDocumentReader) {
        this.pdfDocumentReader = pdfDocumentReader;
    }

    /**
     * Reads the uploaded file and returns the content of each resulting document.
     *
     * @param file the uploaded PDF
     * @return one string per document returned by the reader, in the reader's order
     * @throws IOException if the upload's stream cannot be opened or closed
     */
    public List<String> extractText(MultipartFile file) throws IOException {
        // Close the upload stream deterministically; the original never closed it, which can
        // keep the underlying resource open until the stream is garbage-collected.
        try (java.io.InputStream pdfStream = file.getInputStream()) {
            List<Document> documents = pdfDocumentReader.read(pdfStream);
            return documents.stream()
                    .map(Document::getContent)
                    .collect(Collectors.toList());
        }
    }
}
3. Implementing the PDF Upload Controller
Create a REST controller to handle PDF file uploads and return extracted text.
PdfController.java
package com.example.pdfreader.controller;
import com.example.pdfreader.service.PdfDocumentService;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.io.IOException;
import java.util.List;
/**
 * REST controller exposing PDF text extraction under {@code /api/pdf}.
 */
@RestController
@RequestMapping("/api/pdf")
public class PdfController {

    // JDK platform logger: records failures without adding a logging dependency or import.
    private static final System.Logger LOGGER = System.getLogger(PdfController.class.getName());

    private final PdfDocumentService pdfDocumentService;

    /**
     * Creates the controller.
     *
     * @param pdfDocumentService service that performs the extraction
     */
    public PdfController(PdfDocumentService pdfDocumentService) {
        this.pdfDocumentService = pdfDocumentService;
    }

    /**
     * Handles {@code POST /api/pdf/extract-text}.
     *
     * @param file the multipart upload bound to the {@code file} request parameter
     * @return 200 with the extracted text; 400 with no body when the upload is empty;
     *         500 with no body when extraction fails with an {@link IOException}
     */
    @PostMapping("/extract-text")
    public ResponseEntity<List<String>> extractText(@RequestParam("file") MultipartFile file) {
        // An empty upload is a client error; reject it before handing it to the extractor.
        if (file.isEmpty()) {
            return ResponseEntity.badRequest().build();
        }
        try {
            List<String> text = pdfDocumentService.extractText(file);
            return ResponseEntity.ok(text);
        } catch (IOException e) {
            // Keep the cause in the log; the response itself deliberately carries no details.
            LOGGER.log(System.Logger.Level.ERROR, "Failed to extract text from uploaded PDF", e);
            return ResponseEntity.internalServerError().build();
        }
    }
}
4. Testing with Postman
- Run your Spring Boot application:
mvn spring-boot:run
- Open Postman and send a POST request to http://localhost:8080/api/pdf/extract-text with a PDF file attached in the file form-data field.
- You should receive a JSON response with the extracted text.
5. Writing Unit Tests
PdfDocumentServiceTest.java
package com.example.pdfreader.service;
import org.junit.jupiter.api.Test;
import org.springframework.ai.pdf.PdfDocumentReader;
import org.springframework.mock.web.MockMultipartFile;
import java.io.IOException;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.Mockito.*;
/**
 * Unit test for {@link PdfDocumentService} that replaces the PDF reader with a Mockito mock.
 */
class PdfDocumentServiceTest {

    @Test
    void testExtractText() throws IOException {
        // The bytes are not a real PDF; the reader is mocked, so they are never parsed.
        MockMultipartFile upload =
                new MockMultipartFile("file", "test.pdf", "application/pdf", "Dummy content".getBytes());

        // Any argument passed to read(...) yields a single stubbed document.
        PdfDocumentReader readerStub = mock(PdfDocumentReader.class);
        when(readerStub.read(any()))
                .thenReturn(List.of(new org.springframework.ai.document.Document("Extracted text")));

        PdfDocumentService underTest = new PdfDocumentService(readerStub);
        List<String> extracted = underTest.extractText(upload);

        assertThat(extracted).contains("Extracted text");
    }
}
PdfControllerTest.java
package com.example.pdfreader.controller;
import com.example.pdfreader.service.PdfDocumentService;
import org.junit.jupiter.api.Test;
import org.springframework.http.MediaType;
import org.springframework.mock.web.MockMultipartFile;
import org.springframework.test.web.servlet.MockMvc;
import org.springframework.test.web.servlet.setup.MockMvcBuilders;
import java.util.List;
import static org.mockito.Mockito.*;
import static org.springframework.test.web.servlet.request.MockMvcRequestBuilders.multipart;
import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.*;
/**
 * Controller test that drives {@link PdfController} through a standalone MockMvc instance with a
 * mocked {@link PdfDocumentService}.
 */
class PdfControllerTest {

    @Test
    void testExtractText() throws Exception {
        // Multipart part named "file", matching the controller's request parameter.
        MockMultipartFile upload =
                new MockMultipartFile("file", "test.pdf", "application/pdf", "Dummy content".getBytes());

        // The stub answers only for this exact upload object.
        PdfDocumentService serviceStub = mock(PdfDocumentService.class);
        when(serviceStub.extractText(upload)).thenReturn(List.of("Extracted text"));

        // Standalone setup: only the controller under test is registered.
        MockMvc mvc = MockMvcBuilders
                .standaloneSetup(new PdfController(serviceStub))
                .build();

        mvc.perform(multipart("/api/pdf/extract-text").file(upload))
                .andExpect(status().isOk())
                .andExpect(jsonPath("$[0]").value("Extracted text"));
    }
}
Conclusion
You've successfully integrated Spring Boot with the Spring AI PDF Document Reader, which uses Apache PDFBox!
Now, your application:
✅ Extracts text from PDFs
✅ Provides a REST API to upload PDFs
✅ Supports unit testing for service and controller
Integrating Spring Boot with Spring AI PDF Document Reader
Build an AI-powered PDF document reader using Spring Boot and Apache PDFBox for intelligent text extraction.
Clone on GitHub