implementing extracting tables from pdf by pdfbox
This commit is contained in:
parent
d6afb07533
commit
c8ac1f7029
5 changed files with 639 additions and 26 deletions
|
@ -102,14 +102,6 @@ dependencies {
|
||||||
developmentOnly("org.springframework.boot:spring-boot-devtools")
|
developmentOnly("org.springframework.boot:spring-boot-devtools")
|
||||||
compileOnly 'org.projectlombok:lombok:1.18.28'
|
compileOnly 'org.projectlombok:lombok:1.18.28'
|
||||||
annotationProcessor 'org.projectlombok:lombok:1.18.28'
|
annotationProcessor 'org.projectlombok:lombok:1.18.28'
|
||||||
|
|
||||||
//// https://mvnrepository.com/artifact/technology.tabula/tabula
|
|
||||||
// implementation group: 'technology.tabula', name: 'tabula', version: '1.0.5'
|
|
||||||
|
|
||||||
|
|
||||||
// implementation files('/Users/artempetrenko/Java/Stirling-PDF/tabula-1.0.5-jar-with-dependencies.jar')
|
|
||||||
implementation fileTree(include: ['tabula-1.0.5-jar-with-dependencies.jar'],dir: 'libs')
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,120 @@
|
||||||
|
package stirling.software.SPDF.controller.api;
|
||||||
|
|
||||||
|
import com.opencsv.CSVWriter;
|
||||||
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.http.ContentDisposition;
|
||||||
|
import org.springframework.http.HttpHeaders;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
|
import org.springframework.http.ResponseEntity;
|
||||||
|
import org.springframework.web.bind.annotation.*;
|
||||||
|
import stirling.software.SPDF.controller.api.strippers.PDFTableStripper;
|
||||||
|
import stirling.software.SPDF.model.api.extract.PDFFilePage;
|
||||||
|
|
||||||
|
import java.awt.*;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@RequestMapping("/api/v1/extract/pdf-to-csv")
|
||||||
|
@Tag(name = "General", description = "General APIs")
|
||||||
|
public class ExtractController {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(CropController.class);
|
||||||
|
|
||||||
|
@PostMapping(consumes = "multipart/form-data")
|
||||||
|
@Operation(summary = "Extracts a PDF document to csv", description = "This operation takes an input PDF file and returns CSV file of whole page. Input:PDF Output:CSV Type:SISO")
|
||||||
|
public ResponseEntity<String> PdfToCsv(@ModelAttribute PDFFilePage form)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
ArrayList<String> tableData = new ArrayList<>();
|
||||||
|
int columnsCount = 0;
|
||||||
|
|
||||||
|
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(form.getFileInput().getBytes()))) {
|
||||||
|
final double res = 72; // PDF units are at 72 DPI
|
||||||
|
PDFTableStripper stripper = new PDFTableStripper();
|
||||||
|
stripper.setSortByPosition(true);
|
||||||
|
stripper.setRegion(new Rectangle((int) Math.round(1.0 * res), (int) Math.round(1 * res), (int) Math.round(6 * res), (int) Math.round(9.0 * res)));
|
||||||
|
|
||||||
|
PDPage pdPage = document.getPage(form.getPageId() - 1);
|
||||||
|
stripper.extractTable(pdPage);
|
||||||
|
columnsCount = stripper.getColumns();
|
||||||
|
|
||||||
|
for (int c = 0; c < columnsCount; ++c) {
|
||||||
|
for(int r=0; r<stripper.getRows(); ++r) {
|
||||||
|
tableData.add(stripper.getText(r, c));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<String> notEmptyColumns = new ArrayList<>();
|
||||||
|
|
||||||
|
for (String item: tableData) {
|
||||||
|
if(!item.trim().isEmpty()){
|
||||||
|
notEmptyColumns.add(item);
|
||||||
|
}else{
|
||||||
|
columnsCount--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> fullTable = notEmptyColumns.stream().map((entity)->
|
||||||
|
entity.replace('\n',' ').replace('\r',' ').trim().replaceAll("\\s{2,}", "|")).toList();
|
||||||
|
|
||||||
|
int rowsCount = fullTable.get(0).split("\\|").length;
|
||||||
|
|
||||||
|
ArrayList<String> headersList = getTableHeaders(columnsCount,fullTable);
|
||||||
|
ArrayList<String> recordList = getRecordsList(rowsCount,fullTable);
|
||||||
|
|
||||||
|
|
||||||
|
StringWriter writer = new StringWriter();
|
||||||
|
try (CSVWriter csvWriter = new CSVWriter(writer)) {
|
||||||
|
csvWriter.writeNext(headersList.toArray(new String[0]));
|
||||||
|
for (String record : recordList) {
|
||||||
|
csvWriter.writeNext(record.split("\\|"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
HttpHeaders headers = new HttpHeaders();
|
||||||
|
headers.setContentDisposition(ContentDisposition.builder("attachment").filename(form.getFileInput().getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted.csv").build());
|
||||||
|
headers.setContentType(MediaType.parseMediaType("text/csv"));
|
||||||
|
|
||||||
|
return ResponseEntity.ok()
|
||||||
|
.headers(headers)
|
||||||
|
.body(writer.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
private ArrayList<String> getRecordsList( int rowsCounts ,List<String> items){
|
||||||
|
ArrayList<String> recordsList = new ArrayList<>();
|
||||||
|
|
||||||
|
for (int b=1; b<rowsCounts;b++) {
|
||||||
|
StringBuilder strbldr = new StringBuilder();
|
||||||
|
|
||||||
|
for (int i=0;i<items.size();i++){
|
||||||
|
String[] parts = items.get(i).split("\\|");
|
||||||
|
strbldr.append(parts[b]);
|
||||||
|
if (i!= items.size()-1){
|
||||||
|
strbldr.append("|");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
recordsList.add(strbldr.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
return recordsList;
|
||||||
|
}
|
||||||
|
private ArrayList<String> getTableHeaders(int columnsCount, List<String> items){
|
||||||
|
ArrayList<String> resultList = new ArrayList<>();
|
||||||
|
for (int i=0;i<columnsCount;i++){
|
||||||
|
String[] parts = items.get(i).split("\\|");
|
||||||
|
resultList.add(parts[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
return resultList;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,354 @@
|
||||||
|
package stirling.software.SPDF.controller.api.strippers;
|
||||||
|
|
||||||
|
import org.apache.fontbox.util.BoundingBox;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||||
|
import org.apache.pdfbox.pdmodel.font.PDType3Font;
|
||||||
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
|
import org.apache.pdfbox.text.PDFTextStripperByArea;
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import java.awt.*;
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Writer;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* Class to extract tabular data from a PDF.
|
||||||
|
* Works by making a first pass of the page to group all nearby text items
|
||||||
|
* together, and then inferring a 2D grid from these regions. Each table cell
|
||||||
|
* is then extracted using a PDFTextStripperByArea object.
|
||||||
|
*
|
||||||
|
* Works best when
|
||||||
|
* headers are included in the detected region, to ensure representative text
|
||||||
|
* in every column.
|
||||||
|
*
|
||||||
|
* Based upon DrawPrintTextLocations PDFBox example
|
||||||
|
* (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java)
|
||||||
|
*
|
||||||
|
* @author Beldaz
|
||||||
|
*/
|
||||||
|
public class PDFTableStripper extends PDFTextStripper
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This will print the documents data, for each table cell.
|
||||||
|
*
|
||||||
|
* @param args The command line arguments.
|
||||||
|
*
|
||||||
|
* @throws IOException If there is an error parsing the document.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* Used in methods derived from DrawPrintTextLocations
|
||||||
|
*/
|
||||||
|
private AffineTransform flipAT;
|
||||||
|
private AffineTransform rotateAT;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Regions updated by calls to writeString
|
||||||
|
*/
|
||||||
|
private Set<Rectangle2D> boxes;
|
||||||
|
|
||||||
|
// Border to allow when finding intersections
|
||||||
|
private double dx = 1.0; // This value works for me, feel free to tweak (or add setter)
|
||||||
|
private double dy = 0.000; // Rows of text tend to overlap, so need to extend
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Region in which to find table (otherwise whole page)
|
||||||
|
*/
|
||||||
|
private Rectangle2D regionArea;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of rows in inferred table
|
||||||
|
*/
|
||||||
|
private int nRows=0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Number of columns in inferred table
|
||||||
|
*/
|
||||||
|
private int nCols=0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is the object that does the text extraction
|
||||||
|
*/
|
||||||
|
private PDFTextStripperByArea regionStripper;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 1D intervals - used for calculateTableRegions()
|
||||||
|
* @author Beldaz
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public static class Interval {
|
||||||
|
double start;
|
||||||
|
double end;
|
||||||
|
public Interval(double start, double end) {
|
||||||
|
this.start=start; this.end = end;
|
||||||
|
}
|
||||||
|
public void add(Interval col) {
|
||||||
|
if(col.start<start)
|
||||||
|
start = col.start;
|
||||||
|
if(col.end>end)
|
||||||
|
end = col.end;
|
||||||
|
}
|
||||||
|
public static void addTo(Interval x, LinkedList<Interval> columns) {
|
||||||
|
int p = 0;
|
||||||
|
Iterator<Interval> it = columns.iterator();
|
||||||
|
// Find where x should go
|
||||||
|
while(it.hasNext()) {
|
||||||
|
Interval col = it.next();
|
||||||
|
if(x.end>=col.start) {
|
||||||
|
if(x.start<=col.end) { // overlaps
|
||||||
|
x.add(col);
|
||||||
|
it.remove();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
while(it.hasNext()) {
|
||||||
|
Interval col = it.next();
|
||||||
|
if(x.start>col.end)
|
||||||
|
break;
|
||||||
|
x.add(col);
|
||||||
|
it.remove();
|
||||||
|
}
|
||||||
|
columns.add(p, x);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiate a new PDFTableStripper object.
|
||||||
|
*
|
||||||
|
* @param document
|
||||||
|
* @throws IOException If there is an error loading the properties.
|
||||||
|
*/
|
||||||
|
public PDFTableStripper() throws IOException
|
||||||
|
{
|
||||||
|
super.setShouldSeparateByBeads(false);
|
||||||
|
regionStripper = new PDFTextStripperByArea();
|
||||||
|
regionStripper.setSortByPosition( true );
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Define the region to group text by.
|
||||||
|
*
|
||||||
|
* @param rect The rectangle area to retrieve the text from.
|
||||||
|
*/
|
||||||
|
public void setRegion(Rectangle2D rect )
|
||||||
|
{
|
||||||
|
regionArea = rect;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getRows()
|
||||||
|
{
|
||||||
|
return nRows;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getColumns()
|
||||||
|
{
|
||||||
|
return nCols;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the text for the region, this should be called after extractTable().
|
||||||
|
*
|
||||||
|
* @return The text that was identified in that region.
|
||||||
|
*/
|
||||||
|
public String getText(int row, int col)
|
||||||
|
{
|
||||||
|
return regionStripper.getTextForRegion("el"+col+"x"+row);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void extractTable(PDPage pdPage) throws IOException
|
||||||
|
{
|
||||||
|
setStartPage(getCurrentPageNo());
|
||||||
|
setEndPage(getCurrentPageNo());
|
||||||
|
|
||||||
|
boxes = new HashSet<Rectangle2D>();
|
||||||
|
// flip y-axis
|
||||||
|
flipAT = new AffineTransform();
|
||||||
|
flipAT.translate(0, pdPage.getBBox().getHeight());
|
||||||
|
flipAT.scale(1, -1);
|
||||||
|
|
||||||
|
// page may be rotated
|
||||||
|
rotateAT = new AffineTransform();
|
||||||
|
int rotation = pdPage.getRotation();
|
||||||
|
if (rotation != 0)
|
||||||
|
{
|
||||||
|
PDRectangle mediaBox = pdPage.getMediaBox();
|
||||||
|
switch (rotation)
|
||||||
|
{
|
||||||
|
case 90:
|
||||||
|
rotateAT.translate(mediaBox.getHeight(), 0);
|
||||||
|
break;
|
||||||
|
case 270:
|
||||||
|
rotateAT.translate(0, mediaBox.getWidth());
|
||||||
|
break;
|
||||||
|
case 180:
|
||||||
|
rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
rotateAT.rotate(Math.toRadians(rotation));
|
||||||
|
}
|
||||||
|
// Trigger processing of the document so that writeString is called.
|
||||||
|
try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
|
||||||
|
super.output = dummy;
|
||||||
|
super.processPage(pdPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
Rectangle2D[][] regions = calculateTableRegions();
|
||||||
|
|
||||||
|
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " regions");
|
||||||
|
for(int i=0; i<nCols; ++i) {
|
||||||
|
for(int j=0; j<nRows; ++j) {
|
||||||
|
final Rectangle2D region = regions[i][j];
|
||||||
|
regionStripper.addRegion("el"+i+"x"+j, region);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
regionStripper.extractRegions(pdPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Infer a rectangular grid of regions from the boxes field.
|
||||||
|
*
|
||||||
|
* @return 2D array of table regions (as Rectangle2D objects). Note that
|
||||||
|
* some of these regions may have no content.
|
||||||
|
*/
|
||||||
|
private Rectangle2D[][] calculateTableRegions() {
|
||||||
|
|
||||||
|
// Build up a list of all table regions, based upon the populated
|
||||||
|
// regions of boxes field. Treats the horizontal and vertical extents
|
||||||
|
// of each box as distinct
|
||||||
|
LinkedList<Interval> columns = new LinkedList<Interval>();
|
||||||
|
LinkedList<Interval> rows = new LinkedList<Interval>();
|
||||||
|
|
||||||
|
for(Rectangle2D box: boxes) {
|
||||||
|
Interval x = new Interval(box.getMinX(), box.getMaxX());
|
||||||
|
Interval y = new Interval(box.getMinY(), box.getMaxY());
|
||||||
|
|
||||||
|
Interval.addTo(x, columns);
|
||||||
|
Interval.addTo(y, rows);
|
||||||
|
}
|
||||||
|
|
||||||
|
nRows = rows.size();
|
||||||
|
nCols = columns.size();
|
||||||
|
Rectangle2D[][] regions = new Rectangle2D[nCols][nRows];
|
||||||
|
int i=0;
|
||||||
|
// Label regions from top left, rather than the transformed orientation
|
||||||
|
for(Interval column: columns) {
|
||||||
|
int j=0;
|
||||||
|
for(Interval row: rows) {
|
||||||
|
regions[nCols-i-1][nRows-j-1] = new Rectangle2D.Double(column.start, row.start, column.end - column.start, row.end - row.start);
|
||||||
|
++j;
|
||||||
|
}
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
|
||||||
|
return regions;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Register each character's bounding box, updating boxes field to maintain
|
||||||
|
* a list of all distinct groups of characters.
|
||||||
|
*
|
||||||
|
* Overrides the default functionality of PDFTextStripper.
|
||||||
|
* Most of this is taken from DrawPrintTextLocations.java, with extra steps
|
||||||
|
* at end of main loop
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
|
||||||
|
{
|
||||||
|
for (TextPosition text : textPositions)
|
||||||
|
{
|
||||||
|
// glyph space -> user space
|
||||||
|
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
|
||||||
|
AffineTransform at = text.getTextMatrix().createAffineTransform();
|
||||||
|
PDFont font = text.getFont();
|
||||||
|
BoundingBox bbox = font.getBoundingBox();
|
||||||
|
|
||||||
|
// advance width, bbox height (glyph space)
|
||||||
|
float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
|
||||||
|
Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
|
||||||
|
|
||||||
|
if (font instanceof PDType3Font)
|
||||||
|
{
|
||||||
|
// bbox and font matrix are unscaled
|
||||||
|
at.concatenate(font.getFontMatrix().createAffineTransform());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// bbox and font matrix are already scaled to 1000
|
||||||
|
at.scale(1/1000f, 1/1000f);
|
||||||
|
}
|
||||||
|
Shape s = at.createTransformedShape(rect);
|
||||||
|
s = flipAT.createTransformedShape(s);
|
||||||
|
s = rotateAT.createTransformedShape(s);
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Merge character's bounding box with boxes field
|
||||||
|
//
|
||||||
|
Rectangle2D bounds = s.getBounds2D();
|
||||||
|
// Pad sides to detect almost touching boxes
|
||||||
|
Rectangle2D hitbox = bounds.getBounds2D();
|
||||||
|
hitbox.add(bounds.getMinX() - dx , bounds.getMinY() - dy);
|
||||||
|
hitbox.add(bounds.getMaxX() + dx , bounds.getMaxY() + dy);
|
||||||
|
|
||||||
|
// Find all overlapping boxes
|
||||||
|
List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>();
|
||||||
|
for(Rectangle2D box: boxes) {
|
||||||
|
if(box.intersects(hitbox)) {
|
||||||
|
intersectList.add(box);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Combine all touching boxes and update
|
||||||
|
// (NOTE: Potentially this could leave some overlapping boxes un-merged,
|
||||||
|
// but it's sufficient for now and get's fixed up in calculateTableRegions)
|
||||||
|
for(Rectangle2D box: intersectList) {
|
||||||
|
bounds.add(box);
|
||||||
|
boxes.remove(box);
|
||||||
|
}
|
||||||
|
boxes.add(bounds);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This method does nothing in this derived class, because beads and regions are incompatible. Beads are
|
||||||
|
* ignored when stripping by area.
|
||||||
|
*
|
||||||
|
* @param aShouldSeparateByBeads The new grouping of beads.
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adapted from PDFTextStripperByArea
|
||||||
|
* {@inheritDoc}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected void processTextPosition( TextPosition text )
|
||||||
|
{
|
||||||
|
if(regionArea!=null && !regionArea.contains( text.getX(), text.getY() ) ) {
|
||||||
|
// skip character
|
||||||
|
} else {
|
||||||
|
super.processTextPosition( text );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,18 @@
|
||||||
|
package stirling.software.SPDF.model.api.extract;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.media.Schema;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import stirling.software.SPDF.model.api.PDFFile;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@EqualsAndHashCode(callSuper=true)
|
||||||
|
public class PDFFilePage extends PDFFile {
|
||||||
|
|
||||||
|
|
||||||
|
@Schema(description = "Number of chosen page", type = "number")
|
||||||
|
private int pageId;
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -1,29 +1,158 @@
|
||||||
<!DOCTYPE html>
|
<!DOCTYPE html>
|
||||||
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||||
|
|
||||||
<th:block th:insert="~{fragments/common :: head(title=#{PDFToXML.title})}"></th:block>
|
<th:block th:insert="~{fragments/common :: head(title=#{PDFToCSV.title})}"></th:block>
|
||||||
<body>
|
|
||||||
<th:block th:insert="~{fragments/common :: game}"></th:block>
|
|
||||||
<div id="page-container">
|
|
||||||
<div id="content-wrap">
|
|
||||||
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
|
||||||
<br> <br>
|
|
||||||
<div class="container">
|
|
||||||
<div class="row justify-content-center">
|
|
||||||
<div class="col-md-6">
|
|
||||||
<h2 th:text="#{PDFToCSV.header}"></h2>
|
|
||||||
<form method="post" enctype="multipart/form-data" th:action="@{api/v1/convert/pdf/csv}">
|
|
||||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
|
||||||
<br>
|
|
||||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{PDFToCSV.submit}"></button>
|
|
||||||
|
|
||||||
</form>
|
|
||||||
<p class="mt-3" th:text="#{PDFToCSV.credit}"></p>
|
<body>
|
||||||
|
<div id="page-container">
|
||||||
|
<div id="content-wrap">
|
||||||
|
|
||||||
|
<div class="container">
|
||||||
|
<div class="row justify-content-center">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<h2 th:text="#{PDFToCSV.header}"></h2>
|
||||||
|
<form id="PDFToCSVForm" th:action="@{api/v1/extract/pdf-to-csv}" method="post" enctype="multipart/form-data">
|
||||||
|
<input id="pageId" type="hidden" name="pageId" />
|
||||||
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||||
|
<button type="submit" class="btn btn-primary" th:text="#{PDFToCSV.submit}"></button>
|
||||||
|
</form>
|
||||||
|
<p id="instruction-text" style="margin: 0; display: none">Choose page to extract table</p>
|
||||||
|
|
||||||
|
<div style="position: relative; display: inline-block;">
|
||||||
|
<div>
|
||||||
|
|
||||||
|
<div style="display:none ;margin: 3px;position: absolute;top: 0;width: 120px;justify-content:space-between;z-index: 10" id="pagination-button-container">
|
||||||
|
<button id='previous-page-btn' style='opacity: 80% ; width: 50px; height: 30px; display: flex;align-items: center;justify-content: center; background: grey; color: #ffffff; ;border: none;outline: none; border-radius: 4px;'> < </button>
|
||||||
|
<button id='next-page-btn' style='opacity: 80% ; width: 50px; height: 30px; display: flex;align-items: center;justify-content: center; background: grey; color: #ffffff; ;border: none;outline: none; border-radius: 4px;'> > </button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<canvas id="crop-pdf-canvas" style="position: absolute; top: 0; left: 0; z-index: 1;"></canvas>
|
||||||
|
</div>
|
||||||
|
<canvas id="overlayCanvas" style="position: absolute; top: 0; left: 0; z-index: 2;"></canvas>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
|
||||||
|
// Canvas showing the rendered PDF page, and the transparent canvas stacked on top of it.
const pdfCanvas = document.getElementById('crop-pdf-canvas');
const overlayCanvas = document.getElementById('overlayCanvas');
const context = pdfCanvas.getContext('2d');

// Pagination buttons: previous page / next page.
const btn1Object = document.getElementById('previous-page-btn');
const btn2Object = document.getElementById('next-page-btn');

// Keep the overlay the same size as the page canvas.
overlayCanvas.width = pdfCanvas.width;
overlayCanvas.height = pdfCanvas.height;

const fileInput = document.getElementById('fileInput-input');
// Hidden form field that carries the selected page number to the server.
const pageId = document.getElementById('pageId');

// Currently selected File and the pdf.js document loaded from it.
let file;
let pdfDoc = null;

// Page shown in the preview (first page is 1) and page count of the loaded document.
let currentPage = 1;
let totalPages = 0;

// Not read anywhere in this script.
let startX = 0;
let startY = 0;
let rectWidth = 0;
let rectHeight = 0;
|
||||||
|
|
||||||
|
// Step back one page. The document is already parsed (pdfDoc), so it is rendered
// directly instead of re-reading and re-parsing the whole file on every click.
btn1Object.addEventListener('click', function (e) {
    // Nothing to show before a document is loaded, and no page before the first.
    if (!pdfDoc || currentPage <= 1) {
        return;
    }

    currentPage = currentPage - 1;
    pageId.value = currentPage;
    renderPage(currentPage);
});
|
||||||
|
|
||||||
|
// Step forward one page. The document is already parsed (pdfDoc), so it is rendered
// directly instead of re-reading and re-parsing the whole file on every click.
btn2Object.addEventListener('click', function (e) {
    // "<" rather than "!==" so the page can never run past the end, e.g. when the
    // button is clicked while the document is still loading and totalPages is 0.
    if (!pdfDoc || currentPage >= totalPages) {
        return;
    }

    currentPage = currentPage + 1;
    pageId.value = currentPage;
    renderPage(currentPage);
});
|
||||||
|
|
||||||
|
// Load the chosen PDF, show its first page and reveal the pagination controls.
fileInput.addEventListener('change', function (e) {

    file = e.target.files[0];
    // files[0] is undefined when the file dialog is cancelled.
    if (!file || file.type !== 'application/pdf') {
        return;
    }

    // A new document always starts at its first page; publish that to the form right
    // away so a submit before the preview finishes still carries a valid page number.
    currentPage = 1;
    pageId.value = currentPage;

    let reader = new FileReader();
    reader.onload = function (ev) {
        let typedArray = new Uint8Array(reader.result);
        pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs/pdf.worker.js'
        pdfjsLib.getDocument(typedArray).promise.then(function (pdf) {
            pdfDoc = pdf;
            totalPages = pdf.numPages;
            renderPage(currentPage);
        }).catch(function (error) {
            // Surface unreadable or corrupt PDFs instead of failing silently.
            console.error('Unable to load PDF for preview', error);
        });
    };
    reader.readAsArrayBuffer(file);

    document.getElementById("pagination-button-container").style.display="flex";
    document.getElementById("instruction-text").style.display="block";
});
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Draws one page of the loaded document onto the PDF canvas at scale 1.0 and resizes
 * both canvases to the page's viewport.
 *
 * @param {number} pageNumber page to draw, passed straight to pdfDoc.getPage
 */
function renderPage(pageNumber) {
    pdfDoc.getPage(pageNumber).then(function (page) {
        const viewport = page.getViewport({ scale: 1.0 });

        // Size the page canvas first, then the overlay, so both match the viewport.
        [pdfCanvas, overlayCanvas].forEach(function (canvas) {
            canvas.width = viewport.width;
            canvas.height = viewport.height;
        });

        page.render({ canvasContext: context, viewport: viewport });
        pdfCanvas.classList.add("shadow-canvas");
    });
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
</script>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
|
||||||
</div>
|
</div>
|
||||||
|
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||||
|
</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
Loading…
Reference in a new issue