'
' Copyright (c) 2001-2025 by Apryse Software Inc. All Rights Reserved.
'

Imports pdftron
Imports pdftron.Common
Imports pdftron.PDF
Imports pdftron.Filters

' The Data Extraction suite is an optional PDFNet add-on collection that can be used to
' extract various types of data from PDF documents.
' The Apryse SDK Data Extraction suite can be downloaded from http://www.pdftron.com/

Module DataExtractionTestVB
	' Holds the PDFNetLoader instance obtained in the module constructor below.
	Dim pdfNetLoader As PDFNetLoader
	Sub New()
		pdfNetLoader = pdftron.PDFNetLoader.Instance()
	End Sub

	' Relative paths to the folders containing the test input files and receiving the output files.
	Private Const input_path As String = "../../../../TestFiles/"
	Private Const output_path As String = "../../../../TestFiles/Output/"

	' Entry point: initializes PDFNet, runs every Data Extraction sample in turn and
	' always terminates PDFNet afterwards.
	Sub Main()
		PDFNet.Initialize(PDFTronLicense.Key)
		Try
			PDFNet.AddResourceSearchPath("../../../../../Lib/")

			TestTabularData()
			TestDocumentStructure()
			TestFormFields()
			TestGenericKeyValue()
			TestDocClassifier()
		Catch e As System.Exception
			' The samples only handle PDFNetException themselves; anything else
			' (for example a failure while writing a JSON file) is reported here
			' so that the Finally block below is guaranteed to run.
			Console.WriteLine("Unexpected error: " & e.ToString())
			System.Environment.ExitCode = 1
		Finally
			PDFNet.Terminate()
		End Try
	End Sub


	' Returns True when the given Data Extraction engine is available.
	' Otherwise prints the "module not available" notice, naming the add-on
	' with moduleName, and returns False.
	Private Function IsEngineAvailable(ByVal engine As DataExtractionModule.DataExtractionEngine, ByVal moduleName As String) As Boolean
		If DataExtractionModule.IsModuleAvailable(engine) Then
			Return True
		End If

		Console.WriteLine()
		Console.WriteLine("Unable to run Data Extraction: Apryse SDK " & moduleName & " module not available.")
		Console.WriteLine("---------------------------------------------------------------")
		Console.WriteLine("The Data Extraction suite is an optional add-on, available for download")
		Console.WriteLine("at http://www.pdftron.com/. If you have already downloaded this")
		Console.WriteLine("module, ensure that the SDK is able to find the required files")
		Console.WriteLine("using the PDFNet.AddResourceSearchPath() function.")
		Console.WriteLine()
		Return False
	End Function


	' The following sample illustrates how to extract tables from PDF documents.
	Sub TestTabularData()
		' Test if the add-on is installed
		If Not IsEngineAvailable(DataExtractionModule.DataExtractionEngine.e_tabular, "Tabular Data") Then
			Return
		End If

		Try
			' Extract tabular data as a JSON file
			DataExtractionModule.ExtractData(input_path & "table.pdf", output_path & "table.json", DataExtractionModule.DataExtractionEngine.e_tabular)

			' Extract tabular data as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "financial.pdf", DataExtractionModule.DataExtractionEngine.e_tabular)
			System.IO.File.WriteAllText(output_path & "financial.json", json)

			' Extract tabular data as an XLSX file
			DataExtractionModule.ExtractToXLSX(input_path & "table.pdf", output_path & "table.xlsx")

			' Extract tabular data as an XLSX stream (also known as filter).
			' The filter is released deterministically once its contents are on disk.
			Using output_xlsx_stream As New MemoryFilter(0, False)
				DataExtractionModule.ExtractToXLSX(input_path & "financial.pdf", output_xlsx_stream)
				' Switch the memory filter to input mode (per the method name) before writing it out.
				output_xlsx_stream.SetAsInputFilter()
				output_xlsx_stream.WriteToFile(output_path & "financial.xlsx", False)
			End Using

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub


	' The following sample illustrates how to extract document structure from PDF documents.
	Sub TestDocumentStructure()
		' Test if the add-on is installed
		If Not IsEngineAvailable(DataExtractionModule.DataExtractionEngine.e_doc_structure, "Structured Output") Then
			Return
		End If

		Try
			' Extract document structure as a JSON file
			DataExtractionModule.ExtractData(input_path & "paragraphs_and_tables.pdf", output_path & "paragraphs_and_tables.json", DataExtractionModule.DataExtractionEngine.e_doc_structure)

			' Extract document structure as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "tagged.pdf", DataExtractionModule.DataExtractionEngine.e_doc_structure)
			System.IO.File.WriteAllText(output_path & "tagged.json", json)

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub


	' The following sample illustrates how to extract form fields from PDF documents.
	Sub TestFormFields()
		' Test if the add-on is installed
		If Not IsEngineAvailable(DataExtractionModule.DataExtractionEngine.e_form, "AIFormFieldExtractor") Then
			Return
		End If

		Try
			' Extract form fields as a JSON file
			DataExtractionModule.ExtractData(input_path & "formfields-scanned.pdf", output_path & "formfields-scanned.json", DataExtractionModule.DataExtractionEngine.e_form)

			' Extract form fields as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "formfields.pdf", DataExtractionModule.DataExtractionEngine.e_form)
			System.IO.File.WriteAllText(output_path & "formfields.json", json)

			' Detect and add form fields to a PDF document.
			' PDF document already has form fields, and this sample will update to new found fields.
			Using doc As New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
				DataExtractionModule.DetectAndAddFormFieldsToPDF(doc)
				doc.Save(output_path & "formfields-scanned-fields-new.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
			End Using

			' Detect and add form fields to a PDF document.
			' PDF document already has form fields, and this sample will keep the original fields.
			Using doc As New PDFDoc(input_path & "formfields-scanned-withfields.pdf")
				Dim options As New DataExtractionOptions()
				options.SetOverlappingFormFieldBehavior("KeepOld")
				DataExtractionModule.DetectAndAddFormFieldsToPDF(doc, options)
				doc.Save(output_path & "formfields-scanned-fields-old.pdf", SDF.SDFDoc.SaveOptions.e_linearized)
			End Using

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try

	End Sub

	' The following sample illustrates how to extract key-value pairs from PDF documents.
	Sub TestGenericKeyValue()
		' Test if the add-on is installed
		If Not IsEngineAvailable(DataExtractionModule.DataExtractionEngine.e_generic_key_value, "AIPageObjectExtractor") Then
			Return
		End If

		Try
			' Simple example: Extract Keys & Values as a JSON file
			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value)

			' Example with customized options:
			' Extract Keys & Values from pages 2-4, excluding ads
			Dim options As New DataExtractionOptions()
			options.SetPages("2-4")

			Dim p2ExclusionZones As New RectCollection()
			' Exclude the ad on page 2
			' These coordinates are in PDF user space, with the origin at the bottom left corner of the page
			' Coordinates rotate with the page, if it has rotation applied.
			p2ExclusionZones.AddRect(166, 47, 562, 222)
			options.AddExclusionZonesForPage(p2ExclusionZones, 2)

			Dim p4InclusionZones As New RectCollection()
			Dim p4ExclusionZones As New RectCollection()
			' Only include the article text for page 4, exclude ads and headings
			p4InclusionZones.AddRect(30, 432, 562, 684)
			p4ExclusionZones.AddRect(30, 657, 295, 684)
			options.AddInclusionZonesForPage(p4InclusionZones, 4)
			options.AddExclusionZonesForPage(p4ExclusionZones, 4)

			DataExtractionModule.ExtractData(input_path & "newsletter.pdf", output_path & "newsletter_key_val_with_zones.json", DataExtractionModule.DataExtractionEngine.e_generic_key_value, options)
		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try

	End Sub

	' The following sample illustrates how to extract document classes from PDF documents.
	Sub TestDocClassifier()
		' Test if the add-on is installed
		If Not IsEngineAvailable(DataExtractionModule.DataExtractionEngine.e_doc_classification, "AIPageObjectExtractor") Then
			Return
		End If

		Try
			' Simple example: classify pages as a JSON file
			DataExtractionModule.ExtractData(input_path & "Invoice.pdf", output_path & "Invoice_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification)

			' Classify pages as a JSON string
			Dim json As String = DataExtractionModule.ExtractData(input_path & "Scientific_Publication.pdf", DataExtractionModule.DataExtractionEngine.e_doc_classification)
			System.IO.File.WriteAllText(output_path & "Scientific_Publication_Classified.json", json)

			' Example with customized options:
			Dim options As New DataExtractionOptions()
			' Classes that don't meet the minimum confidence threshold of 70% will not be listed in the output JSON
			options.SetMinimumConfidenceThreshold(0.7)
			DataExtractionModule.ExtractData(input_path & "Email.pdf", output_path & "Email_Classified.json", DataExtractionModule.DataExtractionEngine.e_doc_classification, options)

		Catch e As PDFNetException
			Console.WriteLine(e.Message)
		End Try
	End Sub

End Module
