How to Identify OCR Needs in a Folder of PDFs

By Apryse | 2025 Jun 25

5 min

Here’s the code sample:

using pdftron; 
using pdftron.PDF; 
using pdftron.SDF; 

namespace ConsoleApp1 
{ 
    /// <summary>
    /// Console sample that scans a folder of PDFs, measures how much of each
    /// document's page content is non-text, and runs OCR on the documents that
    /// exceed a threshold, saving the result to an output folder.
    /// </summary>
    internal class Program
    {
        /// <summary>
        /// A document is sent to OCR when more than this percentage of its page
        /// elements are something other than text.
        /// </summary>
        private const double NonTextThresholdPercent = 10;

        /// <summary>
        /// Entry point: initializes the SDK, then checks every "*.pdf" file in the
        /// input directory and OCRs the ones that are mostly non-text.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            PDFNet.Initialize(PDFTronLicense.License);
            PDFNet.AddResourceSearchPath(PDFTronLicense.ModulePath);
            string inputDir = @"path_to_input_dir";
            string outputDir = @"path_to_output_dir";

            foreach (string filePath in Directory.GetFiles(inputDir, "*.pdf"))
            {
                using (PDFDoc doc = new PDFDoc(filePath))
                {
                    doc.InitSecurityHandler();

                    if (GetNonTextPercentage(doc) > NonTextThresholdPercent)
                    {
                        // Make sure the destination exists before spending time on OCR;
                        // CreateDirectory is a no-op when the folder is already there.
                        Directory.CreateDirectory(outputDir);

                        // No OCR options are supplied (null).
                        OCRModule.ProcessPDF(doc, null);
                        string outputFilePath = Path.Combine(outputDir, Path.GetFileName(filePath));
                        doc.Save(outputFilePath, SDFDoc.SaveOptions.e_linearized);
                        Console.WriteLine($"OCR performed on {filePath} and saved to {outputFilePath}");
                    }
                    else
                    {
                        Console.WriteLine($"No OCR needed for {filePath}");
                    }
                }
            }
        }

        /// <summary>
        /// Walks every page of <paramref name="doc"/> and returns the percentage
        /// (0-100) of page elements that are not text elements.
        /// </summary>
        /// <param name="doc">An opened document whose security handler has been initialized.</param>
        /// <returns>
        /// The non-text share of all elements read, or 0 when the document has no
        /// elements at all (so an empty document is never sent to OCR).
        /// </returns>
        private static double GetNonTextPercentage(PDFDoc doc)
        {
            int textCount = 0;
            int nonTextCount = 0;

            // The reader is disposed deterministically rather than left for the
            // finalizer, since one is created for every file in the batch.
            using (ElementReader reader = new ElementReader())
            {
                int pageCount = doc.GetPageCount();
                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    reader.Begin(doc.GetPage(pageNumber));

                    // NOTE(review): every element type other than e_text counts as
                    // non-text, so a form XObject is presumably counted once and any
                    // text inside it is not seen - confirm this matches your documents.
                    Element element;
                    while ((element = reader.Next()) != null)
                    {
                        if (element.GetType() == Element.Type.e_text)
                        {
                            textCount++;
                        }
                        else
                        {
                            nonTextCount++;
                        }
                    }

                    reader.End();
                }
            }

            int totalElements = textCount + nonTextCount;
            if (totalElements == 0)
            {
                // Avoid 0/0 (NaN): nothing on the pages means nothing to OCR.
                return 0;
            }

            return (double)nonTextCount / totalElements * 100;
        }
    }
}

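The sample counts text and non-text content elements on every page of each PDF and runs OCR only on the files where more than 10% of the elements are non-text. Adjust that threshold to suit your documents, and you have a starting point for checking whether a folder of PDF files needs OCR. Simple as that!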

Next Steps

Copied to clipboard

To learn more about Apryse OCR, visit our documentation. If you have any questions or are ready to get started, contact sales or check out the Server SDK trial.

How to Identify OCR Needs in a Folder of PDFs

Here’s the code sample:

Next Steps

Resources

Related Articles

View all blogs

How to Solve Six Common Problems when Getting Started with Apryse WebViewer

Using the API to get more from Spreadsheet Editor

Adding Multiple Digital Signatures with Apryse WebViewer SDK and JavaScript