Java USES poi to read PPT files and poi to read excel and word samples

  • 2020-04-01 03:09:32
  • OfStack

Apache's POI project can be used to process MS Office documents, and there is a.net version of it on codeplex. POI projects can create and maintain Java apis that manipulate various OOXML and OLE2 file formats. Most MS offices are in OLE2 format. POI supports Outlook through the HSMF subproject, Visio through the HDGF subproject, and Publisher through the HPBF subproject.

Simple example of extracting Word using POI:

To introduce two packages, poi-3.7.jat and poi-scratchpad-3.7.ajr.


package msoffice;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Section;
public class Word {
    //Extract everything directly
    public static String readDoc1(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getText();
    }

    //Section, Paragraph and string CharacterRun extraction
    public static void readDoc2(InputStream is) throws IOException {
        HWPFDocument doc=new HWPFDocument(is);
        Range r=doc.getRange();
        for(int x=0;x<r.numSections();x++){
            Section s=r.getSection(x);
            for(int y=0;y<s.numParagraphs();y++){
                Paragraph p=s.getParagraph(y);
                for(int z=0;z<p.numCharacterRuns();z++){
                    CharacterRun run=p.getCharacterRun(z);
                    String text=run.text();
                    System.out.print(text);
                }
            }
        }
    }
    public static void main(String[] args) {
        File file = new File("/home/orisun/1.doc");
        try {
            FileInputStream fin = new FileInputStream(file);
            String cont = readDoc1(fin);
            System.out.println(cont);
            fin.close();
            fin = new FileInputStream(file);
            readDoc2(fin);
            fin.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

PPT sample of POI extraction:


package msoffice;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
public class PPT {
    //Extract the entire slide directly
    public static String readDoc1(InputStream is) throws IOException{
        PowerPointExtractor extractor=new PowerPointExtractor(is);
        return extractor.getText();
    }

    //Slide by slide
    public static void readDoc2(InputStream is) throws IOException{
        SlideShow ss=new SlideShow(new HSLFSlideShow(is));
        Slide[] slides=ss.getSlides();
        for(int i=0;i<slides.length;i++){
            //Read the title of a slide
            String title=slides[i].getTitle();
            System.out.println(" The title :"+title);
            //Read the contents of a slide (including the title)
            TextRun[] runs=slides[i].getTextRuns();
            for(int j=0;j<runs.length;j++){
                System.out.println(runs[j].getText());
            }
        }
    }

    public static void main(String[] args){
        File file = new File("/home/orisun/2.ppt");
        try{
            FileInputStream fin=new FileInputStream(file);
            String cont=readDoc1(fin);
            System.out.println(cont);
            fin.close();
            fin=new FileInputStream(file);
            readDoc2(fin);
            fin.close();
        }catch(IOException e){
            e.printStackTrace();
        }
    }
}

Excel files consist of multiple workbooks, and a Workbook consists of multiple sheets.

Simple example of POI extraction of Excel:


package msoffice;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.ss.usermodel.Row;
public class Excel {
    //Read the entire contents of Excel directly
    public static String readDoc1(InputStream is)throws IOException{
        HSSFWorkbook wb=new HSSFWorkbook(new POIFSFileSystem(is));
        ExcelExtractor extractor=new ExcelExtractor(wb);
        extractor.setFormulasNotResults(false);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    }

    //When reading, refine to sheets, rows, and even cells
    public static double getAvg(InputStream is)throws IOException{
        HSSFWorkbook wb=new HSSFWorkbook(new POIFSFileSystem(is));
        //Get the first sheet
        HSSFSheet sheet=wb.getSheetAt(0);
        double molecule=0.0;
        double denominator=0.0;
        //Go through the sheet in rows
        Iterator<Row> riter=sheet.rowIterator();
        while(riter.hasNext()){
            HSSFRow row=(HSSFRow)riter.next();
            HSSFCell cell1=row.getCell(4);
            HSSFCell cell2=row.getCell(4);
            if(cell1.getCellType()!=HSSFCell.CELL_TYPE_NUMERIC){
                System.err.println(" Wrong number type! ");
                System.exit(-2);
            }
            if(cell2.getCellType()!=HSSFCell.CELL_TYPE_NUMERIC){
                System.err.println(" Wrong number type! ");
                System.exit(-2);
            }
            denominator+=Double.parseDouble(cell2.toString().trim());
            molecule+=Double.parseDouble(cell2.toString().trim())*Float.parseFloat(cell1.toString().trim());
        }
        return molecule/denominator;
    }

    public static void main(String[] args){
        File file = new File("/home/orisun/3.xls");
        try{
            FileInputStream fin=new FileInputStream(file);
            String cont=readDoc1(fin);
            System.out.println(cont);
            fin.close();
            fin=new FileInputStream(file);
            System.out.println(" Weighted average score "+getAvg(fin));
            fin.close();
        }catch(IOException e){
            e.printStackTrace();
        }
    }
}


Related articles: