Java引用POI实现Word转Html方法

原文： http://blog.csdn.net/fyqcdbdx/article/details/7630122

1. 下载POI工具并引用

2. 读取整个doc文档，获得该文档的所有字符串。

3. 从该字符串中得到标题，把该标题构成一个HTML格式的字符串，如<html><head><title>测试文档</title></head><body>。

4. 从该文档中判断是否有表格，如有，把每个表格的开始偏移量，结束偏移量记录下来，同时根据每个表格的行，列读取表格的内容，并构造出表格的HTML字符串。

5. 从该字符串的第一个字符开始逐个字符循环，得到字符的字体，字号大小，直到下一个字符的字体，字号不一样时，把这些字符内容构造成一个HTML格式的字符串。

6. 如果碰到的字符是回车符或制表符，则把它们转换成对应的HTML格式字符串（例如把回车符转换为<br/>）。

7. 如果碰到字符为图片，读取图片，把图片放在指定路径，再把这一路径的信息构造成HTML字符串，如<img src='c://test//1.jpg'/>。

8. 如果读取字符串的位置等于某个表格的开始偏移量，则插入前面已构造出的该表格的HTML字符串，同时跳到该表格的结束偏移量，继续往下循环读取字符。

9. 由于以上读取是按字符串逐个读取，并且根据字符的变化同时构造出HTML字符串，所以当字符串读取完毕后，即构造出一个完整的HTML字符串。

10. 举例

Word文件

HTML文件

11. 源代码

WordExcelToHtml.java

[java] view plain copy print ?

package com;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

publicclassWordExcelToHtml{
/**
*回车符ASCII码
*/
privatestaticfinalshortENTER_ASCII=13;
/**
*空格符ASCII码
*/
privatestaticfinalshortSPACE_ASCII=32;
/**
*水平制表符ASCII码
*/
privatestaticfinalshortTABULATION_ASCII=9;
publicstaticStringhtmlText="";
publicstaticStringhtmlTextTbl="";
publicstaticintcounter=0;
publicstaticintbeginPosi=0;
publicstaticintendPosi=0;
publicstaticintbeginArray[];
publicstaticintendArray[];
publicstaticStringhtmlTextArray[];
publicstaticbooleantblExist=false;
publicstaticfinalStringinputFile="c://bb.doc";
publicstaticvoidmain(Stringargv[])
{
try{
getWordAndStyle(inputFile);
}catch(Exceptione){
//TODOAuto-generatedcatchblock
e.printStackTrace();
}
}
/**
*读取每个文字样式
*
*@paramfileName
*@throwsException
*/
publicstaticvoidgetWordAndStyle(StringfileName)throwsException{
FileInputStreamin=newFileInputStream(newFile(fileName));
HWPFDocumentdoc=newHWPFDocument(in);
Rangerangetbl=doc.getRange();//得到文档的读取范围
TableIteratorit=newTableIterator(rangetbl);
intnum=100;
beginArray=newint[num];
endArray=newint[num];
htmlTextArray=newString[num];
//取得文档中字符的总数
intlength=doc.characterLength();
//创建图片容器
PicturesTablepTable=doc.getPicturesTable();
htmlText="<html><head><title>"+doc.getSummaryInformation().getTitle()+"</title></head><body>";
//创建临时字符串,好加以判断一串字符是否存在相同格式
if(it.hasNext())
{
readTable(it,rangetbl);
}
intcur=0;
StringtempString="";
for(inti=0;i<length-1;i++){
//整篇文章的字符通过一个个字符的来判断,range为得到文档的范围
Rangerange=newRange(i,i+1,doc);
CharacterRuncr=range.getCharacterRun(0);
//beginArray=newint[num];
//endArray=newint[num];
//htmlTextArray=newString[num];
if(tblExist)
{
if(i==beginArray[cur])
{
htmlText+=tempString+htmlTextArray[cur];
tempString="";
i=endArray[cur]-1;
cur++;
continue;
}
}
if(pTable.hasPicture(cr)){
htmlText+=tempString;
//读写图片
readPicture(pTable,cr);
tempString="";
}
else{
Rangerange2=newRange(i+1,i+2,doc);
//第二个字符
CharacterRuncr2=range2.getCharacterRun(0);
charc=cr.text().charAt(0);
System.out.println(i+"::"+range.getEndOffset()+"::"+range.getStartOffset()+"::"+c);
//判断是否为回车符
if(c==ENTER_ASCII)
{
tempString+="<br/>";
}
//判断是否为空格符
elseif(c==SPACE_ASCII)
tempString+="";
//判断是否为水平制表符
elseif(c==TABULATION_ASCII)
tempString+="";
//比较前后2个字符是否具有相同的格式
booleanflag=compareCharStyle(cr,cr2);
if(flag)
tempString+=cr.text();
else{
StringfontStyle="<spanstyle="font-family:"+cr.getFontName()+";font-size:"+cr.getFontSize()/2+"pt;";
if(cr.isBold())
fontStyle+="font-weight:bold;";
if(cr.isItalic())
fontStyle+="font-style:italic;";
htmlText+=fontStyle+""mce_style="font-family:"+cr.getFontName()+";font-size:"+cr.getFontSize()/2+"pt;";
if(cr.isBold())
fontStyle+="font-weight:bold;";
if(cr.isItalic())
fontStyle+="font-style:italic;";
htmlText+=fontStyle+"">"+tempString+cr.text()+"</span>";
tempString="";
}
}
}
htmlText+=tempString+"</body></html>";
writeFile(htmlText);
}
/**
*读写文档中的表格
*
*@parampTable
*@paramcr
*@throwsException
*/
publicstaticvoidreadTable(TableIteratorit,Rangerangetbl)throwsException{
htmlTextTbl="";
//迭代文档中的表格
counter=-1;
while(it.hasNext())
{
tblExist=true;
htmlTextTbl="";
Tabletb=(Table)it.next();
beginPosi=tb.getStartOffset();
endPosi=tb.getEndOffset();
System.out.println("............"+beginPosi+"...."+endPosi);
counter=counter+1;
//迭代行，默认从0开始
beginArray[counter]=beginPosi;
endArray[counter]=endPosi;
htmlTextTbl+="<tableborder>";
for(inti=0;i<tb.numRows();i++){
TableRowtr=tb.getRow(i);
htmlTextTbl+="<tr>";
//迭代列，默认从0开始
for(intj=0;j<tr.numCells();j++){
TableCelltd=tr.getCell(j);//取得单元格
intcellWidth=td.getWidth();
//取得单元格的内容
for(intk=0;k<td.numParagraphs();k++){
Paragraphpara=td.getParagraph(k);
Strings=para.text().toString().trim();
if(s=="")
{
s="";
}
System.out.println(s);
htmlTextTbl+="<tdwidth="+cellWidth+">"+s+"</td>";
System.out.println(i+":"+j+":"+cellWidth+":"+s);
}//endfor
}//endfor
}//endfor
htmlTextTbl+="</table>";
htmlTextArray[counter]=htmlTextTbl;
}//endwhile
}
/**
*读写文档中的图片
*
*@parampTable
*@paramcr
*@throwsException
*/
publicstaticvoidreadPicture(PicturesTablepTable,CharacterRuncr)throwsException{
//提取图片
Picturepic=pTable.extractPicture(cr,false);
//返回POI建议的图片文件名
StringafileName=pic.suggestFullFileName();
OutputStreamout=newFileOutputStream(newFile("c://test"+File.separator+afileName));
pic.writeImageContent(out);
htmlText+="<imgsrc="c://test//"+afileName+""mce_src="c://test//"+afileName+""/>";
}
publicstaticbooleancompareCharStyle(CharacterRuncr1,CharacterRuncr2)
{
booleanflag=false;
if(cr1.isBold()==cr2.isBold()&&cr1.isItalic()==cr2.isItalic()&&cr1.getFontName().equals(cr2.getFontName())&&cr1.getFontSize()==cr2.getFontSize())
{
flag=true;
}
returnflag;
}
/**
*写文件
*
*@params
*/
publicstaticvoidwriteFile(Strings){
FileOutputStreamfos=null;
BufferedWriterbw=null;
try{
Filefile=newFile("c://abc.html");
fos=newFileOutputStream(file);
bw=newBufferedWriter(newOutputStreamWriter(fos));
bw.write(s);
}catch(FileNotFoundExceptionfnfe){
fnfe.printStackTrace();
}catch(IOExceptionioe){
ioe.printStackTrace();
}finally{
try{
if(bw!=null)
bw.close();
if(fos!=null)
fos.close();
}catch(IOExceptionie){
}
}
}