Extracting text/HTML out of Word (.docx) files

Repositories

https://github.com/jmohler1970/WordExtractor

https://github.com/jmohler1970/WordExtractor_demo

Introduction

We are going to extract HTML from a Word (.docx) file.

.docx is an example of an Office Open XML (OOXML) file. It is a ZIP archive containing several XML documents; the main text lives in word/document.xml.

By unzipping the file and locating the appropriate XML file, we can process the data and generate HTML.

Resources

blog

cfscript

cfzip

learning

programming

(1)

Comments

(7)

James Mohler Follow

Been a ColdFusion Developer since 1996

You must be logged in to post a comment.

All Comments

delsalsa

2020-06-19 19:00:25

Hi,

great post, thx. Is there a way, also to put the Image out of the Word-Document, which is in “w drawing”, to HTML? I’ve read many posts (Java and PHP > mostly found payable plugins), but no clue how to do it in CF.

There must be 2 ways, inline and floating.
I’ve found this post useful: https://www.toptal.com/xml/an-informal-introduction-to-docx

But there is no explanation how to put that into code to show it then in HTML.

Thx for any Answer

Corrado

dingdongiuytiyt_t

2019-10-25 15:39:23

Nice James.
Today I used your code to do some extractions. It is difficult to do a nice commit to your GitHub.

See below some enhancements and issue fixes.

/**
 * Converts the main document part of a Word (.docx) file into simple HTML.
 *
 * A .docx file is a ZIP archive. extractDocx() reads word/document.xml and
 * word/styles.xml from it and walks the <w:body> tree:
 *   - <w:p> paragraphs become <p>, <h1>..<h6>, <li> or plain text,
 *   - <w:r> runs are wrapped in <b>/<i>/<u> according to their <w:rPr>,
 *   - <w:t> text is HTML-encoded and appended.
 *
 * NOTE(review): this code was recovered from a rendered web page. The
 * <b>/<i>/<u> wrappers in renderRun() were stripped by that page and have been
 * restored from the surrounding case labels - confirm against the intended output.
 */
component output="false" {

	this.xmlPara    = "";          // parsed <w:body> node of word/document.xml
	this.xmlString  = "";          // raw text of word/document.xml
	this.xmlStyles  = "";          // raw text of word/styles.xml
	this.proofErr   = "spellEnd";
	this.headingMax = 6;           // highest "HeadingN" style mapped to an <hN> tag

	variables.listcounter             = 1;                      // next number for ListParagraph items
	variables.listcounterOutlinelevel = [0,0,0,0,0,0,0,0,0,0];  // running number per list level
	variables.listCounterLen          = arrayLen(variables.listcounterOutlinelevel);
	variables.CRLF                    = chr(13) & chr(10);
	variables.stylesDoc               = "";                     // styles.xml parsed once per extractDocx() call

	/**
	 * Renders every child of the given node and concatenates the results.
	 *
	 * @Node XML element whose children are rendered.
	 * @return HTML fragment; empty string when Node is empty.
	 */
	private string function ReadNode(required xml Node) {
		var result = "";

		if (structIsEmpty(arguments.Node)) {
			return "";
		}

		for (var Element in arguments.Node.xmlChildren) {
			switch (Element.xmlName) {
				case "w:p":
					result &= renderParagraph(Element);
					break;
				case "w:r":
					result &= renderRun(Element);
					break;
				case "w:t":
					// xmlParse() has already decoded entities, so re-encode for HTML
					result &= encodeForHTML(Element.xmlText);
					break;
				case "w:proofErr":
				case "w:pStyle":
				case "w:instrText":
					// deliberately skipped
					break;
				default:
					result &= encodeForHTML(Element.xmlText);
			}
		}

		return result;
	}

	/**
	 * Renders one <w:p> element.
	 *
	 * @Element the <w:p> node.
	 * @return HTML for the paragraph followed by CRLF.
	 */
	private string function renderParagraph(required xml Element) {
		var styleId      = "";   // w:pStyle value; "" means default paragraph style
		var numId        = "";   // w:numId given directly on the paragraph
		var styleNumId   = 0;    // w:numId inherited from the style definition
		var ilvl         = 0;    // list level inherited from the style definition
		var basedOn      = "";
		var tagName      = "p";
		var headingName  = "";
		var classAttr    = "";
		var styleMatches = [];
		var listStart    = 0;
		var inner        = "";
		var children     = arguments.Element.xmlChildren;

		// Paragraph properties, when present, are the first child.
		if (arrayLen(children) != 0 && children[1].xmlName == "w:pPr") {
			for (var prop in children[1].xmlChildren) {
				if (prop.xmlName == "w:pStyle") {
					styleId = prop.xmlAttributes["w:val"];
				}
				else if (prop.xmlName == "w:numPr") {
					for (var numProp in prop.xmlChildren) {
						if (numProp.xmlName == "w:numId") {
							numId = numProp.xmlAttributes["w:val"];
						}
						else if (numProp.xmlName == "w:ilvl") {
							// a directly numbered paragraph restarts all deeper levels
							resetCounterValues(val(numProp.xmlAttributes["w:val"]));
						}
					}
				}
			}
		}

		if (styleId == "ListParagraph") {
			// read the counter before recursing so the emitted start value is the current one
			listStart = variables.listcounter;
			inner = "<li>#ReadNode(arguments.Element)#</li>";
			// NOTE(review): numId 2 is hard-coded as "ordered list" - presumably specific to the source documents
			if (numId == 2) {
				inner = '<ol start="#listStart#">#inner#</ol>';
			}
			variables.listcounter++;
			return inner & variables.CRLF;
		}

		// Any non-list paragraph restarts ListParagraph numbering.
		variables.listcounter = 1;

		if (styleId == "") {
			return ReadNode(arguments.Element) & variables.CRLF;
		}

		// Styled paragraph: consult styles.xml for inherited numbering and heading level.
		styleMatches = findStyle(styleId);
		if (arrayLen(styleMatches) != 0) {
			// string() + val() yields 0 when the node is absent, instead of casting NaN
			ilvl       = val(xmlSearch(styleMatches[1], "string(w:pPr/w:numPr/w:ilvl/@w:val)"));
			styleNumId = val(xmlSearch(styleMatches[1], "string(w:pPr/w:numPr/w:numId/@w:val)"));
			basedOn    = xmlSearch(styleMatches[1], "string(w:basedOn/@w:val)");
		}

		for (var level = 1; level lte this.headingMax; level++) {
			headingName = "Heading" & level;
			if (basedOn eq headingName OR styleId eq headingName) {
				tagName = "h" & level;
				resetCounterValues(level);
			}
		}

		classAttr = encodeForHTMLAttribute(styleId); // style id is emitted as the CSS class

		if (styleNumId gt 0 AND ilvl gt 0 AND ilvl lte variables.listCounterLen) {
			if (ilvl lt variables.listCounterLen) {
				variables.listcounterOutlinelevel[ilvl + 1] = 0;
			}
			variables.listcounterOutlinelevel[ilvl] = variables.listcounterOutlinelevel[ilvl] + 1;
			listStart = variables.listcounterOutlinelevel[ilvl];
			return '<#tagName# class="#classAttr#">#listStart#. #ReadNode(arguments.Element)#</#tagName#>' & variables.CRLF;
		}

		return '<#tagName# class="#classAttr#">#ReadNode(arguments.Element)#</#tagName#>' & variables.CRLF;
	}

	/**
	 * Renders one <w:r> run, wrapping its content for bold, italic and underline.
	 *
	 * @Element the <w:r> node.
	 * @return HTML for the run.
	 */
	private string function renderRun(required xml Element) {
		var html = ReadNode(arguments.Element);

		for (var child in arguments.Element.xmlChildren) {
			// formatting toggles only live under the run-properties element
			if (child.xmlName != "w:rPr") {
				continue;
			}
			for (var prop in child.xmlChildren) {
				if (!isRunPropertyOn(prop)) {
					continue;
				}
				switch (prop.xmlName) {
					case "w:b":
						html = "<b>#html#</b>";
						break;
					case "w:i":
						html = "<i>#html#</i>";
						break;
					case "w:u":
						html = "<u>#html#</u>";
						break;
					default:
						// other run properties are ignored
						break;
				}
			}
		}

		return html;
	}

	/**
	 * Tells whether a run-property toggle is switched on.
	 * A toggle with no w:val is on; w:val of 0/false/off (or "none" for underline) is off.
	 */
	private boolean function isRunPropertyOn(required xml prop) {
		if (!structKeyExists(arguments.prop.xmlAttributes, "w:val")) {
			return true;
		}
		return listFindNoCase("0,false,off,none", arguments.prop.xmlAttributes["w:val"]) == 0;
	}

	/**
	 * Looks up a style definition in the parsed styles.xml.
	 *
	 * @styleId value of w:pStyle/@w:val.
	 * @return array of matching <w:style> nodes; empty when none match.
	 */
	private array function findStyle(required string styleId) {
		// XPath 1.0 cannot escape a quote inside a quoted literal, so such ids are treated as unknown
		if (!isXmlDoc(variables.stylesDoc) || find("'", arguments.styleId) != 0) {
			return [];
		}
		return xmlSearch(variables.stylesDoc, "/w:styles/w:style[@w:styleId='#arguments.styleId#']");
	}

	/**
	 * Zeroes the running numbers of every list level deeper than the given depth.
	 *
	 * @depth level whose sub-levels are reset; 0 resets all levels.
	 */
	private function resetCounterValues(required numeric depth) {
		for (var i = arguments.depth + 1; i lte variables.listCounterLen; i++) {
			variables.listcounterOutlinelevel[i] = 0;
		}
	}

	/**
	 * Extracts the body of a .docx file as HTML.
	 *
	 * @pathToDocX absolute path to the .docx file.
	 * @return HTML fragment for the document body.
	 */
	string function extractDocx(required string pathToDocX) {
		// ZIP entry names use forward slashes regardless of the host OS
		cfzip(action="read", file=arguments.pathToDocX, entrypath="word/document.xml", variable="this.xmlString", charset="utf-8");
		cfzip(action="read", file=arguments.pathToDocX, entrypath="word/styles.xml", variable="this.xmlStyles", charset="utf-8");

		// parse styles once instead of on every paragraph lookup
		variables.stylesDoc = xmlParse(this.xmlStyles);

		// start numbering afresh so repeated calls on one instance do not continue old lists
		variables.listcounter = 1;
		resetCounterValues(0);

		this.xmlPara = xmlParse(this.xmlString).document.body;
		return ReadNode(this.xmlPara);
	}
}

ksaravn

2019-05-10 08:41:47

Awsome James. This content is very useful.

David Byers

2018-11-07 21:13:37

Excellent content James. This was really well done.

(1)

James Mohler

2018-11-07 21:58:03

Glad you liked it!

chrisg57685480

2018-11-05 18:38:01

Well done, James! I really enjoyed this walk through. Been a loooooOOOOooooong time since I’ve seen any great CF tutorials. And I think I can use this. I’ve been wanting to convert my word documents into markdown files. Thanks for the head start!

BTW: I had no idea docx files were really just zip files. (mind-blown)

(1)

James Mohler

2018-11-06 04:52:25

Glad you liked it!