November 3, 2018
Extracting text/HTML out of Word (.docx) files
Comments
(7)
November 3, 2018
Extracting text/HTML out of Word (.docx) files
Been a ColdFusion Developer since 1996
Newbie 24 posts
Followers: 15 people
(7)

Repositories

https://github.com/jmohler1970/WordExtractor

https://github.com/jmohler1970/WordExtractor_demo

Introduction

We are going to be extracting out HTML from a Word (.docx) file.

.docx is an example of an Office Open XML (OOXML) file. It is a ZIP archive of XML documents.
By unzipping the file and locating the appropriate XML file, we can process the data and generate HTML.
7 Comments
Jun 19, 2020
Jun 19, 2020

Hi,

Great post, thanks. Is there also a way to get an image out of the Word document (stored in a “w:drawing” element) and into the HTML? I’ve read many posts (Java and PHP, mostly pointing to paid plugins), but I have no clue how to do it in CF.

There must be 2 ways, inline and floating.
I’ve found this post useful: https://www.toptal.com/xml/an-informal-introduction-to-docx

But there is no explanation how to put that into code to show it then in HTML.

Thx for any Answer

Corrado

Like
()
Oct 25, 2019
Oct 25, 2019

Nice James.
Today I used your code to do some extractions. It is difficult to do a nice commit to your GitHub repository.

See below for some enhancements and fixes.

component output="false" {

	/*
	 * Converts the body of a Word (.docx) file into simple HTML.
	 *
	 * extractDocx() reads word/document.xml and word/styles.xml out of the
	 * .docx ZIP archive and walks the document body recursively:
	 *   - w:p becomes <p>, <hN> (HeadingN styles) or <li> (ListParagraph style)
	 *   - w:r is wrapped in <b>/<i>/<u> according to its run properties
	 *   - w:t and any unhandled element contribute their text, HTML-escaped
	 *
	 * All string delimiters are plain ASCII quotes; typographic quotes do not
	 * parse as CFScript.
	 */

	this.xmlPara = "";   // parsed w:body node of word/document.xml
	this.xmlString = ""; // raw text of word/document.xml
	this.xmlStyles = ""; // raw text of word/styles.xml
	this.proofErr = "spellEnd";
	this.headingMax = 6; // highest "HeadingN" style that is mapped to an <hN> tag

	variables.stylesDoc = "";  // parsed word/styles.xml; set by extractDocx()
	variables.listcounter = 1; // used to set ordered list numbers
	variables.listcounterOutlinelevel = [0,0,0,0,0,0,0,0,0,0]; // one running counter per numbering level
	variables.listCounterLen = ArrayLen(variables.listcounterOutlinelevel);
	variables.CRLF = Chr(13) & Chr(10);

	/*
	 * Recursively renders the children of an XML node as HTML.
	 *
	 * Node: an XML element (the document body, a paragraph or a run).
	 * Returns the concatenated HTML of all children, or "" for an empty node.
	 */
	private string function ReadNode (required xml Node) {
		var result = "";

		if (StructIsEmpty(arguments.Node)) {
			return "";
		}

		for (var Element in arguments.Node.XmlChildren) {
			switch (Element.XmlName) {
				case "w:p" :
					result &= readParagraph(Element);
					break;
				case "w:r" : // runs carry bold / italic / underline
					result &= readRun(Element);
					break;
				case "w:t" :
					// The XML parser has already decoded entities, so the text
					// must be re-escaped before it is placed in HTML.
					result &= EncodeForHTML(Element.XmlText);
					break;
				case "w:ProofErr" :
					/* Word divides text into separate proofing areas; skipped */
					break;
				case "w:pStyle" :
					/* skipped; the style is read while handling the paragraph */
					break;
				case "w:instrText" :
					/* field instructions are not document text; skipped */
					break;
				default :
					result &= EncodeForHTML(Element.XmlText);
			}
		}

		return result;
	}

	/*
	 * Renders one w:p element.
	 *
	 * The paragraph style (w:pStyle) selects the output:
	 *   - "ListParagraph": <li>, preceded by <ol start="..."> when w:numId is 2
	 *   - any other named style: <p> or <hN> with the style id as CSS class,
	 *     prefixed with a running number when the style defines numbering at a
	 *     level greater than 0
	 *   - no style: a plain <p>
	 */
	private string function readParagraph(required xml paragraph) {
		var styleId = "";   // w:pStyle value; "" means the default paragraph style
		var numId = "";     // w:numId given directly on the paragraph
		var ilvl = 0;
		var styleInfo = {};
		var tagName = "p";
		var headingNumber = 0;
		var headingStyle = "";
		var cssClass = "";
		var level = 0;

		// Paragraph properties are only honoured when w:pPr is the first child.
		if (ArrayLen(arguments.paragraph.XmlChildren) != 0 && arguments.paragraph.XmlChildren[1].XmlName == "w:pPr") {
			for (var pProperty in arguments.paragraph.XmlChildren[1].XmlChildren) {
				if (pProperty.XmlName == "w:pStyle") {
					styleId = pProperty.XmlAttributes["w:val"];
				}
				if (pProperty.XmlName == "w:numPr") {
					for (var numProperty in pProperty.XmlChildren) {
						if (numProperty.XmlName == "w:numId") {
							numId = numProperty.XmlAttributes["w:val"];
						}
						if (numProperty.XmlName == "w:ilvl") {
							ilvl = numProperty.XmlAttributes["w:val"];
							// A list item at this level restarts every deeper counter.
							resetCounterValues(ilvl);
						}
					}
				}
			}
		}

		if (styleId == "ListParagraph") {
			var listItem = "";
			if (numId == 2) {
				listItem = '<ol start="#variables.listcounter#"><li>#ReadNode(arguments.paragraph)#</li></ol>#variables.CRLF#';
			}
			else {
				listItem = '<li>#ReadNode(arguments.paragraph)#</li>#variables.CRLF#';
			}
			variables.listcounter++;
			return listItem;
		}

		// Any non-list paragraph restarts the simple list counter.
		variables.listcounter = 1;

		if (styleId == "") {
			return '<p>#ReadNode(arguments.paragraph)#</p>#variables.CRLF#';
		}

		// Named style: numbering and heading level come from styles.xml.
		styleInfo = lookupStyle(styleId);

		// No early exit: when both the style and its base are headings,
		// the highest matching heading number wins.
		for (headingNumber = 1; headingNumber <= this.headingMax; headingNumber++) {
			headingStyle = "Heading" & headingNumber;
			if (styleInfo.basedOn == headingStyle || styleId == headingStyle) {
				tagName = "h" & headingNumber;
				resetCounterValues(headingNumber);
			}
		}

		cssClass = EncodeForHTMLAttribute(styleId); // style id is used as the HTML class name
		level = styleInfo.ilvl;

		// level + 1 must remain a valid index into the counter array.
		if (styleInfo.numId > 0 && level > 0 && level < variables.listCounterLen) {
			variables.listcounterOutlinelevel[level + 1] = 0;
			variables.listcounterOutlinelevel[level] = variables.listcounterOutlinelevel[level] + 1;
			return '<#tagName# class="#cssClass#">#variables.listcounterOutlinelevel[level]#. #ReadNode(arguments.paragraph)#</#tagName#>#variables.CRLF#';
		}

		return '<#tagName# class="#cssClass#">#ReadNode(arguments.paragraph)#</#tagName#>#variables.CRLF#';
	}

	/*
	 * Renders one w:r element and wraps it in <b>, <i> and <u> for each
	 * matching property found under the run's children, in document order.
	 */
	private string function readRun(required xml run) {
		var html = ReadNode(arguments.run);

		for (var child in arguments.run.XmlChildren) {
			for (var prop in child.XmlChildren) {
				switch (prop.XmlName) {
					case "w:b" :
						if (isToggleOn(prop)) {
							html = "<b>#html#</b>";
						}
						break;
					case "w:i" :
						if (isToggleOn(prop)) {
							html = "<i>#html#</i>";
						}
						break;
					case "w:u" :
						if (isToggleOn(prop)) {
							html = "<u>#html#</u>";
						}
						break;
					default :
						/* other run properties are ignored */
						break;
				}
			}
		}

		return html;
	}

	/*
	 * Tells whether a run property such as w:b is switched on.
	 * A property without w:val is on; w:val of 0, false, off or none
	 * switches it off.
	 */
	private boolean function isToggleOn(required xml propertyNode) {
		var attrs = arguments.propertyNode.XmlAttributes;

		if (!StructKeyExists(attrs, "w:val")) {
			return true;
		}
		return ListFindNoCase("0,false,off,none", attrs["w:val"]) == 0;
	}

	/*
	 * Looks a paragraph style up in styles.xml.
	 *
	 * Returns a struct with ilvl and numId (integers, 0 when the style does
	 * not define them) and basedOn (style id, "" when absent). The same
	 * defaults are returned when the style cannot be found.
	 * See https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.numberingproperties
	 */
	private struct function lookupStyle(required string styleId) {
		var info = { ilvl = 0, numId = 0, basedOn = "" };
		var matches = [];

		// An apostrophe cannot be embedded in the quoted XPath literal below.
		if (!IsXmlDoc(variables.stylesDoc) || Find("'", arguments.styleId)) {
			return info;
		}

		matches = XmlSearch(variables.stylesDoc, "/w:styles/w:style[@w:styleId='#arguments.styleId#']");
		if (ArrayLen(matches) == 0) {
			return info;
		}

		// string() yields "" for a missing attribute and Val("") is 0, which
		// avoids casting the NaN that number() would produce.
		info.ilvl = Int(Val(XmlSearch(matches[1], "string(w:pPr/w:numPr/w:ilvl/@w:val)")));
		info.numId = Int(Val(XmlSearch(matches[1], "string(w:pPr/w:numPr/w:numId/@w:val)")));
		info.basedOn = XmlSearch(matches[1], "string(w:basedOn/@w:val)");
		return info;
	}

	/*
	 * Zeroes every numbering counter deeper than the given depth
	 * (array positions depth+1 through the end of the counter array).
	 */
	private function resetCounterValues(required numeric depth) {
		for (var i = arguments.depth + 1; i <= variables.listCounterLen; i++) {
			variables.listcounterOutlinelevel[i] = 0;
		}
	}

	/*
	 * Extracts the body of a .docx file as HTML.
	 *
	 * pathToDocX: file system path of the .docx file.
	 * Returns the generated HTML. Errors raised by cfzip or XmlParse (missing
	 * file, missing archive entry, malformed XML) are not caught here.
	 */
	string function extractDocx(required string pathToDocX) {
		// NOTE(review): ZIP entry names normally use "/" as separator; confirm
		// that the backslash form below resolves on non-Windows servers.
		cfzip(action="read", file=arguments.pathToDocx, entrypath="word\document.xml", variable="this.xmlString", charset="utf-8");
		cfzip(action="read", file=arguments.pathToDocx, entrypath="word\styles.xml", variable="this.xmlStyles", charset="utf-8");

		// Parse styles.xml once instead of handing raw text to every style lookup.
		variables.stylesDoc = XmlParse(this.xmlStyles);

		// Start from clean counters so a reused instance does not continue
		// the numbering of the previously extracted document.
		variables.listcounter = 1;
		resetCounterValues(0);

		this.xmlPara = XmlParse(this.xmlString).document.body;
		return ReadNode(this.xmlPara);
	}
}

Like
()
May 10, 2019
May 10, 2019

Awesome, James. This content is very useful.

Like
()
Add Comment