November 3, 2018
Extracting text/html out Word (.docx) files
Comments
(7)
November 3, 2018
Extracting text/html out Word (.docx) files
Been a ColdFusion Developer since 1996
Newbie 24 posts
Followers: 15 people
(7)

Repositories

https://github.com/jmohler1970/WordExtractor

https://github.com/jmohler1970/WordExtractor_demo

Introduction

We are going to extract HTML from a Word (.docx) file.

.docx is an Office Open XML (OOXML) file. It is a ZIP archive containing several XML documents.
By unzipping the file and locating the appropriate XML file (word/document.xml), we can process the data and generate HTML.
7 Comments
2020-06-19 19:00:25
2020-06-19 19:00:25

Hi,

great post, thx. Is there a way, also to put the Image out of the Word-Document, which is in “w drawing”, to HTML? I’ve read many posts (Java and PHP > mostly found payable plugins), but no clue how to do it in CF.

There must be 2 ways, inline and floating.
I’ve found this post useful: https://www.toptal.com/xml/an-informal-introduction-to-docx

But there is no explanation how to put that into code to show it then in HTML.

Thx for any Answer

Corrado

Like
2019-10-25 15:39:23
2019-10-25 15:39:23

Nice James.
Today I used your code to do some extractions. It is difficult to make a clean commit to your GitHub repository.

See below for some enhancements and fixes.

/**
 * Converts the body of a Word (.docx) file into simple HTML.
 *
 * extractDocx() reads word/document.xml and word/styles.xml out of the .docx
 * ZIP archive and walks the document body recursively with ReadNode().
 * Paragraphs become <p>, <li> or <h1>..<h6> elements; runs flagged bold,
 * italic or underline are wrapped in <b>, <i> or <u>.
 *
 * NOTE(review): text taken from the document (Element.xmlText) and style ids
 * (used as class names) are written into the HTML without escaping. If the
 * .docx can come from an untrusted source, escape or sanitise the result
 * before sending it to a browser.
 */
component output="false" {

	this.xmlPara = ""; // document body, parsed into XML nodes
	this.xmlString = ""; // raw text of word/document.xml
	this.xmlStyles = ""; // word/styles.xml, parsed once by extractDocx()
	this.proofErr = "spellEnd";

	VARIABLES.listcounter = 1; // used to set order list numbers
	VARIABLES.listcounterOutlinelevel = [0,0,0,0,0,0,0,0,0,0]; // used to set order list numbers with outlinelevel
	VARIABLES.listCounterLen = ArrayLen(VARIABLES.listcounterOutlinelevel);
	variables.CRLF = Chr(13) & Chr(10);

	this.headingMax = 6; // highest "HeadingN" style that is mapped to an <hN> tag

	/**
	 * Recursively renders the children of an XML node as HTML.
	 *
	 * @Node XML node whose children (w:p, w:r, w:t, ...) are rendered.
	 * @return The HTML for all children, concatenated in document order.
	 */
	private string function ReadNode (required xml Node) {
		var result = "";
		var wpPr = "";
		var wrPr = ""; // Does bold, italic
		var wnumId = ""; // This actually the type of list
		var wVal = ""; // paragraph style id
		var basedOn = "";
		var wrPrNodeElement = "";
		var outlinelevel = 0;
		var ilvl = 0;
		var pHTMLtag = "p";
		var pstyle = [];

		if (StructIsEmpty(arguments.Node)) {
			return "";
		}

		for (var Element in arguments.node.xmlChildren) {

			/* Start Tags*/
			switch (Element.xmlName) {

				case "w:p" :
					wVal = ""; // default paragraph style
					wnumId = "";

					if (ArrayLen(Element.XMLChildren) != 0) {
						/* pPr ParagraphProperties*/
						if (Element.XMLChildren[1].xmlName == "w:pPr") {
							wpPr = Element.XMLChildren[1];

							// for-in with var keeps the loop variables local to this call
							for (var PProperty in wpPr.XMLChildren) {
								if (PProperty.xmlName == "w:pStyle") {
									wVal = PProperty.XMLAttributes["w:val"];
								}
								if (PProperty.xmlName == "w:outlineLvl") {
									outlinelevel = PProperty.XMLAttributes["w:val"];
								}
								if (PProperty.xmlName == "w:numPr") {
									for (var NumProperty in PProperty.XMLChildren) {
										if (NumProperty.xmlName == "w:numId") {
											wnumId = NumProperty.XMLAttributes["w:val"];
										}
										if (NumProperty.xmlName == "w:ilvl") {
											ilvl = NumProperty.XMLAttributes["w:val"];
											// a paragraph at this level restarts every deeper counter
											VARIABLES.listcounterOutlinelevel[ilvl+1] = 0;
											resetCounterValues(ilvl);
										}
									}
								}
							}
						}
					}

					switch (wVal) {

						case "ListParagraph" :
							if (wnumId == 2) {
								result &= '<ol start="#listcounter#"><li>#ReadNode(Element)#</li></ol>#variables.crlf#';
							}
							else {
								result &= '<li>#ReadNode(Element)#</li>#variables.crlf#';
							}
							variables.listcounter++;
							break;

						default :
							variables.listcounter = 1;

							/* normal paragraph*/
							if (wVal neq "") {
								/* find if the style has numbering and its outlinelevel*/
								/*https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.numberingproperties?view=openxml-2.8.1*/
								pstyle = XmlSearch(this.xmlStyles, "/w:styles/w:style[@w:styleId='#wVal#']");

								if (ArrayLen(pstyle) != 0) {
									outlinelevel = JavaCast("int", XmlSearch(pstyle[1], "number(w:pPr/w:outlineLvl/@w:val)"));
									ilvl = JavaCast("int", XmlSearch(pstyle[1], "number(w:pPr/w:numPr/w:ilvl/@w:val)"));
									wnumId = JavaCast("int", XmlSearch(pstyle[1], "number(w:pPr/w:numPr/w:numId/@w:val)"));
									basedOn = XmlSearch(pstyle[1], "string(w:basedOn/@w:val)");
								} else {
									// style id is not defined in styles.xml: render as an
									// un-numbered paragraph instead of failing on pstyle[1]
									outlinelevel = 0;
									ilvl = 0;
									wnumId = 0;
									basedOn = "";
								}

								pHTMLtag = "p";
								for (var headingNumber = 1; headingNumber lte this.headingMax; headingNumber++) {
									if (basedOn eq ("Heading" & headingNumber) OR wVal eq ("Heading" & headingNumber)) {
										pHTMLtag = "h" & headingNumber;
										resetCounterValues(headingNumber);
									}
								}

								if (wnumId gt 0 AND isnumeric(ilvl) AND ilvl gt 0) {
									VARIABLES.listcounterOutlinelevel[ilvl+1] = 0;
									VARIABLES.listcounterOutlinelevel[ilvl] = VARIABLES.listcounterOutlinelevel[ilvl] + 1;
									result &= '<#pHTMLtag# class="#wVal#">#VARIABLES.listcounterOutlinelevel[ilvl]#. #ReadNode(Element)#</#pHTMLtag#>#variables.crlf#'; //add style id as html class name
								} else {
									variables.listcounter = 1;
									result &= '<#pHTMLtag# class="#wVal#">#ReadNode(Element)#</#pHTMLtag#>#variables.crlf#'; //add style id as html class name
								}
							} else {
								variables.listcounter = 1;
								result &= '<p>#ReadNode(Element)#</p>#variables.crlf#';
							}
					}
					break; // end of w:p

				case "w:r" : // This handles bolds and italics
					wrPr = "";
					wrPrNodeElement = ReadNode(Element);

					/* multiple children*/
					for (var elChild in Element.XMLChildren) {
						if (isArray(elChild.XMLChildren) && !arrayIsEmpty(elChild.XMLChildren)) {
							// each formatting flag found wraps the run's HTML in one more tag
							for (var wrPritem in elChild.XMLChildren) {
								wrPr = wrPritem.XMLName;
								switch (wrPr) {
									case "w:b" :
										wrPrNodeElement = "<b>#wrPrNodeElement#</b>";
										break;
									case "w:i" :
										wrPrNodeElement = "<i>#wrPrNodeElement#</i>";
										break;
									case "w:u" :
										wrPrNodeElement = "<u>#wrPrNodeElement#</u>";
										break;
									default :
										/* Other */
										break;
								}
							}
						}
					}
					result &= wrPrNodeElement;
					break;

				case "w:t" :
					result &= Element.xmlText;
					break;

				case "w:ProofErr" :
					/* Word divides this into separate areas*/
					/*skip this.proofErr = Element.XMLAttributes["w:type"];*/
					break;

				case "w:pStyle" :
					/* skip variables.currentTag = Element.XMLAttributes["w:val"];*/
					break;

				case "w:instrText" :
					/* skip*/
					break;

				default :
					result &= Element.xmlText;
			}
			/* Inner text*/
			/* result &= readNode(Element);*/
		} /* End for loop on Element*/

		return result;
	} /* End function*/

	/**
	 * Zeroes every outline-level counter below the given depth.
	 *
	 * @depth Counters at positions depth+1 through the end of the array are set to 0.
	 */
	private function resetCounterValues(required numeric depth) {
		for (var i = ARGUMENTS.depth + 1; i lte VARIABLES.listCounterLen; i++) {
			VARIABLES.listcounterOutlinelevel[i] = 0;
		}
	}

	/**
	 * Reads a .docx file and returns its body rendered as HTML.
	 *
	 * @pathToDocX Path of the .docx file to read.
	 * @return HTML produced by ReadNode() for the document body.
	 */
	string function extractDocx(required string pathToDocX) {
		// start numbering afresh so a reused instance does not continue the
		// counters left behind by a previously extracted document
		VARIABLES.listcounter = 1;
		resetCounterValues(0);

		// NOTE(review): ZIP entry names normally use "/" as the separator; the
		// backslash form is kept as posted - confirm it resolves on your engine.
		cfzip(action="read", file=arguments.pathToDocx, entrypath="word\document.xml", variable="this.xmlString", charset="utf-8");
		cfzip(action="read", file=arguments.pathToDocx, entrypath="word\styles.xml", variable="this.xmlStyles", charset="utf-8");

		// parse the styles once here; ReadNode() runs XmlSearch() on them for every styled paragraph
		this.xmlStyles = XmlParse(this.xmlStyles);
		this.xmlPara = xmlparse(this.xmlString).document.body;

		return ReadNode(this.xmlPara);
	}
}

Like
2019-05-10 08:41:47
2019-05-10 08:41:47

Awesome, James. This content is very useful.

Like
2018-11-07 21:13:37
2018-11-07 21:13:37

Excellent content James.  This was really well done.

Like
(1)
(1)
>
David Byers
's comment
2018-11-07 21:58:03
2018-11-07 21:58:03
>
David Byers
's comment

Glad you liked it!

Like
2018-11-05 18:38:01
2018-11-05 18:38:01

Well done, James! I really enjoyed this walk through. Been a loooooOOOOooooong time since I’ve seen any great CF tutorials. And I think I can use this. I’ve been wanting to convert my word documents into markdown files. Thanks for the head start!

BTW: I had no idea docx files were really just zip files. (mind-blown)

Like
(1)
(1)
>
chrisg57685480
's comment
2018-11-06 04:52:25
2018-11-06 04:52:25
>
chrisg57685480
's comment

Glad you liked it!

Like
Add Comment