Extracting text/html out of Word (.docx) files

November 3, 2018

James Mohler Follow

(1)

Comments

(7)

November 3, 2018

Extracting text/html out of Word (.docx) files

James Mohler Follow

(1)

(7)

(1)

Repositories

https://github.com/jmohler1970/WordExtractor

https://github.com/jmohler1970/WordExtractor_demo

Introduction

We are going to be extracting out HTML from a Word (.docx) file.

.docx is an example of an Office Open XML (OOXML) file. It is a ZIP archive of XML documents.

By unzipping the file and locating the appropriate XML file, we can process the data and generate HTML.

Video Player

Media error: Format(s) not supported or source(s) not found

Download File: https://cfimages.adobe.com/files/2018/11/Word-Extractor.mp4?_=1

00:00

Use Up/Down Arrow keys to increase or decrease volume.

Resources

blog

cfscript

cfzip

learning

programming

(1)

James Mohler Follow

Been a ColdFusion Developer since 1996

7 Comments

delsalsa

Jun 19, 2020

delsalsa

Jun 19, 2020

Hi,

great post, thx. Is there a way, also to put the Image out of the Word-Document, which is in “w drawing”, to HTML? I’ve read many posts (Java and PHP > mostly found payable plugins), but no clue how to do it in CF.

There must be 2 ways, inline and floating.
I’ve found this post useful: https://www.toptal.com/xml/an-informal-introduction-to-docx

But there is no explanation how to put that into code to show it then in HTML.

Thx for any Answer

Corrado

()

dingdongiuytiyt_t

Oct 25, 2019

dingdongiuytiyt_t

Oct 25, 2019

Nice James.
Today I used your code to do some extractions. It is difficult to do a nice commit to your GitHub.

See below for some enhancements and issue fixes.

/*
 * Word (.docx) extractor component.
 * extractDocx reads word\document.xml and word\styles.xml out of the archive,
 * parses the document body, and ReadNode walks that XML recursively to build
 * an HTML string.
 *
 * NOTE(review): this listing was pasted through a blog comment form. The quote
 * characters are typographic rather than straight quotes, and some markup
 * inside string literals appears to have been stripped (see the w:r branch).
 * Restore both before trying to compile it.
 */
component output=”false” {
// Body node of the parsed document; assigned in extractDocx.
this.xmlPara = “”; // parsed into XML nodes
// Raw contents of word\document.xml; filled by cfzip in extractDocx.
this.xmlString = “”; // raw text
// NOTE(review): only referenced from a commented-out line in ReadNode.
this.proofErr = “spellEnd”;
// Running item number used for the start attribute of emitted <ol> elements.
VARIABLES.listcounter = 1; // used to set order list numbers
// One counter per list level (10 slots); used to prefix numbered style-based paragraphs.
VARIABLES.listcounterOutlinelevel = [0,0,0,0,0,0,0,0,0,0]; // used to set order list numbers with outlinelevel
// Cached length of the counter array; upper bound for resetCounterValues.
VARIABLES.listCounterLen = ArrayLen(VARIABLES.listcounterOutlinelevel);
// Line break appended after every emitted paragraph.
variables.CRLF = Chr(13) & Chr(10);
// Highest heading number probed: style ids Heading1..Heading6 map to h1..h6.
this.headingMax = 6;
/*
 * Recursively converts the children of Node into an HTML string.
 *
 * Node    - XML element whose xmlChildren are walked (w:p, w:r, w:t, ...).
 * Returns - the concatenated HTML/text, or an empty string when Node is empty.
 *
 * Side effects: updates variables.listcounter and
 * variables.listcounterOutlinelevel while walking paragraphs, and reads
 * this.xmlStyles (set by extractDocx) to resolve paragraph styles.
 */
private string function ReadNode (required xml Node) {
var result = “”;
var wpPr = “”;
var wrPr = “”; // Does bold, italic
var wnumPr = “”; // ordered or unordered in html
var wnumID = “”;
var wVal = “”;
var basedOn =””;
var startElementName = “”;
var wrPrNodeElement = “”;
var outlinelevel = 0;
var ilvl = 0;
var pHTMLtag =”p”;
// Nothing to walk: return early with an empty string.
if (StructIsEmpty(arguments.Node)) {
return “”;
}
for (var Element in arguments.node.xmlChildren) {
// NOTE(review): startwVal is not declared with var and is never read in this listing.
startwVal = “”;
/* Start Tags*/
switch (Element.xmlName) {
case “w:p” :
// Paragraph: reset the per-paragraph style id and numbering id first.
wVal = “”; // default paragraph style
wnumid = “”; // This actually the type of list
if (ArrayLen(Element.XMLChildren) != 0) {
/* pPr ParagraphProperties*/
// Properties are only read when w:pPr is the FIRST child of the paragraph.
if (Element.XMLChildren[1].xmlName == “w:pPr”) {
wpPr = Element.XMLChildren[1];
// NOTE(review): the cfloop index/item variables here and below are not var-declared.
cfloop(array=wpPr.XMLChildren,index=PPropertyIndex,item=PProperty) {
if (PProperty.xmlName == “w:pStyle”) {
wVal = PProperty.XMLAttributes[“w:val”];
}
if (PProperty.xmlName == “w:outlineLvl”) {
outlinelevel = PProperty.XMLAttributes[“w:val”];
}
if (PProperty.xmlName == “w:numPr”) {
cfloop(array=PProperty.XMLChildren,index=NumropertyIndex,item=NumProperty) {
// NOTE(review): OOXML spells this element w:numId; this match presumably
// relies on case-insensitive string comparison - confirm.
if (NumProperty.xmlName == “w:numID”) {
wnumid = NumProperty.XMLAttributes[“w:val”];
}
if (NumProperty.xmlName == “w:ilvl”) {
ilvl = NumProperty.XMLAttributes[“w:val”];
// Zero the counter one slot past ilvl; resetCounterValues(ilvl) below
// zeroes that same slot and every deeper one as well.
VARIABLES.listcounterOutlinelevel[ilvl+1]=0;
resetCounterValues(ilvl);
}
}
}
}
}
}
// Choose the output markup from the paragraph style id.
switch (wVal) {
case “ListParagraph” :
// NOTE(review): numbering id 2 is hard-coded as the ordered list; presumably
// this depends on the numbering definitions of the source document - confirm.
if (wnumid == 2) {
// Each ordered item is emitted as its own <ol> with a start attribute.
result &= ‘<ol start=”#listcounter#”><li>#ReadNode(Element)#</li></ol>#variables.crlf#’;
}
else {
// Other list items are emitted as a bare <li> with no enclosing list element.
result &= ‘<li>#ReadNode(Element)#</li>#variables.crlf#’;
}
variables.listcounter++;
break;
default :
// Any paragraph that is not a ListParagraph restarts the ordered-list numbering.
variables.listcounter = 1;
/* normal paragraph*/
if(wVal neq “”){
/* find if the style has numbering and its outlinelevel*/
/*https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing.numberingproperties?view=openxml-2.8.1*/
// Look up the style definition for this style id in styles.xml.
// NOTE(review): pstyle[1] is used below without checking that the search matched.
pstyle =XmlSearch(this.xmlStyles,”/w:styles/w:style[@w:styleId=’#wVal#’]”);
// NOTE(review): XPath number() presumably yields NaN when the node is absent;
// confirm how JavaCast to int treats that value.
outlinelevel = JavaCast(“int”,XmlSearch(pstyle[1],”number(w:pPr/w:outlineLvl/@w:val)”));
ilvl = JavaCast(“int”,XmlSearch(pstyle[1],”number(w:pPr/w:numPr/w:ilvl/@w:val)”));
wnumId = JavaCast(“int”,XmlSearch(pstyle[1],”number(w:pPr/w:numPr/w:numId/@w:val)”));
basedOn = XmlSearch(pstyle[1],”string(w:basedOn/@w:val)”);
pHTMLtag =”p”;
// A style named, or based on, Heading<n> is rendered as <hn>; a heading also
// resets the list counters from slot n+1 onward.
for(var headingNumber =1; headingNumber lte this.headingMax;headingNumber++){
if(basedOn eq “Heading”&headingNumber OR wVal eq “Heading”&headingNumber){
pHTMLtag =”h”&headingNumber;
resetCounterValues(headingNumber);
}
}
// Numbered style: bump the counter for this level and prefix the text with it.
// NOTE(review): level 0 is excluded by the ilvl gt 0 test, and the counter for a
// level is stored at index ilvl here while the w:numPr branch above zeroes
// index ilvl+1 - confirm the intended indexing.
if(wnumId gt 0 AND isnumeric(ilvl) AND ilvl gt 0 ) {
VARIABLES.listcounterOutlinelevel[ilvl+1]=0;
VARIABLES.listcounterOutlinelevel[ilvl]=VARIABLES.listcounterOutlinelevel[ilvl]+1;
result &= ‘<#pHTMLtag# class=”#wVal#”>#VARIABLES.listcounterOutlinelevel[ilvl]#. #ReadNode(Element)#</#pHTMLtag#>#variables.crlf#’; //add style id as html class name
} else {
variables.listcounter = 1;
result &= ‘<#pHTMLtag# class=”#wVal#”>#ReadNode(Element)#</#pHTMLtag#>#variables.crlf#’; //add style id as html class name
}
} else {
// No style id: emit the paragraph content without any wrapping tag.
variables.listcounter = 1;
result &= ‘#ReadNode(Element)##variables.crlf#’;
}
}
break; // end of w:p
case “w:r” : // This handles bolds and italics
wrPr = “”;
// Render the content of the run first, then inspect its properties.
wrPrNodeElement = ReadNode(Element);
/* multiple children*/
for(var elChild in Element.XMLChildren){
// Only children that themselves have children are inspected.
if ( isArray(elChild.XMLChildren) && !arrayIsEmpty(elChild.XMLChildren)) {
/* loop */
cfloop(array=elChild.XMLChildren,index=wrPrIndex,item=wrPritem) {
wrPr = elChild.XMLChildren[wrPrIndex].XMLName;
// NOTE(review): as shown, each case assigns the value to itself, so it is a
// no-op. The wrapping bold/italic/underline tags were presumably stripped
// when the listing was posted - restore them from the repository version.
switch (wrPr) {
case “w:b” :
wrPrNodeElement = “#wrPrNodeElement#”;
break;
case “w:i” :
wrPrNodeElement = “#wrPrNodeElement#”;
break;
case “w:u” :
wrPrNodeElement = “#wrPrNodeElement#”;
break;
default :
/* Other */
break;
}
}
}
}
result &= wrPrNodeElement;
break;
case “w:t” :
// Text node: append its text as-is.
result &= Element.xmlText;
break;
case “w:ProofErr” :
/* Word divides this into separate areas*/
/*skip this.proofErr = Element.XMLAttributes[“w:type”];*/
break;
case “w:pStyle” :
/* skip variables.currentTag = Element.XMLAttributes[“w:val”];*/
break;
case “w:instrText” :
/* skip*/
break;
default :
// Any other element contributes only its own xmlText; its children are not walked.
result &= Element.xmlText;
}
/* Inner text*/
/* result &= readNode(Element);*/
} /* End for loop on Element*/
return result;
} /* End function*/
/*
 * Zeroes every list-level counter from index depth+1 up to the end of
 * variables.listcounterOutlinelevel.
 *
 * depth - counters at indexes up to and including this one are left untouched.
 */
private function resetCounterValues(required numeric depth) {
for (var i = ARGUMENTS.depth+1; i lte VARIABLES.listCounterLen;i++){
VARIABLES.listcounterOutlinelevel[i] = 0;
}
}
/*
 * Entry point (declared without an access modifier): reads the document and
 * style XML out of the .docx archive and returns the body converted by ReadNode.
 *
 * pathToDocX - path to the .docx file, passed to cfzip as the file attribute.
 * Returns    - the HTML string produced by ReadNode for the document body.
 *
 * Side effects: sets this.xmlString, this.xmlStyles and this.xmlPara.
 * NOTE(review): the entry paths use a backslash; zip entries are normally
 * addressed with forward slashes - confirm this resolves on the target engine.
 */
string function extractDocx(required string pathToDocX) {
cfzip(action=”read”, file=arguments.pathToDocx, entrypath=”word\document.xml”, variable=”this.xmlString”,charset = “utf-8″);
cfzip(action=”read”, file=arguments.pathToDocx, entrypath=”word\styles.xml”, variable=”this.xmlStyles”,charset = “utf-8”);
// Only the body element of the parsed document is kept and walked.
this.xmlPara = xmlparse(this.xmlString).document.body;
return ReadNode(this.xmlPara);
}
}

()

ksaravn

May 10, 2019

ksaravn

May 10, 2019

Awesome, James. This content is very useful.

()

David Byers

Nov 7, 2018

David Byers

Nov 7, 2018

Excellent content James. This was really well done.

(1)

chrisg57685480

Nov 5, 2018

chrisg57685480

Nov 5, 2018

Well done, James! I really enjoyed this walk through. Been a loooooOOOOooooong time since I’ve seen any great CF tutorials. And I think I can use this. I’ve been wanting to convert my word documents into markdown files. Thanks for the head start!

BTW: I had no idea docx files were really just zip files. (mind-blown)

(1)

Add Comment

You must be logged in to post a comment.