2 #include "RealTextParser.h"
// Default constructor: the font-related "ignore" switches listed here start
// out false, the fallback subtitle duration is 4000 (milliseconds, per the
// member name) and error-tolerant parsing is switched on.
// NOTE(review): RenderTags() reads m_iMinFontSize and m_iMaxFontSize, which do
// not appear in the initializer list as shown here - confirm they are set.
4 CRealTextParser::CRealTextParser(void):
6 m_bIgnoreFontSize(false),
7 m_bIgnoreFontColor(false),
8 m_bIgnoreFontWeight(false),
9 m_bIgnoreFontFace(false),
12 m_iDefaultSubtitleDurationInMillisecs(4000),
13 m_bTryToIgnoreErrors(true)
// Destructor of the RealText parser.
17 CRealTextParser::~CRealTextParser(void)
21 bool CRealTextParser::ParseRealText(wstring p_szFile
)
23 vector
<int> vStartTimecodes
;
24 vector
<int> vEndTimecodes
;
25 bool bPrevEndTimeMissing(false);
27 list
<Tag
> listPreviousOpenTags
;
29 while (p_szFile
.length() > 0)
31 if (p_szFile
.at(0) == '<')
34 if (!ExtractTag(p_szFile
, oTag
))
40 if (oTag
.m_szName
== L
"time")
42 int iStartTimecode
= GetTimecode(oTag
.m_mapAttributes
[L
"begin"]);
43 int iEndTimecode
= GetTimecode(oTag
.m_mapAttributes
[L
"end"]);
45 // FilterReduntantTags(listTags);
46 wstring szLine
= RenderTags(listTags
);
48 if (bPrevEndTimeMissing
)
50 pair
<int, int> pairTimecodes(vStartTimecodes
.back(), iStartTimecode
);
52 // Fix issues where the next time code isn't valid end time code for the previous subtitle
53 if (pairTimecodes
.first
>= pairTimecodes
.second
)
55 pairTimecodes
.second
= pairTimecodes
.first
+ m_iDefaultSubtitleDurationInMillisecs
;
58 if (szLine
.length() > 0)
59 m_RealText
.m_mapLines
[pairTimecodes
] = szLine
;
61 bPrevEndTimeMissing
= false;
63 else if (vStartTimecodes
.size() > 0 && vEndTimecodes
.size() > 0)
65 pair
<int, int> pairTimecodes(vStartTimecodes
.back(), vEndTimecodes
.back());
67 if (szLine
.length() > 0)
68 m_RealText
.m_mapLines
[pairTimecodes
] = szLine
;
72 vStartTimecodes
.push_back(iStartTimecode
);
73 if (iEndTimecode
<= 0)
75 bPrevEndTimeMissing
= true;
79 vEndTimecodes
.push_back(iEndTimecode
);
82 else if (oTag
.m_szName
== L
"b" || oTag
.m_szName
== L
"i" || oTag
.m_szName
== L
"font")
85 listPreviousOpenTags
.push_back(oTag
);
88 PopTag(listPreviousOpenTags
, oTag
.m_szName
);
90 listTags
.push_back(oTag
);
92 else if (oTag
.m_szName
== L
"clear")
97 listTags
.insert(listTags
.end(), listPreviousOpenTags
.begin(), listPreviousOpenTags
.end());
99 else if (oTag
.m_szName
== L
"window")
102 m_RealText
.m_WindowTag
= oTag
;
106 else if (oTag
.m_szName
== L
"center")
108 m_RealText
.m_bCenter
= true;
110 else if (oTag
.m_szName
== L
"required")
114 else if (oTag
.m_szName
== L
"")
120 // assume formating tag (handled later)
121 listTags
.push_back(oTag
);
127 if (!ExtractTextTag(p_szFile
, oTextTag
))
130 listTags
.push_back(oTextTag
);
135 // FilterReduntantTags(listTags);
136 wstring szLine
= RenderTags(listTags
);
138 if (bPrevEndTimeMissing
)
140 pair
<int, int> pairTimecodes(vStartTimecodes
.back(), vStartTimecodes
.back() + m_iDefaultSubtitleDurationInMillisecs
);
142 if (szLine
.length() > 0)
143 m_RealText
.m_mapLines
[pairTimecodes
] = szLine
;
145 bPrevEndTimeMissing
= false;
147 else if (vStartTimecodes
.size() > 0 && vEndTimecodes
.size() > 0)
149 pair
<int, int> pairTimecodes(vStartTimecodes
.back(), vEndTimecodes
.back());
151 if (szLine
.length() > 0)
152 m_RealText
.m_mapLines
[pairTimecodes
] = szLine
;
// Read-only accessor returning a const reference to the parser's Subtitles
// result (presumably m_RealText, which ParseRealText() fills - confirm).
159 const CRealTextParser::Subtitles
& CRealTextParser::GetParsedSubtitles()
164 bool CRealTextParser::ExtractTag(wstring
& p_rszLine
, Tag
& p_rTag
)
166 if (p_rszLine
.length() < 2 || p_rszLine
.at(0) != '<')
168 if (m_bTryToIgnoreErrors
)
170 size_t iTempPos
= p_rszLine
.find_first_of('<');
172 if (iTempPos
!= wstring::npos
)
174 p_rszLine
= p_rszLine
.substr(iTempPos
);
176 if (p_rszLine
.length() < 2)
187 unsigned int iPos
= 1;
190 if (p_rszLine
.at(iPos
) == '!')
192 p_rTag
.m_bComment
= true;
195 GetString(p_rszLine
, iPos
, szComment
, L
">");
196 p_rTag
.m_szName
= szComment
;
199 p_rszLine
= p_rszLine
.substr(iPos
);
204 p_rTag
.m_bComment
= false;
207 if (!SkipSpaces(p_rszLine
, iPos
))
210 if (p_rszLine
.at(iPos
) == '/')
212 p_rTag
.m_bOpen
= false;
213 p_rTag
.m_bClose
= true;
218 p_rTag
.m_bOpen
= true;
219 p_rTag
.m_bClose
= false;
222 if (!GetString(p_rszLine
, iPos
, p_rTag
.m_szName
, L
"\r\n\t />"))
225 p_rTag
.m_szName
= StringToLower(p_rTag
.m_szName
);
227 if (!GetAttributes(p_rszLine
, iPos
, p_rTag
.m_mapAttributes
))
230 if (p_rszLine
.at(iPos
) == '/')
233 p_rTag
.m_bClose
= true;
236 if (p_rszLine
.at(iPos
) == '>')
239 p_rszLine
= p_rszLine
.substr(iPos
);
244 if (m_bTryToIgnoreErrors
)
246 size_t iTempPos
= p_rszLine
.find_first_of('>');
248 if (iTempPos
!= wstring::npos
)
250 if (iTempPos
- 1 >= p_rszLine
.length())
253 p_rszLine
= p_rszLine
.substr(iTempPos
+ 1);
269 bool CRealTextParser::ExtractTextTag(wstring
& p_rszLine
, Tag
& p_rTag
)
271 p_rTag
.m_bText
= true;
272 return ExtractString(p_rszLine
, p_rTag
.m_szName
);
275 bool CRealTextParser::ExtractString(wstring
& p_rszLine
, wstring
& p_rszString
)
277 if (p_rszLine
.length() == 0 || p_rszLine
.at(0) == '<')
279 if (m_bTryToIgnoreErrors
)
290 unsigned int iPos
= 0;
292 if (!SkipSpaces(p_rszLine
, iPos
))
295 if (!GetString(p_rszLine
, iPos
, p_rszString
, L
"<"))
298 p_rszLine
= p_rszLine
.substr(iPos
);
302 bool CRealTextParser::SkipSpaces(wstring
& p_rszLine
, unsigned int& p_riPos
)
304 while (p_rszLine
.length() > p_riPos
&& iswspace(p_rszLine
.at(p_riPos
)))
309 return p_rszLine
.length() > p_riPos
;
312 bool CRealTextParser::GetString(wstring
& p_rszLine
, unsigned int& p_riPos
, wstring
& p_rszString
, const wstring
& p_crszEndChars
)
314 while (p_rszLine
.length() > p_riPos
&& p_crszEndChars
.find(p_rszLine
.at(p_riPos
)) == wstring::npos
)
316 p_rszString
+= p_rszLine
.at(p_riPos
);
320 return p_rszLine
.length() > p_riPos
;
323 bool CRealTextParser::GetAttributes(wstring
& p_rszLine
, unsigned int& p_riPos
, map
<wstring
, wstring
>& p_rmapAttributes
)
325 if (!SkipSpaces(p_rszLine
, p_riPos
))
328 while (p_riPos
>p_rszLine
.length() && p_rszLine
.at(p_riPos
) != '/' && p_rszLine
.at(p_riPos
) != '>')
331 if (!GetString(p_rszLine
, p_riPos
, szName
, L
"\r\n\t ="))
334 if (!SkipSpaces(p_rszLine
, p_riPos
))
337 if (p_rszLine
.at(p_riPos
) != '=')
339 if (m_bTryToIgnoreErrors
)
341 p_riPos
= p_rszLine
.find_first_of('=', p_riPos
);
342 if (p_riPos
== wstring::npos
)
353 if (!SkipSpaces(p_rszLine
, p_riPos
))
356 bool bUsesQuotes(false);
357 if (p_rszLine
.at(p_riPos
) == '\'' || p_rszLine
.at(p_riPos
) == '\"')
363 if (!SkipSpaces(p_rszLine
, p_riPos
))
369 if (!GetString(p_rszLine
, p_riPos
, szValue
, L
"\"\'/>"))
374 if (!GetString(p_rszLine
, p_riPos
, szValue
, L
" \t/>"))
378 p_rmapAttributes
[StringToLower(szName
)] = szValue
;
380 if (!SkipSpaces(p_rszLine
, p_riPos
))
383 if (p_rszLine
.at(p_riPos
) == '\'' || p_rszLine
.at(p_riPos
) == '\"')
386 if (!SkipSpaces(p_rszLine
, p_riPos
))
390 return p_rszLine
.length() > p_riPos
;
393 int CRealTextParser::GetTimecode(const wstring
& p_crszTimecode
)
398 // Exception: if the timecode doesn't contain any separators, assume the time code is in seconds (and change multiplier to reflect that)
399 if (p_crszTimecode
.find_first_of('.') == wstring::npos
&& p_crszTimecode
.find_first_of(':') == wstring::npos
)
402 wstring szCurrentPart
;
404 for (int i
= p_crszTimecode
.length() - 1; i
>= 0; --i
)
406 if (p_crszTimecode
.at(i
) == '.' || p_crszTimecode
.at(i
) == ':')
408 if (iMultiplier
== 1)
410 while (szCurrentPart
.length() < 3)
411 szCurrentPart
+= L
"0";
414 iTimecode
+= iMultiplier
* ::_wtoi(szCurrentPart
.c_str());
416 if (iMultiplier
== 1)
429 szCurrentPart
= p_crszTimecode
.substr(i
, 1) + szCurrentPart
;
433 iTimecode
+= iMultiplier
* ::_wtoi(szCurrentPart
.c_str());
438 wstring
CRealTextParser::FormatTimecode(int iTimecode
,
439 int iMillisecondPrecision
/* = 3*/,
440 bool p_bPadZeroes
/* = true*/,
441 const wstring
& p_crszSeparator
/* = ":"*/,
442 const wstring
& p_crszMillisecondSeparator
/* = "."*/)
444 wostringstream ossTimecode
;
446 int iHours
= iTimecode
/ 1000 / 60 / 60;
448 ossTimecode
<< iHours
;
450 int iMinutes
= (iTimecode
/ 1000 / 60) % 60;
452 ossTimecode
<< p_crszSeparator
;
453 ossTimecode
<< iMinutes
;
455 int iSeconds
= (iTimecode
/ 1000) % 60;
457 ossTimecode
<< p_crszSeparator
;
458 ossTimecode
<< iSeconds
;
460 int iMilliSeconds
= iTimecode
% 1000;
462 if (iMillisecondPrecision
< 3)
463 iMilliSeconds
/= 10 * (3 - iMillisecondPrecision
);
465 ossTimecode
<< p_crszMillisecondSeparator
;
466 ossTimecode
<< iMilliSeconds
;
468 return ossTimecode
.str();
471 wstring
CRealTextParser::StringToLower(const wstring
& p_crszString
)
473 wstring szLowercaseString
;
474 for(unsigned int i
=0; i
< p_crszString
.length(); ++i
)
476 szLowercaseString
+= towlower(p_crszString
.at(i
));
478 return szLowercaseString
;
481 wstring
CRealTextParser::RenderTags(const list
<Tag
>& p_crlTags
)
486 for (list
<Tag
>::const_iterator iter
= p_crlTags
.begin(); iter
!= p_crlTags
.end(); ++iter
)
490 if (oTag
.m_szName
== L
"br")
494 else if (oTag
.m_szName
== L
"b")
496 if (!m_bIgnoreFontWeight
)
502 else if (oTag
.m_bClose
)
508 else if (oTag
.m_szName
== L
"i")
510 if (!m_bIgnoreFontWeight
)
516 else if (oTag
.m_bClose
)
522 else if (oTag
.m_szName
== L
"font")
528 szString
+= L
"<font";
529 for (map
<wstring
, wstring
>:: iterator i
= oTag
.m_mapAttributes
.begin(); i
!= oTag
.m_mapAttributes
.end(); ++i
)
531 if (m_bIgnoreFontSize
&& i
->first
== L
"size")
534 if (m_bIgnoreFontColor
&& i
->first
== L
"color")
537 if (m_bIgnoreFontFace
&& i
->first
== L
"face")
540 if (i
->first
== L
"size" && i
->second
.length() > 0 && ::iswdigit(i
->second
.at(0)))
542 int iSize
= ::_wtoi(i
->second
.c_str());
544 if (iSize
> 0 && iSize
< m_iMinFontSize
)
547 if (iSize
> m_iMaxFontSize
)
552 szString
+= i
->first
;
554 szString
+= i
->second
;
562 szString
+= L
"</font>";
566 else if (oTag
.m_bText
)
568 szString
+= oTag
.m_szName
;
570 if (!oTag
.m_szName
.empty())
575 // AfxMessageBox(CString(_T("Unknown RealText-tag: ")) + oTag.m_szName.c_str());
585 bool CRealTextParser::OutputSRT(wostream
& p_rOutput
)
588 for (map
<pair
<int, int>, wstring
>::const_iterator i
= m_RealText
.m_mapLines
.begin();
589 i
!= m_RealText
.m_mapLines
.end();
592 p_rOutput
<< iCounter
++;
595 p_rOutput
<< FormatTimecode(i
->first
.first
);
596 p_rOutput
<< L
" --> ";
597 p_rOutput
<< FormatTimecode(i
->first
.second
);
600 p_rOutput
<< i
->second
;
608 void CRealTextParser::PopTag(list
<Tag
>& p_rlistTags
, const wstring
& p_crszTagName
)
610 for (list
<Tag
>::reverse_iterator riter
= p_rlistTags
.rbegin(); riter
!= p_rlistTags
.rend(); ++riter
)
612 if (riter
->m_szName
== p_crszTagName
)
614 p_rlistTags
.erase((++riter
).base());
620 void CRealTextParser::FilterReduntantTags(list
<Tag
>& p_rlistTags
)
622 list
<Tag
>::iterator iterPrev
;
623 for (list
<Tag
>::iterator iterCurrent
= p_rlistTags
.begin(); iterCurrent
!= p_rlistTags
.end(); ++iterCurrent
)
625 if (iterCurrent
!= p_rlistTags
.begin())
627 if (iterPrev
->m_szName
== L
"font" && iterCurrent
->m_szName
== L
"font" &&
628 iterPrev
->m_bOpen
&& iterCurrent
->m_bOpen
)
630 p_rlistTags
.erase(iterPrev
);
633 iterPrev
= iterCurrent
;