Merge with MPC-HC 6d1472b2f18266d92e5bc068667de348c0cd3b3b.
[xy_vsfilter.git] / src / subtitles / RealTextParser.cpp
blobc89c9074ced8003b3421e3662cb323b5b20b3226
1 #include "StdAfx.h"
2 #include "RealTextParser.h"
4 CRealTextParser::CRealTextParser(void):
5 m_bIgnoreFont(false),
6 m_bIgnoreFontSize(false),
7 m_bIgnoreFontColor(false),
8 m_bIgnoreFontWeight(false),
9 m_bIgnoreFontFace(false),
10 m_iMinFontSize(14),
11 m_iMaxFontSize(25),
12 m_iDefaultSubtitleDurationInMillisecs(4000),
13 m_bTryToIgnoreErrors(true)
17 CRealTextParser::~CRealTextParser(void)
21 bool CRealTextParser::ParseRealText(wstring p_szFile)
23 vector<int> vStartTimecodes;
24 vector<int> vEndTimecodes;
25 bool bPrevEndTimeMissing(false);
26 list<Tag> listTags;
27 list<Tag> listPreviousOpenTags;
29 while (p_szFile.length() > 0)
31 if (p_szFile.at(0) == '<')
33 Tag oTag;
34 if (!ExtractTag(p_szFile, oTag))
35 return false;
37 if (oTag.m_bComment)
38 continue;
40 if (oTag.m_szName == L"time")
42 int iStartTimecode = GetTimecode(oTag.m_mapAttributes[L"begin"]);
43 int iEndTimecode = GetTimecode(oTag.m_mapAttributes[L"end"]);
45 // FilterReduntantTags(listTags);
46 wstring szLine = RenderTags(listTags);
48 if (bPrevEndTimeMissing)
50 pair<int, int> pairTimecodes(vStartTimecodes.back(), iStartTimecode);
52 // Fix issues where the next time code isn't valid end time code for the previous subtitle
53 if (pairTimecodes.first >= pairTimecodes.second)
55 pairTimecodes.second = pairTimecodes.first + m_iDefaultSubtitleDurationInMillisecs;
58 if (szLine.length() > 0)
59 m_RealText.m_mapLines[pairTimecodes] = szLine;
61 bPrevEndTimeMissing = false;
63 else if (vStartTimecodes.size() > 0 && vEndTimecodes.size() > 0)
65 pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());
67 if (szLine.length() > 0)
68 m_RealText.m_mapLines[pairTimecodes] = szLine;
72 vStartTimecodes.push_back(iStartTimecode);
73 if (iEndTimecode <= 0)
75 bPrevEndTimeMissing = true;
77 else
79 vEndTimecodes.push_back(iEndTimecode);
82 else if (oTag.m_szName == L"b" || oTag.m_szName == L"i" || oTag.m_szName == L"font")
84 if (oTag.m_bOpen)
85 listPreviousOpenTags.push_back(oTag);
87 if (oTag.m_bClose)
88 PopTag(listPreviousOpenTags, oTag.m_szName);
90 listTags.push_back(oTag);
92 else if (oTag.m_szName == L"clear")
94 listTags.clear();
96 // set existing tags
97 listTags.insert(listTags.end(), listPreviousOpenTags.begin(), listPreviousOpenTags.end());
99 else if (oTag.m_szName == L"window")
101 if (oTag.m_bOpen)
102 m_RealText.m_WindowTag = oTag;
104 // Ignore close
106 else if (oTag.m_szName == L"center")
108 m_RealText.m_bCenter = true;
110 else if (oTag.m_szName == L"required")
112 // Ignore
114 else if (oTag.m_szName == L"")
116 // Ignore
118 else
120 // assume formating tag (handled later)
121 listTags.push_back(oTag);
124 else
126 Tag oTextTag;
127 if (!ExtractTextTag(p_szFile, oTextTag))
128 return false;
130 listTags.push_back(oTextTag);
134 // Handle final line
135 // FilterReduntantTags(listTags);
136 wstring szLine = RenderTags(listTags);
138 if (bPrevEndTimeMissing)
140 pair<int, int> pairTimecodes(vStartTimecodes.back(), vStartTimecodes.back() + m_iDefaultSubtitleDurationInMillisecs);
142 if (szLine.length() > 0)
143 m_RealText.m_mapLines[pairTimecodes] = szLine;
145 bPrevEndTimeMissing = false;
147 else if (vStartTimecodes.size() > 0 && vEndTimecodes.size() > 0)
149 pair<int, int> pairTimecodes(vStartTimecodes.back(), vEndTimecodes.back());
151 if (szLine.length() > 0)
152 m_RealText.m_mapLines[pairTimecodes] = szLine;
156 return true;
159 const CRealTextParser::Subtitles& CRealTextParser::GetParsedSubtitles()
161 return m_RealText;
164 bool CRealTextParser::ExtractTag(wstring& p_rszLine, Tag& p_rTag)
166 if (p_rszLine.length() < 2 || p_rszLine.at(0) != '<')
168 if (m_bTryToIgnoreErrors)
170 size_t iTempPos = p_rszLine.find_first_of('<');
172 if (iTempPos != wstring::npos)
174 p_rszLine = p_rszLine.substr(iTempPos);
176 if (p_rszLine.length() < 2)
177 return false;
181 else
183 return false;
187 unsigned int iPos = 1;
189 // skip comments
190 if (p_rszLine.at(iPos) == '!')
192 p_rTag.m_bComment = true;
194 wstring szComment;
195 GetString(p_rszLine, iPos, szComment, L">");
196 p_rTag.m_szName = szComment;
198 ++iPos; // Skip >
199 p_rszLine = p_rszLine.substr(iPos);
200 return true;
202 else
204 p_rTag.m_bComment = false;
207 if (!SkipSpaces(p_rszLine, iPos))
208 return false;
210 if (p_rszLine.at(iPos) == '/')
212 p_rTag.m_bOpen = false;
213 p_rTag.m_bClose = true;
214 ++iPos;
216 else
218 p_rTag.m_bOpen = true;
219 p_rTag.m_bClose = false;
222 if (!GetString(p_rszLine, iPos, p_rTag.m_szName, L"\r\n\t />"))
223 return false;
225 p_rTag.m_szName = StringToLower(p_rTag.m_szName);
227 if (!GetAttributes(p_rszLine, iPos, p_rTag.m_mapAttributes))
228 return false;
230 if (p_rszLine.at(iPos) == '/')
232 ++iPos;
233 p_rTag.m_bClose = true;
236 if (p_rszLine.at(iPos) == '>')
238 ++iPos;
239 p_rszLine = p_rszLine.substr(iPos);
240 return true;
242 else
244 if (m_bTryToIgnoreErrors)
246 size_t iTempPos = p_rszLine.find_first_of('>');
248 if (iTempPos != wstring::npos)
250 if (iTempPos - 1 >= p_rszLine.length())
251 return false;
253 p_rszLine = p_rszLine.substr(iTempPos + 1);
254 return true;
256 else
258 return false;
262 else
264 return false;
269 bool CRealTextParser::ExtractTextTag(wstring& p_rszLine, Tag& p_rTag)
271 p_rTag.m_bText = true;
272 return ExtractString(p_rszLine, p_rTag.m_szName);
275 bool CRealTextParser::ExtractString(wstring& p_rszLine, wstring& p_rszString)
277 if (p_rszLine.length() == 0 || p_rszLine.at(0) == '<')
279 if (m_bTryToIgnoreErrors)
281 p_rszString = L"";
282 return true;
284 else
286 return false;
290 unsigned int iPos = 0;
292 if (!SkipSpaces(p_rszLine, iPos))
293 return false;
295 if (!GetString(p_rszLine, iPos, p_rszString, L"<"))
296 return false;
298 p_rszLine = p_rszLine.substr(iPos);
299 return true;
302 bool CRealTextParser::SkipSpaces(wstring& p_rszLine, unsigned int& p_riPos)
304 while (p_rszLine.length() > p_riPos && iswspace(p_rszLine.at(p_riPos)))
306 ++p_riPos;
309 return p_rszLine.length() > p_riPos;
312 bool CRealTextParser::GetString(wstring& p_rszLine, unsigned int& p_riPos, wstring& p_rszString, const wstring& p_crszEndChars)
314 while (p_rszLine.length() > p_riPos && p_crszEndChars.find(p_rszLine.at(p_riPos)) == wstring::npos)
316 p_rszString += p_rszLine.at(p_riPos);
317 ++p_riPos;
320 return p_rszLine.length() > p_riPos;
323 bool CRealTextParser::GetAttributes(wstring& p_rszLine, unsigned int& p_riPos, map<wstring, wstring>& p_rmapAttributes)
325 if (!SkipSpaces(p_rszLine, p_riPos))
326 return false;
328 while (p_riPos>p_rszLine.length() && p_rszLine.at(p_riPos) != '/' && p_rszLine.at(p_riPos) != '>')
330 wstring szName;
331 if (!GetString(p_rszLine, p_riPos, szName, L"\r\n\t ="))
332 return false;
334 if (!SkipSpaces(p_rszLine, p_riPos))
335 return false;
337 if (p_rszLine.at(p_riPos) != '=')
339 if (m_bTryToIgnoreErrors)
341 p_riPos = p_rszLine.find_first_of('=', p_riPos);
342 if (p_riPos == wstring::npos)
343 return false;
345 else
347 return false;
351 ++p_riPos;
353 if (!SkipSpaces(p_rszLine, p_riPos))
354 return false;
356 bool bUsesQuotes(false);
357 if (p_rszLine.at(p_riPos) == '\'' || p_rszLine.at(p_riPos) == '\"')
359 ++p_riPos;
360 bUsesQuotes = true;
363 if (!SkipSpaces(p_rszLine, p_riPos))
364 return false;
366 wstring szValue;
367 if (bUsesQuotes)
369 if (!GetString(p_rszLine, p_riPos, szValue, L"\"\'/>"))
370 return false;
372 else
374 if (!GetString(p_rszLine, p_riPos, szValue, L" \t/>"))
375 return false;
378 p_rmapAttributes[StringToLower(szName)] = szValue;
380 if (!SkipSpaces(p_rszLine, p_riPos))
381 return false;
383 if (p_rszLine.at(p_riPos) == '\'' || p_rszLine.at(p_riPos) == '\"')
384 ++p_riPos;
386 if (!SkipSpaces(p_rszLine, p_riPos))
387 return false;
390 return p_rszLine.length() > p_riPos;
393 int CRealTextParser::GetTimecode(const wstring& p_crszTimecode)
395 int iTimecode(0);
396 int iMultiplier(1);
398 // Exception: if the timecode doesn't contain any separators, assume the time code is in seconds (and change multiplier to reflect that)
399 if (p_crszTimecode.find_first_of('.') == wstring::npos && p_crszTimecode.find_first_of(':') == wstring::npos)
400 iMultiplier = 1000;
402 wstring szCurrentPart;
404 for (int i = p_crszTimecode.length() - 1; i >= 0; --i)
406 if (p_crszTimecode.at(i) == '.' || p_crszTimecode.at(i) == ':')
408 if (iMultiplier == 1)
410 while (szCurrentPart.length() < 3)
411 szCurrentPart += L"0";
414 iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str());
416 if (iMultiplier == 1)
418 iMultiplier = 1000;
420 else
422 iMultiplier *= 60;
425 szCurrentPart = L"";
427 else
429 szCurrentPart = p_crszTimecode.substr(i, 1) + szCurrentPart;
433 iTimecode += iMultiplier * ::_wtoi(szCurrentPart.c_str());
435 return iTimecode;
438 wstring CRealTextParser::FormatTimecode(int iTimecode,
439 int iMillisecondPrecision/* = 3*/,
440 bool p_bPadZeroes/* = true*/,
441 const wstring& p_crszSeparator/* = ":"*/,
442 const wstring& p_crszMillisecondSeparator/* = "."*/)
444 wostringstream ossTimecode;
446 int iHours = iTimecode / 1000 / 60 / 60;
448 ossTimecode << iHours;
450 int iMinutes = (iTimecode / 1000 / 60) % 60;
452 ossTimecode << p_crszSeparator;
453 ossTimecode << iMinutes;
455 int iSeconds = (iTimecode / 1000) % 60;
457 ossTimecode << p_crszSeparator;
458 ossTimecode << iSeconds;
460 int iMilliSeconds = iTimecode % 1000;
462 if (iMillisecondPrecision < 3)
463 iMilliSeconds /= 10 * (3 - iMillisecondPrecision);
465 ossTimecode << p_crszMillisecondSeparator;
466 ossTimecode << iMilliSeconds;
468 return ossTimecode.str();
471 wstring CRealTextParser::StringToLower(const wstring& p_crszString)
473 wstring szLowercaseString;
474 for(unsigned int i=0; i < p_crszString.length(); ++i)
476 szLowercaseString += towlower(p_crszString.at(i));
478 return szLowercaseString;
481 wstring CRealTextParser::RenderTags(const list<Tag>& p_crlTags)
483 bool bEmpty(true);
484 wstring szString;
486 for (list<Tag>::const_iterator iter = p_crlTags.begin(); iter != p_crlTags.end(); ++iter)
488 Tag oTag(*iter);
490 if (oTag.m_szName == L"br")
492 szString += L"\n";
494 else if (oTag.m_szName == L"b")
496 if (!m_bIgnoreFontWeight)
498 if (oTag.m_bOpen)
500 szString += L"<b>";
502 else if (oTag.m_bClose)
504 szString += L"</b>";
508 else if (oTag.m_szName == L"i")
510 if (!m_bIgnoreFontWeight)
512 if (oTag.m_bOpen)
514 szString += L"<i>";
516 else if (oTag.m_bClose)
518 szString += L"</i>";
522 else if (oTag.m_szName == L"font")
524 if (!m_bIgnoreFont)
526 if (oTag.m_bOpen)
528 szString += L"<font";
529 for (map<wstring, wstring>:: iterator i = oTag.m_mapAttributes.begin(); i != oTag.m_mapAttributes.end(); ++i)
531 if (m_bIgnoreFontSize && i->first == L"size")
532 continue;
534 if (m_bIgnoreFontColor && i->first == L"color")
535 continue;
537 if (m_bIgnoreFontFace && i->first == L"face")
538 continue;
540 if (i->first == L"size" && i->second.length() > 0 && ::iswdigit(i->second.at(0)))
542 int iSize = ::_wtoi(i->second.c_str());
544 if (iSize > 0 && iSize < m_iMinFontSize)
545 continue;
547 if (iSize > m_iMaxFontSize)
548 continue;
551 szString += L" ";
552 szString += i->first;
553 szString += L"=\"";
554 szString += i->second;
555 szString += L"\"";
557 szString += L">";
560 if (oTag.m_bClose)
562 szString += L"</font>";
566 else if (oTag.m_bText)
568 szString += oTag.m_szName;
570 if (!oTag.m_szName.empty())
571 bEmpty = false;
573 else
575 // AfxMessageBox(CString(_T("Unknown RealText-tag: ")) + oTag.m_szName.c_str());
579 if (bEmpty)
580 return L"";
581 else
582 return szString;
585 bool CRealTextParser::OutputSRT(wostream& p_rOutput)
587 int iCounter(1);
588 for (map<pair<int, int>, wstring>::const_iterator i = m_RealText.m_mapLines.begin();
589 i != m_RealText.m_mapLines.end();
590 ++i)
592 p_rOutput << iCounter++;
593 p_rOutput << endl;
595 p_rOutput << FormatTimecode(i->first.first);
596 p_rOutput << L" --> ";
597 p_rOutput << FormatTimecode(i->first.second);
598 p_rOutput << endl;
600 p_rOutput << i->second;
601 p_rOutput << endl;
602 p_rOutput << endl;
605 return true;
608 void CRealTextParser::PopTag(list<Tag>& p_rlistTags, const wstring& p_crszTagName)
610 for (list<Tag>::reverse_iterator riter = p_rlistTags.rbegin(); riter != p_rlistTags.rend(); ++riter)
612 if (riter->m_szName == p_crszTagName)
614 p_rlistTags.erase((++riter).base());
615 return;
620 void CRealTextParser::FilterReduntantTags(list<Tag>& p_rlistTags)
622 list<Tag>::iterator iterPrev;
623 for (list<Tag>::iterator iterCurrent = p_rlistTags.begin(); iterCurrent != p_rlistTags.end(); ++iterCurrent)
625 if (iterCurrent != p_rlistTags.begin())
627 if (iterPrev->m_szName == L"font" && iterCurrent->m_szName == L"font" &&
628 iterPrev->m_bOpen && iterCurrent->m_bOpen)
630 p_rlistTags.erase(iterPrev);
633 iterPrev = iterCurrent;