Mailinglist Archive: opensuse-programming (16 mails)
| < Previous | Next > |
RE: [opensuse-programming] extracting text from html
- From: Luna Rodríguez, Raúl <rlunaro@xxxxxxxxxx>
- Date: Tue, 25 May 2010 18:08:06 +0200
- Message-id: <2DCDC91195A7FB4F8330AEE7E172296A019DF5CC@xxxxxxxxxxxxxxxxxxxxxxxxxxx>
Hello:
I do not normally work on Linux, but I ran into the same problem some time ago and
solved it by building a tag remover in Basic. It works very well at filtering the HTML
produced by Word (Word is one of the dirtiest HTML generators).
The source code is below, if you want. It is pretty simple to convert it to
C++.
The problem is that my function doesn't filter well when you write "<", ">" as
text of the document: the normal way is to write "&lt;", "&gt;" instead.
Greetings.
----------------------------------------------------
' Strips the markup from an HTML string and returns the remaining plain text.
'
' Parameters:
'   cIn - the HTML text to filter.
' Returns:
'   cIn with every "<...>" span removed and with the character entities
'   listed by ArrayCaracteres replaced by their plain-text equivalents.
'
' WARNING. Every run that starts with "<" and ends with the next ">" is
' assumed to be an HTML tag. In other words, a text like:
'
'   .... blah, blah 3 < 5 and 5 > 9, it is known that ....
'
' is filtered as:
'
'   .... blah, blah 3  9, it is known that ....
'
' Literal "<" and ">" should be written as "&lt;" and "&gt;" in the input.
' A "<" that is followed by another "<" before any ">", or that is never
' closed at all, is not a tag: it is kept in the output as plain text.
Public Function stripHTML_Re(ByVal cIn As String) As String
    Dim cOut As String
    Dim cChar As String
    Dim nTagStart As Long   ' position of the pending "<"; 0 while outside a tag
    Dim nCurrent As Long

    cOut = ""
    nTagStart = 0
    For nCurrent = 1 To Len(cIn)
        cChar = Mid$(cIn, nCurrent, 1)
        If cChar = "<" Then
            ' second "<" before any ">": the first one did not open a tag,
            ' so copy it and the text that followed it to the output
            If nTagStart <> 0 Then
                cOut = cOut & Mid$(cIn, nTagStart, nCurrent - nTagStart)
            End If
            nTagStart = nCurrent
        ElseIf nTagStart = 0 Then
            ' plain text (this includes a ">" that closes nothing)
            cOut = cOut & cChar
        ElseIf cChar = ">" Then
            ' end of the tag: everything since nTagStart is discarded
            nTagStart = 0
        End If
    Next nCurrent

    ' a "<" that was never closed is text, not a tag: do not lose it
    If nTagStart <> 0 Then
        cOut = cOut & Mid$(cIn, nTagStart)
    End If

    ' replace the character entities ("&aacute;" and the like) by their
    ' equivalent in the Windows character set
    Dim aHtmlCh() As String
    Dim nCount As Long
    Dim nPass As Long

    aHtmlCh = ArrayCaracteres
    ' Pass 1 decodes every entity except the one that yields "&"; pass 2
    ' decodes that one. Decoding it last keeps "&amp;lt;" from turning into
    ' "<" through the intermediate "&lt;".
    For nPass = 1 To 2
        For nCount = LBound(aHtmlCh) To UBound(aHtmlCh)
            If (aHtmlCh(nCount, 1) = "&") = (nPass = 2) Then
                cOut = replaceEntity_Re(cOut, aHtmlCh(nCount, 0), aHtmlCh(nCount, 1))
            End If
        Next nCount
    Next nPass

    stripHTML_Re = cOut
End Function

' Replaces every occurrence of the entity cEntity in cText by cChar.
'
' Some HTML editors write "&nbsp" instead of "&nbsp;", so for an entity of
' the form "&name;" the closing ";" is optional. The text is scanned once
' from left to right, which means the replacement text is never examined
' again. A match whose name goes on with a letter or a digit is skipped,
' because it belongs to a longer entity ("&plus" inside "&plusmn;").
Private Function replaceEntity_Re(ByVal cText As String, ByVal cEntity As String, ByVal cChar As String) As String
    Dim cBare As String     ' the entity without its ";"
    Dim cResult As String
    Dim nCopy As Long       ' first position of cText not yet copied
    Dim nHit As Long        ' position of the current match
    Dim nAfter As Long      ' position right after the current match

    cBare = Replace(cEntity, ";", "")
    If Len(cBare) = 0 Or cEntity <> cBare & ";" Then
        ' not of the form "name;": plain replacement of both spellings
        cResult = Replace(cText, cEntity, cChar)
        If Len(cBare) > 0 Then
            cResult = Replace(cResult, cBare, cChar)
        End If
        replaceEntity_Re = cResult
        Exit Function
    End If

    cResult = ""
    nCopy = 1
    nHit = InStr(1, cText, cBare, vbBinaryCompare)
    Do While nHit > 0
        nAfter = nHit + Len(cBare)
        If Not (Mid$(cText, nAfter, 1) Like "[A-Za-z0-9]") Then
            cResult = cResult & Mid$(cText, nCopy, nHit - nCopy) & cChar
            ' swallow the ";" that closes the entity, when present
            If Mid$(cText, nAfter, 1) = ";" Then
                nAfter = nAfter + 1
            End If
            nCopy = nAfter
        End If
        nHit = InStr(nAfter, cText, cBare, vbBinaryCompare)
    Loop
    replaceEntity_Re = cResult & Mid$(cText, nCopy)
End Function
' Returns the table that maps HTML character entities to plain characters.
'
' The result is a two-dimensional String array indexed (0 To 119, 0 To 1):
'   (n, 0) - the entity as written in HTML, e.g. "&nbsp;"
'   (n, 1) - the text that replaces it, e.g. " "
'
' Each entity maps to the character it names. The exceptions are:
'   "&hyphen;"  -> "-"   (plain ASCII hyphen)
'   "&circ;"    -> "^"   and "&tilde;" -> "~" (ASCII forms)
'   "&nbsp;"    -> " "   (ordinary space)
'   "&shy;"     -> ""    (a soft hyphen is only a line-break hint)
'   "&etilde;", "&itilde;", "&utilde;" -> "e'", "i'", "u'" (ASCII fallback)
'
' The table is filled on the first call only and kept in a Static array.
'
' NOTE(review): several common entities have no entry here, among them
' "&ntilde;", "&ccedil;", "&aring;", "&aelig;" and "&yacute;".
Private Function ArrayCaracteres() As String()
    Static aHtmlCh(119, 1) As String

    ' (0, 0) is never empty once the table has been filled
    If Len(aHtmlCh(0, 0)) = 0 Then
        ' ASCII punctuation
        aHtmlCh(0, 0) = "&excl;": aHtmlCh(0, 1) = "!"
        aHtmlCh(1, 0) = "&num;": aHtmlCh(1, 1) = "#"
        aHtmlCh(2, 0) = "&dollar;": aHtmlCh(2, 1) = "$"
        aHtmlCh(3, 0) = "&percnt;": aHtmlCh(3, 1) = "%"
        aHtmlCh(4, 0) = "&amp;": aHtmlCh(4, 1) = "&"
        aHtmlCh(5, 0) = "&quot;": aHtmlCh(5, 1) = """"
        aHtmlCh(6, 0) = "&lpar;": aHtmlCh(6, 1) = "("
        aHtmlCh(7, 0) = "&rpar;": aHtmlCh(7, 1) = ")"
        aHtmlCh(8, 0) = "&ast;": aHtmlCh(8, 1) = "*"
        aHtmlCh(9, 0) = "&plus;": aHtmlCh(9, 1) = "+"
        aHtmlCh(10, 0) = "&comma;": aHtmlCh(10, 1) = ","
        aHtmlCh(11, 0) = "&hyphen;": aHtmlCh(11, 1) = "-"
        aHtmlCh(12, 0) = "&period;": aHtmlCh(12, 1) = "."
        aHtmlCh(13, 0) = "&sol;": aHtmlCh(13, 1) = "/"
        aHtmlCh(14, 0) = "&colon;": aHtmlCh(14, 1) = ":"
        aHtmlCh(15, 0) = "&semi;": aHtmlCh(15, 1) = ";"
        aHtmlCh(16, 0) = "&lt;": aHtmlCh(16, 1) = "<"
        aHtmlCh(17, 0) = "&equals;": aHtmlCh(17, 1) = "="
        aHtmlCh(18, 0) = "&gt;": aHtmlCh(18, 1) = ">"
        aHtmlCh(19, 0) = "&quest;": aHtmlCh(19, 1) = "?"
        aHtmlCh(20, 0) = "&commat;": aHtmlCh(20, 1) = "@"
        aHtmlCh(21, 0) = "&lsqb;": aHtmlCh(21, 1) = "["
        aHtmlCh(22, 0) = "&bsol;": aHtmlCh(22, 1) = "\"
        aHtmlCh(23, 0) = "&rsqb;": aHtmlCh(23, 1) = "]"
        aHtmlCh(24, 0) = "&circ;": aHtmlCh(24, 1) = "^"
        aHtmlCh(25, 0) = "&lowbar;": aHtmlCh(25, 1) = "_"
        aHtmlCh(26, 0) = "&grave;": aHtmlCh(26, 1) = "`"
        aHtmlCh(27, 0) = "&lcub;": aHtmlCh(27, 1) = "{"
        aHtmlCh(28, 0) = "&verbar;": aHtmlCh(28, 1) = "|"
        aHtmlCh(29, 0) = "&rcub;": aHtmlCh(29, 1) = "}"
        aHtmlCh(30, 0) = "&tilde;": aHtmlCh(30, 1) = "~"

        ' Latin-1 symbols
        aHtmlCh(31, 0) = "&nbsp;": aHtmlCh(31, 1) = " "
        aHtmlCh(32, 0) = "&iexcl;": aHtmlCh(32, 1) = "¡"
        aHtmlCh(33, 0) = "&cent;": aHtmlCh(33, 1) = "¢"
        aHtmlCh(34, 0) = "&pound;": aHtmlCh(34, 1) = "£"
        aHtmlCh(35, 0) = "&curren;": aHtmlCh(35, 1) = "¤"
        aHtmlCh(36, 0) = "&yen;": aHtmlCh(36, 1) = "¥"
        aHtmlCh(37, 0) = "&brvbar;": aHtmlCh(37, 1) = "¦"
        aHtmlCh(38, 0) = "&sect;": aHtmlCh(38, 1) = "§"
        aHtmlCh(39, 0) = "&uml;": aHtmlCh(39, 1) = "¨"
        aHtmlCh(40, 0) = "&copy;": aHtmlCh(40, 1) = "©"
        aHtmlCh(41, 0) = "&ordf;": aHtmlCh(41, 1) = "ª"
        aHtmlCh(42, 0) = "&laquo;": aHtmlCh(42, 1) = "«"
        aHtmlCh(43, 0) = "&not;": aHtmlCh(43, 1) = "¬"
        aHtmlCh(44, 0) = "&shy;": aHtmlCh(44, 1) = ""
        aHtmlCh(45, 0) = "&reg;": aHtmlCh(45, 1) = "®"
        aHtmlCh(46, 0) = "&macr;": aHtmlCh(46, 1) = "¯"
        aHtmlCh(47, 0) = "&deg;": aHtmlCh(47, 1) = "°"
        aHtmlCh(48, 0) = "&plusmn;": aHtmlCh(48, 1) = "±"
        aHtmlCh(49, 0) = "&sup2;": aHtmlCh(49, 1) = "²"
        aHtmlCh(50, 0) = "&sup3;": aHtmlCh(50, 1) = "³"
        aHtmlCh(51, 0) = "&acute;": aHtmlCh(51, 1) = "´"
        aHtmlCh(52, 0) = "&micro;": aHtmlCh(52, 1) = "µ"
        aHtmlCh(53, 0) = "&para;": aHtmlCh(53, 1) = "¶"
        aHtmlCh(54, 0) = "&middot;": aHtmlCh(54, 1) = "·"
        aHtmlCh(55, 0) = "&cedil;": aHtmlCh(55, 1) = "¸"
        aHtmlCh(56, 0) = "&sup1;": aHtmlCh(56, 1) = "¹"
        aHtmlCh(57, 0) = "&ordm;": aHtmlCh(57, 1) = "º"
        aHtmlCh(58, 0) = "&raquo;": aHtmlCh(58, 1) = "»"
        aHtmlCh(59, 0) = "&frac14;": aHtmlCh(59, 1) = "¼"
        aHtmlCh(60, 0) = "&frac12;": aHtmlCh(60, 1) = "½"
        aHtmlCh(61, 0) = "&frac34;": aHtmlCh(61, 1) = "¾"
        aHtmlCh(62, 0) = "&iquest;": aHtmlCh(62, 1) = "¿"

        ' upper-case letters
        aHtmlCh(63, 0) = "&Agrave;": aHtmlCh(63, 1) = "À"
        aHtmlCh(64, 0) = "&Aacute;": aHtmlCh(64, 1) = "Á"
        aHtmlCh(65, 0) = "&Acirc;": aHtmlCh(65, 1) = "Â"
        aHtmlCh(66, 0) = "&Atilde;": aHtmlCh(66, 1) = "Ã"
        aHtmlCh(67, 0) = "&Auml;": aHtmlCh(67, 1) = "Ä"
        aHtmlCh(68, 0) = "&Aring;": aHtmlCh(68, 1) = "Å"
        aHtmlCh(69, 0) = "&AElig;": aHtmlCh(69, 1) = "Æ"
        aHtmlCh(70, 0) = "&Ccedil;": aHtmlCh(70, 1) = "Ç"
        aHtmlCh(71, 0) = "&Egrave;": aHtmlCh(71, 1) = "È"
        aHtmlCh(72, 0) = "&Eacute;": aHtmlCh(72, 1) = "É"
        aHtmlCh(73, 0) = "&Ecirc;": aHtmlCh(73, 1) = "Ê"
        aHtmlCh(74, 0) = "&Euml;": aHtmlCh(74, 1) = "Ë"
        aHtmlCh(75, 0) = "&Igrave;": aHtmlCh(75, 1) = "Ì"
        aHtmlCh(76, 0) = "&Iacute;": aHtmlCh(76, 1) = "Í"
        aHtmlCh(77, 0) = "&Icirc;": aHtmlCh(77, 1) = "Î"
        aHtmlCh(78, 0) = "&Iuml;": aHtmlCh(78, 1) = "Ï"
        aHtmlCh(79, 0) = "&ETH;": aHtmlCh(79, 1) = "Ð"
        aHtmlCh(80, 0) = "&Ntilde;": aHtmlCh(80, 1) = "Ñ"
        aHtmlCh(81, 0) = "&Ograve;": aHtmlCh(81, 1) = "Ò"
        aHtmlCh(82, 0) = "&Oacute;": aHtmlCh(82, 1) = "Ó"
        aHtmlCh(83, 0) = "&Ocirc;": aHtmlCh(83, 1) = "Ô"
        aHtmlCh(84, 0) = "&Otilde;": aHtmlCh(84, 1) = "Õ"
        aHtmlCh(85, 0) = "&Ouml;": aHtmlCh(85, 1) = "Ö"
        aHtmlCh(86, 0) = "&times;": aHtmlCh(86, 1) = "×"
        aHtmlCh(87, 0) = "&Oslash;": aHtmlCh(87, 1) = "Ø"
        aHtmlCh(88, 0) = "&Ugrave;": aHtmlCh(88, 1) = "Ù"
        aHtmlCh(89, 0) = "&Uacute;": aHtmlCh(89, 1) = "Ú"
        aHtmlCh(90, 0) = "&Ucirc;": aHtmlCh(90, 1) = "Û"
        aHtmlCh(91, 0) = "&Uuml;": aHtmlCh(91, 1) = "Ü"
        aHtmlCh(92, 0) = "&Yacute;": aHtmlCh(92, 1) = "Ý"
        aHtmlCh(93, 0) = "&THORN;": aHtmlCh(93, 1) = "Þ"

        ' lower-case letters
        aHtmlCh(94, 0) = "&szlig;": aHtmlCh(94, 1) = "ß"
        aHtmlCh(95, 0) = "&agrave;": aHtmlCh(95, 1) = "à"
        aHtmlCh(96, 0) = "&aacute;": aHtmlCh(96, 1) = "á"
        aHtmlCh(97, 0) = "&acirc;": aHtmlCh(97, 1) = "â"
        aHtmlCh(98, 0) = "&atilde;": aHtmlCh(98, 1) = "ã"
        aHtmlCh(99, 0) = "&auml;": aHtmlCh(99, 1) = "ä"
        aHtmlCh(100, 0) = "&egrave;": aHtmlCh(100, 1) = "è"
        aHtmlCh(101, 0) = "&eacute;": aHtmlCh(101, 1) = "é"
        aHtmlCh(102, 0) = "&ecirc;": aHtmlCh(102, 1) = "ê"
        aHtmlCh(103, 0) = "&etilde;": aHtmlCh(103, 1) = "e'"
        aHtmlCh(104, 0) = "&euml;": aHtmlCh(104, 1) = "ë"
        aHtmlCh(105, 0) = "&igrave;": aHtmlCh(105, 1) = "ì"
        aHtmlCh(106, 0) = "&iacute;": aHtmlCh(106, 1) = "í"
        aHtmlCh(107, 0) = "&icirc;": aHtmlCh(107, 1) = "î"
        aHtmlCh(108, 0) = "&itilde;": aHtmlCh(108, 1) = "i'"
        aHtmlCh(109, 0) = "&iuml;": aHtmlCh(109, 1) = "ï"
        aHtmlCh(110, 0) = "&ograve;": aHtmlCh(110, 1) = "ò"
        aHtmlCh(111, 0) = "&oacute;": aHtmlCh(111, 1) = "ó"
        aHtmlCh(112, 0) = "&ocirc;": aHtmlCh(112, 1) = "ô"
        aHtmlCh(113, 0) = "&otilde;": aHtmlCh(113, 1) = "õ"
        aHtmlCh(114, 0) = "&ouml;": aHtmlCh(114, 1) = "ö"
        aHtmlCh(115, 0) = "&ugrave;": aHtmlCh(115, 1) = "ù"
        aHtmlCh(116, 0) = "&uacute;": aHtmlCh(116, 1) = "ú"
        aHtmlCh(117, 0) = "&ucirc;": aHtmlCh(117, 1) = "û"
        aHtmlCh(118, 0) = "&utilde;": aHtmlCh(118, 1) = "u'"
        aHtmlCh(119, 0) = "&uuml;": aHtmlCh(119, 1) = "ü"
    End If

    ArrayCaracteres = aHtmlCh
End Function
----------------------------------------------------
-----Mensaje original-----
De: Per Jessen [mailto:per@xxxxxxxxxxxx]
Enviado el: martes, 25 de mayo de 2010 16:48
Para: opensuse-programming@xxxxxxxxxxxx
Asunto: [opensuse-programming] extracting text from html
I need to extract text from html for purposes of indexing - implementation
language is C or C++. So far I've come across html2text which is written in C++
- it looks pretty good, but I will need to make some changes to make it fit my
purposes. Does any other library come to mind for extracting text from html?
/Per Jessen, Zürich
--
To unsubscribe, e-mail: opensuse-programming+unsubscribe@xxxxxxxxxxxx
For additional commands, e-mail: opensuse-programming+help@xxxxxxxxxxxx
--
To unsubscribe, e-mail: opensuse-programming+unsubscribe@xxxxxxxxxxxx
For additional commands, e-mail: opensuse-programming+help@xxxxxxxxxxxx
| < Previous | Next > |