EXCEL VBA调用adobe的api识别电子PDF发票里内容并登记台账
代码如下
python
使用须知:
1、工具--引用里勾选[Adobe Acrobat 10.0 Type Library]
2、安装Adobe Acrobat pro软件
Dim sht As Worksheet
Function BrowseFolders() As String '浏览目录
Dim objshell As Object
Dim objFolder As Object
Set objshell = CreateObject("Shell.Application")
Set objFolder = objshell.BrowseForFolder(0, "请指定发票文件所在的文件夹", 0, 0)
BrowseFolders = ""
If Not objFolder Is Nothing Then
BrowseFolders = objFolder.Self.Path
End If
Set objFolder = Nothing
Set objshell = Nothing
End Function
Sub cmd_getpdf_Click()
Dim Pth As String '文件路径
Dim PDFName As String, Wapp As Object, Mstr As String
Application.ScreenUpdating = False
'============================================
Pth = BrowseFolders
If Pth = "" Then
Pth = Sheet1.Range("A9").Text
End If
If Pth = "" Then
Pth = ThisWorkbook.Path
End If
If Right(Pth, 1) <> "\" Then Pth = Pth & "\"
Sheet1.Range("A8") = "上次路径:"
Sheet1.Range("A9") = Pth
Sheet1.Range("a15:a10000") = ""
If Dir(Pth & "*.pdf") = "" Then
MsgBox "指定目录没有找到发票PDF文件!"
Sheet1.Range("A9") = ""
Exit Sub
End If
'Debug.Print Pth
'============================================
For Each sht In ThisWorkbook.Sheets
Application.DisplayAlerts = False
If sht.Name = "发票资料读取到Excel" Then sht.Delete
Application.DisplayAlerts = True
Next
Set sht = Worksheets.Add(, Worksheets(Sheets.Count))
sht.Name = "发票资料读取到Excel"
sht.Range("A1:J1") = Array("发票号码", "发票日期", "货物或*名称", "规格型号", "单位", "数量", "单价", "金额", "税率", "税额")
'============================================定义表头字段
PDFName = Dir(Pth & "*.pdf")
Do While PDFName <> ""
Call Imp_Into_XL(Pth & PDFName)
PDFName = Dir
Loop
sht.Columns.AutoFit
MsgBox "操作完成!"
'============================================
Application.ScreenUpdating = True
End Sub
Sub Imp_Into_XL(PDF_File As String)
Dim AC_PD As Acrobat.AcroPDDoc
Dim AC_Hi As Acrobat.AcroHiliteList
Dim AC_PG As Acrobat.AcroPDPage
Dim AC_PGTxt As Acrobat.AcroPDTextSelect
Dim Yes_Fir As Boolean
Dim Ct_Page As Long
Dim i As Long, j As Long, k As Long, m As Integer
Dim T_Str As String
Dim Hld, XL, Brr(), RowNo%, Arr As Variant, sss%
Dim Hld_Txt As Variant
Dim FPHM As String '发票号码
Dim FPRQ As String '发票日期
Dim GGXH As String '规格型号
Dim HWMC As String '货物名称
Dim SL_SV As String '数量-税率
Dim SL_SV_Temp As String '数量-税率的临时存变量
Dim HWDW As String '货物单位
Dim SL As String '数量
Dim DW As String '单位
Dim XH As String '型号
'====================================================定义字段类型
Set AC_PD = New Acrobat.AcroPDDoc
Set AC_Hi = New Acrobat.AcroHiliteList
AC_Hi.Add 0, 32767
With AC_PD
.Open PDF_File
Ct_Page = .GetNumPages
If Ct_Page = -1 Then
MsgBox "请确认发票文件 '" & PDF_File & "'"
.Close
GoTo h_end
End If
For i = 1 To 1 ' Ct_Page '只考虑一个文档有一张发票的情形
T_Str = ""
Set AC_PG = .AcquirePage(i - 1)
Set AC_PGTxt = AC_PG.CreateWordHilite(AC_Hi)
If Not AC_PGTxt Is Nothing Then
With AC_PGTxt
For j = 0 To .GetNumText - 1
T_Str = T_Str & .GetText(j)
Next j
End With
End If
'==========================================================
If T_Str <> "" Then
Hld_Txt = Split(T_Str, vbCrLf)
FPHM = "": FPRQ = "":: GGXH = "": HWMC = ""
For j = 0 To UBound(Hld_Txt)
If InStr(Hld_Txt(j), "年月日") = 0 Then
If InStr(Hld_Txt(j), "年") > 0 And InStr(Hld_Txt(j), "月") > 0 And InStr(Hld_Txt(j), "日") > 0 Then '当字符串里含有年月日时
Hld_Txt(j) = Repce2(Hld_Txt(j))
Hld_Txt(j) = Trim(Replace(Hld_Txt(j), "开票日期:", "")) '如果有"开票日期:"几个字,将其替换掉
FPRQ = Left(Hld_Txt(j), 4) & "-" & Mid(Hld_Txt(j), 6, 2) & "-" & Mid(Hld_Txt(j), 9, 2)
Exit For
End If
End If
Next j
For j = 0 To UBound(Hld_Txt)
If TestNumber(Hld_Txt(j)) Then '测试是否含有数字并以数字结尾的类型,加以判断
If Len(Hld_Txt(j)) = 10 And TestCH(Hld_Txt(j)) = False Then '当字符串里没有年月日,但是以"2023 06 30"有空格,共有10个字符串位置形式存在时取得发票日期
If InStr(Hld_Txt(j), " ") > 0 And UBound(Split(Hld_Txt(j), " ")) > 0 Then
FPRQ = "'" & RegR(Hld_Txt(j)) '取得发票日期
Exit For
End If
End If
End If
Next j
For j = 0 To UBound(Hld_Txt)
If TestNumber(Hld_Txt(j)) Then '测试是否含有数字并以数字结尾的类型,加以判断
Hld_Txt(j) = Trim(Replace(Hld_Txt(j), "发票号码:", "")) '如果有"发票号码:"几个字,将其替换掉
If Len(Hld_Txt(j)) = 8 Or Len(Hld_Txt(j)) = 20 Then '//***限定要取出的发票号码为8位或者20位数字,否则发票号码取不出来
If IsNumeric(Hld_Txt(j)) Then
If InStr(Hld_Txt(j), ".") = 0 And InStr(Hld_Txt(j), ChrW(165)) = 0 Then
FPHM = Regs(Hld_Txt(j)) '取得8位或者20位的发票号码
Exit For
End If
End If
End If
End If
Next j
k = 0
For j = 0 To UBound(Hld_Txt)
If Len(Trim(Hld_Txt(j))) > 2 Then '//***当字符数大于2时,有的只有一个*号,这种情形需要排除
If Left(Trim(Hld_Txt(j)), 1) = "*" Or InStr(Hld_Txt(j), "详见") > 0 Then '////当货物名称前面第一个字符是*号或者含有(详见)时
Arr = Array("+", "<", ">") '/***密码区有许多有这几个符号,遇到了就避开它
sss = 0
For m = LBound(Arr) To UBound(Arr) '//***避免遇到密码区以*号开头,并且有Arr数组里符号的情形
If InStr(Hld_Txt(j), Arr(m)) > 0 Then sss = sss + 1
Next m
If sss = 0 Then
Hld_Txt(j) = Trim(Hld_Txt(j)) '清除前后空格
Hld_Txt(j) = StrConv(Hld_Txt(j), vbNarrow) '全角转为半角
Hld_Txt(j) = Repce(Hld_Txt(j)) '将字符串中多个空格变成一个
If InStr(Hld_Txt(j), "%") > 0 Or Right(Trim(Hld_Txt(j)), 1) = "*" Then
For m = UBound(Split(Hld_Txt(j), " ")) To 0 Step -1
If TestCHNum(Split(Hld_Txt(j), " ")(m)) = False Or Trim(Split(Hld_Txt(j), " ")(m)) = "*" Then '循环判定,取出有数字的数量-税额部分//有部分的金额和税额是*号
If TestCH(Split(Hld_Txt(j), " ")(m)) = True And InStr(Hld_Txt(j), "不征税") = 0 Then Exit For
SL_SV = Split(Hld_Txt(j), " ")(m) & " " & SL_SV
SL_SV_Temp = Split(Hld_Txt(j), " ")(m) & " " & SL_SV_Temp '增加这个变量,存下原始的数量金额部分
If InStr(SL_SV, "不征税") > 0 And Len(SL_SV) > 3 Then SL_SV = Left(SL_SV, InStr(SL_SV, "税")) & " " & Right(SL_SV, Len(SL_SV) - InStr(SL_SV, "税"))
SL_SV = Trim(SL_SV)
SL_SV_Temp = Trim(SL_SV_Temp)
If m < UBound(Split(Hld_Txt(j), " ")) And Split(Hld_Txt(j), " ")(m) < 0 Then Exit For
ElseIf TestCHNum(Split(Hld_Txt(j), " ")(m)) = True Then '循环判定,如含有中文+数字,则需拆分
SL_SV = RegSL(Split(Hld_Txt(j), " ")(m)) & " " & SL_SV
SL_SV_Temp = RegSL(Split(Hld_Txt(j), " ")(m)) & " " & SL_SV_Temp
SL_SV = Trim(SL_SV)
SL_SV_Temp = Trim(SL_SV_Temp)
Exit For
End If
Next m
SL_SV = Repce(SL_SV): SL_SV_Temp = Repce(SL_SV_Temp) '用原始的数量金额部分来方便取出GGXH
GGXH = Trim(Replace(Hld_Txt(j), SL_SV_Temp, "")) '去掉数量-税额部分,下余的是规格型号 ////***前面做过变动后,这里用replace取不出余下的规格型号
SL_SV_Temp = ""
SL_SV = SL_JE(SL_SV) '数量-税额部分,不能用trim去掉前面空格
If InStr(GGXH, "费") > 0 Then
HWMC = Left(GGXH, InStr(GGXH, "费")) '货物名称,有费字的取费字前面字符(含费字)作为货物名称
GGXH = Trim(Replace(GGXH, HWMC, "")) '费字后面的是规格型号+单位
Else
If InStr(GGXH, " ") = 0 Then
HWMC = GGXH: GGXH = "" '规格型号没有包含空格时,货物名称就取ggxh,将原来的ggxh置空
Else
HWMC = Split(GGXH, " ")(0) '规格型号有包含空格时,货物名称取ggxh的第一个空格前的字符
GGXH = Trim(Replace(GGXH, HWMC, "")) '规格型号取除了货物名称后的余下的值
End If
End If
If InStr(GGXH, " ") = 0 Then '当规格型号没有空格时********
Select Case Len(GGXH)
Case Is = 0 '当费后面的字符数量为0时
If Split(SL_SV, " ")(0) = "" Then '当数据部分第一个字符为空时,货物名称就只为货物名称
HWMC = HWMC & " " & " "
Else '当数据部分第一个字符不为空时,货物名称取最后一个值为单位,次一个值为规格型号
If Mid(HWMC, Len(HWMC) - 1, 2) = "服务" Or InStr(HWMC, "费") > 0 Then
HWMC = HWMC & " " & " " '当货物名称最后两个字是"服务"时或含有"费"时,已经不能拆开了.
ElseIf InStr(HWMC, "费") = 0 Then
DW = Right(HWMC, 1) '取右边一位做单位*****
XH = Mid(HWMC, Len(HWMC) - 1, 1)
HWMC = Left(HWMC, Len(HWMC) - 2)
If InStr(HWMC, XH & DW) > 0 Or InStr(HWMC, XH) > 0 Or InStr(HWMC, DW) > 0 Then
HWMC = HWMC & XH & DW & " " & " "
Else
HWMC = HWMC & " " & XH & " " & DW
End If
End If
End If
Case Is >= 1 '当费后面的字符数量为1或者大于1时
DW = Right(GGXH, 1) '取右边一位做单位
XH = Replace(GGXH, DW, "") '余下的是型号
If Split(SL_SV, " ")(0) = "" Then
HWMC = HWMC & " " & " "
Else
If XH <> "" Then
HWMC = HWMC & " " & XH & " " & DW
Else
HWMC = HWMC & " " & " " & DW
End If
End If
End Select
ElseIf InStr(GGXH, " ") > 0 Then '当规格型号有空格时
If Split(SL_SV, " ")(0) <> "" Then
HWDW = Split(GGXH, " ")(UBound(Split(GGXH, " "))) '单位
If Len(HWDW) > 1 Then
HWDW = Right(HWDW, 1)
GGXH = Replace(GGXH, HWDW, "")
GGXH = Replace(GGXH, " ", "_")
HWMC = HWMC & " " & GGXH & " " & HWDW
Else
XH = Trim(Replace(GGXH, HWDW, "")) '规格型号
If XH = "" Then
If Len(HWDW) > 1 Then
DW = Right(HWDW, 1)
XH = Replace(HWDW, DW, "")
HWMC = HWMC & " " & XH & " " & DW
ElseIf Len(HWDW) = 1 Then
HWMC = HWMC & " " & " " & DW
End If
Else
DW = HWDW
XH = Trim(Replace(XH, " ", "_")) '去掉规格型号中的空格,用下横线连接
HWMC = HWMC & " " & XH & " " & DW
End If
End If
ElseIf Split(SL_SV, " ")(0) = "" Then
XH = Replace(GGXH, " ", "_") '去掉规格型号中的空格,用下横线连接
HWMC = HWMC & " " & XH & " " '没有单位,要加上表示单位的空格
End If
End If
ElseIf UBound(Split(Hld_Txt(j), " ")) <= 2 And InStr(Hld_Txt(j), "%") = 0 Then '当品名与数量金额等不在同一行时
HWMC = Hld_Txt(j)
For m = j To UBound(Hld_Txt)
If InStr(Hld_Txt(m), "%") > 0 Then SL_SV_Temp = Hld_Txt(m): Exit For
Next m
For m = UBound(Split(SL_SV_Temp, " ")) To 0 Step -1
If TestCHNum(Split(SL_SV_Temp, " ")(m)) = False Or Trim(Split(SL_SV_Temp, " ")(m)) = "*" Then '循环判定,取出有数字的数量-税额部分//有部分的金额和税额是*号
If TestCH(Split(SL_SV_Temp, " ")(m)) = True And InStr(SL_SV_Temp, "不征税") = 0 Then Exit For
SL_SV = Split(SL_SV_Temp, " ")(m) & " " & SL_SV '增加这个变量,存下原始的数量金额部分
If InStr(SL_SV, "不征税") > 0 And Len(SL_SV) > 3 Then SL_SV = Left(SL_SV, InStr(SL_SV, "税")) & " " & Right(SL_SV, Len(SL_SV) - InStr(SL_SV, "税"))
SL_SV = Trim(SL_SV)
If m < UBound(Split(SL_SV_Temp, " ")) And Split(SL_SV_Temp, " ")(m) < 0 Then Exit For
ElseIf TestCHNum(Split(SL_SV_Temp, " ")(m)) = True Then '循环判定,如含有中文+数字,则需拆分
SL_SV = RegSL(Split(SL_SV_Temp, " ")(m)) & " " & SL_SV
SL_SV = Trim(SL_SV)
Exit For
End If
Next m
SL_SV_Temp = Replace(SL_SV_Temp, SL_SV, "")
SL_SV = Repce(SL_SV) '用原始的数量金额部分来方便留下GGXH
GGXH = Trim(Replace(SL_SV_Temp, SL_SV, "")) '去掉数量-税额部分,下余的是规格型号 ////***前面做过变动后,这里用replace取不出余下的规格型号
If Len(GGXH) = 0 Then '当规格型号为空时
DW = Split(HWMC, " ")(UBound(Split(HWMC, " ")))
HWMC = Trim(Replace(HWMC, DW, ""))
XH = Trim(Replace(HWMC, Split(HWMC, " ")(0), " "))
HWMC = Trim(Replace(HWMC, XH, ""))
If InStr(HWMC, " ") > 0 Then HWMC = Replace(HWMC, " ", "_") '将货物名称里原有的空格用下划线代替
If InStr(XH, " ") > 0 Then XH = Replace(XH, " ", "_") '将型号里原有的空格用下划线代替
If Len(XH) > 0 Then
HWMC = HWMC & " " & XH & " " & DW
Else
HWMC = HWMC & " " & " " & DW
End If
ElseIf Len(GGXH) > 0 Then '当规格型号不为空时
If InStr(HWMC, " ") > 0 Then HWMC = Replace(HWMC, " ", "_") '将货物名称里原有的空格用下划线代替
If InStr(GGXH, " ") > 0 Then
DW = Split(GGXH, " ")(UBound(Split(GGXH, " "))) '单位
XH = Trim(Replace(SL_SV_Temp, DW, "")) '型号
If InStr(XH, " ") > 0 Then XH = Replace(XH, " ", "_") '将型号里原有的空格用下划线代替
If Len(XH) > 0 Then
HWMC = HWMC & " " & XH & " " & DW
Else
HWMC = HWMC & " " & " " & DW
End If
Else
DW = Right(GGXH, 1) '取右边一位做单位*****
XH = Replace(GGXH, DW, "")
HWMC = HWMC & " " & XH & " " & DW
End If
End If
End If
If Split(SL_SV, " ")(0) = "" Then '///*****************
Hld_Txt(j) = HWMC & SL_SV
Else
Hld_Txt(j) = HWMC & " " & SL_SV
End If
HWMC = "": SL_SV = "": SL = "": DW = "": XH = "": GGXH = "": HWDW = "": SL_SV_Temp = ""
If UBound(Split(Hld_Txt(j), " ")) = 7 Then
k = k + 1
ReDim Preserve Brr(1 To 10, 1 To k)
Brr(1, k) = "'" & FPHM: Brr(2, k) = FPRQ '编号及日期
For m = 0 To UBound(Split(Hld_Txt(j), " "))
Brr(3 + m, k) = Split(Hld_Txt(j), " ")(m)
Next m
Else
GoTo 0
End If
End If
End If
End If
Next j
With sht
If k = 0 Then GoTo 0
RowNo = .Cells(65536, 1).End(3).Row + 1
.Cells(RowNo, 1).Resize(UBound(Brr, 2), UBound(Brr)) = Application.Transpose(Brr)
' .Cells(RowNo, 11) = PDF_File '将文件名称放在最后一列
Erase Brr
End With
ElseIf T_Str = "" Then
0
MsgBox PDF_File & "文件没有取到数据,请检查!", vbOKOnly, "ExcelHome"
Sheet1.Cells(Sheet1.Cells(65536, 1).End(3).Row + 1, 1) = PDF_File '将有问题的文件名称放在sheet1表中,方便查验
Exit For
End If
'===========================================================
Next i
.Close
End With
h_end:
Set AC_PGTxt = Nothing
Set AC_PG = Nothing
Set AC_Hi = Nothing
Set AC_PD = Nothing
End Sub
Function Regs(STR) '取发票号码
Dim reg As Object, mh As Variant
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
.Pattern = "(^\d{8}$|^\d{20}$)" '是8位或者是20位
Set mh = .Execute(STR)
Regs = mh.Item(0).SubMatches.Item(0)
End With
End Function
Function RegR(STR) '取发票日期
Dim reg As Object, mh As Variant
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
.Pattern = "(^\d{4} \d{2} \d{2}$)" '前四位年,中两位月,后两位日
Set mh = .Execute(STR)
RegR = Replace(mh.Item(0).SubMatches.Item(0), " ", "-")
End With
End Function
Function RegSL(STR) '取数量
Dim reg As Object, mh As Variant
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
'.Pattern = "[\u4e00-\u9fff](\d+\.\d+|\d+)" '中文后面跟的数量为小数或整数
.Pattern = "[\u4e00-\u9fff]([-]?\d+\.\d+$|[-]?\d+$)" '中文后面跟的数量为小数或整数
Set mh = .Execute(STR)
RegSL = mh.Item(0).SubMatches.Item(0)
End With
End Function
Function TestNumber(STR) '测试是否最后是数字
Dim reg As Object
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
.Pattern = "^\d+\.\d+$|\d+$"
TestNumber = .test(STR)
End With
End Function
Function TestCH(STR) '测试是否以中文开始
Dim reg As Object
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
.Pattern = "[\u4e00-\u9fff]+"
TestCH = .test(STR)
End With
End Function
Function TestCHNum(STR) '测试是否以中文后跟随数字
Dim reg As Object
Set reg = CreateObject("VBScript.RegExp")
With reg
.Global = True
.Pattern = "[\u4e00-\u9fff]([-]?\d+\.\d+$|[-]?\d+$)"
TestCHNum = .test(STR)
End With
End Function
Public Function Repce(STR) '多个空格变成一个
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "\s+"
Repce = .Replace(STR, " ")
End With
End Function
Public Function Repce2(STR) '去掉中间空格
With CreateObject("VBSCRIPT.REGEXP")
.Global = True
.Pattern = "\s+"
Repce2 = .Replace(STR, "")
End With
End Function
Public Function SL_JE(STR) '处理数量金额这部分
Dim i%, str_temp
Select Case UBound(Split(STR, " "))
Case Is >= 5
For i = UBound(Split(STR, " ")) To UBound(Split(STR, " ")) - 4 Step -1
str_temp = Split(STR, " ")(i) & " " & str_temp
Next i
SL_JE = Trim(str_temp)
Case 4
SL_JE = STR
Case 2
SL_JE = " " & STR
End Select
End Function