Github Copilot 实战: 从零开始用AI写一个OCR工具 (1)

最近AI很火,咱也尝试一下由浅入深探索一下 Github Copilot 的能力和底限.

使用的环境是 Windows11 + Microsoft Visual Studio Enterprise 2022 (64 位) - Current 版本 17.13.7 + VS内置的 Github Copilot Pro

首先创建wpf工程

复制代码
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>WinExe</OutputType>
    <TargetFramework>net9.0-windows</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <UseWPF>true</UseWPF>
  </PropertyGroup>
    <ItemGroup> 
        <PackageReference Include="OpenCvSharp4.runtime.win" Version="4.11.0.20250507" />
        <PackageReference Include="Sdcb.OpenVINO.PaddleOCR" Version="0.6.8" />
        <PackageReference Include="Sdcb.OpenVINO.PaddleOCR.Models.Online" Version="0.6.2" />
        <PackageReference Include="Sdcb.OpenVINO.runtime.win-x64" Version="2025.0.0" />
    </ItemGroup>

</Project>

简单放置一个本地ocr服务

复制代码
using OpenCvSharp;
using Sdcb.OpenVINO.PaddleOCR;
using Sdcb.OpenVINO.PaddleOCR.Models;
using Sdcb.OpenVINO.PaddleOCR.Models.Online;
using System.Diagnostics;
using System.Net.Http;

namespace JovenApi;

public class PaddleOCRService
{

    public static bool IsUrl(string filename)
    {
        return Uri.TryCreate(filename, UriKind.Absolute, out var uriResult)
               && (uriResult.Scheme == Uri.UriSchemeHttp || uriResult.Scheme == Uri.UriSchemeHttps);
    }

    public async Task<(List<string> strings, PaddleOcrResult result)> StartOCR(string filename)
    {
        Mat src;

        if (string.IsNullOrEmpty(filename))
        {
            throw new ArgumentNullException(nameof(filename));
        }

        if (IsUrl(filename))
        {
            src = Cv2.ImDecode(await new HttpClient().GetByteArrayAsync(filename), ImreadModes.Color);
        }
        else
        {
            src = Cv2.ImRead(filename);
        }

        return await StartOCR(src);
    }

    public async Task<(List<string> strings, PaddleOcrResult result)> StartOCR(byte[] imageData)
    {
        Mat src;

        ArgumentNullException.ThrowIfNull(imageData);

        src = Cv2.ImDecode(imageData, ImreadModes.Color);
        return await StartOCR(src);
    }

    public async Task<(List<string> strings, PaddleOcrResult result)> StartOCR(Mat src)
    {
        PaddleOcrResult result;
        var resultText = new List<string>();
        FullOcrModel model = await OnlineFullModels.ChineseV3.DownloadAsync();

        using (PaddleOcrAll all = new(model)
        {
            AllowRotateDetection = true,
            Enable180Classification = true,
        })
        {
            Stopwatch sw = Stopwatch.StartNew();
            result = all.Run(src);
            Console.WriteLine($"elapsed={sw.ElapsedMilliseconds} ms");
            Console.WriteLine("Detected all texts: \n" + result.Text);
            foreach (PaddleOcrResultRegion region in result.Regions)
            {
                Console.WriteLine($"Text: {region.Text}, Score: {region.Score}, RectCenter: {region.Rect.Center}, RectSize:    {region.Rect.Size}, Angle: {region.Rect.Angle}");
                resultText.Add(region.Text);
            }
        }
        src.Dispose();
        return (resultText, result);
    }
}

A. 提示词 "实现拖放,粘贴图片或者点击打开选择文件获取一张图片, 并且调用ocr服务,将结果绘图到原图上, 右边显示相应的文本"

下面是一个完整的 WPF 方案,支持拖放图片、粘贴图片、点击选择图片,并调用 OCR 服务,将识别结果绘制到原图上,右侧显示文本。代码已适配你的 PaddleOCRService,并假设其 StartOCR(byte[] image) 返回识别结果(如文本框列表)。

  1. MainWindow.xaml(UI布局)

    <Window x:Class="MiOcr.MainWindow" xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation" xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml" Title="MiOcr" Height="600" Width="900" AllowDrop="True" KeyDown="Window_KeyDown" Drop="Window_Drop" >
    <Grid>
    <Grid.ColumnDefinitions>
    <ColumnDefinition Width="2*"/>
    <ColumnDefinition Width="*"/>
    </Grid.ColumnDefinitions>
    <Border Grid.Column="0" Margin="10" BorderBrush="Gray" BorderThickness="1" Background="#FFF" MouseLeftButtonUp="ImageBorder_MouseLeftButtonUp" AllowDrop="True" Drop="ImageBorder_Drop" >
    <Image x:Name="MainImage" Stretch="Uniform" />
    </Border>
    <ScrollViewer Grid.Column="1" Margin="10">
    <TextBox x:Name="OcrTextBox" AcceptsReturn="True" VerticalScrollBarVisibility="Auto" TextWrapping="Wrap" FontSize="16"/>
    </ScrollViewer>
    <Button Content="选择图片" Width="100" Height="30" HorizontalAlignment="Left" VerticalAlignment="Top" Margin="20" Click="SelectImageButton_Click"/>
    </Grid>
    </Window>

  2. MainWindow.xaml.cs(核心逻辑)

    using JovenApi;
    using Microsoft.Win32;
    using System;
    using System.IO;
    using System.Linq;
    using System.Threading.Tasks;
    using System.Windows;
    using System.Windows.Input;
    using System.Windows.Media.Imaging;
    using System.Windows.Media;
    using System.Windows.Controls;
    using System.Collections.Generic;

    namespace MiOcr;
    public partial class MainWindow : Window
    {
    private BitmapImage? _currentImage;
    private byte[]? _currentImageBytes;

    复制代码
     public MainWindow()
     {
         InitializeComponent();
         this.PreviewDragOver += (s, e) => e.Handled = true; // 允许拖放
         this.PreviewKeyDown += Window_KeyDown;
     }
    
     // 拖放图片
     private async void Window_Drop(object sender, DragEventArgs e)
     {
         if (e.Data.GetDataPresent(DataFormats.FileDrop))
         {
             var files = (string[])e.Data.GetData(DataFormats.FileDrop);
             if (files.Length > 0 && IsImageFile(files[0]))
             {
                 await LoadAndOcrImage(files[0]);
             }
         }
     }
    
     // Border区域拖放
     private async void ImageBorder_Drop(object sender, DragEventArgs e)
     {
         await Window_Drop(sender, e);
     }
    
     // 粘贴图片
     private async void Window_KeyDown(object sender, KeyEventArgs e)
     {
         if (e.Key == Key.V && (Keyboard.Modifiers & ModifierKeys.Control) == ModifierKeys.Control)
         {
             if (Clipboard.ContainsImage())
             {
                 var img = Clipboard.GetImage();
                 if (img != null)
                 {
                     var bmp = BitmapFromClipboard(img);
                     using var ms = new MemoryStream();
                     var encoder = new PngBitmapEncoder();
                     encoder.Frames.Add(BitmapFrame.Create(bmp));
                     encoder.Save(ms);
                     await LoadAndOcrImage(ms.ToArray());
                 }
             }
         }
     }
    
     // 点击选择图片
     private async void SelectImageButton_Click(object sender, RoutedEventArgs e)
     {
         var dlg = new OpenFileDialog
         {
             Filter = "图片文件|*.jpg;*.jpeg;*.png;*.bmp;*.gif"
         };
         if (dlg.ShowDialog() == true)
         {
             await LoadAndOcrImage(dlg.FileName);
         }
     }
    
     // 点击图片区域也可选择图片
     private async void ImageBorder_MouseLeftButtonUp(object sender, MouseButtonEventArgs e)
     {
         await SelectImageButton_Click(sender, e);
     }
    
     // 加载图片并调用OCR
     private async Task LoadAndOcrImage(string filePath)
     {
         _currentImageBytes = await File.ReadAllBytesAsync(filePath);
         _currentImage = new BitmapImage(new Uri(filePath));
         MainImage.Source = _currentImage;
         await RunOcrAndDraw(_currentImageBytes);
     }
    
     // 加载图片并调用OCR(字节流)
     private async Task LoadAndOcrImage(byte[] imageBytes)
     {
         _currentImageBytes = imageBytes;
         using var ms = new MemoryStream(imageBytes);
         var bmp = new BitmapImage();
         bmp.BeginInit();
         bmp.CacheOption = BitmapCacheOption.OnLoad;
         bmp.StreamSource = ms;
         bmp.EndInit();
         bmp.Freeze();
         _currentImage = bmp;
         MainImage.Source = _currentImage;
         await RunOcrAndDraw(imageBytes);
     }
    
     // OCR并绘制
     private async Task RunOcrAndDraw(byte[] imageBytes)
     {
         OcrTextBox.Text = "识别中...";
         var ocr = new PaddleOCRService();
         var results = await ocr.StartOCR(imageBytes);
         if (results == null || results.Count == 0)
         {
             OcrTextBox.Text = "未识别到文本";
             return;
         }
         // 假设results为List<OcrResult>,每个OcrResult有Text和Box(左上、右下坐标)
         // 你需要根据PaddleOCRService的返回类型适配
         OcrTextBox.Text = string.Join(Environment.NewLine, results.Select(r => r.Text));
    
         // 绘制识别框和文字
         var drawed = DrawOcrResultsOnImage(_currentImage, results);
         MainImage.Source = drawed;
     }
    
     // 判断文件是否为图片
     private bool IsImageFile(string file)
     {
         var ext = System.IO.Path.GetExtension(file).ToLower();
         return ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".bmp" || ext == ".gif";
     }
    
     // 从ClipboardBitmapSource转BitmapSource
     private BitmapSource BitmapFromClipboard(BitmapSource src)
     {
         var encoder = new PngBitmapEncoder();
         encoder.Frames.Add(BitmapFrame.Create(src));
         using var ms = new MemoryStream();
         encoder.Save(ms);
         ms.Position = 0;
         var bmp = new BitmapImage();
         bmp.BeginInit();
         bmp.CacheOption = BitmapCacheOption.OnLoad;
         bmp.StreamSource = ms;
         bmp.EndInit();
         bmp.Freeze();
         return bmp;
     }
    
     // 绘制OCR结果到图片
     private BitmapSource DrawOcrResultsOnImage(BitmapImage? image, List<OcrResult> results)
     {
         if (image == null) return null!;
         int width = image.PixelWidth;
         int height = image.PixelHeight;
    
         var visual = new DrawingVisual();
         using (var dc = visual.RenderOpen())
         {
             dc.DrawImage(image, new Rect(0, 0, width, height));
             var pen = new Pen(Brushes.Red, 2);
             var typeface = new Typeface("Arial");
             foreach (var r in results)
             {
                 // 假设 r.Box 是 int[4] 或 Point[],请根据实际类型适配
                 var box = r.Box; // 例如: [x1, y1, x2, y2]
                 dc.DrawRectangle(null, pen, new Rect(box[0], box[1], box[2] - box[0], box[3] - box[1]));
                 var formattedText = new FormattedText(
                     r.Text,
                     System.Globalization.CultureInfo.CurrentCulture,
                     FlowDirection.LeftToRight,
                     typeface,
                     20,
                     Brushes.Yellow,
                     1.25);
                 dc.DrawText(formattedText, new Point(box[0], box[1] - 24));
             }
         }
         var bmp = new RenderTargetBitmap(width, height, 96, 96, PixelFormats.Pbgra32);
         bmp.Render(visual);
         bmp.Freeze();
         return bmp;
     }

    }

    // 假设OCR结果类型如下,实际请根据PaddleOCRService返回类型调整
    public class OcrResult
    {
    public string Text { get; set; }
    public int[] Box { get; set; } // [x1, y1, x2, y2]
    }

说明

• 拖放:支持将图片文件拖到窗口或图片区域。

• 粘贴:Ctrl+V 粘贴剪贴板图片。

• 点击选择:点击图片区域或按钮弹出文件选择对话框。

• OCR调用:调用 PaddleOCRService.StartOCR(byte[]),并假设返回 List ,每个包含 Text 和 Box。
• 绘图:在图片上绘制识别框和文字,右侧显示所有识别文本。
• UI:左侧图片,右侧文本,顶部有"选择图片"按钮。
如 PaddleOCRService 返回类型不同,请补充其定义,我可帮你适配。
如需支持多种图片格式或更复杂的坐标,请补充说明。

到这里,一个基本框架居然就完成了. 运行起来也差不多达到预期. 真是惊为天人.

修正一下,提示词 "返回的 results.result 类型为 PaddleOcrResult"

非常好,PaddleOCRService.StartOCR 返回的 results.result 类型为 PaddleOcrResult。

PaddleOcrResult 结构如下(基于 Sdcb.OpenVINO.PaddleOCR 官方文档和源码):

复制代码
public class PaddleOcrResult
{
    public List<PaddleOcrResultRegion> Regions { get; set; }
}

public class PaddleOcrResultRegion
{
    public string Text { get; set; }
    public float Score { get; set; }
    public RotatedRect Rect { get; set; }
    // 还有 PolyPoints、Label 等属性
}

• Rect 是 RotatedRect,包含中心点、尺寸、角度。

• PolyPoints 是多边形点(通常为4个点,顺时针/逆时针),可用于绘制识别框。

你的代码应这样处理:

  1. 显示文本

    OcrTextBox.Text = string.Join(Environment.NewLine, results.result.Regions.Select(r => r.Text));

  2. 绘制识别框和文字

    建议用 PolyPoints 绘制多边形框,Rect.Center 作为文字起点。

    private BitmapSource DrawOcrResultsOnImage(BitmapImage? image, PaddleOcrResult result)
    {
    if (image == null) return null!;
    int width = image.PixelWidth;
    int height = image.PixelHeight;

    复制代码
     var visual = new DrawingVisual();
     using (var dc = visual.RenderOpen())
     {
         dc.DrawImage(image, new Rect(0, 0, width, height));
         var pen = new Pen(Brushes.Red, 2);
         var typeface = new Typeface("Arial");
         foreach (var region in result.Regions)
         {
             // 绘制多边形框
             if (region.PolyPoints != null && region.PolyPoints.Length >= 4)
             {
                 var points = region.PolyPoints.Select(p => new System.Windows.Point(p.X, p.Y)).ToArray();
                 var polygon = new System.Windows.Media.PolyLineSegment(points, true);
                 var figure = new PathFigure(points[0], new[] { polygon }, true);
                 var geometry = new PathGeometry(new[] { figure });
                 dc.DrawGeometry(null, pen, geometry);
             }
             // 绘制文字
             var formattedText = new FormattedText(
                 region.Text,
                 System.Globalization.CultureInfo.CurrentCulture,
                 FlowDirection.LeftToRight,
                 typeface,
                 20,
                 Brushes.Yellow,
                 1.25);
             dc.DrawText(formattedText, new System.Windows.Point(region.Rect.Center.X, region.Rect.Center.Y));
         }
     }
     var bmp = new RenderTargetBitmap(width, height, 96, 96, PixelFormats.Pbgra32);
     bmp.Render(visual);
     bmp.Freeze();
     return bmp;

    }

  3. 调用方式

    var drawed = DrawOcrResultsOnImage(_currentImage, results.result);
    MainImage.Source = drawed;

总结

• results.result 是 PaddleOcrResult,其 Regions 为每个识别区域。

• 推荐用 PolyPoints 绘制多边形框,region.Text 作为识别文本。

• 右侧文本框用 string.Join(Environment.NewLine, results.result.Regions.Select(r => r.Text))。

继续修正,提示词 "绘制多边形框方法有点错误, 根据 Console.WriteLine($"Text: {region.Text}, Score: {region.Score}, RectCenter: {region.Rect.Center}, RectSize: {region.Rect.Size}, Angle: {region.Rect.Angle}"); 修正一下"

你当前的绘制方法有误,因为 region.Rect 是 RotatedRect,它不是点集合,而是一个带有中心点、尺寸和角度的旋转矩形。

正确做法是:使用 region.Rect.GetVertices() 获取多边形的四个顶点,然后用这些点绘制多边形框。

请将 DrawOcrResultsOnImage 方法中的多边形绘制部分修正如下:

复制代码
private BitmapSource DrawOcrResultsOnImage(BitmapImage? image, PaddleOcrResult result)
{
    if (image == null) return null!;
    int width = image.PixelWidth;
    int height = image.PixelHeight;

    var visual = new DrawingVisual();
    using (var dc = visual.RenderOpen())
    {
        dc.DrawImage(image, new Rect(0, 0, width, height));
        var pen = new Pen(Brushes.Red, 2);
        var typeface = new Typeface("Arial");
        foreach (var region in result.Regions)
        {
            // 正确获取四个顶点
            var vertices = region.Rect.GetVertices(); // 返回 PointF[],长度为4
            var points = vertices.Select(p => new System.Windows.Point(p.X, p.Y)).ToArray();

            // 绘制多边形
            var figure = new PathFigure(points[0], new[] { new PolyLineSegment(points.Skip(1), true) }, true);
            var geometry = new PathGeometry(new[] { figure });
            dc.DrawGeometry(null, pen, geometry);

            // 绘制文字(以中心点为准)
            var formattedText = new FormattedText(
                region.Text,
                System.Globalization.CultureInfo.CurrentCulture,
                FlowDirection.LeftToRight,
                typeface,
                20,
                Brushes.Yellow,
                1.25);
            dc.DrawText(formattedText, new System.Windows.Point(region.Rect.Center.X, region.Rect.Center.Y));
        }
    }
    var bmp = new RenderTargetBitmap(width, height, 96, 96, PixelFormats.Pbgra32);
    bmp.Render(visual);
    bmp.Freeze();
    return bmp;
}

关键修正说明:

• 用 region.Rect.GetVertices() 获取四个顶点,而不是直接遍历 Rect。

• 用 PolyLineSegment 和 PathFigure 绘制闭合多边形。

• 文字仍然以 region.Rect.Center 为基准绘制。

这个时候还是有错误, AI需要继续引导一下 , 提示词 "region.Rect 的定义是 Point2f center, Size2f size, float angle"

感谢补充,region.Rect 是 RotatedRect,其定义为:

复制代码
public struct RotatedRect
{
    public Point2f Center; // 中心点
    public Size2f Size;    // 宽高
    public float Angle;    // 旋转角度(度)
}

正确绘制多边形框的方法是:

• 使用 region.Rect.GetVertices() 获取四个顶点(Point2f[],顺序为左上、右上、右下、左下)。

• 用这些点绘制闭合多边形。

你的代码如下即可:

复制代码
private BitmapSource DrawOcrResultsOnImage(BitmapImage? image, PaddleOcrResult result)
{
    if (image == null) return null!;
    int width = image.PixelWidth;
    int height = image.PixelHeight;

    var visual = new DrawingVisual();
    using (var dc = visual.RenderOpen())
    {
        dc.DrawImage(image, new Rect(0, 0, width, height));
        var pen = new Pen(Brushes.Red, 2);
        var typeface = new Typeface("Arial");
        foreach (var region in result.Regions)
        {
            // 获取四个顶点
            var vertices = region.Rect.GetVertices(); // Point2f[],长度4
            var points = vertices.Select(p => new System.Windows.Point(p.X, p.Y)).ToArray();

            // 绘制多边形
            var figure = new PathFigure(points[0], new[] { new PolyLineSegment(points.Skip(1), true) }, true);
            var geometry = new PathGeometry(new[] { figure });
            dc.DrawGeometry(null, pen, geometry);

            // 绘制文字(以中心点为准)
            var formattedText = new FormattedText(
                region.Text,
                System.Globalization.CultureInfo.CurrentCulture,
                FlowDirection.LeftToRight,
                typeface,
                20,
                Brushes.Yellow,
                1.25);
            dc.DrawText(formattedText, new System.Windows.Point(region.Rect.Center.X, region.Rect.Center.Y));
        }
    }
    var bmp = new RenderTargetBitmap(width, height, 96, 96, PixelFormats.Pbgra32);
    bmp.Render(visual);
    bmp.Freeze();
    return bmp;
}

要点说明:

• region.Rect.GetVertices() 是最安全、最准确的方式获取旋转矩形的四个顶点。

• 直接用 Center、Size、Angle 计算顶点较为繁琐,库已封装好。

• 文字建议以 region.Rect.Center 为锚点绘制。

这样即可正确绘制 PaddleOCR 识别区域的旋转矩形框和文本。

本节成果