背景:
数仓中有经纬度数据,需要匹配城市编码;但相关的地理数据存放在 GP(Greenplum,兼容 PostgreSQL)数据库中,
并且以 public.geometry 类型字段存储,无法直接导入 DataWorks 使用。
现做数据清洗,将经纬度数据与地理数据进行匹配,
使用 H3 和空间网格两种方式实现。
1. 将 pg 数据库中的 geometry 转成 Base64 编码的 WKB(二进制)字符串
-- Export every geography row with its geometry serialised as Base64-encoded WKB.
-- PostgreSQL's encode(..., 'base64') breaks its output into 76-character lines,
-- so the line feeds (chr(10)) are stripped to keep each exported value on one
-- line; embedded newlines can break line-oriented export/import tooling.
SELECT
    code,
    name,
    full_name,
    replace(encode(ST_AsBinary(geom), 'base64'), chr(10), '') AS geom_base64
FROM
    public.base_geography;
2.数据入仓
-- Overwrite dwd.test with a single sample row: a code, a short name, a full name
-- and a Base64 text literal holding the geometry.
-- The Base64 literal spans many lines; the line breaks are part of the string value.
-- NOTE(review): no column list is given, so the four values are assumed to match
-- the column order of dwd.test — confirm against the table DDL.
insert overwrite table dwd.test
select '230307001','麻山街道','黑龙江省鸡西市麻山区麻山街道',
"AQYAAAABAAAAAQMAAAABAAAAbgAAAN1c/G1PUGBAOiF00CWcRkAGTODWXVBgQHbfMTz2m0ZAD52e
d2NQYECKyRtg5ptGQBnJHqFmUGBACrsoeuCbRkAYldQJaFBgQDMxXYjVm0ZAGVbxRmZQYEAomgew
yJtGQCQp6WFoUGBAEolCy7qbRkAXoG01a1BgQIF38umxm0ZAOQ68Wm5QYEAr3zMSoZtGQDblCu9y
UGBAM0+uKZCbRkBWRE30eVBgQPeuQV96m0ZAU82spYBQYED2DOGYZZtGQGfXvRWJUGBAj3HFxVGb
RkB15bM8j1BgQKjknNhDm0ZAdPBMaJJQYEBmu0IfLJtGQIjVH2GYUGBA8wNXeQKbRkCxxAPKplBg
QPVIg9vamkZA4JwRpb1QYEAIkQw5tppGQADICRPGUGBAAOFDiZaaRkDzy2CMyFBgQIyGjEepmkZA
8rG7QMlQYEBAaD18mZpGQN7M6EfDUGBAY9AJoYOaRkDBi76CtFBgQLEWnwJgmkZAyk+qfbpQYECk
+s4vSppGQN5Zu+3CUGBA0Jfe/lyaRkDoa5bLxlBgQCcwndZtmkZA8gpET8pQYEBoPBHEeZpGQPsC
euHOUGBAdNNmnIaaRkD52ch101BgQCBig4WTmkZAGcbdINpQYECr6uV3mppGQGpq2VrfUGBAqI/A
H36aRkD3zf3V41BgQG06ArhZmkZAbeS6KeVQYECMahFRTJpGQExUbw3sUGBASN+kaVCaRkBaLhud
81BgQNkkP+JXmkZA2XvxRftQYECkNJvHYZpGQARVo1cDUWBAOGdEaW+aRkCVgJiEC1FgQNJWJZF9
mkZAN+M0RBVRYEBwz/OnjZpGQMo329wYUWBADcNHxJSaRkBPdjOjH1FgQLYwC+2cmkZA521sdiRR
YEAU0a+tn5pGQG6hKxEoUWBADhR4J5+aRkAQroBCPVFgQIy61t6nmkZARKSmXUxRYEDxnC0gtJpG
QML8FTJXUWBAGt8Xl6qaRkB+Ab1wZ1FgQAfvq3KhmkZAufqxSX5RYEA17s1vmJpGQNv4E5WNUWBA
WcSww5iaRkBhcTjzq1FgQPoMqDejmkZAkX2QZcFRYEAlWYejq5pGQEFHq1rSUWBAbw9CQL6aRkAE
5Euo4FFgQP8gkiHHmkZA/pyC/OxRYEA7NgLxuppGQBU7Gof6UWBARdrGn6iaRkBUxVT6CVJgQP/m
xYmvmkZA6UXtfhVSYEAiOC7jpppGQADkhAkjUmBAIoleRrGaRkBgqwSLQ1JgQIkI/yJomkZA5Ga4
AR9SYEApsACmDJpGQAwepn3zUWBAs1w2OueZRkCg/rPmx1FgQMDQI0bPmUZAIsK/CJpRYEAGD9O+
uZlGQBzO/GqOUWBApTDvcaaZRkB/Tdaoh1FgQEAxsmSOmUZA9gfKbXtRYEC4BOCfUplGQGqHvyZr
UWBAumbyzTaZRkBM++b+alFgQGCt2jUhmUZAO8Q/bGlRYEDmllZD4phGQDY7Un1nUWBAj1a1pKOY
RkA8+IkDaFFgQO52vTRFmEZA4UIewQ1RYEAVqpuLv5dGQIdT5uabUGBAlIYahSSXRkCZ1qaxPVBg
QBMNUvAUlkZAiJy+ni9QYEBQcodNZJZGQKeTbHU5UGBA34eDhCiXRkB1sWmlkFBgQAspP6n2l0ZA
ahK8IY1QYEALz0vFxphGQNVcbjBUUGBAQ+bKoNqYRkAHmPkO/k9gQDpY/+cwmUZAkQ2ki81PYEAc
l3FTA5lGQOmBj8GKT2BAuECC4seYRkBTspyEUk9gQEewcf27mEZALjpZaj1PYEDiHeBJC5lGQPq2
YKkuT2BAxSCwcmiZRkDoFORnI09gQPD9DdqrmUZA1ArT9xpPYED61LFK6ZlGQMEAwocST2BANDDy
siaaRkCiCRSxCE9gQNUmTu53mkZAiX5t/fROYEDxhF5/EptGQB2vQPQkT2BAd4NorWibRkDFVPoJ
Z09gQMmvH2KDm0ZAEwoRcIhPYECvmXyzzZtGQIEExY+xT2BABI9v7xqcRkCdK0oJwU9gQObN4Vrt
m0ZATUusjMZPYEBJZYo5CJxGQKxwy0fST2BAl8eakUGcRkD7c9GQ8U9gQG8RGOsbnEZALo81IwNQ
YEBp5V5gVpxGQERpb/AFUGBA2NKjqZ6cRkBPIsK/CFBgQJDaxMn9nEZAVG8NbBVQYEByGMxfIZ1G
QD9wlScQUGBAdT+nID+dRkBc5QmEHVBgQJ6Xio15nUZAfR8OEiJQYECBeF2/YJ1GQI5zm3AvUGBA
z77yID2dRkCXkA96NlBgQJlKP+HsnEZAtqFinD9QYEBsy4CzlJxGQN1c/G1PUGBAOiF00CWcRkDd
XPxtT1BgQDohdNAlnEZA"
3.udf编写
方式一:空间网格方式
PolygonToRowGrids 网格生成主键 udtf
import com.aliyun.odps.udf.ExecutionContext;
import com.aliyun.odps.udf.UDFException;
import com.aliyun.odps.udf.UDTF;
import com.aliyun.odps.udf.annotation.Resolve;
import org.locationtech.jts.geom.Geometry;
import org.locationtech.jts.io.WKBReader;
import org.apache.commons.codec.binary.Base64;
import com.aliyun.odps.data.Binary;
@Resolve("string,string -> string,string,binary,double,double,double,double")
public class PolygonToRowGrids extends UDTF {
private WKBReader wkbReader = new WKBReader();
private double gridSize = 0.1; // 可调整
@Override
public void setup(ExecutionContext ctx) {}
@Override
public void process(Object[] args) throws UDFException {
String geomBase64 = (String) args[0];
String spatialId = (String) args[1];
if (geomBase64 == null || spatialId == null) return;
try {
byte[] wkbBytes = Base64.decodeBase64(geomBase64);
Geometry geom = wkbReader.read(wkbBytes);
double minX = geom.getEnvelopeInternal().getMinX();
double maxX = geom.getEnvelopeInternal().getMaxX();
double minY = geom.getEnvelopeInternal().getMinY();
double maxY = geom.getEnvelopeInternal().getMaxY();
int minCol = (int) Math.floor((minX + 180) / gridSize);
int maxCol = (int) Math.floor((maxX + 180) / gridSize);
int minRow = (int) Math.floor((minY + 90) / gridSize);
int maxRow = (int) Math.floor((maxY + 90) / gridSize);
for (int row = minRow; row <= maxRow; row++) {
for (int col = minCol; col <= maxCol; col++) {
// 关键修改:使用整数行列拼接,确保唯一性
String gridId = row + "_" + col;
forward(gridId, spatialId, new Binary(wkbBytes), minX, maxX, minY, maxY);
}
}
} catch (Exception e) {
throw new UDFException("Error processing polygon: " + e.getMessage());
}
}
@Override
public void close() {}
}
PointToRowGrid 经纬度生成主键 udf
import com.aliyun.odps.udf.UDF;
/**
 * UDF that maps a (longitude, latitude) pair to the id of the grid cell containing it.
 *
 * <p>The id has the form {@code row + "_" + col}, where
 * {@code row = floor((lat + 90) / gridSize)} and {@code col = floor((lon + 180) / gridSize)}.
 */
public class PointToRowGrid extends UDF {
    private double gridSize = 0.1;

    /**
     * @param lon longitude of the point
     * @param lat latitude of the point
     * @return the grid id, or {@code null} when either coordinate is {@code null}
     */
    public String evaluate(Double lon, Double lat) {
        if (lon != null && lat != null) {
            final int colIndex = (int) Math.floor((lon + 180) / gridSize);
            final int rowIndex = (int) Math.floor((lat + 90) / gridSize);
            // Numeric row/column pair, joined with an underscore.
            return new StringBuilder()
                    .append(rowIndex)
                    .append("_")
                    .append(colIndex)
                    .toString();
        }
        return null;
    }
}
上传部署
上传jar包
-- Upload the built JAR as a resource; the CREATE FUNCTION statements below
-- reference it by file name in their USING clause.
ADD JAR target/geometry-1.0-SNAPSHOT-jar-with-dependencies.jar;
创建函数
-- Register the grid UDTF and UDF from the uploaded JAR.
-- NOTE(review): the package was written as 'com.udTf' here; it is normalised to the
-- lowercase 'com.udtf' used by the polygon_to_h3 registration. Java package names
-- are case-sensitive, so confirm against the class's actual package declaration.
CREATE FUNCTION polygon_to_rowgrids AS 'com.udtf.PolygonToRowGrids' USING 'geometry-1.0-SNAPSHOT-jar-with-dependencies.jar';
CREATE FUNCTION point_to_rowgrid AS 'com.udf.PointToRowGrid' USING 'geometry-1.0-SNAPSHOT-jar-with-dependencies.jar';
测试
-- Expand every polygon in dwd.test into its grid cells.
-- polygon_to_rowgrids is declared as (string, string), so the id is passed as
-- STRING; casting it to BIGINT would rely on implicit conversion and would drop
-- any leading zeros in the code.
SELECT
    a.*,
    grid_id,
    spatial_id,
    geom_wkb,
    min_x,
    max_x,
    min_y,
    max_y
FROM dwd.test a
LATERAL VIEW polygon_to_rowgrids(geom_base64, CAST(ad_code AS STRING)) t
    AS grid_id, spatial_id, geom_wkb, min_x, max_x, min_y, max_y;

-- Grid id of a single test point (longitude, latitude).
SELECT point_to_rowgrid(130.511909, 45.20249) AS grid_id;
方式二:使用H3方式
PolygonToH3 H3生成主键 udtf
import com.aliyun.odps.data.Binary;
import com.aliyun.odps.udf.ExecutionContext;
import com.aliyun.odps.udf.UDFException;
import com.aliyun.odps.udf.UDTF;
import com.aliyun.odps.udf.annotation.Resolve;
import com.uber.h3core.H3Core;
import com.uber.h3core.util.LatLng;
import org.locationtech.jts.geom.Geometry;
import org.locationtech.jts.geom.Polygon;
import org.locationtech.jts.geom.MultiPolygon;
import org.locationtech.jts.io.WKBReader;
import org.apache.commons.codec.binary.Base64;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@Resolve("string,string -> string,string,binary,double,double,double,double")
public class PolygonToH3 extends UDTF {
private WKBReader wkbReader = new WKBReader();
private H3Core h3;
private int resolution = 8; // H3分辨率
@Override
public void setup(ExecutionContext ctx) throws UDFException {
try {
h3 = H3Core.newInstance();
} catch (IOException e) {
throw new UDFException("Failed to init H3"+e.getMessage());
}
}
@Override
public void process(Object[] args) throws UDFException {
String geomBase64 = (String) args[0];
String spatialId = (String) args[1];
if (geomBase64 == null || spatialId == null) return;
try {
byte[] wkbBytes = Base64.decodeBase64(geomBase64);
Geometry geom = wkbReader.read(wkbBytes);
double minX = geom.getEnvelopeInternal().getMinX();
double maxX = geom.getEnvelopeInternal().getMaxX();
double minY = geom.getEnvelopeInternal().getMinY();
double maxY = geom.getEnvelopeInternal().getMaxY();
List<LatLng> outerCoords = extractOuterCoords(geom);
if (outerCoords.isEmpty()) return;
// 使用返回字符串的方法
List<String> cells = h3.polygonToCellAddresses(outerCoords, null, resolution);
for (String cell : cells) {
forward(cell, spatialId, new Binary(wkbBytes), minX, maxX, minY, maxY);
}
} catch (Exception e) {
throw new UDFException("Error processing polygon: " + e.getMessage());
}
}
private List<LatLng> extractOuterCoords(Geometry geom) {
List<LatLng> outerCoords = new ArrayList<>();
Polygon polygon = null;
if (geom instanceof Polygon) {
polygon = (Polygon) geom;
} else if (geom instanceof MultiPolygon) {
polygon = (Polygon) ((MultiPolygon) geom).getGeometryN(0);
} else {
return outerCoords;
}
org.locationtech.jts.geom.Coordinate[] coords = polygon.getExteriorRing().getCoordinates();
for (org.locationtech.jts.geom.Coordinate c : coords) {
outerCoords.add(new LatLng(c.y, c.x));
}
return outerCoords;
}
@Override
public void close() {}
}
PointToH3 经纬度生成主键
import com.aliyun.odps.udf.UDF;
import com.uber.h3core.H3Core;
import java.io.IOException;
import com.aliyun.odps.udf.UDFException;
/**
 * UDF that returns the address of the H3 cell containing a (longitude, latitude) point.
 */
public class PointToH3 extends UDF {
    private H3Core h3;
    private int resolution = 8; // must equal the resolution used by PolygonToH3 for ids to join

    /**
     * Loads the H3 library once per UDF instance.
     *
     * @throws UDFException if the H3 library cannot be initialised
     */
    @Override
    public void setup(com.aliyun.odps.udf.ExecutionContext ctx) throws UDFException {
        try {
            h3 = H3Core.newInstance();
        } catch (IOException e) {
            // Keep the cause and separate the prefix from the underlying message.
            throw new UDFException("Failed to init H3: " + e.getMessage(), e);
        }
    }

    /**
     * @param lon longitude of the point
     * @param lat latitude of the point
     * @return the H3 cell address, or {@code null} when either coordinate is {@code null}
     */
    public String evaluate(Double lon, Double lat) {
        if (lon == null || lat == null) {
            return null;
        }
        // H3 expects latitude first, then longitude.
        return h3.latLngToCellAddress(lat, lon, resolution);
    }
}
上传部署
-- Upload the JAR and register the H3 UDTF/UDF, one statement per line.
-- NOTE(review): the path previously began with '/' (filesystem root); it now uses
-- the same relative target/ path as the grid deployment step — confirm the location.
ADD JAR target/geometry-1.0-SNAPSHOT-jar-with-dependencies.jar;
CREATE FUNCTION polygon_to_h3 AS 'com.udtf.PolygonToH3' USING 'geometry-1.0-SNAPSHOT-jar-with-dependencies.jar';
CREATE FUNCTION point_to_h3 AS 'com.udf.PointToH3' USING 'geometry-1.0-SNAPSHOT-jar-with-dependencies.jar';
4.udf实现经纬度包含
直接替代pg库的功能
ST_Contains(geom,ST_SetSRID(ST_MakePoint(116.4074, 39.9042), 4490))
/**
 * Checks whether a point lies inside a geometry (the geometry is received as a
 * MaxCompute BINARY value wrapped in a Binary object).
 */
import com.aliyun.odps.data.Binary;
import com.aliyun.odps.udf.UDF;
import org.apache.commons.codec.binary.Base64;
import org.locationtech.jts.geom.Geometry;
import org.locationtech.jts.geom.GeometryFactory;
import org.locationtech.jts.geom.Point;
import org.locationtech.jts.io.ParseException;
import org.locationtech.jts.io.WKBReader;
/**
* 判断一个点(经度,纬度)是否在给定的几何图形内部。
* 支持两种输入方式:
* 1. Binary 类型:直接传入 WKB 字节数组(对应 MaxCompute BINARY 列)
* 2. String 类型:传入 Base64 编码的 WKB 字符串(对应从 Greenplum 导出的 geom_base64 列)
*/
public class STContainsPointBinary extends UDF {
private final WKBReader wkbReader = new WKBReader();
private final GeometryFactory geometryFactory = new GeometryFactory();
/**
* 处理 BINARY 类型输入(优化方案)
*
* @param geomWkb 几何图形的 WKB 字节数组(Binary 包装类)
* @param lon 经度
* @param lat 纬度
* @return true: 点在图形内;false: 点不在图形内;null: 输入无效或解析失败
*/
public Boolean evaluate(Binary geomWkb, Double lon, Double lat) {
if (geomWkb == null || lon == null || lat == null) {
return null;
}
try {
// 从 Binary 对象中获取原始字节数组
byte[] wkbBytes = geomWkb.data();
return evaluateInternal(wkbBytes, lon, lat);
} catch (Exception e) {
// 记录日志或直接返回 null(根据业务需求)
return null;
}
}
/**
* 处理 STRING 类型输入(Base64 编码的 WKB,兼容原始导入数据)
*
* @param geomBase64 Base64 编码的 WKB 字符串
* @param lon 经度
* @param lat 纬度
* @return true: 点在图形内;false: 点不在图形内;null: 输入无效或解析失败
*/
/*public Boolean evaluate(String geomBase64, Double lon, Double lat) {
if (geomBase64 == null || lon == null || lat == null) {
return null;
}
try {
// Base64 解码
byte[] wkbBytes = Base64.decodeBase64(geomBase64);
return evaluateInternal(wkbBytes, lon, lat);
} catch (Exception e) {
return null;
}
}*/
/**
* 内部方法:使用 JTS 进行几何包含判断
*
* @param wkbBytes WKB 字节数组
* @param lon 经度
* @param lat 纬度
* @return 布尔值或 null
*/
private Boolean evaluateInternal(byte[] wkbBytes, Double lon, Double lat) {
try {
// 解析 WKB 为 JTS Geometry 对象
Geometry targetGeom = wkbReader.read(wkbBytes);
// 创建点
Point point = geometryFactory.createPoint(new org.locationtech.jts.geom.Coordinate(lon, lat));
// 注意:此处假设目标几何图形与点使用相同的坐标系(均为地理坐标系,例如 4490)。
// 如果需要严格的坐标系转换,需引入 GeoTools 等库,但会增加 UDF 复杂度。
// 大多数场景下,只要数据源统一,平面包含判断即可满足需求。
return targetGeom.contains(point);
} catch (ParseException e) {
// WKB 解析失败,返回 null 表示无效数据
return null;
}
}
上传部署
-- Register the point-in-geometry UDF under the name st_contains_point_binary,
-- backed by class com.udf.STContainsPointBinary in the uploaded JAR.
CREATE FUNCTION st_contains_point_binary AS 'com.udf.STContainsPointBinary' USING 'geometry-1.0-SNAPSHOT-jar-with-dependencies.jar';
5.H3方式测试
-- H3 smoke test: join the polygon's H3 cells to the cell of one test point and
-- run the exact containment check on the candidate rows.
WITH polygon_cells AS (
    -- One row per H3 cell generated for each polygon in dwd.test.
    -- polygon_to_h3 is declared as (string, string), so the id is passed as STRING.
    SELECT
        src.*,
        grid_id,
        spatial_id,
        geom_wkb,
        min_x,
        max_x,
        min_y,
        max_y
    FROM dwd.test src
    LATERAL VIEW polygon_to_h3(geom_base64, CAST(ad_code AS STRING)) t
        AS grid_id, spatial_id, geom_wkb, min_x, max_x, min_y, max_y
),

test_point AS (
    -- Coordinates are typed DOUBLE (not string literals) because both UDFs take
    -- Double parameters; the cell id is derived from the same values.
    SELECT
        p.lon,
        p.lat,
        point_to_h3(p.lon, p.lat) AS grid_id
    FROM (
        SELECT
            CAST(130.511909 AS DOUBLE) AS lon,
            CAST(45.20249 AS DOUBLE) AS lat
    ) p
)

-- LEFT JOIN keeps every polygon cell; rows whose cell does not hold the test
-- point have NULL in is_contained.
SELECT
    c.*,
    st_contains_point_binary(c.geom_wkb, pt.lon, pt.lat) AS is_contained
FROM polygon_cells c
LEFT JOIN test_point pt
    ON c.grid_id = pt.grid_id;
