检测存储视频中的文本 - Amazon Rekognition

本文属于机器翻译版本。若本译文内容与英语原文存在差异,则一律以英文原文为准。

检测存储视频中的文本

存储视频中的 Amazon Rekognition Video 文本检测是一个异步操作。要开始检测文本,请致电StartTextDetection。Amazon Rekognition Video 会将视频分析的完成状态发布到 Amazon SNS 主题。如果视频分析成功,请GetTextDetection致电获取分析结果。有关启动视频分析和获取结果的详细信息,请参阅调用 Amazon Rekognition Video 操作

此过程扩展了使用 Java 或 Python 分析存储在亚马逊 S3 存储桶中的视频 (SDK)中的代码。它使用 Amazon SQS 队列获取视频分析请求的完成状态。

检测存储在 Amazon S3 存储桶内的视频中的文本 (SDK)
  1. 按照使用 Java 或 Python 分析存储在亚马逊 S3 存储桶中的视频 (SDK)中的步骤操作。

  2. 将以下代码添加到步骤 1 中的 VideoDetect 类。

    Java
    //Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. //PDX-License-Identifier: MIT-0 (For details, see https://github.com/awsdocs/amazon-rekognition-developer-guide/blob/master/LICENSE-SAMPLECODE.) private static void StartTextDetection(String bucket, String video) throws Exception{ NotificationChannel channel= new NotificationChannel() .withSNSTopicArn(snsTopicArn) .withRoleArn(roleArn); StartTextDetectionRequest req = new StartTextDetectionRequest() .withVideo(new Video() .withS3Object(new S3Object() .withBucket(bucket) .withName(video))) .withNotificationChannel(channel); StartTextDetectionResult startTextDetectionResult = rek.startTextDetection(req); startJobId=startTextDetectionResult.getJobId(); } private static void GetTextDetectionResults() throws Exception{ int maxResults=10; String paginationToken=null; GetTextDetectionResult textDetectionResult=null; do{ if (textDetectionResult !=null){ paginationToken = textDetectionResult.getNextToken(); } textDetectionResult = rek.getTextDetection(new GetTextDetectionRequest() .withJobId(startJobId) .withNextToken(paginationToken) .withMaxResults(maxResults)); VideoMetadata videoMetaData=textDetectionResult.getVideoMetadata(); System.out.println("Format: " + videoMetaData.getFormat()); System.out.println("Codec: " + videoMetaData.getCodec()); System.out.println("Duration: " + videoMetaData.getDurationMillis()); System.out.println("FrameRate: " + videoMetaData.getFrameRate()); //Show text, confidence values List<TextDetectionResult> textDetections = textDetectionResult.getTextDetections(); for (TextDetectionResult text: textDetections) { long seconds=text.getTimestamp()/1000; System.out.println("Sec: " + Long.toString(seconds) + " "); TextDetection detectedText=text.getTextDetection(); System.out.println("Text Detected: " + detectedText.getDetectedText()); System.out.println("Confidence: " + detectedText.getConfidence().toString()); System.out.println("Id : " + detectedText.getId()); System.out.println("Parent Id: " + detectedText.getParentId()); System.out.println("Bounding Box" + detectedText.getGeometry().getBoundingBox().toString()); System.out.println("Type: " + detectedText.getType()); System.out.println(); } } while (textDetectionResult !=null && textDetectionResult.getNextToken() != null); }

    在函数 main 中,将以下行:

    StartLabelDetection(bucket, video); if (GetSQSMessageSuccess()==true) GetLabelDetectionResults();

    替换为:

    StartTextDetection(bucket, video); if (GetSQSMessageSuccess()==true) GetTextDetectionResults();
    Java V2

    此代码取自 AWS 文档 SDK 示例 GitHub 存储库。请在此处查看完整示例。

    //snippet-start:[rekognition.java2.recognize_video_text.import] import software.amazon.awssdk.auth.credentials.ProfileCredentialsProvider; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.rekognition.RekognitionClient; import software.amazon.awssdk.services.rekognition.model.S3Object; import software.amazon.awssdk.services.rekognition.model.NotificationChannel; import software.amazon.awssdk.services.rekognition.model.Video; import software.amazon.awssdk.services.rekognition.model.StartTextDetectionRequest; import software.amazon.awssdk.services.rekognition.model.StartTextDetectionResponse; import software.amazon.awssdk.services.rekognition.model.RekognitionException; import software.amazon.awssdk.services.rekognition.model.GetTextDetectionResponse; import software.amazon.awssdk.services.rekognition.model.GetTextDetectionRequest; import software.amazon.awssdk.services.rekognition.model.VideoMetadata; import software.amazon.awssdk.services.rekognition.model.TextDetectionResult; import java.util.List; //snippet-end:[rekognition.java2.recognize_video_text.import] /** * Before running this Java V2 code example, set up your development environment, including your credentials. * * For more information, see the following documentation topic: * * https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/get-started.html */ public class DetectTextVideo { private static String startJobId =""; public static void main(String[] args) { final String usage = "\n" + "Usage: " + " <bucket> <video> <topicArn> <roleArn>\n\n" + "Where:\n" + " bucket - The name of the bucket in which the video is located (for example, (for example, myBucket). \n\n"+ " video - The name of video (for example, people.mp4). \n\n" + " topicArn - The ARN of the Amazon Simple Notification Service (Amazon SNS) topic. \n\n" + " roleArn - The ARN of the AWS Identity and Access Management (IAM) role to use. \n\n" ; if (args.length != 4) { System.out.println(usage); System.exit(1); } String bucket = args[0]; String video = args[1]; String topicArn = args[2]; String roleArn = args[3]; Region region = Region.US_EAST_1; RekognitionClient rekClient = RekognitionClient.builder() .region(region) .credentialsProvider(ProfileCredentialsProvider.create("profile-name")) .build(); NotificationChannel channel = NotificationChannel.builder() .snsTopicArn(topicArn) .roleArn(roleArn) .build(); startTextLabels(rekClient, channel, bucket, video); GetTextResults(rekClient); System.out.println("This example is done!"); rekClient.close(); } // snippet-start:[rekognition.java2.recognize_video_text.main] public static void startTextLabels(RekognitionClient rekClient, NotificationChannel channel, String bucket, String video) { try { S3Object s3Obj = S3Object.builder() .bucket(bucket) .name(video) .build(); Video vidOb = Video.builder() .s3Object(s3Obj) .build(); StartTextDetectionRequest labelDetectionRequest = StartTextDetectionRequest.builder() .jobTag("DetectingLabels") .notificationChannel(channel) .video(vidOb) .build(); StartTextDetectionResponse labelDetectionResponse = rekClient.startTextDetection(labelDetectionRequest); startJobId = labelDetectionResponse.jobId(); } catch (RekognitionException e) { System.out.println(e.getMessage()); System.exit(1); } } public static void GetTextResults(RekognitionClient rekClient) { try { String paginationToken=null; GetTextDetectionResponse textDetectionResponse=null; boolean finished = false; String status; int yy=0 ; do{ if (textDetectionResponse !=null) paginationToken = textDetectionResponse.nextToken(); GetTextDetectionRequest recognitionRequest = GetTextDetectionRequest.builder() .jobId(startJobId) .nextToken(paginationToken) .maxResults(10) .build(); // Wait until the job succeeds. while (!finished) { textDetectionResponse = rekClient.getTextDetection(recognitionRequest); status = textDetectionResponse.jobStatusAsString(); if (status.compareTo("SUCCEEDED") == 0) finished = true; else { System.out.println(yy + " status is: " + status); Thread.sleep(1000); } yy++; } finished = false; // Proceed when the job is done - otherwise VideoMetadata is null. VideoMetadata videoMetaData=textDetectionResponse.videoMetadata(); System.out.println("Format: " + videoMetaData.format()); System.out.println("Codec: " + videoMetaData.codec()); System.out.println("Duration: " + videoMetaData.durationMillis()); System.out.println("FrameRate: " + videoMetaData.frameRate()); System.out.println("Job"); List<TextDetectionResult> labels= textDetectionResponse.textDetections(); for (TextDetectionResult detectedText: labels) { System.out.println("Confidence: " + detectedText.textDetection().confidence().toString()); System.out.println("Id : " + detectedText.textDetection().id()); System.out.println("Parent Id: " + detectedText.textDetection().parentId()); System.out.println("Type: " + detectedText.textDetection().type()); System.out.println("Text: " + detectedText.textDetection().detectedText()); System.out.println(); } } while (textDetectionResponse !=null && textDetectionResponse.nextToken() != null); } catch(RekognitionException | InterruptedException e) { System.out.println(e.getMessage()); System.exit(1); } } // snippet-end:[rekognition.java2.recognize_video_text.main] }
    Python
    #Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. #PDX-License-Identifier: MIT-0 (For details, see https://github.com/awsdocs/amazon-rekognition-developer-guide/blob/master/LICENSE-SAMPLECODE.) def StartTextDetection(self): response=self.rek.start_text_detection(Video={'S3Object': {'Bucket': self.bucket, 'Name': self.video}}, NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn}) self.startJobId=response['JobId'] print('Start Job Id: ' + self.startJobId) def GetTextDetectionResults(self): maxResults = 10 paginationToken = '' finished = False while finished == False: response = self.rek.get_text_detection(JobId=self.startJobId, MaxResults=maxResults, NextToken=paginationToken) print('Codec: ' + response['VideoMetadata']['Codec']) print('Duration: ' + str(response['VideoMetadata']['DurationMillis'])) print('Format: ' + response['VideoMetadata']['Format']) print('Frame rate: ' + str(response['VideoMetadata']['FrameRate'])) print() for textDetection in response['TextDetections']: text=textDetection['TextDetection'] print("Timestamp: " + str(textDetection['Timestamp'])) print(" Text Detected: " + text['DetectedText']) print(" Confidence: " + str(text['Confidence'])) print (" Bounding box") print (" Top: " + str(text['Geometry']['BoundingBox']['Top'])) print (" Left: " + str(text['Geometry']['BoundingBox']['Left'])) print (" Width: " + str(text['Geometry']['BoundingBox']['Width'])) print (" Height: " + str(text['Geometry']['BoundingBox']['Height'])) print (" Type: " + str(text['Type']) ) print() if 'NextToken' in response: paginationToken = response['NextToken'] else: finished = True

    在函数 main 中,将以下行:

    analyzer.StartLabelDetection() if analyzer.GetSQSMessageSuccess()==True: analyzer.GetLabelDetectionResults()

    替换为:

    analyzer.StartTextDetection() if analyzer.GetSQSMessageSuccess()==True: analyzer.GetTextDetectionResults()
    CLI

    运行以下 AWS CLI 命令开始检测视频中的文本。

    aws rekognition start-text-detection --video "{"S3Object":{"Bucket":"bucket-name","Name":"video-name"}}"\ --notification-channel "{"SNSTopicArn":"topic-arn","RoleArn":"role-arn"}" \ --region region-name --profile profile-name

    更新以下值:

    • bucket-namevideo-name更改为您在步骤 2 中指定的 Amazon S3 存储桶名称和文件名。

    • region-name更改为您使用的 AWS 区域。

    • profile-name的值替换为您的开发人员资料的名称。

    • topic-ARN 更改为您在 配置 Amazon Rekognition Video 的步骤 3 中创建的 Amazon SNS 主题的 ARN。

    • role-ARN 更改为您在 配置 Amazon Rekognition Video 的步骤 7 中创建的 IAM 服务角色的 ARN。

    如果您在 Windows 设备上访问 CLI,请使用双引号代替单引号,并用反斜杠(即 \)对内部双引号进行转义,以解决可能遇到的任何解析器错误。有关示例,请参阅以下内容:

    aws rekognition start-text-detection --video \ "{\"S3Object\":{\"Bucket\":\"bucket-name\",\"Name\":\"video-name\"}}" \ --notification-channel "{\"SNSTopicArn\":\"topic-arn\",\"RoleArn\":\"role-arn\"}" \ --region region-name --profile profile-name

    运行后续代码示例后,复制返回的 jobID 并将其提供给以下 GetTextDetection 命令以获取结果,将 job-id-number 替换为您之前收到的 jobID

    aws rekognition get-text-detection --job-id job-id-number --profile profile-name
    注意

    如果您已经运行了除 使用 Java 或 Python 分析存储在亚马逊 S3 存储桶中的视频 (SDK) 之外的视频示例,则要替换的代码可能会有所不同。

  3. 运行该代码。在视频中检测到的文本将显示在列表中。

筛选条件

筛选器是可选的请求参数,可供您在调用 StartTextDetection 时使用。通过按文本区域、大小和置信度评分进行筛选,您可以更加灵活地控制文本检测输出。通过使用感兴趣的区域,您可以轻松将文本检测范围限制为相关的区域,例如,图片的倒数第三个区域或者用于显示足球比赛记分板的左上角。单词边界框大小筛选器可用于避免噪点或不相关的小背景文本。最后,通过单词置信度筛选器,您可以移除因模糊或脏污导致的不可靠的结果。

有关筛选值的信息,请参阅 DetectTextFilters

您可以使用以下筛选器:

  • MinConfidence— 设置字词检测的置信度。从结果中排除检测置信度低于此级别的单词。值应介于 0 和 100 之间。

  • MinBoundingBoxWidth— 设置单词边界框的最小宽度。将从结果中排除其边界框小于此值的单词。该值是相对于视频帧宽度的。

  • MinBoundingBoxHeight— 设置单词边框的最小高度。将从结果中排除其边界框高度小于此值的单词。该值是相对于视频帧高度的。

  • RegionsOfInterest— 将检测范围限制在帧的特定区域。这些值是相对于帧尺寸的。对于仅部分位于区域内的对象,响应是不确定的。

GetTextDetection 响应

GetTextDetection 将返回一个数组 (TextDetectionResults),其中包含有关在视频中检测到的文本的信息。每当在视频中检测到单词或行时,都会存在一个数组元素 TextDetection。数组元素按时间(从视频开始起计时,以毫秒为单位)进行排序。

以下是来自 GetTextDetection 的部分 JSON 响应。在响应中,请注意以下内容:

  • 文本信息-TextDetectionResult 数组元素包含有关检测到的文本 (TextDetection) 和视频中检测到文本的时间 (Timestamp) 的信息。

  • 分页信息 – 此示例显示一页文本检测信息。您可以在 GetTextDetectionMaxResults 输入参数中指定要返回的文本元素数量。如果存在的结果的数量超过了 MaxResults 或结果数超过默认的最大值,则 GetTextDetection 会返回一个令牌 (NextToken),用于获取下一页的结果。有关更多信息,请参阅 获取 Amazon Rekognition Video 分析结果

  • 视频信息 - 此响应包含有关由 VideoMetadata 返回的每页信息中的视频格式 (GetTextDetection) 的信息。

{ "JobStatus": "SUCCEEDED", "VideoMetadata": { "Codec": "h264", "DurationMillis": 174441, "Format": "QuickTime / MOV", "FrameRate": 29.970029830932617, "FrameHeight": 480, "FrameWidth": 854 }, "TextDetections": [ { "Timestamp": 967, "TextDetection": { "DetectedText": "Twinkle Twinkle Little Star", "Type": "LINE", "Id": 0, "Confidence": 99.91780090332031, "Geometry": { "BoundingBox": { "Width": 0.8337579369544983, "Height": 0.08365312218666077, "Left": 0.08313830941915512, "Top": 0.4663468301296234 }, "Polygon": [ { "X": 0.08313830941915512, "Y": 0.4663468301296234 }, { "X": 0.9168962240219116, "Y": 0.4674469828605652 }, { "X": 0.916861355304718, "Y": 0.5511001348495483 }, { "X": 0.08310343325138092, "Y": 0.5499999523162842 } ] } } }, { "Timestamp": 967, "TextDetection": { "DetectedText": "Twinkle", "Type": "WORD", "Id": 1, "ParentId": 0, "Confidence": 99.98338317871094, "Geometry": { "BoundingBox": { "Width": 0.2423887550830841, "Height": 0.0833333358168602, "Left": 0.08313817530870438, "Top": 0.46666666865348816 }, "Polygon": [ { "X": 0.08313817530870438, "Y": 0.46666666865348816 }, { "X": 0.3255269229412079, "Y": 0.46666666865348816 }, { "X": 0.3255269229412079, "Y": 0.550000011920929 }, { "X": 0.08313817530870438, "Y": 0.550000011920929 } ] } } }, { "Timestamp": 967, "TextDetection": { "DetectedText": "Twinkle", "Type": "WORD", "Id": 2, "ParentId": 0, "Confidence": 99.982666015625, "Geometry": { "BoundingBox": { "Width": 0.2423887550830841, "Height": 0.08124999701976776, "Left": 0.3454332649707794, "Top": 0.46875 }, "Polygon": [ { "X": 0.3454332649707794, "Y": 0.46875 }, { "X": 0.5878220200538635, "Y": 0.46875 }, { "X": 0.5878220200538635, "Y": 0.550000011920929 }, { "X": 0.3454332649707794, "Y": 0.550000011920929 } ] } } }, { "Timestamp": 967, "TextDetection": { "DetectedText": "Little", "Type": "WORD", "Id": 3, "ParentId": 0, "Confidence": 99.8787612915039, "Geometry": { "BoundingBox": { "Width": 0.16627635061740875, "Height": 0.08124999701976776, "Left": 0.6053864359855652, "Top": 0.46875 }, "Polygon": [ { "X": 0.6053864359855652, "Y": 0.46875 }, { "X": 0.7716627717018127, "Y": 0.46875 }, { "X": 0.7716627717018127, "Y": 0.550000011920929 }, { "X": 0.6053864359855652, "Y": 0.550000011920929 } ] } } }, { "Timestamp": 967, "TextDetection": { "DetectedText": "Star", "Type": "WORD", "Id": 4, "ParentId": 0, "Confidence": 99.82640075683594, "Geometry": { "BoundingBox": { "Width": 0.12997658550739288, "Height": 0.08124999701976776, "Left": 0.7868852615356445, "Top": 0.46875 }, "Polygon": [ { "X": 0.7868852615356445, "Y": 0.46875 }, { "X": 0.9168618321418762, "Y": 0.46875 }, { "X": 0.9168618321418762, "Y": 0.550000011920929 }, { "X": 0.7868852615356445, "Y": 0.550000011920929 } ] } } } ], "NextToken": "NiHpGbZFnkM/S8kLcukMni15wb05iKtquu/Mwc+Qg1LVlMjjKNOD0Z0GusSPg7TONLe+OZ3P", "TextModelVersion": "3.0" }