doum
昨天 ce44d803b73a65b2cc31db5bcc662139029463d3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""将 Label Studio 导出 JSON 转为训练用 JSONL。"""
import argparse
import json
import sys
 
 
def _has_value(value):
    """Return True when *value* counts as a supplied field.

    Numeric zero is a legitimate ID, so it counts as present even though it
    is falsy. ``None``, ``False``, empty strings and other empty values
    count as missing.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return True
    return bool(value)


def _load_tasks(in_path):
    """Load the export at *in_path* and return its iterable of task objects."""
    with open(in_path, encoding="utf-8") as f:
        payload = json.load(f)
    if not isinstance(payload, dict):
        return payload
    # Only unwrap "tasks"/"data" when they hold a list of tasks. A single
    # exported task also has a "data" key, but it maps to that task's own
    # field dict and must not be iterated as if it were the task list.
    for key in ("tasks", "data"):
        wrapped = payload.get(key)
        if isinstance(wrapped, list) and wrapped:
            return wrapped
    return [payload]


def _label_start_times(annotations):
    """Return ``(storefront, handover)`` start times, ``None`` where absent.

    Only ``timelinelabels`` results are inspected, and only the ``start`` of
    the first range of each result is used. When a label occurs more than
    once, the last occurrence wins.
    """
    storefront = handover = None
    for ann in annotations:
        for r in ann.get("result") or []:
            if r.get("type") != "timelinelabels":
                continue
            value = r.get("value") or {}
            labels = value.get("timelinelabels") or []
            ranges = value.get("ranges") or []
            if not ranges:
                continue
            t = float(ranges[0].get("start", 0))
            if "storefront" in labels:
                storefront = t
            if "handover" in labels:
                handover = t
    return storefront, handover


def convert(in_path, out_path, default_split="train"):
    """Convert a Label Studio JSON export into a training JSONL file.

    The input may be a list of tasks, a dict wrapping such a list under
    ``"tasks"`` or ``"data"``, or a single task object. One JSON line is
    written per task that has both a ``storefront`` and a ``handover``
    timeline label; tasks missing either label are skipped. A summary line
    with the number of written rows is printed to stdout.

    Args:
        in_path: Path of the JSON export to read (UTF-8).
        out_path: Path of the JSONL file to write (UTF-8, overwritten).
        default_split: Value for ``split`` when a task does not provide one.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If the input is not valid JSON.
        ValueError, TypeError: If a present ``media_id`` or a range ``start``
            cannot be converted to a number.
    """
    tasks = _load_tasks(in_path)

    count = 0
    with open(out_path, "w", encoding="utf-8") as out:
        for task in tasks:
            data = task.get("data") or task
            # Fall back to "id" only when "media_id" is really missing, so
            # that an ID of 0 is kept rather than replaced.
            media_id = data.get("media_id")
            if not _has_value(media_id):
                media_id = data.get("id")
            video_path = data.get("video_path") or data.get("video")
            storefront, handover = _label_start_times(
                task.get("annotations") or []
            )
            if storefront is None or handover is None:
                continue
            item = {
                # With no usable ID, use the running count of written rows.
                "media_id": int(media_id) if _has_value(media_id) else count,
                "video_path": video_path,
                "storefront_time_sec": round(storefront, 2),
                "handover_time_sec": round(handover, 2),
                "store_type": data.get("store_type", ""),
                "has_voice_marker": bool(data.get("has_voice_marker")),
                "driver_date": data.get("driver_date", ""),
                "split": data.get("split") or default_split,
                "notes": data.get("notes", ""),
            }
            out.write(json.dumps(item, ensure_ascii=False) + "\n")
            count += 1
    print(f"转换 {count} 条 -> {out_path}")
 
 
def main():
    """Parse command-line arguments and run the conversion.

    Takes the positional export path, an optional ``-o/--output`` target
    (default ``data/annotations.jsonl``) and an optional ``--split`` value
    (default ``train``), then passes them to ``convert``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("input", help="Label Studio 导出 JSON")
    cli.add_argument("-o", "--output", default="data/annotations.jsonl")
    cli.add_argument("--split", default="train")
    opts = cli.parse_args()

    source_path = opts.input
    target_path = opts.output
    fallback_split = opts.split
    convert(source_path, target_path, fallback_split)
 
 
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()